diff --git a/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-0_backup.tar.gz b/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-0_backup.tar.gz new file mode 100644 index 00000000..a2b2dabb Binary files /dev/null and b/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-0_backup.tar.gz differ diff --git a/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-1_backup.tar.gz b/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-1_backup.tar.gz new file mode 100644 index 00000000..b36586ff Binary files /dev/null and b/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-1_backup.tar.gz differ diff --git a/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-2_backup.tar.gz b/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-2_backup.tar.gz new file mode 100644 index 00000000..c413365e Binary files /dev/null and b/apps/pg-brain-split-recover/pg_backups_20250409_155736/freeleaps-prod-gitea-postgresql-ha-postgresql-2_backup.tar.gz differ diff --git a/apps/pg-brain-split-recover/repmgr-split-brain-recovery.sh b/apps/pg-brain-split-recover/repmgr-split-brain-recovery.sh new file mode 100755 index 00000000..42775dd2 --- /dev/null +++ b/apps/pg-brain-split-recover/repmgr-split-brain-recovery.sh @@ -0,0 +1,341 @@ +#!/bin/bash +# filepath: repmgr-split-brain-recovery.sh + +set -e + +NAMESPACE="freeleaps-prod" +STATEFULSET="freeleaps-prod-gitea-postgresql-ha-postgresql" +HEADLESS_SVC="${STATEFULSET}-headless.${NAMESPACE}.svc.freeleaps.cluster" +REPMGR_USER="repmgr" +REPMGR_PASSWORD="WGZ47gbUTLvo" +POSTGRES_PASSWORD="X9H2*9M2ZWYmuZ" +REPMGR_DB="repmgr" +POSTGRES_USER="postgres" 
# Both backup locations share ONE timestamp so the in-pod staging directory
# and the local copy can always be correlated (two separate `date` calls can
# straddle a second boundary).
BACKUP_STAMP="$(date +%Y%m%d_%H%M%S)"
BACKUP_DIR="/tmp/pg_backup_${BACKUP_STAMP}"
LOCAL_BACKUP_DIR="./pg_backups_${BACKUP_STAMP}"

echo "===== PostgreSQL Repmgr Split-Brain Recovery ====="
echo "This script will attempt to fix the repmgr split-brain issue"
echo ""

# Create local backup directory
mkdir -p "$LOCAL_BACKUP_DIR"

#######################################
# Run a shell command inside a pod of the target namespace.
# Globals:   NAMESPACE (read)
# Arguments: $1 - pod name
#            $2 - command string, executed by `bash -c` inside the pod
# Outputs:   whatever the remote command writes to stdout/stderr
# Returns:   exit status of `kubectl exec`
#######################################
run_in_pod() {
  local pod=$1
  local cmd=$2
  kubectl exec -n "$NAMESPACE" "$pod" -- bash -c "$cmd"
}

#######################################
# Print the current WAL write position (LSN) of a node.
# The query uses pg_current_wal_lsn(), which PostgreSQL rejects while a node
# is in recovery, so this only succeeds on nodes running as primary.
# Globals:   REPMGR_USER, REPMGR_PASSWORD, REPMGR_DB (read)
# Arguments: $1 - pod name
# Outputs:   the LSN (e.g. 0/3000060) on stdout, unaligned (-A) so it carries
#            no padding
# Returns:   non-zero when the pod or the query fails
#######################################
get_wal_position() {
  local pod=$1
  local q_pass
  # %q keeps passwords containing shell metacharacters intact on the remote side.
  printf -v q_pass '%q' "$REPMGR_PASSWORD"
  run_in_pod "$pod" "PGPASSWORD=${q_pass} psql -U $REPMGR_USER -d $REPMGR_DB -tA -c \"SELECT pg_current_wal_lsn();\""
}

#######################################
# Check whether a node is currently running as primary.
# Globals:   REPMGR_USER, REPMGR_PASSWORD, REPMGR_DB (read)
# Arguments: $1 - pod name
# Returns:   0 - node is primary (pg_is_in_recovery() = f)
#            1 - node is a standby (pg_is_in_recovery() = t)
#            2 - role could not be determined (pod/query failure or
#                unexpected output); callers that only test for success
#                still treat this as "not primary"
#######################################
is_primary() {
  local pod=$1
  local q_pass result
  printf -v q_pass '%q' "$REPMGR_PASSWORD"
  # Assignment is separate from `local` so a failed query is not masked.
  result=$(run_in_pod "$pod" "PGPASSWORD=${q_pass} psql -U $REPMGR_USER -d $REPMGR_DB -tA -c \"SELECT pg_is_in_recovery();\"") || return 2
  result=${result//[[:space:]]/}
  # Compare exactly instead of substring-matching "f".
  case "$result" in
    f) return 0 ;;
    t) return 1 ;;
    *) return 2 ;;
  esac
}

#######################################
# Dump every user database, the global objects and the configuration files
# of one pod, archive them and copy the archive to LOCAL_BACKUP_DIR.
# Every critical step is checked explicitly because `set -e` is not honoured
# inside a function that is called on the left side of `||`.
# Globals:   BACKUP_DIR, LOCAL_BACKUP_DIR, NAMESPACE, POSTGRES_USER,
#            POSTGRES_PASSWORD (read)
# Arguments: $1 - pod name
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success, 1 as soon as a critical step fails
#######################################
backup_databases() {
  local pod=$1
  local backup_path="$BACKUP_DIR/$pod"
  local local_archive="$LOCAL_BACKUP_DIR/${pod}_backup.tar.gz"
  local q_pass q_db db db_list
  local -a databases=()
  printf -v q_pass '%q' "$POSTGRES_PASSWORD"

  echo "Creating backup directory in the pod..."
  run_in_pod "$pod" "mkdir -p $backup_path" || return 1

  echo "Getting list of databases..."
  # No remote pipe here: a trailing `| tr` would replace psql's exit status
  # and turn a failed listing into an empty (but "successful") backup.
  db_list=$(run_in_pod "$pod" "PGPASSWORD=${q_pass} psql -U $POSTGRES_USER -tA -c \"SELECT datname FROM pg_database WHERE datname NOT IN ('template0', 'template1', 'postgres')\"") || {
    echo "Failed to list databases on $pod" >&2
    return 1
  }
  while IFS= read -r db; do
    if [[ -n "$db" ]]; then
      databases+=("$db")
    fi
  done <<<"$db_list"

  echo "Backing up databases: ${databases[*]}"
  for db in "${databases[@]}"; do
    echo "Backing up database: $db"
    printf -v q_db '%q' "$db"
    run_in_pod "$pod" "PGPASSWORD=${q_pass} pg_dump -U $POSTGRES_USER -Fc ${q_db} > $backup_path/${q_db}.dump" || return 1
  done

  # Also backup global objects (roles, tablespaces)
  echo "Backing up global objects..."
  run_in_pod "$pod" "PGPASSWORD=${q_pass} pg_dumpall -U $POSTGRES_USER --globals-only > $backup_path/globals.sql" || return 1

  # Backup PostgreSQL configuration (best effort: the files may not exist).
  echo "Backing up PostgreSQL configuration..."
  run_in_pod "$pod" "cp /bitnami/postgresql/conf/postgresql.conf $backup_path/ 2>/dev/null || true"
  run_in_pod "$pod" "cp /bitnami/postgresql/conf/pg_hba.conf $backup_path/ 2>/dev/null || true"

  # Copy repmgr configuration (best effort as well).
  echo "Backing up repmgr configuration..."
  run_in_pod "$pod" "cp /etc/repmgr.conf $backup_path/ 2>/dev/null || true"

  # Tar the backup files
  echo "Creating archive of the backup..."
  run_in_pod "$pod" "tar -czf ${backup_path}.tar.gz -C ${backup_path%/*} ${backup_path##*/}" || return 1

  # Copy backup to local machine
  echo "Copying backup to local machine..."
  kubectl cp "$NAMESPACE/$pod:${backup_path}.tar.gz" "$local_archive" || return 1
  # Never delete the in-pod copy unless the local archive really exists.
  if [[ ! -s "$local_archive" ]]; then
    echo "Local archive $local_archive is missing or empty" >&2
    return 1
  fi

  # Cleanup backup in the pod
  echo "Cleaning up backup files in the pod..."
  run_in_pod "$pod" "rm -rf $backup_path ${backup_path}.tar.gz"
}

echo "Step 0: Checking current status of the cluster..."
for i in 0 1 2; do
  POD="${STATEFULSET}-${i}"
  echo -n "Node ${i} ($POD): "

  # Report the running role; an unreachable node is no longer shown as standby.
  role_rc=0
  is_primary "$POD" || role_rc=$?
  case "$role_rc" in
    0) echo "running as primary" ;;
    1) echo "running as standby" ;;
    *) echo "unreachable or role unknown" ;;
  esac

  # Get WAL position (fails on standbys and unreachable nodes -> N/A)
  WAL_POS=$(get_wal_position "$POD" 2>/dev/null) || WAL_POS="N/A"
  WAL_POS=${WAL_POS//[[:space:]]/}
  if [[ -n "$WAL_POS" && "$WAL_POS" != "N/A" ]]; then
    echo " - WAL position: $WAL_POS"
    # Store WAL positions for comparison (WAL_POS_0, WAL_POS_1, WAL_POS_2)
    printf -v "WAL_POS_${i}" '%s' "$WAL_POS"
  fi
done

echo ""
echo "Step 1: Backing up all databases from each node..."
for i in 0 1 2; do
  POD="${STATEFULSET}-${i}"
  echo "Backing up data from node $i ($POD)..."
  backup_databases "$POD" || {
    echo "Backup of node $i ($POD) failed; aborting before any destructive step." >&2
    exit 1
  }
done

echo "All backups completed and stored in: $LOCAL_BACKUP_DIR"
echo ""

echo "Determining most advanced node based on WAL position..."
# Record, per node, what repmgr's metadata says about it and what role it is
# actually running in. During a split-brain more than one node may claim to
# be primary. Results land in NODE_<i>_IS_PRIMARY and
# NODE_<i>_RUNNING_AS_PRIMARY ("true"/"false").
for i in 0 1 2; do
  POD="${STATEFULSET}-${i}"

  # Row from repmgr.nodes for this pod; empty when the pod or query fails.
  NODE_INFO=$(run_in_pod "$POD" "PGPASSWORD=$REPMGR_PASSWORD psql -U $REPMGR_USER -d $REPMGR_DB -t -c \"SELECT node_id, node_name, type, active FROM repmgr.nodes WHERE node_name = '$POD';\"" 2>/dev/null || true)

  # Start from "not primary" and only flip the flags on positive evidence.
  printf -v "NODE_${i}_IS_PRIMARY" '%s' "false"
  printf -v "NODE_${i}_RUNNING_AS_PRIMARY" '%s' "false"

  if [[ -z "$NODE_INFO" ]]; then
    echo "Could not get info for node ${i}"
    continue
  fi

  echo "Node ${i} info: $NODE_INFO"

  # What the repmgr metadata claims.
  case "$NODE_INFO" in
    *primary*)
      echo "Node ${i} is configured as a primary"
      printf -v "NODE_${i}_IS_PRIMARY" '%s' "true"
      ;;
  esac

  # What PostgreSQL itself reports via pg_is_in_recovery().
  if is_primary "$POD"; then
    echo "Node ${i} is running as primary (pg_is_in_recovery=false)"
    printf -v "NODE_${i}_RUNNING_AS_PRIMARY" '%s' "true"
  fi
done

echo ""
echo "Analyzing WAL positions to determine the most advanced node..."
# Compare WAL positions
#
# LSNs look like "<hi>/<lo>" with both halves in hexadecimal. They must be
# compared numerically: comparing them as text (which is what SQL does with
# two untyped literals) ranks '0/9000000' above '0/10000000' and can select
# a node that is actually behind.

#######################################
# Strip whitespace from an LSN and validate its format.
# Arguments: $1 - raw LSN, possibly padded (psql -t output) or empty
# Outputs:   the cleaned LSN on stdout
# Returns:   0 when $1 is a well-formed LSN, 1 otherwise
#######################################
normalize_lsn() {
  local raw=${1//[[:space:]]/}
  [[ "$raw" =~ ^[0-9A-Fa-f]{1,8}/[0-9A-Fa-f]{1,8}$ ]] || return 1
  printf '%s\n' "$raw"
}

#######################################
# Numeric "greater than or equal" for two well-formed LSNs.
# Arguments: $1, $2 - LSNs already validated by normalize_lsn
# Returns:   0 when $1 >= $2, 1 otherwise
#######################################
lsn_ge() {
  local a_hi=$((16#${1%%/*})) a_lo=$((16#${1##*/}))
  local b_hi=$((16#${2%%/*})) b_lo=$((16#${2##*/}))
  if ((a_hi != b_hi)); then
    if ((a_hi > b_hi)); then
      return 0
    fi
    return 1
  fi
  if ((a_lo >= b_lo)); then
    return 0
  fi
  return 1
}

#######################################
# Select the node with the highest WAL position.
# Arguments: one LSN per node, in node order (node 0 first); empty or
#            malformed values mean "no position known for this node"
# Outputs:   index of the most advanced node on stdout
# Returns:   0 when a node was selected, 1 when no argument was a valid LSN
# Notes:     on a tie the higher node index wins, matching the behaviour of
#            the previous strict ">" comparison chain.
#######################################
pick_most_advanced_node() {
  local best_idx="" best_lsn="" raw lsn
  local -i idx=0
  for raw in "$@"; do
    if lsn=$(normalize_lsn "$raw"); then
      if [[ -z "$best_idx" ]] || lsn_ge "$lsn" "$best_lsn"; then
        best_idx=$idx
        best_lsn=$lsn
      fi
    fi
    idx=$((idx + 1))
  done
  [[ -n "$best_idx" ]] || return 1
  printf '%s\n' "$best_idx"
}

if NEW_PRIMARY=$(pick_most_advanced_node "${WAL_POS_0:-}" "${WAL_POS_1:-}" "${WAL_POS_2:-}"); then
  : # a node with a known WAL position was selected
else
  echo "Could not determine most advanced node. Using node 0 as default primary."
  NEW_PRIMARY=0
fi

echo "Selected node ${NEW_PRIMARY} as the new primary based on WAL position."
#######################################
# Build the shell snippet that runs one repmgr sub-command inside a pod.
# The snippet is meant to be executed directly by `bash -c` in the pod, so
# `$repmgr_bin` is resolved when the snippet RUNS. (Writing it through an
# unquoted remote heredoc expands `$repmgr_bin` while the file is being
# generated, leaving an empty value and a script that always fails.)
# Globals:   REPMGR_USER, REPMGR_PASSWORD, REPMGR_DB (read)
# Arguments: $1   - optional shell snippet to run after the repmgr binary
#                   has been located and before repmgr is invoked ("" = none)
#            $2.. - repmgr arguments
# Outputs:   the snippet on stdout
#######################################
repmgr_remote_script() {
  local pre_cmd=$1
  shift
  local q_user q_pass q_db q_args
  # %q keeps credentials and arguments intact whatever characters they hold.
  printf -v q_user '%q' "$REPMGR_USER"
  printf -v q_pass '%q' "$REPMGR_PASSWORD"
  printf -v q_db '%q' "$REPMGR_DB"
  printf -v q_args '%q ' "$@"
  cat <<EOF
export PGUSER=${q_user}
export PGPASSWORD=${q_pass}
export PGDATABASE=${q_db}
export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin

# Try to find repmgr
repmgr_bin=\$(find /opt/bitnami -name repmgr -type f | head -1)
if [ -z "\$repmgr_bin" ]; then
  echo "Could not find repmgr binary"
  exit 1
fi

${pre_cmd}
"\$repmgr_bin" ${q_args}
EOF
}

#######################################
# Run one repmgr sub-command inside a pod.
# Arguments: $1   - pod name
#            $2   - optional pre-step snippet (see repmgr_remote_script)
#            $3.. - repmgr arguments
# Returns:   exit status of the remote snippet
#######################################
run_repmgr_in_pod() {
  local pod=$1
  shift
  run_in_pod "$pod" "$(repmgr_remote_script "$@")"
}

# Look up the selected node's WAL position with indirect expansion (no eval).
wal_pos_var="WAL_POS_${NEW_PRIMARY}"
WAL_POS_VALUE=${!wal_pos_var:-}
if [ -n "$WAL_POS_VALUE" ]; then
  echo "WAL position: $WAL_POS_VALUE"
fi
echo ""

# Confirm with user. EOF on stdin counts as "no" instead of tripping set -e.
CONFIRM=""
read -r -p "Backups completed. Do you want to proceed with fixing the split-brain issue? (y/n): " CONFIRM || true
if [[ "$CONFIRM" != "y" ]]; then
  echo "Operation cancelled. Backups are still available at $LOCAL_BACKUP_DIR"
  exit 1
fi

echo ""
echo "Step 2: Registering node ${NEW_PRIMARY} as primary..."
PRIMARY_POD="${STATEFULSET}-${NEW_PRIMARY}"
run_repmgr_in_pod "$PRIMARY_POD" "" -f /etc/repmgr.conf primary register --force

# Rebuild every other node as a standby of the new primary.
for i in 0 1 2; do
  if [ "$i" -eq "$NEW_PRIMARY" ]; then
    continue
  fi
  STANDBY_POD="${STATEFULSET}-${i}"

  echo "Step 3: Stopping PostgreSQL on standby node ${i}..."
  run_in_pod "$STANDBY_POD" "/opt/bitnami/scripts/postgresql-repmgr/stop.sh"

  echo "Step 4: Cloning primary data to standby node ${i}..."
  # The data directory is wiped only AFTER the repmgr binary has been found,
  # so a pod without repmgr keeps its data.
  run_repmgr_in_pod "$STANDBY_POD" "rm -rf /bitnami/postgresql/data/*" \
    -h "${PRIMARY_POD}.${HEADLESS_SVC}" -p 5432 standby clone --force

  echo "Step 5: Starting PostgreSQL on standby node ${i}..."
  run_in_pod "$STANDBY_POD" "/opt/bitnami/scripts/postgresql-repmgr/start.sh"

  echo "Step 6: Registering node ${i} as standby..."
  run_repmgr_in_pod "$STANDBY_POD" "" -f /etc/repmgr.conf standby register --force
done

echo ""
echo "Step 7: Checking final cluster status..."
#######################################
# Build the shell snippet that prints `repmgr cluster show` inside a pod.
# The snippet is executed directly by `bash -c` in the pod, so `$repmgr_bin`
# is resolved when the snippet runs. (Generating a script file through an
# unquoted remote heredoc expands it too early, to an empty string.)
# Globals:   REPMGR_USER, REPMGR_PASSWORD, REPMGR_DB (read)
# Arguments: none
# Outputs:   the snippet on stdout
#######################################
build_cluster_status_cmd() {
  local q_user q_pass q_db
  # %q keeps credentials intact whatever characters they contain.
  printf -v q_user '%q' "$REPMGR_USER"
  printf -v q_pass '%q' "$REPMGR_PASSWORD"
  printf -v q_db '%q' "$REPMGR_DB"
  cat <<EOF
export PGUSER=${q_user}
export PGPASSWORD=${q_pass}
export PGDATABASE=${q_db}
export PATH=\$PATH:/opt/bitnami/repmgr/bin:/opt/bitnami/postgresql/bin

# Try to find repmgr
repmgr_bin=\$(find /opt/bitnami -name repmgr -type f | head -1)
if [ -z "\$repmgr_bin" ]; then
  echo "Could not find repmgr binary"
  exit 1
fi

"\$repmgr_bin" -f /etc/repmgr.conf cluster show
EOF
}

# Show the cluster status. `repmgr cluster show` exits non-zero when it
# detects a problem, which is exactly when the output matters most, so the
# status is captured instead of letting `set -e` abort before printing it.
status_rc=0
FINAL_STATUS=$(run_in_pod "$PRIMARY_POD" "$(build_cluster_status_cmd)") || status_rc=$?
echo "$FINAL_STATUS"
if ((status_rc != 0)); then
  echo "Warning: 'repmgr cluster show' exited with status ${status_rc}; review the output above." >&2
fi

# Clean up temporary scripts (best effort; the files may not exist)
for i in 0 1 2; do
  POD="${STATEFULSET}-${i}"
  run_in_pod "$POD" "rm -f /tmp/register_primary.sh /tmp/clone_standby.sh /tmp/register_standby.sh /tmp/cluster_status.sh" || true
done

echo ""
echo "Split-brain recovery completed."
echo "Your database backups are available at: $LOCAL_BACKUP_DIR"
echo "Please verify that the cluster is now in a consistent state."
\ No newline at end of file diff --git a/freeleaps/helm-pkg/3rd/gitea/values.prod.yaml b/freeleaps/helm-pkg/3rd/gitea/values.prod.yaml index 341e31f6..d83fbb9b 100644 --- a/freeleaps/helm-pkg/3rd/gitea/values.prod.yaml +++ b/freeleaps/helm-pkg/3rd/gitea/values.prod.yaml @@ -672,6 +672,9 @@ postgresql-ha: enabled: true clusterDomain: freeleaps.cluster postgresql: + image: + repository: freeleaps/postgresql-repmgr + tag: 16.3.0-debian-12-r20 pdb: create: false repmgrPassword: WGZ47gbUTLvo diff --git a/infra/posgresql/Dockerfile b/infra/posgresql/Dockerfile new file mode 100644 index 00000000..e3509c14 --- /dev/null +++ b/infra/posgresql/Dockerfile @@ -0,0 +1,3 @@ +FROM bitnami/postgresql-repmgr:16.3.0-debian-12-r20 + +ADD librepmgr.sh /opt/bitnami/scripts/librepmgr.sh \ No newline at end of file diff --git a/infra/posgresql/librepmgr.sh b/infra/posgresql/librepmgr.sh new file mode 100644 index 00000000..9f3217e1 --- /dev/null +++ b/infra/posgresql/librepmgr.sh @@ -0,0 +1,894 @@ +#!/bin/bash +# Copyright Broadcom, Inc. All Rights Reserved. +# SPDX-License-Identifier: APACHE-2.0 +# +# Bitnami Postgresql Repmgr library + +# shellcheck disable=SC1091 + +# Load Generic Libraries +. /opt/bitnami/scripts/libfile.sh +. /opt/bitnami/scripts/libfs.sh +. /opt/bitnami/scripts/liblog.sh +. /opt/bitnami/scripts/libos.sh +. /opt/bitnami/scripts/libvalidations.sh +. 
/opt/bitnami/scripts/libnet.sh + +######################## +# Get repmgr node id +# Globals: +# REPMGR_* +# Arguments: +# None +# Returns: +# String +######################### +repmgr_get_node_id() { + local num + if [[ "$REPMGR_NODE_ID" != "" ]]; then + echo "$REPMGR_NODE_ID" + else + num="${REPMGR_NODE_NAME##*-}" + if [[ "$num" != "" ]]; then + num=$((num + REPMGR_NODE_ID_START_SEED)) + echo "$num" + fi + fi +} + +######################## +# Get repmgr password method +# Globals: +# REPMGR_* +# Arguments: +# None +# Returns: +# String +######################### +repmgr_get_env_password() { + if [[ "$REPMGR_USE_PASSFILE" = "true" ]]; then + echo "PGPASSFILE=${REPMGR_PASSFILE_PATH}" + else + echo "PGPASSWORD=${REPMGR_PASSWORD}" + fi +} + +######################## +# Get repmgr conninfo password method +# Globals: +# REPMGR_* +# Arguments: +# None +# Returns: +# String +######################### +repmgr_get_conninfo_password() { + if [[ "$REPMGR_USE_PASSFILE" = "true" ]]; then + echo "passfile=${REPMGR_PASSFILE_PATH}" + else + echo "password=${REPMGR_PASSWORD}" + fi +} + +######################## +# Validate settings in REPMGR_* env. variables +# Globals: +# REPMGR_* +# Arguments: +# None +# Returns: +# None +######################### +repmgr_validate() { + info "Validating settings in REPMGR_* env vars..." + local error_code=0 + + # Auxiliary functions + print_validation_error() { + error "$1" + error_code=1 + } + + if [[ -z "$REPMGR_PARTNER_NODES" ]]; then + print_validation_error "The list of partner nodes cannot be empty. Set the environment variable REPMGR_PARTNER_NODES with a comma separated list of partner nodes." + fi + if [[ -z "$REPMGR_PRIMARY_HOST" ]]; then + print_validation_error "The initial primary host is required. Set the environment variable REPMGR_PRIMARY_HOST with the initial primary host." + fi + if [[ -z "$REPMGR_NODE_NAME" ]]; then + print_validation_error "The node name is required. 
Set the environment variable REPMGR_NODE_NAME with the node name." + elif [[ ! "$REPMGR_NODE_NAME" =~ ^.*+-[0-9]+$ ]]; then + print_validation_error "The node name does not follow the required format. Valid format: ^.*+-[0-9]+$" + fi + if [[ -z "$(repmgr_get_node_id)" ]]; then + print_validation_error "The node id is required. Set the environment variable REPMGR_NODE_ID with the node id." + fi + if [[ -z "$REPMGR_NODE_NETWORK_NAME" ]]; then + print_validation_error "The node network name is required. Set the environment variable REPMGR_NODE_NETWORK_NAME with the node network name." + fi + # Credentials validations + if [[ -z "$REPMGR_USERNAME" ]] || [[ -z "$REPMGR_PASSWORD" ]]; then + print_validation_error "The repmgr credentials are mandatory. Set the environment variables REPMGR_USERNAME and REPMGR_PASSWORD with the repmgr credentials." + fi + + if [[ "$REPMGR_USE_PASSFILE" = "true" ]]; then + local -r psql_major_version="$(postgresql_get_major_version)" + if [[ "$psql_major_version" -le "9" ]]; then + warn "Variable REPMGR_USE_PASSFILE is not compatible with PostgreSQL ${psql_major_version}. It will be disabled." + export REPMGR_USE_PASSFILE="false" + fi + fi + + if [[ -z "$REPMGR_NODE_TYPE" ]] || ! [[ "$REPMGR_NODE_TYPE" =~ ^(data|witness)$ ]]; then + print_validation_error "Set the environment variable REPMGR_NODE_TYPE to 'data' or 'witness'." + fi + + if ! is_yes_no_value "$REPMGR_PGHBA_TRUST_ALL"; then + print_validation_error "The allowed values for REPMGR_PGHBA_TRUST_ALL are: yes or no." + fi + if ! is_yes_no_value "$REPMGR_UPGRADE_EXTENSION"; then + print_validation_error "The allowed values for REPMGR_UPGRADE_EXTENSION are: yes or no." + fi + + if ! [[ "$REPMGR_FAILOVER" =~ ^(automatic|manual)$ ]]; then + print_validation_error "The allowed values for REPMGR_FAILOVER are: automatic or manual." 
+ fi + + [[ "$error_code" -eq 0 ]] || exit "$error_code" +} + +######################## +# Ask partner nodes which node is the primary +# Globals: +# REPMGR_* +# Arguments: +# Non +# Returns: +# String[] - (host port) +######################### +repmgr_get_upstream_node() { + local primary_conninfo + local pretending_primary_host="" + local pretending_primary_port="" + local host="" + local port="" + local suggested_primary_host="" + local suggested_primary_port="" + + if [[ -n "$REPMGR_PARTNER_NODES" ]]; then + info "Querying all partner nodes for common upstream node..." + read -r -a nodes <<<"$(tr ',;' ' ' <<<"${REPMGR_PARTNER_NODES}")" + for node in "${nodes[@]}"; do + # intentionally accept inncorect address (without [schema:]// ) + [[ "$node" =~ ^(([^:/?#]+):)?// ]] || node="tcp://${node}" + host="$(parse_uri "$node" 'host')" + port="$(parse_uri "$node" 'port')" + port="${port:-$REPMGR_PRIMARY_PORT}" + debug "Checking node '$host:$port'..." + local query="SELECT conninfo FROM repmgr.show_nodes WHERE (upstream_node_name IS NULL OR upstream_node_name = '') AND active=true" + if ! primary_conninfo="$(echo "$query" | NO_ERRORS=true postgresql_remote_execute "$host" "$port" "$REPMGR_DATABASE" "$REPMGR_USERNAME" "$REPMGR_PASSWORD" "-tA")"; then + debug "Skipping: failed to get primary from the node '$host:$port'!" + continue + elif [[ -z "$primary_conninfo" ]]; then + debug "Skipping: failed to get information about primary nodes!" 
+ continue + elif [[ "$(echo "$primary_conninfo" | wc -l)" -eq 1 ]]; then + suggested_primary_host="$(echo "$primary_conninfo" | awk -F 'host=' '{print $2}' | awk '{print $1}')" + suggested_primary_port="$(echo "$primary_conninfo" | awk -F 'port=' '{print $2}' | awk '{print $1}')" + debug "Pretending primary role node - '${suggested_primary_host}:${suggested_primary_port}'" + if [[ -n "$pretending_primary_host" ]]; then + if [[ "${pretending_primary_host}:${pretending_primary_port}" != "${suggested_primary_host}:${suggested_primary_port}" ]]; then + warn "Conflict of pretending primary role nodes (previously: '${pretending_primary_host}:${pretending_primary_port}', now: '${suggested_primary_host}:${suggested_primary_port}')" + pretending_primary_host="" && pretending_primary_port="" && break + fi + else + debug "Pretending primary set to '${suggested_primary_host}:${suggested_primary_port}'!" + pretending_primary_host="$suggested_primary_host" + pretending_primary_port="$suggested_primary_port" + fi + else + warn "There were more than one primary when getting primary from node '$host:$port'" + pretending_primary_host="" && pretending_primary_port="" && break + fi + done + fi + + echo "$pretending_primary_host" + echo "$pretending_primary_port" +} + +######################## +# Gets the node that is currently set as primary node +# Globals: +# REPMGR_* +# Arguments: +# None +# Returns: +# String[] - (host port) +######################### +repmgr_get_primary_node() { + local upstream_node + local upstream_host + local upstream_port + local primary_host="" + local primary_port="$REPMGR_PRIMARY_PORT" + + readarray -t upstream_node < <(repmgr_get_upstream_node) + upstream_host=${upstream_node[0]} + upstream_port=${upstream_node[1]:-$REPMGR_PRIMARY_PORT} + [[ -n "$upstream_host" ]] && info "Auto-detected primary node: '${upstream_host}:${upstream_port}'" + + if [[ -f "$REPMGR_PRIMARY_ROLE_LOCK_FILE_NAME" ]]; then + info "This node was acting as a primary before restart!" 
+ + if [[ -z "$upstream_host" ]] || [[ "${upstream_host}:${upstream_port}" = "${REPMGR_NODE_NETWORK_NAME}:${REPMGR_PORT_NUMBER}" ]]; then + info "Can not find new primary. Starting PostgreSQL normally..." + else + info "Current master is '${upstream_host}:${upstream_port}'. Cloning/rewinding it and acting as a standby node..." + rm -f "$REPMGR_PRIMARY_ROLE_LOCK_FILE_NAME" + export REPMGR_SWITCH_ROLE="yes" + primary_host="$upstream_host" + primary_port="$upstream_port" + fi + else + if [[ -z "$upstream_host" ]]; then + if [[ "${REPMGR_PRIMARY_HOST}:${REPMGR_PRIMARY_PORT}" != "${REPMGR_NODE_NETWORK_NAME}:${REPMGR_PORT_NUMBER}" ]]; then + primary_host="$REPMGR_PRIMARY_HOST" + primary_port="$REPMGR_PRIMARY_PORT" + fi + else + if [[ "${upstream_host}:${upstream_port}" = "${REPMGR_NODE_NETWORK_NAME}:${REPMGR_PORT_NUMBER}" ]]; then + info "Avoid setting itself as primary. Starting PostgreSQL normally..." + else + primary_host="$upstream_host" + primary_port="$upstream_port" + fi + fi + fi + + [[ -n "$primary_host" ]] && debug "Primary node: '${primary_host}:${primary_port}'" + echo "$primary_host" + echo "$primary_port" +} + +######################## +# Generates env vars for the node +# Globals: +# REPMGR_* +# Arguments: +# None +# Returns: +# Series of exports to be used as 'eval' arguments +######################### +repmgr_set_role() { + local role="standby" + local primary_node + local primary_host + local primary_port + + readarray -t primary_node < <(repmgr_get_primary_node) + primary_host=${primary_node[0]} + primary_port=${primary_node[1]:-$REPMGR_PRIMARY_PORT} + + if [[ "$REPMGR_NODE_TYPE" = "data" ]]; then + if [[ -z "$primary_host" ]]; then + info "There are no nodes with primary role. Assuming the primary role..." 
+ role="primary" + else + info "Node configured as standby" + role="standby" + fi + else + info "Node configured as witness" + role="witness" + fi + + cat <"${POSTGRESQL_MOUNTED_CONF_DIR}/pg_hba.conf" </dev/null; then + debug "User injected custom configuration detected!" + fi + ensure_dir_exists "$POSTGRESQL_MOUNTED_CONF_DIR" + if repmgr_is_file_external "postgresql.conf"; then + cp "${REPMGR_MOUNTED_CONF_DIR}/postgresql.conf" "${POSTGRESQL_MOUNTED_CONF_DIR}/postgresql.conf" + else + repmgr_inject_postgresql_configuration + fi + if repmgr_is_file_external "pg_hba.conf"; then + cp "${REPMGR_MOUNTED_CONF_DIR}/pg_hba.conf" "${POSTGRESQL_MOUNTED_CONF_DIR}/pg_hba.conf" + else + repmgr_inject_pghba_configuration + fi + if [[ "$REPMGR_USE_PASSFILE" = "true" ]] && [[ ! -f "${REPMGR_PASSFILE_PATH}" ]]; then + echo "*:*:*:${REPMGR_USERNAME}:${REPMGR_PASSWORD}" >"${REPMGR_PASSFILE_PATH}" + chmod 600 "${REPMGR_PASSFILE_PATH}" + fi +} + +######################## +# Generates repmgr config files +# Globals: +# REPMGR_* +# POSTGRESQL_* +# Arguments: +# None +# Returns: +# None +######################### +repmgr_generate_repmgr_config() { + info "Preparing repmgr configuration..." 
+ + # If using a distinct WAL directory (${POSTGRESQL_DATA_DIR}/pg_wal is a symlink to an existing dir or $POSTGRESQL_INITDB_WAL_DIR is set a custom value during 1st boot), + # set the "--waldir" option accordingly + local -r waldir=$(postgresql_get_waldir) + local -r waldir_option=$([[ -n "$waldir" ]] && echo "--waldir=$waldir") + + cat <>"${REPMGR_CONF_FILE}.tmp" +event_notification_command='${REPMGR_EVENTS_DIR}/router.sh %n %e %s "%t" "%d"' +ssh_options='-o "StrictHostKeyChecking no" -v' +use_replication_slots='${REPMGR_USE_REPLICATION_SLOTS}' +pg_bindir='${POSTGRESQL_BIN_DIR}' + +# FIXME: these 2 parameter should work +node_id=$(repmgr_get_node_id) +node_name='${REPMGR_NODE_NAME}' +location='${REPMGR_NODE_LOCATION}' +conninfo='user=${REPMGR_USERNAME} $(repmgr_get_conninfo_password) host=${REPMGR_NODE_NETWORK_NAME} dbname=${REPMGR_DATABASE} port=${REPMGR_PORT_NUMBER} connect_timeout=${REPMGR_CONNECT_TIMEOUT}' +failover='${REPMGR_FAILOVER}' +promote_command='$(repmgr_get_env_password) repmgr standby promote -f "${REPMGR_CONF_FILE}" --log-level DEBUG --verbose' +follow_command='$(repmgr_get_env_password) repmgr standby follow -f "${REPMGR_CONF_FILE}" -W --log-level DEBUG --verbose' +reconnect_attempts='${REPMGR_RECONNECT_ATTEMPTS}' +reconnect_interval='${REPMGR_RECONNECT_INTERVAL}' +log_level='${REPMGR_LOG_LEVEL}' +priority='${REPMGR_NODE_PRIORITY}' +monitoring_history='${REPMGR_MONITORING_HISTORY}' +monitor_interval_secs='${REPMGR_MONITOR_INTERVAL_SECS}' +degraded_monitoring_timeout='${REPMGR_DEGRADED_MONITORING_TIMEOUT}' +data_directory='${POSTGRESQL_DATA_DIR}' +async_query_timeout='${REPMGR_MASTER_RESPONSE_TIMEOUT}' +pg_ctl_options='-o "--config-file=\"${POSTGRESQL_CONF_FILE}\" --external_pid_file=\"${POSTGRESQL_PID_FILE}\" --hba_file=\"${POSTGRESQL_PGHBA_FILE}\""' +pg_basebackup_options='$waldir_option' +EOF + + if is_boolean_yes "$REPMGR_FENCE_OLD_PRIMARY"; then + cat <>"${REPMGR_CONF_FILE}.tmp" +child_nodes_disconnect_command='/bin/bash -c ". 
/opt/bitnami/scripts/libpostgresql.sh && . /opt/bitnami/scripts/postgresql-env.sh && postgresql_stop && kill -TERM 1"' +EOF + if [[ -v REPMGR_CHILD_NODES_CHECK_INTERVAL ]]; then + cat <>"${REPMGR_CONF_FILE}.tmp" +child_nodes_check_interval=${REPMGR_CHILD_NODES_CHECK_INTERVAL} +EOF + fi + if [[ -v REPMGR_CHILD_NODES_CONNECTED_MIN_COUNT ]]; then + cat <>"${REPMGR_CONF_FILE}.tmp" +child_nodes_connected_min_count=${REPMGR_CHILD_NODES_CONNECTED_MIN_COUNT} +EOF + fi + if [[ -v REPMGR_CHILD_NODES_DISCONNECT_TIMEOUT ]]; then + cat <>"${REPMGR_CONF_FILE}.tmp" +child_nodes_disconnect_timeout=${REPMGR_CHILD_NODES_DISCONNECT_TIMEOUT} +EOF + fi + fi + + if [[ "$REPMGR_FENCE_OLD_PRIMARY" == "true" ]]; then + cat <>"${REPMGR_CONF_FILE}.tmp" +child_nodes_disconnect_command='/bin/bash -c ". /opt/bitnami/scripts/libpostgresql.sh && . /opt/bitnami/scripts/postgresql-env.sh && postgresql_stop && kill -TERM 1"' +EOF + if [[ -v REPMGR_CHILD_NODES_CHECK_INTERVAL ]]; then + cat <>"${REPMGR_CONF_FILE}.tmp" +child_nodes_check_interval=${REPMGR_CHILD_NODES_CHECK_INTERVAL} +EOF + fi + if [[ -v REPMGR_CHILD_NODES_CONNECTED_MIN_COUNT ]]; then + cat <>"${REPMGR_CONF_FILE}.tmp" +child_nodes_connected_min_count=${REPMGR_CHILD_NODES_CONNECTED_MIN_COUNT} +EOF + fi + if [[ -v REPMGR_CHILD_NODES_DISCONNECT_TIMEOUT ]]; then + cat <>"${REPMGR_CONF_FILE}.tmp" +child_nodes_disconnect_timeout=${REPMGR_CHILD_NODES_DISCONNECT_TIMEOUT} +EOF + fi + fi + + if [[ -f "${REPMGR_MOUNTED_CONF_DIR}/repmgr.conf" ]]; then + # remove from default the overrided keys, and append the desired conf + grep -xvFf "${REPMGR_CONF_FILE}.tmp" "${REPMGR_MOUNTED_CONF_DIR}/repmgr.conf" | awk -F"=" '{print $1;}' >"${REPMGR_CONF_FILE}.keys" && grep -v -f "${REPMGR_CONF_FILE}.keys" "${REPMGR_CONF_FILE}.tmp" >"$REPMGR_CONF_FILE" && cat "${REPMGR_MOUNTED_CONF_DIR}/repmgr.conf" >>"$REPMGR_CONF_FILE" + else + cp "${REPMGR_CONF_FILE}.tmp" "${REPMGR_CONF_FILE}" + fi + + if [[ "$REPMGR_USE_PASSFILE" = "true" ]]; then + echo 
"passfile='${REPMGR_PASSFILE_PATH}'" >>"$REPMGR_CONF_FILE" + fi +}

########################
# Runs a command through debug_execute with the repmgr credentials exposed
# only for the duration of that command: PGPASSFILE when a passfile is in use,
# PGPASSWORD otherwise.
# Globals:
#   REPMGR_USE_PASSFILE
#   REPMGR_PASSFILE_PATH
#   REPMGR_PASSWORD
# Arguments:
#   $@ - Command to run followed by its arguments
# Returns:
#   Exit status of debug_execute
#########################
_repmgr_execute_with_credentials() {
    if [[ "$REPMGR_USE_PASSFILE" = "true" ]]; then
        PGPASSFILE="$REPMGR_PASSFILE_PATH" debug_execute "$@"
    else
        PGPASSWORD="$REPMGR_PASSWORD" debug_execute "$@"
    fi
}

########################
# Waits until the primary node responds
# Polls the current primary until the 'repmgr' schema is visible in the repmgr
# database. Attempts are made at t=0, step, 2*step, ..., timeout.
# Globals:
#   REPMGR_*
# Arguments:
#   None
# Returns:
#   0 if the schema was found, 1 if every attempt failed
#########################
repmgr_wait_primary_node() {
    local -i timeout=60
    local -i step=10
    # Both ends of the [0, timeout] window get an attempt, hence the "+ 1"
    local -i max_tries=$((timeout / step + 1))
    # Declared local so the counter does not leak into (or clobber) the caller's scope
    local -i attempt
    local schemata
    local -r query="SELECT 1 FROM information_schema.schemata WHERE catalog_name='$REPMGR_DATABASE' AND schema_name='repmgr'"

    info "Waiting for primary node..."
    debug "Wait for schema $REPMGR_DATABASE.repmgr on '${REPMGR_CURRENT_PRIMARY_HOST}:${REPMGR_CURRENT_PRIMARY_PORT}', will try $max_tries times with $step delay seconds (TIMEOUT=$timeout)"
    for ((attempt = 1; attempt <= max_tries; attempt++)); do
        if ! schemata="$(echo "$query" | NO_ERRORS=true postgresql_remote_execute "$REPMGR_CURRENT_PRIMARY_HOST" "$REPMGR_CURRENT_PRIMARY_PORT" "$REPMGR_DATABASE" "$REPMGR_USERNAME" "$REPMGR_PASSWORD" "-tA")"; then
            debug "Host '${REPMGR_CURRENT_PRIMARY_HOST}:${REPMGR_CURRENT_PRIMARY_PORT}' is not accessible"
        # Compare as a string: a numeric test would evaluate the remote output as
        # an arithmetic expression, and unexpected text would then be taken for success
        elif [[ "${schemata//[[:space:]]/}" != "1" ]]; then
            debug "Schema $REPMGR_DATABASE.repmgr is still not accessible"
        else
            debug "Schema $REPMGR_DATABASE.repmgr exists!"
            return 0
        fi
        # Sleeping after the last attempt would only delay the failure
        if ((attempt < max_tries)); then
            sleep "$step"
        fi
    done
    return 1
}

########################
# Clones data from primary node
# Globals:
#   REPMGR_*
#   POSTGRESQL_*
# Arguments:
#   None
# Returns:
#   Exit status of 'repmgr standby clone'
#########################
repmgr_clone_primary() {
    # Clears WAL directory if existing (pg_basebackup requires the WAL dir to be empty)
    local -r waldir=$(postgresql_get_waldir)
    if [[ -d "$waldir" ]]; then
        info "Deleting existing WAL directory $waldir..."
        rm -rf -- "${waldir:?}" && ensure_dir_exists "$waldir"
    fi

    info "Cloning data from primary node..."
    local -r flags=("-f" "$REPMGR_CONF_FILE" "-h" "$REPMGR_CURRENT_PRIMARY_HOST" "-p" "$REPMGR_CURRENT_PRIMARY_PORT" "-U" "$REPMGR_USERNAME" "-d" "$REPMGR_DATABASE" "-D" "$POSTGRESQL_DATA_DIR" "standby" "clone" "--fast-checkpoint" "--force")

    _repmgr_execute_with_credentials "${REPMGR_BIN_DIR}/repmgr" "${flags[@]}"
}

########################
# Execute pg_rewind to get data from the primary node
# Globals:
#   REPMGR_*
#   POSTGRESQL_*
# Arguments:
#   None
# Returns:
#   Exit status of pg_rewind
#########################
repmgr_pgrewind() {
    info "Running pg_rewind data to primary node..."
    # "--config-file=postgresql.auto.conf" is a local addition to the flag list
    # FIX ISSUE WITH: https://github.com/bitnami/containers/issues/52213
    local -r flags=("-D" "$POSTGRESQL_DATA_DIR" "--source-server" "host=${REPMGR_CURRENT_PRIMARY_HOST} port=${REPMGR_CURRENT_PRIMARY_PORT} user=${REPMGR_USERNAME} dbname=${REPMGR_DATABASE}" "--config-file=postgresql.auto.conf")

    _repmgr_execute_with_credentials "${POSTGRESQL_BIN_DIR}/pg_rewind" "${flags[@]}"
}

########################
# Rejoin node
# Uses pg_rewind when REPMGR_USE_PGREWIND is enabled and falls back to a full
# clone if pg_rewind fails; otherwise clones directly.
# Globals:
#   REPMGR_*
#   POSTGRESQL_DATA_DIR
# Arguments:
#   None
# Returns:
#   Exit status of the last step executed
#########################
repmgr_rewind() {
    info "Rejoining node..."

    ensure_dir_exists "$POSTGRESQL_DATA_DIR"
    if is_boolean_yes "$REPMGR_USE_PGREWIND"; then
        info "Using pg_rewind to primary node..."
        if ! repmgr_pgrewind; then
            warn "pg_rewind failed, resorting to data cloning"
            repmgr_clone_primary
        fi
    else
        repmgr_clone_primary
    fi
}

########################
# Register a node as primary
# Globals:
#   REPMGR_*
# Arguments:
#   None
# Returns:
#   Exit status of the repmgr command
#########################
repmgr_register_primary() {
    info "Registering Primary..."
    local -r flags=("-f" "$REPMGR_CONF_FILE" "master" "register" "--force")

    debug_execute "${REPMGR_BIN_DIR}/repmgr" "${flags[@]}"
}

########################
# Unregister standby node
# Globals:
#   REPMGR_*
# Arguments:
#   None
# Returns:
#   Always 0 (a failed unregister is tolerated)
#########################
repmgr_unregister_standby() {
    info "Unregistering standby node..."

    local -r flags=("standby" "unregister" "-f" "$REPMGR_CONF_FILE" "--node-id=$(repmgr_get_node_id)")

    # The command below can fail when the node doesn't exist yet
    debug_execute "${REPMGR_BIN_DIR}/repmgr" "${flags[@]}" || true
}

########################
# Unregister witness
# Globals:
#   REPMGR_*
# Arguments:
#   None
# Returns:
#   Always 0 (a failed unregister is tolerated)
#########################
repmgr_unregister_witness() {
    info "Unregistering witness node..."
    local -r flags=("-f" "$REPMGR_CONF_FILE" "witness" "unregister" "-h" "$REPMGR_CURRENT_PRIMARY_HOST" "-p" "$REPMGR_CURRENT_PRIMARY_PORT" "--verbose")

    # The command below can fail when the node doesn't exist yet
    _repmgr_execute_with_credentials "${REPMGR_BIN_DIR}/repmgr" "${flags[@]}" || true
}

########################
# Register witness
# Globals:
#   REPMGR_*
# Arguments:
#   None
# Returns:
#   Exit status of 'repmgr witness register'
#########################
repmgr_register_witness() {
    info "Registering witness node..."
    local -r flags=("-f" "$REPMGR_CONF_FILE" "witness" "register" "-h" "$REPMGR_CURRENT_PRIMARY_HOST" "-p" "$REPMGR_CURRENT_PRIMARY_PORT" "--force" "--verbose")

    repmgr_wait_primary_node

    _repmgr_execute_with_credentials "${REPMGR_BIN_DIR}/repmgr" "${flags[@]}"
}

########################
# Standby follow.
# Globals:
#   REPMGR_*
# Arguments:
#   None
# Returns:
#   Exit status of 'repmgr standby follow'
#########################
repmgr_standby_follow() {
    info "Running standby follow..."
    local -r flags=("standby" "follow" "-f" "$REPMGR_CONF_FILE" "-W" "--log-level" "DEBUG" "--verbose")

    _repmgr_execute_with_credentials "${REPMGR_BIN_DIR}/repmgr" "${flags[@]}"
}

########################
# Register a node as standby
# Globals:
#   REPMGR_*
# Arguments:
#   None
# Returns:
#   Exit status of 'repmgr standby register'
#########################
repmgr_register_standby() {
    info "Registering Standby node..."
    local -r flags=("standby" "register" "-f" "$REPMGR_CONF_FILE" "--force" "--verbose")

    debug_execute "${REPMGR_BIN_DIR}/repmgr" "${flags[@]}"
}

########################
# Upgrade repmgr extension
# Globals:
#   REPMGR_*
# Arguments:
#   None
# Returns:
#   Exit status of postgresql_execute
#########################
repmgr_upgrade_extension() {
    info "Upgrading repmgr extension..."

    echo "ALTER EXTENSION repmgr UPDATE" | postgresql_execute "$REPMGR_DATABASE" "$REPMGR_USERNAME" "$REPMGR_PASSWORD"
}

########################
# Initialize repmgr service
# Prepares the lock directory, rejoins the cluster when the node is a standby,
# initializes and configures PostgreSQL, and finally performs the
# role-specific (primary / standby / witness) registration steps.
# Globals:
#   REPMGR_*
#   POSTGRESQL_*
# Arguments:
#   None
# Returns:
#   None (exits with 1 if a standby cannot reach the primary)
#########################
repmgr_initialize() {
    debug "Node ID: '$(repmgr_get_node_id)', Rol: '$REPMGR_ROLE', Primary Node: '${REPMGR_CURRENT_PRIMARY_HOST}:${REPMGR_CURRENT_PRIMARY_PORT}'"
    info "Initializing Repmgr..."

    ensure_dir_exists "$REPMGR_LOCK_DIR"
    am_i_root && chown "$POSTGRESQL_DAEMON_USER:$POSTGRESQL_DAEMON_GROUP" "$REPMGR_LOCK_DIR"

    # A standby must sync its data directory from the primary before PostgreSQL is initialized
    if [[ "$REPMGR_ROLE" = "standby" ]]; then
        repmgr_wait_primary_node || exit 1
        repmgr_rewind
    fi
    postgresql_initialize
    if ! repmgr_is_file_external "postgresql.conf"; then
        # Allow remote connections, required to register primary and standby nodes
        postgresql_enable_remote_connections
        # Configure port and restrict access to PostgreSQL (MD5)
        postgresql_set_property "port" "$POSTGRESQL_PORT_NUMBER"

        postgresql_configure_replication_parameters
        postgresql_configure_fsync
    fi
    if ! repmgr_is_file_external "pg_hba.conf"; then
        is_boolean_yes "$REPMGR_PGHBA_TRUST_ALL" || postgresql_restrict_pghba
    fi
    if [[ "$REPMGR_ROLE" = "primary" ]]; then
        if is_boolean_yes "$POSTGRESQL_FIRST_BOOT"; then
            postgresql_start_bg
            repmgr_create_repmgr_user
            repmgr_create_repmgr_db
            # Restart PostgreSQL
            postgresql_stop
            postgresql_start_bg
            repmgr_register_primary
            # Allow running custom initialization scripts
            postgresql_custom_init_scripts
            # Set synchronous replication
            POSTGRESQL_CLUSTER_APP_NAME="$REPMGR_PARTNER_NODES"
            export POSTGRESQL_CLUSTER_APP_NAME
            postgresql_configure_synchronous_replication
        elif is_boolean_yes "$REPMGR_UPGRADE_EXTENSION"; then
            # Upgrade repmgr extension
            postgresql_start_bg
            repmgr_upgrade_extension
        else
            debug "Skipping repmgr configuration..."
        fi
    elif [[ "$REPMGR_ROLE" = "standby" ]]; then
        local -r psql_major_version="$(postgresql_get_major_version)"

        POSTGRESQL_MASTER_PORT_NUMBER="$REPMGR_CURRENT_PRIMARY_PORT"
        export POSTGRESQL_MASTER_PORT_NUMBER
        POSTGRESQL_MASTER_HOST="$REPMGR_CURRENT_PRIMARY_HOST"
        export POSTGRESQL_MASTER_HOST

        postgresql_configure_recovery
        postgresql_start_bg
        repmgr_unregister_standby
        repmgr_register_standby

        # Only PostgreSQL major versions below 12 get the explicit "standby follow" step
        if [[ "$psql_major_version" -lt "12" ]]; then
            info "Check if primary running..."
            repmgr_wait_primary_node
            repmgr_standby_follow
        fi
    elif [[ "$REPMGR_ROLE" = "witness" ]]; then
        postgresql_start_bg
        repmgr_create_repmgr_user
        repmgr_create_repmgr_db
        repmgr_unregister_witness
        repmgr_register_witness
    fi
}