Public
Snippet $146 authored by Owo Sugiana

Uji Point In Time Recovery

Edited
check-backup.bash
# Sebelum menjalankan ini ikuti petunjuk pembuatan replika di
# https://wiki.opensipkd.com/doku.php?id=panduan:postgresql:replika-postgresql

version=$1
standby_cluster=$2  # Local cluster
primary_ssh_port=$3
local_ssh_user=$4

if [ -z "$standby_cluster" ]; then
    echo Versi dan standby cluster harus disebutkan.
    echo Contoh: sh $0 14 slave
    exit
fi

recovery_cluster=arsip$standby_cluster
recovery_cluster_dir=/var/lib/postgresql/$version/$recovery_cluster
if [ ! -d "$recovery_cluster_dir" ]; then
    echo $recovery_cluster_dir belum ada.
    echo Gunakan pg_basebackup untuk menyalinnya dari primary.
    recovery_cluster=""
fi

standby_cluster_dir=/var/lib/postgresql/$version/$standby_cluster
standby_conf=/etc/postgresql/$version/$standby_cluster/postgresql.conf
if [ -f "$standby_conf" ]; then
    psql_bin=psql
    pg_ctl_bin=/usr/lib/postgresql/$version/bin/pg_ctl
else
    standby_conf=$standby_cluster_dir/postgresql.conf
    psql_bin=/usr/local/pgsql/bin/psql
    pg_ctl_bin=/usr/local/pgsql/bin/pg_ctl
fi
if [ ! -f "$standby_conf" ]; then
    echo $standby_conf tidak ada.
    exit 1
fi

if [ -z "$primary_ssh_port" ]; then
    primary_ssh_port=22
fi

if command -v python3 &> /dev/null
then
    py_bin=python3
else
    py_bin=python
fi

awal_pengujian=$(date +%s)
echo Standby conf: $standby_conf

standby_port=$(grep "port =" $standby_conf | awk '{print $3}')
[ -z "$standby_port" ] && echo "Standby port tidak ditemukan" && exit 1
echo Standby port: $standby_port

is_gt_v11=$(python3 -c "print($version > 11)")
if [ $is_gt_v11 = "True" ]; then
    conninfo=$($psql_bin -p $standby_port -c "SHOW primary_conninfo" | sed -n 3p | xargs)
else
    s=$(grep primary_conninfo $standby_cluster_dir/recovery.conf)
    conninfo=$($py_bin -c "s = $s; print(s)")
fi

if [ -z "$conninfo" ]; then
    echo Cluster $standby_cluster_dir bukan replika.
    exit 1
fi

primary_host=$($py_bin -c "s = '$conninfo'; print(dict([x.split('=') for x in s.split()])['host'])")
primary_port=$($py_bin -c "s = '$conninfo'; print(dict([x.split('=') for x in s.split()])['port'])")

cmd="ssh -p $primary_ssh_port $primary_host 'psql -p $primary_port -c \"SHOW data_directory\"'"
echo $cmd
s=$(eval $cmd | sed -n 3p | awk '{print $1}')
[ -z "$s" ] && exit 1
primary_cluster=$(echo $s | awk -F"/" '{print $6}')
echo Primary cluster: $primary_cluster

cmd="ssh -p $primary_ssh_port $primary_host 'psql -p $primary_port -c \"\dt\"'"
found=$(eval $cmd | grep coba)
if [ -z "$found" ]; then
    cmd="ssh -p $primary_ssh_port $primary_host 'psql -p $primary_port -c \"CREATE TABLE coba (tgl timestamp WITH TIME ZONE PRIMARY KEY DEFAULT now())\"'"
    echo $cmd
    eval $cmd && create_table=1
fi

cmd="ssh -p $primary_ssh_port $primary_host 'psql -p $primary_port -c \"INSERT INTO coba DEFAULT VALUES\"'"
echo $cmd
eval $cmd

cmd="ssh -p $primary_ssh_port $primary_host 'psql -p $primary_port -c \"SET TIME ZONE UTC; SELECT tgl FROM coba ORDER BY 1 DESC LIMIT 1\"'"
echo $cmd
primary_data=$(eval $cmd | grep "^\ 2" | xargs)

echo "Beri kesempatan standby server menyalin data ..."
sleep 5
cmd="$psql_bin -p $standby_port -c \"SET TIME ZONE UTC; SELECT tgl FROM coba ORDER BY 1 DESC LIMIT 1\""
echo $cmd
standby_data=$(eval $cmd | grep "^\ 2" | xargs)

echo "Primary Data: $primary_data"
echo "Standby Data: $standby_data"
if [ "$standby_data" = "$primary_data" ]; then
    status_standby="true"
else
    status_standby="false"
fi

cmd="psql -c \"\dt\" | grep status_backup"
found=$(eval $cmd)
if [ -z "$found" ]; then
    cat << 'EOF' >status_backup.tmp
CREATE TABLE status_backup(
    id SERIAL PRIMARY KEY,
    tgl timestamp WITH TIME ZONE NOT NULL DEFAULT now(),
    version text NOT NULL,
    cluster text NOT NULL,
    port integer NOT NULL,
    status boolean NOT NULL,
    primary_host text NOT NULL,
    primary_cluster text NOT NULL,
    primary_port text NOT NULL,
    UNIQUE (version, cluster));
EOF
    cat status_backup.tmp
    psql -f status_backup.tmp
    rm status_backup.tmp
fi

cmd="psql -c \"SELECT 1 FROM status_backup WHERE version = '$version' AND cluster = '$standby_cluster'\""
found=$(eval $cmd | grep 1)
if [ -z "$found" ]; then
    sql="INSERT INTO status_backup (version, cluster, port, status, primary_host, primary_cluster, primary_port) VALUES ('$version', '$standby_cluster', $standby_port, $status_standby, '$primary_host', '$primary_cluster', $primary_port)"
else
    sql="UPDATE status_backup SET tgl = now(), port = $standby_port, status = $status_standby, primary_host = '$primary_host', primary_cluster = '$primary_cluster', primary_port = $primary_port WHERE version = '$version' AND cluster = '$standby_cluster'"
fi
cmd="psql -c \"$sql\""
echo $cmd
eval $cmd
if [ "$status_standby" = "true" ]; then
    echo SISTEM REPLIKA OK.
else
    echo SISTEM REPLIKA GAGAL.
    exit 1
fi

if [ -z "$recovery_cluster" ]; then
    exit 0
fi

tunggu=5
echo Tunggu $tunggu detik jelang DELETE ...
sleep $tunggu

cmd="ssh -p $primary_ssh_port $primary_host 'psql -p $primary_port -c \"DELETE FROM coba\"'"
echo $cmd
eval $cmd || exit 1

primary_log=/var/log/postgresql/postgresql-$version-$primary_cluster.log
cmd="ssh -p $primary_ssh_port $primary_host 'grep -a \"DELETE FROM coba\" $primary_log | tail -n 1'"
echo $cmd
s=$(eval $cmd)
[ -z "$s" ] && exit 1
echo $s
target_time=$(echo $s | awk -F. '{print $1}')

echo Tunggu $tunggu detik jelang recovery ...
sleep $tunggu

cmd="ssh -p $primary_ssh_port $primary_host 'psql -p $primary_port -c \"SHOW timezone\"'"
echo $cmd
timezone=$(eval $cmd | sed -n 3p | xargs)
cmd="$psql_bin -p $standby_port -c \"SET TIME ZONE '$timezone'; SELECT '$target_time'::timestamptz\""
echo $cmd
target_time=$(eval $cmd | grep ^\ 2 | xargs)

if [ -d /usr/lib/postgresql/$version ]; then
    is_etc=1
    recovery_conf_dir=/etc/postgresql/$version/$recovery_cluster
else
    recovery_conf_dir=$recovery_cluster_dir
fi

recovery_conf=$recovery_conf_dir/postgresql.conf
if [ -f $recovery_conf ]; then
    recovery_port=$(grep "port =" $recovery_conf | awk '{print $3}')
else
    last_port=$(find /etc/postgresql -name postgresql.conf -exec grep "port =" {} \; | sort | tail -n 1 | awk '{print $3}')
    recovery_port=$(( last_port + 1 ))
    cd /etc/postgresql/$version
    cp -rv $standby_cluster $recovery_cluster
    cd $recovery_cluster
    sed -e "s/^port = \([0-9]*\)/port = $recovery_port/g" postgresql.conf > postgresql.conf.tmp 
    sed -e "s/^cluster_name = .*/cluster_name = '$version\/$recovery_cluster'/g" postgresql.conf.tmp > postgresql.conf
    sed -e "s/^stats_temp_directory = .*'/stats_temp_directory = '\/var\/run\/postgresql\/$version-$recovery_cluster.pg_stat_tmp'/g" postgresql.conf > postgresql.conf.tmp
    sed -e "s/data_directory = .*/data_directory = '\/var\/lib\/postgresql\/$version\/$recovery_cluster'/g" postgresql.conf.tmp > postgresql.conf
    sed -e "s/hba_file = .*/hba_file = '\/etc\/postgresql\/$version\/$recovery_cluster\/pg_hba.conf'/g" postgresql.conf > postgresql.conf.tmp
    sed -e "s/ident_file = .*/ident_file = '\/etc\/postgresql\/$version\/$recovery_cluster\/pg_ident.conf'/g" postgresql.conf.tmp > postgresql.conf
    sed -e "s/external_pid_file = .*/external_pid_file = '\/var\/run\/postgresql\/$version-$recovery_cluster.pid'/g" postgresql.conf > postgresql.conf.tmp
    mv postgresql.conf.tmp postgresql.conf
    create_recovery_conf=1
fi

if [ $is_gt_v11 = "True" ]; then
    touch $recovery_cluster_dir/recovery.signal
    if [ -n "$is_etc" ]; then
        recovery_option=$recovery_conf_dir/conf.d/recovery.conf
    fi
else
    recovery_option=$recovery_cluster_dir/recovery.conf
    rm -f $recovery_cluster_dir/recovery.done
fi

cmd="ssh -p $primary_ssh_port $primary_host 'psql -p $primary_port -c \"SHOW archive_command\"'"
echo $cmd
archive_command=$(eval $cmd | sed -n 3p)
history_format=$(echo $archive_command | awk -F":" '{print $2}')
if [ -z "$local_ssh_user" ]; then
    local_ssh_user=$(echo $archive_command | awk -F"@" '{print $1}' | awk '{print $NF}')
fi
restore_command="scp $local_ssh_user@localhost:$history_format %p"

is_gt_v91=$(python3 -c "print($version > 9.1)")
if [ $is_gt_v91 = "True" ]; then
    restore_command="$restore_command || scp $local_ssh_user@localhost:$history_format.partial %p"
else
    arsip_dir=$(echo $archive_command | awk -F":" '{print $2}' | awk -F"/" '{print $1}')
    cmd="rsync -va $local_ssh_user@localhost:$arsip_dir/pg_xlog/ $recovery_cluster_dir/pg_xlog/"
    echo $cmd
    eval $cmd || exit 1

    for f in $(ls $standby_cluster_dir/server.*) ; do
        short=$(echo $f | awk -F"/" '{print $7}')
        [ -f $recovery_cluster_dir/$short ] && continue
        link=$standby_cluster_dir/$short
        source=$(ls -l $link | awk '{print $11}')
        ln -s $source $recovery_cluster_dir/$short || exit 1
        ls -l $recovery_cluster_dir/$short
    done
fi

echo "restore_command = '$restore_command'" > $recovery_option
echo "recovery_target_time = '$target_time'" >> $recovery_option
cmd="cat $recovery_option"
echo $cmd
eval $cmd

cmd="$pg_ctl_bin status -D $recovery_cluster_dir"
echo $cmd
is_running=$(eval $cmd | grep "is running")
if [ -n "$is_running" ]; then
    cmd="$pg_ctl_bin stop -D $recovery_cluster_dir"
    echo $cmd
    eval $cmd || exit 1
fi

recovery_log=/var/log/postgresql/postgresql-$version-$recovery_cluster.log
echo Kosongkan $recovery_log
:> $recovery_log
mkdir -p /var/run/postgresql/$version-$recovery_cluster.pg_stat_tmp
$pg_ctl_bin start \
    -D $recovery_cluster_dir \
    -l $recovery_log \
    -o "-c config_file=$recovery_conf" || exit 1

echo Tunggu server aktif ...
sleep 5

if [ ! -f $recovery_log ]; then
    echo $recovery_log tidak terbentuk.
    exit 1
fi

echo Tunggu recovery selesai ...
timeout=600
old_line=""
is_gt_v9=$(python3 -c "print($version > 9.6)")
awal=$(date +%s)
while true; do
    if [ $is_gt_v9 = "True" ]; then
        ready=$(grep "pg_wal_replay_resume" $recovery_log | tail -n 1)
    else
        ready=$(grep -E "pg_xlog_replay_resume|recovery complete|pemulihan selesai" $recovery_log | tail -n 1)
    fi
    if [ -n "$ready" ]; then
        echo $ready
        break
    fi
    new_line=$(tail -n 1 $recovery_log)
    if [ ! "$new_line" = "$old_line" ]; then
        old_line=$new_line
        echo $new_line
    fi
    akhir=$(date +%s)
    durasi=$(( akhir - awal ))
    is_timeout=$(python3 -c "print($durasi > $timeout)")
    if [ $is_timeout = "True" ]; then
        tail $recovery_log
        echo Timeout.
        exit 1
    fi
    fatal=$(grep FATAL $recovery_log | tail -n 1)
    if [ -n "$fatal" ]; then
        tail $recovery_log
        exit 1
    fi
    sleep 1
done

cmd="$psql_bin -p $recovery_port -c \"SET TIME ZONE UTC; SELECT tgl FROM coba ORDER BY 1 DESC LIMIT 1\""
echo $cmd
recovery_data=$(eval $cmd | grep "^\ 2" | xargs)

$pg_ctl_bin stop \
    -D /var/lib/postgresql/$version/$recovery_cluster

if [ -n "$create_table" ]; then
    cmd="ssh -p $primary_ssh_port $primary_host 'psql -p '$primary_port' -c \"DROP TABLE coba\"'"
    echo $cmd
    eval $cmd
fi

if [ -n "$create_recovery_conf" ]; then
    echo Hapus $recovery_conf_dir
    rm -r $recovery_conf_dir
fi

echo Primary Data : $primary_data
echo Recovery Data: $recovery_data
if [ "$recovery_data" = "$primary_data" ]; then
    status_recovery="true"
else
    status_recovery="false"
fi

sql="SELECT 1 FROM status_backup WHERE version = '$version' AND cluster = '$recovery_cluster'"
found=$(psql -c "$sql" | grep 1)
if [ -z "$found" ]; then
    sql="INSERT INTO status_backup (version, cluster, port, status, primary_host, primary_cluster, primary_port) VALUES ('$version', '$recovery_cluster', $recovery_port, $status_recovery, '$primary_host', '$primary_cluster', $primary_port)"
else
    sql="UPDATE status_backup SET tgl = now(), port = $recovery_port, status = $status_recovery, primary_host = '$primary_host', primary_cluster = '$primary_cluster', primary_port = $primary_port  WHeRE version = '$version' AND cluster = '$recovery_cluster'"
fi
cmd="psql -c \"$sql\""
echo $cmd
eval $cmd
if [ "$status_recovery" = "true" ]; then
    echo SISTEM ARSIP OK.
    exit 0
else
    echo SISTEM ARSIP GAGAL.
    exit 1
fi