环境
虚拟机centos7.6
ha1:192.168.99.200
ha2:192.168.99.202
postgresql 12.1
repmgr 5.0.0
$ su root
--创建系统用户和组
# groupadd postgres -g 1001
# useradd postgres -g 1001 -u 1001
# id postgres
uid=1001(postgres) gid=1001(postgres) groups=1001(postgres)
# cd /opt
--源文件解压目录
# mkdir -p /opt/postgresql-12.1
# chown postgres:postgres postgresql-12.1
--安装文件目录
# mkdir -p /opt/pg12
# chown -r postgres:postgres pg12
# chmod 0700 pg12
# ln -s /opt/pg12 /opt/pgsql
--数据目录
# mkdir -p /opt/data6000
# chown -r postgres:postgres data6000
# chmod 0700 data6000
--归档目录
# mkdir -p /opt/archive6000
# chown -r postgres:postgres archive6000
# chmod 0700 archive6000
# su postgres
$ wget https://ftp.postgresql.org/pub/source/v12.1/postgresql-12.1.tar.gz
$ chmod 750 postgresql-12.1.tar.gz
$ tar zxvf postgresql-12.1.tar.gz
$ cd postgresql-12.1/
--编译
$ ./configure --prefix=/opt/pg12 --with-pgport=6000
--安装
$ gmake world && gmake install-world
--配置开机自启动
$ su root
# cp /opt/postgresql-12.1/contrib/start-scripts/linux /etc/init.d/postgres-12
# chmod x /etc/init.d/postgres-12
# chkconfig postgres-12 on
# vi /etc/init.d/postgres-12
#! /bin/sh
## edit from here
prefix=/opt/pgsql
pgdata="/opt/data6000"
pguser=postgres
## stop editing here
path=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
daemon="$prefix/bin/postmaster"
pgctl="$prefix/bin/pg_ctl"
set -e
# only start if we can find the postmaster.
test -x $daemon ||
{
echo "$daemon not found"
if [ "$1" = "stop" ]
then exit 0
else exit 5
fi
}
# parse command line parameters.
case $1 in
start)
echo -n "starting postgresql: "
su - $pguser -c "$pgctl -d '$pgdata' start >>/dev/null &"
echo "ok"
;;
stop)
echo -n "stopping postgresql: "
su - $pguser -c "$pgctl stop -d '$pgdata' -s"
echo "ok"
;;
restart)
echo -n "restarting postgresql: "
su - $pguser -c "$pgctl stop -d '$pgdata' -s"
su - $pguser -c "$pgctl -d '$pgdata' start >>/dev/null &"
echo "ok"
;;
reload)
echo -n "reload postgresql: "
su - $pguser -c "$pgctl reload -d '$pgdata' -s"
echo "ok"
;;
status)
su - $pguser -c "$pgctl status -d '$pgdata'"
;;
*)
# print help
echo "usage: $0 {start|stop|restart|reload|status}" 1>&2
exit 1
;;
esac
exit 0
--删除源文件
# cd /opt
# rm -rf postgresql-12.1/
--初始化数据目录
# su postgres
$ /opt/pgsql/bin/initdb \
-d/opt/data6000 \
-eutf8 \
-upostgres \
-w
--服务启停
# service postgres-12 start | stop
$ /opt/pgsql/bin/pg_ctl -d /opt/data6000 start |stop
$ vi postgresql.conf
listen_addresses = '*'
port=6000
wal_level=replica
archive_mode=on
archive_command='cp %p /opt/archive6000/%f'
max_wal_senders=10
wal_keep_segments=512
hot_standby=on
primary_conninfo='host=192.168.99.202 port=6000 user=repuser'
logging_collector=on
log_destination=csvlog
log_directory='/opt/data6000/pg_log'
log_filename='pg_log_%y-%m-%d_%h%m%s.log'
log_file_mode=0600
log_truncate_on_rotation=on
log_rotation_age=1d
log_rotation_size=10mb
log_min_messages=warning
log_min_duration_statement=30s
log_checkpoints=on
log_connections=on
log_disconnections=on
log_duration=on
log_lock_waits=on
log_statement=ddl
--机器ha1(192.168.99.200)配置pg_hba.conf
$ vi pg_hba.conf
host all all 192.168.99.1/24 md5
host replication all 192.168.99.1/24 trust
$ /opt/pgsql/bin/psql -p6000 -h127.0.0.1 -upostgres
create user repuser replication login connection limit 5 encrypted password'123456';
create table t(id int,info text);
insert into t values(1,now()),(2,now()),(3,now());
--重启ha1(192.168.99.200)
# service postgres-12 restart
--机器ha2(192.168.99.202)使用pg_basebackup做基础备份
$ /opt/pgsql/bin/pg_basebackup -d /opt/data6000 -fp -xs -v -p -h192.168.99.200 -p6000 -urepuser
--如果报错请检查防火墙等配置,例如下面开放6000端口
# firewall-cmd --zone=public --add-port=6000/tcp --permanent
# firewall-cmd --reload
--机器ha2(192.168.99.202)修改postgresql.conf
--修改primary_conninfo信息
$ vi postgresql.conf
listen_addresses = '*'
port=6000
wal_level=replica
archive_mode=on
archive_command='cp %p /opt/archive6000/%f'
max_wal_senders=10
wal_keep_segments=512
hot_standby=on
primary_conninfo='host=192.168.99.200 port=6000 user=repuser'
logging_collector=on
log_destination=csvlog
log_directory='/opt/data6000/pg_log'
log_filename='pg_log_%y-%m-%d_%h%m%s.log'
log_file_mode=0600
log_truncate_on_rotation=on
log_rotation_age=1d
log_rotation_size=10mb
log_min_messages=warning
log_min_duration_statement=30s
log_checkpoints=on
log_connections=on
log_disconnections=on
log_duration=on
log_lock_waits=on
log_statement=ddl
--机器ha2(192.168.99.202)配置standby.signal
$ touch /opt/data6000/standby.signal
--启动ha2(192.168.99.202)
$ /opt/pgsql/bin/pg_ctl -d/opt/data6000 start &
--异步流复制搭建完成
主库:ha1(192.168.99.200)
备库:ha2(192.168.99.202)
--ntp时间同步
# yum install ntp ntpdate
# ntpdate cn.pool.ntp.org
# hwclock --systohc
$ /opt/pgsql/bin/psql -p6000 -upostgres postgres
create user repmgr superuser connection limit 10 password '123456' ;
create database repmgr owner repmgr;
local repmgr repmgr trust
host repmgr repmgr 127.0.0.1/32 trust
host repmgr repmgr 192.168.99.0/24 trust
local replication repmgr trust
host replication repmgr 127.0.0.1/32 trust
host replication repmgr 192.168.99.0/24 trust
# wget https://github.com/2ndquadrant/repmgr/archive/v5.0.0.tar.gz
# chmod 755 repmgr-5.0.0.tar.gz
# chown postgres:postgres repmgr-5.0.0.tar.gz
# tar zxvf repmgr-5.0.0.tar.gz
# cd /opt/repmgr-5.0.0
# ./configure pg_config=/opt/pgsql/bin/pg_config
# make && make install
--停备库:ha2(192.168.99.202),并删除数据文件
# service postgres-12 stop
# rm -rf archive6000/*
# rm -rf data6000/*
--数据库访问免密
# mkdir /home/postgres
# chown -r postgres:postgres /home/postgres/
# chmod -r 0700 /home/postgres/
# usermod -d /home/postgres postgres
# su - postgres
$ chmod 0600 .pgpass
$ cat .pgpass
192.168.99.200:6000:repmgr:repmgr:123456
192.168.99.202:6000:repmgr:repmgr:123456
--ssh免密步骤(操作系统用户postgres)
--1.每个节点都执行
$ ssh-keygen -t rsa
--2.每个节点分别执行
$ ssh-copy-id -i .ssh/id_rsa.pub postgres@192.168.99.202
$ ssh-copy-id -i .ssh/id_rsa.pub postgres@192.168.99.200
--测试ssh互通
--ha1(192.168.99.200)
# vi /opt/repmgr.conf
node_id=1
node_name='node1'
conninfo='host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2'
data_directory='/opt/data6000'
replication_user='repmgr'
replication_type='physical'
repmgr_bindir='/opt/pgsql/bin'
pg_bindir='/opt/pgsql/bin'
monitoring_history=yes
monitor_interval_secs=5
log_level='debug'
log_file='/opt/data6000/repmgr.log'
# chown -r postgres:postgres repmgr.conf
# chmod 0600 repmgr.conf
--ha2(192.168.99.202)测试repmgr是否能免密访问主库
$ /opt/pgsql/bin/psql -p6000 -h192.168.99.200 -urepmgr
--ha1(192.168.99.200)
$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf primary register --force
info: connecting to primary database...
notice: attempting to install extension "repmgr"
notice: "repmgr" extension successfully installed
notice: primary node record (id: 1) registered
--ha2(192.168.99.202)
# vi /opt/repmgr.conf
node_id=2
node_name='node2'
conninfo='host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2'
data_directory='/opt/data6000'
replication_user='repmgr'
replication_type='physical'
repmgr_bindir='/opt/pgsql/bin'
pg_bindir='/opt/pgsql/bin'
monitoring_history=yes
monitor_interval_secs=5
log_level='debug'
log_file='/opt/data6000/repmgr.log'
# chown -r postgres:postgres repmgr.conf
# chmod 0600 repmgr.conf
--check
$ /opt/pgsql/bin/repmgr -h 192.168.99.200 -p6000 -u repmgr -d repmgr -f /opt/repmgr.conf standby clone --dry-run
--clone
$ /opt/pgsql/bin/repmgr -h 192.168.99.200 -p6000 -u repmgr -d repmgr -f /opt/repmgr.conf standby clone
--ha2(192.168.99.202)修改配置文件,primary_conninfo里host修改ha2指向ha1
# vi /opt/data6000/postgresql.conf
primary_conninfo='host=192.168.99.200 port=6000 user=repuser'
--启动备库
# service postgres-12 start
--主库查看postgres进程
[postgres@ha1 ~]$ ps f -u postgres
pid tty stat time command
6155 pts/1 s 0:00 -bash
6408 pts/1 r 0:00 \_ ps f -u postgres
6337 ? ss 0:00 /opt/pg12/bin/postgres -d /opt/data6000
6338 ? ss 0:00 \_ postgres: logger
6340 ? ss 0:00 \_ postgres: checkpointer
6341 ? ss 0:00 \_ postgres: background writer
6342 ? ss 0:00 \_ postgres: walwriter
6343 ? ss 0:00 \_ postgres: autovacuum launcher
6344 ? ss 0:00 \_ postgres: archiver last was 000000030000000000000012.00000028.backup
6345 ? ss 0:00 \_ postgres: stats collector
6346 ? ss 0:00 \_ postgres: logical replication launcher
6404 ? ss 0:00 \_ postgres: walsender repmgr 192.168.99.202(55371) streaming 0/13001a28
--备库查看postgres进程
[postgres@ha2 ~]$ ps f -u postgres
pid tty stat time command
6297 pts/1 s 0:00 -bash
7040 pts/1 r 0:00 \_ ps f -u postgres
3880 pts/0 s 0:00 -bash
7030 ? ss 0:00 /opt/pg12/bin/postgres -d /opt/data6000
7031 ? ss 0:00 \_ postgres: logger
7032 ? ss 0:00 \_ postgres: startup recovering 000000030000000000000013
7033 ? ss 0:00 \_ postgres: checkpointer
7034 ? ss 0:00 \_ postgres: background writer
7035 ? ss 0:00 \_ postgres: stats collector
7036 ? ss 0:00 \_ postgres: walreceiver
--可以看到主库walsender进程,备库walreceiver进程
--主库查看
select * from pg_stat_replication;
--ha2(192.168.99.202)
[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf standby register --force
info: connecting to local node "node2" (id: 2)
info: connecting to primary database
info: standby registration complete
notice: standby node "node2" (id: 2) successfully registered
--ha1和ha2查看状态一致
[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf cluster show
id | name | role | status | upstream | location | priority | timeline | connection string
---- ------- --------- ----------- ---------- ---------- ---------- ---------- ---------------------------------------------------------------------------
1 | node1 | primary | * running | | default | 100 | 1 | host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2
2 | node2 | standby | running | node1 | default | 100 | 1 | host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2
[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf standby switchover -u repmgr --verbose
notice: using provided configuration file "/opt/repmgr.conf"
warning: following problems with command line parameters detected:
database connection parameters not required when executing unknown action
notice: executing switchover on node "node2" (id: 2)
info: searching for primary node
info: checking if node 1 is primary
info: current primary node is 1
info: ssh connection to host "192.168.99.200" succeeded
info: 0 pending archive files
info: replication lag on this standby is 0 seconds
notice: local node "node2" (id: 2) will be promoted to primary; current primary "node1" (id: 1) will be demoted to standby
notice: stopping current primary node "node1" (id: 1)
notice: issuing checkpoint
detail: executing server command "/opt/pgsql/bin/pg_ctl -d '/opt/data6000' -w -m fast stop"
info: checking for primary shutdown; 1 of 60 attempts ("shutdown_check_timeout")
info: checking for primary shutdown; 2 of 60 attempts ("shutdown_check_timeout")
notice: current primary has been cleanly shut down at location 0/14000028
notice: promoting standby to primary
detail: promoting server "node2" (id: 2) using pg_promote()
notice: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
info: standby promoted to primary after 1 second(s)
notice: standby promote successful
detail: server "node2" (id: 2) was successfully promoted to primary
info: local node 1 can attach to rejoin target node 2
detail: local node's recovery point: 0/14000028; rejoin target node's fork point: 0/140000a0
notice: setting node 1's upstream to node 2
warning: unable to ping "host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2"
detail: pqping() returned "pqping_no_response"
notice: starting server using "/opt/pgsql/bin/pg_ctl -w -d '/opt/data6000' start"
notice: node rejoin successful
detail: node 1 is now attached to node 2
notice: switchover was successful
detail: node "node2" is now primary and node "node1" is attached as standby
notice: standby switchover has completed successfully
--从日志可以看到
1.check主库为node1
2.ssh主库node1
3.主库node1检查归档文件和wal
4.主库node1触发checkpoint
5.fast关闭主库node1
6.check主库node1是否关闭
7.主库node1关闭clean后开始promte备库node2
8.节点node2调用pg_promote()函数提升备库为主库
9.等待60秒(promote_check_timeout) 以便promotion完成
10.节点node2完成promotion
11.node1满足rejoin到node2
12.node1执行pg_ctl启动服务,rejoin,attached node2
13.switchover完成
--查看集群状态
[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf cluster show
id | name | role | status | upstream | location | priority | timeline | connection string
---- ------- --------- ----------- ---------- ---------- ---------- ---------- ---------------------------------------------------------------------------
1 | node1 | standby | running | node2 | default | 100 | 4 | host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2
2 | node2 | primary | * running | | default | 100 | 4 | host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2
[postgres@ha1 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf standby switchover -u repmgr --verbose
notice: using provided configuration file "/opt/repmgr.conf"
warning: following problems with command line parameters detected:
database connection parameters not required when executing unknown action
notice: executing switchover on node "node1" (id: 1)
info: searching for primary node
info: checking if node 2 is primary
info: current primary node is 2
info: ssh connection to host "192.168.99.202" succeeded
info: 0 pending archive files
info: replication lag on this standby is 0 seconds
notice: local node "node1" (id: 1) will be promoted to primary; current primary "node2" (id: 2) will be demoted to standby
notice: stopping current primary node "node2" (id: 2)
notice: issuing checkpoint
detail: executing server command "/opt/pgsql/bin/pg_ctl -d '/opt/data6000' -w -m fast stop"
info: checking for primary shutdown; 1 of 60 attempts ("shutdown_check_timeout")
info: checking for primary shutdown; 2 of 60 attempts ("shutdown_check_timeout")
notice: current primary has been cleanly shut down at location 0/15000028
notice: promoting standby to primary
detail: promoting server "node1" (id: 1) using pg_promote()
notice: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
info: standby promoted to primary after 1 second(s)
notice: standby promote successful
detail: server "node1" (id: 1) was successfully promoted to primary
info: local node 2 can attach to rejoin target node 1
detail: local node's recovery point: 0/15000028; rejoin target node's fork point: 0/150000a0
notice: setting node 2's upstream to node 1
warning: unable to ping "host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2"
detail: pqping() returned "pqping_no_response"
notice: starting server using "/opt/pgsql/bin/pg_ctl -w -d '/opt/data6000' start"
notice: node rejoin successful
detail: node 2 is now attached to node 1
notice: switchover was successful
detail: node "node1" is now primary and node "node2" is attached as standby
notice: standby switchover has completed successfully
--从日志可以看到
1.check主库为node2
2.ssh主库node2
3.主库node2检查归档文件和wal
4.主库node2触发checkpoint
5.fast关闭主库node2
6.check主库node2是否关闭
7.主库node2关闭clean后开始promte备库node1
8.节点node1调用pg_promote()函数提升备库为主库
9.等待60秒(promote_check_timeout) 以便promotion完成
10.节点node1完成promotion
11.node2满足rejoin到node1
12.node2执行pg_ctl启动服务,rejoin,attached node1
13.switchover完成
--查看集群状态
[postgres@ha1 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf cluster show
id | name | role | status | upstream | location | priority | timeline | connection string
---- ------- --------- ----------- ---------- ---------- ---------- ---------- ---------------------------------------------------------------------------
1 | node1 | primary | * running | | default | 100 | 5 | host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2
2 | node2 | standby | running | node1 | default | 100 | 4 | host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2
经过两次switchover已经恢复为ha1为主库,ha2为备库。
error: unable to connect via ssh to host "192.168.99.200", user ""
--排查觉得可能是主机名重复引起,因为是虚拟机复制,只改了ip没改主机名
--改完主机名后重新配置ssh互信还是有问题,别的普通用户正常
--删除postgres用户和组
# userdel postgres
# groupdel postgres
# rm -rf /home/postgres/
--重新创建用户和组
# groupadd postgres -g 1001
# useradd postgres -g 1001 -u 1001
# passwd postgres
# su - postgres
--重新配置ssh免密
--1.每个节点都执行
$ ssh-keygen -t rsa
--2.每个节点分别执行
$ ssh-copy-id -i .ssh/id_rsa.pub 192.168.99.202
$ ssh-copy-id -i .ssh/id_rsa.pub 192.168.99.200
--再次测试ssh免密访问成功
[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf standby register --force
info: connecting to local node "node2" (id: 2)
error: unable to connect to local node "node2" (id: 2)
hint: to register a standby which is not running, additionally provide the primary connection parameters
--备库ha2数据库服务未启动,注册repmgr会报错,启动服务即可
「喜欢文章,快来给作者赞赏墨值吧」
【米乐app官网下载的版权声明】本文为墨天轮用户原创内容,转载时必须标注文章的来源(墨天轮),文章链接,文章作者等基本信息,否则作者和墨天轮有权追究责任。如果您发现墨天轮中有涉嫌抄袭或者侵权的内容,欢迎发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。