postgresql高可用之repmgr两节点搭建及switchover操作 -m6米乐安卓版下载

多米爸比 2020-02-05

3376

环境
虚拟机centos7.6
ha1:192.168.99.200
ha2:192.168.99.202
postgresql 12.1
repmgr 5.0.0

$ su root
--创建系统用户和组
# groupadd postgres -g 1001
# useradd postgres -g 1001 -u 1001
# id postgres
uid=1001(postgres) gid=1001(postgres) groups=1001(postgres)
# cd /opt
--源文件解压目录
# mkdir -p /opt/postgresql-12.1
# chown postgres:postgres postgresql-12.1
--安装文件目录
# mkdir -p /opt/pg12
# chown -r postgres:postgres pg12
# chmod 0700 pg12
# ln -s /opt/pg12 /opt/pgsql
--数据目录
# mkdir -p /opt/data6000
# chown -r postgres:postgres data6000
# chmod 0700 data6000
 
--归档目录
# mkdir -p /opt/archive6000
# chown -r postgres:postgres archive6000
# chmod 0700 archive6000
# su postgres
$ wget https://ftp.postgresql.org/pub/source/v12.1/postgresql-12.1.tar.gz
$ chmod 750 postgresql-12.1.tar.gz
$ tar zxvf postgresql-12.1.tar.gz
$ cd postgresql-12.1/
--编译
$ ./configure --prefix=/opt/pg12 --with-pgport=6000
--安装
$ gmake world && gmake install-world
--配置开机自启动
$ su root
# cp /opt/postgresql-12.1/contrib/start-scripts/linux /etc/init.d/postgres-12
# chmod  x /etc/init.d/postgres-12
# chkconfig postgres-12 on
# vi /etc/init.d/postgres-12
#! /bin/sh
## edit from here
prefix=/opt/pgsql
pgdata="/opt/data6000"
pguser=postgres
## stop editing here
path=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
daemon="$prefix/bin/postmaster"
pgctl="$prefix/bin/pg_ctl"
set -e
# only start if we can find the postmaster.
test -x $daemon ||
{
	echo "$daemon not found"
	if [ "$1" = "stop" ]
	then exit 0
	else exit 5
	fi
}
# parse command line parameters.
case $1 in
  start)
	echo -n "starting postgresql: "
        su - $pguser -c "$pgctl -d '$pgdata' start >>/dev/null &"	
echo "ok"
	;;
  stop)
	echo -n "stopping postgresql: "
	su - $pguser -c "$pgctl stop -d '$pgdata' -s"
	echo "ok"
	;;
  restart)
	echo -n "restarting postgresql: "
    su - $pguser -c "$pgctl stop -d '$pgdata' -s"
    su - $pguser -c "$pgctl -d '$pgdata' start >>/dev/null &"	
       echo "ok"
	;;
  reload)
	echo -n "reload postgresql: "
	su - $pguser -c "$pgctl reload -d '$pgdata' -s"
	echo "ok"
	;;
  status)
	su - $pguser -c "$pgctl status -d '$pgdata'"
	;;
  *)
	# print help
	echo "usage: $0 {start|stop|restart|reload|status}" 1>&2
	exit 1
	;;
esac
exit 0
--删除源文件
# cd /opt
# rm -rf postgresql-12.1/
--初始化数据目录
# su postgres
$ /opt/pgsql/bin/initdb \
-d/opt/data6000 \
-eutf8 \
-upostgres \
-w
--服务启停
# service postgres-12 start | stop
$ /opt/pgsql/bin/pg_ctl -d /opt/data6000 start |stop

$ vi postgresql.conf
listen_addresses = '*'
port=6000
wal_level=replica
archive_mode=on
archive_command='cp %p /opt/archive6000/%f'
max_wal_senders=10
wal_keep_segments=512
hot_standby=on
primary_conninfo='host=192.168.99.202 port=6000 user=repuser'
logging_collector=on
log_destination=csvlog
log_directory='/opt/data6000/pg_log'
log_filename='pg_log_%y-%m-%d_%h%m%s.log'
log_file_mode=0600
log_truncate_on_rotation=on
log_rotation_age=1d
log_rotation_size=10mb
log_min_messages=warning
log_min_duration_statement=30s
log_checkpoints=on
log_connections=on
log_disconnections=on
log_duration=on
log_lock_waits=on
log_statement=ddl
--机器ha1(192.168.99.200)配置pg_hba.conf
$ vi pg_hba.conf
host all  all 192.168.99.1/24  md5
host replication  all  192.168.99.1/24 trust
$ /opt/pgsql/bin/psql -p6000 -h127.0.0.1 -upostgres
create user repuser replication login connection limit 5 encrypted password'123456';
create table t(id int,info text);
insert into t values(1,now()),(2,now()),(3,now());
--重启ha1(192.168.99.200)
# service postgres-12 restart
--机器ha2(192.168.99.202)使用pg_basebackup做基础备份
$ /opt/pgsql/bin/pg_basebackup -d /opt/data6000 -fp -xs -v -p -h192.168.99.200 -p6000 -urepuser
--如果报错请检查防火墙等配置，例如下面开放6000端口
# firewall-cmd --zone=public --add-port=6000/tcp --permanent
# firewall-cmd --reload
--机器ha2(192.168.99.202)修改postgresql.conf
--修改primary_conninfo信息
$ vi postgresql.conf
listen_addresses = '*'
port=6000
wal_level=replica
archive_mode=on
archive_command='cp %p /opt/archive6000/%f'
max_wal_senders=10
wal_keep_segments=512
hot_standby=on
primary_conninfo='host=192.168.99.200 port=6000 user=repuser'
logging_collector=on
log_destination=csvlog
log_directory='/opt/data6000/pg_log'
log_filename='pg_log_%y-%m-%d_%h%m%s.log'
log_file_mode=0600
log_truncate_on_rotation=on
log_rotation_age=1d
log_rotation_size=10mb
log_min_messages=warning
log_min_duration_statement=30s
log_checkpoints=on
log_connections=on
log_disconnections=on
log_duration=on
log_lock_waits=on
log_statement=ddl
--机器ha2(192.168.99.202)配置standby.signal
$ touch /opt/data6000/standby.signal
--启动ha2(192.168.99.202)
$ /opt/pgsql/bin/pg_ctl -d/opt/data6000 start &
--异步流复制搭建完成
主库：ha1(192.168.99.200)
备库：ha2(192.168.99.202)
--ntp时间同步
# yum install ntp ntpdate
# ntpdate cn.pool.ntp.org
# hwclock --systohc

$ /opt/pgsql/bin/psql -p6000 -upostgres postgres
create user repmgr superuser connection limit 10 password '123456' ;
create database repmgr owner repmgr;

local   repmgr        repmgr                              trust
host    repmgr        repmgr      127.0.0.1/32            trust
host    repmgr        repmgr      192.168.99.0/24          trust
local   replication   repmgr                              trust
host    replication   repmgr      127.0.0.1/32            trust
host    replication   repmgr      192.168.99.0/24          trust

# wget https://github.com/2ndquadrant/repmgr/archive/v5.0.0.tar.gz
# chmod 755 repmgr-5.0.0.tar.gz
# chown postgres:postgres repmgr-5.0.0.tar.gz
# tar zxvf repmgr-5.0.0.tar.gz
# cd /opt/repmgr-5.0.0
# ./configure pg_config=/opt/pgsql/bin/pg_config
# make && make install

--停备库：ha2(192.168.99.202)，并删除数据文件
# service postgres-12 stop
# rm -rf archive6000/*
# rm -rf data6000/*

--数据库访问免密
# mkdir /home/postgres
# chown -r postgres:postgres /home/postgres/
# chmod -r 0700 /home/postgres/
# usermod -d /home/postgres postgres
# su - postgres
$ chmod 0600 .pgpass
$ cat .pgpass 
192.168.99.200:6000:repmgr:repmgr:123456
192.168.99.202:6000:repmgr:repmgr:123456
--ssh免密步骤(操作系统用户postgres)
--1.每个节点都执行
$ ssh-keygen -t rsa
--2.每个节点分别执行
$ ssh-copy-id -i .ssh/id_rsa.pub postgres@192.168.99.202
$ ssh-copy-id -i .ssh/id_rsa.pub postgres@192.168.99.200
--测试ssh互通

--ha1(192.168.99.200)
# vi /opt/repmgr.conf
node_id=1
node_name='node1'
conninfo='host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2'
data_directory='/opt/data6000'
replication_user='repmgr'
replication_type='physical'
repmgr_bindir='/opt/pgsql/bin'
pg_bindir='/opt/pgsql/bin'
monitoring_history=yes
monitor_interval_secs=5
log_level='debug'
log_file='/opt/data6000/repmgr.log'
# chown -r postgres:postgres repmgr.conf
# chmod 0600 repmgr.conf
--ha2(192.168.99.202)测试repmgr是否能免密访问主库
$ /opt/pgsql/bin/psql -p6000 -h192.168.99.200 -urepmgr

--ha1(192.168.99.200)
$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf primary register --force
info: connecting to primary database...
notice: attempting to install extension "repmgr"
notice: "repmgr" extension successfully installed
notice: primary node record (id: 1) registered

--ha2(192.168.99.202)
# vi /opt/repmgr.conf
node_id=2
node_name='node2'
conninfo='host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2'
data_directory='/opt/data6000'
replication_user='repmgr'
replication_type='physical'
repmgr_bindir='/opt/pgsql/bin'
pg_bindir='/opt/pgsql/bin'
monitoring_history=yes
monitor_interval_secs=5
log_level='debug'
log_file='/opt/data6000/repmgr.log'
# chown -r postgres:postgres repmgr.conf
# chmod 0600 repmgr.conf
--check
$ /opt/pgsql/bin/repmgr -h 192.168.99.200 -p6000 -u repmgr -d repmgr -f /opt/repmgr.conf standby clone --dry-run
--clone
$ /opt/pgsql/bin/repmgr -h 192.168.99.200 -p6000 -u repmgr -d repmgr -f /opt/repmgr.conf standby clone
--ha2(192.168.99.202)修改配置文件，primary_conninfo里host修改ha2指向ha1
# vi /opt/data6000/postgresql.conf
primary_conninfo='host=192.168.99.200 port=6000 user=repuser'
--启动备库
# service postgres-12 start
--主库查看postgres进程
[postgres@ha1 ~]$  ps f -u postgres
  pid tty      stat   time command
 6155 pts/1    s      0:00 -bash
 6408 pts/1    r      0:00  \_ ps f -u postgres
 6337 ?        ss     0:00 /opt/pg12/bin/postgres -d /opt/data6000
 6338 ?        ss     0:00  \_ postgres: logger   
 6340 ?        ss     0:00  \_ postgres: checkpointer   
 6341 ?        ss     0:00  \_ postgres: background writer   
 6342 ?        ss     0:00  \_ postgres: walwriter   
 6343 ?        ss     0:00  \_ postgres: autovacuum launcher   
 6344 ?        ss     0:00  \_ postgres: archiver   last was 000000030000000000000012.00000028.backup
 6345 ?        ss     0:00  \_ postgres: stats collector   
 6346 ?        ss     0:00  \_ postgres: logical replication launcher  
 6404 ?        ss     0:00  \_ postgres: walsender repmgr 192.168.99.202(55371) streaming 0/13001a28
--备库查看postgres进程
[postgres@ha2 ~]$  ps f -u postgres
  pid tty      stat   time command
 6297 pts/1    s      0:00 -bash
 7040 pts/1    r      0:00  \_ ps f -u postgres
 3880 pts/0    s      0:00 -bash
 7030 ?        ss     0:00 /opt/pg12/bin/postgres -d /opt/data6000
 7031 ?        ss     0:00  \_ postgres: logger   
 7032 ?        ss     0:00  \_ postgres: startup   recovering 000000030000000000000013
 7033 ?        ss     0:00  \_ postgres: checkpointer   
 7034 ?        ss     0:00  \_ postgres: background writer   
 7035 ?        ss     0:00  \_ postgres: stats collector   
 7036 ?        ss     0:00  \_ postgres: walreceiver 
--可以看到主库walsender进程，备库walreceiver进程   
--主库查看
select * from pg_stat_replication;

--ha2(192.168.99.202)
[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf standby register --force
info: connecting to local node "node2" (id: 2)
info: connecting to primary database
info: standby registration complete
notice: standby node "node2" (id: 2) successfully registered

--ha1和ha2查看状态一致
[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf cluster show
 id | name  | role    | status    | upstream | location | priority | timeline | connection string                                                        
---- ------- --------- ----------- ---------- ---------- ---------- ---------- ---------------------------------------------------------------------------
 1  | node1 | primary | * running |          | default  | 100      | 1        | host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2
 2  | node2 | standby |   running | node1    | default  | 100      | 1        | host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2

[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf standby switchover -u repmgr --verbose
notice: using provided configuration file "/opt/repmgr.conf"
warning: following problems with command line parameters detected:
  database connection parameters not required when executing unknown action
notice: executing switchover on node "node2" (id: 2)
info: searching for primary node
info: checking if node 1 is primary
info: current primary node is 1
info: ssh connection to host "192.168.99.200" succeeded
info: 0 pending archive files
info: replication lag on this standby is 0 seconds
notice: local node "node2" (id: 2) will be promoted to primary; current primary "node1" (id: 1) will be demoted to standby
notice: stopping current primary node "node1" (id: 1)
notice: issuing checkpoint
detail: executing server command "/opt/pgsql/bin/pg_ctl  -d '/opt/data6000' -w -m fast stop"
info: checking for primary shutdown; 1 of 60 attempts ("shutdown_check_timeout")
info: checking for primary shutdown; 2 of 60 attempts ("shutdown_check_timeout")
notice: current primary has been cleanly shut down at location 0/14000028
notice: promoting standby to primary
detail: promoting server "node2" (id: 2) using pg_promote()
notice: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
info: standby promoted to primary after 1 second(s)
notice: standby promote successful
detail: server "node2" (id: 2) was successfully promoted to primary
info: local node 1 can attach to rejoin target node 2
detail: local node's recovery point: 0/14000028; rejoin target node's fork point: 0/140000a0
notice: setting node 1's upstream to node 2
warning: unable to ping "host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2"
detail: pqping() returned "pqping_no_response"
notice: starting server using "/opt/pgsql/bin/pg_ctl  -w -d '/opt/data6000' start"
notice: node rejoin successful
detail: node 1 is now attached to node 2
notice: switchover was successful
detail: node "node2" is now primary and node "node1" is attached as standby
notice: standby switchover has completed successfully
--从日志可以看到
1.check主库为node1
2.ssh主库node1
3.主库node1检查归档文件和wal
4.主库node1触发checkpoint
5.fast关闭主库node1
6.check主库node1是否关闭
7.主库node1关闭clean后开始promte备库node2
8.节点node2调用pg_promote()函数提升备库为主库
9.等待60秒(promote_check_timeout) 以便promotion完成
10.节点node2完成promotion
11.node1满足rejoin到node2
12.node1执行pg_ctl启动服务，rejoin，attached node2
13.switchover完成
--查看集群状态
[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf cluster show
 id | name  | role    | status    | upstream | location | priority | timeline | connection string                                                        
---- ------- --------- ----------- ---------- ---------- ---------- ---------- ---------------------------------------------------------------------------
 1  | node1 | standby |   running | node2    | default  | 100      | 4        | host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2
 2  | node2 | primary | * running |          | default  | 100      | 4        | host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2

[postgres@ha1 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf standby switchover -u repmgr --verbose
notice: using provided configuration file "/opt/repmgr.conf"
warning: following problems with command line parameters detected:
  database connection parameters not required when executing unknown action
notice: executing switchover on node "node1" (id: 1)
info: searching for primary node
info: checking if node 2 is primary
info: current primary node is 2
info: ssh connection to host "192.168.99.202" succeeded
info: 0 pending archive files
info: replication lag on this standby is 0 seconds
notice: local node "node1" (id: 1) will be promoted to primary; current primary "node2" (id: 2) will be demoted to standby
notice: stopping current primary node "node2" (id: 2)
notice: issuing checkpoint
detail: executing server command "/opt/pgsql/bin/pg_ctl  -d '/opt/data6000' -w -m fast stop"
info: checking for primary shutdown; 1 of 60 attempts ("shutdown_check_timeout")
info: checking for primary shutdown; 2 of 60 attempts ("shutdown_check_timeout")
notice: current primary has been cleanly shut down at location 0/15000028
notice: promoting standby to primary
detail: promoting server "node1" (id: 1) using pg_promote()
notice: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
info: standby promoted to primary after 1 second(s)
notice: standby promote successful
detail: server "node1" (id: 1) was successfully promoted to primary
info: local node 2 can attach to rejoin target node 1
detail: local node's recovery point: 0/15000028; rejoin target node's fork point: 0/150000a0
notice: setting node 2's upstream to node 1
warning: unable to ping "host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2"
detail: pqping() returned "pqping_no_response"
notice: starting server using "/opt/pgsql/bin/pg_ctl  -w -d '/opt/data6000' start"
notice: node rejoin successful
detail: node 2 is now attached to node 1
notice: switchover was successful
detail: node "node1" is now primary and node "node2" is attached as standby
notice: standby switchover has completed successfully
--从日志可以看到
1.check主库为node2
2.ssh主库node2
3.主库node2检查归档文件和wal
4.主库node2触发checkpoint
5.fast关闭主库node2
6.check主库node2是否关闭
7.主库node2关闭clean后开始promte备库node1
8.节点node1调用pg_promote()函数提升备库为主库
9.等待60秒(promote_check_timeout) 以便promotion完成
10.节点node1完成promotion
11.node2满足rejoin到node1
12.node2执行pg_ctl启动服务，rejoin，attached node1
13.switchover完成
--查看集群状态
[postgres@ha1 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf cluster show
 id | name  | role    | status    | upstream | location | priority | timeline | connection string                                                        
---- ------- --------- ----------- ---------- ---------- ---------- ---------- ---------------------------------------------------------------------------
 1  | node1 | primary | * running |          | default  | 100      | 5        | host=192.168.99.200 port=6000 user=repmgr dbname=repmgr connect_timeout=2
 2  | node2 | standby |   running | node1    | default  | 100      | 4        | host=192.168.99.202 port=6000 user=repmgr dbname=repmgr connect_timeout=2
经过两次switchover已经恢复为ha1为主库，ha2为备库。

error: unable to connect via ssh to host "192.168.99.200", user ""
--排查觉得可能是主机名重复引起，因为是虚拟机复制，只改了ip没改主机名
--改完主机名后重新配置ssh互信还是有问题，别的普通用户正常
--删除postgres用户和组
# userdel postgres
# groupdel postgres
# rm -rf /home/postgres/
--重新创建用户和组
# groupadd postgres -g 1001
# useradd postgres -g 1001 -u 1001
# passwd postgres
# su - postgres
--重新配置ssh免密
--1.每个节点都执行
$ ssh-keygen -t rsa
--2.每个节点分别执行
$ ssh-copy-id -i .ssh/id_rsa.pub 192.168.99.202
$ ssh-copy-id -i .ssh/id_rsa.pub 192.168.99.200
--再次测试ssh免密访问成功

[postgres@ha2 ~]$ /opt/pgsql/bin/repmgr -f /opt/repmgr.conf standby register --force
info: connecting to local node "node2" (id: 2)
error: unable to connect to local node "node2" (id: 2)
hint: to register a standby which is not running, additionally provide the primary connection parameters
--备库ha2数据库服务未启动，注册repmgr会报错，启动服务即可

postgresql

「喜欢文章，快来给作者赞赏墨值吧」

文章被以下合辑收录

墨天轮最受欢迎的技术文档-高可用架构篇（共124篇）

数据库高可用如何实现？这份合辑你或许可以用到！本文为大家整理了墨天轮社区上一些受欢迎的数据库高可用架构实践主题相关文档，包含oracle、postgresql、mysql三类数据库，欢迎大家下载查看🌹

postgresql工作实践（共101篇）

关注

postgresql高可用之repmgr两节点搭建及switchover操作 -m6米乐安卓版下载

文章被以下合辑收录

评论