1、数据库集群检查
节点1
[root@dbrac1 backup]# /home/app/11.2.0/grid/bin/crsctl check crs
crs-4638: oracle high availability services is online
crs-4535: cannot communicate with cluster ready services
crs-4529: cluster synchronization services is online
crs-4534: cannot communicate with event manager
crs-4535:无法与集群就绪服务通信
crs-4534:无法与事件管理器通信
2、数据库状态检查
节点2
[root@dbrac2 dev]# /home/app/11.2.0/grid/bin/crsctl status res -t
--------------------------------------------------------------------------------
name target state server state_details
--------------------------------------------------------------------------------
local resources
--------------------------------------------------------------------------------
ora.arch.dg
online online dbrac2
ora.data.dg
online online dbrac2
ora.listener.lsnr
online online dbrac2
ora.ocr.dg
online online dbrac2
ora.redo.dg
online online dbrac2
ora.redo1.dg
online online dbrac2
ora.asm
online online dbrac2 started
ora.net1.network
online online dbrac2
ora.ons
online online dbrac2
--------------------------------------------------------------------------------
cluster resources
--------------------------------------------------------------------------------
ora.listener_scan1.lsnr
1 online online dbrac2
ora.cvu
1 online online dbrac2
ora.oc4j
1 online online dbrac2
ora.orcl.db
1 online online dbrac2 open
2 online offline
ora.dbrac1.vip
1 online intermediate dbrac2 failed over
ora.dbrac2.vip
1 online online dbrac2
ora.scan1.vip
1 online online dbrac2
数据库集群节点1无法打开,且重启无效
首先肯定要去检查报警日志的,查哪些?
crs-4535: cannot communicate with cluster ready services
crs-4534: cannot communicate with event manager
这些是集群日志
[grid@dbrac1 ~]$ cd /home/app/11.2.0/grid/log/dbrac1/
[grid@dbrac1 dbrac1]$ ls -lrt
total 156
drwxr-x--- 2 root oinstall 6 nov 27 2020 gnsd
drwxr-x--- 2 grid oinstall 6 nov 27 2020 srvm
drwxr-x--- 2 grid oinstall 6 nov 27 2020 diskmon
drwxr-x--- 4 grid oinstall 34 nov 27 2020 cvu
drwxr-xr-x 2 root oinstall 6 nov 27 2020 acfssec
drwxr-x--- 2 grid oinstall 6 nov 27 2020 acfsrepl
drwxr-x--- 2 grid oinstall 6 nov 27 2020 acfslog
drwxrwxr-t 4 root oinstall 31 nov 27 2020 agent
drwxr-x--- 2 grid oinstall 6 nov 27 2020 admin
drwxr-x--- 2 root oinstall 6 nov 27 2020 acfsreplroot
drwxr-x--- 2 grid oinstall 43 nov 27 2020 mdnsd
drwxr-x--- 2 root oinstall 47 nov 27 2020 crfmond
drwxr-x--- 2 root oinstall 47 nov 27 2020 crflogd
drwxr-x--- 2 grid oinstall 41 nov 27 2020 evmd
drwxrwxr-t 5 grid oinstall 88 dec 17 2020 racg
drwxr-x--- 2 root oinstall 201 may 11 21:33 crsd
drwxr-x--- 2 root oinstall 213 may 24 09:21 ohasd
drwxr-x--- 2 grid oinstall 212 oct 13 06:11 cssd
drwxr-x--- 2 root oinstall 224 oct 19 13:27 ctssd
drwxrwxrwt 2 grid oinstall 4096 oct 19 14:16 client
drwxr-x--- 2 grid oinstall 65 oct 19 14:50 gpnpd
-rw-rw-r-- 1 grid oinstall 151975 oct 19 15:51 alertqhxsdbrac1.log
drwxr-x--- 2 grid oinstall 213 oct 19 16:08 gipcd
这些是数据库和asm日志(与此次问题无关)
首先比对下异常的节点和正常的节点2启动的进程
节点1
[root@dbrac1 dev]# ps -ef| grep d.bin
root 330721 1 4 14:50 ? 00:00:01 /home/app/11.2.0/grid/bin/ohasd.bin reboot
grid 330909 1 0 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/oraagent.bin
grid 330920 1 0 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/mdnsd.bin
grid 330950 1 1 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/gpnpd.bin
root 330989 1 1 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/orarootagent.bin
grid 330992 1 2 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/gipcd.bin
root 331006 1 9 14:50 ? 00:00:02 /home/app/11.2.0/grid/bin/osysmond.bin
root 331029 1 0 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/cssdmonitor
root 331047 1 0 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/cssdagent
grid 331076 1 2 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/ocssd.bin
root 331223 1 1 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/ologgerd -m dbrac2 -r -d /home/app/11.2.0/grid/crf/db/dbrac1
root 331481 1 2 14:51 ? 00:00:00 /home/app/11.2.0/grid/bin/octssd.bin reboot
root 331551 274510 0 14:51 pts/2 00:00:00 grep --color=auto d.bin
[root@qhxsdbrac1 dev]# ps -ef| grep d.bin | wc -l
14
节点2
[root@qhxsdbrac2 dev]# ps -ef| grep d.bin
root 76375 395109 0 16:23 pts/3 00:00:00 grep --color=auto d.bin
root 320436 1 0 5月28 ? 08:07:30 /home/app/11.2.0/grid/bin/ologgerd -m qhxsdbrac1 -r -d /home/app/11.2.0/grid/crf/db/dbrac2
grid 339608 1 0 2021 ? 01:28:23 /home/app/11.2.0/grid/bin/tnslsnr listener -inherit
grid 340795 1 0 2021 ? 04:12:04 /home/app/11.2.0/grid/bin/scriptagent.bin
grid 340818 1 0 2021 ? 01:44:17 /home/app/11.2.0/grid/bin/tnslsnr listener_scan1 -inherit
root 420308 1 0 2020 ? 4-23:24:52 /home/app/11.2.0/grid/bin/ohasd.bin reboot
grid 420430 1 0 2020 ? 3-11:46:22 /home/app/11.2.0/grid/bin/oraagent.bin
grid 420441 1 0 2020 ? 01:01:51 /home/app/11.2.0/grid/bin/mdnsd.bin
grid 420452 1 0 2020 ? 13:43:44 /home/app/11.2.0/grid/bin/gpnpd.bin
root 420462 1 0 2020 ? 4-17:22:04 /home/app/11.2.0/grid/bin/orarootagent.bin
grid 420465 1 0 2020 ? 4-22:02:03 /home/app/11.2.0/grid/bin/gipcd.bin
root 420478 1 13 2020 ? 89-13:54:28 /home/app/11.2.0/grid/bin/osysmond.bin
root 420491 1 0 2020 ? 23:50:01 /home/app/11.2.0/grid/bin/cssdmonitor
root 420508 1 0 2020 ? 23:56:28 /home/app/11.2.0/grid/bin/cssdagent
grid 420520 1 0 2020 ? 4-09:00:12 /home/app/11.2.0/grid/bin/ocssd.bin
root 420617 1 0 2020 ? 3-02:27:49 /home/app/11.2.0/grid/bin/octssd.bin reboot
grid 420644 1 0 2020 ? 3-01:06:44 /home/app/11.2.0/grid/bin/evmd.bin
root 420737 1 1 2020 ? 7-07:53:37 /home/app/11.2.0/grid/bin/crsd.bin reboot
grid 420813 420644 0 2020 ? 00:00:00 /home/app/11.2.0/grid/bin/evmlogger.bin -o /home/app/11.2.0/grid/evm/log/evmlogger.info -l /home/app/11.2.0/grid/evm/log/evmlogger.log
grid 420852 1 0 2020 ? 2-02:15:58 /home/app/11.2.0/grid/bin/oraagent.bin
root 420856 1 0 2020 ? 5-22:49:01 /home/app/11.2.0/grid/bin/orarootagent.bin
oracle 421019 1 0 2020 ? 4-01:21:57 /home/app/11.2.0/grid/bin/oraagent.bin
[root@qhxsdbrac2 dev]# ps -ef| grep d.bin | wc -l
22
比对以后,节点1缺少以下进程,其中高亮的进程需要优先观察
grid 339608 1 0 2021 ? 01:28:23 /home/app/11.2.0/grid/bin/tnslsnr listener -inherit
grid 340795 1 0 2021 ? 04:12:04 /home/app/11.2.0/grid/bin/scriptagent.bin
grid 340818 1 0 2021 ? 01:44:17 /home/app/11.2.0/grid/bin/tnslsnr listener_scan1 -inherit
grid 420644 1 0 2020 ? 3-01:06:44 /home/app/11.2.0/grid/bin/evmd.bin
root 420737 1 1 2020 ? 7-07:53:37 /home/app/11.2.0/grid/bin/crsd.bin reboot
grid 420813 420644 0 2020 ? 00:00:00 /home/app/11.2.0/grid/bin/evmlogger.bin -o /home/app/11.2.0/grid/evm/log/evmlogger.info -l /home/app/11.2.0/grid/evm/log/evmlogger.log
先看crs,发现几个疑点
第一,最新的报警日志居然是2022年5月28号,怀疑是服务器时间出现了问题,结果是正常的,说明五月份宕机,运行了5个月都没人知道,nb!
第二,不能打开ocr文件我可以理解,但是为什么登录asm会被拒绝(这个当时没多想)
2022-05-28 07:47:37.580: [ crsmain][1032308544] checking the ocr device
2022-05-28 07:47:37.580: [ crsmain][932718336] policy engine is not initialized yet!
2022-05-28 07:47:37.581: [ crsmain][1032308544] sync-up with ocr
2022-05-28 07:47:37.581: [ crsmain][1032308544] connecting to the css daemon
2022-05-28 07:47:37.581: [ crsmain][1032308544] getting local node number
2022-05-28 07:47:37.582: [ crsmain][1032308544] initializing ocr
[ clwal][1032308544]clsw_initialize: olr initlevel [70000]
2022-05-28 07:47:40.853: [ ocrasm][1032308544]proprasmo: error in open/create file in dg [ocr]
[ ocrasm][1032308544]slos : slos: cat=7, opn=kgfoal06, dep=1017, loc=kgfokge
2022-05-28 07:47:40.853: [ ocrasm][1032308544]asm error stack : ora-01017: invalid username/password; logon denied
2022-05-28 07:47:41.909: [ ocrasm][1032308544]proprasmo: kgfocheckmount returned [7]
2022-05-28 07:47:41.909: [ ocrasm][1032308544]proprasmo: the asm instance is down
2022-05-28 07:47:41.911: [ ocrraw][1032308544]proprioo: failed to open [ ocr]. returned proprasmo() with [26]. marking location as unavailable.
2022-05-28 07:47:41.911: [ ocrraw][1032308544]proprioo: no ocr/olr devices are usable
2022-05-28 07:47:41.911: [ ocrasm][1032308544]proprasmcl: asmhandle is null
2022-05-28 07:47:41.911: [ gipc][1032308544] gipccheckinitialization: possible incompatible non-threaded init from [prom.c : 690], original from [clsss.c : 5343]
2022-05-28 07:47:41.913: [ default][1032308544]clsvactversion:4: retrieving active version from local storage.
2022-05-28 07:47:41.916: [ ocrraw][1032308544]proprrepauto: the local ocr configuration matches with the configuration published by ocr cache writer. no repair required.
2022-05-28 07:47:41.917: [ ocrraw][1032308544]proprinit: could not open raw device
2022-05-28 07:47:41.917: [ ocrasm][1032308544]proprasmcl: asmhandle is null
2022-05-28 07:47:41.919: [ ocrapi][1032308544]a_init:16!: backend init unsuccessful : [26]
2022-05-28 07:47:41.919: [ crsocr][1032308544] ocr context init failure. error: proc-26: error while accessing the physical storage
ora-01017: invalid username/password; logon denied
2022-05-28 07:47:41.919: [ crsd][1032308544] created alert : (:crsd00111:) : could not init ocr, error: proc-26: error while accessing the physical storage
ora-01017: invalid username/password; logon denied
2022-05-28 07:47:41.919: [ crsd][1032308544][panic] crsd exiting: could not init ocr, code: 26
2022-05-28 07:47:41.919: [ crsd][1032308544] done.
再看evm,关键点
第一,和crs报警日志差不多
第二、暂时没想法,去看看数据库和asm的日志
2022-05-28 07:47:32.571: [ crsmain][690448192] initializing ocr
[ clwal][690448192]clsw_initialize: olr initlevel [70000]
2022-05-28 07:47:35.365: [ ocrasm][690448192]proprasmo: error in open/create file in dg [ocr]
[ ocrasm][690448192]slos : slos: cat=7, opn=kgfoal06, dep=1017, loc=kgfokge
2022-05-28 07:47:35.365: [ ocrasm][690448192]asm error stack : ora-01017: invalid username/password; logon denied
2022-05-28 07:47:36.423: [ ocrasm][690448192]proprasmo: kgfocheckmount returned [7]
2022-05-28 07:47:36.423: [ ocrasm][690448192]proprasmo: the asm instance is down
2022-05-28 07:47:36.424: [ ocrraw][690448192]proprioo: failed to open [ ocr]. returned proprasmo() with [26]. marking location as unavailable.
2022-05-28 07:47:36.424: [ ocrraw][690448192]proprioo: no ocr/olr devices are usable
2022-05-28 07:47:36.424: [ ocrasm][690448192]proprasmcl: asmhandle is null
2022-05-28 07:47:36.425: [ gipc][690448192] gipccheckinitialization: possible incompatible non-threaded init from [prom.c : 690], original from [clsss.c : 5343]
2022-05-28 07:47:36.426: [ default][690448192]clsvactversion:4: retrieving active version from local storage.
2022-05-28 07:47:36.430: [ ocrraw][690448192]proprrepauto: the local ocr configuration matches with the configuration published by ocr cache writer. no repair required.
2022-05-28 07:47:36.431: [ ocrraw][690448192]proprinit: could not open raw device
2022-05-28 07:47:36.431: [ ocrasm][690448192]proprasmcl: asmhandle is null
2022-05-28 07:47:36.433: [ ocrapi][690448192]a_init:16!: backend init unsuccessful : [26]
2022-05-28 07:47:36.434: [ crsocr][690448192] ocr context init failure. error: proc-26: error while accessing the physical storage
ora-01017: invalid username/password; logon denied
2022-05-28 07:47:36.434: [ crsd][690448192] created alert : (:crsd00111:) : could not init ocr, error: proc-26: error while accessing the physical storage
ora-01017: invalid username/password; logon denied
2022-05-28 07:47:36.434: [ crsd][690448192][panic] crsd exiting: could not init ocr, code: 26
2022-05-28 07:47:36.434: [ crsd][690448192] done.
接着看asm集群日志注意点:
第一,高亮部分显示直连失败、不能连接、拒绝登录
第二,真没注意这个错误,陷入停滞,去看看磁盘的属主属组?
sat may 28 07:16:54 2022
note: client exited [266185]
warning: asm communication error: op 0 state 0x0 (15055)
error: direct connection failure with asm
note: deferred communication with asm instance
errors in file /home/app/grid/diag/asm/+asm/+asm1/trace/+asm1_ora_266205.trc:
ora-15055: unable to connect to asm instance
ora-01017: invalid username/password; logon denied
note: deferred map free for map id 2
tue oct 18 19:23:17 2022
error 29701: unexpected return code 6 from the cluster synchronization service
errors in file /home/app/grid/diag/asm/+asm/+asm1/trace/+asm1_lmon_266127.trc:
ora-29701: unable to connect to cluster synchronization service
tue oct 18 19:23:17 2022
system state dump requested by (instance=1, osid=266127 (lmon)), summary=[abnormal instance termination].
lmon (ospid: 266127): terminating the instance due to error 29701
system state dumped to trace file /home/app/grid/diag/asm/+asm/+asm1/trace/+asm1_diag_266121_20221018192317.trc
dumping diagnostic data in directory=[cdmp_20221018192317], requested by (instance=1, osid=266127 (lmon)), summary=[abnormal instance termination].
instance terminated by lmon, pid = 266127
节点1、2的属主属组都对
[root@dbrac1 dev]# ls -lrt /dev/asm*
brw-rw---- 1 grid asmadmin 65, 128 10月 19 10:18 /dev/asm-ocr2
brw-rw---- 1 grid asmadmin 65, 0 10月 19 10:18 /dev/asm-arch1
brw-rw---- 1 grid asmadmin 65, 32 10月 19 10:18 /dev/asm-arch2
brw-rw---- 1 grid asmadmin 65, 64 10月 19 14:51 /dev/asm-arch3
brw-rw---- 1 grid asmadmin 65, 112 10月 19 14:51 /dev/asm-ocr3
brw-rw---- 1 grid asmadmin 8, 240 10月 19 17:00 /dev/asm-data3
brw-rw---- 1 grid asmadmin 8, 64 10月 19 17:00 /dev/asm-data5
brw-rw---- 1 grid asmadmin 8, 80 10月 19 17:00 /dev/asm-data7
brw-rw---- 1 grid asmadmin 8, 48 10月 19 17:00 /dev/asm-data6
brw-rw---- 1 grid asmadmin 8, 96 10月 19 17:00 /dev/asm-data8
brw-rw---- 1 grid asmadmin 8, 16 10月 19 17:00 /dev/asm-data9
brw-rw---- 1 grid asmadmin 8, 32 10月 19 17:00 /dev/asm-data4
brw-rw---- 1 grid asmadmin 8, 144 10月 19 17:00 /dev/asm-data12
brw-rw---- 1 grid asmadmin 8, 128 10月 19 17:00 /dev/asm-data10
brw-rw---- 1 grid asmadmin 8, 192 10月 19 17:00 /dev/asm-data1
brw-rw---- 1 grid asmadmin 65, 224 10月 19 17:00 /dev/asm-redo2
brw-rw---- 1 grid asmadmin 65, 208 10月 19 17:00 /dev/asm-redo1
brw-rw---- 1 grid asmadmin 8, 208 10月 19 17:00 /dev/asm-data2
brw-rw---- 1 grid asmadmin 8, 176 10月 19 17:00 /dev/asm-data13
brw-rw---- 1 grid asmadmin 8, 160 10月 19 17:00 /dev/asm-data11
brw-rw---- 1 grid asmadmin 65, 176 10月 19 17:00 /dev/asm-ocr1
思维停滞!请求外援
1、sqlplus / as sysasm 登录失败,从这点入手
2、不用crsctl命令,用专用的srvctl命令
第一个思路:针对拒绝登录的问题,sqlnet.ora 中的配置可以限制登录认证方式
节点1
[oracle@dbrac1 ~]$ vim /home/app/11.2.0/grid/network/admin/sqlnet.ora
# sqlnet.ora.qhxsdbrac1 network configuration file: /home/app/11.2.0/grid/network/admin/sqlnet.ora.qhxsdbrac1
# generated by oracle configuration tools.
names.directory_path= (tnsnames, ezconnect)
sqlnet.authentication_services=(none)
adr_base = /home/app/grid
节点2
[root@dbrac2 dev]# vim /home/app/11.2.0/grid/network/admin/sqlnet.ora
# sqlnet.ora.qhxsdbrac2 network configuration file: /home/app/11.2.0/grid/network/admin/sqlnet.ora.qhxsdbrac2
# generated by oracle configuration tools.
names.directory_path= (tnsnames, ezconnect)
adr_base = /home/app/grid
经过对比,发现节点1的sqlnet.ora文件多一行 sqlnet.authentication_services=(none)
百度到的解释是:if sqlnet.authentication_services=none then a valid username and password is needed to connect to asm instance.
将其注释掉后集群即正常启动,没做其他任何操作
[oracle@dbrac1 ~]$ vim /home/app/11.2.0/grid/network/admin/sqlnet.ora
# sqlnet.ora.dbrac1 network configuration file: /home/app/11.2.0/grid/network/admin/sqlnet.ora.dbrac1
# generated by oracle configuration tools.
names.directory_path= (tnsnames, ezconnect)
#sqlnet.authentication_services=(none)
adr_base = /home/app/grid
2、不用crsctl命令,用专用的srvctl命令(方便追踪日志)
两者有何区别?下回再验证吧
~