1. 问题描述
- 12c 集群 CRS 无法启动,启动命令执行几分钟后返回报错,日志中没有发现明确的报错信息。
[root@rac2 bin]# ./crsctl start crs
CRS-4124: Oracle High Availability Services startup failed.
CRS-4000: Command Start failed, or completed with errors.
2. 问题分析
- 集群 alert 日志,没有明确报错信息
2025-09-02 18:08:49.075 [CLSECHO(44939)]CRS-10132: Oracle High Availability Service was restarted at least 10 times within the last 60 seconds. Stop auto-restarting Oracle High Availability Service.
- 集群 crsctl 日志,没有明确报错信息
[root@rac2 trace]# cat crsctl_44421.trc
Trace file /u01/app/grid/diag/crs/rac2/crs/trace/crsctl_44421.trc
Oracle Database 12c Clusterware Release 12.2.0.1.0 - Production Copyright 1996, 2016 Oracle. All rights reserved.
2025-09-02 18:08:43.857*:kgfpm.c@1138: kgfpmInitPatchIter: npatches 0
2025-09-02 18:08:43.912 : CRSCTL:811044928: query_releasepatch: No patches have been applied
2025-09-02 18:10:43.988 :GIPCXCPT:811044928: gipcShutdownF: clscrs_term
CRFCLI:811044928: crfcli_init: status 0
2025-09-02 18:10:43.990 : CRFCLI:811044928: crfcli_datatransfer:crfm_init failed so trying to get data from CHM offline mode.
2025-09-02 18:10:44.032 :GIPCXCPT:811044928: gipcShutdownF: clscrs_term
- 集群进程没有启动
[root@rac2 trace]# ps -ef |grep d.bin
root 54600 37533 0 18:34 pts/2 00:00:00 grep --color=auto d.bin
- 检查 ohasd 进程:init.ohasd 进程是 2024 年启动的(集群自 2024 年关闭后没有再启动过)。尝试杀掉 23348 进程后,系统自动拉起了新的 init.ohasd run 进程。
[root@rac2 trace]# ps -ef |grep ohas
root 23348 1 0 2024 ? 11:57:32 /bin/sh /etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null
root 54359 37533 0 18:33 pts/2 00:00:00 grep --color=auto ohas
[root@rac2 trace]# kill -9 23348
[root@rac2 trace]# ps -ef |grep ohas
root 54485 1 0 18:33 ? 00:00:00 /bin/sh /etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null
root 54536 37533 0 18:33 pts/2 00:00:00 grep --color=auto ohas
- 再次执行 crsctl start crs,仍然报错 CRS-4124: Oracle High Availability Services startup failed.
[root@rac2 bin]# ./crsctl start crs
CRS-4124: Oracle High Availability Services startup failed.
CRS-4000: Command Start failed, or completed with errors.
- strace 跟踪 crsctl start crs 命令
ps -ef |grep crsctl
strace -p 44421
- strace 日志输出,频繁输出如下 connect 失败信息(ENOENT,即 socket 文件不存在):
connect(57, {sa_family=AF_LOCAL, sun_path="/var/tmp/.oracle/sOHASD_UI_SOCKET"}, 110) = -1 ENOENT (No such file or directory)
uname({sysname="Linux", nodename="rac2", ...}) = 0
socketpair(AF_LOCAL, SOCK_STREAM, 0, [60, 61]) = 0
ioctl(60, FIONBIO, [1]) = 0
fcntl(60, F_GETFD) = 0
fcntl(60, F_SETFD, FD_CLOEXEC) = 0
ioctl(61, FIONBIO, [1]) = 0
fcntl(61, F_GETFD) = 0
fcntl(61, F_SETFD, FD_CLOEXEC) = 0
socketpair(AF_LOCAL, SOCK_STREAM, 0, [62, 63]) = 0
ioctl(62, FIONBIO, [1]) = 0
fcntl(62, F_GETFD) = 0
fcntl(62, F_SETFD, FD_CLOEXEC) = 0
ioctl(63, FIONBIO, [1]) = 0
fcntl(63, F_GETFD) = 0
fcntl(63, F_SETFD, FD_CLOEXEC) = 0
socket(AF_LOCAL, SOCK_STREAM, 0) = 64
fcntl(64, F_GETFD) = 0
fcntl(64, F_SETFD, FD_CLOEXEC) = 0
ioctl(64, FIONBIO, [1]) = 0
sendto(60, "\4", 1, MSG_NOSIGNAL, NULL, 0) = 1
connect(64, {sa_family=AF_LOCAL, sun_path="/var/tmp/.oracle/sprocr_local_conn_0_PROL"}, 110) = -1 ENOENT (No such file or directory)
socketpair(AF_LOCAL, SOCK_STREAM, 0, [65, 66]) = 0
ioctl(65, FIONBIO, [1]) = 0
fcntl(65, F_GETFD) = 0
fcntl(65, F_SETFD, FD_CLOEXEC) = 0
ioctl(66, FIONBIO, [1]) = 0
fcntl(66, F_GETFD) = 0
fcntl(66, F_SETFD, FD_CLOEXEC) = 0
shutdown(64, SHUT_RDWR) = 0
close(64) = 0
times(NULL) = 3681111708
shutdown(65, SHUT_RDWR) = 0
close(65) = 0
shutdown(66, SHUT_RDWR) = 0
close(66) = 0
lseek(57, 6144, SEEK_SET) = 6144
read(57, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(57, 6656, SEEK_SET) = 6656
read(57, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(57, 7168, SEEK_SET) = 7168
read(57, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(57, 6144, SEEK_SET) = 6144
read(57, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(57, 6656, SEEK_SET) = 6656
read(57, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(57, 7168, SEEK_SET) = 7168
read(57, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
close(57) = 0
shutdown(60, SHUT_RDWR) = 0
close(60) = 0
shutdown(61, SHUT_RDWR) = 0
close(61) = 0
shutdown(62, SHUT_RDWR) = 0
close(62) = 0
shutdown(63, SHUT_RDWR) = 0
close(63) = 0
uname({sysname="Linux", nodename="rac2", ...}) = 0
socket(AF_LOCAL, SOCK_STREAM, 0) = 57
fcntl(57, F_GETFD) = 0
fcntl(57, F_SETFD, FD_CLOEXEC) = 0
ioctl(57, FIONBIO, [1]) = 0
connect(57, {sa_family=AF_LOCAL, sun_path="/var/tmp/.oracle/sOHASD_UI_SOCKET"}, 110) = -1 ENOENT (No such file or directory)
shutdown(57, SHUT_RDWR) = 0
close(57) = 0
times(NULL) = 3681111709
lseek(50, 143360, SEEK_SET) = 143360
read(50, "\t\0\37\22\0\0>\0 \22\0\0q\0!\22\0\0\244\0\"\22\0\0\313\0#\22\1\0\363\0"..., 512) = 512
lseek(50, 316928, SEEK_SET) = 316928
read(50, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(50, 317440, SEEK_SET) = 317440
read(50, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(50, 317952, SEEK_SET) = 317952
read(50, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(50, 143360, SEEK_SET) = 143360
read(50, "\t\0\37\22\0\0>\0 \22\0\0q\0!\22\0\0\244\0\"\22\0\0\313\0#\22\1\0\363\0"..., 512) = 512
lseek(50, 316928, SEEK_SET) = 316928
read(50, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(50, 317440, SEEK_SET) = 317440
read(50, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(50, 317952, SEEK_SET) = 317952
read(50, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
nanosleep({5, 0}, ^Cstrace: Process 49212 detached
- 检查发现节点 2 没有目录 /var/tmp/.oracle/。集群启动时需要在 /var/tmp/.oracle/ 下使用 socket 等临时文件(如 strace 输出中的 sOHASD_UI_SOCKET),目录不存在导致 connect 一直返回 ENOENT。
[root@rac2 trace]# cd /var/tmp
[root@rac2 tmp]# ls -la
总用量 4
drwxrwxrwt. 8 root root 4096 9月 21 2024 .
drwxr-xr-x. 20 root root 282 10月 13 2023 ..
drwxr-xr-x. 2 abrt abrt 6 10月 13 2023 abrt
- 而正常节点 1 存在目录/var/tmp/.oracle/,且目录下存在大量临时文件
[root@rac1 tmp]# ls -la
总用量 20
drwxrwxrwt. 45 root root 4096 9月 2 17:42 .
drwxr-xr-x. 20 root root 282 10月 12 2023 ..
drwxr-xr-x. 2 abrt abrt 6 10月 12 2023 abrt
drwxrwxrwt 2 root oinstall 8192 9月 2 18:15 .oracle
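- 节点 2 手动创建目录 /var/tmp/.oracle/(注意:下面是以 oracle 用户创建且权限为 777,与节点 1 的属主 root:oinstall、权限 drwxrwxrwt(1777)不一致;建议事后按节点 1 调整,例如以 root 执行 chown root:oinstall /var/tmp/.oracle && chmod 1777 /var/tmp/.oracle)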
[oracle@rac2:/var/tmp]$mkdir .oracle
[oracle@rac2:/var/tmp]$chmod 777 .oracle/
[oracle@rac2:/var/tmp]$ls -la
total 4
drwxrwxrwt. 9 root root 4096 Sep 2 18:38 .
drwxr-xr-x. 20 root root 282 Oct 13 2023 ..
drwxr-xr-x. 2 abrt abrt 6 Oct 13 2023 abrt
-rw-r--r-- 1 oracle oinstall 0 Sep 2 18:26 a.sql
drwxrwxrwx 2 oracle oinstall 6 Sep 2 18:38 .oracle
- 再次启动集群成功
[root@rac2 bin]# ./crsctl start crs
CRS-4123: Oracle High Availability Services has been started.
3. MOS 文档中与 /var/tmp/.oracle 相关的说明
/tmp/.oracle or /var/tmp/.oracle directories or their files (Doc ID 2614225.1)
不要删除.oracle 目录或者文件,这会影响集群正常运行
Bug 27530270 - OHASD Fails to Start if Directory /var/tmp/.oracle is Missing (Doc ID 27530270.8)
该文档给出的解决方案就是重建目录 /var/tmp/.oracle
4. 问题总结
- 节点 2 的 /var/tmp/.oracle 目录被删除,OHASD 无法使用该目录下的 socket 文件,导致集群无法启动;重建该目录后集群启动成功。oracle 和 grid 用户相关的文件及目录(包括 /tmp/.oracle、/var/tmp/.oracle)不建议删除。