从11G开始,安装RAC已经变成了一个体力活儿,但是RAC安装完成后,如何保证系统的稳定运行,如何得到系统的性能,这个对后期在线系统的稳定运行影响巨大。
下面是总结了最近1年多来工程实施中的一些经验。
###################################################################################
## 0. 环境检查
## 1. IO性能测试
## 2. 网络性能测试
## 3. 高可用测试
## 4. HP-UX 硬件查看命令
## 5. 日志收集
## 备注:该文档基于HP-UX平台整理,平台不同时请参考平台对应的操作命令
###################################################################################
===================================================================================
## 0. 环境检查
===================================================================================
-----------------------------------------------------------------------------------
# 0.1 系统时间/时区
-----------------------------------------------------------------------------------
# date
# cat /etc/TIMEZONE
# cat /etc/default/tz
# echo $TZ
-----------------------------------------------------------------------------------
# 0.2 检测内核参数/异步IO/网络参数
# 异步IO文件权限正确值为: oracle/oinstall 660 101 0x000104 /dev/async
-----------------------------------------------------------------------------------
--HP
# kctune | sort
# ls -la /dev/async
# cat /etc/rc.config.d/nddconf
# netstat -inw
--Linux
# sysctl -a | sort
-----------------------------------------------------------------------------------
# 0.3 互信检查
-----------------------------------------------------------------------------------
$ more /etc/hosts | grep -Ev '^#|^$|127.0.0.1|vip|scan|:' | awk '{print "ssh " $2 " date;"}' > ping.sh
$ sh ./ping.sh
-----------------------------------------------------------------------------------
# 0.4 集群状态检查
-----------------------------------------------------------------------------------
# crsctl stat res -t
# olsnodes -n -i -s -t
# olsnodes -c
# oifcfg getif
-----------------------------------------------------------------------------------
# 0.5 检查实例运行状态
-----------------------------------------------------------------------------------
SQL>
set linesize 160;
select inst_id,instance_name, to_char(startup_time, 'yyyy-mm-dd hh34:mi:ss') START_TIME, status from gv$instance;
-----------------------------------------------------------------------------------
# 0.6 检查ASM DG容量
-----------------------------------------------------------------------------------
SQL>
select group_number,
name,
state,
type,
ROUND(total_mb / 1024, 2) TOTAL_GB,
ROUND(free_mb / 1024, 2) FREE_GB,
round((total_mb - free_mb) / (case total_mb
when 0 then
1
else
total_mb
end) * 100,
2) ratio
from v$asm_diskgroup;
===================================================================================
## 1. IO性能测试
===================================================================================
-----------------------------------------------------------------------------------
# 1.1 calibrate_io测试
-----------------------------------------------------------------------------------
SELECT D.NAME,
I.ASYNCH_IO
FROM V$DATAFILE D,
V$IOSTAT_FILE I
WHERE D.FILE# = I.FILE_NO
AND I.FILETYPE_NAME = 'Data File';
--Enabled both direct I/O and asynchronous I/O where possible.
ALTER SYSTEM SET filesystemio_options=setall SCOPE=SPFILE;
SET SERVEROUTPUT ON
DECLARE
l_latency PLS_INTEGER;
l_iops PLS_INTEGER;
l_mbps PLS_INTEGER;
BEGIN
DBMS_RESOURCE_MANAGER.CALIBRATE_IO (num_physical_disks => 26, --> 磁盘数量 select count(1) from v$asm_disk;
max_latency => 20, --> 最大延迟
max_iops => l_iops,
max_mbps => l_mbps,
actual_latency => l_latency);
DBMS_OUTPUT.put_line('Max IOPS = ' || l_iops);
DBMS_OUTPUT.put_line('Max MBPS = ' || l_mbps);
DBMS_OUTPUT.put_line('Latency = ' || l_latency);
END;
/
--监控IO及HBA卡的速率
# sar -H 2 10000
-----------------------------------------------------------------------------------
# 1.2 fio测试
-----------------------------------------------------------------------------------
===================================================================================
## 2. 网络性能测试
===================================================================================
-----------------------------------------------------------------------------------
# 2.1 netperf测试
-----------------------------------------------------------------------------------
cd /usr/contrib/bin/tools
--服务器端
# netserver -4 -L 192.168.30.76 -p 8000
--客户端
# netperf -H 192.168.30.76 -p 8000 -n 20 -l 60
--测试UDP发包
# netperf -t UDP_RR -H 192.168.30.76 -p 8000 -l 10 -- -r 1500 1500
# netperf -t UDP_RR -H 192.168.30.76 -p 8000 -l 10 -- -r 9000 9000
-----------------------------------------------------------------------------------
## 2.2 iperf测试,Linux中为iperf3
-----------------------------------------------------------------------------------
--服务器端
# iperf -s -p 5001 -i 2
--客户端
# iperf -c 192.168.30.76 -P 4 -t 30 -i 2 -p 5001
-----------------------------------------------------------------------------------
## 2.3 ftp速率测试
-----------------------------------------------------------------------------------
ftp> put "|dd if=/dev/zero bs=32k count=10000" /dev/null
-----------------------------------------------------------------------------------
# 2.4 MUT测试为9000 (tcpdump需要root权限)
-----------------------------------------------------------------------------------
--lan902 心跳网卡
--192.168.20.67 对端心跳网卡地址
--从该节点通过访问对方的心跳IP来建立tcp连接,随后看上面抓到的TCP包即可看到mss值。如果为8960,则说明mtu为9000.
--通过netperf或ping进行发包测试,ICMP 长度为 8976
cd /usr/contrib/bin/tools
# tcpdump -i lan902 ip host 192.168.20.67
# telnet 192.168.20.67 22
# netperf -H 192.168.20.67 -p 8000 -n 20 -l 60
# ping 192.168.20.67 9000
===================================================================================
## 3. 高可用测试
===================================================================================
-----------------------------------------------------------------------------------
# 3.1 存储链路测试
# 查看操作系统日志中链路disabled和endable状态,查看HBA卡数据流量信息
# 查看磁盘链路聚合情况
# 存储链路测试过程中,全程开启DBMS_RESOURCE_MANAGER.CALIBRATE_IO进行加压
-----------------------------------------------------------------------------------
--确认设备链路
# ioscan -m dsf /dev/rdisk/disk101
# tail -f /var/adm/syslog/syslog.log
# sar -H 2 10000
-----------------------------------------------------------------------------------
# 3.2 网卡/交换机测试
# 查看集群日志是否有网络中断情况,查看ASM告警日志 (grid用户)
# 查看网卡聚合情况,查看网卡日志,查看是否丢包
# 网络测试过程中,全程开启netperf发包进行加压
-----------------------------------------------------------------------------------
grid$ cd $ORACLE_HOME/log/`hostname`/; tail -f alert`hostname`.log
grid$ cd $ORACLE_BASE/diag/asm/+asm/$ORACLE_SID/trace; tail -f alert_$ORACLE_SID.log
# glance -> l
# nwmgr -S apa -c lan902 -v
# cd /var/adm; netfmt -f nettl.LOG000
# ping IP
===================================================================================
## 4. HP-UX 硬件查看命令
===================================================================================
-----------------------------------------------------------------------------------
## 4.1 查看设备
-----------------------------------------------------------------------------------
--FC设备
# ioscan -fnCfc
--网卡设备
# ioscan -fnClan
-----------------------------------------------------------------------------------
## 4.2 查看FC设备状态
-----------------------------------------------------------------------------------
# fcmsutil /dev/fcd1
-----------------------------------------------------------------------------------
## 4.3 查看磁盘多路径信息
-----------------------------------------------------------------------------------
# ioscan -m dsf /dev/rdisk/disk101
-----------------------------------------------------------------------------------
## 4.4 查看网卡信息
-----------------------------------------------------------------------------------
# lanscan -qv
# netstat -inw
-----------------------------------------------------------------------------------
## 4.5 查看网卡绑定信息
-----------------------------------------------------------------------------------
--网卡绑定信息
# nwmgr -S apa -c lan902 -v
--网卡优先级
# nwmgr -g -A all -c lan5 -S apa
===================================================================================
## 5. 日志收集
===================================================================================
-----------------------------------------------------------------------------------
## 5.1 ASM日志 (grid)
-----------------------------------------------------------------------------------
cd $ORACLE_BASE/diag/asm/+asm/$ORACLE_SID/trace; tail -f alert_$ORACLE_SID.log
-----------------------------------------------------------------------------------
## 5.2 集群日志 (grid)
-----------------------------------------------------------------------------------
cd $ORACLE_HOME/log/`hostname`/; tail -f alert`hostname`.log
-----------------------------------------------------------------------------------
## 5.3 数据库日志 (oracle)
-----------------------------------------------------------------------------------
cd $ORACLE_BASE/diag/rdbms/`echo ${ORACLE_SID%?}`/$ORACLE_SID/trace; tail -f alert_$ORACLE_SID.log
-----------------------------------------------------------------------------------
## 5.4 操作系统日志 (root)
-----------------------------------------------------------------------------------
tail -f /var/adm/syslog/syslog.log
-----------------------------------------------------------------------------------
## 5.5 网卡日志 (root)
-----------------------------------------------------------------------------------
cd /var/adm; netfmt -f nettl.LOG000