参考链接:https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd/#troubleshooting-osds
OSD故障处理总结
在定位OSD故障之前,首先检查MON和网络。执行ceph health或ceph -s命令,如果发现MON有报错,应当去MON上定位问题。其次检查网络是否正常运行,因为OSD的性能极大程度地受到网络影响。在主机端检查丢包,在交换机端检查CRC错误。
获取OSDs的数据信息
要查看是否所有的OSDs都健康运行,请执行:
$ ceph osd stat
# 4个OSD,全部up,为健康状态
[root@node-1 ~]# ceph osd stat
4 osds: 4 up (since 12m), 4 in (since 3d); epoch: e6835
如果发现up和in状态的OSD数量与OSD总数不匹配,可进一步执行下列命令列出所有OSD的状态。
$ ceph osd tree
[root@node-1 ~]# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 0.03918 root default
-3 0.01959 host node-1
0 hdd 0.00980 osd.0 up 1.00000 1.00000
3 hdd 0.00980 osd.3 up 0.09999 1.00000
-5 0.00980 host node-2
1 hdd 0.00980 osd.1 up 1.00000 1.00000
-7 0.00980 host node-3
2 hdd 0.00980 osd.2 up 1.00000 1.00000
如果存在状态为down的OSD,说明可能是对应的OSD进程挂了,可尝试执行下列命令来重启该进程(@后面的数字为OSD的ID,此处以osd.1为例)。
systemctl restart ceph-osd@1
Ceph logs
如果没有特别修改的话,Ceph log 的默认路径为/var/log/ceph。
OSD的log也在此目录下。
[root@node-1 ~]# ls /var/log/ceph/
ceph.audit.log ceph-mon.0.log
ceph-client.admin.log ceph-client.foonew.log
ceph.log ceph-mon.1.log
ceph-mon.2.log ceph-mon.node-1.log
ceph-mds.10.log ceph-mds.1.log
ceph-mds.3.log ceph-mon.q.log
ceph-mds.cepfs10.log ceph-osd.0.log
ceph-mds.cephfs.log ceph-osd.3.log
ceph-mds.mds1.log ceph-volume.log
ceph-mds.mds2.log ceph-volume-systemd.log
ceph-mds.node-1.log ceph-mgr.node-1.log
可以使用下列命令,将OSD日志输出级别调高,方便在日志文件中看到更加详细的日志信息。
ceph daemon osd.0 config set debug_osd 20/20
admin socket
使用admin socket获取运行时信息。查看Ceph守护进程的所有sockets:
$ ls /var/run/ceph
[root@node-1 ~]# ls /var/run/ceph
ceph-mds.cephfs.asok ceph-mds.mds2.asok ceph-mon.node-1.asok ceph-osd.3.asok
ceph-mds.mds1.asok ceph-mgr.node-1.asok ceph-osd.0.asok
查看守护进程的sockets支持哪些命令及其用途:
$ ceph daemon {daemon-name/socket-file} help
[root@node-1 ~]# ceph daemon osd.0 help
{
"bluefs debug_inject_read_zeros": "Injects 8K zeros into next BlueFS read. Debug only.",
"bluestore allocator dump block": "dump allocator free regions",
"bluestore allocator dump bluefs-db": "dump allocator free regions",
"bluestore allocator fragmentation block": "give allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)",
"bluestore allocator fragmentation bluefs-db": "give allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)",
"bluestore allocator score block": "give score on allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)",
"bluestore allocator score bluefs-db": "give score on allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)",
"bluestore bluefs available": "Report available space for bluefs. If alloc_size set, make simulation.",
"bluestore bluefs stats": "Dump internal statistics for bluefs.",
"calc_objectstore_db_histogram": "Generate key value histogram of kvdb(rocksdb) which used by bluestore",
"compact": "Commpact object store's omap. WARNING: Compaction probably slows your requests",
"config diff": "dump diff of current config and default config",
"config diff get": "dump diff get <field>: dump diff of current and default config setting <field>",
"config get": "config get <field>: get the config value",
"config help": "get config setting schema and descriptions",
"config set": "config set <field> <val> [<val> ...]: set a config variable",
"config show": "dump current config settings",
"config unset": "config unset <field>: unset a config variable",
"dump_blacklist": "dump blacklisted clients and times",
"dump_blocked_ops": "show the blocked ops currently in flight",
"dump_historic_ops": "show recent ops",
"dump_historic_ops_by_duration": "show slowest recent ops, sorted by duration",
"dump_historic_slow_ops": "show slowest recent ops",
"dump_mempools": "get mempool stats",
"dump_objectstore_kv_stats": "print statistics of kvdb which used by bluestore",
"dump_op_pq_state": "dump op priority queue state",
"dump_ops_in_flight": "show the ops currently in flight",
"dump_osd_network": "Dump

本文详细介绍了Ceph OSD的故障排查步骤,包括检查MON和网络、查看OSD状态、分析日志、利用admin socket、监控文件系统空间及IO情况。此外,还提到了OSD性能问题的诊断,如网络问题、内存不足、慢请求和OSD抖动,并给出了相应的解决方案和调整建议。

被折叠的 条评论
为什么被折叠?



