When the application wrote data to the ES cluster, it received the following error:
ElasticsearchStatusException[
Elasticsearch exception [
type=cluster_block_exception, reason=index [esx_busin_sdx_index_test] blocked by:
[TOO_MANY_REQUESTS/12/disk usage exceeded flood-stage watermark,
index has read-only-allow-delete block
];
]
]
Because disk space is exhausted, Elasticsearch has put the index into read-only-allow-delete mode: it only accepts reads and index deletion. This block is applied automatically once a data node's disk usage crosses the flood-stage watermark (cluster.routing.allocation.disk.watermark.flood_stage, 95% by default).
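To confirm the thresholds actually in effect on this cluster, the watermark settings can be read back from the cluster settings API; a quick check, reusing the same esadmin credentials used throughout this runbook:
curl -su esadmin:adminPass123 "localhost:9200/_cluster/settings?include_defaults=true&flat_settings=true&pretty" | grep watermark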
Check disk space usage
Check the cluster health status and disk usage:
[root@eshost ~]# curl -u esadmin:adminPass123 -X GET "localhost:9200/_cluster/health?pretty"
{
"cluster_name" : "escluster",
"status" : "yellow",
"timed_out" : false,
"number_of_nodes" : 47,
"number_of_data_nodes" : 41,
"active_primary_shards" : 11721,
"active_shards" : 23296,
"relocating_shards" : 0,
"initializing_shards" : 0,
"unassigned_shards" : 95,
"delayed_unassigned_shards" : 0,
"number_of_pending_tasks" : 0,
"number_of_in_flight_fetch" : 0,
"task_max_waiting_in_queue_millis" : 0,
"active_shards_percent_as_number" : 99.59386088666581
}
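Note the yellow status and the 95 unassigned shards: once nodes cross the low disk watermark, Elasticsearch stops allocating new shards to them, so replicas pile up unassigned. If a precise reason is needed, the allocation-explain API reports why the first unassigned shard it finds cannot be placed:
curl -su esadmin:adminPass123 -X GET "localhost:9200/_cluster/allocation/explain?pretty"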
[root@eshost ~]# curl -su esadmin:adminPass123 -X GET "localhost:9200/_cluster/stats?pretty" | grep -A4 '"fs"'
(Use -s here: without it, curl prints its progress meter when piped. Note also that _cluster/stats reports filesystem totals under the fs key rather than any "disk" field, so grepping for "disk" returns nothing.)
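A more direct per-node view from the cluster's own perspective is the _cat/allocation API, which lists used and available disk plus the usage percentage for every data node; sorting by disk.percent puts the fullest nodes first:
curl -su esadmin:adminPass123 "localhost:9200/_cat/allocation?v&s=disk.percent:desc"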
Check disk usage on each node:
[root@eshost ~]# curl -u esadmin:adminPass123 -X GET "localhost:9200/_nodes/stats/fs?pretty"
[root@eshost ~]# ansible -i /home/xuser/hosts/hosts_es es_uat -m shell -a "df -Th | grep es"
22.23.55.85 | CHANGED | rc=0 >>
Filesystem Type Size Used Avail Use% Mounted on
/dev/mapper/esdata0-lv_esdata0 xfs 6.0T 5.9T 110G 99% /esdata0
/dev/mapper/esdata1-lv_esdata1 xfs 6.0T 5.9T 115G 99% /esdata1
22.23.55.25 | CHANGED | rc=0 >>
Filesystem Type Size Used Avail Use% Mounted on
/dev/mapper/VolGroup1-lv_es_node1 xfs 6.0T 5.9T 117G 99% /es_node1
/dev/mapper/VolGroup3-lv_es_node3 xfs 6.0T 5.8T 235G 97% /es_node3
/dev/mapper/VolGroup2-lv_es_node2 xfs 6.0T 5.8T 232G 97% /es_node2
...
...
Sure enough, the disks are nearly full. If the local disks cannot be expanded, the way out is to delete old data that is no longer needed.
Clean up old index data
List the indices by size in descending order to see how much disk each one occupies:
[root@eshost ~]# curl -u esadmin:adminPass123 -s "localhost:9200/_cat/indices?v&s=store.size:desc" | head -n 10
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
green open esx_infra_security_msbgfilelog_2024-10 IXEblLAlRlmxxxxxx 10 1 3526854010 0 3tb 1.5tb
green open esx_infra_security_msbgfilelog_2025-01 jczfgnyaRAWxxxxxx 10 1 3557913663 0 2.9tb 1.4tb
green open esx_infra_security_msbgfilelog_2025-03 WgobLQ_dSxyxxxxxx 10 1 3544630550 0 2.9tb 1.4tb
green open esx_infra_security_msbgfilelog_2024-12 mZFA8PSMQE6xxxxxx 10 1 3496172725 0 2.9tb 1.4tb
green open esx_infra_security_msbgfilelog_2025-05 RT40X4fJQRSxxxxxx 10 1 3438523516 0 2.8tb 1.4tb
green open esx_infra_security_msbgfilelog_2025-04 5O023SGLTSGxxxxxx 10 1 3385972439 0 2.8tb 1.4tb
green open esx_infra_security_msbgfilelog_2024-09 _mXc1NOjRryxxxxxx 10 1 3326446791 0 2.8tb 1.4tb
green open esx_infra_security_msbgfilelog_2024-11 yZv3UrPFTryxxxxxx 10 1 3279153784 0 2.8tb 1.4tb
green open esx_infra_security_msbgfilelog_2025-02 HEeQwDMuTbCxxxxxx 10 1 3204650774 0 2.6tb 1.3tb
Where:
store.size: total index size (primary shards plus replicas).
pri.store.size: size of the primary shards only.
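For machine-readable sizes that are easy to sum or compare, _cat/indices also accepts a bytes parameter to force a fixed unit; a variant of the command above, scoped to the monthly indices (the index pattern is illustrative):
curl -su esadmin:adminPass123 "localhost:9200/_cat/indices/esx_infra_security_msbgfilelog_*?v&bytes=gb&s=store.size:desc"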
Since the esx_infra_security_msbgfilelog_ index names carry a month suffix, data older than 10 months can be dropped by deleting the corresponding monthly indices. Echo the computed month strings first to confirm exactly which indices the DELETE calls will target:
echo $(date --date='12 months ago' +%Y-%m)
echo $(date --date='11 months ago' +%Y-%m)
echo $(date --date='10 months ago' +%Y-%m)
curl -u esadmin:adminPass123 -X DELETE "localhost:9200/esx_infra_security_msbgfilelog_$(date --date='12 months ago' +%Y-%m)"
curl -u esadmin:adminPass123 -X DELETE "localhost:9200/esx_infra_security_msbgfilelog_$(date --date='11 months ago' +%Y-%m)"
curl -u esadmin:adminPass123 -X DELETE "localhost:9200/esx_infra_security_msbgfilelog_$(date --date='10 months ago' +%Y-%m)"
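If more months need to be cleared, the same pattern generalizes to a small loop; a minimal sketch (the existence check via HTTP status code is an addition for safety, not part of the original procedure):
for m in 12 11 10; do
  idx="esx_infra_security_msbgfilelog_$(date --date="${m} months ago" +%Y-%m)"
  # A GET on the index returns HTTP 200 if it exists, 404 otherwise.
  if [ "$(curl -su esadmin:adminPass123 -o /dev/null -w '%{http_code}' "localhost:9200/${idx}")" = "200" ]; then
    curl -su esadmin:adminPass123 -X DELETE "localhost:9200/${idx}"
  fi
done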
Verify that the deletion succeeded:
[root@eshost ~]# curl -u esadmin:adminPass123 -s "localhost:9200/_cat/indices?v&s=store.size:desc" | head -n 10
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
green open esx_infra_security_msbgfilelog_2025-01 jczfgnyaRAWZxxxxx 10 1 3557913663 0 2.9tb 1.4tb
green open esx_infra_security_msbgfilelog_2025-03 WgobLQ_dSxypxxxxx 10 1 3544630550 0 2.9tb 1.4tb
green open esx_infra_security_msbgfilelog_2024-12 mZFA8PSMQE6Ixxxxx 10 1 3496172725 0 2.9tb 1.4tb
green open esx_infra_security_msbgfilelog_2025-05 RT40X4fJQRSYxxxxx 10 1 3438601248 0 2.8tb 1.4tb
green open esx_infra_security_msbgfilelog_2025-04 5O023SGLTSG-xxxxx 10 1 3385972439 0 2.8tb 1.4tb
green open esx_infra_security_msbgfilelog_2024-11 yZv3UrPFTry2xxxxx 10 1 3279153784 0 2.8tb 1.4tb
green open esx_infra_security_msbgfilelog_2025-02 HEeQwDMuTbChxxxxx 10 1 3204650774 0 2.6tb 1.3tb
green open esx_infra_server_linux_2025-05 HWfsGcr0TMyexxxxx 10 1 3124772748 0 2tb 1tb
green open esx_infra_server_linux_2024-12 jo2U-VyYRjusxxxxx 10 1 2985056711 0 1.3tb 680.3gb
[root@eshost ~]# curl -su esadmin:adminPass123 -X GET "localhost:9200/_cluster/health?pretty" | grep status
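One caveat: on Elasticsearch versions before 7.4, the read_only_allow_delete block is not released automatically after disk space is freed and must be cleared by hand (since 7.4 it is removed once usage drops below the high watermark). The setting can be reset across all indices like this:
curl -su esadmin:adminPass123 -X PUT "localhost:9200/_all/_settings" -H 'Content-Type: application/json' -d '{"index.blocks.read_only_allow_delete": null}'
Longer term, an ILM policy or a scheduled job running the deletes above will keep these monthly indices from filling the disks again.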