Clickhouse SQL 统计脚本
CREATE TABLE IF NOT EXISTS field_empty_stats (
id UInt32 AUTO_INCREMENT,
database String,
table_name String,
column_name String,
column_type String,
empty_count UInt64,
total_rows UInt64,
empty_percent Float64,
run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);
WITH tables AS (
SELECT
database,
name AS table_name
FROM system.tables
WHERE database = 'your_database'
AND table_name IN ('table1', 'table2')
)
SELECT
t.database,
t.table_name,
c.name AS column_name,
c.type AS column_type,
CASE
WHEN c.type LIKE 'String%' THEN
(SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '')
FROM {db:Identifier}.{table:Identifier})
WHEN c.type IN (
'UInt8','UInt16','UInt32','UInt64',
'Int8','Int16','Int32','Int64',
'Float32','Float64','Decimal'
) THEN
(SELECT sum(IFNULL({col:Identifier}, 0) = 0)
FROM {db:Identifier}.{table:Identifier})
ELSE
(SELECT sum({col:Identifier} IS NULL)
FROM {db:Identifier}.{table:Identifier})
END AS empty_count,
(SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows,
ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent
FROM system.columns c
JOIN tables t
ON c.database = t.database AND c.table = t.table_name
SETTINGS
allow_experimental_analyzer = 1,
format_template = 'SELECT \'{database}\', \'{table_name}\', \'{column_name}\', \'{column_type}\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'
C++ 控制台程序
#include <clickhouse/client.h>
#include <iostream>
#include <vector>
#include <ctime>
#include <iomanip>
#include <sstream>
using namespace clickhouse;
std::string currentDateTime() {
auto now = std::time(nullptr);
auto tm = *std::localtime(&now);
std::ostringstream oss;
oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S");
return oss.str();
}
int main(int argc, char* argv[]) {
if (argc < 3) {
std::cerr << "Usage: " << argv[0] << " <database> <table1> [table2 ...]\n";
return 1;
}
const std::string database = argv[1];
std::vector<std::string> tables;
for (int i = 2; i < argc; ++i) {
tables.push_back(argv[i]);
}
ClientOptions opts;
opts.SetHost("localhost");
opts.SetPort(9000);
opts.SetUser("default");
opts.SetPassword("");
Client client(opts);
std::ostringstream sql;
sql << "WITH tables AS ("
<< " SELECT database, name AS table_name "
<< " FROM system.tables "
<< " WHERE database = '" << database << "' "
<< " AND table_name IN (";
for (size_t i = 0; i < tables.size(); ++i) {
sql << "'" << tables[i] << "'";
if (i < tables.size() - 1) sql << ", ";
}
sql << ") )"
<< "INSERT INTO field_empty_stats "
<< "(database, table_name, column_name, column_type, empty_count, total_rows, empty_percent) "
<< "SELECT "
<< " t.database, "
<< " t.table_name, "
<< " c.name AS column_name, "
<< " c.type AS column_type, "
<< " CASE "
<< " WHEN c.type LIKE 'String%' THEN "
<< " (SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') "
<< " FROM {db:Identifier}.{table:Identifier}) "
<< " WHEN c.type IN ( "
<< " 'UInt8','UInt16','UInt32','UInt64', "
<< " 'Int8','Int16','Int32','Int64', "
<< " 'Float32','Float64','Decimal' "
<< " ) THEN "
<< " (SELECT sum(IFNULL({col:Identifier}, 0) = 0) "
<< " FROM {db:Identifier}.{table:Identifier}) "
<< " ELSE "
<< " (SELECT sum({col:Identifier} IS NULL) "
<< " FROM {db:Identifier}.{table:Identifier}) "
<< " END AS empty_count, "
<< " (SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows, "
<< " ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent "
<< "FROM system.columns c "
<< "JOIN tables t ON c.database = t.database AND c.table = t.table_name "
<< "SETTINGS "
<< " allow_experimental_analyzer = 1, "
<< " format_template = "
<< "'SELECT \\'{database}\\', \\'{table_name}\\', \\'{column_name}\\', "
<< "\\'{column_type}\\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'";
try {
client.Execute(sql.str());
std::cout << "Successfully analyzed " << tables.size()
<< " tables in database: " << database << "\n";
std::cout << "Results written to field_empty_stats at: "
<< currentDateTime() << std::endl;
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
return 0;
}
使用说明
- 创建结果表:
CREATE TABLE IF NOT EXISTS field_empty_stats (
id UInt32 AUTO_INCREMENT,
database String,
table_name String,
column_name String,
column_type String,
empty_count UInt64,
total_rows UInt64,
empty_percent Float64,
run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);
- 编译程序:
g++ -std=c++17 -o field_analyzer field_analyzer.cpp -lclickhouse-cpp-lib
- 运行程序:
./field_analyzer your_database table1 table2 table3
处理逻辑说明
字段类型 |
空值判定逻辑 |
统计内容 |
String |
TRIM(IFNULL(column, '')) = '' |
空字符串 |
数值类型 |
IFNULL(column, 0) = 0 |
零值 |
其他类型 |
column IS NULL |
NULL值 |
结果表结构
列名 |
类型 |
说明 |
id |
UInt32 |
自增ID |
database |
String |
数据库名称 |
table_name |
String |
表名称 |
column_name |
String |
字段名称 |
column_type |
String |
字段类型 |
empty_count |
UInt64 |
空值/零值/NULL记录数 |
total_rows |
UInt64 |
表总行数 |
empty_percent |
Float64 |
空值百分比(保留两位小数) |
run_time |
DateTime |
分析执行时间 |
注意事项
- 确保Clickhouse服务器运行在默认端口9000(或修改代码中的端口)
- 需要安装clickhouse-cpp客户端库
- 程序自动处理以下数据类型:
- 字符串类型:空字符串统计
- 数值类型:零值统计
- 其他类型:NULL值统计
- 结果表使用MergeTree引擎,按分析时间排序
- 百分比计算保留两位小数
这个解决方案实现了:
- 动态分析指定数据库的多个表
- 自动识别字段类型并应用相应的空值逻辑
- 结果写入包含自增ID和时间戳的统计表
- 命令行参数指定数据库和表名
- 完整的错误处理机制