Linux下GCC和C++实现统计Clickhouse数据仓库指定表中各字段的空值、空字符串或零值比例

发布于:2025-06-09 ⋅ 阅读:(18) ⋅ 点赞:(0)

Clickhouse SQL 统计脚本

/* 创建结果表(如果不存在) */
CREATE TABLE IF NOT EXISTS field_empty_stats (
    id UInt32 AUTO_INCREMENT,
    database String,
    table_name String,
    column_name String,
    column_type String,
    empty_count UInt64,
    total_rows UInt64,
    empty_percent Float64,
    run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);

/* 字段空值统计查询 */
WITH tables AS (
    SELECT 
        database,
        name AS table_name
    FROM system.tables 
    WHERE database = 'your_database'
      AND table_name IN ('table1', 'table2')  -- 替换为实际表名
)
SELECT 
    t.database,
    t.table_name,
    c.name AS column_name,
    c.type AS column_type,
    CASE 
        WHEN c.type LIKE 'String%' THEN
            (SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') 
            FROM {db:Identifier}.{table:Identifier})
        WHEN c.type IN (
            'UInt8','UInt16','UInt32','UInt64',
            'Int8','Int16','Int32','Int64',
            'Float32','Float64','Decimal'
        ) THEN
            (SELECT sum(IFNULL({col:Identifier}, 0) = 0) 
            FROM {db:Identifier}.{table:Identifier})
        ELSE 
            (SELECT sum({col:Identifier} IS NULL) 
            FROM {db:Identifier}.{table:Identifier})
    END AS empty_count,
    (SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows,
    ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent
FROM system.columns c
JOIN tables t 
    ON c.database = t.database AND c.table = t.table_name
SETTINGS 
    allow_experimental_analyzer = 1,
    format_template = 'SELECT \'{database}\', \'{table_name}\', \'{column_name}\', \'{column_type}\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'

C++ 控制台程序

#include <clickhouse/client.h>
#include <iostream>
#include <vector>
#include <ctime>
#include <iomanip>
#include <sstream>

using namespace clickhouse;

// 获取当前时间戳字符串
std::string currentDateTime() {
    auto now = std::time(nullptr);
    auto tm = *std::localtime(&now);
    std::ostringstream oss;
    oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S");
    return oss.str();
}

int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <database> <table1> [table2 ...]\n";
        return 1;
    }

    const std::string database = argv[1];
    std::vector<std::string> tables;
    for (int i = 2; i < argc; ++i) {
        tables.push_back(argv[i]);
    }

    // 连接Clickhouse
    ClientOptions opts;
    opts.SetHost("localhost");
    opts.SetPort(9000);
    opts.SetUser("default");
    opts.SetPassword("");
    Client client(opts);

    // 构建动态SQL
    std::ostringstream sql;
    sql << "WITH tables AS ("
        << "  SELECT database, name AS table_name "
        << "  FROM system.tables "
        << "  WHERE database = '" << database << "' "
        << "    AND table_name IN (";
    
    for (size_t i = 0; i < tables.size(); ++i) {
        sql << "'" << tables[i] << "'";
        if (i < tables.size() - 1) sql << ", ";
    }
    
    sql << ") )"
        << "INSERT INTO field_empty_stats "
        << "(database, table_name, column_name, column_type, empty_count, total_rows, empty_percent) "
        << "SELECT "
        << "  t.database, "
        << "  t.table_name, "
        << "  c.name AS column_name, "
        << "  c.type AS column_type, "
        << "  CASE "
        << "    WHEN c.type LIKE 'String%' THEN "
        << "      (SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') "
        << "       FROM {db:Identifier}.{table:Identifier}) "
        << "    WHEN c.type IN ( "
        << "      'UInt8','UInt16','UInt32','UInt64', "
        << "      'Int8','Int16','Int32','Int64', "
        << "      'Float32','Float64','Decimal' "
        << "    ) THEN "
        << "      (SELECT sum(IFNULL({col:Identifier}, 0) = 0) "
        << "      FROM {db:Identifier}.{table:Identifier}) "
        << "    ELSE "
        << "      (SELECT sum({col:Identifier} IS NULL) "
        << "       FROM {db:Identifier}.{table:Identifier}) "
        << "  END AS empty_count, "
        << "  (SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows, "
        << "  ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent "
        << "FROM system.columns c "
        << "JOIN tables t ON c.database = t.database AND c.table = t.table_name "
        << "SETTINGS "
        << "  allow_experimental_analyzer = 1, "
        << "  format_template = "
        << "'SELECT \\'{database}\\', \\'{table_name}\\', \\'{column_name}\\', "
        << "\\'{column_type}\\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'";

    try {
        // 执行查询
        client.Execute(sql.str());
        
        std::cout << "Successfully analyzed " << tables.size() 
                  << " tables in database: " << database << "\n";
        std::cout << "Results written to field_empty_stats at: " 
                  << currentDateTime() << std::endl;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }

    return 0;
}

使用说明

  1. 创建结果表
CREATE TABLE IF NOT EXISTS field_empty_stats (
    id UInt32 AUTO_INCREMENT,
    database String,
    table_name String,
    column_name String,
    column_type String,
    empty_count UInt64,
    total_rows UInt64,
    empty_percent Float64,
    run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);
  1. 编译程序
g++ -std=c++17 -o field_analyzer field_analyzer.cpp -lclickhouse-cpp-lib
  1. 运行程序
./field_analyzer your_database table1 table2 table3

处理逻辑说明

字段类型 空值判定逻辑 统计内容
String TRIM(IFNULL(column, '')) = '' 空字符串
数值类型 IFNULL(column, 0) = 0 零值
其他类型 column IS NULL NULL值

结果表结构

列名 类型 说明
id UInt32 自增ID
database String 数据库名称
table_name String 表名称
column_name String 字段名称
column_type String 字段类型
empty_count UInt64 空值/零值/NULL记录数
total_rows UInt64 表总行数
empty_percent Float64 空值百分比(保留两位小数)
run_time DateTime 分析执行时间

注意事项

  1. 确保Clickhouse服务器运行在默认端口9000(或修改代码中的端口)
  2. 需要安装clickhouse-cpp客户端库
  3. 程序自动处理以下数据类型:
    • 字符串类型:空字符串统计
    • 数值类型:零值统计
    • 其他类型:NULL值统计
  4. 结果表使用MergeTree引擎,按分析时间排序
  5. 百分比计算保留两位小数

这个解决方案实现了:

  • 动态分析指定数据库的多个表
  • 自动识别字段类型并应用相应的空值逻辑
  • 结果写入包含自增ID和时间戳的统计表
  • 命令行参数指定数据库和表名
  • 完整的错误处理机制

网站公告

今日签到

点亮在社区的每一天
去签到