美团龙猫AI修正的二分法提取xlsx的sheet.xml某个范围的数据到csv文件的C程序

发布于:2025-09-07 ⋅ 阅读:(21) ⋅ 点赞:(0)

这次交互的次数比较多,主要改动是:改用逐个字符解析,以应对没有换行符的xml文件;同时重写了标签和属性的处理;修改后的main函数支持命令行参数。
限制:范围只支持单字母的列(即A-Z),并且xml文件中不能使用共享字符串。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

// Size of the chunk buffer used with fgets() when probing the XML file.
#define MAX_LINE_LENGTH 4096
// Size in bytes of the buffers that hold one cell's text (terminating NUL included).
#define MAX_CELL_CONTENT 1024
#define MAX_SHEET_ROWS 1048576 // Maximum number of rows in an Excel sheet (not referenced in the visible code)

// Range requested by the user; both the row and the column bounds are
// treated as inclusive by the range checks in this file.
typedef struct {
    int start_row;
    int end_row;
    char start_col; // column letter, e.g. 'A'
    char end_col;   // column letter, e.g. 'Z'
} ParseRange;

// One parsed cell.
typedef struct {
    int row;                      // row number taken from the row's r="..." attribute
    char col;                     // first character of the cell's r="..." attribute
    char value[MAX_CELL_CONTENT]; // NUL-terminated cell text (truncated if too long)
    int is_empty; // non-zero marks an empty cell
} CellData;

// Global state: growable array of parsed cells.
CellData *results = NULL;   // heap array, released by free_results()
int result_count = 0;       // number of used entries in results
int result_capacity = 0;    // number of allocated entries in results
// Range of the parse currently in progress; set by parse_sheet_xml() and
// consulted by add_cell_result() to filter cells.
ParseRange current_parse_range = {0}; // remembers the active parse range
// Function declarations
int binary_search_start(FILE *file, ParseRange range);
// NOTE(review): find_row_by_binary_search is declared here but no definition
// appears in the visible part of this file.
int find_row_by_binary_search(FILE *file, int target_row, long *start_pos, long *end_pos);
void parse_row_data(FILE *file, ParseRange range, long start_pos, long end_pos);
void add_cell_result(int row, char col, const char *value, int is_empty);
int is_cell_in_range(int row, char col, ParseRange range);
int compare_row_col(int row1, char col1, int row2, char col2);
void free_results();


#include <time.h>  // 需要包含头文件

/**
 * Elapsed time, as reported by clock(), since this function was first called.
 *
 * The first call stores the current clock() value as the reference point and
 * reports 0.0; later calls report the difference to that reference.
 * A stored reference of 0 is treated as "not yet recorded".
 *
 * @return seconds since the reference point, as a double
 */
double get_runtime_seconds() {
    static clock_t origin = 0;

    if (origin != 0) {
        clock_t now = clock();
        return (double)(now - origin) / CLOCKS_PER_SEC;
    }

    // No reference yet: record it and report zero elapsed time.
    origin = clock();
    return 0.0;
}

/**
 * Builds a "[MM:SS.mmm] prefix" string for progress/debug output.
 *
 * The timestamp comes from get_runtime_seconds(). The text is written into a
 * single static 64-byte buffer, so each call overwrites the previous result
 * and over-long prefixes are truncated by snprintf.
 *
 * @param prefix text placed after the timestamp
 * @return pointer to the static buffer, e.g. "[00:01.234] done"
 */
const char* get_timestamped_msg(const char *prefix) {
    static char buf[64];

    double elapsed = get_runtime_seconds();
    int whole_seconds = (int)elapsed;
    int millis = (int)((elapsed - whole_seconds) * 1000);

    snprintf(buf, sizeof(buf), "[%02d:%02d.%03d] %s",
             whole_seconds / 60, whole_seconds % 60, millis, prefix);
    return buf;
}


/**
 * Main parsing entry point.
 *
 * Opens the worksheet XML, (re)initialises the global result array, lets
 * binary_search_start() position the stream near the first requested row and
 * then hands the rest of the file to parse_row_data().
 *
 * @param filename path of the sheet XML file
 * @param range    range to extract; also stored in current_parse_range
 * @return 0 on success (including "start row not found", which leaves the
 *         result array empty), -1 if the file cannot be opened or the result
 *         array cannot be allocated
 */
int parse_sheet_xml(const char *filename, ParseRange range) {
    current_parse_range = range; // remembered for add_cell_result()

    // Binary mode: the search code seeks to arbitrary byte offsets, which is
    // only well defined for binary streams.
    FILE *file = fopen(filename, "rb");
    if (!file) {
        perror("无法打开文件");
        return -1;
    }

    // Initialise the result array, dropping anything left from an earlier call.
    free(results);
    result_count = 0;
    result_capacity = 1024;
    results = (CellData *)malloc(result_capacity * sizeof(CellData));
    if (!results) {
        result_capacity = 0;
        fclose(file);
        return -1;
    }

    // Locate the starting row by bisection.
    if (binary_search_start(file, range)) {
        static const char row_tag[] = "<row";
        long row_start_pos = ftell(file);

        // Walk backwards to the nearest "<row". The bytes following '<' must
        // be compared with 'r','o','w'; comparing against "row"[1..3] (which
        // ends in the NUL terminator) can never match and made this loop
        // rescan the whole file prefix one byte at a time.
        for (long pos = row_start_pos; pos >= 0; pos--) {
            fseek(file, pos, SEEK_SET);
            if (fgetc(file) != '<') {
                continue;
            }
            int is_row_tag = 1;
            for (int i = 1; i < 4; i++) {
                if (fgetc(file) != row_tag[i]) {
                    is_row_tag = 0;
                    break;
                }
            }
            if (is_row_tag) {
                row_start_pos = pos; // offset of the '<' of "<row"
                break;
            }
        }

        // The file size is the end boundary for parsing.
        fseek(file, 0, SEEK_END);
        long file_size = ftell(file);

        // Back to the row start, ready to parse.
        fseek(file, row_start_pos, SEEK_SET);
        printf("%s 二分查找\n", get_timestamped_msg(""));
        printf("二分查找到row_start_pos=%ld\n", row_start_pos);

        // Parse from the "<row" tag to the end of the file.
        parse_row_data(file, range, row_start_pos, file_size);
    }

    fclose(file);
    return 0;
}


/**
 * Positions the stream at a "<row" tag from which parsing can start, using
 * bisection over byte offsets of the file.
 *
 * Each probe looks for a "<row" prefix near the midpoint, reads the first
 * r="..." value that follows it and compares that number with
 * range.start_row. The probe with the largest row number that is still
 * <= range.start_row is remembered.
 *
 * NOTE(review): the 4-byte "<row" prefix test would also accept any other
 * element whose name starts with "row" — confirm the input never places such
 * an element where a probe can land.
 * NOTE(review): if every probed row number is greater than range.start_row
 * the function returns 0 and the caller parses nothing.
 *
 * @param file  open file stream
 * @param range requested range (only start_row is used)
 * @return 1 if a start position was found (stream is left at that offset),
 *         0 otherwise
 */
int binary_search_start(FILE *file, ParseRange range) {
    long file_size = 0;
    long low, high, mid;
    
    // Determine the file size.
    fseek(file, 0, SEEK_END);
    file_size = ftell(file);
    fseek(file, 0, SEEK_SET);

    low = 0;
    high = file_size;
    int last_found_row = -1;   // best candidate row so far (-1 = none)
    long last_found_pos = -1;  // byte offset of that candidate's "<row"

    while (low <= high) {
        mid = (low + high) / 2;
        fseek(file, mid, SEEK_SET);

        // Look backwards for the nearest "<row" tag.
        char buffer[MAX_LINE_LENGTH];
        long row_start_pos = -1;
        int row_num = -1;

        // Scan byte by byte from mid back towards low for a preceding "<row".
        for (long pos = mid; pos >= low && pos >= 0; pos--) {
            fseek(file, pos, SEEK_SET);
            if (fgetc(file) == '<') {
                if (pos + 4 <= file_size && fgetc(file) == 'r' && 
                    fgetc(file) == 'o' && fgetc(file) == 'w') {
                    row_start_pos = pos;
                    break;
                }
            }
        }

        // Nothing found going backwards: scan forwards from mid instead.
        if (row_start_pos == -1) {
            for (long pos = mid; pos <= high && pos < file_size - 4; pos++) {
                fseek(file, pos, SEEK_SET);
                if (fgetc(file) == '<') {
                    if (pos + 4 <= file_size && fgetc(file) == 'r' && 
                        fgetc(file) == 'o' && fgetc(file) == 'w') {
                        row_start_pos = pos;
                        break;
                    }
                }
            }
        }

        if (row_start_pos == -1) {
            // No "<row" tag inside the current window.
            if (mid == low) break;
            high = mid - 1;
            continue;
        }

        // Parse the row number: read chunks starting at the tag and take the
        // first r="..." found in a chunk that contains "<row".
        fseek(file, row_start_pos, SEEK_SET);
        while (fgets(buffer, MAX_LINE_LENGTH, file)) {
            if (strstr(buffer, "<row")) {
                char *row_attr = strstr(buffer, "r=\"");
                if (row_attr) {
                    row_attr += 3; // skip r="
                    row_num = atoi(row_attr);
                    break;
                }
            }
        }

        if (row_num == -1) {
            // Row number could not be parsed; shrink the search window.
            // NOTE(review): this compares a byte offset (row_start_pos) with a
            // row number (range.start_row); the two are different units, so
            // the chosen direction looks unintended — confirm.
            if (row_start_pos < range.start_row) low = mid + 1;
            else high = mid - 1;
            continue;
        }

        if (row_num == range.start_row) {
            // Exact match.
            last_found_row = row_num;
            last_found_pos = row_start_pos;
            break;
        } else if (row_num < range.start_row) {
            // Probed row is before the target: keep the closest one seen so
            // far and continue in the upper half.
            if (row_num > last_found_row) {
                last_found_row = row_num;
                last_found_pos = row_start_pos;
            }
            low = mid + 1;
        } else {
            // Probed row is after the target: continue in the lower half.
            high = mid - 1;
        }
    }

    // A usable start position was found: leave the stream there.
    if (last_found_row != -1) {
        //printf(" last_found_pos=%d\n",  last_found_pos);
        fseek(file, last_found_pos, SEEK_SET);
        return 1;
    }
    return 0;
}



/**
 * 添加单元格结果到结果数组
 */
void add_cell_result(int row, char col, const char *value, int is_empty) {

    // 关键修复:只保存用户指定范围内的单元格
    if (row < current_parse_range.start_row || row > current_parse_range.end_row ||
        col < current_parse_range.start_col || col > current_parse_range.end_col) {
        return; // 直接返回,不保存范围外的数据
    }
    // 扩展结果数组
    if (result_count >= result_capacity) {
        result_capacity *= 2;
        results = (CellData *)realloc(results, result_capacity * sizeof(CellData));
        if (!results) {
            fprintf(stderr, "内存分配失败\n");
            return;
        }
    }

    results[result_count].row = row;
    results[result_count].col = col;
    strncpy(results[result_count].value, value, MAX_CELL_CONTENT - 1);
    results[result_count].value[MAX_CELL_CONTENT - 1] = '\0';
    results[result_count].is_empty = is_empty;
    result_count++;
//printf("row=%d,col=%c ",row,col);
}

/**
 * Tells whether a cell lies inside the given range.
 * Row and column bounds are both inclusive.
 *
 * @return 1 if (row, col) is inside range, 0 otherwise
 */
int is_cell_in_range(int row, char col, ParseRange range) {
    int row_ok = (row >= range.start_row) && (row <= range.end_row);
    int col_ok = (col >= range.start_col) && (col <= range.end_col);
    return row_ok && col_ok;
}

/**
 * Orders two cell coordinates, row first and column second.
 *
 * @return -1 if (row1, col1) sorts before (row2, col2), 0 if both are equal,
 *         1 if (row1, col1) sorts after (row2, col2)
 */
int compare_row_col(int row1, char col1, int row2, char col2) {
    if (row1 < row2) return -1;
    if (row1 > row2) return 1;

    // Same row: the column decides.
    if (col1 < col2) return -1;
    return (col1 > col2) ? 1 : 0;
}

/**
 * Releases the global result array and resets its bookkeeping counters.
 * Safe to call when nothing has been allocated.
 */
void free_results() {
    free(results); // free(NULL) is a no-op, so no guard is needed
    results = NULL;
    result_capacity = 0;
    result_count = 0;
}

/**
 * 打印解析结果
 */
void print_results() {
    printf("解析结果:\n");
    for (int i = 0; i < result_count; i++) {
        if (results[i].is_empty) {
            printf("单元格 %c%d: (空)\n", results[i].col, results[i].row);
        } else {
            printf("单元格 %c%d: %s\n", results[i].col, results[i].row, results[i].value);
        }
    }
}

/**
 * Prints a parse range in Excel A1 notation, e.g. A1:H7.
 *
 * @param range range to print
 */
void print_parse_range(ParseRange range) {
    const char first_col = range.start_col;
    const int first_row = range.start_row;
    const char last_col = range.end_col;
    const int last_row = range.end_row;

    printf("解析范围: %c%d:%c%d\n", first_col, first_row, last_col, last_row);
}
// 使用示例


// 前面的所有函数保持不变...

/**
 * Parses one A1-style cell reference (a single letter followed by a row
 * number >= 1) at *cursor.
 *
 * On success the upper-cased column and the row are stored, *cursor is
 * advanced past the reference and 0 is returned. On failure -1 is returned
 * and the outputs/cursor are left in an unspecified but valid state.
 */
static int parse_cell_ref(const char **cursor, char *col, int *row) {
    const char *p = *cursor;

    // <ctype.h> functions require an unsigned char value (or EOF).
    if (!isalpha((unsigned char)*p)) return -1;
    *col = (char)toupper((unsigned char)*p);
    p++;

    // Demand a digit right away: strtol alone would also accept leading
    // whitespace and a sign, turning e.g. "A-5" into row -5.
    if (!isdigit((unsigned char)*p)) return -1;

    char *end = NULL;
    long value = strtol(p, &end, 10);
    // Reject row 0 and anything that does not fit into an int.
    if (end == p || value < 1 || value != (long)(int)value) return -1;

    *row = (int)value;
    *cursor = end;
    return 0;
}

/**
 * Parses a user-supplied Excel range such as "A1:H5".
 *
 * Leading whitespace and whitespace around ':' are accepted; column letters
 * are upper-cased; only single-letter columns are supported. Anything after
 * the second cell reference is ignored.
 *
 * @param input string entered by the user
 * @param range receives the parsed range; left untouched on failure
 * @return 0 on success, -1 on failure
 */
int parse_excel_range(const char *input, ParseRange *range) {
    if (!input || !range) return -1;

    const char *p = input;
    char start_col = '\0', end_col = '\0';
    int start_row = 0, end_row = 0;

    while (isspace((unsigned char)*p)) p++;
    if (parse_cell_ref(&p, &start_col, &start_row) != 0) return -1;

    // Separator
    while (isspace((unsigned char)*p)) p++;
    if (*p != ':') return -1;
    p++;
    while (isspace((unsigned char)*p)) p++;

    if (parse_cell_ref(&p, &end_col, &end_row) != 0) return -1;

    range->start_row = start_row;
    range->end_row = end_row;
    range->start_col = start_col;
    range->end_col = end_col;
    // Newline added so this line no longer runs into the next output line.
    printf("start_row=%d,end_row=%d\n", start_row, end_row);
    return 0;
}


/**
 * 将结果保存为CSV文件 - 修正版本
 * @param filename 输出CSV文件名
 * @return 0成功,-1失败
 */
int save_results_to_csv(const char *filename) {
    if (!filename || result_count == 0) return -1;
    printf("result_count=%d\n",result_count);
    FILE *csv = fopen(filename, "w");
    if (!csv) {
        perror("无法创建CSV文件");
        return -1;
    }
    
    // 收集所有行号
    int *rows = (int *)malloc(result_count * sizeof(int));
    int row_count = 0;
    
    for (int i = 0; i < result_count; i++) {
        int found = 0;
        for (int j = 0; j < row_count; j++) {
            if (rows[j] == results[i].row) {
                found = 1;
                break;
            }
        }
        if (!found) {
            rows[row_count++] = results[i].row;
        }
    }
    
    // 按行号排序
    for (int i = 0; i < row_count - 1; i++) {
        for (int j = i + 1; j < row_count; j++) {
            if (rows[i] > rows[j]) {
                int temp = rows[i];
                rows[i] = rows[j];
                rows[j] = temp;
            }
        }
    }
    

    int start_col = results[0].col; // 实际起始列
    int end_col = results[0].col;   // 实际结束列
    for (int i = 1; i < result_count; i++) {
        if (results[i].col < start_col) start_col = results[i].col;
        if (results[i].col > end_col) end_col = results[i].col;
    }
    

    int col_count = end_col - start_col + 1;
    
    // 写入标题行
    fprintf(csv, "Row,");
    for (int c = 0; c < col_count; c++) {
        fprintf(csv, "%c", start_col + c);
        if (c < col_count - 1) fprintf(csv, ",");
    }
    fprintf(csv, "\n");
    
    // 为每一行生成CSV数据
    for (int r = 0; r < row_count; r++) {
        int current_row = rows[r];
        
        // 检查该行是否有数据(在用户指定范围内)
        int has_data = 0;
        for (int i = 0; i < result_count; i++) {
            if (results[i].row == current_row && 
                results[i].col >= start_col && 
                results[i].col <= end_col &&
                !results[i].is_empty) {
                has_data = 1;
                break;
            }
        }
        
        if (!has_data) continue; // 跳过全空行
        
        // 生成该行的CSV数据
        fprintf(csv, "%d,", current_row); // 行号作为第一列
        
        for (int c = 0; c < col_count; c++) {
            char col = start_col + c;
            char *value = NULL;
            int is_empty = 1;
            
            // 查找该列的数据
            for (int i = 0; i < result_count; i++) {
                if (results[i].row == current_row && 
                    results[i].col == col) {
                    value = results[i].value;
                    is_empty = results[i].is_empty;
                    break;
                }
            }
            
            if (!is_empty && value && strlen(value) > 0) {
                // 转义CSV特殊字符
                if (strchr(value, ',') || strchr(value, '"') || strchr(value, '\n')) {
                    fprintf(csv, "\"%s\"", value);
                } else {
                    fprintf(csv, "%s", value);
                }
            } else {
                // 空单元格
                fprintf(csv, "");
            }
            
            if (c < col_count - 1) {
                fprintf(csv, ",");
            }
        }
        fprintf(csv, "\n");
    }
    
    free(rows);
    fclose(csv);
    printf("结果已保存到: %s\n", filename);
    return 0;
}
/**
 * Scans one NUL-terminated chunk of XML text for <row>, <c> and <v> markup
 * and stores in-range cells through add_cell_result().
 *
 * NOTE(review): no call to this function appears in the visible part of this
 * file; it looks like a leftover of an earlier buffer-based parser.
 * NOTE(review): the strstr() searches below run to the end of the buffer,
 * not to the end of the current element, so a "/>" or "<v>" belonging to a
 * later element can be attributed to the current cell.
 *
 * @param buffer XML text to process; it is modified in place
 * @param range parse range
 * @param in_row in/out: whether the scan is currently inside a row
 * @param current_row in/out: current row number (-1 after a row ends)
 * @param current_cell_col in/out: column letter of the current cell
 * @param temp_value scratch buffer for a cell value (MAX_CELL_CONTENT bytes)
 */
void process_xml_buffer(char *buffer, ParseRange range, int *in_row, int *current_row,
                       char *current_cell_col, char *temp_value) {
    char *pos = buffer;
    
    // Walk the buffer and react to each recognised tag.
    while (*pos) {
        if (strncmp(pos, "<row", 4) == 0) {
            // Read the row number from the next r="..." found.
            char *row_attr = strstr(pos, "r=\"");
            if (row_attr) {
                row_attr += 3;
                *current_row = atoi(row_attr);
            }
            *in_row = 1;
            pos += 4;
        }
        else if (strncmp(pos, "</row>", 6) == 0) {
            // End of a row.
            if (*current_row >= range.end_row) {
                // Last requested row reached (or passed): stop scanning.
                return;
            }
            *in_row = 0;
            *current_row = -1;
            pos += 6;
        }
        else if (*in_row && strncmp(pos, "<c ", 3) == 0) {
            // Parse a cell.
            char *col_attr = strstr(pos, "r=\"");
            char *value_start = NULL;
            int is_empty = 0;
            int cell_has_value = 0;
            int is_self_closing = 0;

            if (col_attr) {
                col_attr += 3;
                *current_cell_col = col_attr[0];
                
                // Look for a self-closing tag.
                char *self_close = strstr(pos, "/>");
                if (self_close) {
                    is_self_closing = 1;
                }
                
                // Intended to skip the column/row reference.
                // NOTE(review): col_attr points at the first character of the
                // reference here, so this loop only advances if that
                // character is a digit.
                while (isdigit(col_attr[0])) col_attr++;
                
                // Look for the cell value.
                char *v_tag = strstr(pos, "<v>");
                if (v_tag) {
                    value_start = v_tag + 3;
                    char *v_end = strstr(v_tag, "</v>");
                    if (v_end) {
                        // NOTE(review): this writes a NUL into the caller's
                        // buffer, so the outer while (*pos) loop ends at this
                        // point and the rest of the buffer is not scanned.
                        *v_end = '\0';
                        strncpy(temp_value, value_start, MAX_CELL_CONTENT - 1);
                        temp_value[MAX_CELL_CONTENT - 1] = '\0';
                        cell_has_value = 1;
                    }
                }

                // A self-closing tag, or a cell without <v>, counts as empty.
                if (is_self_closing || !cell_has_value) {
                    is_empty = 1;
                    temp_value[0] = '\0';
                }

                if (is_cell_in_range(*current_row, *current_cell_col, range)) {
                    add_cell_result(*current_row, *current_cell_col, temp_value, is_empty);
                }
            }
            pos += 3;
        }
        else if (strncmp(pos, "</c>", 4) == 0) {
            // End of a cell.
            *current_cell_col = '\0';
            pos += 4;
        }
        else {
            pos++;
        }
    }
}
/**
 * Reacts to one tag seen by the character-level parser.
 *
 * Row tags are always tracked: a start tag sets *in_row, an end tag clears
 * it and resets *current_row to -1. (Previously the end tag was compared
 * with "/row" after the leading '/' had already been stripped, so it was
 * never recognised outside the in-range branch.)
 *
 * Cell ("c") and value ("v" / inline-string "t") tags are only handled while
 * the current row is between 1 and range.end_row and the current column is
 * inside the range's columns:
 *  - "</c>" clears the current column and any value in progress;
 *  - a self-closing "<c .../>" stores an empty cell;
 *  - "<v>" / "<t>" start collecting text into temp_value;
 *  - "</v>" / "</t>" store the collected text if it is non-empty.
 *
 * @param tag_name         tag name as read, with a leading '/' for end tags
 * @param attr_value       unused
 * @param attr_count       unused
 * @param range            parse range
 * @param in_row           in/out: inside a row?
 * @param current_row      in/out: current row number
 * @param current_cell_col in/out: current column letter ('\0' = none)
 * @param temp_value       value text collected by the caller
 * @param is_self_closing  in: non-zero if the tag ended with "/>"
 * @param value_started    in/out: whether value text is being collected
 * @param value_len        in/out: number of bytes collected in temp_value
 */
void process_tag(const char *tag_name, const char *attr_value, int attr_count,
                ParseRange range, int *in_row, int *current_row,
                char *current_cell_col, char *temp_value, int *is_self_closing,
                int *value_started, int *value_len) {
    (void)attr_value;
    (void)attr_count;

    int is_end_tag = (tag_name[0] == '/');
    const char *tag = is_end_tag ? tag_name + 1 : tag_name;

    // Row bookkeeping does not depend on the current cell being in range.
    if (strcmp(tag, "row") == 0) {
        if (is_end_tag) {
            *in_row = 0;
            *current_row = -1;
        } else {
            *in_row = 1;
        }
        return;
    }

    // Everything else is ignored unless the current position is in scope.
    if (*current_row < 1 || *current_row > range.end_row ||
        *current_cell_col < range.start_col || *current_cell_col > range.end_col) {
        return;
    }

    if (strcmp(tag, "c") == 0) {
        if (is_end_tag) {
            *current_cell_col = '\0';
            *value_started = 0;
            *value_len = 0;
        } else if (*is_self_closing) {
            // <c .../> has no content: record it as an empty cell.
            if (is_cell_in_range(*current_row, *current_cell_col, range)) {
                add_cell_result(*current_row, *current_cell_col, "", 1);
            }
            *current_cell_col = '\0';
        }
        return;
    }

    // <v> holds a plain value, <t> the text of an inline string; both are
    // collected the same way.
    if (strcmp(tag, "v") == 0 || strcmp(tag, "t") == 0) {
        if (!is_end_tag) {
            *value_started = 1;
            *value_len = 0;
            temp_value[0] = '\0';
            return;
        }
        temp_value[*value_len] = '\0';
        if (*value_len > 0 && is_cell_in_range(*current_row, *current_cell_col, range)) {
            add_cell_result(*current_row, *current_cell_col, temp_value, 0);
        }
        *value_started = 0;
        *value_len = 0;
    }
}

/**
 * Reacts to one attribute (name="value") of the tag currently being parsed.
 *
 *  - r on a row tag: stores the row number in *current_row;
 *  - r on a c tag:   stores the first character of the reference in
 *                    *current_cell_col;
 *  - t:              copies the cell type into a function-local static
 *                    buffer (the buffer is not read anywhere in this
 *                    function);
 *  - anything else (including the style attribute s) is ignored.
 *
 * The remaining parameters are accepted but not used here.
 */
void process_attribute(const char *tag_name, const char *attr_name, const char *attr_value,
                      ParseRange range, int *in_row, int *current_row,
                      char *current_cell_col, char *temp_value,
                      int *value_started, int *value_len) {
    // Remembers the type of the current cell across calls.
    static char cell_type[16] = {0};

    if (0) printf("调试: 属性 %s=%s, 标签=%s\n", attr_name, attr_value, tag_name);

    // Skip the leading '/' of an end tag so both forms compare equal.
    const char *tag = tag_name;
    if (tag_name[0] == '/') {
        tag = tag_name + 1;
    }

    if (strcmp(attr_name, "r") == 0) {
        if (strcmp(tag, "row") == 0) {
            *current_row = atoi(attr_value);
            if (0) printf("调试: 行号=%d\n", *current_row);
            return;
        }
        if (strcmp(tag, "c") == 0) {
            *current_cell_col = attr_value[0];
            if (0) printf("调试: 列=%c\n", *current_cell_col);
        }
        return;
    }

    if (strcmp(attr_name, "t") == 0) {
        // The last byte of cell_type is never written, so it stays NUL.
        strncpy(cell_type, attr_value, sizeof(cell_type) - 1);
        if (0) printf("调试: 单元格类型=%s\n", cell_type);

        // inlineStr cells carry their text in a <t> element.
        if (strcmp(attr_value, "inlineStr") == 0) {
            if (0) printf("调试: 检测到字符串类型单元格\n");
        }
        return;
    }

    // Style attribute "s" and all other attributes: nothing to do.
}


/* XML whitespace that may separate a tag name and its attributes. */
static int is_xml_space(int c) {
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/**
 * Character-by-character parser for the row data of a worksheet.
 *
 * Reads from start_pos until end_pos, EOF, or the first row whose number is
 * greater than range.end_row. Tags are reported to process_tag(), attributes
 * to process_attribute(); text between tags is collected into a value buffer
 * while a value element is open.
 *
 * process_tag() is called when the tag name is complete and, for tags with
 * attributes, once more when the tag closes; only the second call sees the
 * attribute values.
 *
 * Fixes compared with the previous version:
 *  - whitespace between attributes is skipped instead of becoming part of
 *    the next attribute name, so attributes other than the first one are
 *    recognised by name;
 *  - "/>" after attributes now sets the self-closing flag, so empty cells
 *    such as <c r="A1" s="1"/> reach process_tag() as self-closing;
 *  - a '/' not followed by '>' no longer swallows the following character;
 *  - over-long attribute values are truncated but still consumed up to the
 *    closing quote;
 *  - the tag-name buffer is bounds-checked for repeated '/'.
 *
 * @param file      open file stream
 * @param range     parse range
 * @param start_pos byte offset to start at
 * @param end_pos   byte offset at which to stop
 */
void parse_row_data(FILE *file, ParseRange range, long start_pos, long end_pos) {
    // Parser states (numeric values match the original state numbers).
    enum {
        ST_TEXT = 0,      // plain text between tags
        ST_TAG_OPEN = 1,  // just after '<'
        ST_TAG_NAME = 2,  // inside the tag name
        ST_ATTR_NAME = 3, // inside the attribute list, reading a name
        ST_ATTR_VALUE = 4 // after '=', waiting for the quoted value
    };

    char temp_value[MAX_CELL_CONTENT] = {0};
    int in_row = 0;
    int current_row = -1;
    char current_cell_col = '\0';
    int value_started = 0;
    int value_len = 0;
    int is_self_closing = 0;

    int state = ST_TEXT;
    char tag_name[32] = {0};
    char attr_name[16] = {0};
    char attr_value[256] = {0};
    size_t tag_len = 0;
    size_t attr_len = 0;

    fseek(file, start_pos, SEEK_SET);

    int c;
    while ((c = fgetc(file)) != EOF && ftell(file) <= end_pos) {
        switch (state) {
            case ST_TEXT:
                if (c == '<') {
                    state = ST_TAG_OPEN;
                    tag_len = 0;
                    tag_name[0] = '\0';
                    is_self_closing = 0;
                } else if (value_started && value_len < MAX_CELL_CONTENT - 1) {
                    // Text of the value element currently open.
                    temp_value[value_len++] = (char)c;
                    temp_value[value_len] = '\0';
                }
                break;

            case ST_TAG_OPEN:
                if (c == '/') {
                    // End tag: keep the '/' as the first character of the name.
                    if (tag_len < sizeof(tag_name) - 1) {
                        tag_name[tag_len++] = (char)c;
                    }
                } else if (c == '>' || is_xml_space(c)) {
                    tag_name[tag_len] = '\0';
                    process_tag(tag_name, NULL, 0, range, &in_row, &current_row,
                               &current_cell_col, temp_value, &is_self_closing,
                               &value_started, &value_len);
                    if (c == '>') {
                        state = ST_TEXT;
                    } else {
                        state = ST_ATTR_NAME;
                        attr_len = 0;
                    }
                } else if (c == '?' || c == '!') {
                    // <?xml ...?> and <!-- ... -->: treat the rest as text.
                    state = ST_TEXT;
                } else if (tag_len < sizeof(tag_name) - 1) {
                    tag_name[tag_len++] = (char)c;
                    state = ST_TAG_NAME;
                }
                break;

            case ST_TAG_NAME:
                if (c == '>' || is_xml_space(c)) {
                    tag_name[tag_len] = '\0';
                    process_tag(tag_name, NULL, 0, range, &in_row, &current_row,
                               &current_cell_col, temp_value, &is_self_closing,
                               &value_started, &value_len);
                    if (c == '>') {
                        state = ST_TEXT;
                    } else {
                        state = ST_ATTR_NAME;
                        attr_len = 0;
                    }
                } else if (c == '/') {
                    int next = fgetc(file);
                    if (next == '>') {
                        // Self-closing tag without attributes: <tag/>
                        tag_name[tag_len] = '\0';
                        is_self_closing = 1;
                        process_tag(tag_name, NULL, 0, range, &in_row, &current_row,
                                   &current_cell_col, temp_value, &is_self_closing,
                                   &value_started, &value_len);
                        state = ST_TEXT;
                    } else if (next != EOF) {
                        ungetc(next, file); // not "/>": re-read it normally
                    }
                } else if (tag_len < sizeof(tag_name) - 1) {
                    tag_name[tag_len++] = (char)c;
                }
                break;

            case ST_ATTR_NAME:
                if (is_xml_space(c)) {
                    // Separator between attributes: not part of any name.
                } else if (c == '=') {
                    attr_name[attr_len] = '\0';
                    state = ST_ATTR_VALUE;
                    attr_len = 0;
                } else if (c == '>') {
                    // End of a tag with attributes: report it again now that
                    // the attributes have been processed.
                    process_tag(tag_name, NULL, 0, range, &in_row, &current_row,
                               &current_cell_col, temp_value, &is_self_closing,
                               &value_started, &value_len);
                    state = ST_TEXT;
                } else if (c == '/') {
                    int next = fgetc(file);
                    if (next == '>') {
                        // Self-closing tag with attributes: <tag a="b"/>
                        is_self_closing = 1;
                        process_tag(tag_name, NULL, 0, range, &in_row, &current_row,
                                   &current_cell_col, temp_value, &is_self_closing,
                                   &value_started, &value_len);
                        state = ST_TEXT;
                    } else if (next != EOF) {
                        ungetc(next, file);
                    }
                } else if (attr_len < sizeof(attr_name) - 1) {
                    attr_name[attr_len++] = (char)c;
                }
                break;

            case ST_ATTR_VALUE:
                if (c == '"' || c == '\'') {
                    int quote_char = c;
                    size_t val_len = 0;

                    // Read up to the closing quote; bytes that do not fit in
                    // the buffer are dropped but still consumed.
                    while ((c = fgetc(file)) != EOF && c != quote_char) {
                        if (val_len < sizeof(attr_value) - 1) {
                            attr_value[val_len++] = (char)c;
                        }
                    }
                    attr_value[val_len] = '\0';

                    process_attribute(tag_name, attr_name, attr_value, range, &in_row,
                                    &current_row, &current_cell_col, temp_value,
                                    &value_started, &value_len);

                    state = ST_ATTR_NAME; // back to the attribute list
                }
                break;

            default:
                state = ST_TEXT;
                break;
        }

        // Rows are visited in file order, so once a row beyond the requested
        // range shows up there is nothing left to collect.
        if (current_row > range.end_row) return;
    }
}

/*
 * Appends ".csv" to name unless ".csv" already occurs in it. If the buffer
 * is too full, the name is shortened so that the extension always fits
 * (capacity must be at least 5 bytes).
 */
static void ensure_csv_extension(char *name, size_t capacity) {
    static const char ext[] = ".csv";

    if (strstr(name, ext)) return;

    size_t len = strlen(name);
    if (len + sizeof(ext) > capacity) {
        len = capacity - sizeof(ext); // make room for ".csv" plus NUL
    }
    memcpy(name + len, ext, sizeof(ext));
}

/**
 * Program entry point.
 *
 * Two modes:
 *  - command line: program <xml file> <range, e.g. A1:H5> <csv file>
 *  - interactive (no arguments): the three values are read from stdin and
 *    the parsed cells are also printed.
 *
 * @return 0 on success, -1 on usage, input, parse or save errors
 */
int main(int argc, char *argv[]) {
    char filename[1024] = {0};
    char csv_filename[1024] = {0};
    char range_input[64] = {0};
    ParseRange range;
    int interactive_mode = 0; // set when running without arguments
printf("%s 程序启动\n", get_timestamped_msg(""));    
    // Parse the command line.
    if (argc == 4) {
        // Command-line mode: program xml_file range csv_file
        snprintf(filename, sizeof(filename), "%s", argv[1]);
        snprintf(range_input, sizeof(range_input), "%s", argv[2]);
        snprintf(csv_filename, sizeof(csv_filename), "%s", argv[3]);

        // Make sure the CSV file name has an extension (bounded append;
        // a plain strcat could overflow a full buffer).
        ensure_csv_extension(csv_filename, sizeof(csv_filename));
    } else if (argc == 1) {
        // No arguments: interactive mode.
        interactive_mode = 1;

        printf("=== Excel XML解析器 (交互模式) ===\n");
        printf("请输入XML文件路径: ");
        if (!fgets(filename, sizeof(filename), stdin)) {
            printf("错误: 无法读取文件名\n");
            return -1;
        }
        // Strip the line ending, including the '\r' of CRLF input.
        filename[strcspn(filename, "\r\n")] = 0;

        printf("请输入解析范围 (格式如 A1:H5): ");
        if (!fgets(range_input, sizeof(range_input), stdin)) {
            printf("错误: 无法读取范围\n");
            return -1;
        }
        range_input[strcspn(range_input, "\r\n")] = 0;

        printf("请输入CSV输出文件名 (默认: output.csv): ");
        if (!fgets(csv_filename, sizeof(csv_filename), stdin)) {
            strcpy(csv_filename, "output.csv");
        }
        csv_filename[strcspn(csv_filename, "\r\n")] = 0;
        if (strlen(csv_filename) == 0) {
            strcpy(csv_filename, "output.csv");
        }
        ensure_csv_extension(csv_filename, sizeof(csv_filename));
    } else {
        printf("用法:\n");
        printf("  %s <xml文件路径> <范围(A1:H5)> <csv输出文件名>\n", argv[0]);
        printf("  %s (进入交互模式)\n", argv[0]);
        printf("示例:\n");
        printf("  %s sheet.xml B3:H5 result.csv\n", argv[0]);
        return -1;
    }

    // Parse the requested range.
    if (parse_excel_range(range_input, &range) != 0) {
        printf("失败: 范围格式错误,请使用格式如 A1:H5\n");
        return -1;
    }

    print_parse_range(range);

    // Run the parser.
    if (parse_sheet_xml(filename, range) == 0) {
        // Detailed results are only printed in interactive mode.
        if (interactive_mode) {
            printf("\n解析成功!\n");
            print_results();
            printf("\n正在保存到CSV...\n");
        } else {
            // Command-line mode: just a short status line.
            printf("成功: 解析完成\n");
        }
printf("%s XML解析完成\n", get_timestamped_msg(""));        
        // Save the CSV file.
        if (save_results_to_csv(csv_filename) == 0) {
printf("%s CSV保存完成\n", get_timestamped_msg(""));

            if (interactive_mode) {
                printf("完成!\n");
            }
        } else {
            printf("失败: 无法保存CSV文件\n");
            free_results();
            return -1;
        }
    } else {
        printf("失败: 解析XML文件失败\n");
        free_results();
        return -1;
    }

    free_results();
    return 0;
}

编译和运行

gcc bsxml13.c -o bsxml -O3

time ./bsxml /shujv/par/dknyc/xl/worksheets/sheet1.xml A210000:Z211000 obig32.csv
[00:00.000]  程序启动
start_row=210000,end_row=211000解析范围: A210000:Z211000
[01:26.800]  二分查找
二分查找到row_start_pos=358064889
成功: 解析完成
[01:26.881]  XML解析完成
result_count=28161
结果已保存到: obig32.csv
[01:30.019]  CSV保存完成

real	1m39.296s
user	0m40.100s
sys	0m49.920s

可见对于行数多的靠后范围,二分查找比较慢,张泽鹏先生已经决定和AI PK一下,拭目以待。


网站公告

今日签到

点亮在社区的每一天
去签到