另外几种语言挑战100万行字符串文本排序

发布于:2025-08-01 ⋅ 阅读:(19) ⋅ 点赞:(0)

1.javascript
张泽鹏先生创作

const fs = require('fs');
const process = require('process');

// Command-line contract: exactly one argument, the path of the file to sort.
const cliArgs = process.argv.slice(2);
if (cliArgs.length !== 1) {
    console.error(`Usage: ${process.argv[1]} <filename>`);
    process.exit(1);
}
const [inputPath] = cliArgs;

// Load the whole file and break it into lines. A final '\n' leaves one empty
// trailing element, which is dropped so it is not sorted as if it were a line.
const rows = fs.readFileSync(inputPath, 'utf8').split('\n');
if (rows.at(-1) === '') {
    rows.pop();
}

// Default Array.prototype.sort ordering: strings compared by UTF-16 code units.
rows.sort();

// Emit everything in one write, each line terminated by '\n'.
process.stdout.write(`${rows.join('\n')}\n`);

执行

time node-v24.4.1-linux-x64/bin/node main.js varchar.txt >qsort.txt

real    0m4.455s
user    0m1.416s
sys     0m0.495s

2.rust
张泽鹏先生创作
main.rs

use std::env;
use std::fs::File;
use std::io::{self, BufRead, BufReader, BufWriter, Write};
use std::process;

/// Program entry point.
///
/// Expects exactly one command-line argument (the file to sort). Prints a
/// usage message and exits with status 1 on any other argument count, and
/// prints the error and exits with status 1 if sorting fails.
fn main() {
    let argv: Vec<String> = env::args().collect();

    match argv.as_slice() {
        // Program name followed by exactly one path.
        [_, path] => sort_file_lines(path).unwrap_or_else(|err| {
            eprintln!("Error: {}", err);
            process::exit(1);
        }),
        _ => {
            eprintln!("Usage: {} <filename>", argv[0]);
            process::exit(1);
        }
    }
}

/// Reads every line of `filename`, sorts the lines in ascending `String`
/// order, and writes them to standard output, one per line.
///
/// # Errors
///
/// Returns the underlying `io::Error` if the file cannot be opened or read
/// (including lines that are not valid UTF-8) or if writing to stdout fails.
fn sort_file_lines(filename: &str) -> io::Result<()> {
    let input = BufReader::with_capacity(10 * 1024, File::open(filename)?);

    // Gather all lines up front; the first read error aborts the whole run.
    let mut rows: Vec<String> = Vec::new();
    for row in input.lines() {
        rows.push(row?);
    }
    rows.sort_unstable();

    // Hold the stdout lock for the whole dump and batch output through a 64 KiB buffer.
    let console = io::stdout();
    let mut out = BufWriter::with_capacity(64 * 1024, console.lock());
    for row in &rows {
        out.write_all(row.as_bytes())?;
        out.write_all(b"\n")?;
    }
    out.flush()
}

Cargo.toml

[package]
name = "rust-sort"
version = "0.1.0"
edition = "2024"

[profile.release]
lto = true
strip = true

[dependencies]

编译执行

cargo build --release

time rust-sort/target/release/rust-sort varchar.txt >qsort.txt

real    0m2.333s
user    0m0.596s
sys     0m0.193s

3.张泽鹏先生重写的zig语言

const std = @import("std");
const ArrayList = std.ArrayList;

/// Reads the file named by the first command-line argument, sorts its lines
/// in ascending byte order, and writes them to stdout, one line per row.
///
/// A missing argument or a file that cannot be opened is reported on stderr
/// and the function returns without an error; read, allocation and write
/// failures are propagated to the caller.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Buffer stdout so each line does not become its own write call.
    const output = std.io.getStdOut().writer();
    var stream = std.io.bufferedWriter(output);
    const stdout = stream.writer();

    // 1. Get filename from command line arguments
    var args = std.process.args();
    _ = args.next(); // Skip program name
    const filename = args.next() orelse {
        std.debug.print("Usage: program <filename>\n", .{});
        return;
    };

    // 2. Open and read the file
    const file = std.fs.cwd().openFile(filename, .{}) catch |err| {
        std.debug.print("Open file '{s}' failed: {}\n", .{ filename, err });
        return;
    };
    defer file.close();

    // 3. Read the whole file; every line below is a slice into `content`.
    const content = try file.readToEndAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(content);

    var lines = ArrayList([]const u8).init(allocator);
    defer lines.deinit();

    // An empty file has no lines at all. Otherwise a final '\n' terminates
    // the last line instead of starting a new one, so it is removed before
    // splitting; without this, splitScalar yields a trailing empty slice that
    // would sort to the top and be printed as a spurious blank first line.
    if (content.len > 0) {
        const text = if (content[content.len - 1] == '\n')
            content[0 .. content.len - 1]
        else
            content;

        var iterator = std.mem.splitScalar(u8, text, '\n');
        while (iterator.next()) |line| {
            try lines.append(line);
        }
    }

    // 4. Sort
    std.mem.sort([]const u8, lines.items, {}, compare);

    // 5. Write the sorted lines and flush the buffered writer.
    for (lines.items) |line| {
        try stdout.print("{s}\n", .{line});
    }
    try stream.flush();
}

/// Sort predicate for `std.mem.sort`: true when `a` orders strictly before
/// `b` in lexicographic byte order. The `context` argument is unused.
fn compare(context: void, a: []const u8, b: []const u8) bool {
    _ = context;
    return std.mem.lessThan(u8, a, b);
}

关键修改是用std.mem.splitScalar代替了mem.tokenizeSequence,提升很大
编译执行

zig build-exe main.zig -O ReleaseFast

 time ./main varchar.txt > zsort.txt

real    0m3.748s
user    0m0.522s
sys     0m0.252s

4.deepseek重写的c语言
提示词:

编写c语言程序,将文件内容读入buf,然后遍历buf,遇到\n就记录偏移地址到一个100万元素的数组a,然后将\n位置填写\0,遍历完成后,对a执行排序,只修改a的元素存放的buf地址,然后浏览a,按地址输出每个字符串+\n到stdout

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#define MAX_LINES 1000000  // 最多100万行

/*
 * qsort() comparator for an array of `char *` line pointers.
 * `a` and `b` each point at an array element (a pointer to a NUL-terminated
 * string); the strings they refer to are ordered with strcmp().
 */
int compare_offsets(const void *a, const void *b) {
    const char *const *lhs = a;
    const char *const *rhs = b;

    return strcmp(*lhs, *rhs);
}

/*
 * Sort the lines of the file named by argv[1] and print them to stdout.
 *
 * The whole file is read into one heap buffer, every '\n' is overwritten
 * with '\0', and only an array of pointers to the line starts is sorted
 * (ordering by strcmp via compare_offsets); the text itself never moves.
 *
 * The pointer array is allocated on the heap and sized from the number of
 * newlines actually present, so there is no fixed line limit and no
 * multi-megabyte stack frame (which needed an enlarged stack on Windows).
 *
 * Returns 0 on success, 1 on a usage error or any I/O/allocation failure.
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
        return 1;
    }

    /* Open the file and obtain its size. */
    int fd = open(argv[1], O_RDONLY);
    if (fd == -1) {
        perror("open");
        return 1;
    }

    struct stat st;
    if (fstat(fd, &st) == -1) {
        perror("fstat");
        close(fd);
        return 1;
    }

    size_t file_size = (size_t)st.st_size;

    /* One extra byte so the buffer can always be NUL-terminated. */
    char *buf = malloc(file_size + 1);
    if (!buf) {
        perror("malloc");
        close(fd);
        return 1;
    }

    /*
     * read() may legally return fewer bytes than requested, so keep reading
     * until the expected size has arrived. An error or a premature end of
     * file is still treated as a failure.
     */
    size_t filled = 0;
    while (filled < file_size) {
        ssize_t got = read(fd, buf + filled, file_size - filled);
        if (got < 0) {
            perror("read");
            free(buf);
            close(fd);
            return 1;
        }
        if (got == 0) {
            fprintf(stderr, "read: unexpected end of file\n");
            free(buf);
            close(fd);
            return 1;
        }
        filled += (size_t)got;
    }
    close(fd);
    buf[file_size] = '\0';

    /*
     * Upper bound on the number of lines: one per '\n', plus one for a final
     * line that has no terminating newline.
     */
    size_t capacity = 1;
    for (size_t i = 0; i < file_size; i++) {
        if (buf[i] == '\n') {
            capacity++;
        }
    }

    char **lines = malloc(capacity * sizeof *lines);
    if (!lines) {
        perror("malloc");
        free(buf);
        return 1;
    }

    /*
     * Record the start of every line and terminate it in place. A newline at
     * the very end of the file ends the last line rather than starting an
     * empty one, and an empty file yields zero lines.
     */
    size_t line_count = 0;
    char *end = buf + file_size;
    char *start = buf;
    while (start < end) {
        lines[line_count++] = start;
        char *nl = memchr(start, '\n', (size_t)(end - start));
        if (!nl) {
            break;  /* last line without a trailing newline */
        }
        *nl = '\0';
        start = nl + 1;
    }

    /* Sort the pointer array only. */
    qsort(lines, line_count, sizeof(char *), compare_offsets);

    /* Print the lines in sorted order. */
    for (size_t i = 0; i < line_count; i++) {
        printf("%s\n", lines[i]);
    }

    /* Surface output failures (e.g. a full disk behind a redirect). */
    int status = 0;
    if (fflush(stdout) == EOF || ferror(stdout)) {
        perror("write");
        status = 1;
    }

    free(lines);
    free(buf);
    return status;
}

编译执行

 gcc sort_lines.c -o ds_sort -O3

time ./ds_sort varchar.txt > csort.txt

real    0m3.656s
user    0m0.421s
sys     0m0.261s

因为换了amd的机器,我把varchar.txt每行长度扩大了一倍,与上文的计时不可比,所以同时给出amd的机器Linux的sort命令的结果如下:

time sort varchar.txt > xsort.txt

real    0m3.434s
user    0m0.528s
sys     0m0.369s

总结,这几种语言的效率都差不多,修改后的zig跑到了同一起跑线。rust出奇地快,比sort命令还快。我都怀疑它对系统time作修改了,但张泽鹏先生说rust应该没这么无聊。
另外,在windows下编译c程序,需要扩大栈大小,否则执行出错。如下所示:

gcc sort_lines.c -o mingw_sort -O3 -Wl,--stack=68435456

timer64 mingw_sort varchar.txt > msort.txt

Kernel  Time =     0.171 =   18%
User    Time =     0.609 =   64%
Process Time =     0.781 =   83%    Virtual  Memory =    110 MB
Global  Time =     0.939 =  100%    Physical Memory =    113 MB

其中timer64来自7-benchmark, 它的结果被输送到重定向的文件。
测试时间说明,c语言本身并不慢,但是在wsl+docker环境中比较慢。


网站公告

今日签到

点亮在社区的每一天
去签到