摩尔线程 国产显卡 MUSA 并行编程 学习笔记-2024/12/04

发布于:2024-12-07 ⋅ 阅读:(111) ⋅ 点赞:(0)

Learning Roadmap:

Section 1: Intro to Parallel Programming & MUSA

  1. Deep Learning Ecosystem(摩尔线程 国产显卡 MUSA 并行编程 学习笔记-2024/11/30-CSDN博客
  2. Ubuntu+Driver+Toolkit+conda+pytorch+torch_musa环境安装(2024/11/24-Ubuntu Windows双系统安装 | 2024/11/30-GPU驱动&MUSA Toolkit安装)
  3. C/C++ Review(摩尔线程国产显卡 MUSA 并行编程学习笔记-2024/11/22-CSDN博客
  4. GPU intros(摩尔线程国产显卡 MUSA 并行编程学习笔记-2024/11/25-CSDN博客
  5. GPU硬件架构 (摩尔线程国产显卡 MUSA 并行编程学习笔记-2024/11/26-CSDN博客)
  6. Write First Kernels (Here) (2024/11/27-线程层级 | 2024/11/28-First MUSA Kernel to Count Thread | 2024/12/02-向量相加 | 2024/12/03-向量相加(3D))
  7. MUSA API
  8. Faster Matrix Multiplication
  9. Triton
  10. Pytorch Extensions(摩尔线程国产显卡 MUSA 并行编程学习笔记-2024/11/21-CSDN博客
  11. MNIST Multilayer Perceptron

Section 2: Parallel Programming & MUSA in Depth

  1. Analyzing Parallel Program Performance on a Quad-Core CPU
  2. Scheduling Task Graphs on a Multi-Core CPU
  3. A Simple Renderer in MUSA
  4. Optimizing DNN Performance on DNN Accelerator Hardware
  5. llm.c

Ref:摩尔学院 High-Performance Computing with GPUs | Stanford CS149 - Video | Stanford CS149 - Syllabus

Kernel to Multiply Matrix

Ref:  High-Performance Computing with GPUs Chapter 5 | 摩尔学院 - MUSA基础

下面的代码将用CPU与GPU分别对两个矩阵(Matrix A: 256 * 512, Matrix B: 512 * 256)进行相乘,并计算对应的平均耗时

代码地址

MUSA PLAY GROUND - Github

代码

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <musa_runtime.h>

#define M 256  // Number of rows in A and C
#define K 512   // Number of columns in A and rows in B
#define N 256  // Number of columns in B and C
#define BLOCK_SIZE 32

// Example 3x2 @ 2x4 = 3x4 -> (M x K) @ (K x N) = (M x N)
// A = [[1, 2], 
//      [3, 4], 
//      [5, 6]]

// B = [[7, 8, 9, 10],
//      [11, 12, 13, 14]]

// C = A * B = [[1*7 + 2*11, 1*8 + 2*12, 1*9 + 2*13, 1*10 + 2*14],
//              [3*7 + 4*11, 3*8 + 4*12, 3*9 + 4*13, 3*10 + 4*14],
//              [5*7 + 6*11, 5*8 + 6*12, 5*9 + 6*13, 5*10 + 6*14]]

// C = [[29, 32, 35, 38],
//      [65, 72, 79, 86],
//      [101, 112, 123, 134]]


// CPU matrix multiplication
void matmul_cpu(float *A, float *B, float *C, int m, int k, int n) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            float sum = 0.0f;
            for (int l = 0; l < k; l++) {
                sum += A[i * k + l] * B[l * n + j];
            }
            C[i * n + j] = sum;
        }
    }
}

// MUSA kernel for matrix multiplication
__global__ void matmul_gpu(float *A, float *B, float *C, int m, int k, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < m && col < n) {
        float sum = 0.0f;
        for (int l = 0; l < k; l++) {
            sum += A[row * k + l] * B[l * n + col];
        }
        C[row * n + col] = sum;
    }
}

// Initialize matrix with random values
void init_matrix(float *mat, int rows, int cols) {
    for (int i = 0; i < rows * cols; i++) {
        mat[i] = (float)rand() / RAND_MAX;
    }
}

// Function to measure execution time
double get_time() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main() {
    float *h_A, *h_B, *h_C_cpu, *h_C_gpu;
    float *d_A, *d_B, *d_C;
    int size_A = M * K * sizeof(float);
    int size_B = K * N * sizeof(float);
    int size_C = M * N * sizeof(float);

    // Allocate host memory
    h_A = (float*)malloc(size_A);
    h_B = (float*)malloc(size_B);
    h_C_cpu = (float*)malloc(size_C);
    h_C_gpu = (float*)malloc(size_C);

    // Initialize matrices
    srand(time(NULL));
    init_matrix(h_A, M, K);
    init_matrix(h_B, K, N);

    // Allocate device memory
    musaMalloc(&d_A, size_A);
    musaMalloc(&d_B, size_B);
    musaMalloc(&d_C, size_C);

    // Copy data to device
    musaMemcpy(d_A, h_A, size_A, musaMemcpyHostToDevice);
    musaMemcpy(d_B, h_B, size_B, musaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Warm-up runs
    printf("Performing warm-up runs...\n");
    for (int i = 0; i < 3; i++) {
        matmul_cpu(h_A, h_B, h_C_cpu, M, K, N);
        matmul_gpu<<<gridDim, blockDim>>>(d_A, d_B, d_C, M, K, N);
        musaDeviceSynchronize();
    }

    // Benchmark CPU implementation
    printf("Benchmarking CPU implementation...\n");
    double cpu_total_time = 0.0;
    for (int i = 0; i < 20; i++) {
        double start_time = get_time();
        matmul_cpu(h_A, h_B, h_C_cpu, M, K, N);
        double end_time = get_time();
        cpu_total_time += end_time - start_time;
    }
    double cpu_avg_time = cpu_total_time / 20.0;

    // Benchmark GPU implementation
    printf("Benchmarking GPU implementation...\n");
    double gpu_total_time = 0.0;
    for (int i = 0; i < 20; i++) {
        double start_time = get_time();
        matmul_gpu<<<gridDim, blockDim>>>(d_A, d_B, d_C, M, K, N);
        musaDeviceSynchronize();
        double end_time = get_time();
        gpu_total_time += end_time - start_time;
    }
    double gpu_avg_time = gpu_total_time / 20.0;

    // Print results
    printf("CPU average time: %f microseconds\n", (cpu_avg_time * 1e6f));
    printf("GPU average time: %f microseconds\n", (gpu_avg_time * 1e6f));
    printf("Speedup: %fx\n", cpu_avg_time / gpu_avg_time);

    // Free memory
    free(h_A);
    free(h_B);
    free(h_C_cpu);
    free(h_C_gpu);
    musaFree(d_A);
    musaFree(d_B);
    musaFree(d_C);

    return 0;
}

编译

    mcc '02 matmul.mu' -o matmul -mtgpu -O2 -lmusart

   ./matmul

输出结果

 如图所示,GPU提速明显

Notes

同步函数

musaDeviceSynchronize()

确保kernel相关的任务都执行完毕。执行完成后方可安全的执行下一个kernel

__syncthreads()

用途:在同一个block内,同步所有线程的执行。在线程块内所有线程到达此命令前,所有线程都不会执行其后的指令


典型用例:当有多个线程要访问SharedMemory的同一地址,而这个地址存储的值被修改,则需要用__syncthreads同步

注意事项:调用_syncthreads时,必须保证block内所有线程都会调用到这个函数,否则会出错