C++ SIMD性能优化

发布于:2024-06-17 ⋅ 阅读:(145) ⋅ 点赞:(0)
// 使用SIMD指令优化的向量加法
//<mmintrin.h> MMX
//<xmmintrin.h>	SSE
//<emmintrin.h>	SSE2
//<pmmintrin.h>	SSE3
//<tmmintrin.h>	SSSE3
//<smmintrin.h>	SSE4.1
//<nmmintrin.h> SSE4.2
//<wmmintrin.h> AES
//<immintrin.h>	AVX, AVX2, FMA, BMI, POPCNT, AVX512
//<x86intrin.h>	Auto(GCC)
//<intrin.h> Auto(MSVC)
#include <emmintrin.h> // 包含SSE2指令集
#include <valarray>
#include <iostream>
#include <chrono>
#include <vector>

/// Adds two 128-bit SSE2 integer vectors lane by lane.
///
/// Both operands are interpreted as four packed 32-bit integers and the
/// four sums are produced by a single `_mm_add_epi32`.
///
/// @param a  First operand (four packed 32-bit integers).
/// @param b  Second operand (four packed 32-bit integers).
/// @return   Vector whose lane k holds a[k] + b[k].
__m128i vector_add(__m128i a, __m128i b) {
    const __m128i lane_sums = _mm_add_epi32(a, b);
    return lane_sums;
}

/// Element-wise addition of two int arrays using SSE2: c[i] = a[i] + b[i].
///
/// Whole groups of four ints are added with one `_mm_add_epi32` each; any
/// remaining 1-3 elements are added one at a time, so exactly `size`
/// elements are read from `a`/`b` and written to `c` — never more.
///
/// The pointers do not need to be 16-byte aligned.
///
/// @param a     First input array, at least `size` elements.
/// @param b     Second input array, at least `size` elements.
/// @param c     Output array, at least `size` elements.
/// @param size  Number of elements to add; values <= 0 do nothing.
void add_vectors(int* a, int* b, int* c, int size) {
    int i = 0;

    // `size - i >= 4` instead of `i + 4 <= size` so the bound check itself
    // cannot overflow when size is close to INT_MAX.
    for (; size - i >= 4; i += 4) {
        // Unaligned load/store: the aligned variants fault on addresses that
        // are not 16-byte aligned, which an arbitrary int* does not guarantee.
        const __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a + i));
        const __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b + i));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(c + i), _mm_add_epi32(va, vb));
    }

    // Scalar tail for the last size % 4 elements. The addition is done in
    // unsigned arithmetic so overflow wraps like the SIMD lanes do instead
    // of being undefined behaviour.
    for (; i < size; ++i) {
        c[i] = static_cast<int>(static_cast<unsigned>(a[i]) + static_cast<unsigned>(b[i]));
    }
}

/// Demo driver: prints a std::valarray sum, then times SIMD addition
/// (add_vectors) against a plain scalar loop for a small (10) and a large
/// (10000) element count.
///
/// Both timings of a pair cover the same number of elements, so the two
/// printed durations are directly comparable. Durations are printed as raw
/// std::chrono::steady_clock tick counts.
///
/// @return Always 0.
int test() {
    // Element-wise addition as provided by the standard library.
    std::valarray<float> a = { 1.0f, 2.0f, 3.0f, 4.0f };
    std::valarray<float> b = { 5.0f, 6.0f, 7.0f, 8.0f };
    std::valarray<float> c = a + b;
    for (auto& element : c) {
        std::cout << element << " ";
    }
    std::cout << std::endl;

    // Builds two input vectors of `count` ints, then times the SIMD path and
    // the scalar path over all `count` elements and prints both durations.
    const auto compare = [](std::size_t count) {
        std::vector<int> lhs(count);
        std::vector<int> rhs(count);
        for (std::size_t i = 0; i < count; ++i) {
            lhs[i] = static_cast<int>(i);
            rhs[i] = static_cast<int>(i) + 1;
        }
        std::vector<int> sum(count);

        // add_vectors is handed only whole groups of four ints; the leftover
        // count % 4 elements are added right here. That way the call never
        // reaches past the end of the vectors, whatever add_vectors does
        // with a partial group.
        const std::size_t simd_count = count - count % 4;

        auto start = std::chrono::steady_clock::now();
        add_vectors(lhs.data(), rhs.data(), sum.data(), static_cast<int>(simd_count));
        for (std::size_t i = simd_count; i < count; ++i) {
            sum[i] = lhs[i] + rhs[i];
        }
        auto stop = std::chrono::steady_clock::now();
        std::cout << "simd cost " << (stop - start).count() << std::endl;

        // Scalar baseline over the same elements. Plain indexing is used so
        // the baseline is not slowed down by per-element bounds checks.
        start = std::chrono::steady_clock::now();
        for (std::size_t i = 0; i < count; ++i) {
            sum[i] = lhs[i] + rhs[i];
        }
        stop = std::chrono::steady_clock::now();
        std::cout << "cost " << (stop - start).count() << std::endl;
    };

    compare(10);     // small workload
    compare(10000);  // large workload

    return 0;
}

输出

6 8 10 12
simd cost 500
cost 400
simd cost 5700
cost 49200


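总结: 数据运算量小无效果,数据运算量大效果提升显著。注意:上面的输出是在 SIMD 调用的 size 参数为 4(只计算了前 4 个元素)、而普通循环计算了全部元素的情况下测得的,两组耗时并不直接可比;要得到可靠结论,两种方式必须处理相同数量的元素,并在开启编译优化后多次测量取平均。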

参考

GitHub - parallel101/simdtutor: x86-64 SIMD矢量优化系列教程

GitHub - google/highway: Performance-portable, length-agnostic SIMD with runtime dispatch


创作不易,小小的支持一下吧!


网站公告

今日签到

点亮在社区的每一天
去签到