CUDA各种内存和使用方法-EW帮帮网

文章目录

在这里插入图片描述

1、全局内存

特点：容量最大，访问延时最大，所有线程都可以访问。 线性内存。
cudaMalloc() 动态分配内存
cudaFree() 释放内存
__device__ int buff[N]; 定义固定长度的静态全局内存变量（注意修饰符 __device__ 前后各为两个下划线）
动态内存使用cudaMemcpy 在host 和 device之间传输
静态内存 host 和 device 之间传输数据：
cudaMemcpyToSymbol() host 传到device
cudaMemcpyFromSymbol() device传到host

在这里插入图片描述

#include <cuda.h>
#include <cuda_runtime.h>
// Statically allocated global-memory variables, visible to every kernel in
// this translation unit.
__device__ int d_x = 1;   // increment applied by cudaOut
__device__ int d_y[2];    // filled from the host before the kernel runs

// Adds d_x to both elements of d_y.
// Every thread that runs this kernel performs the same unsynchronised
// updates, so it is meant to be launched with a single thread.
__global__ void cudaOut(void) {
	const int increment = d_x;
	for (int k = 0; k < 2; ++k) {
		d_y[k] += increment;
	}
}
// Round-trips a two-element array through the static global variable d_y:
// host -> d_y, one kernel launch that adds d_x to each element, d_y -> host.
// With h_y = {10, 20} and d_x = 1 the host array ends up as {11, 21}.
int main(void) {
	int h_y[2] = { 10,20 };
	// Static device symbols are addressed by symbol, not by pointer.
	CHECK(cudaMemcpyToSymbol(d_y, h_y, sizeof(int) * 2));
	cudaOut<<<1,1>>>();
	// A launch does not return a status; configuration errors are only
	// reported through cudaGetLastError().
	CHECK(cudaGetLastError());
	// Errors raised while the kernel executes surface at the next sync.
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaMemcpyFromSymbol(h_y, d_y, sizeof(int) * 2));
	return 0;
}

2、局部内存

特点：存储线程私有的局部变量，在寄存器内存不足（寄存器溢出）或局部数组无法放入寄存器时由编译器自动使用。局部内存在物理上位于全局内存中，因此访问延时与全局内存相当。

3、共享内存

特点：片上内存，访问速度快，线程块内共享。
使用：存储线程块中的共享数据，加速线程间的数据处理
每个SM的共享内存总量是固定的。也就是说，如果单个线程块分配了过多的共享内存，SM上可同时驻留的线程块数量（进而活跃线程束的数量）就会受到限制；合理分配单个线程块的共享内存，才能使SM的使用率最大化，起到加速的作用。

例如：blockSize = 128，一个SM最多可同时驻留2048个线程，那么一个SM最多能同时处理 2048 / 128 = 16 个block。如果SM有96KB的共享内存，则每个block最多可分配 96 / 16 = 6KB；一旦单个block使用的共享内存超过这个值，SM上能同时驻留的block数就会少于16个，其他block无法获得共享内存，占用率随之下降。

__syncthreads 通常用于协调同一块中线程间的通信。

// Pseudo-code template for staging data through shared memory.
// `parameters`, `data_size`, `copy_index`, `data` and `operations` are
// placeholders for algorithm-specific code.
__global__ void kernel_function(parameters) {
     // Assume the algorithm's design forces non-coalesced accesses to `data`.
     // 1. Declare the shared-memory buffer
     __shared__ float s_data [data_size];
     // 2. Copy `data` into shared memory
     s_data[copy_index] = data[copy_index];
     // 3. Wait until every thread in the block has finished its copy
     __syncthreads();
     // 4. Do the work (including the non-coalesced accesses) on the
     //    shared-memory copy rather than on global memory; otherwise the
     //    staging in steps 1-3 would serve no purpose.
     operations(s_data);
}

3.1 静态共享内存

// Static shared memory: the buffer size is fixed at compile time.
//
// Reverses the first n elements of d in place, staging them through shared
// memory.
// Launch layout: a single block with at least n threads (one per element).
// Precondition: n <= 64, the capacity of the shared buffer; for larger n no
// thread reads or writes anything.
__global__ void staticReverse(int* d, int n){
    constexpr int kCapacity = 64;
    __shared__ int s[kCapacity];
    const int t  = threadIdx.x;
    const int tr = n - t - 1;   // mirrored index
    // Guard the memory accesses but keep the barrier outside the branch:
    // __syncthreads() must be reached by every thread of the block.
    const bool active = (n <= kCapacity) && (t < n);
    if (active) s[t] = d[t];
    __syncthreads();
    // Write back in reversed order; without this store the kernel would
    // have no observable effect.
    if (active) d[t] = s[tr];
}
// Launch excerpt: d_d and n are not declared in this snippet and must be
// set up by surrounding code that is not shown here.
int main(void){
    // One block of n threads.
    staticReverse<<<1, n>>>(d_d, n);
}

3.2 动态共享内存

调用时指定共享内存大小
如果一个共享内存的大小在编译时是未知的, 则需要添加 extern修饰，在内核调用时动态分配。

// Reverses the first n elements of d in place, staging them through
// dynamically sized shared memory.
// Launch layout: a single block with at least n threads (one per element).
// Shared-memory requirement: at least n * sizeof(int) bytes, passed as the
// third launch-configuration argument.
__global__ void dynamicReverse(int* d, int n){
    // The size is unknown at compile time, hence `extern` and the empty bound.
    extern __shared__ int s[];
    const int t  = threadIdx.x;
    const int tr = n - t - 1;   // mirrored index
    // Guard the memory accesses but keep the barrier outside the branch:
    // __syncthreads() must be reached by every thread of the block.
    const bool active = (t < n);
    if (active) s[t] = d[t];
    // All loads must be complete before any thread reads another thread's slot.
    __syncthreads();
    if (active) d[t] = s[tr];
}
// Launch excerpt: d_d and n are not declared in this snippet and must be
// set up by surrounding code that is not shown here.
int main(void){
    // One block of n threads; the third launch argument is the number of
    // bytes of dynamic shared memory given to the block.
    dynamicReverse<<<1, n, n * sizeof(int)>>>(d_d, n);
}

4、纹理内存

特点：优化二维数据的访问，具有缓存加速
使用：适用图像处理和大规模数据访问
使用时关键在把内存绑定到纹理上。
使用实例：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include"math.h"

#define BLOCK_DIM 512

// Computes the Sobel response for one pixel from its 3x3 neighbourhood.
// The result is |horizontal gradient| + |vertical gradient|, saturated to
// the range of an unsigned char (0..255).
__device__ unsigned char ComputeSobel(
	unsigned char ul, // upper left
	unsigned char um, // upper middle
	unsigned char ur, // upper right
	unsigned char ml, // middle left
	unsigned char mm, // middle (unused)
	unsigned char mr, // middle right
	unsigned char ll, // lower left
	unsigned char lm, // lower middle
	unsigned char lr // lower right
){
	// Right column minus left column, centre row weighted twice.
	const int gradX = (ur + 2 * mr + lr) - (ul + 2 * ml + ll);
	// Top row minus bottom row, centre column weighted twice.
	const int gradY = (ul + 2 * um + ur) - (ll + 2 * lm + lr);
	// A sum of two absolute values is never negative, so only the upper
	// bound needs clamping.
	const int magnitude = abs(gradX) + abs(gradY);
	return (unsigned char)(magnitude > 0xff ? 0xff : magnitude);
}
// Applies the Sobel operator to an image read through a 2D texture object.
//
// Launch layout: one block per image row (blockIdx.x is the row index); the
// threads of a block stride across the w columns of that row.
// Parameters:
//   pSobelOriginal  output buffer in device memory
//   Pitch           distance in bytes between consecutive output rows
//   w, h            image width and height in pixels
//   tex             texture object holding the unsigned-char input image,
//                   sampled here with unnormalized coordinates
// Neighbour fetches at the image border fall outside the image; what they
// return is decided by the addressing mode configured on `tex`.
__global__ void SobelTex(
        unsigned char* pSobelOriginal,
        unsigned int Pitch,
	int w, int h,
        cudaTextureObject_t tex){

   const int row = (int)blockIdx.x;
   // Blocks beyond the last row have no output row to write. The condition
   // is uniform per block and the kernel uses no barriers.
   if (row >= h) return;

   // Row offset computed in size_t so large images cannot overflow 32 bits.
   unsigned char* pSobel = pSobelOriginal + (size_t)row * Pitch;
   const float y = (float)row;

   for (int i = threadIdx.x; i < w; i += blockDim.x){
      const float x = (float)i;
      // `unsigned char` is spelled out: `uchar` is not declared by CUDA.
      unsigned char pix00 = tex2D<unsigned char>(tex, x - 1.0f, y - 1.0f);
      unsigned char pix01 = tex2D<unsigned char>(tex, x,        y - 1.0f);
      unsigned char pix02 = tex2D<unsigned char>(tex, x + 1.0f, y - 1.0f);
      unsigned char pix10 = tex2D<unsigned char>(tex, x - 1.0f, y);
      unsigned char pix11 = tex2D<unsigned char>(tex, x,        y);
      unsigned char pix12 = tex2D<unsigned char>(tex, x + 1.0f, y);
      unsigned char pix20 = tex2D<unsigned char>(tex, x - 1.0f, y + 1.0f);
      unsigned char pix21 = tex2D<unsigned char>(tex, x,        y + 1.0f);
      unsigned char pix22 = tex2D<unsigned char>(tex, x + 1.0f, y + 1.0f);
      pSobel[i] = ComputeSobel(pix00, pix01, pix02,
			       pix10, pix11, pix12,
			       pix20, pix21, pix22);
   }
}

// Runs the Sobel filter on an 8-bit single-channel image.
//
// Parameters:
//   data    host pointer to the input image, iw * ih bytes, tightly packed
//   result  host pointer receiving the filtered image, iw * ih bytes
//   iw, ih  image width and height in pixels
// On invalid arguments or on any CUDA failure a message is written to
// stderr and the function returns after releasing whatever it acquired;
// `result` is only written to by the final device-to-host copy.
extern "C" void sobelFilter(unsigned char* data,
                            unsigned char* result,
                            int iw, int ih){
	if (data == NULL || result == NULL || iw <= 0 || ih <= 0) {
		fprintf(stderr, "sobelFilter: invalid arguments\n");
		return;
	}

	const size_t rowBytes   = (size_t)iw * sizeof(unsigned char);
	const size_t imageBytes = rowBytes * (size_t)ih;

	// Everything that needs releasing is declared up front so the single
	// cleanup section below can run after any failure.
	cudaArray*          cuArray  = NULL;
	cudaTextureObject_t texObj   = 0;
	bool                texReady = false;
	unsigned char*      d_result = NULL;
	cudaError_t         err      = cudaSuccess;

	do {
		// 1. Allocate a CUDA array in device memory
		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
		err = cudaMallocArray(&cuArray, &channelDesc, iw, ih);
		if (err != cudaSuccess) break;

		// 2. Copy the host image into the CUDA array
		err = cudaMemcpy2DToArray(cuArray, 0, 0,
		                          data,
		                          rowBytes,
		                          rowBytes, ih, cudaMemcpyHostToDevice);
		if (err != cudaSuccess) break;

		// 3. Describe the resource backing the texture
		cudaResourceDesc resDesc;
		memset(&resDesc, 0, sizeof(cudaResourceDesc));
		resDesc.resType = cudaResourceTypeArray;
		resDesc.res.array.array = cuArray;

		// 4. Describe how the texture is sampled.
		// Wrap/Mirror addressing is only supported with normalized
		// coordinates; with unnormalized coordinates the runtime switches
		// it to Clamp, so Clamp is requested explicitly for both axes.
		cudaTextureDesc texDesc;
		memset(&texDesc, 0, sizeof(cudaTextureDesc));
		texDesc.normalizedCoords = false;
		texDesc.filterMode       = cudaFilterModePoint;
		texDesc.addressMode[0]   = cudaAddressModeClamp;
		texDesc.addressMode[1]   = cudaAddressModeClamp;
		texDesc.readMode         = cudaReadModeElementType;

		// 5. Create the texture object
		err = cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
		if (err != cudaSuccess) break;
		texReady = true;

		// 6. Allocate device memory for the result
		err = cudaMalloc((void**)&d_result, imageBytes);
		if (err != cudaSuccess) break;

		// 7. Process: one block per row, 384 threads striding over columns
		SobelTex<<<ih, 384>>>(d_result, (unsigned int)rowBytes, iw, ih, texObj);
		err = cudaGetLastError();   // launch-configuration errors
		if (err != cudaSuccess) break;

		// 8. Copy the result back to the host. This blocking copy also
		// reports errors raised while the kernel was executing.
		err = cudaMemcpy(result, d_result,
		                 imageBytes,
		                 cudaMemcpyDeviceToHost);
	} while (0);

	if (err != cudaSuccess) {
		fprintf(stderr, "sobelFilter: CUDA error: %s\n", cudaGetErrorString(err));
	}

	// 9. Release resources. The order matters: destroy the texture object
	// before freeing the array it refers to.
	if (texReady) cudaDestroyTextureObject(texObj);
	if (cuArray != NULL) cudaFreeArray(cuArray);
	if (d_result != NULL) cudaFree(d_result);
}

5、常量内存

特点：存储只读数据，访问速度快，广播式访问。数量有限，最多64KB
使用：频繁访问的常量数据，所有线程块都能访问，全局可见。
使用 __constant__ 修饰（前后各为两个下划线）
主机端只能通过 cudaMemcpyToSymbol() 或 cudaMemcpyToSymbolAsync() 向常量内存传输数据

// Number of elements in each constant-memory array below.
const int N = 4;
// Aggregate type used to show that a whole struct can live in constant memory.
struct ConstStruct {
    float array[N];      // N-element array
    float singleValue;   // scalar
};
// Constant-memory variables. They are declared at file scope with
// __constant__ and filled from the host via cudaMemcpyToSymbol().
__constant__ float d_constArray[N];    // array
__constant__ float d_singleValue;      // single scalar
__constant__ ConstStruct d_constStruct; // struct
// 定义核函数
// Combines values held in constant memory.
// For each thread index i < N within the block:
//   d_result[i]   = d_constArray[i] + d_singleValue
//   d_result_s[i] = d_constStruct.array[i] + d_constStruct.singleValue
// Only threadIdx.x is used for indexing.
__global__ void kernelFunction(float* d_result, float* d_result_s) {
    const int tid = threadIdx.x;
    // Threads past the N constant-memory elements have nothing to do.
    if (tid >= N) {
        return;
    }
    // Sum of the stand-alone constant array element and the constant scalar.
    const float fromVariables = d_constArray[tid] + d_singleValue;
    // Same computation on the fields of the constant-memory struct.
    const float fromStruct = d_constStruct.array[tid] + d_constStruct.singleValue;
    d_result[tid]   = fromVariables;
    d_result_s[tid] = fromStruct;
}
// Excerpt: h_array, h_singleValue, h_constStruct (host data) and d_result,
// d_result_s (device output buffers) are not declared in this snippet and
// must be set up by surrounding code that is not shown here.
int main() {
    // Copy the host data into constant memory
    CHECK(cudaMemcpyToSymbol(d_constArray, h_array, N * sizeof(float)));
    CHECK(cudaMemcpyToSymbol(d_singleValue, &h_singleValue, sizeof(float)));
    CHECK(cudaMemcpyToSymbol(d_constStruct, &h_constStruct, sizeof(ConstStruct)));
    // One block of N threads, one thread per array element
    kernelFunction<<<1, N>>>(d_result, d_result_s);
    // A launch does not return a status; configuration errors are only
    // reported through cudaGetLastError().
    CHECK(cudaGetLastError());
    // Errors raised while the kernel executes surface at the next sync.
    CHECK(cudaDeviceSynchronize());
    return 0;
}

6、寄存器内存

特点：片上内存，访问速度最快
使用：局部变量

7、用CUDA运行时API函数查询设备

该段介绍用 CUDA 运行时 API 函数查询所用 GPU 的规格，可以通过以下代码查看显卡的信息：

#include <iostream>
#include <cuda_runtime.h>
#include "error_check.cuh"
// Prints selected properties of one CUDA device.
// Device 0 is queried unless another index is given as the first
// command-line argument (useful on machines with more than one GPU).
int main(int argc, char* argv[]) {
	const int dev = (argc > 1) ? atoi(argv[1]) : 0;
	CHECK(cudaSetDevice(dev));

	cudaDeviceProp info;
	CHECK(cudaGetDeviceProperties(&info, dev));

	// Divisors used to scale byte counts for display.
	const double kib = 1024.0;
	const double gib = 1024.0 * 1024 * 1024;

	// Identification
	printf("Device id: %d\n", dev);
	printf("Device name: %s\n", info.name);
	printf("Compute capability: %d.%d\n", info.major, info.minor);

	// Global and constant memory
	printf("Amount of global memory: %g GB\n", info.totalGlobalMem / gib);
	printf("Amount of constant memory: %g KB\n", info.totalConstMem / kib);

	// Launch-configuration limits
	printf("Maximum grid size: %d %d %d\n",
	       info.maxGridSize[0], info.maxGridSize[1], info.maxGridSize[2]);
	printf("Maximum block size: %d %d %d\n",
	       info.maxThreadsDim[0], info.maxThreadsDim[1], info.maxThreadsDim[2]);
	printf("Number of SMs: %d\n", info.multiProcessorCount);

	// Shared memory: limit for a single block, then the total per SM
	printf("Maximum amount of shared memory per block: %g KB\n",
	       info.sharedMemPerBlock / kib);
	printf("Maximum amount of shared memory per SM: %g KB\n",
	       info.sharedMemPerMultiprocessor / kib);

	// Registers: limit for a single block, then the total per SM
	// (integer division, printed in units of 1024 registers)
	printf("Maximum number of registers per block: %d K\n",
	       info.regsPerBlock / 1024);
	printf("Maximum number of registers per SM: %d K\n",
	       info.regsPerMultiprocessor / 1024);

	// Threads: limit for a single block, then the number resident per SM
	printf("Maximum number of threads per block: %d\n", info.maxThreadsPerBlock);
	printf("Maximum number of threads per SM: %d\n", info.maxThreadsPerMultiProcessor);
	return 0;
}

CUDA 错误检测

// Wraps a CUDA runtime call: evaluates `call` once and, if it did not return
// cudaSuccess, prints the file, line, numeric error code and error string,
// then terminates the process with exit status 1.
// The do { ... } while(0) wrapper makes the macro behave like a single
// statement, so `CHECK(x);` is safe inside an unbraced if/else.
// Every continuation backslash must be the very last character on its line;
// trailing whitespace after it ends the macro definition early.
#define CHECK(call)                                                     \
do                                                                      \
{                                                                       \
    const cudaError_t error_code = (call);                              \
    if (error_code != cudaSuccess)                                      \
    {                                                                   \
        printf("CUDA ERROR:\n");                                        \
        printf("    FILE:   %s\n", __FILE__);                           \
        printf("    LINE:   %d\n", __LINE__);                           \
        printf("    ERROR CODE: %d\n", (int)error_code);                \
        printf("    ERROR TEXT: %s\n", cudaGetErrorString(error_code)); \
        exit(1);                                                        \
    }                                                                   \
} while(0)

CUDA各种内存和使用方法