CUDA 的一个简单的逐元素乘法运算

ruijiege / 2023-08-31 / 原文

#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Element-wise product kernel: each thread handles one index i and stores
// c[i] = a[i] * b[i].
//
// Launch layout: any 1D/2D/3D grid of 1D/2D/3D blocks. The launch is viewed as
// a row-major tensor of shape
//   gridDim.z x gridDim.y x gridDim.x x blockDim.z x blockDim.y x blockDim.x
// and i is the thread's flat offset inside that tensor. For a purely 1D launch
// this reduces to blockIdx.x * blockDim.x + threadIdx.x.
//
// Parameters:
//   a, b  input arrays, read on the device.
//   c     output array, written on the device.
//   n     number of valid elements. Threads whose index is >= n do nothing, so
//         the launch may be rounded up past the array length. The default
//         (largest size_t) disables the guard, which keeps existing
//         three-argument launches behaving as before; such launches must start
//         exactly as many threads as there are elements.
//
// No shared memory is used and no inter-thread synchronization is required.
__global__ void compute(const float* a, const float* b, float* c,
                        size_t n = static_cast<size_t>(-1)){
    // All index math is done in size_t: the product of grid and block extents
    // can exceed the range of a 32-bit int on large launches.
    const size_t block_rank =
        (static_cast<size_t>(blockIdx.z) * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
    const size_t thread_rank =
        (static_cast<size_t>(threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
    const size_t threads_per_block =
        static_cast<size_t>(blockDim.z) * blockDim.y * blockDim.x;

    const size_t position = block_rank * threads_per_block + thread_rank;

    // Skip the tail threads of a launch that is larger than the data.
    if (position < n) {
        c[position] = a[position] * b[position];
    }
}

// Demo driver: multiplies two 3-element host arrays element-wise on the GPU
// with the `compute` kernel and prints each product on its own line.
//
// Returns 0 on success and 1 if any CUDA call fails (the failing call and the
// CUDA error string are written to stderr).
int main(int argc, char const *argv[])
{
    // Command-line arguments are not used by this demo.
    (void)argc;
    (void)argv;

    const int num = 3;
    float a[num] = {1, 2, 3};
    float b[num] = {5, 7, 9};
    float c[num] = {0};

    // All three arrays have the same element type and length, so one byte
    // count serves every allocation and copy.
    const size_t size_array = sizeof(c);
    float* device_a = nullptr;
    float* device_b = nullptr;
    float* device_c = nullptr;

    // Reports a failed CUDA call and tells the caller whether to keep going.
    auto ok = [](cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // && short-circuits, so the first failure stops the remaining steps.
    bool success =
        ok(cudaMalloc(&device_a, size_array), "cudaMalloc(device_a)") &&
        ok(cudaMalloc(&device_b, size_array), "cudaMalloc(device_b)") &&
        ok(cudaMalloc(&device_c, size_array), "cudaMalloc(device_c)") &&
        ok(cudaMemcpy(device_a, a, size_array, cudaMemcpyHostToDevice), "cudaMemcpy(a -> device_a)") &&
        ok(cudaMemcpy(device_b, b, size_array, cudaMemcpyHostToDevice), "cudaMemcpy(b -> device_b)");

    if (success) {
        // One block with exactly one thread per element.
        compute<<<1, num>>>(device_a, device_b, device_c);

        // A kernel launch returns nothing; configuration errors are fetched
        // with cudaGetLastError(). The blocking copy that follows waits for
        // the kernel and reports any error raised while it ran.
        success =
            ok(cudaGetLastError(), "compute launch") &&
            ok(cudaMemcpy(c, device_c, size_array, cudaMemcpyDeviceToHost), "cudaMemcpy(device_c -> c)");
    }

    // Release device memory on every path; cudaFree on a null pointer is a no-op.
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);

    if (!success) {
        return 1;
    }

    // Iterate as float so a fractional product would not be truncated.
    for (float value : c) {
        std::cout << value << std::endl;
    }
    return 0;
}