#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// Element-wise product kernel: each launched thread computes one
// c[i] = a[i] * b[i].
//
// Launch layout: any grid/block shape. The launch is viewed as a 6-D tensor of
// shape
//   gridDim.z x gridDim.y x gridDim.x x blockDim.z x blockDim.y x blockDim.x
// and a thread's row-major offset inside that tensor is the element index it
// processes.
//
// Precondition: there is no bounds check, so a, b and c must each hold at
// least as many floats as the total number of launched threads.
__global__ void compute(float* a,float* b,float* c){
    // Extents of the five innermost dimensions. The outermost extent
    // (gridDim.z) is not needed to form a row-major offset.
    const size_t d1 = gridDim.y;
    const size_t d2 = gridDim.x;
    const size_t d3 = blockDim.z;
    const size_t d4 = blockDim.y;
    const size_t d5 = blockDim.x;

    // This thread's coordinate along each of the six dimensions,
    // outermost first.
    const size_t p0 = blockIdx.z;
    const size_t p1 = blockIdx.y;
    const size_t p2 = blockIdx.x;
    const size_t p3 = threadIdx.z;
    const size_t p4 = threadIdx.y;
    const size_t p5 = threadIdx.x;

    // Row-major flattening of (p0..p5). Accumulated in size_t so that launches
    // with more threads than fit in a 32-bit int do not overflow the index.
    const size_t position =
        ((((p0 * d1 + p1) * d2 + p2) * d3 + p3) * d4 + p4) * d5 + p5;

    c[position] = a[position] * b[position];
}
// Multiplies two 3-element host arrays element-wise on the GPU and prints the
// products, one per line. Returns 0 on success and 1 if any CUDA runtime call
// or the kernel launch reports an error (the error text goes to std::cerr).
int main(int argc, char const *argv[])
{
    const int num = 3;
    float a[num] = {1, 2, 3};
    float b[num] = {5, 7, 9};
    float c[num] = {0};
    const size_t size_array = sizeof(c);

    float* device_a = nullptr;
    float* device_b = nullptr;
    float* device_c = nullptr;

    // Frees all three device buffers and returns the first error seen.
    // Pointers are reset so that a second call is harmless: cudaFree on a
    // null pointer performs no operation.
    auto release = [&]() -> cudaError_t {
        const cudaError_t status_a = cudaFree(device_a);
        const cudaError_t status_b = cudaFree(device_b);
        const cudaError_t status_c = cudaFree(device_c);
        device_a = nullptr;
        device_b = nullptr;
        device_c = nullptr;
        if (status_a != cudaSuccess) return status_a;
        if (status_b != cudaSuccess) return status_b;
        return status_c;
    };

    // Returns true on cudaSuccess; otherwise reports the failure, releases
    // whatever has been allocated so far, and returns false.
    auto ok = [&](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        release();
        return false;
    };

    if (!ok(cudaMalloc(&device_a, size_array), "cudaMalloc(device_a)")) return 1;
    if (!ok(cudaMalloc(&device_b, size_array), "cudaMalloc(device_b)")) return 1;
    if (!ok(cudaMalloc(&device_c, size_array), "cudaMalloc(device_c)")) return 1;

    if (!ok(cudaMemcpy(device_a, a, size_array, cudaMemcpyHostToDevice),
            "cudaMemcpy(a -> device_a)")) return 1;
    if (!ok(cudaMemcpy(device_b, b, size_array, cudaMemcpyHostToDevice),
            "cudaMemcpy(b -> device_b)")) return 1;

    // One block of exactly `num` threads: the kernel has no bounds check, so
    // the thread count must equal the element count.
    compute<<<1, num>>>(device_a, device_b, device_c);

    // A kernel launch returns nothing; launch-configuration errors are only
    // visible through cudaGetLastError().
    if (!ok(cudaGetLastError(), "compute kernel launch")) return 1;

    // Errors raised while the kernel runs are reported by this blocking copy.
    if (!ok(cudaMemcpy(c, device_c, size_array, cudaMemcpyDeviceToHost),
            "cudaMemcpy(device_c -> c)")) return 1;

    // Iterate as float: binding the elements to an int would truncate any
    // fractional part of the results before printing.
    for (float value : c) {
        std::cout << value << std::endl;
    }

    if (!ok(release(), "cudaFree")) return 1;
    return 0;
}