矩阵相加CUDA实现
// CUDA implementation of element-wise matrix addition.
// Author: Eric Lv
// Email: Eric2014_Lv@sjtu.edu.cn
// Date: 6/7/2017
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
//#include <cuda.h>

// Side length of the square matrices handled by this program.
#define N 32

/*
 * Element-wise addition of two N x N integer matrices: c = a + b.
 *
 * Launch layout: any 2D grid of 2D blocks whose threads cover at least
 * N x N positions; the x dimension maps to columns and the y dimension to
 * rows. Threads that fall outside the matrix do nothing, so the grid does
 * not have to divide N evenly. No shared memory is used.
 *
 * a, b: input matrices (device memory, N x N ints each).
 * c:    output matrix (device memory, N x N ints).
 */
__global__ void matrix_add(const int a[][N], const int b[][N], int c[][N])
{
    // threadIdx.x selects the column so that neighbouring threads of a warp
    // access neighbouring ints of the same row (contiguous addresses).
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against threads launched past the matrix edge.
    if (row < N && col < N)
    {
        c[row][col] = a[row][col] + b[row][col];
    }
}

/*
 * Fills two N x N host matrices, adds them on the GPU and verifies the
 * result on the CPU.
 *
 * Returns 0 when every element matches, -1 on any allocation, copy, launch
 * or verification failure. All host and device buffers are released on
 * every exit path.
 */
int main(void)
{
    int i;
    int status = -1;  // pessimistic default; set to 0 only after verification
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    int *host_a = NULL, *host_b = NULL, *host_c = NULL;
    cudaError_t err = cudaSuccess;
    const size_t bytes = sizeof(int) * N * N;

    // Thread layout: 16 x 16 threads per block (well below the per-block
    // limit), with enough blocks to cover the whole N x N matrix.
    const int tile = 16;
    const dim3 threads_in_block(tile, tile);
    const dim3 blocks_in_grid((N + tile - 1) / tile, (N + tile - 1) / tile);

    host_a = (int *)malloc(bytes);
    host_b = (int *)malloc(bytes);
    host_c = (int *)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL)
    {
        printf("Host malloc is failed!\n");
        goto cleanup;
    }

    err = cudaMalloc((void **)&dev_a, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (a) is failed!\n");
        goto cleanup;
    }

    err = cudaMalloc((void **)&dev_b, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (b) is failed!\n");
        goto cleanup;
    }

    err = cudaMalloc((void **)&dev_c, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (c) is failed!\n");
        goto cleanup;
    }

    // Deterministic test data so the result can be re-computed on the host.
    for (i = 0; i < N * N; i++)
    {
        host_a[i] = 2 * i + 1;
        host_b[i] = 3 * i - 1;
    }

    err = cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        printf("Host to device (a) is failed!\n");
        goto cleanup;
    }

    err = cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        printf("Host to device (b) is failed!\n");
        goto cleanup;
    }

    // Launch the kernel on the GPU; the flat buffers are viewed as N-wide rows.
    matrix_add<<<blocks_in_grid, threads_in_block>>>((int (*)[N])dev_a,
                                                     (int (*)[N])dev_b,
                                                     (int (*)[N])dev_c);

    // A launch does not return an error itself; query it explicitly.
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("Kernel launch is failed: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    // Blocking copy: also waits for the kernel to finish.
    err = cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        printf("Device to host (c) is failed!\n");
        goto cleanup;
    }

    // Verify every element against the host-side sum.
    for (i = 0; i < N * N; i++)
    {
        if (host_a[i] + host_b[i] != host_c[i])
        {
            printf("a[%d]%d + b[%d]%d != c[%d]%d.\n", i, host_a[i], i, host_b[i], i, host_c[i]);
            goto cleanup;
        }
    }

    printf("Congratulations! All entris are correct! You have finished the CUDA code!\n");
    status = 0;

cleanup:
    // free(NULL) and cudaFree(NULL) are no-ops, so partially completed
    // set-ups are released safely.
    free(host_a);
    free(host_b);
    free(host_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return status;
}