در این مطلب، جمع دو ماتریس به کمک کودا (CUDA) به همراه محاسبهٔ زمان انجام فرایند آمده است:
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <ctime>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 100
#define BLOCK_DIM 10
// Element-wise sum of two N x N int matrices: c = a + b.
//
// Each thread handles at most one element. Its global x position selects the
// column and its global y position selects the row; the element is addressed
// with row-major flat indexing (row * N + column). Threads whose position
// falls outside the N x N range return without touching memory, so the grid
// may be larger than the matrix.
//
// a, b : input matrices (read only by this kernel)
// c    : output matrix
__global__ void matrixAdd(int *a, int *b, int *c) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard first: surplus threads at the grid edge have nothing to do.
    if (x >= N || y >= N) {
        return;
    }

    const int flat = y * N + x;
    c[flat] = a[flat] + b[flat];
}
// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the operation being checked and is echoed in the message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills two N x N matrices with rand() values, adds them on the GPU with
// matrixAdd, prints the resulting matrix, and reports the elapsed clock()
// ticks and the equivalent number of seconds. Waits for a key press before
// exiting. Returns 0 on success; any CUDA failure terminates the program
// with EXIT_FAILURE via checkCuda.
int main() {
    using namespace std;

    // The measured span covers everything up to and including the frees:
    // host initialisation, device allocation, transfers, the kernel, and
    // printing the result. Note that clock() reports processor time, which
    // may differ from wall-clock time.
    const clock_t startTicks = clock();

    // Static storage keeps the three N x N arrays off the stack.
    static int a[N][N], b[N][N], c[N][N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t size = sizeof(int) * N * N;

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = rand();
            b[i][j] = rand();
        }
    }

    checkCuda(cudaMalloc((void**)&dev_a, size), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void**)&dev_b, size), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc((void**)&dev_c, size), "cudaMalloc(dev_c)");

    checkCuda(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice), "copy b to device");

    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    // dim3 members are unsigned int, hence %u.
    printf("dimBlock.x = %u, dimBlock.y = %u\n", dimBlock.x, dimBlock.y);

    // Ceiling division so the grid covers the matrix even when N is not a
    // multiple of the block dimension; the kernel's bounds check discards
    // the surplus threads.
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);
    printf("dimGrid.x = %u, dimGrid.y = %u\n", dimGrid.x, dimGrid.y);

    matrixAdd<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
    // A launch returns no status itself: configuration errors are reported by
    // cudaGetLastError(), execution errors by the following synchronisation.
    checkCuda(cudaGetLastError(), "matrixAdd launch");
    checkCuda(cudaDeviceSynchronize(), "matrixAdd execution");

    checkCuda(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost), "copy c to host");

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d\t", c[i][j]);
        }
        printf("\n");
    }

    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");

    const clock_t elapsedTicks = clock() - startTicks;
    cout << "It took me " << elapsedTicks << " clicks and " << ((float)elapsedTicks) / CLOCKS_PER_SEC << " seconds" << endl;

    // Keep the console window open until the user presses a key.
    getchar();
    return 0;
}
که خروجی آن مطابق شکل زیر است:
زمان انجام محاسبه برابر 1.743 ثانیه است که معادل حدود 0.03 دقیقه می‌باشد.