CUDA 计时器编程
在进行 CUDA 编程时,需要利用计时方法查看程序运行速度。本文是一个简单实现。
在进行 CUDA 编程时,需要利用计时方法查看程序运行速度。
首先给出头文件 gputimer.h
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__
struct GpuTimer
{
cudaEvent_t start;
cudaEvent_t stop;
GpuTimer()
{
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
~GpuTimer()
{
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
void Start()
{
cudaEventRecord(start, 0);
}
void Stop()
{
cudaEventRecord(stop, 0);
}
float Elapsed()
{
float elapsed;
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
return elapsed;
}
};
#endif /* __GPU_TIMER_H__ */
通用用法
GpuTimer timer;
timer.Start();
// launch the kernal
kernal<<<1, ARRAY_SIZE>>>(d_out, d_in);
timer.Stop();
printf("Time elapsed = %g ms\n", timer.Elapsed()); // 输出
实际运用,计算 1000 个数的平方
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "gputimer.h"
#include <stdio.h>
#include <stdlib.h>
__global__ void square(float* d_out, float* d_in) {
int idx = threadIdx.x;
float f = d_in[idx];
d_out[idx] = f * f;
}
int main() {
GpuTimer timer;
const int ARRAY_SIZE = 1000;
const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
// generate the input array on the host
float h_in[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++) {
h_in[i] = float(i);
}
float h_out[ARRAY_SIZE];
// declare GPU memory pointers
float* d_in;
float* d_out;
// allocate GPU memory
cudaMalloc((void **)&d_in, ARRAY_BYTES);
cudaMalloc((void**)&d_out, ARRAY_BYTES);
// transfer the array to the GPU
cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
timer.Start();
// launch the kernal
square<<<1, ARRAY_SIZE>>>(d_out, d_in);
timer.Stop();
// copy back the result array to the CPU
cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
// print out the resulting array
for (int i = 0; i < ARRAY_SIZE; i++) {
printf("%f", h_out[i]);
printf(((i % 4) != 3) ? "\t" : "\n");
}
printf("Time elapsed = %g ms\n", timer.Elapsed());
// free GPU memory allocation
cudaFree(d_in);
cudaFree(d_out);
system("pause");
return 0;
}
运行结果: