标签:使用 tput 失败 whether debug ptime get unsigned deb
关于CPU - GPU交互的简单接口函数。
源代码:
1 //
2 // This sample illustrates the usage of CUDA events for both GPU timing and
3 // overlapping CPU and GPU execution. Events are inserted into a stream
4 // of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can
5 // perform computations while GPU is executing (including DMA memcopies
6 // between the host and device). CPU can query CUDA events to determine
7 // whether GPU has completed tasks.
8 //
9
// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA Runtime
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions
20
// Adds inc_value to one element of g_data per thread.
//
// Launch layout: 1D grid of 1D blocks; each thread handles the element at its
// flat global thread index.
// Precondition: there is no bounds guard, so the total number of launched
// threads must not exceed the number of elements in g_data.
//
//   g_data    - device pointer to the int array updated in place
//   inc_value - amount added to every element
__global__ void increment_kernel(int *g_data, int inc_value)
{
    // flat global index of this thread
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    g_data[tid] += inc_value;
}
26
// Host-side check that every one of the first n elements of data equals x.
//
//   data - host array to verify
//   n    - number of elements to check (n <= 0 checks nothing and succeeds)
//   x    - reference value each element is compared against
//
// Returns true when all n elements match; on the first mismatch prints the
// offending index, the value found and the reference value, then returns false.
bool correct_output(int *data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
    {
        if (data[i] != x)
        {
            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
            return false;
        }
    }

    return true;
}
39
// Demonstrates overlapping CPU work with asynchronous GPU work on stream 0.
//
// Queues a host-to-device copy, the increment kernel and a device-to-host copy
// between two CUDA events, then spins on the CPU (counting iterations) until
// the stop event has been reached. Prints the GPU time measured between the
// events, the CPU time spent issuing the calls and the spin count, verifies the
// result on the host, waits for a key press and exits with EXIT_SUCCESS or
// EXIT_FAILURE.
int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    int n = 16 * 1024 * 1024;
    int nbytes = n * sizeof(int);
    int value = 26;

    // allocate host memory with cudaMallocHost (released with cudaFreeHost below)
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);

    // allocate device memory; the 0xFF fill is overwritten by the H2D copy
    int *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration
    // NOTE: increment_kernel has no bounds guard, so n must be a multiple of
    // threads.x (16M / 512 divides evenly).
    dim3 threads = dim3(512, 1);
    dim3 blocks = dim3(n / threads.x, 1);

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;

    // asynchronously issue work to the GPU (all to stream 0); the CPU timer
    // only covers the time needed to enqueue these calls
    sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));
    checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    // a kernel launch returns no status; pick up launch-configuration errors here
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
    checkCudaErrors(cudaEventRecord(stop, 0));
    sdkStopTimer(&timer);

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;

    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }

    // also reports any error that made the polling loop stop early
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // print the cpu and gpu times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    // check the output for correctness: host data was 0, so every element
    // must now equal value
    bool bFinalResults = correct_output(a, n, value);

    // release resources
    sdkDeleteTimer(&timer);
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));

    // wait for a key press before exiting
    getchar();
    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
▶ 输出结果:
[D:\Code\CUDA\ProjectCUDA\x64\Debug\ProjectCUDA.exe] - Starting...
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1
CUDA device [GeForce GTX 1070]
time spent executing by the GPU: 11.65
time spent by CPU in CUDA calls: 0.04
CPU executed 5161 iterations while waiting for GPU to finish
▶ 新姿势:
● 调用主函数时的第0个参数作为程序名字符串,可以用于输出。
1 int main(int argc, char *argv[])
2 ...
3 printf("%s", argv[0]);
● 寻找最合适的CUDA设备?
inline int findCudaDevice(int argc, const char **argv)
使用中,直接使用主函数的参数,返回设备编号
int devID = findCudaDevice(argc, (const char **)argv);
● 在没有附加flag的情况下申请主机内存,注意使用cudaFreeHost释放
1 int *a, nbytes = n * sizeof(int);
2 cudaMallocHost((void **)&a, nbytes);
3 ...
4 cudaFreeHost(a);
● 记录CPU调用CUDA所用的时间
1 StopWatchInterface *timer = NULL;
2 sdkCreateTimer(&timer);
3 sdkResetTimer(&timer);
4 sdkStartTimer(&timer);
5
6 ...// 核函数调用
7
8 sdkStopTimer(&timer);
9 printf("%.2f ms", sdkGetTimerValue(&timer));
● 查看GPU队列状态的函数
extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
使用中,stop为放置到流中的一个事件,cudaEventQuery(stop)返回该事件的状态:等于cudaSuccess(枚举值,等于0)表示事件已经发生;等于cudaErrorNotReady(枚举值,在较早的CUDA版本中为34,在较新的版本中为600)表示事件尚未发生。源代码中利用这段等待时间让CPU空转,并记录了迭代次数。
while (cudaEventQuery(stop) == cudaErrorNotReady) counter++;
● <stdlib.h> 中关于返回成功和失败的宏
1 #define EXIT_SUCCESS 0
2 #define EXIT_FAILURE 1
0_Simple__asyncAPI
标签:使用 tput 失败 whether debug ptime get unsigned deb
原文地址:http://www.cnblogs.com/cuancuancuanhao/p/7723570.html