CUDA Stream Test
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA End User License Agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


#include <cstdio>
#include <cstdlib>

#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Number of ints processed per chunk (one kernel launch per chunk).
#define N (1024 * 1024)
// Total number of ints in each host buffer: 20 chunks of N.
#define FULL_DATA_SIZE (N * 20)


/*
 * For each element idx < N, averages a[idx] with two other elements of a,
 * does the same for b, and stores the mean of the two averages in c[idx].
 *
 * Launch layout: 1D grid of 1D blocks; main launches N/256 blocks of 256
 * threads so that every idx in [0, N) is covered exactly once.
 * Shared memory: none.
 *
 * a, b : device input buffers holding at least N ints
 * c    : device output buffer holding at least N ints
 */
__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // The two extra samples wrap modulo 256, so they always come from
        // the first 256 elements of the buffer, not from idx's neighbours.
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        // NOTE(review): the three ints are summed as int before the float
        // division; with large rand() values this sum can overflow. Kept
        // as-is because the program is only used for timing -- confirm.
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}


/*
 * Times the processing of FULL_DATA_SIZE ints in chunks of N, using a single
 * CUDA stream for the host-to-device copies, the kernel, and the
 * device-to-host copy of every chunk. Prints the elapsed time in
 * milliseconds and returns 0. Returns 0 early, after printing a message,
 * when the device reports that it cannot overlap copies with kernels.
 */
int main(void) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    // NOTE(review): deviceOverlap is reported as deprecated in newer CUDA
    // toolkits in favour of asyncEngineCount -- confirm for the toolkit in use.
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaStream_t stream;
    int *host_a, *host_b, *host_c;
    int *dev_a, *dev_b, *dev_c;

    // create the timing events
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // initialize the stream
    HANDLE_ERROR(cudaStreamCreate(&stream));

    // allocate one chunk-sized buffer per array on the GPU
    HANDLE_ERROR(cudaMalloc((void **)&dev_a, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_b, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_c, N * sizeof(int)));

    // allocate the full-size host buffers with cudaHostAlloc (page-locked
    // host memory), which the asynchronous copies below operate on
    HANDLE_ERROR(cudaHostAlloc((void **)&host_a,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void **)&host_b,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void **)&host_c,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));

    // fill the inputs with pseudo-random values
    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // now loop over the full data, in chunks of N elements
    for (int i = 0; i < FULL_DATA_SIZE; i += N) {
        // asynchronously copy the current chunk of both inputs to the device
        HANDLE_ERROR(cudaMemcpyAsync(dev_a, host_a + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b, host_b + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream));

        kernel<<<N / 256, 256, 0, stream>>>(dev_a, dev_b, dev_c);
        // a launch does not return a status itself; pick up launch errors here
        HANDLE_ERROR(cudaGetLastError());

        // asynchronously copy the computed chunk back into the host result
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c,
                                     N * sizeof(int),
                                     cudaMemcpyDeviceToHost,
                                     stream));
    }
    // wait until everything queued in the stream has completed
    HANDLE_ERROR(cudaStreamSynchronize(stream));

    HANDLE_ERROR(cudaEventRecord(stop, 0));

    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
                                      start, stop));
    printf("Time taken:  %3.1f ms\n", elapsedTime);

    // cleanup the stream, events and memory
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream));

    return 0;
}
Download the packaged project
CUDA stream test = basic_single_stream