CUDA C
References
- http://en.wikipedia.org/wiki/GeForce_400_Series
- http://developer.nvidia.com/object/gpucomputing.html
- http://developer.nvidia.com/nvidia-gpu-computing-documentation

Installation
Download
Download the appropriate files from http://developer.nvidia.com/cuda-downloads :
- NVIDIA developer driver
- CUDA Toolkit
- CUDA SDK

NOTE: Matlab 2012a does not work with CUDA versions higher than 4.0!
Install the driver
- First, uninstall any NVIDIA drivers that are part of your Linux distribution:

sudo apt-get --purge remove nvidia-*

- Next, make sure the nvidia module won't get loaded when you reboot:

cd /etc/init
mv gdm.conf gdm.conf.hidden

- Reboot.
- Log into a terminal, cd to the dev driver file, and run it:

sudo sh devdriver_4.2_linux_64_295.41.run

- After the installation succeeds, re-enable video (the file is called lightdm.conf in Ubuntu 12.04):

cd /etc/init
mv gdm.conf.hidden gdm.conf

- Reboot again to load your updated video driver.

Install the CUDA software
- Uninstall any previous versions of the CUDA Toolkit and the GPU Computing SDK:

sudo rm -rf /usr/local/cuda
rm -rf ~/programs/NVIDIA_GPU_Computing_SDK

- Run the CUDA toolkit installer as root:

sudo sh cudatoolkit_4.2.9_linux_64_ubuntu10.04.run

- Make sure PATH is set correctly in /etc/profile:

PATH=$PATH:/usr/local/cuda/bin

- As root, create the file /etc/ld.so.conf.d/cuda.conf and edit it to look like this:

/usr/local/cuda/lib64
/usr/local/cuda/lib

Then run this command to add these directories to the library path (Ubuntu no longer uses the LD_LIBRARY_PATH environment variable):

sudo ldconfig -v

- Install the SDK as a regular user:

sh gpucomputingsdk_4.2.9_linux.run

- Compile the SDK examples:

cd ~/programs/NVIDIA_GPU_Computing_SDK/C
make
Program Example
#include <stdio.h>
#define N 5
__global__ void VecSub(float* A, float* B, float* C){
	// global thread index; guard against the extra threads in a partially filled block
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i < N)
		C[i] = A[i] - B[i];
}
int main(){
	size_t size = N * sizeof(float);
	// Allocate vectors in host memory
	float* h_A = (float*)malloc(size);
	float* h_B = (float*)malloc(size);
	float* h_C = (float*)malloc(size);
	// Initialize input vectors
	for (int i = 0; i < N; i++){
		h_B[i] = 2*N - i;
		h_A[i] = 2*h_B[i] + i;
	}
	// Allocate vectors in device memory
	float* d_A; cudaMalloc(&d_A, size);
	float* d_B; cudaMalloc(&d_B, size);
	float* d_C; cudaMalloc(&d_C, size);
	// Copy vectors from host memory to device memory
	cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
	// Invoke kernel
	int threadsPerBlock = 256;
	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
	VecSub<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
	// Copy result from device memory to host memory
	// h_C contains the result in host memory
	cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
	// Free device memory
	cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
	// print result
	for (int i = 0; i < N; i++){
		printf("%f - %f = %f\n", h_A[i], h_B[i], h_C[i]);
	}
	// Free host memory
	free(h_A); free(h_B); free(h_C);
}
Compiling
- compile with nvcc
- programs that rely on glut need to say so:

nvcc -lglut ripple.cu

- To require/optimize for compute capability 2.1 (for example):

nvcc -arch=sm_21
Indexing
- An NVIDIA GPU has a handful of multiprocessors on it, each of which contains a handful of "CUDA cores".
- Each multiprocessor may run one or more blocks of threads.
- Blocks are organized into grids.
- threadIdx, blockIdx, and blockDim are three-dimensional variables for indexing threads within their blocks and grids.
- For an array that spans multiple blocks, use something like this:
 
unsigned int threadIndex = blockIdx.x*blockDim.x + threadIdx.x;
We want to run N threads given a constraint of K threads per block.
Solution 1:
- kernel call:

myKernel<<<(N+K-1)/K, K>>>

- kernel code:
 
unsigned int threadIndex = blockIdx.x*blockDim.x + threadIdx.x;
if (threadIndex < N)
    ...
Solution 2 (a grid-stride loop; a complete sketch follows after this section):
- kernel call:

myKernel<<<K, K>>>

- kernel code:
 
unsigned int threadIndex = blockIdx.x*blockDim.x + threadIdx.x;
while(threadIndex < N){
    ...
    threadIndex += blockDim.x*gridDim.x;
}
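For concreteness, here is a minimal, self-contained sketch of Solution 2. The kernel name scaleData, the element count, and the launch configuration are illustrative assumptions, not from the original page; the point is that the grid-stride loop keeps the kernel correct for any N, however few blocks are launched.

#include <stdio.h>

#define N 10000

// Grid-stride loop: each thread handles elements i, i + stride, i + 2*stride, ...
__global__ void scaleData(float* data, float factor){
	unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
	unsigned int stride = blockDim.x*gridDim.x;
	while (i < N){
		data[i] *= factor;
		i += stride;
	}
}

int main(){
	size_t size = N * sizeof(float);
	float* h_data = (float*)malloc(size);
	for (int i = 0; i < N; i++) h_data[i] = (float)i;

	float* d_data;
	cudaMalloc(&d_data, size);
	cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

	// Launch far fewer threads than N; the stride loop covers the rest.
	scaleData<<<32, 256>>>(d_data, 2.0f);

	cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
	printf("h_data[N-1] = %f\n", h_data[N-1]);  // expect 2*(N-1)

	cudaFree(d_data);
	free(h_data);
	return 0;
}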
Keywords
- A __device__ function runs on the GPU and can only be called from a __global__ function or another __device__ function.
- A __shared__ variable is shared among all threads of a block.
- Use __syncthreads() to ensure that every thread has caught up before proceeding further. NOTE: make sure every thread reaches the __syncthreads() call, or the program will hang. An example kernel using these keywords is sketched below.
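As a concrete illustration of these keywords, here is a minimal sketch (the kernel name blockSumOfSquares, the block size, and the reduction pattern are assumptions for this example, not from the original page): each block squares its elements with a __device__ helper, stages them in __shared__ memory, and reduces them to one partial sum, synchronizing with __syncthreads() between steps.

#define THREADS_PER_BLOCK 256

// A __device__ function: callable only from __global__ or other __device__ code.
__device__ float square(float x){
	return x * x;
}

// Each block reduces its THREADS_PER_BLOCK input elements to one partial sum.
// Assumes the input length is a multiple of THREADS_PER_BLOCK.
__global__ void blockSumOfSquares(const float* in, float* blockSums){
	__shared__ float cache[THREADS_PER_BLOCK];   // shared by all threads of this block

	unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
	cache[threadIdx.x] = square(in[i]);
	__syncthreads();                             // wait until every thread has written its slot

	// Tree reduction in shared memory; halve the number of active threads each step.
	for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2){
		if (threadIdx.x < stride)
			cache[threadIdx.x] += cache[threadIdx.x + stride];
		__syncthreads();                         // outside the if, so every thread reaches it
	}

	if (threadIdx.x == 0)
		blockSums[blockIdx.x] = cache[0];
}

Note that each __syncthreads() sits outside the if, so every thread of the block reaches it; moving it inside the conditional is exactly the hang described in the NOTE above.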
Memory
Memory is organized into
- thread memory (local variable in a kernel)
- block memory - shared among a block of threads (use the __shared__ keyword)
- global memory - read/write for all threads (declare with the __device__ keyword, or allocate with cudaMalloc())
- constant memory - read-only (use the __constant__ keyword)
- texture memory - read-only for specialized uses (see examples below)
 
Allocate an array on the host:
float* h_A = (float*)malloc(size);
Allocate an array on the device (GPU):
float* d_A;
cudaMalloc(&d_A, size);
Copy array from host to device:
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
Copy array from host to device constant memory (the first argument must be a __constant__ symbol, not a cudaMalloc'd pointer; see the sketch below):
cudaMemcpyToSymbol(d_A, h_A, size);
Copy result array from device to host:
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
Free device memory
cudaFree(d_A);
cudaFree(d_C);
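For the constant-memory copy above, the destination must be a symbol declared with __constant__ rather than a pointer returned by cudaMalloc(). A minimal sketch (the names c_coeffs, NUM_COEFFS, and applyCoeffs are illustrative assumptions, not from the original page):

#define NUM_COEFFS 16

// Statically declared in device constant memory; read-only from kernels.
__constant__ float c_coeffs[NUM_COEFFS];

// Multiply each of the first NUM_COEFFS elements by its coefficient.
__global__ void applyCoeffs(float* data){
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i < NUM_COEFFS)
		data[i] *= c_coeffs[i];
}

// Host-side setup: copy from host memory into the __constant__ symbol.
void loadCoeffs(const float* h_coeffs){
	cudaMemcpyToSymbol(c_coeffs, h_coeffs, NUM_COEFFS * sizeof(float));
}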
- For a two-dimensional array, use cudaMallocPitch() and cudaMemcpy2D().
- For a three-dimensional array, use cudaMalloc3D() and cudaMemcpy3D().
- See the "NVIDIA CUDA C Programming Guide" for nontrivial examples of these.

One-dimensional Texture Memory
Define a global device texture reference:
texture<float> tex_TA;
Create array on host and then copy from host to device as normal!
Bind device object to texture reference:
cudaBindTexture(NULL, tex_TA, d_A, size);
Access value at index idx:
value = tex1Dfetch(tex_TA, idx);
Unbind the texture reference when done:
cudaUnbindTexture(tex_TA);
Two-dimensional Texture Memory
Define a DIMxDIM matrix as a device texture:
texture<float, 2> d_TM;
Bind a device array to the texture reference (the last argument is the row pitch in bytes):
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
cudaBindTexture2D(NULL, d_TM, d_M, desc, DIM, DIM, DIM*sizeof(float));
Access the value at column x, row y:
value = tex2D(d_TM, x, y);
Unbind the texture reference when done:
cudaUnbindTexture(d_TM);
Measuring Performance
// capture the start time
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0 );
// do work with the gpu...
// get stop time, and display the timing results
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float elapsedTime;
cudaEventElapsedTime( &elapsedTime, start, stop );
printf( "Time to generate: %3.1f ms\n", elapsedTime );
cudaEventDestroy( start );
cudaEventDestroy( stop );
Atomics
- An atomic operation guarantees that a thread can read and modify a memory location without interference from any other thread.
- Atomic operations on global memory require compute capability 1.1.
- Atomic operations on shared memory require compute capability 1.2.
- To atomically add value x to variable total:

atomicAdd( &total, x );

- To atomically add value x to array entry totals[i] (a small histogram sketch using this follows below):

atomicAdd( &(totals[i]), x );
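As an illustration of the array form, here is a minimal histogram kernel (the names histKernel, NUM_BINS, and the input layout are assumptions for this sketch): many threads can land in the same bin at once, so the increment has to be atomic.

#define NUM_BINS 256

// Each thread classifies one input byte and atomically bumps the matching bin.
// d_hist must be zeroed (e.g. with cudaMemset) before the launch.
__global__ void histKernel(const unsigned char* d_in, unsigned int n, unsigned int* d_hist){
	unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i < n){
		// Without atomicAdd, two threads hitting the same bin could lose an update.
		atomicAdd( &(d_hist[d_in[i]]), 1u );
	}
}

On devices of compute capability 1.2 or later, the same pattern can be applied to a __shared__ per-block histogram that is merged into the global one at the end of the block, which reduces contention on global memory.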
Page-locked host memory
- Normal memory on the host is reserved with malloc(). This memory may be paged to disk by the OS.
- Page-locked memory on the host may not be paged to disk and is reserved with cudaHostAlloc(). This allows safe direct memory access (DMA), which tends to be faster.
- Reserve host memory, use it, then free it:
 
int* a;
cudaHostAlloc( (void**)&a, size*sizeof(*a), cudaHostAllocDefault);
... do something with a ...
cudaFreeHost(a);
Streams
- A stream is a sequence of actions that are scheduled to be processed in order, but perhaps asynchronously. Multiple independent streams may be processed simultaneously. If the GPU supports device overlap, then a kernel may be running in one stream while a memory transfer is occurring in another. Newer GPUs also support concurrent kernel execution.
- check for device overlap capability:
 
cudaDeviceProp prop;
int whichDevice;
cudaGetDevice(&whichDevice);
cudaGetDeviceProperties(&prop, whichDevice);
if (!prop.deviceOverlap){
    printf( "Device will not handle overlaps, so no speed up from streams\n" );
    return 0;
}
- create a stream:
 
cudaStream_t stream;
cudaStreamCreate(&stream);
- copy memory asynchronously within the stream (host memory must be page-locked):
 
cudaMemcpyAsync( dev_a, host_a, size, cudaMemcpyHostToDevice, stream);
- run a kernel within the stream:
 
kernel<<<N/256, 256, 0, stream>>>(args);
- tell host to wait for stream to finish:
 
cudaStreamSynchronize(stream);
- clean up the stream:
 
cudaStreamDestroy(stream);
- To use multiple streams in parallel, stagger the tasks for each in your code (a two-stream sketch follows below). Instead of
  - stream 1: copy from host to device
  - stream 1: kernel
  - stream 1: copy from device to host
  - stream 2: copy from host to device
  - stream 2: kernel
  - stream 2: copy from device to host
  do this:
  - stream 1: copy from host to device
  - stream 2: copy from host to device
  - stream 1: kernel
  - stream 2: kernel
  - stream 1: copy from device to host
  - stream 2: copy from device to host
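A minimal sketch of this staggered pattern follows; the kernel, buffer names, chunk size, and the absence of error checking are all illustrative assumptions, not from the original page. Each loop iteration queues both host-to-device copies, then both kernels, then both device-to-host copies, alternating between the two streams.

#define CHUNK (1024*1024)            // elements handled per stream per iteration
#define FULL  (20*CHUNK)             // total elements (a multiple of 2*CHUNK)

// Illustrative kernel: double each element of one chunk.
__global__ void doubleChunk(const int* in, int* out){
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i < CHUNK)
		out[i] = 2*in[i];
}

void processWithTwoStreams(void){
	cudaStream_t stream0, stream1;
	cudaStreamCreate(&stream0);
	cudaStreamCreate(&stream1);

	// Page-locked host buffers (required for cudaMemcpyAsync).
	int *host_in, *host_out;
	cudaHostAlloc((void**)&host_in,  FULL*sizeof(int), cudaHostAllocDefault);
	cudaHostAlloc((void**)&host_out, FULL*sizeof(int), cudaHostAllocDefault);
	// ... fill host_in here ...

	// One pair of device buffers per stream.
	int *dev_in0, *dev_out0, *dev_in1, *dev_out1;
	cudaMalloc(&dev_in0,  CHUNK*sizeof(int));
	cudaMalloc(&dev_out0, CHUNK*sizeof(int));
	cudaMalloc(&dev_in1,  CHUNK*sizeof(int));
	cudaMalloc(&dev_out1, CHUNK*sizeof(int));

	for (int i = 0; i < FULL; i += 2*CHUNK){
		// Staggered order: copy/copy, kernel/kernel, copy/copy.
		cudaMemcpyAsync(dev_in0, host_in + i,         CHUNK*sizeof(int), cudaMemcpyHostToDevice, stream0);
		cudaMemcpyAsync(dev_in1, host_in + i + CHUNK, CHUNK*sizeof(int), cudaMemcpyHostToDevice, stream1);
		doubleChunk<<<CHUNK/256, 256, 0, stream0>>>(dev_in0, dev_out0);
		doubleChunk<<<CHUNK/256, 256, 0, stream1>>>(dev_in1, dev_out1);
		cudaMemcpyAsync(host_out + i,         dev_out0, CHUNK*sizeof(int), cudaMemcpyDeviceToHost, stream0);
		cudaMemcpyAsync(host_out + i + CHUNK, dev_out1, CHUNK*sizeof(int), cudaMemcpyDeviceToHost, stream1);
	}

	// Wait for both streams to drain before using host_out.
	cudaStreamSynchronize(stream0);
	cudaStreamSynchronize(stream1);

	cudaFree(dev_in0); cudaFree(dev_out0);
	cudaFree(dev_in1); cudaFree(dev_out1);
	cudaFreeHost(host_in); cudaFreeHost(host_out);
	cudaStreamDestroy(stream0);
	cudaStreamDestroy(stream1);
}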
 
 
Random Numbers
Device-side random numbers come from the cuRAND device API; the file must #include <curand_kernel.h>, and N is assumed to be defined as above:
// requires: #include <curand_kernel.h>
__global__ void run(float* A, float* C){
	curandState s;
	// Seed and initialize this thread's generator state (note: blockIdx.x*threadIdx.x
	// gives many threads the same seed; a fixed seed plus a per-thread sequence
	// number is the more usual pattern).
	curand_init(blockIdx.x*threadIdx.x, 0, 0, &s);
	// curand() returns an unsigned int; use it to pick a random element of A.
	float randomElement = A[curand(&s) % N];
	...
}
Query device properties
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int blocks = 2*prop.multiProcessorCount;
my_kernel<<<blocks, 256>>>(args);
Card Properties
CUDA Device Query (Runtime API) version (CUDART static linking)
There is 1 device supporting CUDA

Device 0: "GeForce GTX 460"
  CUDA Driver Version:                           3.20
  CUDA Runtime Version:                          3.20
  CUDA Capability Major/Minor version number:    2.1
  Total amount of global memory:                 1072889856 bytes
  Multiprocessors x Cores/MP = Cores:            7 (MP) x 48 (Cores/MP) = 336 (Cores)
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 1
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1.45 GHz
  Concurrent copy and execution:                 Yes
  Run time limit on kernels:                     Yes
  Integrated:                                    No
  Support host page-locked memory mapping:       Yes
  Compute mode:                                  Default (multiple host threads can use this device simultaneously)
  Concurrent kernel execution:                   Yes
  Device has ECC support enabled:                No
  Device is using TCC driver mode:               No