Wednesday, 6 March 2013

CUDA+QT+CMake on Ubuntu:

I'm about to implement a CUDA program using Qt Creator and CMake on Ubuntu. I haven't used them together before, so this note explains how I got through the setup and made them work together.

I read up on how these tools cooperate, and they should work together properly; please see the posts by sn0v and Dayal.

Here is a brief explanation of the setting process:
  1. setting CUDA
    • download the CUDA 5 product from here
    • follow sn0v's instruction
      • install development tools:   
        • sudo apt-get install freeglut3-dev build-essential libx11-dev libxmu-dev libxi-dev libgl1-mesa-glx libglu1-mesa libglu1-mesa-dev
      • disable some graphics components by adding the blacklist: 
        • gedit /etc/modprobe.d/blacklist.conf
        • blacklist amd76x_edac
          blacklist vga16fb
          blacklist nouveau
          blacklist rivafb
          blacklist nvidiafb
          blacklist rivatv
      • to make sure any previously installed NVIDIA graphics components are removed:   
        • sudo apt-get remove --purge nvidia*
      • Now we are ready to install CUDA. First reboot, then press Ctrl+Alt+F1 at the login window; this switches to command-line mode, where you can log in.
      • Turn off the GUI service and install CUDA
        • sudo service lightdm stop 
        • sudo ./<CUDAInstallFile>.run
      • Follow the CUDA installer instruction. At this point you should get CUDA toolkit and CUDA Samples.
  2. Install Qt Creator, CMake and the compilers by following Dayal's tutorial.
    • Here is the list of all necessary packages (install by Synaptic Package Manager)
      • gcc
      • g++
      • gdb
      • qtcreator
      •  valgrind
      • cmake
    • It is almost done, next step is to create some simple project.
  3. Open QT creator and add new text file (CMakeLists.txt).
    • write this into the txt file
      • cmake_minimum_required(VERSION 2.6.2)

        project(CMakeCUDA)

        find_package(CUDA)
        if (CUDA_FOUND)
        message(" * CUDA ${CUDA_VERSION} was found")
        else(CUDA_FOUND)
            message(" * CUDA is not found")
            message(FATAL_ERROR "Not all CUDA libraries are found")
        endif(CUDA_FOUND)

        # add sources files
        set(SRCS
             main.cpp
             cuda.cu
        )
        cuda_add_executable(CMakeCUDA ${SRCS})
    •  Add main.cpp and write the code
      •  
    • Add cuda.cu file
    • reopen the CMakeLists.txt but this time open by the cmake wizard
  4. You have just finished setting up a simple CUDA project. When you build and run it in a terminal you will see the output numbers; click here for more explanation of the program.
------------------------------------main.cpp-------------------------------------
//main.cpp
// simple wrapper class of CUDA functions.
// Vectors A and B are initialized and transferred to the device.
// The device performs the addition and transfers the data back to the host.
// by Wasit Limprasert created on 19-07-2011
// updated 23-08-2011: adding comment
#include <stdlib.h>
#include <stdio.h>
// Device-side helpers implemented in cuda.cu. They are declared extern "C"
// here to match the extern "C" definitions in that file.
// Allocates `size` ints of device memory for each of *g_A and *g_B.
extern "C" void CUDA_Constructor(int** g_A,int** g_B,int size);
// Copies `size` ints from the host buffer h_src to the device buffer g_dist.
extern "C" void CUDA_SetData(int* g_dist, int* h_src, int size);
// Adds g_B to g_A element-wise on the device; the sum is stored in g_A.
extern "C" void CUDA_Add(int* g_A,int* g_B,int size);
// Copies `size` ints from the device buffer g_src to the host buffer h_dist.
extern "C" void CUDA_GetData(int* h_dist, int* g_src,int size);
// Host-side wrapper around the CUDA helper functions. It holds two integer
// vectors, A and B, each with a host copy (h_*) and a device copy (g_*).
class VectorOperation {
public:
    int *h_A, *g_A;  // vector A: host pointer / pointer to global memory on the device
    int *h_B, *g_B;  // vector B: host pointer / pointer to global memory on the device
    int size;        // number of elements in each vector

    // Stores the vector length and allocates the host and device buffers.
    VectorOperation(int _size);

    // Empty: none of the four buffers is released here.
    // NOTE(review): this leaks the host and device allocations; confirm
    // whether cleanup is wanted before adding it (the class is copyable).
    ~VectorOperation() {}

    // Fills A and B with sample data, prints them, and copies them to the device.
    void init();

    // Copies `size` ints from h_src into the device copy of A.
    void SetA(int* h_src) { CUDA_SetData(g_A, h_src, size); }

    // Copies `size` ints from h_src into the device copy of B.
    void SetB(int* h_src) { CUDA_SetData(g_B, h_src, size); }

    // Runs the addition on the device via CUDA_Add.
    void Add() { CUDA_Add(g_A, g_B, size); }

    // Copies the device copy of A back into h_A and prints it.
    void Result();
};
// Constructor.
// Records the vector length, then allocates the two device buffers followed
// by the two host buffers, each large enough for `size` ints.
VectorOperation::VectorOperation(int _size) : size(_size) {
    const size_t bytes = sizeof(int) * size;

    CUDA_Constructor(&g_A, &g_B, size);
    h_A = static_cast<int*>(malloc(bytes));
    h_B = static_cast<int*>(malloc(bytes));
}
// Initialization.
// Fills A with ones and B with 0, 1, ..., size-1, printing every element as
// it is written, and copies each vector to the device once it is complete.
void VectorOperation::init(void) {
    // Vector A: every element is 1.
    printf("A  =");
    int idx = 0;
    while (idx < size) {
        h_A[idx] = 1;
        printf("%2d ", h_A[idx]);
        ++idx;
    }
    SetA(h_A);

    // Vector B: element k holds the value k.
    printf("\nB  =");
    idx = 0;
    while (idx < size) {
        h_B[idx] = idx;
        printf("%2d ", h_B[idx]);
        ++idx;
    }
    SetB(h_B);

    printf("\n");
}
// Reads the result, which the device stored in g_A, back into h_A and
// prints it on one line.
void VectorOperation::Result(void) {
    CUDA_GetData(h_A, g_A, size);

    printf("A+B:");
    int pos = 0;
    while (pos < size) {
        printf("%2d ", h_A[pos]);
        ++pos;
    }
    printf("\n");
}
//main
int main(){
    printf("Simple CUDA vector addition.\n");
    VectorOperation P = VectorOperation(16);
    P.init();
    P.Add();
    P.Result();
    getchar();
}
------------------------------------------------cuda.cu-------------------------------
//cuda.cu
//simple CUDA functions
//by Wasit 208-2011
#include <stdio.h>
#include <cuda_runtime.h>
// Memory allocation on the device side.
// Allocates `size` ints of device memory for each of *g_A and *g_B.
//   g_A, g_B : out-parameters receiving the device pointers
//   size     : number of ints per buffer; must not be negative
// The allocation is all-or-nothing: on any failure the error is reported on
// stderr and both *g_A and *g_B are left as NULL, so the caller never sees
// an indeterminate device pointer.
extern "C" void CUDA_Constructor(int** g_A,int** g_B,int size){
    *g_A=NULL;
    *g_B=NULL;
    if(size<0){
        // A negative count would wrap to a huge byte count below.
        fprintf(stderr,"CUDA_Constructor: invalid size %d\n",size);
        return;
    }
    const size_t bytes=sizeof(int)*(size_t)size;

    cudaError_t err=cudaMalloc((void**)g_A,bytes);
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_Constructor: cudaMalloc for A failed: %s\n",
                cudaGetErrorString(err));
        *g_A=NULL;
        return;
    }

    err=cudaMalloc((void**)g_B,bytes);
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_Constructor: cudaMalloc for B failed: %s\n",
                cudaGetErrorString(err));
        // Release A so a half-built pair is not leaked.
        cudaFree(*g_A);
        *g_A=NULL;
        *g_B=NULL;
    }
}
// Copying data from host to device.
// Copies `size` ints from the host buffer h_src to the device buffer g_dist
// with a blocking cudaMemcpy. A failed copy is reported on stderr instead of
// being silently ignored.
extern "C" void CUDA_SetData(int* g_dist, int* h_src,int size){
    cudaError_t err=cudaMemcpy(g_dist,h_src,sizeof(int)*size,cudaMemcpyHostToDevice);
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_SetData: cudaMemcpy (host to device) failed: %s\n",
                cudaGetErrorString(err));
    }
}
// CUDA kernel: element-wise in-place addition, g_A[x] = g_A[x] + g_B[x].
// The block and thread IDs are given by blockIdx and threadIdx; each thread
// starts at its flat global index and then advances by the total number of
// launched threads until it passes `size`.
// Launch layout: 1-D grid of 1-D blocks, any grid/block size; no shared
// memory is used. g_A and g_B must each hold at least `size` ints.
__global__ void Kernel_Add(int* g_A,const int* g_B,int size){
    const size_t total=(size_t)gridDim.x*blockDim.x;
    const size_t count=(size>0)?(size_t)size:0;
    for(size_t x=(size_t)blockIdx.x*blockDim.x+threadIdx.x;x<count;x+=total){
        g_A[x]=g_A[x]+g_B[x];
    }
}
// Addition function.
// Sets the number of blocks and threads, then launches Kernel_Add so that
// all `size` elements are covered; the sum is left in g_A.
//   g_A, g_B : device buffers of at least `size` ints
//   size     : element count; values <= 0 make this a no-op
// A rejected launch is reported on stderr. The launch is asynchronous, so a
// fault during execution only surfaces at a later synchronizing CUDA call.
extern "C" void CUDA_Add(int* g_A,int* g_B,int size){
    if(size<=0){
        return; // nothing to add, and a zero-sized launch would be invalid
    }
    const int threadnum=256;  // threads per block
    const int maxblocks=65535; // grid.x limit that every device generation accepts
    int blocknum=(size-1)/threadnum+1; // ceiling division without overflow
    if(blocknum>maxblocks){
        blocknum=maxblocks; // the kernel's stride loop covers the remainder
    }
    // Execution configuration order is <<<blocks per grid, threads per block>>>.
    Kernel_Add<<<blocknum,threadnum>>>(g_A,g_B,size);
    cudaError_t err=cudaGetLastError();
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_Add: kernel launch failed: %s\n",
                cudaGetErrorString(err));
    }
}
// Read data back to host.
// Copies `size` ints from the device buffer g_src to the host buffer h_dist
// with a blocking cudaMemcpy. A failure is reported on stderr instead of
// being silently ignored; because the copy synchronizes with the device,
// an error from an earlier kernel may also be reported here.
extern "C" void CUDA_GetData(int* h_dist, int* g_src,int size){
    cudaError_t err=cudaMemcpy(h_dist,g_src,sizeof(int)*size,cudaMemcpyDeviceToHost);
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_GetData: cudaMemcpy (device to host) failed: %s\n",
                cudaGetErrorString(err));
    }
}