CUDA+QT+CMake on Ubuntu:
I'm about to implement a CUDA program using Qt Creator and CMake on Ubuntu. I haven't used them before, so this note explains how I got through the setup and made them work together.
I read up on how they cooperate, and they should work together properly; please see 
sn0v and 
Dayal.
Here is a brief explanation of the setting process:
- setting CUDA
 
- download the CUDA 5 product from here
 
- follow sn0v's instruction
 
- install development tools:   
 
- sudo apt-get install freeglut3-dev build-essential libx11-dev libxmu-dev libxi-dev libgl1-mesa-glx libglu1-mesa libglu1-mesa-dev
 
- disable the conflicting graphics drivers by adding them to the blacklist: 
 
- gedit /etc/modprobe.d/blacklist.conf
 
- blacklist amd76x_edac
blacklist vga16fb
blacklist nouveau
blacklist rivafb
blacklist nvidiafb
blacklist rivatv 
- to make sure the previously installed graphics drivers are removed:   
 
- sudo apt-get remove --purge nvidia*
 
- Now we are ready to install CUDA. First, reboot and press Ctrl+Alt+F1 at the login window; this switches to command-line mode.
 
- Turn off the GUI service and install CUDA
 
- sudo service lightdm stop 
 
- sudo ./<CUDAInstallFile>.run
 
- Follow the CUDA installer's instructions. At this point you should have the CUDA toolkit and the CUDA Samples installed.
 
- Install Qt Creator, CMake and the compilers by following Dayal's tutorial.
 
- Here is the list of all necessary packages (install by Synaptic Package Manager)
 
- gcc
 
- g++
 
- gdb
 
- qtcreator
 
-  valgrind
 
- cmake
 
- It is almost done; the next step is to create a simple project.
 
- Open Qt Creator and add a new text file (CMakeLists.txt).
 
- write this into the txt file
 
- cmake_minimum_required(VERSION 2.6.2)
project(CMakeCUDA)
# Look for CUDA; the CUDA_FOUND and CUDA_VERSION variables are read below.
find_package(CUDA)
# Report the detected version, or stop configuring with a fatal error when CUDA is missing.
if (CUDA_FOUND)
message(" * CUDA ${CUDA_VERSION} was found")
else(CUDA_FOUND)
    message(" * CUDA is not found")
    message(FATAL_ERROR "Not all CUDA libraries are found")
endif(CUDA_FOUND)
# add source files
set(SRCS
     main.cpp
     cuda.cu
)
# Build the CMakeCUDA executable from the host (.cpp) and CUDA (.cu) sources listed in SRCS.
cuda_add_executable(CMakeCUDA ${SRCS}) 
-  Add main.cpp and write the code
 
- Add cuda.cu file
 
- Reopen CMakeLists.txt, but this time open it with the CMake wizard.
 
- You have just finished setting up a simple CUDA project. When you build and run it in a terminal you will see the output numbers; please click here for more explanation of the program.
 
------------------------------------main.cpp------------------------------------- 
//main.cpp
// Simple wrapper class around the CUDA functions.
// Vectors A and B are initialized and transferred to the device.
// The device performs the addition and transfers the data back to the host.
// by Wasit Limprasert created on 19-07-2011
// updated 23-08-2011: adding comment
#include <stdlib.h>
#include <stdio.h>
// Entry points implemented in cuda.cu, declared with C linkage.
extern "C" {
    // Allocates one device buffer of `size` ints for each of *g_A and *g_B.
    void CUDA_Constructor(int** g_A,int** g_B,int size);
    // Copies `size` ints from the host buffer h_src to the device buffer g_dist.
    void CUDA_SetData(int* g_dist, int* h_src, int size);
    // Adds vector B to vector A on the device; the sum is left in g_A.
    void CUDA_Add(int* g_A,int* g_B,int size);
    // Copies `size` ints from the device buffer g_src to the host buffer h_dist.
    void CUDA_GetData(int* h_dist, int* g_src,int size);
}
// Host-side wrapper around the CUDA_* functions. It owns two int vectors,
// A and B, each with a host copy (h_*) and a device copy (g_*).
// NOTE: the class holds raw owning pointers and defines neither a copy
// constructor nor an assignment operator, so instances must not be copied:
// a copy would share the same buffers and the host buffers would be freed twice.
class VectorOperation{
public:
    int *h_A,*g_A;//h_A is the host pointer of vector A and g_A is a pointer to global memory on the device.
    int *h_B,*g_B;//same pair of pointers for vector B.
    int size;//number of elements in each vector.
    VectorOperation(int _size);
    // Releases the host buffers that the constructor obtained with malloc.
    // The device buffers (g_A, g_B) are not released here because none of the
    // CUDA_* wrappers declared in this file frees device memory.
    ~VectorOperation(void){
        free(h_A);
        free(h_B);
        h_A=NULL;
        h_B=NULL;
    }
    // Fills A and B with test data, prints them and uploads them to the device.
    void init(void);
    // Copies `size` ints from h_src into the device copy of A.
    void SetA(int* h_src){CUDA_SetData(g_A,h_src,size);}
    // Copies `size` ints from h_src into the device copy of B.
    void SetB(int* h_src){CUDA_SetData(g_B,h_src,size);}
    // Runs the addition on the device.
    void Add(void){CUDA_Add(g_A,g_B,size);}
    // Downloads the device copy of A into h_A and prints it.
    void Result(void);
};
//constructor
//sets the size of the vectors and allocates memory on both host and device.
//The program is terminated with a message on stderr when the size is negative
//or a host allocation fails, because init() and Result() dereference h_A and
//h_B without checking them.
VectorOperation::VectorOperation(int _size){
    size=_size;
    // Start from a known state so a failed allocation never leaves a wild pointer.
    h_A=NULL;
    h_B=NULL;
    g_A=NULL;
    g_B=NULL;
    if(size<0){
        fprintf(stderr,"VectorOperation: invalid vector size %d\n",size);
        exit(EXIT_FAILURE);
    }
    CUDA_Constructor(&g_A,&g_B,size);
    h_A=(int*)malloc(sizeof(int)*size);
    h_B=(int*)malloc(sizeof(int)*size);
    // malloc(0) is allowed to return NULL, so only a non-empty request counts as a failure.
    if(size>0&&(h_A==NULL||h_B==NULL)){
        fprintf(stderr,"VectorOperation: host allocation of %d ints failed\n",size);
        exit(EXIT_FAILURE);
    }
}
//initialization
//generating data for vector A and B then copy the data to device.
void VectorOperation::init(void){
    printf("A  =");
    for(int i=0;i<size;i++){
        h_A[i]=1;
        printf("%2d ",h_A[i]);
    }
    SetA(h_A);
    printf("\nB  =");
    for(int i=0;i<size;i++){
        h_B[i]=i;
        printf("%2d ",h_B[i]);
    }
    SetB(h_B);
    printf("\n");
}
//copies the result, which the device left in g_A, back into h_A and prints it.
void VectorOperation::Result(void){
    CUDA_GetData(h_A,g_A,size);

    printf("A+B:");
    const int* cursor=h_A;
    int remaining=size;
    while(remaining-- > 0){
        printf("%2d ",*cursor++);
    }
    printf("\n");
}
//main
//builds a pair of 16-element vectors, adds B to A on the device and prints the result.
int main(){
    printf("Simple CUDA vector addition.\n");
    // Direct initialization constructs P in place. VectorOperation owns raw
    // pointers, so it must never be created by copying a temporary.
    VectorOperation P(16);
    P.init();
    P.Add();
    P.Result();
    getchar();//blocks until input arrives on stdin (presumably to keep the console open)
    return 0;
}
------------------------------------------------cuda.cu-------------------------------
//cuda.cu
//simple CUDA functions
//by Wasit 208-2011
#include <stdio.h>
#include <cuda_runtime.h>
//memory allocation on device side
//Allocates one buffer of `size` ints for each of *g_A and *g_B.
//On any failure the problem is reported on stderr and both *g_A and *g_B are
//left as NULL, so the caller never receives a half-allocated pair.
extern "C" void CUDA_Constructor(int** g_A,int** g_B,int size){
    if(g_A==NULL||g_B==NULL){
        fprintf(stderr,"CUDA_Constructor: null output pointer\n");
        return;
    }
    *g_A=NULL;
    *g_B=NULL;
    if(size<0){
        // A negative size would wrap to a huge byte count in the size_t conversion below.
        fprintf(stderr,"CUDA_Constructor: invalid size %d\n",size);
        return;
    }
    const size_t bytes=sizeof(int)*(size_t)size;

    cudaError_t err=cudaMalloc((void**)g_A,bytes);
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_Constructor: cudaMalloc for A failed: %s\n",cudaGetErrorString(err));
        *g_A=NULL;
        return;
    }
    err=cudaMalloc((void**)g_B,bytes);
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_Constructor: cudaMalloc for B failed: %s\n",cudaGetErrorString(err));
        // Give A back so that the failure does not leak device memory.
        cudaFree(*g_A);
        *g_A=NULL;
        *g_B=NULL;
    }
}
//copying data from host to device
//Copies `size` ints from the host buffer h_src to the device buffer g_dist.
//A negative size or a failed copy is reported on stderr.
extern "C" void CUDA_SetData(int* g_dist, int* h_src,int size){
    if(size<0){
        // A negative size would wrap to a huge byte count in the size_t conversion below.
        fprintf(stderr,"CUDA_SetData: invalid size %d\n",size);
        return;
    }
    const cudaError_t err=cudaMemcpy(g_dist,h_src,sizeof(int)*(size_t)size,cudaMemcpyHostToDevice);
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_SetData: cudaMemcpy failed: %s\n",cudaGetErrorString(err));
    }
}
//CUDA kernel: adds g_B to g_A element by element, in place (g_A[i] += g_B[i]).
//Launch layout: 1-D grid of 1-D blocks, no shared memory. Any grid and block
//size is valid: each thread starts at its global index, built from blockIdx
//and threadIdx, and advances by the total number of launched threads until
//it reaches `size`, so no element is skipped and none is touched out of bounds.
__global__ void Kernel_Add(int* g_A,const int* g_B,int size){
    const unsigned int n = size>0 ? (unsigned int)size : 0u;
    const unsigned int stride = gridDim.x*blockDim.x;
    for(unsigned int i=blockIdx.x*blockDim.x+threadIdx.x;i<n;i+=stride){
        g_A[i]=g_A[i]+g_B[i];
    }
}
//Addition function
//Chooses the number of blocks and threads, launches the kernel on the default
//stream and reports a failed launch on stderr. The launch is asynchronous:
//errors raised while the kernel runs surface at the next synchronizing CUDA call.
extern "C" void CUDA_Add(int* g_A,int* g_B,int size){
    if(size<=0){
        return;//nothing to add; a launch with zero blocks would be an invalid configuration
    }
    const unsigned int threadnum=256;//threads per block
    const unsigned int maxblocknum=65535;//largest gridDim.x accepted by every compute capability
    //round up so that a size that is not a multiple of threadnum is still fully covered
    unsigned int blocknum=((unsigned int)size+threadnum-1u)/threadnum;
    if(blocknum>maxblocknum){
        blocknum=maxblocknum;//the loop inside the kernel covers the remaining elements
    }
    //the execution configuration is <<<blocks per grid, threads per block>>>
    Kernel_Add<<<blocknum,threadnum>>>(g_A,g_B,size);
    const cudaError_t err=cudaGetLastError();
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_Add: kernel launch failed: %s\n",cudaGetErrorString(err));
    }
}
//read data back to host
//Copies `size` ints from the device buffer g_src to the host buffer h_dist.
//A negative size or a failed copy is reported on stderr.
extern "C" void CUDA_GetData(int* h_dist, int* g_src,int size){
    if(size<0){
        // A negative size would wrap to a huge byte count in the size_t conversion below.
        fprintf(stderr,"CUDA_GetData: invalid size %d\n",size);
        return;
    }
    const cudaError_t err=cudaMemcpy(h_dist,g_src,sizeof(int)*(size_t)size,cudaMemcpyDeviceToHost);
    if(err!=cudaSuccess){
        fprintf(stderr,"CUDA_GetData: cudaMemcpy failed: %s\n",cudaGetErrorString(err));
    }
}