2009-10-10

Cも手習い

いわゆるハローワールドソース。"hello.c"として保存して

#include <stdio.h>

/* Program entry point: writes the greeting line to standard output
 * and reports success to the caller. */
int main(void) {
  fputs("Hello, world\n", stdout);
  return 0;
}

"gcc hello.c"とコマンドを打つと、"a.exe"なる実行ファイルができる。
"Makefile"という拡張子なしのファイルを以下のように作ると、"make"というコマンドを打つことで、"hello.exe"という実行ファイルができる。eclipseなどでは、このMakefileもどんどん自動で作ってくれるらしいが、まだ、eclipseの恩恵を得られていないので、手書きしている。

# Builds the "hello" program from hello.c.
#   make        - compile and link
#   make clean  - remove the build products
TARGET = hello
# Use one compiler for both compiling and linking. Without this, the
# built-in %.o rule would run make's default `cc` while the link step
# ran a hard-coded `gcc`.
CC     = gcc
CFLAGS = -g
OBJS   = hello.o

# all and clean are command names, not files; declaring them phony keeps
# a stray file called "all" or "clean" from disabling them.
.PHONY : all clean

all : $(TARGET)

# hello.o itself comes from make's built-in %.o: %.c rule, which runs
# $(CC) with $(CFLAGS).
$(TARGET) : $(OBJS)
	$(CC) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@

# Remove the binary both with and without the .exe suffix
# (NOTE(review): the .exe name is what the surrounding text reports for
# this toolchain; other platforms produce the bare name).
clean :
	$(RM) $(TARGET) $(TARGET).exe $(OBJS)

少し、CUDAっぽくする。CUDAのサンプル(スカラー積のサンプル)をここからとってきて、CUDA的なところをコメントアウトし、Cの練習にする。以下を"test.c"として保存する。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

/*
 * Scalar (dot) product of two float arrays, computed on the CPU.
 *
 * h_A, h_B: input arrays; elements [0, size) of each are read.
 * size:     number of element pairs to multiply. When size <= 0 the
 *           loop body never runs and 0.0f is returned.
 *
 * Returns the sum of h_A[i] * h_B[i] over i in [0, size), as a float.
 *
 * The running sum is kept in double and rounded to float only once at
 * the end. With a float accumulator, small products are lost as soon as
 * the partial sum grows large, which matters for long inputs.
 */
float scalarProdCPU(
    float *h_A,
    float *h_B,int size){
    double sum = 0.0;
    int i;

    for(i = 0; i < size; i++){
        sum += (double)h_A[i] * (double)h_B[i];
    }
    return (float)sum;
}

/*
 * Returns a pseudo-random float obtained by blending the two bounds.
 *
 * One value is drawn from rand() and divided by RAND_MAX to form a
 * weight; the result is low weighted by (1 - weight) plus high weighted
 * by weight. Each call advances the rand() sequence by one draw.
 */
float RandFloat(float low, float high){
    const float weight = (float)rand() / (float)RAND_MAX;

    return low * (1.0f - weight) + high * weight;
}

//Total number of input vector pairs; arbitrary
const int VECTOR_N = 256;
//Number of elements per vector; arbitrary,
//but strongly preferred to be a multiple of warp size
//to meet memory coalescing constraints
const int ELEMENT_N = 4096;
//The total number of data elements (VECTOR_N * ELEMENT_N) is computed
//inside main() as DATA_N.

/*
 * CPU-only rehearsal of the CUDA scalar-product sample.
 *
 * Allocates two host input buffers of VECTOR_N * ELEMENT_N floats, fills
 * them with RandFloat(0.0f, 1.0f) values after srand(123), and stores the
 * scalarProdCPU() result over the whole data set in h_C_CPU[0]. Every
 * CUDA / CUTIL call of the original sample is kept as a comment so the
 * GPU path can be restored later; the progress messages are still printed
 * for the disabled steps.
 *
 * argc, argv: unused while CUT_EXIT() stays commented out.
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails.
 */
int main(int argc, char **argv){
    float *h_A, *h_B, *h_C_CPU;
    int i;

    /* Declarations used only by the disabled CUDA / comparison code below;
       restore them together with that code:
    float *h_C_GPU;
    float *d_A, *d_B, *d_C;
    double delta, ref, sum_delta, sum_ref, L1norm;
    unsigned int hTimer;
    */

    const int    DATA_N    = VECTOR_N * ELEMENT_N;
    /* Byte counts are held in size_t, the type malloc() takes. */
    const size_t DATA_SZ   = (size_t)DATA_N   * sizeof(float);
    const size_t RESULT_SZ = (size_t)VECTOR_N * sizeof(float);

    /* Only the commented-out CUT_EXIT(argc, argv) reads these. */
    (void)argc;
    (void)argv;

    //CUT_DEVICE_INIT();
    //CUT_SAFE_CALL( cutCreateTimer(&hTimer) );

    printf("Initializing data...\n");
        printf("...allocating CPU memory.\n");
        h_A     = (float *)malloc(DATA_SZ);
        h_B     = (float *)malloc(DATA_SZ);
        h_C_CPU = (float *)malloc(RESULT_SZ);
        //h_C_GPU = (float *)malloc(RESULT_SZ);
        if(h_A == NULL || h_B == NULL || h_C_CPU == NULL){
            /* The fill loop below writes through these pointers, so stop
               here instead of dereferencing NULL. free(NULL) is a no-op. */
            fprintf(stderr, "Host memory allocation failed.\n");
            free(h_C_CPU);
            free(h_B);
            free(h_A);
            return EXIT_FAILURE;
        }

        printf("...allocating GPU memory.\n");
        //CUDA_SAFE_CALL( cudaMalloc((void **)&d_A, DATA_SZ)   );
        //CUDA_SAFE_CALL( cudaMalloc((void **)&d_B, DATA_SZ)   );
        //CUDA_SAFE_CALL( cudaMalloc((void **)&d_C, RESULT_SZ) );

        printf("...generating input data in CPU mem.\n");
        srand(123);
        //Generating input data on CPU
        for(i = 0; i < DATA_N; i++){
            h_A[i] = RandFloat(0.0f, 1.0f);
            h_B[i] = RandFloat(0.0f, 1.0f);
        }

        printf("...copying input data to GPU mem.\n");
        //Copy options data to GPU memory for further processing
        //CUDA_SAFE_CALL( cudaMemcpy(d_A, h_A, DATA_SZ, cudaMemcpyHostToDevice) );
        //CUDA_SAFE_CALL( cudaMemcpy(d_B, h_B, DATA_SZ, cudaMemcpyHostToDevice) );
    printf("Data init done.\n");


    printf("Executing GPU kernel...\n");
        //CUDA_SAFE_CALL( cudaThreadSynchronize() );
        //CUT_SAFE_CALL( cutResetTimer(hTimer) );
        //CUT_SAFE_CALL( cutStartTimer(hTimer) );
        //scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, VECTOR_N, ELEMENT_N);
        //CUT_CHECK_ERROR("scalarProdGPU() execution failed\n");
        //CUDA_SAFE_CALL( cudaThreadSynchronize() );
        //CUT_SAFE_CALL( cutStopTimer(hTimer) );
    //printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));

    printf("Reading back GPU result...\n");
        //Read back GPU results to compare them to CPU results
        //CUDA_SAFE_CALL( cudaMemcpy(h_C_GPU, d_C, RESULT_SZ, cudaMemcpyDeviceToHost) );


    printf("Checking GPU results...\n");
        printf("..running CPU scalar product calculation\n");
        /* NOTE(review): a single product over all DATA_N elements is stored
           in element 0; the other VECTOR_N - 1 slots of h_C_CPU stay
           unwritten and the value is never printed. */
        h_C_CPU[0]=scalarProdCPU(h_A, h_B,DATA_N);
/*
        printf("...comparing the results\n");
        //Calculate max absolute difference and L1 distance
        //between CPU and GPU results
        sum_delta = 0;
        sum_ref   = 0;
        for(i = 0; i < VECTOR_N; i++){
            delta = fabs(h_C_GPU[i] - h_C_CPU[i]);
            ref   = h_C_CPU[i];
            sum_delta += delta;
            sum_ref   += ref;
        }
        L1norm = sum_delta / sum_ref;
    printf("L1 error: %E\n", L1norm);
    printf((L1norm < 1e-6) ? "TEST PASSED\n" : "TEST FAILED\n");
*/

    printf("Shutting down...\n");
        //CUDA_SAFE_CALL( cudaFree(d_C) );
        //CUDA_SAFE_CALL( cudaFree(d_B)   );
        //CUDA_SAFE_CALL( cudaFree(d_A)   );
        //free(h_C_GPU);
        free(h_C_CPU);
        free(h_B);
        free(h_A);
        //CUT_SAFE_CALL( cutDeleteTimer(hTimer) );

    //CUT_EXIT(argc, argv);
    return 0;
}

"test.c"用のMakefile

# Builds the "test" program from test.c.
#   make        - compile and link
#   make clean  - remove the build products
TARGET = test
# Use one compiler for both compiling and linking. Without this, the
# built-in %.o rule would run make's default `cc` while the link step
# ran a hard-coded `gcc`.
CC     = gcc
CFLAGS = -g
OBJS   = test.o

# all and clean are command names, not files; declaring them phony keeps
# a stray file called "all" or "clean" from disabling them.
.PHONY : all clean

all : $(TARGET)

# test.o itself comes from make's built-in %.o: %.c rule, which runs
# $(CC) with $(CFLAGS).
$(TARGET) : $(OBJS)
	$(CC) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@

# Remove the binary both with and without the .exe suffix
# (NOTE(review): the .exe name matches what this Makefile originally
# cleaned; other platforms produce the bare name).
clean :
	$(RM) $(TARGET) $(TARGET).exe $(OBJS)