Problem in cuda source code

niwaz_malik

Distinguished
Jan 6, 2010
25
0
18,530
i have written a cuda program, if i compile it in vs IDE, then there is no problem and the result is right,

but if i compile it with nvcc command under commandline prompt the result is wrong,

what is the reason


here is my source code
this may be a little long


---------------------------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <cutil.h>
#include <cutil_math.h>
#include <cutil_inline.h>

#define BLOCK 256

#define SOD 1000
#define ODD 230

#define DETECTOR_CELLSIZE 0.254
#define DETECTOR_XRESOLUTION 960
#define DETECTOR_YRESOLUTION 768

#define PI 3.14159265

#define SAMPLES_PER_TURN 720
#define DLAMBDA (2*PI/SAMPLES_PER_TURN)

#define PHANTOM_RADIUS (DETECTOR_XRESOLUTION*DETECTOR_CELLSIZE/2*SOD/sqrt((SOD+ODD)*(SOD+ODD)+(DETECTOR_XRESOLUTION*DETECTOR_CELL
SIZE/2)*(DETECTOR_XRESOLUTION*DETECTOR_CELLSIZE/2)))
#define CONEBEAM_PITCH (PHANTOM_RADIUS*6)


// 事实上我是转了两圈
#define DRNUMBER 1440

const int split_number = 10; //每个探测器单元细分的数量,用于正演

__global__ void forwardprojection_CUDA(float *d_dr_data, int dr);
__host__ __device__ float mysquare(float x);
__host__ __device__ float get_length(float3 source_position, float3 direction, float3 half_axes);
__noinline__ __host__ __device__ float3 trans_to_ellipsolid(float3 src, float3 angle);
__device__ void one_address_fp(float *tmp, \
float3 ellipsoid_position, float3 half_axis, float3 rotate_angles, float density,\
float lambda, \
int w, int h,\
int thidy,
int dr);

int main()
{
// 一幅DR的空间
size_t dr_mem_size = sizeof(float) * DETECTOR_XRESOLUTION * DETECTOR_YRESOLUTION;

// 为投影数据分配内存空间
float *dr_data = (float *)malloc(dr_mem_size);
if (dr_data == NULL)
{
printf("分配内存出错!\n");
exit(0);
}

// 为投影数据分配显存空间
float *d_dr_data;
CUDA_SAFE_CALL(cudaMalloc((void **)&d_dr_data, dr_mem_size));


// block size
dim3 bDim(1, BLOCK);
dim3 gDim(1, (DETECTOR_XRESOLUTION * DETECTOR_YRESOLUTION + BLOCK - 1) / BLOCK);

char dr_name[100];

FILE *fp;
for(int dr = 150; dr < 159/*DRNUMBER*/; dr ++)
{
forwardprojection_CUDA<<<gDim, bDim>>>(d_dr_data, dr);
CUDA_SAFE_CALL(cudaMemcpy(dr_data, d_dr_data, dr_mem_size, cudaMemcpyDeviceToHost));

sprintf(dr_name, "../proj_data/CT%4.4i.proj", dr);
printf("%s\n", dr_name);
fp = fopen(dr_name, "wb");
if (!fp)
{
printf("ERROR Create FILE!\n");
exit(0);
}
fwrite(dr_data, sizeof(float), DETECTOR_XRESOLUTION * DETECTOR_YRESOLUTION, fp);
fclose(fp);
}

free(dr_data);
cudaFree(d_dr_data);
printf("Done!\n");
return 0;
}

__host__ __device__ float mysquare(float x)
{
return x*x;
}

__host__ __device__ float get_length(float3 source_position, float3 direction, float3 half_axes)
{
float coa = mysquare(direction.x)/mysquare(half_axes.x) +\
mysquare(direction.y)/mysquare(half_axes.y) +\
mysquare(direction.z)/mysquare(half_axes.z);
float cob = 2 * (direction.x * source_position.x) / mysquare(half_axes.x) +\
2 * (direction.y * source_position.y) / mysquare(half_axes.y) +\
2 * (direction.z * source_position.z) / mysquare(half_axes.z);

float coc = mysquare(source_position.x)/mysquare(half_axes.x) +\
mysquare(source_position.y)/mysquare(half_axes.y) +\
mysquare(source_position.z)/mysquare(half_axes.z) - 1;

float delta = mysquare(cob) - 4 * coa * coc;

if (delta <= 0 )
return 0;
else
{
float t1 = (-cob + sqrt(delta)) / (2 * coa);
float t2 = (-cob - sqrt(delta)) / (2 * coa);

return sqrt(mysquare(direction.x) + mysquare(direction.y) + mysquare(direction.z)) * abs(t1 - t2);
}
}

__noinline__ __host__ __device__ float3 trans_to_ellipsolid(float3 src, float3 angle)
{
float alpha = angle.x / 180.0 * PI;
float beta = angle.y / 180.0 * PI;
float gama = angle.z / 180.0 * PI;

float3 des;

float a11 = cos(beta)*cos(gama);
float a12 = cos(gama)*sin(alpha)*sin(beta)+cos(alpha)*sin(gama);
float a13 = -cos(alpha)*cos(gama)*sin(beta)+sin(alpha)*sin(gama);

float a21 = -cos(beta)*sin(gama);
float a22 = cos(alpha)*cos(gama)-sin(alpha)*sin(beta)*sin(gama);
float a23 = cos(gama)*sin(alpha)+cos(alpha)*sin(beta)*sin(gama);

float a31 = sin(beta);
float a32 = -cos(beta)*sin(alpha);
float a33 = cos(alpha)*cos(beta);

des.x = a11 * src.x + a12 * src.y + a13 * src.z;
des.y = a21 * src.x + a22 * src.y + a23 * src.z;
des.z = a31 * src.x + a32 * src.y + a33 * src.z;

return des;
}


__global__ void forwardprojection_CUDA(float *d_dr_data, int dr)
{

__shared__ float tmp_shared_array[BLOCK];
tmp_shared_array[threadIdx.y] = 0;
__syncthreads();

int index = blockIdx.y * BLOCK + threadIdx.y;
int w = index % DETECTOR_XRESOLUTION;
int h = index / DETECTOR_XRESOLUTION;

float lambda = dr * DLAMBDA;

// phantom 参数
float density;
float3 ellipsoid_position;
float3 half_axis;
float3 rotate_angles;

//////////////////////////////////////////////////////////////////////////
// 1
half_axis.x = 0.69 * PHANTOM_RADIUS;
half_axis.y = 0.92 * PHANTOM_RADIUS;
half_axis.z = 0.81 * PHANTOM_RADIUS;
ellipsoid_position.x = 0 * PHANTOM_RADIUS;
ellipsoid_position.y = 0 * PHANTOM_RADIUS;
ellipsoid_position.z = 0 * PHANTOM_RADIUS;
rotate_angles.x = 0;
rotate_angles.y = 0;
rotate_angles.z = 0;
density = 1.0;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
__syncthreads();

//////////////////////////////////////////////////////////////////////////
// 2
half_axis.x = 0.6624 * PHANTOM_RADIUS;
half_axis.y = 0.874 * PHANTOM_RADIUS;
half_axis.z = 0.78 * PHANTOM_RADIUS ;
//ellipsoid_position.x = -0.0184 * PHANTOM_RADIUS;
ellipsoid_position.x = 0 * PHANTOM_RADIUS;
ellipsoid_position.y = 0 * PHANTOM_RADIUS;
ellipsoid_position.z = 0 * PHANTOM_RADIUS;
rotate_angles.x = 0;
rotate_angles.y = 0;
rotate_angles.z = 0;
density = -0.8;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
__syncthreads();

//////////////////////////////////////////////////////////////////////////
// 3
half_axis.x = 0.11 * PHANTOM_RADIUS;
half_axis.y = 0.31 * PHANTOM_RADIUS;
half_axis.z = 0.22 * PHANTOM_RADIUS;
ellipsoid_position.x = 0.22 * PHANTOM_RADIUS;
ellipsoid_position.y = 0 * PHANTOM_RADIUS;
ellipsoid_position.z = 0 * PHANTOM_RADIUS;
rotate_angles.x = -18;
rotate_angles.y = 0;
rotate_angles.z = -10;
density = -0.2;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
__syncthreads();

//////////////////////////////////////////////////////////////////////////
// 4
half_axis.x = 0.16 * PHANTOM_RADIUS;
half_axis.y = 0.41 * PHANTOM_RADIUS;
half_axis.z = 0.28 * PHANTOM_RADIUS;
ellipsoid_position.x = -0.22 * PHANTOM_RADIUS;
ellipsoid_position.y = 0 * PHANTOM_RADIUS;
ellipsoid_position.z = 0 * PHANTOM_RADIUS;
rotate_angles.x = 18;
rotate_angles.y = 0;
rotate_angles.z = 10;
density = -0.2;
one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
__syncthreads();

//////////////////////////////////////////////////////////////////////////
// 5
half_axis.x = 0.21 * PHANTOM_RADIUS;
half_axis.y = 0.25 * PHANTOM_RADIUS;
half_axis.z = 0.41 * PHANTOM_RADIUS;
ellipsoid_position.x = 0 * PHANTOM_RADIUS;
ellipsoid_position.y = 0.35 * PHANTOM_RADIUS;
ellipsoid_position.z = -0.15 * PHANTOM_RADIUS;
rotate_angles.x = 0;
rotate_angles.y = 0;
rotate_angles.z = 0;
density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
__syncthreads();

/* //////////////////////////////////////////////////////////////////////////
// 6
half_axis.x = 0.046 * PHANTOM_RADIUS;
half_axis.y = 0.046 * PHANTOM_RADIUS;
half_axis.z = 0.05 * PHANTOM_RADIUS;
ellipsoid_position.x = 0 * PHANTOM_RADIUS;
ellipsoid_position.y = 0.1 * PHANTOM_RADIUS;
ellipsoid_position.z = 0.25 * PHANTOM_RADIUS;
rotate_angles.x = 0;
rotate_angles.y = 0;
rotate_angles.z = 0;
density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

//////////////////////////////////////////////////////////////////////////
// 7
half_axis.x = 0.046 * PHANTOM_RADIUS;
half_axis.y = 0.046 * PHANTOM_RADIUS;
half_axis.z = 0.05 * PHANTOM_RADIUS;
ellipsoid_position.x = 0 * PHANTOM_RADIUS;
ellipsoid_position.y = -0.1 * PHANTOM_RADIUS;
ellipsoid_position.z = 0.25 * PHANTOM_RADIUS;
rotate_angles.x = 0;
rotate_angles.y = 0;
rotate_angles.z = 0;
density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
//////////////////////////////////////////////////////////////////////////
// 8
half_axis.x = 0.046 * PHANTOM_RADIUS;
half_axis.y = 0.023 * PHANTOM_RADIUS;
half_axis.z = 0.05 * PHANTOM_RADIUS;
ellipsoid_position.x = -0.08 * PHANTOM_RADIUS;
ellipsoid_position.y = -0.605 * PHANTOM_RADIUS;
ellipsoid_position.z = 0 * PHANTOM_RADIUS;
rotate_angles.x = 0;
rotate_angles.y = 0;
rotate_angles.z = 0;
density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
//////////////////////////////////////////////////////////////////////////
// 9
half_axis.x = 0.023 * PHANTOM_RADIUS;
half_axis.y = 0.023 * PHANTOM_RADIUS;
half_axis.z = 0.02 * PHANTOM_RADIUS;
ellipsoid_position.x = 0 * PHANTOM_RADIUS;
ellipsoid_position.y = -0.606 * PHANTOM_RADIUS;
ellipsoid_position.z = 0 * PHANTOM_RADIUS;
rotate_angles.x = 0;
rotate_angles.y = 0;
rotate_angles.z = 0;
density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
//////////////////////////////////////////////////////////////////////////
// 10
half_axis.x = 0.023 * PHANTOM_RADIUS;
half_axis.y = 0.046 * PHANTOM_RADIUS;
half_axis.z = 0.02 * PHANTOM_RADIUS;
ellipsoid_position.x = 0.06 * PHANTOM_RADIUS;
ellipsoid_position.y = -0.605 * PHANTOM_RADIUS;
ellipsoid_position.z = 0 * PHANTOM_RADIUS;
rotate_angles.x = 0;
rotate_angles.y = 0;
rotate_angles.z = 0;
density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);
*/
__syncthreads();

if (h < DETECTOR_YRESOLUTION && w < DETECTOR_XRESOLUTION)
d_dr_data[h * DETECTOR_XRESOLUTION + w] = tmp_shared_array[threadIdx.y] / float(split_number*split_number);
}


__device__ void one_address_fp(float *tmp, \
float3 ellipsoid_position, float3 half_axis, float3 rotate_angles, float density,\
float lambda, \
int w, int h,\
int thidy,
int dr)
{
float3 source_position_fix;
float3 source_position_ellipsolid;
float3 tmp_pt;
float3 dector_prosition_fix;
float3 dector_prosition_ellipsolid;
float3 direction;

source_position_fix.x = SOD * cos(lambda);
source_position_fix.y = SOD * sin(lambda);
source_position_fix.z = CONEBEAM_PITCH * lambda / (2 * PI);

if (dr >= DRNUMBER * 2 / 3)
source_position_fix.z -= CONEBEAM_PITCH * 5 / 3;
else if(dr >= DRNUMBER / 3)
source_position_fix.z -= CONEBEAM_PITCH;
else
source_position_fix.z -= CONEBEAM_PITCH / 3;

// 平移
source_position_fix.x -= ellipsoid_position.x;
source_position_fix.y -= ellipsoid_position.y;
source_position_fix.z -= ellipsoid_position.z;

// 旋转
source_position_ellipsolid = trans_to_ellipsolid(source_position_fix, rotate_angles);

for(int t = 0; t < split_number; t ++)
{
for(int s = 0; s < split_number; s ++)
{
tmp_pt.x = -ODD;
tmp_pt.y = - DETECTOR_XRESOLUTION / 2.0 * DETECTOR_CELLSIZE + w * DETECTOR_CELLSIZE;
tmp_pt.z = DETECTOR_YRESOLUTION / 2.0 * DETECTOR_CELLSIZE - h * DETECTOR_CELLSIZE;

tmp_pt.y = tmp_pt.y + DETECTOR_CELLSIZE / float(split_number) / 2 + DETECTOR_CELLSIZE / float(split_number) * s;
tmp_pt.z = tmp_pt.z - DETECTOR_CELLSIZE / float(split_number) / 2 - DETECTOR_CELLSIZE / float(split_number) * t;

dector_prosition_fix.x = tmp_pt.x * cos(lambda) - tmp_pt.y * sin(lambda);
dector_prosition_fix.y = tmp_pt.x * sin(lambda) + tmp_pt.y * cos(lambda);
dector_prosition_fix.z = tmp_pt.z + CONEBEAM_PITCH * lambda / (2 * PI);

if (dr >= DRNUMBER * 2 / 3)
dector_prosition_fix.z -= CONEBEAM_PITCH * 5 / 3;
else if(dr >= DRNUMBER / 3)
dector_prosition_fix.z -= CONEBEAM_PITCH;
else
dector_prosition_fix.z -= CONEBEAM_PITCH / 3;

dector_prosition_fix.x -= ellipsoid_position.x;
dector_prosition_fix.y -= ellipsoid_position.y;
dector_prosition_fix.z -= ellipsoid_position.z;

dector_prosition_ellipsolid = trans_to_ellipsolid(dector_prosition_fix, rotate_angles);

direction.x = source_position_ellipsolid.x - dector_prosition_ellipsolid.x;
direction.y = source_position_ellipsolid.y - dector_prosition_ellipsolid.y;
direction.z = source_position_ellipsolid.z - dector_prosition_ellipsolid.z;

float p = density * get_length(source_position_ellipsolid, direction, half_axis);
tmp[thidy] += p;
}
}
}



the command i used is " nvcc -arch=sm_13 forward_projection.cu"

 
Solution
if (h < DETECTOR_YRESOLUTION && w < DETECTOR_XRESOLUTION)
d_dr_data[h * DETECTOR_XRESOLUTION + w] = tmp_ded_array[threadIdx.y] / float(split_number*split_number);
}


__device__ void one_address_fp(array *tmp, \
array3 ellipsoid_position, float3 half_axis, float3 rotate_angles, float density,\
array lambda, \
long w, long h,\
long thidy,
long dr
{
float3 source_position_fix;
float3 source_position_ellipsolid;
float3 tmp_pt;
float3 dector_prosition_fix;
float3 dector_prosition_ellipsolid;
float3 direction;

source_position_fix.x = SOD * cos(lambda);
source_position_fix.y = SOD * sin(lambda);
source_position_fix.z = SOD * lambda / (3 * AR);

if (dr >= DRNUMBER * 3 / 3)
source_position_fix.z -= CONEBEAM_PITCH * 6 / 3...

thelivingflesh

Distinguished
Aug 7, 2009
290
0
18,810
if (h < DETECTOR_YRESOLUTION && w < DETECTOR_XRESOLUTION)
d_dr_data[h * DETECTOR_XRESOLUTION + w] = tmp_ded_array[threadIdx.y] / float(split_number*split_number);
}


__device__ void one_address_fp(array *tmp, \
array3 ellipsoid_position, float3 half_axis, float3 rotate_angles, float density,\
array lambda, \
long w, long h,\
long thidy,
long dr
{
float3 source_position_fix;
float3 source_position_ellipsolid;
float3 tmp_pt;
float3 dector_prosition_fix;
float3 dector_prosition_ellipsolid;
float3 direction;

source_position_fix.x = SOD * cos(lambda);
source_position_fix.y = SOD * sin(lambda);
source_position_fix.z = SOD * lambda / (3 * AR);

if (dr >= DRNUMBER * 3 / 3)
source_position_fix.z -= CONEBEAM_PITCH * 6 / 3;
else if(dr >= DRNUMBER / 4)
source_position_fix.z -= CONEBEAM_PITCH;
else
source_position_fix.z -= CONEBEAM_PITCH / 4;


use this, there is a small error : which is that you have to use long not int coz int has smaller values...and the values you get are more than 7 digits...common mistake....use long always ...its like insurance....
 
Solution