Friday, April 15, 2011

2

Got really stuck recently, fortunately I finally found the bug.

I wrote a small GPU version bilinear interpolation image scaling Kernel but got some weird results, the output image was randomly shift right/left a few pixel for no reason.

At beginning, I thought the mistakes happened in Kernels. Then I notice it might be some “data type conversion& data type accuracy”  based on some “seems to be correct” results, but I was on the wrong direction.

The problem actually happens with OpenCV data matrix aligned problem. Previously, it ran “looks perfect” on other kernels because input and output are both falsely aligned.Anyway….

The performance seems to be good so far. My scaling kernel (running on my low-end nvs 3100m) took 0.28 milliseconds while cvResize from OpenCV took 6.7 milliseconds on single operation . That’s 20x speedup

Though memory transfer latency is not taken into account  and cvResize is higher level of processing function and its slow (see performance comparison on read from CFILE , c++ fileread API, FILE from C ) , there are many better GPU will have much stronger performance.

Capture

Scaling Kernel

   1: __global__ void DownScale(float*O_data, float scale)
   2: {
   3:  
   4:     //__shared__  float tile[16][16];
   5:  
   6:     unsigned int tid_x =  threadIdx.x + __umul24(blockIdx.x,blockDim.x);
   8:  
   9:  
  10:     /*****Mapping to Unscaled Image****/
  11:     unsigned int Pixel_x = (unsigned int)(tid_x*scale) ; 
  12:     unsigned int Pixel_y = (unsigned int)(tid_y*scale) ; 
  13:     float a = tid_x*scale - Pixel_x;
  14:     float b = tid_x*scale - Pixel_y;
   7:     unsigned int tid_y =  threadIdx.y + __umul24(blockIdx.y,blockDim.y);
  15:     b= 0;
  16:     //unsigned int index = tid_x  + tid_y*Img_Attr.Image_width;
  17:  
  18:      
  19:     /****Memory Coalescing is even slower!!**/ 
  20:         /*if (tid_x < Img_Attr.Image_width&&tid_y < Img_Attr.Image_height)
  21:             tile[threadIdx.x][threadIdx.y] = tex2D(Image_tex, Pixel_x,Pixel_y);
  22:     __syncthreads();
  23: 
  24: 
  25:         if (tid_x < Img_Attr.Image_width&&tid_y < Img_Attr.Image_height)
  26:             O_data[index] = tile[threadIdx.x][threadIdx.y];
  27:     */
  28:     ////////////////////////////////////////////////////////////
  29:     /*****************Bilinear Interpolation****************/
  30:     if(tid_x < Img_Attr.Image_width &&tid_y  < Img_Attr.Image_height)
  31:         O_data[tid_x  + tid_y*Img_Attr.Image_width]  = (1.0f - a)*(1.0f - b)*tex2D(Image_tex,Pixel_x ,Pixel_y) 
  32:                                                                                         + a*(1.0f - b)*tex2D(Image_tex, Pixel_x + 1,Pixel_y) 
  33:                                                                                     + (1.0f - a)*b*tex2D(Image_tex, Pixel_x,Pixel_y + 1)
  34:                                                                                     + a*b*tex2D(Image_tex, Pixel_x + 1,Pixel_y + 1)  ;
  35: }

Integral Kernel



   1: //*********************Compute Integral********************/
   2:  
   3: __global__  void Compute_IntegralBin(float*Bin_Img,float*His_Img)
   4: {
   5:     __shared__ float cache[CELL_SIZE][CELL_SIZE];
   6:     __shared__ float cache_t[CELL_SIZE];
   7:  
   8:     unsigned int tid_x =  threadIdx.x + __umul24(blockIdx.x,blockDim.x);
   9:     unsigned int tid_y =  threadIdx.y + __umul24(blockIdx.y,blockDim.y);
  10:     
  11:     //unsigned int block_x = blockIdx.x;
  12:     //unsigned int block_y = blockIdx.y;
  13:  
  14:     unsigned int cacheId_x = threadIdx.x ;
  15:     unsigned int cacheId_y = threadIdx.y;
  16:     //int cacheIndex = threadIdx.x;
  17:  
  18:     if(tid_x  + tid_y*Img_Attr.Image_width < Img_Attr.Image_size)
  19:     {
  20:         float sum;
  21:         for(int Bin_id = 0; Bin_id < K; Bin_id ++)
  22:         {
  23:             
  24:             cache[cacheId_x][cacheId_y]  = Bin_Img[tid_x  + tid_y*Img_Attr.Image_width + Bin_id* Img_Attr.Image_size] ;
  25:             __syncthreads();
  26:             
  27:             /////////////////////////////////
  28:             
  29:             sum = 0;
  30:             /*********Version One,Correct given all integral in the cell*****/
  31:             /*
  32:             for  (int i  = 0; i <= cacheId_y ; i ++) 
  33:                 sum  += cache[cacheId_x][i];                                                        
  34: 
  35:             __syncthreads();
  36:             cache[cacheId_x][cacheId_y] = sum;
  37: 
  38:             __syncthreads();
  39: 
  40:             sum = 0;
  41: 
  42:             for  (int i  = 0; i <= cacheId_x ; i ++) 
  43:                 sum  += cache[i][cacheId_y];            
  44:             __syncthreads();
  45:             */
  46:             /****************************************************/
  47:  
  48:             /*********************Version 2 compute sum only*********/
  49:             int i = CELL_SIZE/2;
  50:             while (i != 0) 
  51:             {
  52:                 if (cacheId_x < i)
  53:                     cache[cacheId_x][cacheId_y] += cache[cacheId_x + i][cacheId_y];
  54:                 __syncthreads();
  55:                 i /= 2;
  56:             }
  57:  
  58:             if(cacheId_x == 0)
  59:             {
  60:                 cache_t[cacheId_y] = cache[0][cacheId_y];
  61:                 __syncthreads();
  62:  
  63:                 i = CELL_SIZE/2;
  64:                 while (i != 0) 
  65:                 {
  66:                     if (cacheId_y < i)
  67:                         cache_t[cacheId_y] += cache_t[cacheId_y + i];
  68:                     __syncthreads();
  69:                     i /= 2;
  70:                 }
  71:                 if(cacheId_y == 0)
  72:                     sum = cache_t[0];
  73:             }
  74:             
  75:             /***********************************************************/
  76:  
  77:             Bin_Img[tid_x  + tid_y*Img_Attr.Image_width + Bin_id* Img_Attr.Image_size]  = sum/16;
  78:  
  79:             //////////////TEST////////////
  80:             if (cacheId_x  == 0 &&cacheId_y  == 0) 
  81:             His_Img [blockIdx.x + blockIdx.y*gridDim.x +Bin_id* __umul24(gridDim.x,gridDim.y)] = sum/16;
  82:             /////////////////////////////
  83:         }
  84:  
  85:         //
  86:         
  87:     }
  88: }


Gradient and Bins



   1: /*******************Compute Gradient and assign them to bin Images ******/
   2: __global__ void Compute_GradientBin(float*O_data,float*Bin_Img)
   3: {
   4:  
   5:  
   6:     unsigned int tid_x =  threadIdx.x + __umul24(blockIdx.x,blockDim.x);
   7:     unsigned int tid_y =  threadIdx.y + __umul24(blockIdx.y,blockDim.y);
   8:  
   9:     float angle;
  10:     float Gradx,Grady;
  11:     
  12:     ///////////////////////////////////////////////Access thhe image/////////////////////////////////
  13:     //#if Texture REading////////////////////////////////////TExture Memory
  14:     if(tid_x < Img_Attr.Image_width&&tid_y < Img_Attr.Image_height)
  15:     {
  16:  
  17:         if(tid_x == 0 || tid_x == Img_Attr.Image_width - 1 )
  18:             Gradx = 0.0f;
  19:         else 
  20:             Gradx = (tex2D(Image_tex, tid_x + 1,tid_y) - tex2D(Image_tex, tid_x - 1,tid_y));
  21:                 //+ tex2D(Image_tex, tid_x + 1,tid_y - 1 ) - tex2D(Image_tex, tid_x - 1,tid_y - 1)
  22:                 //+ tex2D(Image_tex, tid_x + 1,tid_y + 1) - tex2D(Image_tex, tid_x - 1,tid_y + 1) ;
  23:         if (tid_y == 0|| tid_y == Img_Attr. Image_height - 1)
  24:             Grady = 0.0f;
  25:         else 
  26:             Grady =  (tex2D(Image_tex, tid_x ,tid_y + 1) - tex2D(Image_tex, tid_x,tid_y - 1)) ;
  27:                 //+ tex2D(Image_tex, tid_x - 1,tid_y + 1 ) - tex2D(Image_tex, tid_x - 1,tid_y - 1)
  28:                 //+ tex2D(Image_tex, tid_x + 1,tid_y + 1) - tex2D(Image_tex, tid_x + 1,tid_y - 1) ;
  29:  
  30:         ///////////////////////////////////////////////////////////////////////////////////////////////
  31:  
  32:  
  33:         /////////////////////////////Angle of the Gradient///////////////// 
  34:     
  35:         if(Gradx != 0)
  36:         {angle  = atan(Grady/Gradx)/3.14159f  >= 0.0f ? atan(Grady/Gradx)/3.14159f : atan(Grady/Gradx)/3.14159f + 1.0f ;
  37:             if(Gradx < 0.0f && angle == 0.0f)
  38:                 angle = 1.0f;
  39:         }
  40:         else
  41:             angle = 0.5f;//Though Condition Gradx == 0&&Grady == 0 exsits , it does not matter, cos its magnitude will be zero, which would affect Feature 
  42:  
  43:         //Plain Output 
  44:         O_data[tid_x  + tid_y*Img_Attr.Image_width] = 0.5*(sqrtf(Grady*Grady + Gradx*Gradx));
  45:  
  46:         ///////////////////Bins    
  47:         angle = angle*K;
  48:         //////////////////////////////////Bin Gradient Images
  49:         for(int Bin_id = 0; Bin_id < K;Bin_id++)
  50:         {    
  51:             Bin_Img[tid_x  + tid_y*Img_Attr.Image_width + Bin_id*Img_Attr.Image_size]  = 0.0f;
  52:             if(angle <=  (float)Bin_id + 1.0f&&angle >= (float)Bin_id)
  53:                 Bin_Img[tid_x  + tid_y*Img_Attr.Image_width + Bin_id*Img_Attr.Image_size] = 0.5*sqrtf(Gradx*Gradx + Grady*Grady);    
  54:         }        
  55:     }
  56:  
  57: }

2 comments:

  1. Nice progress. If you include memory transfer, do you still expect the GPU version to perform as well? If you want to test on a high end card, talk to Jon about using one of the machines in the SIG lab. Is the CPU version in OpenCV that you are comparing against single-threaded? Does it have any cache or SIMD optimizations?

    ReplyDelete
  2. Re @Patrick Cozzi:
    For this single operation comparison, I would not expect that. Memory transfer will be large enough to hide the 0.28ms processing time.
    But for whole operation(All the kernels),which scales to 100x larger while keep the same CPU -GPU- CPU memory transfer, the performance will be better in GPU.

    The CPU version in OpenCV is a built-in function which I dont look into it. but i am quite sure it's complete but slower implementation for pure scaling processing purpose

    ReplyDelete