opencl初探-sobel检测


sobel检测的C版本、neon版本与GPU版本的耗时比较。


Platform:LG G3, Adreno 330 ,img size 3264x2448


sobel:

| C code | neon | GPU |
| --- | --- | --- |
| 73 | 13 | 42+3.7+6.6 |

单位:ms。GPU时间 = memory time + Queued time + Run time




| | Sobel org | Sobel vector | Sobel vector +mem_fence |
| --- | --- | --- | --- |
| Queued time | 4.6 | 7.2 | 2.8 |
| Wait time | 0.07 | 0.09 | 0.07 |
| Run time | 66.9 | 7.3 | 6.6 |






typedef unsigned char BYTE;

/*
 * 3x3 Sobel gradients of an 8-bit image whose rows are exactly `w` bytes
 * apart (the row stride is taken to be `w`).
 *
 * For every interior pixel (1 <= x <= w-2, 1 <= y <= h-2):
 *
 *   Ix = (right column - left column), middle row weighted by 2
 *   Iy = (top row - bottom row),       middle column weighted by 2
 *
 * Each sum is shifted right by 3 and the low 8 bits are stored, so a
 * negative gradient ends up as its two's-complement byte (e.g. -5 -> 251).
 * Border pixels (first/last row and column) of Ix and Iy are never written.
 *
 * src : input image, w*h bytes (only read)
 * w,h : image width and height in pixels
 * Ix  : horizontal-gradient output, w*h bytes
 * Iy  : vertical-gradient output, w*h bytes
 *
 * Images smaller than 3x3 have no interior pixel; nothing is written.
 */
void sobel(BYTE *src, int w, int h, BYTE *Ix, BYTE *Iy)
{
    /* Without this guard a row count of h - 2 < 0 would never reach zero
     * when counted down, and the loop would walk far outside the buffers. */
    if (w < 3 || h < 3) {
        return;
    }

    const BYTE *top = src;      /* row y-1 of the current output row y */
    BYTE *dstX = Ix + w;        /* output starts at row 1 */
    BYTE *dstY = Iy + w;

    for (int y = 1; y < h - 1; y++, top += w, dstX += w, dstY += w) {
        const BYTE *mid = top + w;
        const BYTE *bot = mid + w;

        for (int x = 1; x < w - 1; x++) {
            /* Differences are formed in int and doubled by multiplication:
             * left-shifting a negative value is undefined behaviour. */
            int gx = (top[x + 1] - top[x - 1])
                   + 2 * (mid[x + 1] - mid[x - 1])
                   + (bot[x + 1] - bot[x - 1]);
            int gy = (top[x - 1] + 2 * top[x] + top[x + 1])
                   - (bot[x - 1] + 2 * bot[x] + bot[x + 1]);

            /* >> on a negative int is implementation-defined; this relies on
             * the usual arithmetic (sign-propagating) shift. */
            dstX[x] = (BYTE)(gx >> 3);
            dstY[x] = (BYTE)(gy >> 3);
        }
    }
}


/*
 * 3x3 Sobel gradients using ARM NEON intrinsics (<arm_neon.h>); same
 * per-pixel arithmetic as the scalar loop at the end of each row.
 *
 * Rows are taken to be exactly `w` bytes apart. For every interior pixel
 * the horizontal (Ix) and vertical (Iy) Sobel sums are shifted right by 3
 * and narrowed to 8 bits; negative results are stored as their
 * two's-complement byte. Border rows and columns of Ix/Iy are not written.
 *
 * src : input image, w*h bytes (only read)
 * w,h : image width and height in pixels
 * Ix  : horizontal-gradient output, w*h bytes
 * Iy  : vertical-gradient output, w*h bytes
 *
 * Images smaller than 3x3 have no interior pixel; nothing is written.
 */
void sobel_neon(BYTE *src, int w, int h, BYTE *Ix, BYTE *Iy)
{
    /* Without this guard a row count of h - 2 < 0 would never reach zero
     * when counted down, and the loop would walk far outside the buffers. */
    if (w < 3 || h < 3) {
        return;
    }

    const int step = w;
    BYTE *dstX = Ix + step;     /* output starts at row 1 */
    BYTE *dstY = Iy + step;

    for (int rows = h - 2; rows > 0; rows--, src += step, dstX += step, dstY += step) {
        const BYTE *src2 = src + step;
        const BYTE *src3 = src + step * 2;
        int x = 1;

        /* Vector part: 8 output pixels per iteration. The right-hand load
         * reads up to index x + 8, hence the x + 8 <= w - 1 bound. */
        while (x + 8 <= w - 1) {
            /* Top row. */
            uint8x8_t left  = vld1_u8(src + x - 1);
            uint8x8_t mid   = vld1_u8(src + x);
            uint8x8_t right = vld1_u8(src + x + 1);

            /* The widening subtract wraps in u16; reinterpreting the lanes
             * as s16 yields the signed difference. */
            int16x8_t t0 = vreinterpretq_s16_u16(vsubl_u8(right, left));
            int16x8_t t1 = vaddq_s16(vreinterpretq_s16_u16(vaddl_u8(left, right)),
                                     vreinterpretq_s16_u16(vshll_n_u8(mid, 1)));

            /* Middle row: contributes only to Ix, with weight 2. */
            left  = vld1_u8(src2 + x - 1);
            right = vld1_u8(src2 + x + 1);
            int16x8_t temp = vreinterpretq_s16_u16(vsubl_u8(right, left));
            t0 = vaddq_s16(t0, vshlq_n_s16(temp, 1));

            /* Bottom row. */
            left  = vld1_u8(src3 + x - 1);
            mid   = vld1_u8(src3 + x);
            right = vld1_u8(src3 + x + 1);
            t0 = vaddq_s16(t0, vreinterpretq_s16_u16(vsubl_u8(right, left)));
            temp = vaddq_s16(vreinterpretq_s16_u16(vaddl_u8(left, right)),
                             vreinterpretq_s16_u16(vshll_n_u8(mid, 1)));
            t1 = vsubq_s16(t1, temp);

            /* Shift right by 3 and narrow each lane to 8 bits. */
            vst1_s8((int8_t *)dstX + x, vshrn_n_s16(t0, 3));
            vst1_s8((int8_t *)dstY + x, vshrn_n_s16(t1, 3));
            x += 8;
        }

        /* Scalar tail for the columns the vector loop did not cover. */
        for (; x < w - 1; x++) {
            /* Doubling is done by multiplication: left-shifting a negative
             * value is undefined behaviour. */
            int gx = (src[x + 1] - src[x - 1])
                   + 2 * (src2[x + 1] - src2[x - 1])
                   + (src3[x + 1] - src3[x - 1]);
            int gy = (src[x - 1] + 2 * src[x] + src[x + 1])
                   - (src3[x - 1] + 2 * src3[x] + src3[x + 1]);

            /* Relies on arithmetic right shift of negative values. */
            dstX[x] = (BYTE)(gx >> 3);
            dstY[x] = (BYTE)(gy >> 3);
        }
    }
}
View Code

 

本站声明
本文转载自:http://www.cnblogs.com/mlj318/p/5039284.html     作者:mlj318     发布日期:2015/12/11     本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。


 
© 2014-2017 ITdaan.com 粤ICP备14056181号