/* Number of work-items per work-group along dimension 2. It sizes the
 * per-group __local arrays in the classification kernel, which are indexed by
 * get_local_id(2), and matches the 64 in that kernel's reqd_work_group_size. */
#define GSIZE 64
/*
 * classification - per-pixel partial sums of alpha * exp(-gamma * d^2).
 *
 * The work-group shape is fixed at (1,1,64), so all work-items of a group share
 * the same global ids in dimensions 0 and 1 (one pixel) and differ only in
 * dimension 2. Each work-item computes the squared Euclidean distance d^2
 * between the pixel's 90-byte texture vector and the SVS row selected by
 * get_global_id(2), then stores alphas[...] * exp(-gamma * d^2) in local
 * memory. Work-item 0 of the group adds the GSIZE terms and writes one float.
 *
 * textures: 90 uchar per pixel, pixel index = gidy * get_global_size(0) + gidx.
 * SVS:      90 uchar per row, row index = get_global_id(2).
 * alphas:   one float per entry, read at group_id(2) * GSIZE + k.
 * gamma:    scale applied to the squared distance inside exp().
 * result:   one float per (pixel, dimension-2 work-group), stored at
 *           pixel * get_num_groups(2) + get_group_id(2).
 */
__attribute__((num_simd_work_items(4)))
__attribute__((reqd_work_group_size(1,1,64)))
__kernel void classification(__global uchar * restrict textures, __global uchar * restrict SVS, __global float * restrict alphas,
float gamma,__global float * restrict result)
{
    /* Number of uchar components in one texture vector and in one SVS row. */
    enum { VEC_LEN = 90 };

    const uint col = get_global_id(0);            /* pixel x */
    const uint row = get_global_id(1);            /* pixel y */
    const uint sv = get_global_id(2);             /* SVS row handled by this work-item */
    const int width = get_global_size(0);
    const uint local_z = get_local_id(2);         /* 0 .. GSIZE-1 */
    const uint num_groups_z = get_num_groups(2);
    const uint group_z = get_group_id(2);

    /* Linear pixel index shared by the texture read and the result write. */
    const uint pixel = row * width + col;

    __local uchar texture[VEC_LEN];
    __local float alpha_local[GSIZE];
    __local float res[GSIZE];
    __local float gamma_local;

    /* Work-item 0 stages the data the whole group reads into local memory. */
    if (local_z == 0) {
        gamma_local = gamma;

        #pragma unroll
        for (int k = 0; k < VEC_LEN; ++k) {
            texture[k] = textures[pixel * VEC_LEN + k];
        }

        #pragma unroll
        for (int k = 0; k < GSIZE; ++k) {
            alpha_local[k] = alphas[group_z * GSIZE + k];
        }
    }
    /* Make the staged values visible to every work-item in the group. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Squared distance between this work-item's SVS row and the texture vector.
     * Integer accumulation: at most 90 * 255^2, well inside int range. */
    const uint sv_base = sv * VEC_LEN;
    int dist_sq = 0;
    #pragma unroll
    for (int k = 0; k < VEC_LEN; ++k) {
        const int diff = SVS[sv_base + k] - texture[k];
        dist_sq += diff * diff;
    }

    res[local_z] = alpha_local[local_z] * exp(-gamma_local * (float)dist_sq);
    /* All GSIZE terms must be written before work-item 0 sums them. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (local_z == 0) {
        /* Sequential left-to-right sum keeps the floating-point order fixed. */
        float acc = 0.0f;
        #pragma unroll
        for (int k = 0; k < GSIZE; ++k) {
            acc += res[k];
        }
        result[pixel * num_groups_z + group_z] = acc;
    }
}