Hello,
I have written two kernels to observe the difference between fixed-point and floating-point operations.
a)
/*
 * Fixed-point (8-bit) dot-product kernel.
 *
 * Multiply-accumulates VEC_SIZE signed 8-bit samples from `in` against the
 * matching 8-bit `weights` and writes the 32-bit integer sum through `out`.
 * `__attribute__((task))` requests a single-work-item (task) kernel; the
 * `#pragma unroll 100` fully unrolls the loop so the compiler instantiates
 * 100 parallel multiply-accumulate units.
 *
 * NOTE(review): VEC_SIZE is assumed to be defined at compile time (e.g. via
 * -DVEC_SIZE=100) and to match the unroll factor — confirm with the build.
 */
__kernel
__attribute__((task))
void test_multiplier(global char *restrict in, global char *restrict weights, global int *restrict out) {
    int acc = 0;
    #pragma unroll 100
    for (int idx = 0; idx < VEC_SIZE; ++idx) {
        acc += in[idx] * weights[idx];
    }
    *out = acc;
}
b)
/*
 * Floating-point dot-product kernel.
 *
 * Multiply-accumulates VEC_SIZE float samples from `in` against the matching
 * float `weights` and writes the float sum through `out`.
 * `__attribute__((task))` requests a single-work-item (task) kernel; the
 * `#pragma unroll 100` fully unrolls the loop so the compiler instantiates
 * 100 parallel floating-point multiply-accumulate units.
 *
 * NOTE(review): VEC_SIZE is assumed to be defined at compile time (e.g. via
 * -DVEC_SIZE=100) and to match the unroll factor — confirm with the build.
 */
__kernel
__attribute__((task))
void test_multiplier(global float *restrict in, global float *restrict weights, global float *restrict out) {
    /* BUG FIX: the accumulator was declared `int`, which truncated every
     * float product to an integer on each `+=` before the (already wrong)
     * result was stored through the float output pointer. Accumulate in
     * float so the kernel actually performs floating-point arithmetic. */
    float output = 0.0f;
    #pragma unroll 100
    for (int i = 0; i < VEC_SIZE; i++) {
        output += in[i] * weights[i];
    }
    *out = output;
}
Both kernels give me the same number of DSPs, i.e., 100 (the unroll factor). I was expecting 25 DSPs in the 8-bit (char-argument) case. Does the aoc compiler optimize well for fixed-point quantization?
I have written two kernels to observe the difference between fixed-point and floating-point operations.
a)
/*
 * Fixed-point (8-bit) dot-product kernel.
 *
 * Multiply-accumulates VEC_SIZE signed 8-bit samples from `in` against the
 * matching 8-bit `weights` and writes the 32-bit integer sum through `out`.
 * `__attribute__((task))` requests a single-work-item (task) kernel; the
 * `#pragma unroll 100` fully unrolls the loop so the compiler instantiates
 * 100 parallel multiply-accumulate units.
 *
 * NOTE(review): VEC_SIZE is assumed to be defined at compile time (e.g. via
 * -DVEC_SIZE=100) and to match the unroll factor — confirm with the build.
 */
__kernel
__attribute__((task))
void test_multiplier(global char *restrict in, global char *restrict weights, global int *restrict out) {
    int acc = 0;
    #pragma unroll 100
    for (int idx = 0; idx < VEC_SIZE; ++idx) {
        acc += in[idx] * weights[idx];
    }
    *out = acc;
}
b)
/*
 * Floating-point dot-product kernel.
 *
 * Multiply-accumulates VEC_SIZE float samples from `in` against the matching
 * float `weights` and writes the float sum through `out`.
 * `__attribute__((task))` requests a single-work-item (task) kernel; the
 * `#pragma unroll 100` fully unrolls the loop so the compiler instantiates
 * 100 parallel floating-point multiply-accumulate units.
 *
 * NOTE(review): VEC_SIZE is assumed to be defined at compile time (e.g. via
 * -DVEC_SIZE=100) and to match the unroll factor — confirm with the build.
 */
__kernel
__attribute__((task))
void test_multiplier(global float *restrict in, global float *restrict weights, global float *restrict out) {
    /* BUG FIX: the accumulator was declared `int`, which truncated every
     * float product to an integer on each `+=` before the (already wrong)
     * result was stored through the float output pointer. Accumulate in
     * float so the kernel actually performs floating-point arithmetic. */
    float output = 0.0f;
    #pragma unroll 100
    for (int i = 0; i < VEC_SIZE; i++) {
        output += in[i] * weights[i];
    }
    *out = output;
}
Both kernels give me the same number of DSPs, i.e., 100 (the unroll factor). I was expecting 25 DSPs in the 8-bit (char-argument) case. Does the aoc compiler optimize well for fixed-point quantization?