Hi,
I'm trying to figure out why my kernel breaks when I try to use the #pragma unroll statement for a loop. I'm using the 16.1 SDK and the device I'm running on is an Arria 10/DE5a-net and compiling it without any compiler flags (i.e. no relaxed floating point operations).
The kernel in question is posted below. It's part of a physics simulation and a bit long, but I'm posting the entire code, just to make sure I'm not cutting out an important part.
The kernel works fine in emulation. The kernel also works fine running on the device if I comment out the #pragma unroll statement for the innermost loop statement. But with the statement enabled, I only get NANs in the output. I can't see a hint of any problem in the compiler report or anywhere else. I know that the kernel is not well written or optimized yet. But at this point I'm just trying to understand why the unrolling fails.
Any help to solve this mystery would be appreciated!
Hanno
I'm trying to figure out why my kernel breaks when I try to use the #pragma unroll statement for a loop. I'm using the 16.1 SDK and the device I'm running on is an Arria 10/DE5a-net and compiling it without any compiler flags (i.e. no relaxed floating point operations).
The kernel in question is posted below. It's part of a physics simulation and a bit long, but I'm posting the entire code, just to make sure I'm not cutting out an important part.
The kernel works fine in emulation. The kernel also works fine running on the device if I comment out the #pragma unroll statement for the innermost loop statement. But with the statement enabled, I only get NANs in the output. I can't see a hint of any problem in the compiler report or anywhere else. I know that the kernel is not well written or optimized yet. But at this point I'm just trying to understand why the unrolling fails.
Any help to solve this mystery would be appreciated!
Hanno
Code:
#define N 9
__attribute__((task))
__kernel void vector_add(
__global const float *restrict p_in,
__global const float *restrict p_mass_in,
__global float *restrict p_out )
{
const float dt = 0.01;
const long steps = 10000;
float p_pos[3*N];
float p_vel[3*N];
float p_mass[N];
for (int i=0; i<N; i++){
p_pos[i*3+0] = p_in[i*6+0];
p_pos[i*3+1] = p_in[i*6+1];
p_pos[i*3+2] = p_in[i*6+2];
p_vel[i*3+0] = p_in[i*6+3];
p_vel[i*3+1] = p_in[i*6+4];
p_vel[i*3+2] = p_in[i*6+5];
p_mass[i] = p_mass_in[i];
}
float dt12 = dt/2.;
for(long k=0;k<steps;k++){
for(int i=0; i<N; i++){
p_pos[3*i+0] += dt12*p_vel[3*i+0];
p_pos[3*i+1] += dt12*p_vel[3*i+1];
p_pos[3*i+2] += dt12*p_vel[3*i+2];
}
for(int i=0; i<N; i++){
const float pix = p_pos[3*i+0];
const float piy = p_pos[3*i+1];
const float piz = p_pos[3*i+2];
float ax = 0.;
float ay = 0.;
float az = 0.;
#pragma unroll
for(int j=0; j<N; j++){
const float dx = p_pos[3*j+0] - pix;
const float dy = p_pos[3*j+1] - piy;
const float dz = p_pos[3*j+2] - piz;
const float pre_sqrt = dx*dx + dy*dy + dz*dz;
const float _r = sqrtf(pre_sqrt);
const float pre_recip = p_mass[j]/(pre_sqrt * _r);
const float prefact = ((i==j)?0.:pre_recip);
ax -= prefact*dx;
ay -= prefact*dy;
az -= prefact*dz;
}
p_vel[3*i+0] += dt*ax;
p_vel[3*i+1] += dt*ay;
p_vel[3*i+2] += dt*az;
}
for(int i=0; i<N; i++){
p_pos[3*i+0] += dt12*p_vel[3*i+0];
p_pos[3*i+1] += dt12*p_vel[3*i+1];
p_pos[3*i+2] += dt12*p_vel[3*i+2];
}
}
for (int i=0; i<N; i++){
p_out[i*6+0] = p_pos[i*3+0];
p_out[i*6+1] = p_pos[i*3+1];
p_out[i*6+2] = p_pos[i*3+2];
p_out[i*6+3] = p_vel[i*3+0];
p_out[i*6+4] = p_vel[i*3+1];
p_out[i*6+5] = p_vel[i*3+2];
}
}