Developer Guide
Intel® oneAPI DPC++/C++ Compiler Handbook for FPGAs
FPGA Loop Directives
The following table summarizes loop directives:
| Directive (Pragma, Attribute, or Function) |
Description | Example |
|---|---|---|
| disable_loop_pipelining | Directs the Intel® oneAPI DPC++/C++ Compiler to disable pipelining of a loop. |
[[intel::disable_loop_pipelining]]
for (int i = 1; i < N; i++) {
int j = a[i-1];
// Memory dependency induces
// a high-latency loop feedback path
a[i] = foo(j)
} |
| initiation_interval | Forces a loop to have a loop initialization interval (II) of a specified value. |
// ii set to 5
[[intel::initiation_interval(5)]]
for (int i = 0; i < N; ++i){
} |
| ivdep | Ignores memory dependencies between iterations of this loop. Applying the ivdep attribute to a variable that is used in a lambda function or a variable that is passed as a function argument can result in functional failures in your kernel. |
// ivdep loop
[[intel::ivdep]] for (…) {}//ivdep safelen
[[intel::ivdep(safelen)]] for (;;) {} // ivdep accessor
[[intel::ivdep(accessorA)]] for (;;) {}//ivdep array safelen
[[intel::ivdep(accessorA, safelen)]]
for (;;){} |
| loop_coalesce | Coalesces nested loops into a single loop without affecting the loop functionality. |
[[intel::loop_coalesce(2)]]
for (int i = 0; i < N; i++)
for (int j = 0; j < M; j++)
sum[i][j] += i+j; |
| max_concurrency | Limits the number of iterations of a loop that can simultaneously execute at any time. |
//max concurrency set to 1
[[intel::max_concurrency(1)]]
for (int i = 0; i < c; ++i){
} |
| max_interleaving | Maximizes the throughput and hardware resource occupancy of pipelined inner loops in a loop nest. |
// Loop j is pipelined with ii=1
for (int j = 0; j < M; j++) {
int a[N];
// Loop i is pipelined with ii=2
[[intel::max_interleaving(1)]]
for (int i = 1; i < N; i++) {
a[i] = foo(i)
}
…
} |
| speculated_iterations | Improves the performance of pipelined loops. |
[[intel::speculated_iterations(1)]]
while (m*m*m < N) {
m += 1;
}
dst[0] = m; |
| unroll | Unrolls a loop in the kernel code. |
// unroll factor N set to 2
#pragma unroll 2
for(size_t k = 0; k < 4; k++){
mac += data_in[(gid * 4) + k] * coeff[k];
} |
| nofusion | Prevents the compiler from fusing the annotated loop with any of the adjacent loops. | for (int x = 0; x < N; x++) {
a1_acc[x] = x;
}
[[intel::nofusion]]
for (int x = 0; x < N; x++) {
a2_acc[x] = x;
} |
| sycl::ext::intel::fpga_loop_fuse<v>(f) | Fuses loops within the function f up to a depth of v >= 1, where v = 1 by default. | [=]() [[intel::kernel_args_restrict]] {
sycl::ext::intel::fpga_loop_fuse<v>{
for (int x = 0; x < N; x++) {
for (int y = 0; y < N; y++) {
for (int z = 0; z < N; z++) {
a1_acc[x][y][z] = 0;
}
}
}
for (int x = 0; x < N + 1; x++) {
for (int y = 0; y < N + 1; y++) {
for (int z = 0; z < N + 1; z++) {
a2_acc[x][y][z] = 0;
}
}
}
}
} |
| sycl::ext::intel::fpga_loop_fuse<v><v>(f) | Fuses loops within the function f up to a depth v >= 1 while overriding fusion-safety checks. Here, v = 1 by default. | [=]() { //Kernel
sycl::ext::intel::fpga_loop_fuse_independent([&] {
for(int x = 0; x < N; x++){
a3_acc[x] = x;
}
for(int x = 0; x < N + 1; x++){
a4_acc[x] = x;
}
});
} |