Developer Guide
Intel® oneAPI DPC++/C++ Compiler Handbook for FPGAs
FPGA Loop Directives
The following table summarizes loop directives:
| Directive (Pragma, Attribute, or Function) | Description | Example | 
|---|---|---|
| disable_loop_pipelining | Directs the Intel® oneAPI DPC++/C++ Compiler to disable pipelining of a loop. | [[intel::disable_loop_pipelining]]
for (int i = 1; i < N; i++) {
  int j = a[i-1];
  // Memory dependency induces 
  // a high-latency loop feedback path
  a[i] = foo(j)
} | 
| initiation_interval | Forces a loop to have a loop initialization interval (II) of a specified value. | // ii set to 5
[[intel::initiation_interval(5)]]
  for (int i = 0; i < N; ++i){
} | 
| ivdep | Ignores memory dependencies between iterations of this loop. Applying the ivdep attribute to a variable that is used in a lambda function or a variable that is passed as a function argument can result in functional failures in your kernel. | // ivdep loop
[[intel::ivdep]] for (…) {}//ivdep safelen 
[[intel::ivdep(safelen)]] for (;;) {} // ivdep accessor
[[intel::ivdep(accessorA)]] for (;;) {}//ivdep array safelen
[[intel::ivdep(accessorA, safelen)]]
for (;;){} | 
| loop_coalesce | Coalesces nested loops into a single loop without affecting the loop functionality. | [[intel::loop_coalesce(2)]] 
for (int i = 0; i < N; i++)
  for (int j = 0; j < M; j++)
    sum[i][j] += i+j; | 
| max_concurrency | Limits the number of iterations of a loop that can simultaneously execute at any time. | //max concurrency set to 1
[[intel::max_concurrency(1)]] 
  for (int i = 0; i < c; ++i){
} | 
| max_interleaving | Maximizes the throughput and hardware resource occupancy of pipelined inner loops in a loop nest. | // Loop j is pipelined with ii=1
for (int j = 0; j < M; j++) {
  int a[N];
  // Loop i is pipelined with ii=2 
  [[intel::max_interleaving(1)]]
  for (int i = 1; i < N; i++) {
    a[i] = foo(i)
  }
  …
} | 
| speculated_iterations | Improves the performance of pipelined loops. | [[intel::speculated_iterations(1)]]
  while (m*m*m < N) {
    m += 1;
  }
  dst[0] = m; | 
| unroll | Unrolls a loop in the kernel code. | // unroll factor N set to 2
#pragma unroll 2
for(size_t k = 0; k < 4; k++){
  mac += data_in[(gid * 4) + k] * coeff[k];
} | 
| nofusion | Prevents the compiler from fusing the annotated loop with any of the adjacent loops. | for (int x = 0; x < N; x++) { 
  a1_acc[x] = x; 
}
[[intel::nofusion]] 
for (int x = 0; x < N; x++) { 
  a2_acc[x] = x; 
} | 
| sycl::ext::intel::fpga_loop_fuse<v>(f) | Fuses loops within the function f up to a depth of v >= 1, where v = 1 by default. | [=]() [[intel::kernel_args_restrict]] { 
  sycl::ext::intel::fpga_loop_fuse<v>{
    for (int x = 0; x < N; x++) {
      for (int y = 0; y < N; y++) {
        for (int z = 0; z < N; z++) {
          a1_acc[x][y][z] = 0;
        }
      }
    }
    for (int x = 0; x < N + 1; x++) {
      for (int y = 0; y < N + 1; y++) {
        for (int z = 0; z < N + 1; z++) {
          a2_acc[x][y][z] = 0;
        }
      }
    }
  }
} | 
| sycl::ext::intel::fpga_loop_fuse<v><v>(f) | Fuses loops within the function f up to a depth v >= 1 while overriding fusion-safety checks. Here, v = 1 by default. | [=]() { //Kernel
  sycl::ext::intel::fpga_loop_fuse_independent([&] {
    for(int x = 0; x < N; x++){
      a3_acc[x] = x;
    }
    for(int x = 0; x < N + 1; x++){
      a4_acc[x] = x;
    }
  });
} |