Intel® C++ Compiler 19.0 and above and Intel® Fortran Compiler 19.1 and above support the SIMD implementation of the inclusive and exclusive scan. Starting with OpenMP* Version 5.0, the reduction clause supports scan patterns.
Explicit syntax for inclusive scan using C++
#pragma omp simd reduction(inscan, operator : list)
#pragma omp scan inclusive(item-list)
Explicit syntax for inclusive scan using Fortran
!$omp simd reduction(inscan, operator : list)
!$omp scan inclusive(item-list)
Explicit syntax for exclusive scan using C++
#pragma omp simd reduction(inscan, operator : list)
#pragma omp scan exclusive(item-list)
Explicit syntax for exclusive scan using Fortran
!$omp simd reduction(inscan, operator : list)
!$omp scan exclusive(item-list)
Below is a C++ code snippet that computes a prefix sum using the SIMD scan feature. Each listing contains a serial version of the loop as well as a SIMD version, for both the exclusive and the inclusive scan.
Exclusive Scan |
Inclusive Scan |
#include<iostream>
#include<math.h>
#include<string.h>
#include<chrono>
constexpr int N = 16;  // number of elements in the input array

// Computes the exclusive prefix sum of a[0..N-1] twice -- first with a plain
// serial loop, then with an OpenMP SIMD scan -- and prints each result followed
// by the elapsed time of the corresponding loop.
//
// In an exclusive scan, output element i holds the sum of a[0..i-1], so the
// output is stored BEFORE a[i] is added to the running sum.
int main(){
    int a[N], serial_scan[N], simd_scan[N];
    for(int i = 0; i < N; i++){
        a[i] = i;
        serial_scan[i] = 0;
        simd_scan[i] = 0;
    }

    //Serial Scan with "+" operator
    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments to the system (wall) clock.
    auto start = std::chrono::steady_clock::now();
    int scan_a = 0;
    for(int i = 0; i < N; i++){
        serial_scan[i] = scan_a;
        scan_a += a[i];
    }
    auto stop = std::chrono::steady_clock::now();
    std::cout<<"Serial Scan Output:\n";
    for(int i = 0; i < N; i++)
        std::cout<<serial_scan[i]<<"\t";
    std::cout<<"\n";
    std::chrono::duration<double> elapsed_seconds = stop-start;
    std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";

    //SIMD Scan with "+" operator
    start = std::chrono::steady_clock::now();
    scan_a = 0;
    #pragma omp simd reduction(inscan, +:scan_a)
    for(int i = 0; i < N; i++){
        // Statements before the scan directive read the running sum ...
        simd_scan[i] = scan_a;
        #pragma omp scan exclusive(scan_a)
        // ... and statements after it update the running sum.
        scan_a += a[i];
    }
    stop = std::chrono::steady_clock::now();
    std::cout<<"SIMD Scan Output:\n";
    for(int i = 0; i < N; i++)
        std::cout<<simd_scan[i]<<"\t";
    std::cout<<"\n";
    elapsed_seconds = stop - start;
    std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";
    return 0;
}
|
#include<iostream>
#include<math.h>
#include<string.h>
#include<chrono>
constexpr int N = 16;  // number of elements in the input array

// Computes the inclusive prefix sum of a[0..N-1] twice -- first with a plain
// serial loop, then with an OpenMP SIMD scan -- and prints each result followed
// by the elapsed time of the corresponding loop.
//
// In an inclusive scan, output element i holds the sum of a[0..i], so a[i] is
// added to the running sum BEFORE the output is stored.
int main(){
    int a[N], serial_scan[N], simd_scan[N];
    for(int i = 0; i < N; i++){
        a[i] = i;
        serial_scan[i] = 0;
        simd_scan[i] = 0;
    }

    //Serial Scan with "+" operator
    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments to the system (wall) clock.
    auto start = std::chrono::steady_clock::now();
    int scan_a = 0;
    for(int i = 0; i < N; i++){
        scan_a += a[i];
        serial_scan[i] = scan_a;
    }
    auto stop = std::chrono::steady_clock::now();
    std::cout<<"Serial Scan Output:\n";
    for(int i = 0; i < N; i++)
        std::cout<<serial_scan[i]<<"\t";
    std::cout<<"\n";
    std::chrono::duration<double> elapsed_seconds = stop-start;
    std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";

    //SIMD Scan with "+" operator
    start = std::chrono::steady_clock::now();
    scan_a = 0;
    #pragma omp simd reduction(inscan, +:scan_a)
    for(int i = 0; i < N; i++){
        // Statements before the scan directive update the running sum ...
        scan_a += a[i];
        #pragma omp scan inclusive(scan_a)
        // ... and statements after it read the running sum.
        simd_scan[i] = scan_a;
    }
    stop = std::chrono::steady_clock::now();
    std::cout<<"SIMD Scan Output:\n";
    for(int i = 0; i < N; i++)
        std::cout<<simd_scan[i]<<"\t";
    std::cout<<"\n";
    elapsed_seconds = stop - start;
    std::cout<<"Time taken in seconds is "<<elapsed_seconds.count()<<"\n";
    return 0;
}
|
Here is the Fortran version of the same code snippet.
Exclusive Scan |
Inclusive Scan |
program exclusive
    ! Computes the exclusive prefix sum of a(1:n) with a serial loop and with an
    ! OpenMP SIMD scan, then prints each result and the CPU time each version
    ! took. In an exclusive scan, element i of the output holds the sum of
    ! a(1:i-1), so the output is stored BEFORE a(i) is added to the running sum.
    implicit none
    integer, parameter :: n = 16
    ! iterate enough times to accumulate some CPU time
    integer, parameter :: ntimes = 10000000
    integer :: a(n), serial_scan(n), simd_scan(n), scn_a
    integer :: i, j
    ! Named t_start/t_stop so the timing variable does not shadow the STOP statement.
    real(8) :: t_start, t_stop

    !!! initialize
    do i = 1, n
        a(i) = i-1
    end do
    serial_scan = 0
    simd_scan = 0

    !!! Serial Scan with "+" operator
    call cpu_time(t_start)
    do j = 1, ntimes
        scn_a = 0
        do i = 1, n
            serial_scan(i) = scn_a
            scn_a = scn_a + a(i)
        end do
    end do
    call cpu_time(t_stop)
    print *, "Serial Scan Output: "
    print *, serial_scan
    print *, "Time taken in seconds is ",t_stop-t_start

    !!! SIMD Scan with "+" operator
    call cpu_time(t_start)
    do j = 1, ntimes
        scn_a = 0
        !$omp simd reduction(inscan, +:scn_a)
        do i = 1, n
            ! Statements before the scan directive read the running sum;
            ! statements after it update the running sum.
            simd_scan(i) = scn_a
            !$omp scan exclusive(scn_a)
            scn_a = scn_a + a(i)
        end do
    end do
    call cpu_time(t_stop)
    print *,"SIMD Scan Output: "
    print *, simd_scan
    print *, "Time taken in seconds is ",t_stop-t_start
    stop
end program exclusive
|
program inclusive
    ! Computes the inclusive prefix sum of a(1:n) with a serial loop and with an
    ! OpenMP SIMD scan, then prints each result and the CPU time each version
    ! took. In an inclusive scan, element i of the output holds the sum of
    ! a(1:i), so a(i) is added to the running sum BEFORE the output is stored.
    implicit none
    integer, parameter :: n = 16
    ! iterate enough times to accumulate some CPU time
    integer, parameter :: ntimes = 10000000
    integer :: a(n), serial_scan(n), simd_scan(n), scn_a
    integer :: i, j
    ! Named t_start/t_stop so the timing variable does not shadow the STOP statement.
    real(8) :: t_start, t_stop

    !!! initialize
    do i = 1, n
        a(i) = i-1
    end do
    serial_scan = 0
    simd_scan = 0

    !!! Serial Scan with "+" operator
    call cpu_time(t_start)
    do j = 1, ntimes
        scn_a = 0
        do i = 1, n
            scn_a = scn_a + a(i)
            serial_scan(i) = scn_a
        end do
    end do
    call cpu_time(t_stop)
    print *, "Serial Scan Output: "
    print *, serial_scan
    print *, "Time taken in seconds is ",t_stop-t_start

    !!! SIMD Scan with "+" operator
    call cpu_time(t_start)
    do j = 1, ntimes
        scn_a = 0
        !$omp simd reduction(inscan, +:scn_a)
        do i = 1, n
            ! Statements before the scan directive update the running sum;
            ! statements after it read the running sum.
            scn_a = scn_a + a(i)
            !$omp scan inclusive(scn_a)
            simd_scan(i) = scn_a
        end do
    end do
    call cpu_time(t_stop)
    print *, "SIMD Scan Output: "
    print *, simd_scan
    print *, "Time taken in seconds is ",t_stop-t_start
    stop
end program inclusive
|
The single-threaded SIMD implementation runs ~2x faster than the serial, non-vectorized implementation when targeting Intel® Advanced Vector Extensions 2 (Intel® AVX2), for both the C++ and the Fortran versions.
Machine Specification:
Processor: Intel® Xeon® CPU E7-4850 v3 @ 2.20GHz
RAM: 512 GB
Compiler Version: Intel® C++ Compiler 19.0
Compiler Flags: -std=c++11 -xCORE-AVX2
Date of performance run: 7/11/2019
***Performance results are based on testing as of 7/11/2019 and may not reflect all publicly available security updates. See configuration disclosure for details. No product can be absolutely secure.
