Offloading oneMKL Computations onto the GPU
The Intel® oneAPI Math Kernel Library (oneMKL) improves performance
with math routines for software applications that solve large
computational problems. oneMKL provides BLAS and LAPACK linear algebra
routines, fast Fourier transforms, vectorized math functions, random
number generation functions, and other functionality.
You can use OpenMP directives to offload oneMKL computations onto the
GPU. There are two ways to do this.
One way is to use the Intel-specific OpenMP extension target variant dispatch. You place the call to the oneMKL routine inside a target variant dispatch construct, as shown in the example below. In this example, the arrays A, B, and C used in the multiplication are mapped to the device before the call to the oneMKL routine cblas_dgemm. The use_device_ptr(A,B,C) clause on the target variant dispatch directive indicates that A, B, and C point to objects that have corresponding storage on the device. When cblas_dgemm is called, the corresponding device pointers for A, B, and C are passed as arguments, and the device copies of A, B, and C are used in the computation.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"
#define min(x,y) (((x) < (y)) ? (x) : (y))
int main()
{
double *A, *B, *C, *C_fl;
int m, n, k, i, j, q;
double alpha, beta;
double sum;
int pass;
printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
" Intel MKL function dgemm, where A, B, and C are matrices and \n"
" alpha and beta are double precision scalars\n\n");
m = 2000, k = 200, n = 1000;
printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
" A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
alpha = 1.0; beta = 0.0;
printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
" performance \n\n");
A = (double *)mkl_malloc( m * k * sizeof( double ), 64 );
B = (double *)mkl_malloc( k * n * sizeof( double ), 64 );
C = (double *)mkl_malloc( m * n * sizeof( double ), 64 );
C_fl = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
if (A == NULL || B == NULL || C == NULL || C_fl == NULL) {
printf( "\n ERROR: Cannot allocate memory for matrices. Exiting... \n\n");
return 1;
}
printf (" Intializing matrices \n\n");
for (i = 0; i < (m*k); i++) {
A[i] = (double)(i+1);
}
for (i = 0; i < (k*n); i++) {
B[i] = (double)(-i-1);
}
for (i = 0; i < (m*n); i++) {
C[i] = 0.0;
C_fl[i] = 0.0;
}
printf (" Computing matrix product using Intel MKL dgemm function via CBLAS interface \n\n");
#pragma omp target data map(to: A[0:m*k], B[0:k*n]) map(tofrom: C[0:m*n])
{
#pragma omp target variant dispatch use_device_ptr(A, B, C)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, k, alpha, A, k, B, n, beta, C, n);
}
printf ("\n Top left corner of matrix C: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C[j+i*n]);
}
printf ("\n");
}
printf (" Computing matrix product using for-loops \n");
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
sum = 0.0;
for (q = 0; q < k; q++)
sum += A[k*i+q] * B[n*q+j];
C_fl[n*i+j] = sum;
}
}
printf ("\n Top left corner of matrix C_fl: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C_fl[j+i*n]);
}
printf ("\n");
}
printf (" Computations completed. Verifying... \n\n");
pass = 1;
for (i = 0; i < (m*n); i++) {
if (C[i] != C_fl[i]) {
pass = 0;
break;
}
}
if (pass)
printf ("\n **** PASS **** \n");
else
printf (" **** FAIL **** \n");
printf ("\n Deallocating memory \n\n");
mkl_free(A);
mkl_free(B);
mkl_free(C);
mkl_free(C_fl);
printf (" Example completed. \n\n");
return 0;
}
Another way to inform the compiler that oneMKL computations should be offloaded onto the GPU is to use the OpenMP 5.1 dispatch directive, as shown in the example below. In this example too, the arrays A, B, and C are mapped to the device before the call to the oneMKL routine cblas_dgemm. No clauses are needed on the dispatch directive. When cblas_dgemm is called, the corresponding device pointers for A, B, and C are passed as arguments, so the device copies of A, B, and C are used in the computation. In order for this example to compile, you need to add the compiler option -fopenmp-version=51.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"
#define min(x,y) (((x) < (y)) ? (x) : (y))
int main()
{
double *A, *B, *C, *C_fl;
int m, n, k, i, j, q;
double alpha, beta;
double sum;
int pass;
printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
" Intel MKL function dgemm, where A, B, and C are matrices and \n"
" alpha and beta are double precision scalars\n\n");
m = 2000, k = 200, n = 1000;
printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
" A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
alpha = 1.0; beta = 0.0;
printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
" performance \n\n");
A = (double *)mkl_malloc( m * k * sizeof( double ), 64 );
B = (double *)mkl_malloc( k * n * sizeof( double ), 64 );
C = (double *)mkl_malloc( m * n * sizeof( double ), 64 );
C_fl = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
if (A == NULL || B == NULL || C == NULL || C_fl == NULL) {
printf( "\n ERROR: Cannot allocate memory for matrices. Exiting... \n\n");
return 1;
}
printf (" Intializing matrices \n\n");
for (i = 0; i < (m*k); i++) {
A[i] = (double)(i+1);
}
for (i = 0; i < (k*n); i++) {
B[i] = (double)(-i-1);
}
for (i = 0; i < (m*n); i++) {
C[i] = 0.0;
C_fl[i] = 0.0;
}
printf (" Computing matrix product using Intel MKL dgemm function via CBLAS interface \n\n");
#pragma omp target data map(to: A[0:m*k], B[0:k*n]) map(tofrom: C[0:m*n])
{
#pragma omp dispatch
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, k, alpha, A, k, B, n, beta, C, n);
}
printf ("\n Top left corner of matrix C: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C[j+i*n]);
}
printf ("\n");
}
printf (" Computing matrix product using for-loops \n");
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
sum = 0.0;
for (q = 0; q < k; q++)
sum += A[k*i+q] * B[n*q+j];
C_fl[n*i+j] = sum;
}
}
printf ("\n Top left corner of matrix C_fl: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C_fl[j+i*n]);
}
printf ("\n");
}
printf (" Computations completed. Verifying... \n\n");
pass = 1;
for (i = 0; i < (m*n); i++) {
if (C[i] != C_fl[i]) {
pass = 0;
break;
}
}
if (pass)
printf ("\n **** PASS **** \n");
else
printf (" **** FAIL **** \n");
printf ("\n Deallocating memory \n\n");
mkl_free(A);
mkl_free(B);
mkl_free(C);
mkl_free(C_fl);
printf (" Example completed. \n\n");
return 0;
}
Notes
- If a oneMKL routine is not called from a target variant dispatch (or dispatch) region, or if offload is disabled, then the oneMKL computations will be executed on the CPU.
- Only one call to a oneMKL routine can be issued from an OpenMP target variant dispatch (or dispatch) region. If there are two consecutive calls to oneMKL routines, the calls should be placed in separate target variant dispatch (or dispatch) constructs, as sketched below.
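The following sketch illustrates this rule using the OpenMP 5.1 dispatch directive. The array names, sizes, and the enclosing target data region are illustrative and assume the same host-side setup as in the examples above.
// Two consecutive oneMKL calls: each call is placed in its own dispatch construct.
// A, B, C1, and C2 are assumed to be allocated and initialized on the host.
#pragma omp target data map(to: A[0:m*k], B[0:k*n]) map(tofrom: C1[0:m*n], C2[0:m*n])
{
  #pragma omp dispatch
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              m, n, k, alpha, A, k, B, n, beta, C1, n);

  #pragma omp dispatch
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              m, n, k, alpha, A, k, B, n, beta, C2, n);
}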
Batching of oneMKL GEMM Calls
The oneMKL library includes “batch” routines that allow the user to batch several oneMKL calls into a single oneMKL call. At runtime, oneMKL will
intelligently execute all of the matrix operations to optimize overall performance.
For example, the cblas_dgemm routine computes a matrix-matrix product of two general matrices a and b, returning the result in a matrix c. The cblas_dgemm interface is shown below.
void cblas_dgemm (const CBLAS_LAYOUT layout,
const CBLAS_TRANSPOSE transa, const CBLAS_TRANSPOSE transb,
const MKL_INT m, const MKL_INT n, const MKL_INT k,
const double alpha, const double *a,
const MKL_INT lda, const double *b,
const MKL_INT ldb, const double beta,
double *c, const MKL_INT ldc);
The cblas_dgemm_batch routine is similar to the cblas_dgemm routine, but the cblas_dgemm_batch routine performs matrix-matrix operations with groups of matrices, processing a number of groups at once. A group contains matrices with the same parameters. The cblas_dgemm_batch interface is shown below. Note that the interface resembles the cblas_dgemm interface. However, the matrix arguments are passed as arrays of pointers to matrices, and the parameters are passed as arrays of parameters.
void cblas_dgemm_batch (const CBLAS_LAYOUT layout,
const CBLAS_TRANSPOSE* transa_array, const CBLAS_TRANSPOSE* transb_array,
const MKL_INT* m_array, const MKL_INT* n_array, const MKL_INT* k_array,
const double* alpha_array, const double **a_array,
const MKL_INT* lda_array, const double **b_array,
const MKL_INT* ldb_array, const double* beta_array,
double **c_array, const MKL_INT* ldc_array,
const MKL_INT group_count, const MKL_INT* group_size);
The batch operation is defined as follows:
idx = 0
for i = 0 .. group_count - 1
    alpha and beta in alpha_array[i] and beta_array[i]
    for j = 0 .. group_size[i] - 1
        a, b, and c matrices in a_array[idx], b_array[idx], and c_array[idx], respectively
        c := alpha*op(a)*op(b) + beta*c
        idx = idx + 1
    end for
end for
where:
- op(X) is one of op(X) = X, op(X) = X^T (transpose), or op(X) = X^H (conjugate transpose),
- alpha and beta are scalar elements of alpha_array and beta_array,
- a, b, and c are matrices such that, for the m, n, and k that are elements of m_array, n_array, and k_array:
  - op(a) is an m-by-k matrix,
  - op(b) is a k-by-n matrix,
  - c is an m-by-n matrix.
- a, b, and c represent matrices stored at addresses pointed to by a_array, b_array, and c_array, respectively. The number of entries in a_array, b_array, and c_array is total_batch_count, the sum of all of the group_size entries.
It is possible to batch the multiplications of different shapes and
parameters by packaging them into groups, where each group consists of
multiplications of matrices of the same shapes (same m, n, and k) and
the same parameters.
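As a minimal sketch (the group counts, shapes, and leading dimensions below are illustrative, and the pointer arrays a_array, b_array, and c_array are assumed to be filled in separately), a batch consisting of two groups could be described as follows:
#include "mkl.h"

// Hypothetical batch with 2 groups: 3 small multiplications plus 2 larger ones.
#define GRP_COUNT 2
MKL_INT group_count = GRP_COUNT;
MKL_INT group_sizes[GRP_COUNT] = {3, 2};                 // total_batch_count = 3 + 2 = 5
CBLAS_TRANSPOSE transa_array[GRP_COUNT] = {CblasNoTrans, CblasNoTrans};
CBLAS_TRANSPOSE transb_array[GRP_COUNT] = {CblasNoTrans, CblasNoTrans};
MKL_INT m_array[GRP_COUNT] = {64, 256};                  // same m, n, and k for every
MKL_INT n_array[GRP_COUNT] = {64, 128};                  // multiplication within a group
MKL_INT k_array[GRP_COUNT] = {32, 256};
MKL_INT lda_array[GRP_COUNT] = {32, 256};                // leading dimensions for a
MKL_INT ldb_array[GRP_COUNT] = {64, 128};                // row-major, no-transpose layout
MKL_INT ldc_array[GRP_COUNT] = {64, 128};
double alpha_array[GRP_COUNT] = {1.0, 1.0};
double beta_array[GRP_COUNT] = {0.0, 0.0};
// a_array, b_array, and c_array each hold total_batch_count (= 5) matrix pointers;
// the first 3 entries belong to group 0 and the last 2 entries to group 1.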
The basic assumption of the batch API is that all operations in a batch (whether in the same group or in different groups) are independent of one another. oneMKL therefore does not guarantee any particular ordering between operations in a batch and will, in general, try to execute multiple operations in parallel.
In general, the larger you can make the batch size, the better. This
allows oneMKL to better parallelize the operations and distribute the
work across the GPU.
We illustrate how two calls to cblas_dgemm can be replaced with one call to cblas_dgemm_batch. The following example includes two calls to cblas_dgemm.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"
#define min(x,y) (((x) < (y)) ? (x) : (y))
#define epsilon 0.0000001f
bool compare(double x, double y)
{
if(fabs(x - y) <= epsilon)
return true; // x and y are the same
return false; // x and y are not the same
}
int main()
{
double *A1, *B1, *C1, *C1_fl;
double *A2, *B2, *C2, *C2_fl;
int m, n, k, i, j, q;
double alpha, beta;
double sum;
int fail;
double t_start, t_end;
m = 2000, k = 200, n = 1000;
alpha = 1.0; beta = 0.0;
printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
" performance \n\n");
A1 = (double *)mkl_malloc (m*k*sizeof( double ), 64 );
B1 = (double *)mkl_malloc (k*n*sizeof( double ), 64 );
C1 = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
C1_fl = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
A2 = (double *)mkl_malloc (m*k*sizeof( double ), 64 );
B2 = (double *)mkl_malloc (k*n*sizeof( double ), 64 );
C2 = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
C2_fl = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
if (A1 == NULL || B1 == NULL || C1 == NULL || C1_fl == NULL ||
A2 == NULL || B2 == NULL || C2 == NULL || C2_fl == NULL) {
printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
return 1;
}
printf (" Intializing matrix data \n\n");
for (i = 0; i < (m*k); i++) {
A1[i] = A2[i] = (double)(i+1);
}
for (i = 0; i < (k*n); i++) {
B1[i] = B2[i] = (double)(-i-1);
}
for (i = 0; i < (m*n); i++) {
C1[i] = C2[i] = 0.0;
C1_fl[i] = C2_fl[i] = 0.0;
}
printf (" \nComputing matrix product using Intel MKL cblas_dgemm function \n");
t_start = omp_get_wtime();
#pragma omp target data \
map(to: A1[0:m*k], B1[0:k*n], A2[0:m*k], B2[0:k*n]) \
map(tofrom: C1[0:m*n], C2[0:m*n])
{
#pragma omp target variant dispatch use_device_ptr(A1, B1, C1)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, k, alpha, A1, k, B1, n, beta, C1, n);
#pragma omp target variant dispatch use_device_ptr(A2, B2, C2)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, k, alpha, A2, k, B2, n, beta, C2, n);
}
t_end = omp_get_wtime();
printf ("\n Top left corner of matrix C1: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C1[j+i*n]);
}
printf ("\n");
}
printf ("\n Top left corner of matrix C2: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C2[j+i*n]);
}
printf ("\n");
}
printf (" \nComputing matrix product using for-loops \n");
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
sum = 0.0;
for (q = 0; q < k; q++)
sum += A1[k*i+q] * B1[n*q+j];
C1_fl[n*i+j] = sum;
}
}
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
sum = 0.0;
for (q = 0; q < k; q++)
sum += A2[k*i+q] * B2[n*q+j];
C2_fl[n*i+j] = sum;
}
}
printf ("\n Top left corner of matrix C1: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C1_fl[j+i*n]);
}
printf ("\n");
}
printf ("\n Top left corner of matrix C2: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C2_fl[j+i*n]);
}
printf ("\n");
}
printf ("\n Computations completed. Verifying... \n\n");
fail = 0;
for (i = 0; i < (m*n); i++) {
if (! compare(C1[i], C1_fl[i]) || ! compare(C2[i], C2_fl[i])) {
fail = 1;
break;
}
}
if (fail) {
printf (" **** FAIL **** \n");
}
else {
printf(" time = %lf seconds\n", t_end - t_start);
printf (" **** PASS **** \n");
}
mkl_free(A1);
mkl_free(B1);
mkl_free(C1);
mkl_free(C1_fl);
mkl_free(A2);
mkl_free(B2);
mkl_free(C2);
mkl_free(C2_fl);
return 0;
}
The two calls to cblas_dgemm in the above example can be batched together, resulting in one call to cblas_dgemm_batch, as shown in the following example. Note that the batch is composed of one group of size 2, since we have two matrix multiplications with the same set of parameters (layout, transa, transb, m, n, k, alpha, lda, ldb, beta, and ldc). The total batch size in this case is 2.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"
#define min(x,y) (((x) < (y)) ? (x) : (y))
#define epsilon 0.0000001f
bool compare(double x, double y)
{
if(fabs(x - y) <= epsilon)
return true; // x and y are the same
return false; // x and y are not the same
}
int main()
{
double *A1, *B1, *C1, *C1_fl;
double *A2, *B2, *C2, *C2_fl;
int m, n, k, i, j, q;
double alpha, beta;
double sum;
int fail;
double t_start, t_end;
m = 2000, k = 200, n = 1000;
alpha = 1.0; beta = 0.0;
printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
" performance \n\n");
A1 = (double *)mkl_malloc (m*k*sizeof( double ), 64 );
B1 = (double *)mkl_malloc (k*n*sizeof( double ), 64 );
C1 = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
C1_fl = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
A2 = (double *)mkl_malloc (m*k*sizeof( double ), 64 );
B2 = (double *)mkl_malloc (k*n*sizeof( double ), 64 );
C2 = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
C2_fl = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
if (A1 == NULL || B1 == NULL || C1 == NULL || C1_fl == NULL ||
A2 == NULL || B2 == NULL || C2 == NULL || C2_fl == NULL) {
printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
return 1;
}
printf (" Intializing matrix data \n\n");
for (i = 0; i < (m*k); i++) {
A1[i] = A2[i] = (double)(i+1);
}
for (i = 0; i < (k*n); i++) {
B1[i] = B2[i] = (double)(-i-1);
}
for (i = 0; i < (m*n); i++) {
C1[i] = C2[i] = 0.0;
C1_fl[i] = C2_fl[i] = 0.0;
}
printf (" \nComputing matrix product using Intel MKL cblas_dgemm_batch function \n");
#define GRP_COUNT 1 // 1 group
MKL_INT group_count = GRP_COUNT;
MKL_INT group_sizes[GRP_COUNT] = {2}; // 2 matrix multiplications
CBLAS_TRANSPOSE transa_array[GRP_COUNT] = {CblasNoTrans};
CBLAS_TRANSPOSE transb_array[GRP_COUNT] = {CblasNoTrans};
MKL_INT m_array[GRP_COUNT] = {m};
MKL_INT n_array[GRP_COUNT] = {n};
MKL_INT k_array[GRP_COUNT] = {k};
MKL_INT lda_array[GRP_COUNT] = {k};
MKL_INT ldb_array[GRP_COUNT] = {n};
MKL_INT ldc_array[GRP_COUNT] = {n};
double alpha_array[GRP_COUNT] = {alpha};
double beta_array[GRP_COUNT] = {beta};
// Number of matrix multiplications = 2
double **a_array, **b_array, **c_array;
a_array = (double **)mkl_calloc(2, sizeof( double* ), 64);
b_array = (double **)mkl_calloc(2, sizeof( double* ), 64);
c_array = (double **)mkl_calloc(2, sizeof( double* ), 64);
t_start = omp_get_wtime();
// Call cblas_dgemm_batch
#pragma omp target enter data \
map(to: A1[0:m*k], B1[0:k*n], C1[0:m*n]) \
map(to: A2[0:m*k], B2[0:k*n], C2[0:m*n])
#pragma omp target data use_device_ptr(A1, B1, C1, A2, B2, C2)
{
a_array[0] = A1, a_array[1] = A2;
b_array[0] = B1, b_array[1] = B2;
c_array[0] = C1, c_array[1] = C2;
}
#pragma omp target data \
map(to:a_array[0:2], b_array[0:2], c_array[0:2])
{
#pragma omp target variant dispatch \
use_device_ptr(a_array, b_array, c_array)
{
cblas_dgemm_batch (
CblasRowMajor,
transa_array,
transb_array,
m_array,
n_array,
k_array,
alpha_array,
(const double **)a_array,
lda_array,
(const double **)b_array,
ldb_array,
beta_array,
c_array,
ldc_array,
group_count,
group_sizes);
}
} // end target data map
#pragma omp target exit data \
map(from: C1[0:m*n], C2[0:m*n])
t_end = omp_get_wtime();
printf ("\n Top left corner of matrix C1: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C1[j+i*n]);
}
printf ("\n");
}
printf ("\n Top left corner of matrix C2: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C2[j+i*n]);
}
printf ("\n");
}
printf (" \nComputing matrix product using for-loops \n");
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
sum = 0.0;
for (q = 0; q < k; q++)
sum += A1[k*i+q] * B1[n*q+j];
C1_fl[n*i+j] = sum;
}
}
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
sum = 0.0;
for (q = 0; q < k; q++)
sum += A2[k*i+q] * B2[n*q+j];
C2_fl[n*i+j] = sum;
}
}
printf ("\n Top left corner of matrix C1: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C1_fl[j+i*n]);
}
printf ("\n");
}
printf ("\n Top left corner of matrix C2: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C2_fl[j+i*n]);
}
printf ("\n");
}
printf ("\n Computations completed. Verifying... \n\n");
fail = 0;
for (i = 0; i < (m*n); i++) {
if (! compare(C1[i], C1_fl[i]) || ! compare(C2[i], C2_fl[i])) {
fail = 1;
break;
}
}
if (fail) {
printf (" **** FAIL **** \n");
}
else {
printf(" time = %lf seconds\n", t_end - t_start);
printf (" **** PASS **** \n");
}
mkl_free(A1);
mkl_free(B1);
mkl_free(C1);
mkl_free(C1_fl);
mkl_free(A2);
mkl_free(B2);
mkl_free(C2);
mkl_free(C2_fl);
return 0;
}
The performance of the above two examples, when run on a particular ATS GPU (1-tile only), was as follows:
dgemm_example_01.cpp (two calls to cblas_dgemm): 2.976183 seconds
dgemm_batch_example_01.cpp (one call to cblas_dgemm_batch): 1.881641 seconds
A more complex example of batching is shown below. In this example, we have a batch composed of 3 groups (GROUP_COUNT=3). The size of each group is a randomly chosen number between 1 and 10. Several parameters (layout, transA, transB, m, n, and k) are chosen randomly, but in each group the parameters are the same for all the multiplications. The total_batch_size is equal to the sum of all the group sizes.
/*******************************************************************************
* Copyright 2019-2021 Intel Corporation.
*
* This software and the related documents are Intel copyrighted materials, and
* your use of them is governed by the express license under which they were
* provided to you (License). Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute, disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents are provided as is, with no express
* or implied warranties, other than those that are expressly stated in the
* License.
*******************************************************************************/
#include <stdio.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"
#include "common.h"
#define GROUP_COUNT 3
int dnum = 0;
int main() {
CBLAS_LAYOUT layout = (rand_int(0,1) == 0) ? CblasColMajor : CblasRowMajor;
CBLAS_TRANSPOSE *transA, *transB;
MKL_INT *m, *n, *k, *lda, *ldb, *ldc;
double *alpha, *beta;
MKL_INT *group_size, *sizea_array, *sizeb_array, *sizec_array, total_batch_size = 0, sizea, sizeb, sizec;
double **a_array, **b_array, **c_array, **c_ref_array;
double **a_array_dev, **b_array_dev, **c_array_dev;
transA = (CBLAS_TRANSPOSE *)mkl_malloc(GROUP_COUNT * sizeof(CBLAS_TRANSPOSE), 64);
transB = (CBLAS_TRANSPOSE *)mkl_malloc(GROUP_COUNT * sizeof(CBLAS_TRANSPOSE), 64);
m = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
n = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
k = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
lda = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
ldb = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
ldc = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
group_size = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
alpha = (double *)mkl_malloc(GROUP_COUNT * sizeof(double), 64);
beta = (double *)mkl_malloc(GROUP_COUNT * sizeof(double), 64);
if ((m == NULL) || (n == NULL) || (k == NULL) || (lda == NULL) || (ldb == NULL) || (ldc == NULL) ||
(group_size == NULL) || (alpha == NULL) || (beta == NULL)) {
printf("Cannot allocate input arrays\n");
return 1;
}
MKL_INT i, j, p, idx;
for (i = 0; i < GROUP_COUNT; i++) {
transA[i] = (rand_int(0,1) == 0) ? CblasNoTrans : CblasTrans;
transB[i] = (rand_int(0,1) == 0) ? CblasNoTrans : CblasTrans;
alpha[i] = rand_double_scalar();
beta[i] = rand_double_scalar();
m[i] = rand_int(1, 20);
n[i] = rand_int(1, 20);
k[i] = rand_int(1, 20);
lda[i] = MAX(m[i], k[i]);
ldb[i] = MAX(k[i], n[i]);
ldc[i] = MAX(m[i], n[i]);
group_size[i] = rand_int(1, 10);
total_batch_size += group_size[i];
#ifdef MKL_ILP64
printf("Group %lld: layout = %s, transA = %s, transB = %s, m = %lld, n = %lld, k = %lld, lda = %lld, ldb = %lld, ldc = %lld, alpha = %lf, beta = %lf, group_size = %lld\n",
i, (layout == CblasColMajor) ? "Column Major" : "Row Major",
(transA[i] == CblasNoTrans) ? "Non Transpose" : "Transpose",
(transB[i] == CblasNoTrans) ? "Non Transpose" : "Transpose",
m[i], n[i], k[i], lda[i], ldb[i], ldc[i], alpha[i], beta[i], group_size[i]);
#else
printf("Group %d: layout = %s, transA = %s, transB = %s, m = %d, n = %d, k = %d, lda = %d, ldb = %d, ldc = %d, alpha = %lf, beta = %lf, group_size = %d\n",
i, (layout == CblasColMajor) ? "Column Major" : "Row Major",
(transA[i] == CblasNoTrans) ? "Non Transpose" : "Transpose",
(transB[i] == CblasNoTrans) ? "Non Transpose" : "Transpose",
m[i], n[i], k[i], lda[i], ldb[i], ldc[i], alpha[i], beta[i], group_size[i]);
#endif
}
sizea_array = (MKL_INT *)mkl_malloc(sizeof(MKL_INT) * total_batch_size, 64);
sizeb_array = (MKL_INT *)mkl_malloc(sizeof(MKL_INT) * total_batch_size, 64);
sizec_array = (MKL_INT *)mkl_malloc(sizeof(MKL_INT) * total_batch_size, 64);
a_array = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
b_array = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
c_array = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
a_array_dev = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
b_array_dev = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
c_array_dev = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
c_ref_array = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
if ((sizea_array == NULL) || (sizeb_array == NULL) || (sizec_array == NULL) || (a_array == NULL) ||
(b_array == NULL) || (c_array == NULL) || (a_array_dev == NULL) || (b_array_dev == NULL) ||
(c_array_dev == NULL) || (c_ref_array == NULL)) {
printf("Cannot allocate matrices and size arrays\n");
return 1;
}
idx = 0;
for (i = 0; i < GROUP_COUNT; i++) {
sizea = (((layout == CblasRowMajor) && (transA[i] == CblasTrans)) ||
((layout == CblasColMajor) && (transA[i] == CblasNoTrans))) ? lda[i] * k[i] : m[i] * lda[i];
sizeb = (((layout == CblasRowMajor) && (transB[i] == CblasTrans)) ||
((layout == CblasColMajor) && (transB[i] == CblasNoTrans))) ? ldb[i] * n[i] : k[i] * ldb[i];
sizec = (layout == CblasColMajor) ? ldc[i] * n[i] : ldc[i] * m[i];
for (j = 0; j < group_size[i]; j++) {
a_array[idx] = (double *)mkl_malloc(sizeof(double) * sizea, 64);
a_array_dev[idx] = a_array[idx];
sizea_array[idx] = sizea;
if (a_array[idx] == NULL) {
printf("cannot allocate a matrices\n");
return 1;
}
b_array[idx] = (double *)mkl_malloc(sizeof(double) * sizeb, 64);
b_array_dev[idx] = b_array[idx];
sizeb_array[idx] = sizeb;
if (b_array[idx] == NULL) {
printf("cannot allocate b matrices\n");
return 1;
}
c_array[idx] = (double *)mkl_malloc(sizeof(double) * sizec, 64);
c_array_dev[idx] = c_array[idx];
sizec_array[idx] = sizec;
if (c_array[idx] == NULL) {
printf("cannot allocate c matrices\n");
return 1;
}
c_ref_array[idx] = (double *)mkl_malloc(sizeof(double) * sizec, 64);
if (c_ref_array[idx] == NULL) {
printf("cannot allocate c_ref matrices\n");
return 1;
}
init_double_array(sizea, a_array[idx], 1);
init_double_array(sizeb, b_array[idx], 1);
init_double_array(sizec, c_array[idx], 1);
for (p = 0; p < sizec_array[idx]; p++) c_ref_array[idx][p] = c_array[idx][p];
idx++;
}
}
// run gemm_batch on host, use standard oneMKL interface
cblas_dgemm_batch(layout, transA, transB, m, n, k, alpha, (const double **) a_array, lda,
(const double **) b_array, ldb, beta, c_ref_array, ldc, GROUP_COUNT, group_size);
double *a, *b, *c;
for (i = 0; i < total_batch_size; i++) {
a = a_array[i];
b = b_array[i];
c = c_array[i];
#pragma omp target enter data map(to:a[0:sizea_array[i]],b[0:sizeb_array[i]],c[0:sizec_array[i]])
#pragma omp target data use_device_ptr(a,b,c)
{
a_array_dev[i] = a;
b_array_dev[i] = b;
c_array_dev[i] = c;
}
}
#pragma omp target data map(to:a_array_dev[0:total_batch_size], \
b_array_dev[0:total_batch_size], \
c_array_dev[0:total_batch_size]) device(dnum)
{
#pragma omp target variant dispatch device(dnum) use_device_ptr(a_array_dev, b_array_dev, \
c_array_dev)
{
cblas_dgemm_batch(layout, transA, transB, m, n, k, alpha, (const double **) a_array_dev, lda, (const double **) b_array_dev, ldb, beta, c_array_dev, ldc, GROUP_COUNT, group_size);
}
}
for (i = 0; i < total_batch_size; i++) {
a = a_array[i];
b = b_array[i];
c = c_array[i];
#pragma omp target exit data map(from:a[0:sizea_array[i]],b[0:sizeb_array[i]],c[0:sizec_array[i]])
}
double computed, reference, diff;
MKL_INT l;
idx = 0;
for (p = 0; p < GROUP_COUNT; p++) {
for (l = 0; l < group_size[p]; l++) {
for (i = 0; i < m[p]; i++) {
for (j = 0; j < n[p]; j++) {
if (layout == CblasColMajor) {
computed = c_array[idx][i + j * ldc[p]];
reference = c_ref_array[idx][i + j * ldc[p]];
}
else {
computed = c_array[idx][j + i * ldc[p]];
reference = c_ref_array[idx][j + i * ldc[p]];
}
diff = computed - reference;
diff = (diff > 0) ? diff : -diff;
if (diff > 0.0001) {
#ifdef MKL_ILP64
printf("Error in matrix %lld (group = %lld, matrix index in group = %lld) at index [%lld][%lld], computed = %lf, reference = %lf, difference = %lf\n", idx, p, l, i, j, computed, reference, diff);
#else
printf("Error in matrix %d at index [%d][%d], computed = %lf, reference = %lf, difference = %lf\n", idx, i, j, computed, reference, diff);
#endif
free_double_matrices(a_array, total_batch_size);
free_double_matrices(b_array, total_batch_size);
free_double_matrices(c_array, total_batch_size);
free_double_matrices(c_ref_array, total_batch_size);
mkl_free(a_array);
mkl_free(b_array);
mkl_free(c_array);
mkl_free(c_ref_array);
mkl_free(a_array_dev);
mkl_free(b_array_dev);
mkl_free(c_array_dev);
mkl_free(sizea_array);
mkl_free(sizeb_array);
mkl_free(sizec_array);
mkl_free(transA); mkl_free(transB);
mkl_free(m); mkl_free(n); mkl_free(k);
mkl_free(lda); mkl_free(ldb); mkl_free(ldc); mkl_free(group_size);
mkl_free(alpha); mkl_free(beta);
return 1;
}
}
}
idx++;
}
}
printf("Validation PASSED\n");
free_double_matrices(a_array, total_batch_size);
free_double_matrices(b_array, total_batch_size);
free_double_matrices(c_array, total_batch_size);
free_double_matrices(c_ref_array, total_batch_size);
mkl_free(a_array);
mkl_free(b_array);
mkl_free(c_array);
mkl_free(c_ref_array);
mkl_free(a_array_dev);
mkl_free(b_array_dev);
mkl_free(c_array_dev);
mkl_free(sizea_array);
mkl_free(sizeb_array);
mkl_free(sizec_array);
mkl_free(transA); mkl_free(transB);
mkl_free(m); mkl_free(n); mkl_free(k);
mkl_free(lda); mkl_free(ldb); mkl_free(ldc); mkl_free(group_size);
mkl_free(alpha); mkl_free(beta);
return 0;
}
For more information about the Intel oneAPI Math Kernel Library, see the oneMKL documentation.