Developer Guide

Offloading oneMKL Computations onto the GPU

The Intel® oneAPI Math Kernel Library (oneMKL) improves performance with math routines for software applications that solve large computational problems. oneMKL provides BLAS and LAPACK linear algebra routines, fast Fourier transforms, vectorized math functions, random number generation functions, and other functionality.
You can use OpenMP directives to offload oneMKL computations onto the GPU. There are two ways to do this.
One way involves using the Intel-specific OpenMP extension target variant dispatch. You would place the call to the oneMKL routine inside a target variant dispatch construct, as shown in the example below. In this example, arrays A, B, and C used in the multiplication are mapped to the device before the call to the oneMKL routine cblas_dgemm. The use_device_ptr(A,B,C) clause is used on the target variant dispatch directive to indicate that A, B, and C point to objects that have corresponding storage on the device. When cblas_dgemm is called, the corresponding device pointers for A, B, and C will be passed as arguments, and the device copies of A, B, and C will be used in the computation.
// clang-format off
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"

#define min(x,y) (((x) < (y)) ? (x) : (y))

int main()
{
    double *A, *B, *C, *C_fl;
    int m, n, k, i, j, q;
    double alpha, beta;
    double sum;
    int pass;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel MKL function dgemm, where A, B, and C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    m = 2000, k = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A    = (double *)mkl_malloc( m * k * sizeof( double ), 64 );
    B    = (double *)mkl_malloc( k * n * sizeof( double ), 64 );
    C    = (double *)mkl_malloc( m * n * sizeof( double ), 64 );
    C_fl = (double *)mkl_malloc( m * n * sizeof( double ), 64 );

    if (A == NULL || B == NULL || C == NULL || C_fl == NULL) {
        printf( "\n ERROR: Cannot allocate memory for matrices. Exiting... \n\n");
        return 1;
    }

    printf (" Initializing matrices \n\n");
    for (i = 0; i < (m*k); i++) {
        A[i] = (double)(i+1);
    }
    for (i = 0; i < (k*n); i++) {
        B[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
        C_fl[i] = 0.0;
    }

    printf (" Computing matrix product using Intel MKL dgemm function via CBLAS interface \n\n");

    #pragma omp target data map(to: A[0:m*k], B[0:k*n]) map(tofrom: C[0:m*n])
    {
        #pragma omp target variant dispatch use_device_ptr(A, B, C)
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, k, alpha, A, k, B, n, beta, C, n);
    }

    printf ("\n Top left corner of matrix C: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C[j+i*n]);
        }
        printf ("\n");
    }

    printf (" Computing matrix product using for-loops \n");
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (q = 0; q < k; q++)
                sum += A[k*i+q] * B[n*q+j];
            C_fl[n*i+j] = sum;
        }
    }

    printf ("\n Top left corner of matrix C_fl: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C_fl[j+i*n]);
        }
        printf ("\n");
    }

    printf (" Computations completed. Verifying... \n\n");
    pass = 1;
    for (i = 0; i < (m*n); i++) {
        if (C[i] != C_fl[i]) {
            pass = 0;
            break;
        }
    }
    if (pass)
        printf ("\n **** PASS **** \n");
    else
        printf (" **** FAIL **** \n");

    printf ("\n Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);
    mkl_free(C_fl);

    printf (" Example completed. \n\n");
    return 0;
}
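The essential offload pattern in the listing above can be reduced to a few lines. The following condensed sketch uses hypothetical 4x4 matrices and plain malloc (rather than the example's large mkl_malloc buffers) and shows only the mapping and dispatch steps:

// Condensed sketch of the target variant dispatch pattern (hypothetical 4x4 data).
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"

int main() {
    const int n = 4;
    double *A = (double *)malloc(n * n * sizeof(double));
    double *B = (double *)malloc(n * n * sizeof(double));
    double *C = (double *)malloc(n * n * sizeof(double));
    for (int i = 0; i < n * n; i++) { A[i] = i + 1.0; B[i] = 1.0; C[i] = 0.0; }

    // Map A and B to the device, and map C both to and from the device.
    #pragma omp target data map(to: A[0:n*n], B[0:n*n]) map(tofrom: C[0:n*n])
    {
        // use_device_ptr makes the device copies of A, B, and C visible to the call.
        #pragma omp target variant dispatch use_device_ptr(A, B, C)
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    n, n, n, 1.0, A, n, B, n, 0.0, C, n);
    }

    printf("C[0] = %g\n", C[0]);   // row 0 of A is (1,2,3,4), B is all ones, so C[0] = 10
    free(A); free(B); free(C);
    return 0;
}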
Another way to inform the compiler that oneMKL computations should be offloaded onto the GPU is by using the OpenMP 5.1 dispatch directive, as shown in the example below. In this example too, arrays A, B, and C are mapped to the device before the call to the oneMKL routine cblas_dgemm. No clauses are needed on the dispatch directive. When cblas_dgemm is called, the corresponding device pointers for A, B, and C will be passed as arguments, so the device copies of A, B, and C will be used in the computation. In order for this example to compile, you need to add the compiler option -fopenmp-version=51.
// clang-format off
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"

#define min(x,y) (((x) < (y)) ? (x) : (y))

int main()
{
    double *A, *B, *C, *C_fl;
    int m, n, k, i, j, q;
    double alpha, beta;
    double sum;
    int pass;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel MKL function dgemm, where A, B, and C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    m = 2000, k = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A    = (double *)mkl_malloc( m * k * sizeof( double ), 64 );
    B    = (double *)mkl_malloc( k * n * sizeof( double ), 64 );
    C    = (double *)mkl_malloc( m * n * sizeof( double ), 64 );
    C_fl = (double *)mkl_malloc( m * n * sizeof( double ), 64 );

    if (A == NULL || B == NULL || C == NULL || C_fl == NULL) {
        printf( "\n ERROR: Cannot allocate memory for matrices. Exiting... \n\n");
        return 1;
    }

    printf (" Initializing matrices \n\n");
    for (i = 0; i < (m*k); i++) {
        A[i] = (double)(i+1);
    }
    for (i = 0; i < (k*n); i++) {
        B[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
        C_fl[i] = 0.0;
    }

    printf (" Computing matrix product using Intel MKL dgemm function via CBLAS interface \n\n");

    #pragma omp target data map(to: A[0:m*k], B[0:k*n]) map(tofrom: C[0:m*n])
    {
        #pragma omp dispatch
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, k, alpha, A, k, B, n, beta, C, n);
    }

    printf ("\n Top left corner of matrix C: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C[j+i*n]);
        }
        printf ("\n");
    }

    printf (" Computing matrix product using for-loops \n");
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (q = 0; q < k; q++)
                sum += A[k*i+q] * B[n*q+j];
            C_fl[n*i+j] = sum;
        }
    }

    printf ("\n Top left corner of matrix C_fl: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C_fl[j+i*n]);
        }
        printf ("\n");
    }

    printf (" Computations completed. Verifying... \n\n");
    pass = 1;
    for (i = 0; i < (m*n); i++) {
        if (C[i] != C_fl[i]) {
            pass = 0;
            break;
        }
    }
    if (pass)
        printf ("\n **** PASS **** \n");
    else
        printf (" **** FAIL **** \n");

    printf ("\n Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);
    mkl_free(C_fl);

    printf (" Example completed. \n\n");
    return 0;
}
Notes
  • If a oneMKL routine is not called from a target variant dispatch (or dispatch) region, or if offload is disabled, then the oneMKL computations will be executed on the CPU.
  • Only one call to a oneMKL routine can be issued from an OpenMP target variant dispatch (or dispatch) region. If there are two consecutive calls to oneMKL routines, then the calls should be placed in separate target variant dispatch (or dispatch) constructs, as illustrated in the sketch after these notes.
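The following minimal sketch illustrates the second note: two back-to-back cblas_dgemm calls, each placed in its own dispatch construct inside a single target data region. The 4x4 matrices, values, and plain malloc allocation are placeholders for illustration, not part of the numbered examples in this guide; compiling it requires the same -fopenmp-version=51 option mentioned below.

// Sketch: two consecutive oneMKL calls, each in a separate dispatch construct.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"

int main() {
    const int n = 4;                                   // small 4x4 matrices for illustration
    double *A1 = (double *)malloc(n * n * sizeof(double));
    double *B1 = (double *)malloc(n * n * sizeof(double));
    double *C1 = (double *)malloc(n * n * sizeof(double));
    double *A2 = (double *)malloc(n * n * sizeof(double));
    double *B2 = (double *)malloc(n * n * sizeof(double));
    double *C2 = (double *)malloc(n * n * sizeof(double));
    for (int i = 0; i < n * n; i++) {
        A1[i] = A2[i] = i + 1.0;
        B1[i] = B2[i] = 1.0;
        C1[i] = C2[i] = 0.0;
    }

    #pragma omp target data map(to: A1[0:n*n], B1[0:n*n], A2[0:n*n], B2[0:n*n]) \
                            map(tofrom: C1[0:n*n], C2[0:n*n])
    {
        // First oneMKL call in its own dispatch construct.
        #pragma omp dispatch
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    n, n, n, 1.0, A1, n, B1, n, 0.0, C1, n);

        // Second oneMKL call goes in a separate dispatch construct.
        #pragma omp dispatch
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    n, n, n, 1.0, A2, n, B2, n, 0.0, C2, n);
    }

    printf("C1[0] = %g, C2[0] = %g\n", C1[0], C2[0]);
    free(A1); free(B1); free(C1);
    free(A2); free(B2); free(C2);
    return 0;
}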

Batching of oneMKL GEMM Calls

The oneMKL library includes “batch” routines that allow the user to combine several oneMKL calls into a single batched call. At runtime, oneMKL will schedule and execute all of the matrix operations in the batch so as to optimize overall performance.
For example, the cblas_dgemm routine computes a matrix-matrix product of two general matrices a and b, returning the result in a matrix c. The cblas_dgemm interface is shown below.
void cblas_dgemm (const CBLAS_LAYOUT layout,
                  const CBLAS_TRANSPOSE transa, const CBLAS_TRANSPOSE transb,
                  const MKL_INT m, const MKL_INT n, const MKL_INT k,
                  const double alpha, const double *a, const MKL_INT lda,
                  const double *b, const MKL_INT ldb,
                  const double beta, double *c, const MKL_INT ldc);
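As a concrete illustration of the arguments, the following small host-side sketch multiplies a 2x3 matrix by a 3x2 matrix in row-major storage; the matrices and values are made up for illustration. With CblasRowMajor, the leading dimensions lda, ldb, and ldc are the row widths of a, b, and c.

// Host-side sketch: C (2x2) = 1.0 * A (2x3) * B (3x2) + 0.0 * C, row-major.
#include <stdio.h>
#include "mkl.h"

int main() {
    double A[2*3] = {1, 2, 3,
                     4, 5, 6};
    double B[3*2] = { 7,  8,
                      9, 10,
                     11, 12};
    double C[2*2] = {0, 0, 0, 0};

    // m = 2, n = 2, k = 3; row-major leading dimensions: lda = k, ldb = n, ldc = n.
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 3, 1.0, A, 3, B, 2, 0.0, C, 2);

    printf("C = [%g %g; %g %g]\n", C[0], C[1], C[2], C[3]);   // 58 64; 139 154
    return 0;
}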
The cblas_dgemm_batch routine is similar to the cblas_dgemm routine, but the cblas_dgemm_batch routine performs matrix-matrix operations with groups of matrices, processing a number of groups at once. A group contains matrices with the same parameters.
The cblas_dgemm_batch interface is shown below. Note that the interface resembles the cblas_dgemm interface. However, it involves passing matrix arguments as arrays of pointers to matrices, and passing parameters as arrays of parameters.
void cblas_dgemm_batch (const CBLAS_LAYOUT layout,
                        const CBLAS_TRANSPOSE* transa_array,
                        const CBLAS_TRANSPOSE* transb_array,
                        const MKL_INT* m_array, const MKL_INT* n_array,
                        const MKL_INT* k_array,
                        const double* alpha_array,
                        const double **a_array, const MKL_INT* lda_array,
                        const double **b_array, const MKL_INT* ldb_array,
                        const double* beta_array,
                        double **c_array, const MKL_INT* ldc_array,
                        const MKL_INT group_count, const MKL_INT* group_size);
The batch operation is defined as follows:
idx = 0
for i = 0 .. group_count - 1
    alpha and beta in alpha_array[i] and beta_array[i]
    for j = 0 .. group_size[i] - 1
        a, b, and c matrices in a_array[idx], b_array[idx], and c_array[idx], respectively
        c := alpha*op(a)*op(b) + beta*c
        idx = idx + 1
    end for
end for
where:
  • op(X) is one of op(X) = X, op(X) = X^T (transpose), or op(X) = X^H (conjugate transpose),
  • alpha and beta are scalar elements of alpha_array and beta_array,
  • a, b, and c are matrices such that, for the m, n, and k that are elements of m_array, n_array, and k_array:
  • op(a) is an m-by-k matrix,
  • op(b) is a k-by-n matrix,
  • c is an m-by-n matrix.
  • a, b, and c represent matrices stored at addresses pointed to by a_array, b_array, and c_array, respectively. The number of entries in a_array, b_array, and c_array is total_batch_count, the sum of all of the group_size entries.
It is possible to batch the multiplications of different shapes and parameters by packaging them into groups, where each group consists of multiplications of matrices of the same shapes (same m, n, and k) and the same parameters.
The basic assumption for the batch API is that all operations in a batch (whether in the same group or different groups) are independent of one another. So oneMKL does not guarantee any particular ordering between operations in a batch, and in general will try to execute multiple operations in parallel.
In general, the larger you can make the batch size, the better. This allows oneMKL to better parallelize the operations and distribute the work across the GPU.
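To make the grouping concrete, the following host-side sketch builds a batch with two groups of different shapes and issues it with a single cblas_dgemm_batch call. The group sizes, matrix dimensions, and values are made up for illustration; offloading such a call to the GPU would follow the same target data / target variant dispatch pattern shown in the examples below.

// Host-side sketch: one cblas_dgemm_batch call covering two groups of different shapes.
#include <stdio.h>
#include "mkl.h"

#define GRP_COUNT 2

int main() {
    // Group 0: two 3x3 * 3x3 products; group 1: one 2x4 * 4x2 product.
    MKL_INT group_count = GRP_COUNT;
    MKL_INT group_size[GRP_COUNT] = {2, 1};          // total_batch_count = 3

    CBLAS_TRANSPOSE transa[GRP_COUNT] = {CblasNoTrans, CblasNoTrans};
    CBLAS_TRANSPOSE transb[GRP_COUNT] = {CblasNoTrans, CblasNoTrans};
    MKL_INT m[GRP_COUNT]   = {3, 2};
    MKL_INT n[GRP_COUNT]   = {3, 2};
    MKL_INT k[GRP_COUNT]   = {3, 4};
    MKL_INT lda[GRP_COUNT] = {3, 4};                 // row-major: lda = k
    MKL_INT ldb[GRP_COUNT] = {3, 2};                 // row-major: ldb = n
    MKL_INT ldc[GRP_COUNT] = {3, 2};                 // row-major: ldc = n
    double alpha[GRP_COUNT] = {1.0, 1.0};
    double beta[GRP_COUNT]  = {0.0, 0.0};

    // Matrices for group 0 (3x3) and group 1 (2x4, 4x2, 2x2), stored row-major.
    double A0a[9], A0b[9], B0a[9], B0b[9], C0a[9], C0b[9];
    double A1[8], B1[8], C1[4];
    for (int i = 0; i < 9; i++) { A0a[i] = A0b[i] = i + 1.0; B0a[i] = B0b[i] = 1.0; C0a[i] = C0b[i] = 0.0; }
    for (int i = 0; i < 8; i++) { A1[i] = i + 1.0; B1[i] = 2.0; }
    for (int i = 0; i < 4; i++) { C1[i] = 0.0; }

    // One pointer entry per matrix, ordered group by group: group 0 first, then group 1.
    const double *a_array[3] = {A0a, A0b, A1};
    const double *b_array[3] = {B0a, B0b, B1};
    double       *c_array[3] = {C0a, C0b, C1};

    cblas_dgemm_batch(CblasRowMajor, transa, transb, m, n, k, alpha,
                      a_array, lda, b_array, ldb, beta,
                      c_array, ldc, group_count, group_size);

    printf("group 0, matrix 0, C[0][0] = %g\n", C0a[0]);   // (1+2+3)*1 = 6
    printf("group 1, matrix 0, C[0][0] = %g\n", C1[0]);    // (1+2+3+4)*2 = 20
    return 0;
}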
We illustrate how two calls to cblas_dgemm can be replaced with one call to cblas_dgemm_batch. The following example includes two calls to cblas_dgemm.
// clang-format off
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"

#define min(x,y) (((x) < (y)) ? (x) : (y))
#define epsilon 0.0000001f

bool compare(double x, double y)
{
    if (fabs(x - y) <= epsilon)
        return true;    // x and y are the same
    return false;       // x and y are not the same
}

int main()
{
    double *A1, *B1, *C1, *C1_fl;
    double *A2, *B2, *C2, *C2_fl;
    int m, n, k, i, j, q;
    double alpha, beta;
    double sum;
    int fail;
    double t_start, t_end;

    m = 2000, k = 200, n = 1000;
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A1    = (double *)mkl_malloc (m*k*sizeof( double ), 64 );
    B1    = (double *)mkl_malloc (k*n*sizeof( double ), 64 );
    C1    = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
    C1_fl = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
    A2    = (double *)mkl_malloc (m*k*sizeof( double ), 64 );
    B2    = (double *)mkl_malloc (k*n*sizeof( double ), 64 );
    C2    = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
    C2_fl = (double *)mkl_malloc (m*n*sizeof( double ), 64 );

    if (A1 == NULL || B1 == NULL || C1 == NULL || C1_fl == NULL ||
        A2 == NULL || B2 == NULL || C2 == NULL || C2_fl == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        return 1;
    }

    printf (" Initializing matrix data \n\n");
    for (i = 0; i < (m*k); i++) {
        A1[i] = A2[i] = (double)(i+1);
    }
    for (i = 0; i < (k*n); i++) {
        B1[i] = B2[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C1[i] = C2[i] = 0.0;
        C1_fl[i] = C2_fl[i] = 0.0;
    }

    printf (" \nComputing matrix product using Intel MKL cblas_dgemm function \n");

    t_start = omp_get_wtime();

    #pragma omp target data \
        map(to: A1[0:m*k], B1[0:k*n], A2[0:m*k], B2[0:k*n]) \
        map(tofrom: C1[0:m*n], C2[0:m*n])
    {
        #pragma omp target variant dispatch use_device_ptr(A1, B1, C1)
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, k, alpha, A1, k, B1, n, beta, C1, n);

        #pragma omp target variant dispatch use_device_ptr(A2, B2, C2)
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, k, alpha, A2, k, B2, n, beta, C2, n);
    }

    t_end = omp_get_wtime();

    printf ("\n Top left corner of matrix C1: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C1[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C2: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C2[j+i*n]);
        }
        printf ("\n");
    }

    printf (" \nComputing matrix product using for-loops \n");
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (q = 0; q < k; q++)
                sum += A1[k*i+q] * B1[n*q+j];
            C1_fl[n*i+j] = sum;
        }
    }
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (q = 0; q < k; q++)
                sum += A2[k*i+q] * B2[n*q+j];
            C2_fl[n*i+j] = sum;
        }
    }

    printf ("\n Top left corner of matrix C1: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C1_fl[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C2: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C2_fl[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Computations completed. Verifying... \n\n");
    fail = 0;
    for (i = 0; i < (m*n); i++) {
        if (! compare(C1[i], C1_fl[i]) || ! compare(C2[i], C2_fl[i])) {
            fail = 1;
            break;
        }
    }
    if (fail) {
        printf (" **** FAIL **** \n");
    }
    else {
        printf(" time = %lf seconds\n", t_end - t_start);
        printf (" **** PASS **** \n");
    }

    mkl_free(A1); mkl_free(B1); mkl_free(C1); mkl_free(C1_fl);
    mkl_free(A2); mkl_free(B2); mkl_free(C2); mkl_free(C2_fl);
    return 0;
}
The two calls to cblas_dgemm in the above example can be batched together, resulting in one call to cblas_dgemm_batch, as shown in the following example. Note that the batch is composed of one group of size 2, since we have two matrix multiplications with the same set of parameters (layout, transa, transb, m, n, k, alpha, lda, ldb, beta, and ldc). total_batch_size in this case is 2.
// clang-format off
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"

#define min(x,y) (((x) < (y)) ? (x) : (y))
#define epsilon 0.0000001f

bool compare(double x, double y)
{
    if (fabs(x - y) <= epsilon)
        return true;    // x and y are the same
    return false;       // x and y are not the same
}

int main()
{
    double *A1, *B1, *C1, *C1_fl;
    double *A2, *B2, *C2, *C2_fl;
    int m, n, k, i, j, q;
    double alpha, beta;
    double sum;
    int fail;
    double t_start, t_end;

    m = 2000, k = 200, n = 1000;
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A1    = (double *)mkl_malloc (m*k*sizeof( double ), 64 );
    B1    = (double *)mkl_malloc (k*n*sizeof( double ), 64 );
    C1    = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
    C1_fl = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
    A2    = (double *)mkl_malloc (m*k*sizeof( double ), 64 );
    B2    = (double *)mkl_malloc (k*n*sizeof( double ), 64 );
    C2    = (double *)mkl_malloc (m*n*sizeof( double ), 64 );
    C2_fl = (double *)mkl_malloc (m*n*sizeof( double ), 64 );

    if (A1 == NULL || B1 == NULL || C1 == NULL || C1_fl == NULL ||
        A2 == NULL || B2 == NULL || C2 == NULL || C2_fl == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        return 1;
    }

    printf (" Initializing matrix data \n\n");
    for (i = 0; i < (m*k); i++) {
        A1[i] = A2[i] = (double)(i+1);
    }
    for (i = 0; i < (k*n); i++) {
        B1[i] = B2[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C1[i] = C2[i] = 0.0;
        C1_fl[i] = C2_fl[i] = 0.0;
    }

    printf (" \nComputing matrix product using Intel MKL cblas_dgemm_batch function \n");

    #define GRP_COUNT 1    // 1 group

    MKL_INT group_count = GRP_COUNT;
    MKL_INT group_sizes[GRP_COUNT] = {2};    // 2 matrix multiplications

    CBLAS_TRANSPOSE transa_array[GRP_COUNT] = {CblasNoTrans};
    CBLAS_TRANSPOSE transb_array[GRP_COUNT] = {CblasNoTrans};

    MKL_INT m_array[GRP_COUNT] = {m};
    MKL_INT n_array[GRP_COUNT] = {n};
    MKL_INT k_array[GRP_COUNT] = {k};

    MKL_INT lda_array[GRP_COUNT] = {k};
    MKL_INT ldb_array[GRP_COUNT] = {n};
    MKL_INT ldc_array[GRP_COUNT] = {n};

    double alpha_array[GRP_COUNT] = {alpha};
    double beta_array[GRP_COUNT]  = {beta};

    // Number of matrix multiplications = 2
    double **a_array, **b_array, **c_array;
    a_array = (double **)mkl_calloc(2, sizeof( double* ), 64);
    b_array = (double **)mkl_calloc(2, sizeof( double* ), 64);
    c_array = (double **)mkl_calloc(2, sizeof( double* ), 64);

    t_start = omp_get_wtime();

    // Call cblas_dgemm_batch
    #pragma omp target enter data \
        map(to: A1[0:m*k], B1[0:k*n], C1[0:m*n]) \
        map(to: A2[0:m*k], B2[0:k*n], C2[0:m*n])

    #pragma omp target data use_device_ptr(A1, B1, C1, A2, B2, C2)
    {
        a_array[0] = A1, a_array[1] = A2;
        b_array[0] = B1, b_array[1] = B2;
        c_array[0] = C1, c_array[1] = C2;
    }

    #pragma omp target data \
        map(to: a_array[0:2], b_array[0:2], c_array[0:2])
    {
        #pragma omp target variant dispatch \
            use_device_ptr(a_array, b_array, c_array)
        {
            cblas_dgemm_batch (
                CblasRowMajor,
                transa_array,
                transb_array,
                m_array,
                n_array,
                k_array,
                alpha_array,
                (const double **)a_array,
                lda_array,
                (const double **)b_array,
                ldb_array,
                beta_array,
                c_array,
                ldc_array,
                group_count,
                group_sizes);
        }
    } // end target data map

    #pragma omp target exit data \
        map(from: C1[0:m*n], C2[0:m*n])

    t_end = omp_get_wtime();

    printf ("\n Top left corner of matrix C1: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C1[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C2: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C2[j+i*n]);
        }
        printf ("\n");
    }

    printf (" \nComputing matrix product using for-loops \n");
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (q = 0; q < k; q++)
                sum += A1[k*i+q] * B1[n*q+j];
            C1_fl[n*i+j] = sum;
        }
    }
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (q = 0; q < k; q++)
                sum += A2[k*i+q] * B2[n*q+j];
            C2_fl[n*i+j] = sum;
        }
    }

    printf ("\n Top left corner of matrix C1: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C1_fl[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C2: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C2_fl[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Computations completed. Verifying... \n\n");
    fail = 0;
    for (i = 0; i < (m*n); i++) {
        if (! compare(C1[i], C1_fl[i]) || ! compare(C2[i], C2_fl[i])) {
            fail = 1;
            break;
        }
    }
    if (fail) {
        printf (" **** FAIL **** \n");
    }
    else {
        printf(" time = %lf seconds\n", t_end - t_start);
        printf (" **** PASS **** \n");
    }

    mkl_free(a_array); mkl_free(b_array); mkl_free(c_array);
    mkl_free(A1); mkl_free(B1); mkl_free(C1); mkl_free(C1_fl);
    mkl_free(A2); mkl_free(B2); mkl_free(C2); mkl_free(C2_fl);
    return 0;
}
The performance of the above two examples when running on the particular ATS GPU used (1-tile only) was as follows:
dgemm_example_01.cpp (two calls to cblas_dgemm):            2.976183 seconds
dgemm_batch_example_01.cpp (one call to cblas_dgemm_batch): 1.881641 seconds
A more complex example of batching is shown below. In this example, we have a batch composed of 3 groups (GROUP_COUNT=3). The size of each group is a randomly chosen number between 1 and 10. Several parameters (layout, transA, transB, m, n, and k) are chosen randomly, but in each group the parameters are the same for all the multiplications. The total_batch_size is equal to the sum of all the group sizes.
// clang-format off
/*******************************************************************************
* Copyright 2019-2021 Intel Corporation.
*
* This software and the related documents are Intel copyrighted materials, and
* your use of them is governed by the express license under which they were
* provided to you (License). Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute, disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents are provided as is, with no express
* or implied warranties, other than those that are expressly stated in the
* License.
*******************************************************************************/

#include <stdio.h>
#include <omp.h>
#include "mkl.h"
#include "mkl_omp_offload.h"
#include "common.h"

#define GROUP_COUNT 3

int dnum = 0;

int main()
{
    CBLAS_LAYOUT layout = (rand_int(0,1) == 0) ? CblasColMajor : CblasRowMajor;
    CBLAS_TRANSPOSE *transA, *transB;
    MKL_INT *m, *n, *k, *lda, *ldb, *ldc;
    double *alpha, *beta;
    MKL_INT *group_size, *sizea_array, *sizeb_array, *sizec_array,
            total_batch_size = 0, sizea, sizeb, sizec;
    double **a_array, **b_array, **c_array, **c_ref_array;
    double **a_array_dev, **b_array_dev, **c_array_dev;

    transA = (CBLAS_TRANSPOSE *)mkl_malloc(GROUP_COUNT * sizeof(CBLAS_TRANSPOSE), 64);
    transB = (CBLAS_TRANSPOSE *)mkl_malloc(GROUP_COUNT * sizeof(CBLAS_TRANSPOSE), 64);

    m = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
    n = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
    k = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);

    lda = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
    ldb = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);
    ldc = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);

    group_size = (MKL_INT *)mkl_malloc(GROUP_COUNT * sizeof(MKL_INT), 64);

    alpha = (double *)mkl_malloc(GROUP_COUNT * sizeof(double), 64);
    beta  = (double *)mkl_malloc(GROUP_COUNT * sizeof(double), 64);

    if ((m == NULL) || (n == NULL) || (k == NULL) || (lda == NULL) ||
        (ldb == NULL) || (ldc == NULL) || (group_size == NULL) ||
        (alpha == NULL) || (beta == NULL)) {
        printf("Cannot allocate input arrays\n");
        return 1;
    }

    MKL_INT i, j, p, idx;

    for (i = 0; i < GROUP_COUNT; i++) {
        transA[i] = (rand_int(0,1) == 0) ? CblasNoTrans : CblasTrans;
        transB[i] = (rand_int(0,1) == 0) ? CblasNoTrans : CblasTrans;
        alpha[i] = rand_double_scalar();
        beta[i]  = rand_double_scalar();
        m[i] = rand_int(1, 20);
        n[i] = rand_int(1, 20);
        k[i] = rand_int(1, 20);
        lda[i] = MAX(m[i], k[i]);
        ldb[i] = MAX(k[i], n[i]);
        ldc[i] = MAX(m[i], n[i]);
        group_size[i] = rand_int(1, 10);
        total_batch_size += group_size[i];
#ifdef MKL_ILP64
        printf("Group %lld: layout = %s, transA = %s, transB = %s, m = %lld, n = %lld, k = %lld, "
               "lda = %lld, ldb = %lld, ldc = %lld, alpha = %lf, beta = %lf, group_size = %lld\n",
               i, (layout == CblasColMajor) ? "Column Major" : "Row Major",
               (transA[i] == CblasNoTrans) ? "Non Transpose" : "Transpose",
               (transB[i] == CblasNoTrans) ? "Non Transpose" : "Transpose",
               m[i], n[i], k[i], lda[i], ldb[i], ldc[i], alpha[i], beta[i], group_size[i]);
#else
        printf("Group %d: layout = %s, transA = %s, transB = %s, m = %d, n = %d, k = %d, "
               "lda = %d, ldb = %d, ldc = %d, alpha = %lf, beta = %lf, group_size = %d\n",
               i, (layout == CblasColMajor) ? "Column Major" : "Row Major",
               (transA[i] == CblasNoTrans) ? "Non Transpose" : "Transpose",
               (transB[i] == CblasNoTrans) ? "Non Transpose" : "Transpose",
               m[i], n[i], k[i], lda[i], ldb[i], ldc[i], alpha[i], beta[i], group_size[i]);
#endif
    }

    sizea_array = (MKL_INT *)mkl_malloc(sizeof(MKL_INT) * total_batch_size, 64);
    sizeb_array = (MKL_INT *)mkl_malloc(sizeof(MKL_INT) * total_batch_size, 64);
    sizec_array = (MKL_INT *)mkl_malloc(sizeof(MKL_INT) * total_batch_size, 64);

    a_array     = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
    b_array     = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
    c_array     = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
    a_array_dev = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
    b_array_dev = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
    c_array_dev = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);
    c_ref_array = (double **)mkl_malloc(sizeof(double *) * total_batch_size, 64);

    if ((sizea_array == NULL) || (sizeb_array == NULL) || (sizec_array == NULL) ||
        (a_array == NULL) || (b_array == NULL) || (c_array == NULL) ||
        (a_array_dev == NULL) || (b_array_dev == NULL) || (c_array_dev == NULL) ||
        (c_ref_array == NULL)) {
        printf("Cannot allocate matrices and size arrays\n");
        return 1;
    }

    idx = 0;
    for (i = 0; i < GROUP_COUNT; i++) {
        sizea = (((layout == CblasRowMajor) && (transA[i] == CblasTrans)) ||
                 ((layout == CblasColMajor) && (transA[i] == CblasNoTrans))) ?
                lda[i] * k[i] : m[i] * lda[i];
        sizeb = (((layout == CblasRowMajor) && (transB[i] == CblasTrans)) ||
                 ((layout == CblasColMajor) && (transB[i] == CblasNoTrans))) ?
                ldb[i] * n[i] : k[i] * ldb[i];
        sizec = (layout == CblasColMajor) ? ldc[i] * n[i] : ldc[i] * m[i];

        for (j = 0; j < group_size[i]; j++) {
            a_array[idx] = (double *)mkl_malloc(sizeof(double) * sizea, 64);
            a_array_dev[idx] = a_array[idx];
            sizea_array[idx] = sizea;
            if (a_array[idx] == NULL) {
                printf("cannot allocate a matrices\n");
                return 1;
            }
            b_array[idx] = (double *)mkl_malloc(sizeof(double) * sizeb, 64);
            b_array_dev[idx] = b_array[idx];
            sizeb_array[idx] = sizeb;
            if (b_array[idx] == NULL) {
                printf("cannot allocate b matrices\n");
                return 1;
            }
            c_array[idx] = (double *)mkl_malloc(sizeof(double) * sizec, 64);
            c_array_dev[idx] = c_array[idx];
            sizec_array[idx] = sizec;
            if (c_array[idx] == NULL) {
                printf("cannot allocate c matrices\n");
                return 1;
            }
            c_ref_array[idx] = (double *)mkl_malloc(sizeof(double) * sizec, 64);
            if (c_ref_array[idx] == NULL) {
                printf("cannot allocate c_ref matrices\n");
                return 1;
            }
            init_double_array(sizea, a_array[idx], 1);
            init_double_array(sizeb, b_array[idx], 1);
            init_double_array(sizec, c_array[idx], 1);
            for (p = 0; p < sizec_array[idx]; p++)
                c_ref_array[idx][p] = c_array[idx][p];
            idx++;
        }
    }

    // run gemm_batch on host, use standard oneMKL interface
    cblas_dgemm_batch(layout, transA, transB, m, n, k, alpha,
                      (const double **) a_array, lda,
                      (const double **) b_array, ldb,
                      beta, c_ref_array, ldc, GROUP_COUNT, group_size);

    double *a, *b, *c;
    for (i = 0; i < total_batch_size; i++) {
        a = a_array[i];
        b = b_array[i];
        c = c_array[i];
        #pragma omp target enter data map(to:a[0:sizea_array[i]],b[0:sizeb_array[i]],c[0:sizec_array[i]])
        #pragma omp target data use_device_ptr(a,b,c)
        {
            a_array_dev[i] = a;
            b_array_dev[i] = b;
            c_array_dev[i] = c;
        }
    }

    #pragma omp target data map(to:a_array_dev[0:total_batch_size], \
                                   b_array_dev[0:total_batch_size], \
                                   c_array_dev[0:total_batch_size]) device(dnum)
    {
        #pragma omp target variant dispatch device(dnum) \
            use_device_ptr(a_array_dev, b_array_dev, c_array_dev)
        {
            cblas_dgemm_batch(layout, transA, transB, m, n, k, alpha,
                              (const double **) a_array_dev, lda,
                              (const double **) b_array_dev, ldb,
                              beta, c_array_dev, ldc, GROUP_COUNT, group_size);
        }
    }

    for (i = 0; i < total_batch_size; i++) {
        a = a_array[i];
        b = b_array[i];
        c = c_array[i];
        #pragma omp target exit data map(from:a[0:sizea_array[i]],b[0:sizeb_array[i]],c[0:sizec_array[i]])
    }

    double computed, reference, diff;
    MKL_INT l;
    idx = 0;
    for (p = 0; p < GROUP_COUNT; p++) {
        for (l = 0; l < group_size[p]; l++) {
            for (i = 0; i < m[p]; i++) {
                for (j = 0; j < n[p]; j++) {
                    if (layout == CblasColMajor) {
                        computed  = c_array[idx][i + j * ldc[p]];
                        reference = c_ref_array[idx][i + j * ldc[p]];
                    }
                    else {
                        computed  = c_array[idx][j + i * ldc[p]];
                        reference = c_ref_array[idx][j + i * ldc[p]];
                    }
                    diff = computed - reference;
                    diff = (diff > 0) ? diff : -diff;
                    if (diff > 0.0001) {
#ifdef MKL_ILP64
                        printf("Error in matrix %lld (group = %lld, matrix index in group = %lld) "
                               "at index [%lld][%lld], computed = %lf, reference = %lf, difference = %lf\n",
                               idx, p, l, i, j, computed, reference, diff);
#else
                        printf("Error in matrix %d at index [%d][%d], computed = %lf, "
                               "reference = %lf, difference = %lf\n",
                               idx, i, j, computed, reference, diff);
#endif
                        free_double_matrices(a_array, total_batch_size);
                        free_double_matrices(b_array, total_batch_size);
                        free_double_matrices(c_array, total_batch_size);
                        free_double_matrices(c_ref_array, total_batch_size);
                        mkl_free(a_array); mkl_free(b_array); mkl_free(c_array); mkl_free(c_ref_array);
                        mkl_free(a_array_dev); mkl_free(b_array_dev); mkl_free(c_array_dev);
                        mkl_free(sizea_array); mkl_free(sizeb_array); mkl_free(sizec_array);
                        mkl_free(transA); mkl_free(transB);
                        mkl_free(m); mkl_free(n); mkl_free(k);
                        mkl_free(lda); mkl_free(ldb); mkl_free(ldc);
                        mkl_free(group_size); mkl_free(alpha); mkl_free(beta);
                        return 1;
                    }
                }
            }
            idx++;
        }
    }

    printf("Validation PASSED\n");

    free_double_matrices(a_array, total_batch_size);
    free_double_matrices(b_array, total_batch_size);
    free_double_matrices(c_array, total_batch_size);
    free_double_matrices(c_ref_array, total_batch_size);
    mkl_free(a_array); mkl_free(b_array); mkl_free(c_array); mkl_free(c_ref_array);
    mkl_free(a_array_dev); mkl_free(b_array_dev); mkl_free(c_array_dev);
    mkl_free(sizea_array); mkl_free(sizeb_array); mkl_free(sizec_array);
    mkl_free(transA); mkl_free(transB);
    mkl_free(m); mkl_free(n); mkl_free(k);
    mkl_free(lda); mkl_free(ldb); mkl_free(ldc);
    mkl_free(group_size); mkl_free(alpha); mkl_free(beta);

    return 0;
}
For more information about the Intel oneAPI Math Kernel Library, see the oneMKL documentation.

Product and Performance Information

Performance varies by use, configuration and other factors. Learn more at www.Intel.com/PerformanceIndex.