Fortran Example
We will use a Fortran example to illustrate how choosing memory allocations appropriately can avoid redundant data transfers and boost performance.
The Fortran example uses OpenMP offload and is adapted from NWChem, a high-performance computational chemistry application.
In the example, there is a 4-level loop nest. The innermost k-loop calls the omp_fbody routine, which offloads four sgemm calls onto the device (two that accumulate into f1n and two that accumulate into f2n), and then offloads a reduction computation (over the reduction variables emp4i and emp5i) onto the device.
Inside the loop nest, several arrays are updated on the host (namely, Tkj, Kkj, Jia, Kia, Tia, Xia, and t1v1), so the values of these arrays on the device must be kept consistent with their values on the host.
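The overall structure of the loop nest, simplified from the full listings that follow (in the actual code the host-side array updates are guarded by the DO_UPDATE_ARRAYS macro), is outlined below.
      do klo = 1, nocc, kchunk
         khi = MIN(nocc, klo+kchunk-1)
         do j = 1, nocc
            ! update elements of Tkj and Kkj on the host
            do i = 1, nocc
               ! update elements of Jia, Kia, Tia, Xia on the host
               do k = klo, MIN(khi, i)
                  ! update elements of t1v1 on the host
                  call omp_fbody(...)  ! offloads the sgemm calls and the reduction
               end do
            end do
         end do
      end do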
Version 1
In the first (naive) version of the program, we allocate all 11 arrays in Shared memory, using the OpenMP allocate directive with the omp_target_shared_mem_alloc allocator.
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( f1n(1:nvir,1:nvir) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( f2n(1:nvir,1:nvir) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( eorb(1:nbf) )
        ...
Shared allocations are accessible from both the host and the device, and they migrate automatically between the two as needed, so the same pointer to a Shared allocation may be used on the host and on the device without any explicit mapping.
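As a minimal sketch of this behavior (not part of the example program, and assuming an implementation, such as ifx with OpenMP offload, in which Shared allocations are accessed in place on the device), an array allocated this way can be read and written in a target region without any map clause:
      program shared_alloc_sketch
        use omp_lib
        implicit none
        real, allocatable :: x(:)
        integer :: i
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( x(1:1024) )
        x = 1.0                        ! written on the host
        !$omp target teams distribute parallel do
        do i = 1, 1024
           x(i) = x(i) + 1.0           ! the same allocation is updated on the device
        end do
        !$omp end target teams distribute parallel do
        print *, x(1)                  ! the host sees the device's update
        deallocate( x )
      end program shared_alloc_sketch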
Version 1 is shown below.
      include "mkl_omp_offload.f90"
      subroutine omp_fbody(f1n,f2n,eorb,                          &
                           ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                           Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                           t1v1,t1v2)
        use omp_lib
        use onemkl_blas_omp_offload_lp64
        use iso_fortran_env
        implicit none
        real, intent(inout) :: emp4,emp5
        integer, intent(in) :: ncor,nocc,nvir
        integer, intent(in) :: a,i,j,k, klo
        real, intent(inout) :: f1n(nvir,nvir)
        real, intent(inout) :: f2n(nvir,nvir)
        real, intent(in)    :: eorb(*)
        real, intent(in)    :: Jia(*), Kia(*), Tia(*), Xia(*)
        real, intent(in)    :: Tkj(*), Kkj(*)
        real, intent(in)    :: t1v1(nvir),t1v2(nvir)
        real    :: emp4i,emp5i
        real    :: eaijk,denom
        integer :: lnov,lnvv
        integer :: b,c
        real    :: f1nbc,f1ncb,f2nbc,f2ncb
        real    :: t1v1b,t1v2b
        lnov=nocc*nvir
        lnvv=nvir*nvir
        emp4i = 0.0
        emp5i = 0.0
        !$omp dispatch
        call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir,       &
                   Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
        !$omp dispatch
        call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir,       &
                   Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
        !$omp dispatch
        call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir,      &
                   Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
        !$omp dispatch
        call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir,      &
                   Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
        eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
        !$omp target teams distribute parallel do collapse(2)       &
        !$omp     reduction(+:emp4i,emp5i)                          &
        !$omp     private(f1nbc,f1ncb,f2nbc,f2ncb)                  &
        !$omp     private(t1v1b,t1v2b)                              &
        !$omp     private(denom) firstprivate(eaijk,nvir,ncor,nocc)
        do b=1,nvir
           do c=1,nvir
              denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
              f1nbc = f1n(b,c);
              f1ncb = f1n(c,b);
              f2nbc = f2n(b,c);
              f2ncb = f2n(c,b);
              t1v1b = t1v1(b);
              t1v2b = t1v2(b);
              emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
              emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
          enddo
        enddo
        !$omp end target teams distribute parallel do
        emp4 = emp4 + emp4i
        emp5 = emp5 + emp5i
        end
      subroutine init_array_1(arr, m)
        implicit none
        real, intent(inout) :: arr(m)
        integer m, i
        do i = 1, m
           arr(i) = 1.0/(100.0 + i-1)
        end do
        end subroutine init_array_1
      subroutine init_array_2(arr, m, n)
        implicit none
        real, intent(inout) :: arr(m, n)
        integer m, n, i, j
        do i = 1, m
           do j = 1, n
              arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
           end do
        end do
      end subroutine init_array_2
      program main
        use omp_lib
        use iso_fortran_env
        implicit none
        interface
           subroutine omp_fbody(f1n,f2n,eorb,                  &
                        ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                        Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                        t1v1,t1v2)
           real, intent(inout) :: emp4,emp5
           integer, intent(in) :: ncor,nocc,nvir
           integer, intent(in) :: a,i,j,k, klo
           real, intent(inout) :: f1n(nvir,nvir)
           real, intent(inout) :: f2n(nvir,nvir)
           real, intent(in)    :: eorb(*)
           real, intent(in)    :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
           real, intent(in)    :: t1v1(nvir),t1v2(nvir)
           end subroutine omp_fbody
        end interface
        integer :: ncor, nocc, nvir, maxiter, nkpass
        integer :: nbf, lnvv, lnov, kchunk
        real, allocatable :: eorb(:)
        real, allocatable :: f1n(:,:)
        real, allocatable :: f2n(:,:)
        real, allocatable :: Jia(:)
        real, allocatable :: Kia(:)
        real, allocatable :: Tia(:)
        real, allocatable :: Xia(:)
        real, allocatable :: Tkj(:)
        real, allocatable :: Kkj(:)
        real, allocatable :: t1v1(:),t1v2(:)
        real emp4, emp5
        integer :: a, b, c, i, j, k
        integer :: klo, khi, iter
        double precision, allocatable :: timers(:)
        double precision :: t0, t1, tsum, tmax, tmin, tavg
!       Run parameters
        nocc = 256
        nvir = 2048
        maxiter = 50
        nkpass = 1
        ncor = 0
        print *, "Run parameters:"
        print *, "nocc    =", nocc
        print *, "nvir    =", nvir
        print *, "maxiter =", maxiter
        print *, "nkpass  =", nkpass
        print *, "ncor    =", ncor
        print *, " "
!       Allocate and initialize arrays.
        nbf = ncor + nocc + nvir
        lnvv = nvir * nvir
        lnov = nocc * nvir
        kchunk = (nocc - 1)/nkpass + 1
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( f1n(1:nvir,1:nvir) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( f2n(1:nvir,1:nvir) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( eorb(1:nbf) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( Jia(1:lnvv) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( Kia(1:lnvv) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( Tia(1:lnov*nocc) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( Xia(1:lnov*nocc))
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( Tkj(1:kchunk*lnvv) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( Kkj(1:kchunk*lnvv) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( t1v1(1:lnvv) )
        !$omp allocate allocator(omp_target_shared_mem_alloc)
        allocate( t1v2(1:lnvv) )
!
        call init_array_1(eorb, nbf)
        call init_array_1(Jia, lnvv)
        call init_array_1(Kia, lnvv)
        call init_array_1(Tia, lnov*nocc)
        call init_array_1(Xia, lnov*nocc)
        call init_array_1(Tkj, kchunk*lnvv)
        call init_array_1(Kkj, kchunk*lnov)
        call init_array_1(t1v1, lnvv)
        call init_array_1(t1v2, lnvv)
        call init_array_2(f1n, nvir, nvir)
        call init_array_2(f2n, nvir, nvir)
        print *, "End of initialization"
        allocate (timers(1:maxiter))
        emp4=0.0
        emp5=0.0
        a=1
        iter=1
        do klo = 1, nocc, kchunk
           khi = MIN(nocc, klo+kchunk-1)
           do j = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
!             Update elements of Tkj and KKj.
              Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
              Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
#endif
              do i = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
!                Update elements of Jia, Kia, Tia, Xia arrays.
                 Jia(lnvv) = Jia(lnvv) + 1.0
                 Kia(lnvv) = Kia(lnvv) + 1.0
                 Tia(lnov) = Tia(lnov) + 1.0
                 Xia(lnov) = Xia(lnov) + 1.0
#endif
                 do k = klo, MIN(khi,i)
#if defined(DO_UPDATE_ARRAYS)
!                   Update elements of t1v1 array.
                    t1v1(1:nvir) = Tkj(lnvv-nvir+1:lnvv)
#endif
                    t0 = omp_get_wtime()
                    call omp_fbody(f1n,f2n,eorb,                     &
                              ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                              Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                              t1v1,t1v2)
                    t1 = omp_get_wtime()
                    timers(iter) = (t1-t0)
                    if (iter .eq. maxiter) then
                        print *, "Stopping after ", iter, "iterations"
                        print *, " "
                        goto 150
                    endif
!                   Prevent NAN for large maxiter...
                    if (emp4 >  1000.0) then
                        emp4 = emp4 - 1000.0
                    endif
                    if (emp4 < -1000.0) then
                        emp4 = emp4 + 1000.0
                    endif
                    if (emp5 >  1000.0) then
                        emp5 = emp5 - 1000.0
                    endif
                    if (emp5 < -1000.0) then
                        emp5 = emp5 + 1000.0
                     endif
                    iter = iter + 1
                 end do ! k = klo, MIN(khi,i)
              end do ! do i = 1, nocc
           end do ! do j = 1, nocc
        end do ! do klo = 1, nocc, kchunk
 150    CONTINUE
        tsum =  0.0
        tmax = -1.0e10
        tmin =  1.0e10
        do i = 2, iter
           tsum = tsum + timers(i)
           tmax = MAX(tmax,timers(i))
           tmin = MIN(tmin,timers(i))
        end do
        tavg = tsum / (iter - 1)
        print *, "TOTAL ITER: ", iter
        write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
 110    format (A, F9.6, A, F9.6, A, F9.6, A)
        write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
 120    format (A, F15.3, A, F15.3)
        print *, "END"
        deallocate (f1n)
        deallocate (f2n)
        deallocate (eorb)
        deallocate (Jia)
        deallocate (Kia)
        deallocate (Tia)
        deallocate (Xia)
        deallocate (Tkj)
        deallocate (Kkj)
        deallocate (t1v1)
        deallocate (t1v2)
        deallocate (timers)
        end program
 
While this version is straightforward to program, it is not the most efficient: every access to a Shared allocation, from either side, can trigger migration of data between host and device memory.
Version 2
In Version 2 of the program, we allocate all 11 arrays in system memory using plain allocate statements, and we map them to the device with map clauses on a target data construct:
        real, allocatable :: f1n(:,:)
        real, allocatable :: f2n(:,:)
        real, allocatable :: eorb(:)
        ...
        !$omp target data                        &
        !$omp    map(to: f1n, f2n, eorb)         &
        !$omp    map(to: Jia, Kia, Tia, Xia)     &
        !$omp    map(to: Tkj, Kkj)               &
        !$omp    map(to: t1v1, t1v2)
When an array that is mapped to the device is updated on the host, we use the OpenMP target update to directive to copy the new values of the array from the host to the device. We place the directive as high up in the loop nest as possible, to avoid copying the array more often than necessary.
              Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
              Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
              !$omp target update to (Tkj, Kkj)
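The map-then-update pattern used in Version 2 reduces to the following self-contained sketch (illustrative only; the array name, size, and loop counts here are made up and are not taken from the example):
      program map_update_sketch
        use omp_lib
        implicit none
        real, allocatable :: v(:)
        real :: s
        integer :: i, pass
        allocate( v(1:1024) )            ! plain allocation in system memory
        v = 1.0
        s = 0.0
        !$omp target data map(to: v)     ! copy v to the device once
        do pass = 1, 10
           v(1) = v(1) + 1.0             ! array modified on the host
           !$omp target update to (v)    ! re-send the new values to the device
           !$omp target teams distribute parallel do reduction(+:s)
           do i = 1, 1024
              s = s + v(i)
           end do
           !$omp end target teams distribute parallel do
        end do
        !$omp end target data
        print *, "sum = ", s
        deallocate( v )
      end program map_update_sketch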
The full Version 2 is shown below.
      include "mkl_omp_offload.f90"
      subroutine omp_fbody(f1n,f2n,eorb,                          &
                           ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                           Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                           t1v1,t1v2)
        use omp_lib
        use onemkl_blas_omp_offload_lp64
        use iso_fortran_env
        implicit none
        real, intent(inout) :: emp4,emp5
        integer, intent(in) :: ncor,nocc,nvir
        integer, intent(in) :: a,i,j,k, klo
        real, intent(inout) :: f1n(nvir,nvir)
        real, intent(inout) :: f2n(nvir,nvir)
        real, intent(in)    :: eorb(*)
        real, intent(in)    :: Jia(*), Kia(*), Tia(*), Xia(*)
        real, intent(in)    :: Tkj(*), Kkj(*)
        real, intent(in)    :: t1v1(nvir),t1v2(nvir)
        real    :: emp4i,emp5i
        real    :: eaijk,denom
        integer :: lnov,lnvv
        integer :: b,c
        real    :: f1nbc,f1ncb,f2nbc,f2ncb
        real    :: t1v1b,t1v2b
        lnov=nocc*nvir
        lnvv=nvir*nvir
        emp4i = 0.0
        emp5i = 0.0
        !$omp dispatch
        call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir,       &
                   Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
        !$omp dispatch
        call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir,       &
                   Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
        !$omp dispatch
        call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir,      &
                   Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
        !$omp dispatch
        call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir,      &
                   Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
        eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
        !$omp target teams distribute parallel do collapse(2)       &
        !$omp     reduction(+:emp4i,emp5i)                          &
        !$omp     private(f1nbc,f1ncb,f2nbc,f2ncb)                  &
        !$omp     private(t1v1b,t1v2b)                              &
        !$omp     private(denom) firstprivate(eaijk,nvir,ncor,nocc)
        do b=1,nvir
           do c=1,nvir
              denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
              f1nbc = f1n(b,c);
              f1ncb = f1n(c,b);
              f2nbc = f2n(b,c);
              f2ncb = f2n(c,b);
              t1v1b = t1v1(b);
              t1v2b = t1v2(b);
              emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
              emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
          enddo
        enddo
        !$omp end target teams distribute parallel do
        emp4 = emp4 + emp4i
        emp5 = emp5 + emp5i
        end
      subroutine init_array_1(arr, m)
        implicit none
        real, intent(inout) :: arr(m)
        integer m, i
        do i = 1, m
           arr(i) = 1.0/(100.0 + i-1)
        end do
        end subroutine init_array_1
      subroutine init_array_2(arr, m, n)
        implicit none
        real, intent(inout) :: arr(m, n)
        integer m, n, i, j
        do i = 1, m
           do j = 1, n
              arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
           end do
        end do
      end subroutine init_array_2
      program main
        use omp_lib
        use iso_fortran_env
        implicit none
        interface
           subroutine omp_fbody(f1n,f2n,eorb,                  &
                        ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                        Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                        t1v1,t1v2)
           real, intent(inout) :: emp4,emp5
           integer, intent(in) :: ncor,nocc,nvir
           integer, intent(in) :: a,i,j,k, klo
           real, intent(inout) :: f1n(nvir,nvir)
           real, intent(inout) :: f2n(nvir,nvir)
           real, intent(in)    :: eorb(*)
           real, intent(in)    :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
           real, intent(in)    :: t1v1(nvir),t1v2(nvir)
           end subroutine omp_fbody
        end interface
        integer :: ncor, nocc, nvir, maxiter, nkpass
        integer :: nbf, lnvv, lnov, kchunk
        real, allocatable :: eorb(:)
        real, allocatable :: f1n(:,:)
        real, allocatable :: f2n(:,:)
        real, allocatable :: Jia(:)
        real, allocatable :: Kia(:)
        real, allocatable :: Tia(:)
        real, allocatable :: Xia(:)
        real, allocatable :: Tkj(:)
        real, allocatable :: Kkj(:)
        real, allocatable :: t1v1(:),t1v2(:)
        real emp4, emp5
        integer :: a, b, c, i, j, k
        integer :: klo, khi, iter
        double precision, allocatable :: timers(:)
        double precision :: t0, t1, tsum, tmax, tmin, tavg
!       Run parameters
        nocc = 256
        nvir = 2048
        maxiter = 50
        nkpass = 1
        ncor = 0
        print *, "Run parameters:"
        print *, "nocc    =", nocc
        print *, "nvir    =", nvir
        print *, "maxiter =", maxiter
        print *, "nkpass  =", nkpass
        print *, "ncor    =", ncor
        print *, " "
!       Allocate and initialize arrays.
        nbf = ncor + nocc + nvir
        lnvv = nvir * nvir
        lnov = nocc * nvir
        kchunk = (nocc - 1)/nkpass + 1
        allocate( f1n(1:nvir,1:nvir) )
        allocate( f2n(1:nvir,1:nvir) )
        allocate( eorb(1:nbf) )
        allocate( Jia(1:lnvv) )
        allocate( Kia(1:lnvv) )
        allocate( Tia(1:lnov*nocc) )
        allocate( Xia(1:lnov*nocc))
        allocate( Tkj(1:kchunk*lnvv) )
        allocate( Kkj(1:kchunk*lnvv) )
        allocate( t1v1(1:lnvv) )
        allocate( t1v2(1:lnvv) )
!
        call init_array_1(eorb, nbf)
        call init_array_1(Jia, lnvv)
        call init_array_1(Kia, lnvv)
        call init_array_1(Tia, lnov*nocc)
        call init_array_1(Xia, lnov*nocc)
        call init_array_1(Tkj, kchunk*lnvv)
        call init_array_1(Kkj, kchunk*lnov)
        call init_array_1(t1v1, lnvv)
        call init_array_1(t1v2, lnvv)
        call init_array_2(f1n, nvir, nvir)
        call init_array_2(f2n, nvir, nvir)
        print *, "End of initialization"
        allocate (timers(1:maxiter))
        emp4=0.0
        emp5=0.0
        a=1
        iter=1
        !$omp target data                        &
        !$omp    map(to: f1n, f2n, eorb)         &
        !$omp    map(to: Jia, Kia, Tia, Xia)     &
        !$omp    map(to: Tkj, Kkj)               &
        !$omp    map(to: t1v1, t1v2)
        do klo = 1, nocc, kchunk
           khi = MIN(nocc, klo+kchunk-1)
           do j = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
!             Update elements of Tkj and KKj.
              Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
              Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
              !$omp target update to (Tkj, Kkj)
#endif
              do i = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
!                Update elements of Jia, Kia, Tia, Xia arrays.
                 Jia(lnvv) = Jia(lnvv) + 1.0
                 Kia(lnvv) = Kia(lnvv) + 1.0
                 Tia(lnov) = Tia(lnov) + 1.0
                 Xia(lnov) = Xia(lnov) + 1.0
                 !$omp target update to (Jia, Kia, Tia, Xia)
#endif
                 do k = klo, MIN(khi,i)
#if defined(DO_UPDATE_ARRAYS)
!                   Update elements of t1v1 array.
                    t1v1(1:nvir) = Tkj(lnvv-nvir+1:lnvv)
                    !$omp target update to (t1v1)
#endif
                    t0 = omp_get_wtime()
                    call omp_fbody(f1n,f2n,eorb,                     &
                              ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                              Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                              t1v1,t1v2)
                    t1 = omp_get_wtime()
                    timers(iter) = (t1-t0)
                    if (iter .eq. maxiter) then
                        print *, "Stopping after ", iter, "iterations"
                        print *, " "
                        goto 150
                    endif
!                   Prevent NAN for large maxiter...
                    if (emp4 >  1000.0) then
                        emp4 = emp4 - 1000.0
                    endif
                    if (emp4 < -1000.0) then
                        emp4 = emp4 + 1000.0
                    endif
                    if (emp5 >  1000.0) then
                        emp5 = emp5 - 1000.0
                    endif
                    if (emp5 < -1000.0) then
                        emp5 = emp5 + 1000.0
                     endif
                    iter = iter + 1
                 end do ! k = klo, MIN(khi,i)
              end do ! do i = 1, nocc
           end do ! do j = 1, nocc
        end do ! do klo = 1, nocc, kchunk
 150    CONTINUE
        !$omp end target data
        tsum =  0.0
        tmax = -1.0e10
        tmin =  1.0e10
        do i = 2, iter
           tsum = tsum + timers(i)
           tmax = MAX(tmax,timers(i))
           tmin = MIN(tmin,timers(i))
        end do
        tavg = tsum / (iter - 1)
        print *, "TOTAL ITER: ", iter
        write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
 110    format (A, F9.6, A, F9.6, A, F9.6, A)
        write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
 120    format (A, F15.3, A, F15.3)
        print *, "END"
        deallocate (f1n)
        deallocate (f2n)
        deallocate (eorb)
        deallocate (Jia)
        deallocate (Kia)
        deallocate (Tia)
        deallocate (Xia)
        deallocate (Tkj)
        deallocate (Kkj)
        deallocate (t1v1)
        deallocate (t1v2)
        deallocate (timers)
        end program
 
Version 3
In the third version of the program, we consider which arrays are used on the host only, on the device only, or on both the host and the device. We also consider whether the array values are updated during the execution of the program. This information is used to decide where to allocate the arrays, how to initialize them, and whether to update their values on the device.
Arrays f1n and f2n are used as work arrays on the device (to store the results of the sgemm calls), and they are not accessed on the host. So we allocate them directly in Device memory.
Since f1n and f2n are allocated in Device memory, we initialize them on the device by calling init_array_2(), whose loop nest is now enclosed in an OpenMP target construct.
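For reference, here is the device-side initializer, excerpted from the full Version 3 listing below; the only change from Versions 1 and 2 is the added target directive on the loop nest.
      subroutine init_array_2(arr, m, n)
        implicit none
        real, intent(inout) :: arr(m, n)
        integer m, n, i, j
        !$omp target teams distribute parallel do
        do i = 1, m
           do j = 1, n
              arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
           end do
        end do
      end subroutine init_array_2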
The other 9 arrays are accessed on both the host and the device, so we allocate them in Host memory and call init_array_1() to initialize them on the host. These arrays are then mapped to the device using map clauses on a target data construct.
We use the OpenMP target update to directive to copy the updated values of Tkj, Kkj, Jia, Kia, Tia, Xia, and t1v1 from the host to the device.
        !$omp allocate allocator(omp_target_device_mem_alloc)
        allocate( f1n(1:nvir,1:nvir) )
        !$omp allocate allocator(omp_target_device_mem_alloc)
        allocate( f2n(1:nvir,1:nvir) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( eorb(1:nbf) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( Jia(1:lnvv) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( Kia(1:lnvv) )
        ...
        !$omp target data                        &
        !$omp    map(to: eorb)                   &
        !$omp    map(to: Jia, Kia, Tia, Xia)     &
        !$omp    map(to: Tkj, Kkj)               &
        !$omp    map(to: t1v1, t1v2)
        ...
              Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
              Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
              !$omp target update to (Tkj, Kkj)
The full Version 3 of the program is shown below.
      include "mkl_omp_offload.f90"
      subroutine omp_fbody(f1n,f2n,eorb,                          &
                           ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                           Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                           t1v1,t1v2)
        use omp_lib
        use onemkl_blas_omp_offload_lp64
        use iso_fortran_env
        implicit none
        real, intent(inout) :: emp4,emp5
        integer, intent(in) :: ncor,nocc,nvir
        integer, intent(in) :: a,i,j,k, klo
        real, intent(inout) :: f1n(nvir,nvir)
        real, intent(inout) :: f2n(nvir,nvir)
        real, intent(in)    :: eorb(*)
        real, intent(in)    :: Jia(*), Kia(*), Tia(*), Xia(*)
        real, intent(in)    :: Tkj(*), Kkj(*)
        real, intent(in)    :: t1v1(nvir),t1v2(nvir)
        real    :: emp4i,emp5i
        real    :: eaijk,denom
        integer :: lnov,lnvv
        integer :: b,c
        real    :: f1nbc,f1ncb,f2nbc,f2ncb
        real    :: t1v1b,t1v2b
        lnov=nocc*nvir
        lnvv=nvir*nvir
        emp4i = 0.0
        emp5i = 0.0
        !$omp dispatch
        call sgemm('n','t',nvir,nvir,nvir,1.0,Jia,nvir,       &
                   Tkj(1+(k-klo)*lnvv),nvir,1.0,f1n,nvir)
        !$omp dispatch
        call sgemm('n','t',nvir,nvir,nvir,1.0,Kia,nvir,       &
                   Tkj(1+(k-klo)*lnvv),nvir,1.0, f2n,nvir)
        !$omp dispatch
        call sgemm('n','n',nvir,nvir,nocc,-1.0,Tia,nvir,      &
                   Kkj(1+(k-klo)*lnov),nocc,1.0, f1n,nvir)
        !$omp dispatch
        call sgemm('n','n',nvir,nvir,nocc,-1.0,Xia,nvir,      &
                   Kkj(1+(k-klo)*lnov),nocc,1.0, f2n,nvir)
        eaijk = eorb(a) - ( eorb(ncor+i)+eorb(ncor+j)+eorb(ncor+k) )
        !$omp target teams distribute parallel do collapse(2)       &
        !$omp     reduction(+:emp4i,emp5i)                          &
        !$omp     private(f1nbc,f1ncb,f2nbc,f2ncb)                  &
        !$omp     private(t1v1b,t1v2b)                              &
        !$omp     private(denom) firstprivate(eaijk,nvir,ncor,nocc)
        do b=1,nvir
           do c=1,nvir
              denom=-1.0/( eorb(ncor+nocc+b)+eorb(ncor+nocc+c)+eaijk )
              f1nbc = f1n(b,c);
              f1ncb = f1n(c,b);
              f2nbc = f2n(b,c);
              f2ncb = f2n(c,b);
              t1v1b = t1v1(b);
              t1v2b = t1v2(b);
              emp4i = emp4i + (denom*t1v1b*f1nbc) + (denom*2*f1ncb)
              emp5i = emp5i + (denom*t1v2b*f2nbc) + (denom*3*f2ncb)
          enddo
        enddo
        !$omp end target teams distribute parallel do
        emp4 = emp4 + emp4i
        emp5 = emp5 + emp5i
        end
      subroutine init_array_1(arr, m)
        implicit none
        real, intent(inout) :: arr(m)
        integer m, i
        do i = 1, m
           arr(i) = 1.0/(100.0 + i-1)
        end do
        end subroutine init_array_1
      subroutine init_array_2(arr, m, n)
        implicit none
        real, intent(inout) :: arr(m, n)
        integer m, n, i, j
        !$omp target teams distribute parallel do
        do i = 1, m
           do j = 1, n
              arr(i,j) = 1.0/(100.0 + ((i-1) * n) + j)
           end do
        end do
      end subroutine init_array_2
      program main
        use omp_lib
        use iso_fortran_env
        implicit none
        interface
           subroutine omp_fbody(f1n,f2n,eorb,                  &
                        ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                        Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                        t1v1,t1v2)
           real, intent(inout) :: emp4,emp5
           integer, intent(in) :: ncor,nocc,nvir
           integer, intent(in) :: a,i,j,k, klo
           real, intent(inout) :: f1n(nvir,nvir)
           real, intent(inout) :: f2n(nvir,nvir)
           real, intent(in)    :: eorb(*)
           real, intent(in)    :: Jia(*), Kia(*), Tia(*), Xia(*), Tkj(*), Kkj(*)
           real, intent(in)    :: t1v1(nvir),t1v2(nvir)
           end subroutine omp_fbody
        end interface
        integer :: ncor, nocc, nvir, maxiter, nkpass
        integer :: nbf, lnvv, lnov, kchunk
        real, allocatable :: eorb(:)
        real, allocatable :: f1n(:,:)
        real, allocatable :: f2n(:,:)
        real, allocatable :: Jia(:)
        real, allocatable :: Kia(:)
        real, allocatable :: Tia(:)
        real, allocatable :: Xia(:)
        real, allocatable :: Tkj(:)
        real, allocatable :: Kkj(:)
        real, allocatable :: t1v1(:),t1v2(:)
        real emp4, emp5
        integer :: a, b, c, i, j, k
        integer :: klo, khi, iter
        double precision, allocatable :: timers(:)
        double precision :: t0, t1, tsum, tmax, tmin, tavg
!       Run parameters
        nocc = 256
        nvir = 2048
        maxiter = 50
        nkpass = 1
        ncor = 0
        print *, "Run parameters:"
        print *, "nocc    =", nocc
        print *, "nvir    =", nvir
        print *, "maxiter =", maxiter
        print *, "nkpass  =", nkpass
        print *, "ncor    =", ncor
        print *, " "
!       Allocate and initialize arrays.
        nbf = ncor + nocc + nvir
        lnvv = nvir * nvir
        lnov = nocc * nvir
        kchunk = (nocc - 1)/nkpass + 1
        !$omp allocate allocator(omp_target_device_mem_alloc)
        allocate( f1n(1:nvir,1:nvir) )
        !$omp allocate allocator(omp_target_device_mem_alloc)
        allocate( f2n(1:nvir,1:nvir) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( eorb(1:nbf) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( Jia(1:lnvv) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( Kia(1:lnvv) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( Tia(1:lnov*nocc) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( Xia(1:lnov*nocc))
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( Tkj(1:kchunk*lnvv) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( Kkj(1:kchunk*lnvv) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( t1v1(1:lnvv) )
        !$omp allocate allocator(omp_target_host_mem_alloc)
        allocate( t1v2(1:lnvv) )
!
        call init_array_1(eorb, nbf)
        call init_array_1(Jia, lnvv)
        call init_array_1(Kia, lnvv)
        call init_array_1(Tia, lnov*nocc)
        call init_array_1(Xia, lnov*nocc)
        call init_array_1(Tkj, kchunk*lnvv)
        call init_array_1(Kkj, kchunk*lnov)
        call init_array_1(t1v1, lnvv)
        call init_array_1(t1v2, lnvv)
        call init_array_2(f1n, nvir, nvir)
        call init_array_2(f2n, nvir, nvir)
        print *, "End of initialization"
        allocate (timers(1:maxiter))
        emp4=0.0
        emp5=0.0
        a=1
        iter=1
        !$omp target data                        &
        !$omp    map(to: eorb)                   &
        !$omp    map(to: Jia, Kia, Tia, Xia)     &
        !$omp    map(to: Tkj, Kkj)               &
        !$omp    map(to: t1v1, t1v2)
        do klo = 1, nocc, kchunk
           khi = MIN(nocc, klo+kchunk-1)
           do j = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
!             Update elements of Tkj and KKj.
              Tkj((khi-klo+1)*lnvv) = Tkj((khi-klo+1)*lnvv) + 1.0
              Kkj((khi-klo+1)*lnov) = Kkj((khi-klo+1)*lnov) + 1.0
              !$omp target update to (Tkj, Kkj)
#endif
              do i = 1, nocc
#if defined(DO_UPDATE_ARRAYS)
!                Update elements of Jia, Kia, Tia, Xia arrays.
                 Jia(lnvv) = Jia(lnvv) + 1.0
                 Kia(lnvv) = Kia(lnvv) + 1.0
                 Tia(lnov) = Tia(lnov) + 1.0
                 Xia(lnov) = Xia(lnov) + 1.0
                 !$omp target update to (Jia, Kia, Tia, Xia)
#endif
                 do k = klo, MIN(khi,i)
#if defined(DO_UPDATE_ARRAYS)
!                   Update elements of t1v1 array.
                    t1v1(1:nvir) = Tkj(lnvv-nvir+1:lnvv)
                    !$omp target update to (t1v1)
#endif
                    t0 = omp_get_wtime()
                    call omp_fbody(f1n,f2n,eorb,                     &
                              ncor,nocc,nvir, emp4,emp5,a,i,j,k,klo, &
                              Jia, Kia, Tia, Xia, Tkj, Kkj,          &
                              t1v1,t1v2)
                    t1 = omp_get_wtime()
                    timers(iter) = (t1-t0)
                    if (iter .eq. maxiter) then
                        print *, "Stopping after ", iter, "iterations"
                        print *, " "
                        goto 150
                    endif
!                   Prevent NAN for large maxiter...
                    if (emp4 >  1000.0) then
                        emp4 = emp4 - 1000.0
                    endif
                    if (emp4 < -1000.0) then
                        emp4 = emp4 + 1000.0
                    endif
                    if (emp5 >  1000.0) then
                        emp5 = emp5 - 1000.0
                    endif
                    if (emp5 < -1000.0) then
                        emp5 = emp5 + 1000.0
                     endif
                    iter = iter + 1
                 end do ! k = klo, MIN(khi,i)
              end do ! do i = 1, nocc
           end do ! do j = 1, nocc
        end do ! do klo = 1, nocc, kchunk
 150    CONTINUE
        !$omp end target data
        tsum =  0.0
        tmax = -1.0e10
        tmin =  1.0e10
        do i = 2, iter
           tsum = tsum + timers(i)
           tmax = MAX(tmax,timers(i))
           tmin = MIN(tmin,timers(i))
        end do
        tavg = tsum / (iter - 1)
        print *, "TOTAL ITER: ", iter
        write(*, 110) " TIMING: min=", tmin, ", max=", tmax, ", avg of iters after first=", tavg, " seconds"
 110    format (A, F9.6, A, F9.6, A, F9.6, A)
        write(*, 120) " emp4 = ", emp4, " emp5 =", emp5
 120    format (A, F15.3, A, F15.3)
        print *, "END"
        deallocate (f1n)
        deallocate (f2n)
        deallocate (eorb)
        deallocate (Jia)
        deallocate (Kia)
        deallocate (Tia)
        deallocate (Xia)
        deallocate (Tkj)
        deallocate (Kkj)
        deallocate (t1v1)
        deallocate (t1v2)
        deallocate (timers)
        end program
 
Performance Comparison
The following commands were used to compile, link, and run the various versions. Replace TEST.f with the name of the source file for the version being built.
Compile:
ifx -O3 -fiopenmp -fopenmp-targets=spir64 -DMKL -qmkl -fpp -free -D DO_UPDATE_ARRAYS TEST.f -c -o TEST.o
Link:
ifx -fiopenmp -fopenmp-targets=spir64 -qmkl -fsycl -L${MKLROOT}/lib/intel64 -liomp5 -lsycl -lOpenCL -lstdc++ -lpthread -lm -ldl -lmkl_sycl -D DO_UPDATE_ARRAYS TEST.o -o TEST.exe
Run:
LIBOMPTARGET_LEVEL_ZERO_COMMAND_BATCH=copy OMP_TARGET_OFFLOAD=MANDATORY ZE_AFFINITY_MASK=0.0 LIBOMPTARGET_DEBUG=0 LIBOMPTARGET_PLUGIN=level0 TEST.exe 
We compared the performance of the three versions on the particular GPU used (running on one stack only). The average iteration time for each version was as follows.
Version 1 (test-SharedMem.f): 0.115163 seconds
Version 2 (test-Map-UpdateTo.f): 0.002012 seconds
Version 3 (test-DeviceMem-Map-UpdateTo.f): 0.001978 seconds
Versions 2 and 3 run each iteration roughly 57 to 58 times faster than the naive Shared-memory version, with Version 3 slightly ahead of Version 2.