Intel® DPC++ Compatibility Tool Developer Guide and Reference
DPCT1020
Message
Migration of <api name>, if it is called from __global__ or __device__ function, is not supported. You may need to redesign the code to use the host-side <api name> instead, which submits this call to the SYCL queue automatically.
Detailed Help
The warning message is generated when <api name> itself submits a SYCL* kernel to the command queue and the caller of <api name> is itself a SYCL kernel that has already been submitted to the command queue. This results in device-side enqueue of a kernel, which is not supported by SYCL 2020.
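In SYCL*, a queue is a host-side object: work is submitted to it from host code, and a kernel that is already running on the device cannot enqueue more work. The following minimal sketch illustrates the difference between device-side and host-side submission; the queue q, the buffer d, and the sizes are illustrative assumptions, not output of the tool.
#include <sycl/sycl.hpp>
#include <oneapi/mkl.hpp>

void sketch() {
  // In-order queue, so the two submissions below execute one after the other.
  sycl::queue q{sycl::property::queue::in_order()};
  float *d = sycl::malloc_device<float>(129, q);
  q.parallel_for(sycl::range<1>(128), [=](sycl::id<1> i) {
    d[i + 1] = static_cast<float>(i);
    // Calling a library routine here that submits its own kernel would be
    // device-side enqueue, which SYCL 2020 does not provide.
  });
  // Host-side call: the library routine submits its kernel to q itself.
  oneapi::mkl::blas::column_major::asum(q, 128, d + 1, 1, d);
  q.wait();
  sycl::free(d, q);
}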
Suggestions to Fix
Redesign the code to use the host-side API, which submits this call to the SYCL queue automatically.
For example, this original CUDA* code:
__global__ void kernel(float *d_data) {
  int tid = threadIdx.x;
  d_data[tid + 1] = tid;
  __syncthreads();
  if (tid == 0) {
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSasum(handle, 128, d_data + 1, 1, d_data);
    cublasDestroy(handle);
  }
}
void foo() {
  float *d_data;
  cudaMalloc((void **)&d_data, sizeof(float) * (1 + 128));
  kernel<<<1, 128>>>(d_data);
  float data;
  cudaMemcpy(&data, d_data, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_data);
}
results in the following migrated SYCL code:
void kernel(float *d_data, sycl::nd_item<3> item_ct1) {
  int tid = item_ct1.get_local_id(2);
  d_data[tid + 1] = tid;
  item_ct1.barrier();
  if (tid == 0) {
    /*
    DPCT1021:2: Migration of cublasHandle_t in __global__ or __device__ function
    is not supported. You may need to redesign the code.
    */
    cublasHandle_t handle;
    handle = &dpct::get_default_queue();
    /*
    DPCT1020:1: Migration of cublasSasum, if it is called from __global__ or
    __device__ function, is not supported. You may need to redesign the code to
    use the host-side oneapi::mkl::blas::column_major::asum instead, which submits
    this call to the SYCL queue automatically.
    */
    cublasSasum(handle, 128, d_data + 1, 1, d_data);
    handle = nullptr;
  }
}
void foo() {
  dpct::device_ext &dev_ct1 = dpct::get_current_device();
  sycl::queue &q_ct1 = dev_ct1.default_queue();
  float *d_data;
  d_data = sycl::malloc_device<float>((1 + 128), q_ct1);
  q_ct1.parallel_for(
      sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
      [=](sycl::nd_item<3> item_ct1) {
        kernel(d_data, item_ct1);
      });
  float data;
  q_ct1.memcpy(&data, d_data, sizeof(float)).wait();
  sycl::free(d_data, q_ct1);
}
which is rewritten to:
void kernel(float *d_data, sycl::nd_item<3> item_ct1) {
  int tid = item_ct1.get_local_id(2);
  d_data[tid + 1] = tid;
}
void foo() {
  dpct::device_ext &dev_ct1 = dpct::get_current_device();
  sycl::queue &q_ct1 = dev_ct1.default_queue();
  float *d_data;
  d_data = sycl::malloc_device<float>((1 + 128), q_ct1);
  q_ct1.parallel_for(
      sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
      [=](sycl::nd_item<3> item_ct1) {
        kernel(d_data, item_ct1);
      });
  oneapi::mkl::blas::column_major::asum(q_ct1, 128, d_data + 1, 1, d_data);
  float data;
  q_ct1.memcpy(&data, d_data, sizeof(float)).wait();
  sycl::free(d_data, q_ct1);
}
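The rewritten example submits the kernel and the asum call to the same queue. If that queue does not execute submissions in order, the dependency can be expressed explicitly: the oneMKL USM routines return a sycl::event and accept a list of dependency events. Below is a minimal sketch of this pattern, reusing the kernel function from the rewritten example above; the names foo_explicit_ordering, k, and a are illustrative assumptions, not part of the migrated output.
#include <sycl/sycl.hpp>
#include <oneapi/mkl.hpp>
#include <vector>

void foo_explicit_ordering(sycl::queue &q, float *d_data) {
  // Keep the event returned by the kernel submission.
  sycl::event k = q.parallel_for(
      sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
      [=](sycl::nd_item<3> item_ct1) {
        kernel(d_data, item_ct1);
      });
  // Pass the event as a dependency so asum runs after the kernel completes,
  // even on an out-of-order queue.
  sycl::event a = oneapi::mkl::blas::column_major::asum(
      q, 128, d_data + 1, 1, d_data, std::vector<sycl::event>{k});
  float data;
  q.memcpy(&data, d_data, sizeof(float), std::vector<sycl::event>{a}).wait();
}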