Visible to Intel only — GUID: GUID-A4D68FB6-AA64-4BC7-B688-6142BBC1EF5B
Sample Application
The sample code below shows how to use the oneCCL API to perform an allreduce operation on SYCL USM (unified shared memory) device buffers.
#include <iostream> #include <mpi.h> #include "oneapi/ccl.hpp" void mpi_finalize() { int is_finalized = 0; MPI_Finalized(&is_finalized); if (!is_finalized) { MPI_Finalize(); } } int main(int argc, char* argv[]) { constexpr size_t count = 10 * 1024 * 1024; int size = 0; int rank = 0; ccl::init(); MPI_Init(nullptr, nullptr); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); atexit(mpi_finalize); sycl::default_selector device_selector; sycl::queue q(device_selector); std::cout << "Running on " << q.get_device().get_info<sycl::info::device::name>() << "\n"; /* create kvs */ ccl::shared_ptr_class<ccl::kvs> kvs; ccl::kvs::address_type main_addr; if (rank == 0) { kvs = ccl::create_main_kvs(); main_addr = kvs->get_address(); MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); } else { MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); kvs = ccl::create_kvs(main_addr); } /* create communicator */ auto dev = ccl::create_device(q.get_device()); auto ctx = ccl::create_context(q.get_context()); auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs); /* create stream */ auto stream = ccl::create_stream(q); /* create buffers */ auto send_buf = sycl::malloc_device<int>(count, q); auto recv_buf = sycl::malloc_device<int>(count, q); /* open buffers and modify them on the device side */ auto e = q.submit([&](auto& h) { h.parallel_for(count, [=](auto id) { send_buf[id] = rank + id + 1; recv_buf[id] = -1; }); }); int check_sum = 0; for (int i = 1; i <= size; ++i) { check_sum += i; } /* do not wait completion of kernel and provide it as dependency for operation */ std::vector<ccl::event> deps; deps.push_back(ccl::create_event(e)); /* invoke allreduce */ auto attr = ccl::create_operation_attr<ccl::allreduce_attr>(); ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait(); /* open recv_buf and check its correctness on the device side */ 
sycl::buffer<int> check_buf(count); q.submit([&](auto& h) { sycl::accessor check_buf_acc(check_buf, h, sycl::write_only); h.parallel_for(count, [=](auto id) { if (recv_buf[id] != static_cast<int>(check_sum + size * id)) { check_buf_acc[id] = -1; } }); }); q.wait_and_throw(); /* print out the result of the test on the host side */ { sycl::host_accessor check_buf_acc(check_buf, sycl::read_only); size_t i; for (i = 0; i < count; i++) { if (check_buf_acc[i] == -1) { std::cout << "FAILED\n"; break; } } if (i == count) { std::cout << "PASSED\n"; } } sycl::free(send_buf, q); sycl::free(recv_buf, q); }
Build details
Run the sample
The Intel® MPI Library is required to run the sample. Make sure that the MPI environment is set up.
To run the sample, use the following command:
mpiexec <parameters> ./sample
where <parameters> represents optional mpiexec parameters such as node count, processes per node, hosts, and so on.