5.1. Optimization for Improved DMA Performance
The NUMA (non-uniform memory access) optimization implemented in fpga_dma_test.c exploits the fact that a processor can access its own local memory faster than non-local memory (memory local to another processor).
A typical NUMA configuration is shown in the diagram below. The local access represents an access from a core to memory that is local to that core's node. The remote access illustrates the path taken when a core on Node 0 accesses memory that is local to Node 1.
Figure 5. Typical NUMA Configuration
Use the following code to implement NUMA optimization in your test application:
// Set up proper affinity if requested.
//
// Looks up the FPGA's PCI function in the hwloc topology, finds its closest
// non-I/O ancestor, and binds this thread's memory and/or CPU placement to
// that ancestor's nodeset/cpuset. Every failure path releases the resources
// acquired so far before jumping to out_destroy_tok.
if (cpu_affinity || memory_affinity) {
    // NOTE(review): the PCI domain is never queried and stays 0 — confirm
    // this is acceptable on multi-segment systems.
    unsigned dom = 0;
    // The property getters write through uint8_t pointers, so use uint8_t
    // storage instead of casting the address of an unsigned.
    uint8_t bus = 0, dev = 0, func = 0;
    fpga_properties props;
    int retval = 0;
#if (FPGA_DMA_DEBUG)
    char str[4096];
#endif

    res = fpgaGetProperties(afc_token, &props);
    ON_ERR_GOTO(res, out_destroy_tok, "fpgaGetProperties");

    // Read all three fields first so the properties object can be released
    // exactly once, whether or not a getter failed.
    fpga_result bus_res = fpgaPropertiesGetBus(props, &bus);
    fpga_result dev_res = fpgaPropertiesGetDevice(props, &dev);
    fpga_result func_res = fpgaPropertiesGetFunction(props, &func);
    fpgaDestroyProperties(&props);

    res = bus_res;
    ON_ERR_GOTO(res, out_destroy_tok, "fpgaPropertiesGetBus");
    res = dev_res;
    ON_ERR_GOTO(res, out_destroy_tok, "fpgaPropertiesGetDevice");
    res = func_res;
    ON_ERR_GOTO(res, out_destroy_tok, "fpgaPropertiesGetFunction");

    // Find the device from the topology
    hwloc_topology_t topology;
    retval = hwloc_topology_init(&topology);
    ON_ERR_GOTO(retval, out_destroy_tok, "hwloc_topology_init");

    // PCI devices are not part of the topology unless explicitly requested.
#if HWLOC_API_VERSION >= 0x00020000
    // hwloc 2.x replaced the I/O topology flags with type filters.
    retval = hwloc_topology_set_io_types_filter(topology,
                                                HWLOC_TYPE_FILTER_KEEP_ALL);
#else
    retval = hwloc_topology_set_flags(topology,
                                      HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
#endif
    if (retval == 0) {
        retval = hwloc_topology_load(topology);
    }
    if (retval != 0) {
        hwloc_topology_destroy(topology);
    }
    ON_ERR_GOTO(retval, out_destroy_tok, "hwloc_topology_load");

    hwloc_obj_t obj = hwloc_get_pcidev_by_busid(topology, dom, bus, dev, func);
    hwloc_obj_t obj2 = NULL;
    if (obj != NULL) {
        obj2 = hwloc_get_non_io_ancestor_obj(topology, obj);
    }
    if (obj2 == NULL) {
        // Device (or a usable ancestor) is absent from the topology; the
        // binding calls below would dereference a NULL object.
        hwloc_topology_destroy(topology);
        retval = -1;
    }
    ON_ERR_GOTO(retval, out_destroy_tok, "hwloc_get_pcidev_by_busid");

#if (FPGA_DMA_DEBUG)
    hwloc_obj_type_snprintf(str, sizeof str, obj2, 1);
    printf("%s\n", str);
    hwloc_obj_attr_snprintf(str, sizeof str, obj2, " :: ", 1);
    printf("%s\n", str);
    hwloc_bitmap_taskset_snprintf(str, sizeof str, obj2->cpuset);
    printf("CPUSET is %s\n", str);
    hwloc_bitmap_taskset_snprintf(str, sizeof str, obj2->nodeset);
    printf("NODESET is %s\n", str);
#endif

    if (memory_affinity) {
        // The *_nodeset() binding variants were removed in hwloc 2.0, so the
        // 2.0.x releases (API version 0x00020000) must take the first branch.
#if HWLOC_API_VERSION >= 0x00020000
        retval = hwloc_set_membind(topology, obj2->nodeset,
                                   HWLOC_MEMBIND_THREAD,
                                   HWLOC_MEMBIND_MIGRATE | HWLOC_MEMBIND_BYNODESET);
#else
        retval = hwloc_set_membind_nodeset(topology, obj2->nodeset,
                                           HWLOC_MEMBIND_THREAD,
                                           HWLOC_MEMBIND_MIGRATE);
#endif
        if (retval != 0) {
            hwloc_topology_destroy(topology);
        }
        ON_ERR_GOTO(retval, out_destroy_tok, "hwloc_set_membind");
    }

    if (cpu_affinity) {
        retval = hwloc_set_cpubind(topology, obj2->cpuset, HWLOC_CPUBIND_STRICT);
        if (retval != 0) {
            hwloc_topology_destroy(topology);
        }
        ON_ERR_GOTO(retval, out_destroy_tok, "hwloc_set_cpubind");
    }

    // The topology is not used after this point in the block; release it.
    hwloc_topology_destroy(topology);
}