Intel® C++ Compiler Classic Developer Guide and Reference

ID 767249
Date 7/13/2023
Public
Document Table of Contents

Efficiency with Structure of Arrays Example

This example demonstrates the efficiency of using a Structure of Arrays (SoA) approach by comparing the assembly generated from a simple SIMD loop using an Array of Structures (AoS) approach with the assembly generated using the SoA approach of SDLT.

Array of Structures: Non-unit stride access version

Source:

#include <stdio.h>

#define N 1024

typedef struct RGBs {
    float r;
    float g;
    float b;
} RGBTy;


void main()
{
    RGBTy a[N];  
    #pragma omp simd
    for (int k = 0; k<N; ++k) {
        a[k].r = k*1.5;  // non-unit stride access
        a[k].g = k*2.5;  // non-unit stride access
        a[k].b = k*3.5;  // non-unit stride access
    }
    std::cout << "k =" << 10 <<
        ", a[k].r =" << a[10].r <<
        ", a[k].g =" << a[10].g <<
        ", a[k].b =" << a[10].b << std::endl;
}

AVX2 assembly generated (69 instructions):

..TOP_OF_LOOP:
        vcvtdq2ps %ymm7, %ymm1
        lea       (%rax), %rcx
        vcvtdq2ps %ymm5, %ymm2
        vpaddd    %ymm3, %ymm7, %ymm7
        vpaddd    %ymm3, %ymm5, %ymm5
        vmulps    %ymm1, %ymm4, %ymm8
        vmulps    %ymm1, %ymm6, %ymm12
        vmulps    %ymm2, %ymm6, %ymm14
        vmulps    %ymm1, %ymm0, %ymm1
        vmulps    %ymm2, %ymm4, %ymm10
        addl      $16, %edx
        vextractf128 $1, %ymm8, %xmm9
        vmovss    %xmm8, (%rsp,%rcx)
        vmovss    %xmm9, 48(%rsp,%rcx)
        vextractps $1, %xmm8, 12(%rsp,%rcx)
        vextractps $2, %xmm8, 24(%rsp,%rcx)
        vextractps $3, %xmm8, 36(%rsp,%rcx)
        vmulps    %ymm2, %ymm0, %ymm8
        vextractps $1, %xmm9, 60(%rsp,%rcx)
        vextractps $2, %xmm9, 72(%rsp,%rcx)
        vextractps $3, %xmm9, 84(%rsp,%rcx)
        vextractf128 $1, %ymm12, %xmm13
        vextractf128 $1, %ymm14, %xmm15
        vextractf128 $1, %ymm1, %xmm2
        vextractf128 $1, %ymm8, %xmm9
        vmovss    %xmm12, 4(%rsp,%rax)
        vmovss    %xmm13, 52(%rsp,%rax)
        vextractps $1, %xmm12, 16(%rsp,%rax)
        vextractps $2, %xmm12, 28(%rsp,%rax)
        vextractps $3, %xmm12, 40(%rsp,%rax)
        vextractps $1, %xmm13, 64(%rsp,%rax)
        vextractps $2, %xmm13, 76(%rsp,%rax)
        vextractps $3, %xmm13, 88(%rsp,%rax)
        vmovss    %xmm14, 100(%rsp,%rax)
        vextractps $1, %xmm14, 112(%rsp,%rax)
        vextractps $2, %xmm14, 124(%rsp,%rax)
        vextractps $3, %xmm14, 136(%rsp,%rax)
        vmovss    %xmm15, 148(%rsp,%rax)
        vextractps $1, %xmm15, 160(%rsp,%rax)
        vextractps $2, %xmm15, 172(%rsp,%rax)
        vextractps $3, %xmm15, 184(%rsp,%rax)
        vmovss    %xmm1, 8(%rsp,%rax)
        vextractps $1, %xmm1, 20(%rsp,%rax)
        vextractps $2, %xmm1, 32(%rsp,%rax)
        vextractps $3, %xmm1, 44(%rsp,%rax)
        vmovss    %xmm2, 56(%rsp,%rax)
        vextractps $1, %xmm2, 68(%rsp,%rax)
        vextractps $2, %xmm2, 80(%rsp,%rax)
        vextractps $3, %xmm2, 92(%rsp,%rax)
        vmovss    %xmm8, 104(%rsp,%rax)
        vextractps $1, %xmm8, 116(%rsp,%rax)
        vextractps $2, %xmm8, 128(%rsp,%rax)
        vextractps $3, %xmm8, 140(%rsp,%rax)
        vmovss    %xmm9, 152(%rsp,%rax)
        vextractps $1, %xmm9, 164(%rsp,%rax)
        vextractps $2, %xmm9, 176(%rsp,%rax)
        vextractps $3, %xmm9, 188(%rsp,%rax)
        addq      $192, %rax
        vextractf128 $1, %ymm10, %xmm11
        vmovss    %xmm10, 96(%rsp,%rcx)
        vmovss    %xmm11, 144(%rsp,%rcx)
        vextractps $1, %xmm10, 108(%rsp,%rcx)
        vextractps $2, %xmm10, 120(%rsp,%rcx)
        vextractps $3, %xmm10, 132(%rsp,%rcx)
        vextractps $1, %xmm11, 156(%rsp,%rcx)
        vextractps $2, %xmm11, 168(%rsp,%rcx)
        vextractps $3, %xmm11, 180(%rsp,%rcx)
        cmpl      $1024, %edx
        jb        ..TOP_OF_LOOP

Structure of Arrays: Using SDLT for unit stride access

To introduce the use of SDLT, the code below will:

  • declare a primitive,

  • use an soa1d_container instead of an array

  • use an accessor inside a SIMD loop to generate efficient code

  • use a proxy object’s data member interface to access individual data members of an element inside the container

Source:

#include <stdio.h>
#include <sdlt/sdlt.h>

#define N 1024

typedef struct RGBs {
    float r;
    float g;
    float b;
} RGBTy;

SDLT_PRIMITIVE(RGBTy, r, g, b)

void main()
{
    // Use SDLT to get SOA data layout
    sdlt::soa1d_container<RGBTy> aContainer(N);
    auto a = aContainer.access();

    // use SDLT Data Member Interface to access struct members r, g, and b.
    // achieve unit-stride access after vectorization
    #pragma omp simd
    for (int k = 0; k<N; k++) {
        a[k].r() = k*1.5;
        a[k].g() = k*2.5;
        a[k].b() = k*3.5;
    }
    std::cout << "k =" << 10 <<
        ", a[k].r =" << a[10].r() <<
        ", a[k].g =" << a[10].g() <<
        ", a[k].b =" << a[10].b() << std::endl;
}

AVX2 assemply generated (19 instructions):

..TOP_OF_LOOP:
        vpaddd    %ymm4, %ymm3, %ymm12
        vcvtdq2ps %ymm3, %ymm7
        vcvtdq2ps %ymm12, %ymm10
        vmulps    %ymm7, %ymm2, %ymm5
        vmulps    %ymm7, %ymm1, %ymm6
        vmulps    %ymm7, %ymm0, %ymm8
        vmulps    %ymm10, %ymm2, %ymm3
        vmulps    %ymm10, %ymm1, %ymm9
        vmulps    %ymm10, %ymm0, %ymm11
        vmovups   %ymm5, (%r13,%rax,4)
        vmovups   %ymm6, (%r15,%rax,4)
        vmovups   %ymm8, (%rbx,%rax,4)
        vmovups   %ymm3, 32(%r13,%rax,4)
        vmovups   %ymm9, 32(%r15,%rax,4)
        vmovups   %ymm11, 32(%rbx,%rax,4)
        vpaddd    %ymm4, %ymm12, %ymm3
        addq      $16, %rax
        cmpq      $1024, %rax
        jb        ..TOP_OF_LOOP

Both versions appear to have unrolled the loop twice. When examining the assembly generated for AVX2 instruction set, we can see a measurable reduction in the number of instructions (19 vs. 69) when we are able to perform unit stride access using SDLT. Also, at runtime, the soa1d_container aligned its data allocation and will gain any of the architectural advantages that come with using aligned instead of unaligned SIMD stores.