Intel® C++ Compiler Classic Developer Guide and Reference

ID 767249
Date 3/31/2023
Public

A newer version of this document is available. Customers should click here to go to the newest version.

Document Table of Contents

Intrinsics for FP Permutation Operations

The prototypes for Intel® Advanced Vector Extensions 512 (Intel® AVX-512) intrinsics are located in the zmmintrin.h header file.

To use these intrinsics, include the immintrin.h file as follows:

#include <immintrin.h>


Intrinsic Name

Operation

Corresponding
Intel® AVX-512 Instruction

_mm512_permutex2var_pd, _mm512_mask_permutex2var_pd, _mm512_mask2_permutex2var_pd, _mm512_maskz_permutex2var_pd

Shuffle float64 elements across lanes.

VPERMI2PD

_mm512_permutex2var_ps, _mm512_mask_permutex2var_ps, _mm512_mask2_permutex2var_ps, _mm512_maskz_permutex2var_ps

Shuffle float32 elements across lanes.

VPERMI2PS

_mm512_permute_pd, _mm512_mask_permute_pd, _mm512_maskz_permute_pd

Shuffle float64 elements within 128-bit lanes.

VPERMILPD

_mm512_permutevar_pd, _mm512_mask_permutevar_pd, _mm512_maskz_permutevar_pd

Shuffle float64 elements within 128-bit lanes.

VPERMILPD

_mm512_permutex_pd, _mm512_mask_permutex_pd, _mm512_maskz_permutex_pd

Shuffle float64 elements within 256-bit lanes.

VPERMPD

_mm512_permutexvar_pd, _mm512_mask_permutexvar_pd, _mm512_maskz_permutexvar_pd

Shuffle float64 elements across lanes.

VPERMPD

_mm512_permute_ps, _mm512_mask_permute_ps, _mm512_maskz_permute_ps

Shuffle float32 elements within 128-bit lanes.

VPERMILPS

_mm512_permutevar_ps, _mm512_mask_permutevar_ps, _mm512_maskz_permutevar_ps

Shuffle float32 elements within 128-bit lanes.

VPERMILPS

_mm512_permutexvar_ps, _mm512_mask_permutexvar_ps, _mm512_maskz_permutexvar_ps

Shuffle float32 elements across lanes.

VPERMPS


Variable

Definition

k

writemask used as a selector

a

first source vector element

b

second source vector element

src

source element to use based on writemask result

idx

index


_mm512_permutex2var_pd

extern __m512d __cdecl _mm512_permutex2var_pd(__m512d a, __m512i idx, __m512d b);

Shuffles float64 elements in a and b across lanes using the corresponding selector and index in idx, and stores the result.



_mm512_mask_permutex2var_pd

extern __m512d __cdecl _mm512_mask_permutex2var_pd(__m512d a, __mmask8 k, __m512i idx, __m512d b);

Shuffles float64 elements in a and b across lanes using the corresponding selector and index in idx, and stores the result using writemask k (elements are copied from a when the corresponding mask bit is not set).



_mm512_mask2_permutex2var_pd

extern __m512d __cdecl _mm512_mask2_permutex2var_pd(__m512d a, __m512i idx, __mmask8 k, __m512d b);

Shuffles float64 elements in a and b across lanes using the corresponding selector and index in idx, and stores the result using writemask k (elements are copied from idx when the corresponding mask bit is not set).



_mm512_maskz_permutex2var_pd

extern __m512d __cdecl _mm512_maskz_permutex2var_pd(__mmask8 k, __m512d a, __m512i idx, __m512d b);

Shuffles float64 elements in a and b across lanes using the corresponding selector and index in idx, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).



_mm512_permutex2var_ps

extern __m512 __cdecl _mm512_permutex2var_ps(__m512 a, __m512i idx, __m512 b);

Shuffles float32 elements in a and b across lanes using the corresponding selector and index in idx, and stores the result.



_mm512_mask2_permutex2var_ps

extern __m512 __cdecl _mm512_mask2_permutex2var_ps(__m512 a, __m512i idx, __mmask16 k, __m512 b);

Shuffles float32 elements in a and b across lanes using the corresponding selector and index in idx, and stores the result using writemask k (elements are copied from idx when the corresponding mask bit is not set).



_mm512_mask_permutex2var_ps

extern __m512 __cdecl _mm512_mask_permutex2var_ps(__m512 a, __mmask16 k, __m512i idx, __m512 b);

Shuffles float32 elements in a and b across lanes using the corresponding selector and index in idx, and stores the result using writemask k (elements are copied from a when the corresponding mask bit is not set).



_mm512_maskz_permutex2var_ps

extern __m512 __cdecl _mm512_maskz_permutex2var_ps(__mmask16 k, __m512 a, __m512i idx, __m512 b);

Shuffles float32 elements in a and b across lanes using the corresponding selector and index in idx, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).



_mm512_permute_pd

extern __m512d __cdecl _mm512_permute_pd(__m512d a, const int imm);

Shuffles float64 elements in a within 128-bit lanes using the control in imm, and stores the result.



_mm512_mask_permute_pd

extern __m512d __cdecl _mm512_mask_permute_pd(__m512d src, __mmask8 k, __m512d a, const int imm);

Shuffles float64 elements in a within 128-bit lanes using the control in imm, and stores the result using writemask k (elements are copied from src when the corresponding mask bit is not set).



_mm512_maskz_permute_pd

extern __m512d __cdecl _mm512_maskz_permute_pd(__mmask8 k, __m512d a, const int imm);

Shuffles float64 elements in a within 128-bit lanes using the control in imm, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).



_mm512_permutevar_pd

extern __m512d __cdecl _mm512_permutevar_pd(__m512d a, __m512i b);

Shuffles float64 elements in a within 128-bit lanes using the control in b, and stores the result.



_mm512_mask_permutevar_pd

extern __m512d __cdecl _mm512_mask_permutevar_pd(__m512d src, __mmask8 k, __m512d a, __m512i b);

Shuffles float64 elements in a within 128-bit lanes using the control in b, and stores the result using writemask k (elements are copied from src when the corresponding mask bit is not set).



_mm512_maskz_permutevar_pd

extern __m512d __cdecl _mm512_maskz_permutevar_pd(__mmask8 k, __m512d a, __m512i b);

Shuffles float64 elements in a within 128-bit lanes using the control in b, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).



_mm512_permute_ps

extern __m512 __cdecl _mm512_permute_ps(__m512 a, const int imm);

Shuffles float32 elements in a within 128-bit lanes using the control in imm, and stores the result.



_mm512_mask_permute_ps

extern __m512 __cdecl _mm512_mask_permute_ps(__m512 src, __mmask16 k, __m512 a, const int imm);

Shuffles float32 elements in a within 128-bit lanes using the control in imm, and stores the result using writemask k (elements are copied from src when the corresponding mask bit is not set).



_mm512_maskz_permute_ps

extern __m512 __cdecl _mm512_maskz_permute_ps(__mmask16 k, __m512 a, const int imm);

Shuffles float32 elements in a within 128-bit lanes using the control in imm, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).



_mm512_permutevar_ps

extern __m512 __cdecl _mm512_permutevar_ps(__m512 a, __m512i b);

Shuffles float32 elements in a within 128-bit lanes using the control in b, and stores the result.



_mm512_mask_permutevar_ps

extern __m512 __cdecl _mm512_mask_permutevar_ps(__m512 src, __mmask16 k, __m512 a, __m512i b);

Shuffles float32 elements in a within 128-bit lanes using the control in b, and stores the result using writemask k (elements are copied from src when the corresponding mask bit is not set).



_mm512_maskz_permutevar_ps

extern __m512 __cdecl _mm512_maskz_permutevar_ps(__mmask16 k, __m512 a, __m512i b);

Shuffles float32 elements in a within 128-bit lanes using the control in b, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).



_mm512_permutex_pd

extern __m512d __cdecl _mm512_permutex_pd(__m512d a, const int imm);

Shuffles float64 elements in a within 256-bit lanes using the control in imm, and stores the result.



_mm512_mask_permutex_pd

extern __m512d __cdecl _mm512_mask_permutex_pd(__m512d src, __mmask8 k, __m512d a, const int imm);

Shuffles float64 elements in a within 256-bit lanes using the control in imm, and stores the result using writemask k (elements are copied from src when the corresponding mask bit is not set).



_mm512_maskz_permutex_pd

extern __m512d __cdecl _mm512_maskz_permutex_pd(__mmask8 k, __m512d a, const int imm);

Shuffles float64 elements in a within 256-bit lanes using the control in imm, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).



_mm512_permutexvar_pd

extern __m512d __cdecl _mm512_permutexvar_pd(__m512i idx, __m512d a);

Shuffles float64 elements in a across lanes using the corresponding index in idx, and stores the result.



_mm512_mask_permutexvar_pd

extern __m512d __cdecl _mm512_mask_permutexvar_pd(__m512d src, __mmask8 k, __m512i idx, __m512d a);

Shuffles float64 elements in a across lanes using the corresponding index in idx, and stores the result using writemask k (elements are copied from src when the corresponding mask bit is not set).



_mm512_maskz_permutexvar_pd

extern __m512d __cdecl _mm512_maskz_permutexvar_pd(__mmask8 k, __m512i idx, __m512d a);

Shuffles float64 elements in a across lanes using the corresponding index in idx, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).



_mm512_permutexvar_ps

extern __m512 __cdecl _mm512_permutexvar_ps(__m512i idx, __m512 a);

Shuffles float32 elements in a across lanes using the corresponding index in idx, and stores the result.



_mm512_mask_permutexvar_ps

extern __m512 __cdecl _mm512_mask_permutexvar_ps(__m512 src, __mmask16 k, __m512i idx, __m512 a);

Shuffles float32 elements in a across lanes using the corresponding index in idx, and stores the result using writemask k (elements are copied from src when the corresponding mask bit is not set).



_mm512_maskz_permutexvar_ps

extern __m512 __cdecl _mm512_maskz_permutexvar_ps(__mmask16 k, __m512i idx, __m512 a);

Shuffles float32 elements in a across lanes using the corresponding index in idx, and stores the result using zeromask k (elements are zeroed out when the corresponding mask bit is not set).