Reference Implementations for Intel® Architecture Approximation Instructions VRCP14, VRSQRT14, VRCP28, VRSQRT28, and VEXP2

Published: 12/28/2015  

Last Updated: 12/28/2015

By Marius Cornea

 

We are providing two files, RECIP14.c and RECIP28EXP2.c, containing reference implementations for the scalar versions of 10 approximation instructions introduced in the Intel® Architecture Instruction Set Extensions Programming Reference document. The files can be downloaded from the links provided above.

RECIP14.c contains emulation routines for the underlying algorithms of:

  • VRCP14PD - Compute Approximate Reciprocals of Packed Float64 Values with relative error of less than 2^-14
  • VRCP14SD - Compute Approximate Reciprocal of Scalar Float64 Value with relative error of less than 2^-14
  • VRCP14PS - Compute Approximate Reciprocals of Packed Float32 Values with relative error of less than 2^-14
  • VRCP14SS - Compute Approximate Reciprocal of Scalar Float32 Value with relative error of less than 2^-14
  • VRSQRT14PD - Compute Approximate Reciprocals of Square Roots of Packed Float64 Values with relative error of less than 2^-14
  • VRSQRT14SD - Compute Approximate Reciprocal of Square Root of Scalar Float64 Value with relative error of less than 2^-14
  • VRSQRT14PS - Compute Approximate Reciprocals of Square Roots of Packed Float32 Values with relative error of less than 2^-14
  • VRSQRT14SS - Compute Approximate Reciprocal of Square Root of Scalar Float32 Value with relative error of less than 2^-14

The corresponding emulation routines (only scalar versions) are:

  • RCP14S - reciprocal approximation for Float32
  • RCP14D - reciprocal approximation for Float64
  • RSQRT14S - reciprocal square root approximation for Float32
  • RSQRT14D - reciprocal square root approximation for Float64

RECIP28EXP2.c contains emulation routines for the underlying algorithms of:

  • VRCP28PD - Approximation to the Reciprocal of Packed Double Precision Floating-Point Values with Less Than 2^-28 Relative Error
  • VRCP28SD - Approximation to the Reciprocal of Scalar Double Precision Floating-Point Value with Less Than 2^-28 Relative Error
  • VRCP28PS - Approximation to the Reciprocal of Packed Single Precision Floating-Point Values with Less Than 2^-28 Relative Error
  • VRCP28SS - Approximation to the Reciprocal of Scalar Single Precision Floating-Point Value with Less Than 2^-28 Relative Error
  • VRSQRT28PD - Approximation to the Reciprocal Square Root of Packed Double Precision Floating-Point Values with Less Than 2^-28 Relative Error
  • VRSQRT28SD - Approximation to the Reciprocal Square Root of Scalar Double Precision Floating-Point Value with Less Than 2^-28 Relative Error
  • VRSQRT28PS - Approximation to the Reciprocal Square Root of Packed Single Precision Floating-Point Values with Less Than 2^-28 Relative Error
  • VRSQRT28SS - Approximation to the Reciprocal Square Root of Scalar Single Precision Floating-Point Value with Less Than 2^-28 Relative Error
  • VEXP2PD - Approximation to the Exponential 2^x of Packed Double Precision Floating-Point Values with Less Than 2^-23 Relative Error
  • VEXP2PS - Approximation to the Exponential 2^x of Packed Single Precision Floating-Point Values with Less Than 2^-23 Relative Error

The corresponding emulation routines (only scalar versions) are:

  • RCP28S - reciprocal approximation for Float32
  • RCP28D - reciprocal approximation for Float64
  • RSQRT28S - reciprocal square root approximation for Float32
  • RSQRT28D - reciprocal square root approximation for Float64
  • EXP2S - Base-2 exponential approximation for Float32
  • EXP2D - Base-2 exponential approximation for Float64

The reference functions have to be compiled with the DAZ and FTZ modes turned off (e.g. with the Intel compiler for Linux, using the -no-ftz option), and have to be run with the rounding mode set to round-to-nearest and with floating-point exceptions masked.

Usage example for RCP14S and RCP14D

The following example may be compiled with any of the following (or other, equivalent) commands:

icc -no-ftz -Wall -Werror main.c RECIP14.c
gcc -m32 -Wall -Werror main.c RECIP14.c -lm
gcc -Wall -Werror main.c RECIP14.c -lm

where main.c is shown below:

#include <stdio.h>
/* Overlays a float with an unsigned int so that one value can be printed
   both as a number (f) and as its raw bit pattern (u).
   Assumes unsigned int and float are both 32 bits wide. */
typedef union {
  unsigned int u;
  float f;
} type32;
/* Same overlay for double precision: f is the value, u its raw bits.
   Assumes unsigned long long and double are both 64 bits wide. */
typedef union {
  unsigned long long u;
  double f;
} type64;
/* Scalar reciprocal emulation routines; per the build commands they are
   provided by RECIP14.c. Each takes an MXCSR value and the source operand;
   main reads the result from *dst after the call. */
extern void RCP14S (unsigned int mxcsr, type32 *dst, type32 src);
extern void RCP14D (unsigned int mxcsr, type64 *dst, type64 src);

/*
 * Demo driver for the reciprocal approximation routines.
 *
 * Calls RCP14S and RCP14D on the input 3.0 and prints each operand and each
 * result twice: as a floating-point value and as its raw bit pattern in
 * hexadecimal.
 *
 * Returns 0.
 */
int main (void) {
  type32 dst32, src32;
  type64 dst64, src64;
  /* MXCSR value handed to both routines; every bit is clear here. */
  unsigned int mxcsr = 0x00000000;

  printf ("MXCSR = %8.8x\n", mxcsr);

  /* Single-precision case. */
  src32.f = 3.0f;
  RCP14S (mxcsr, &dst32, src32);
  printf ("RCP14S(%f = %8.8x HEX) = (%f = %8.8x HEX)\n", src32.f, src32.u,
      dst32.f, dst32.u);

  /* Double-precision case. */
  src64.f = 3.0;
  RCP14D (mxcsr, &dst64, src64);
  printf ("RCP14D(%f = %16.16llx HEX) = (%f = %16.16llx HEX)\n", src64.f,
      src64.u, dst64.f, dst64.u);

  return 0;
}

 

Usage example for RSQRT14S and RSQRT14D

The following example may be compiled with any of the following (or other, equivalent) commands:

        icc -no-ftz -Wall -Werror main.c RECIP14.c
        gcc -m32 -Wall -Werror main.c RECIP14.c -lm
        gcc -Wall -Werror main.c RECIP14.c -lm

where main.c is shown below:

#include <stdio.h>
/* Overlays a float with an unsigned int so that one value can be printed
   both as a number (f) and as its raw bit pattern (u).
   Assumes unsigned int and float are both 32 bits wide. */
typedef union {
  unsigned int u;
  float f;
} type32;
/* Same overlay for double precision: f is the value, u its raw bits.
   Assumes unsigned long long and double are both 64 bits wide. */
typedef union {
  unsigned long long u;
  double f;
} type64;
/* Scalar reciprocal-square-root emulation routines; per the build commands
   they are provided by RECIP14.c. Each takes an MXCSR value and the source
   operand; main reads the result from *dst after the call. */
extern void RSQRT14S (unsigned int mxcsr, type32 *dst, type32 src);
extern void RSQRT14D (unsigned int mxcsr, type64 *dst, type64 src);

/*
 * Demo driver for the reciprocal square root approximation routines.
 *
 * Calls RSQRT14S and RSQRT14D on the input 2.0 and prints each operand and
 * each result twice: as a floating-point value and as its raw bit pattern in
 * hexadecimal.
 *
 * Returns 0.
 */
int main (void) {
  type32 dst32, src32;
  type64 dst64, src64;
  /* MXCSR value handed to both routines; every bit is clear here. */
  unsigned int mxcsr = 0x00000000;

  printf ("MXCSR = %8.8x\n", mxcsr);

  /* Single-precision case. */
  src32.f = 2.0f;
  RSQRT14S (mxcsr, &dst32, src32);
  printf ("RSQRT14S(%f = %8.8x HEX) = (%f = %8.8x HEX)\n", src32.f, src32.u,
      dst32.f, dst32.u);

  /* Double-precision case. */
  src64.f = 2.0;
  RSQRT14D (mxcsr, &dst64, src64);
  printf ("RSQRT14D(%f = %16.16llx HEX) = (%f = %16.16llx HEX)\n", src64.f,
      src64.u, dst64.f, dst64.u);

  return 0;
}

 

Usage example for RCP28S and RCP28D

The following example may be compiled with any of the following (or other, equivalent) commands:

        icc -no-ftz -Wall -Werror main.c RECIP28EXP2.c
        gcc -m32 -Wall -Werror main.c RECIP28EXP2.c -lm
        gcc -Wall -Werror main.c RECIP28EXP2.c -lm

where main.c is shown below:

#include <stdio.h>
/* Overlays a float with an unsigned int so that one value can be printed
   both as a number (f) and as its raw bit pattern (u).
   Assumes unsigned int and float are both 32 bits wide. */
typedef union {
  unsigned int u;
  float f;
} type32;
/* Same overlay for double precision: f is the value, u its raw bits.
   Assumes unsigned long long and double are both 64 bits wide. */
typedef union {
  unsigned long long u;
  double f;
} type64;
/* Scalar reciprocal emulation routines; per the build commands they are
   provided by RECIP28EXP2.c. main reads the result from *dst and prints the
   return value as "flags" (presumably exception flag bits -- confirm
   against RECIP28EXP2.c). */
extern unsigned int RCP28S (type32 *dst, type32 src);
extern unsigned int RCP28D (type64 *dst, type64 src);

/*
 * Demo driver for the 28-bit reciprocal approximation routines.
 *
 * Calls RCP28S and RCP28D on the input 3.0 and prints each operand and each
 * result as a floating-point value and as its raw bit pattern in hexadecimal,
 * followed by the flags value the routine returned.
 *
 * Returns 0.
 */
int main (void) {
  type32 dst32, src32;
  type64 dst64, src64;
  /* Flags word, overwritten by each call below. PUOZDI presumably names the
     bit order (Precision, Underflow, Overflow, Zero-divide, Denormal,
     Invalid) -- confirm against RECIP28EXP2.c. */
  unsigned int flags = 0x00000000; // PUOZDI

  /* Prints the initial (zero) value, before any routine has run. */
  printf ("FLAGS = %2.2x\n", flags);

  /* Single-precision case. */
  src32.f = 3.0f;
  flags = RCP28S (&dst32, src32);
  printf ("RCP28S(%f = %8.8x HEX) = (%f = %8.8x HEX) flags = %2.2x\n",
      src32.f, src32.u, dst32.f, dst32.u, flags);

  /* Double-precision case. */
  src64.f = 3.0;
  flags = RCP28D (&dst64, src64);
  printf ("RCP28D(%f = %16.16llx HEX) = (%f = %16.16llx HEX) flags = %2.2x\n",
      src64.f, src64.u, dst64.f, dst64.u, flags);

  return 0;
}

 

Usage example for RSQRT28S and RSQRT28D

The following example may be compiled with any of the following (or other, equivalent) commands:

        icc -no-ftz -Wall -Werror main.c RECIP28EXP2.c
        gcc -m32 -Wall -Werror main.c RECIP28EXP2.c -lm
        gcc -Wall -Werror main.c RECIP28EXP2.c -lm

where main.c is shown below:

#include <stdio.h>
/* Overlays a float with an unsigned int so that one value can be printed
   both as a number (f) and as its raw bit pattern (u).
   Assumes unsigned int and float are both 32 bits wide. */
typedef union {
  unsigned int u;
  float f;
} type32;
/* Same overlay for double precision: f is the value, u its raw bits.
   Assumes unsigned long long and double are both 64 bits wide. */
typedef union {
  unsigned long long u;
  double f;
} type64;

/* Scalar reciprocal-square-root emulation routines; per the build commands
   they are provided by RECIP28EXP2.c. main reads the result from *dst and
   prints the return value as "flags" (presumably exception flag bits --
   confirm against RECIP28EXP2.c). */
extern unsigned int RSQRT28S (type32 *dst, type32 src);
extern unsigned int RSQRT28D (type64 *dst, type64 src);

/*
 * Demo driver for the 28-bit reciprocal square root approximation routines.
 *
 * Calls RSQRT28S and RSQRT28D on the input 2.0 and prints each operand and
 * each result as a floating-point value and as its raw bit pattern in
 * hexadecimal, followed by the flags value the routine returned.
 *
 * Returns 0.
 */
int main (void) {
  type32 dst32, src32;
  type64 dst64, src64;
  /* Flags word, overwritten by each call below. PUOZDI presumably names the
     bit order (Precision, Underflow, Overflow, Zero-divide, Denormal,
     Invalid) -- confirm against RECIP28EXP2.c. */
  unsigned int flags = 0x00000000; // PUOZDI

  /* Prints the initial (zero) value, before any routine has run. */
  printf ("FLAGS = %2.2x\n", flags);

  /* Single-precision case. */
  src32.f = 2.0f;
  flags = RSQRT28S (&dst32, src32);
  printf ("RSQRT28S(%f = %8.8x HEX) = (%f = %8.8x HEX) flags = %2.2x\n",
      src32.f, src32.u, dst32.f, dst32.u, flags);

  /* Double-precision case. */
  src64.f = 2.0;
  flags = RSQRT28D (&dst64, src64);
  printf ("RSQRT28D(%f = %16.16llx HEX) = (%f = %16.16llx HEX) flags = %2.2x\n",
      src64.f, src64.u, dst64.f, dst64.u, flags);

  return 0;
}

 

Usage example for EXP2S and EXP2D

The following example may be compiled with any of the following (or other, equivalent) commands:

        icc -no-ftz -Wall -Werror main.c RECIP28EXP2.c
        gcc -m32 -Wall -Werror main.c RECIP28EXP2.c -lm
        gcc -Wall -Werror main.c RECIP28EXP2.c -lm

where main.c is shown below:

#include <stdio.h>
/* Overlays a float with an unsigned int so that one value can be printed
   both as a number (f) and as its raw bit pattern (u).
   Assumes unsigned int and float are both 32 bits wide. */
typedef union {
  unsigned int u;
  float f;
} type32;
/* Same overlay for double precision: f is the value, u its raw bits.
   Assumes unsigned long long and double are both 64 bits wide. */
typedef union {
  unsigned long long u;
  double f;
} type64;
/* Scalar base-2 exponential emulation routines; per the build commands they
   are provided by RECIP28EXP2.c. main reads the result from *dst and prints
   the return value as "flags" (presumably exception flag bits -- confirm
   against RECIP28EXP2.c). */
extern unsigned int EXP2S (type32 *dst, type32 src);
extern unsigned int EXP2D (type64 *dst, type64 src);

/*
 * Demo driver for the base-2 exponential approximation routines.
 *
 * Calls EXP2S and EXP2D on the input 1.5 and prints each operand and each
 * result as a floating-point value and as its raw bit pattern in hexadecimal,
 * followed by the flags value the routine returned.
 *
 * Returns 0.
 */
int main (void) {
  type32 dst32, src32;
  type64 dst64, src64;
  /* Flags word, overwritten by each call below. PUOZDI presumably names the
     bit order (Precision, Underflow, Overflow, Zero-divide, Denormal,
     Invalid) -- confirm against RECIP28EXP2.c. */
  unsigned int flags = 0x00000000; // PUOZDI

  /* Prints the initial (zero) value, before any routine has run. */
  printf ("FLAGS = %2.2x\n", flags);

  /* Single-precision case. */
  src32.f = 1.5f;
  flags = EXP2S (&dst32, src32);
  printf ("EXP2S(%f = %8.8x HEX) = (%f = %8.8x HEX) flags = %2.2x\n",
      src32.f, src32.u, dst32.f, dst32.u, flags);

  /* Double-precision case. */
  src64.f = 1.5;
  flags = EXP2D (&dst64, src64);
  printf ("EXP2D(%f = %16.16llx HEX) = (%f = %16.16llx HEX) flags = %2.2x\n",
      src64.f, src64.u, dst64.f, dst64.u, flags);

  return 0;
}

 

Product and Performance Information

1

Performance varies by use, configuration and other factors. Learn more at www.Intel.com/PerformanceIndex.