main.cpp
#include "../../commonfiles/ymmval.h"
// Routines implemented in assembly language (C linkage, so the symbol names
// are not mangled). Each one reads the operands a and b and writes its
// results into the array c.
extern "C"
{
    void AvxPfpArithmeticFloat(const YmmVal* a, const YmmVal* b, YmmVal c[6]);
    void AvxPfpArithmeticDouble(const YmmVal* a, const YmmVal* b, YmmVal c[5]);
}
// Runs AvxPfpArithmeticFloat() on two operands of eight single-precision
// values each and prints one table row per element: the index, both inputs,
// and the six result columns returned in c[0]..c[5].
void AvxPfpArithmeticFloatCpp(void)
{
    // Input values, one entry per 32-bit element.
    static const float a_init[8] =
        { 2.0f, 3.5f, -10.75f, 15.0f, -12.125f, 3.875f, 2.0f, -6.35f };
    static const float b_init[8] =
        { 12.5f, 52.125f, 17.5f, 13.982f, -4.75f, 3.0625f, 7.875f, -48.1875f };

    // Operands and results are explicitly aligned on a 32-byte boundary.
    __attribute__ ((aligned(32))) YmmVal a;
    __attribute__ ((aligned(32))) YmmVal b;
    __attribute__ ((aligned(32))) YmmVal c[6];

    for (int lane = 0; lane < 8; lane++)
    {
        a.r32[lane] = a_init[lane];
        b.r32[lane] = b_init[lane];
    }

    AvxPfpArithmeticFloat(&a, &b, c);

    printf("Results for AvxPfpArithmeticFloat()\n\n");
    printf(" i a b Add Sub Mul Div Abs Neg\n");
    printf("--------------------------------------------------------------------------\n");

    for (int lane = 0; lane < 8; lane++)
    {
        printf("%2d ", lane);
        printf("%8.3f ", a.r32[lane]);
        printf("%8.3f ", b.r32[lane]);

        // One column per result vector.
        for (int col = 0; col < 6; col++)
        {
            printf("%8.3f ", c[col].r32[lane]);
        }

        printf("\n");
    }
}
// Runs AvxPfpArithmeticDouble() on two operands of four double-precision
// values each and prints one table row per element: the index, both inputs,
// and the five result columns returned in c[0]..c[4].
void AvxPfpArithmeticDoubleCpp(void)
{
    // Input values, one entry per 64-bit element.
    static const double a_init[4] = { 12.0, 13.5, 18.75, 5.0 };
    static const double b_init[4] = { 0.875, -125.25, 72.5, -98.375 };

    // Operands and results are explicitly aligned on a 32-byte boundary.
    __attribute__ ((aligned(32))) YmmVal a;
    __attribute__ ((aligned(32))) YmmVal b;
    __attribute__ ((aligned(32))) YmmVal c[5];

    for (int lane = 0; lane < 4; lane++)
    {
        a.r64[lane] = a_init[lane];
        b.r64[lane] = b_init[lane];
    }

    AvxPfpArithmeticDouble(&a, &b, c);

    printf("\n\nResults for AvxPfpArithmeticDouble()\n\n");
    printf(" i a b Min Max Sqrt a HorAdd HorSub\n");
    printf("--------------------------------------------------------------------------\n");

    for (int lane = 0; lane < 4; lane++)
    {
        printf("%2d ", lane);
        printf("%9.3lf ", a.r64[lane]);
        printf("%9.3lf ", b.r64[lane]);

        // One column per result vector.
        for (int col = 0; col < 5; col++)
        {
            printf("%9.3lf ", c[col].r64[lane]);
        }

        printf("\n");
    }
}
// Program entry point: runs the single-precision demonstration and then the
// double-precision one. The command-line arguments are not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    AvxPfpArithmeticFloatCpp();
    AvxPfpArithmeticDoubleCpp();

    return 0;
}
avxpackedfloatingpointarithmetic.asm
; Name: avxpackedfloatingpointarithmetic.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o avxpackedfloatingpointarithmetic.o avxpackedfloatingpointarithmetic.asm
; g++ -m32 -o avxpackedfloatingpointarithmetic avxpackedfloatingpointarithmetic.o main.o ../../commonfiles/ymmval.o
;
; Source: Modern x86 Assembly Language Programming p. 378
; Symbols exported to the C++ caller
global AvxPfpArithmeticFloat
global AvxPfpArithmeticDouble

; Declare that this object does not need an executable stack. Without this
; note the GNU linker marks the whole program's stack executable (and recent
; versions warn about it). Only emitted for the ELF32 output format.
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

section .data
; Each mask below is 32 bytes (one YMM register) wide. Aligning on a 32-byte
; boundary keeps every mask inside a single aligned 32-byte block, so a load
; never straddles a cache line and aligned loads would also be legal.
align 32
; Mask value for packed SPFP absolute value (clears the sign bit of each element)
AbsMask dd 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff
        dd 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff
; Mask value for packed SPFP negation (only the sign bit of each element is set)
NegMask dd 0x80000000,0x80000000,0x80000000,0x80000000
        dd 0x80000000,0x80000000,0x80000000,0x80000000

section .text
; extern "C" void AvxPfpArithmeticFloat(const YmmVal* a, const YmmVal* b, YmmVal c[6]);
;
; Description: Applies the common packed single-precision floating-point
;              (SPFP) arithmetic instructions to two YMM operands and
;              stores each result in its own element of c:
;                c[0] = a + b      c[1] = a - b
;                c[2] = a * b      c[3] = a / b
;                c[4] = fabs(a)    c[5] = -a
;
; Notes:       a, b and c are accessed with vmovaps, so all three must be
;              aligned on a 32-byte boundary.
;
; Requires:    AVX

; Stack-frame locations of the three pointer arguments
%define arg_a [ebp+8]
%define arg_b [ebp+12]
%define arg_c [ebp+16]

AvxPfpArithmeticFloat:
        push    ebp
        mov     ebp,esp

; Fetch the argument pointers and load both source operands
        mov     eax,arg_a               ;eax = ptr to a
        mov     ecx,arg_b               ;ecx = ptr to b
        mov     edx,arg_c               ;edx = ptr to c
        vmovaps ymm0,[eax]              ;ymm0 = a
        vmovaps ymm1,[ecx]              ;ymm1 = b

; Element-wise arithmetic on a and b
        vaddps  ymm2,ymm0,ymm1          ;ymm2 = a + b
        vsubps  ymm3,ymm0,ymm1          ;ymm3 = a - b
        vmulps  ymm4,ymm0,ymm1          ;ymm4 = a * b
        vdivps  ymm5,ymm0,ymm1          ;ymm5 = a / b

; Sign-bit manipulation of a: AND with AbsMask clears the sign bits,
; XOR with NegMask flips them
        vmovups ymm7,[AbsMask]          ;ymm7 = AbsMask
        vandps  ymm6,ymm0,ymm7          ;ymm6 = packed fabs
        vxorps  ymm7,ymm0,[NegMask]     ;ymm7 = packed neg.

; Write the six results to c[0] .. c[5] (32 bytes per element)
        vmovaps [edx],ymm2
        vmovaps [edx+32],ymm3
        vmovaps [edx+64],ymm4
        vmovaps [edx+96],ymm5
        vmovaps [edx+128],ymm6
        vmovaps [edx+160],ymm7

; Clear the upper 128 bits of every YMM register so that later x86-SSE
; code does not pay an x86-AVX to x86-SSE transition penalty
        vzeroupper

        pop     ebp
        ret
; extern "C" void AvxPfpArithmeticDouble(const YmmVal* a, const YmmVal* b, YmmVal c[5]);
;
; Description: Applies the common packed double-precision floating-point
;              (DPFP) arithmetic instructions to two YMM operands and
;              stores each result in its own element of c:
;                c[0] = min(a, b)      c[1] = max(a, b)
;                c[2] = sqrt(a)
;                c[3] = horizontal add of a and b
;                c[4] = horizontal subtract of a and b
;
; Notes:       a, b and c are accessed with vmovapd, so all three must be
;              aligned on a 32-byte boundary.
;
; Requires:    AVX

; Stack-frame locations of the three pointer arguments
%define arg_a [ebp+8]
%define arg_b [ebp+12]
%define arg_c [ebp+16]

AvxPfpArithmeticDouble:
        push    ebp
        mov     ebp,esp

; Fetch the argument pointers and load both source operands
        mov     eax,arg_a               ;eax = ptr to a
        mov     ecx,arg_b               ;ecx = ptr to b
        mov     edx,arg_c               ;edx = ptr to c
        vmovapd ymm0,[eax]              ;ymm0 = a
        vmovapd ymm1,[ecx]              ;ymm1 = b

; Compute each result into a scratch register and store it straight away
; (32 bytes per element of c)
        vminpd  ymm2,ymm0,ymm1          ;packed minimum
        vmovapd [edx],ymm2              ;c[0]

        vmaxpd  ymm2,ymm0,ymm1          ;packed maximum
        vmovapd [edx+32],ymm2           ;c[1]

        vsqrtpd ymm2,ymm0               ;packed square root of a
        vmovapd [edx+64],ymm2           ;c[2]

        vhaddpd ymm2,ymm0,ymm1          ;horizontal addition
        vmovapd [edx+96],ymm2           ;c[3]

        vhsubpd ymm2,ymm0,ymm1          ;horizontal subtraction
        vmovapd [edx+128],ymm2          ;c[4]

; Clear the upper 128 bits of every YMM register so that later x86-SSE
; code does not pay an x86-AVX to x86-SSE transition penalty
        vzeroupper

        pop     ebp
        ret
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o avxpackedfloatingpointarithmetic.o avxpackedfloatingpointarithmetic.asm
g++ -m32 -o avxpackedfloatingpointarithmetic avxpackedfloatingpointarithmetic.o main.o ../../commonfiles/ymmval.o