main.cpp
#include "../../commonfiles/xmmval.h"
#define _USE_MATH_DEFINES
#include <math.h>

extern "C" void SsePackedFpMath32(const XmmVal* a, const XmmVal* b, XmmVal c[8]);
extern "C" void SsePackedFpMath64(const XmmVal* a, const XmmVal* b, XmmVal c[8]);

// Builds two packed single-precision operands, passes them to the assembly
// routine SsePackedFpMath32, then prints both operands followed by the eight
// result vectors the routine wrote into c[0..7].
void SsePackedFpMath32Cpp(void)
{
    // The assembly routine reads and writes these objects with aligned
    // 16-byte moves, so 16-byte alignment is requested explicitly.
    alignas(16) XmmVal a;
    alignas(16) XmmVal b;
    alignas(16) XmmVal c[8];
    char buff[256];

    // Operand values, one entry per packed lane.
    const float aInit[4] = { 36.0f, (float)(1.0 / 32.0), 2.0f, 42.0f };
    const float bInit[4] = { -(float)(1.0 / 9.0), 64.0f, -0.0625f, 8.666667f };

    for (int lane = 0; lane < 4; ++lane)
    {
        a.r32[lane] = aInit[lane];
        b.r32[lane] = bInit[lane];
    }

    SsePackedFpMath32(&a, &b, c);

    printf("\nResults for SsePackedFpMath32_\n");
    printf("a:         %s\n", a.ToString_r32(buff, sizeof(buff)));
    printf("b:         %s\n", b.ToString_r32(buff, sizeof(buff)));
    printf("\n");

    // One format string per result slot, in the order the routine stores them.
    static const char* const resultFormats[8] =
    {
        "addps:     %s\n",
        "subps:     %s\n",
        "mulps:     %s\n",
        "divps:     %s\n",
        "absps a:   %s\n",
        "sqrtps a:  %s\n",
        "minps:     %s\n",
        "maxps:     %s\n",
    };

    for (int slot = 0; slot < 8; ++slot)
    {
        printf(resultFormats[slot], c[slot].ToString_r32(buff, sizeof(buff)));
    }
}

// Builds two packed double-precision operands, passes them to the assembly
// routine SsePackedFpMath64, then prints both operands followed by the eight
// result vectors the routine wrote into c[0..7].
void SsePackedFpMath64Cpp(void)
{
    // The assembly routine reads and writes these objects with aligned
    // 16-byte moves, so 16-byte alignment is requested explicitly.
    alignas(16) XmmVal a;
    alignas(16) XmmVal b;
    alignas(16) XmmVal c[8];
    char buff[256];

    // Operand values, one entry per packed lane.
    const double aInit[2] = { 2.0, M_PI };
    const double bInit[2] = { M_E, -M_1_PI };

    for (int lane = 0; lane < 2; ++lane)
    {
        a.r64[lane] = aInit[lane];
        b.r64[lane] = bInit[lane];
    }

    SsePackedFpMath64(&a, &b, c);

    printf("\nResults for SsePackedFpMath64_\n");
    printf("a:         %s\n", a.ToString_r64(buff, sizeof(buff)));
    printf("b:         %s\n", b.ToString_r64(buff, sizeof(buff)));
    printf("\n");

    // One format string per result slot, in the order the routine stores them.
    static const char* const resultFormats[8] =
    {
        "addpd:     %s\n",
        "subpd:     %s\n",
        "mulpd:     %s\n",
        "divpd:     %s\n",
        "abspd a:   %s\n",
        "sqrtpd a:  %s\n",
        "minpd:     %s\n",
        "maxpd:     %s\n",
    };

    for (int slot = 0; slot < 8; ++slot)
    {
        printf(resultFormats[slot], c[slot].ToString_r64(buff, sizeof(buff)));
    }
}

// Program entry point: runs the single-precision demo, then the
// double-precision demo. Command-line arguments are not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    SsePackedFpMath32Cpp();
    SsePackedFpMath64Cpp();

    return 0;
}
ssepackedfloatingpointarithmetic.asm
; Name:     ssepackedfloatingpointarithmetic.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o -std=c++11
;           nasm -f elf32 -o ssepackedfloatingpointarithmetic.o ssepackedfloatingpointarithmetic.asm
;           g++ -m32 -o ssepackedfloatingpointarithmetic ssepackedfloatingpointarithmetic.o main.o ../../commonfiles/xmmval.o
;
; Source:   Modern x86 Assembly Language Programming p. 238

global SsePackedFpMath32
global SsePackedFpMath64

; Read-only constants. They are never written, so they belong in .rodata
; rather than writable .data. The section itself is declared 16-byte aligned
; because the masks are used as aligned 16-byte memory operands of
; andps/andpd.
section .rodata align=16

; Mask values used to calculate floating-point absolute values:
; AND-ing with the mask clears the most significant bit of every element.
align 16

    Pfp32Abs:   dd 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff  ; 4 x 32-bit lanes
    Pfp64Abs:   dq 0x7fffffffffffffff,0x7fffffffffffffff        ; 2 x 64-bit lanes

section .text

;-----------------------------------------------------------------------
; extern "C" void SsePackedFpMath32(const XmmVal* a, const XmmVal* b, XmmVal c[8]);
;
; Description:  Element-wise math on the packed single-precision
;               floating-point values of *a and *b.
;
; In:           [ebp+8]  = pointer 'a'
;               [ebp+12] = pointer 'b'
;               [ebp+16] = pointer 'c' (eight 16-byte result slots)
; Out:          c[0] = a + b      c[4] = |a|
;               c[1] = a - b      c[5] = sqrt(a)
;               c[2] = a * b      c[6] = min(a, b)
;               c[3] = a / b      c[7] = max(a, b)
; Clobbers:     eax, ecx, edx, xmm0, xmm1, xmm2
; Alignment:    every memory operand is accessed with movaps/andps and
;               must therefore be 16-byte aligned.
;
; Requires:     SSE
;-----------------------------------------------------------------------

%define ARG_A           [ebp+8]
%define ARG_B           [ebp+12]
%define ARG_C           [ebp+16]
%define XMMVAL_SIZE     16                      ; byte stride between result slots

SsePackedFpMath32:
        push    ebp
        mov     ebp, esp

        ; Fetch the three pointer arguments, then load both operands.
        mov     eax, ARG_A                      ; eax = 'a'
        mov     ecx, ARG_B                      ; ecx = 'b'
        mov     edx, ARG_C                      ; edx = 'c'
        movaps  xmm0, [eax]                     ; xmm0 = *a (kept intact until the last step)
        movaps  xmm1, [ecx]                     ; xmm1 = *b

        ; c[0] = a + b
        movaps  xmm2, xmm0
        addps   xmm2, xmm1
        movaps  [edx + 0*XMMVAL_SIZE], xmm2

        ; c[1] = a - b
        movaps  xmm2, xmm0
        subps   xmm2, xmm1
        movaps  [edx + 1*XMMVAL_SIZE], xmm2

        ; c[2] = a * b
        movaps  xmm2, xmm0
        mulps   xmm2, xmm1
        movaps  [edx + 2*XMMVAL_SIZE], xmm2

        ; c[3] = a / b
        movaps  xmm2, xmm0
        divps   xmm2, xmm1
        movaps  [edx + 3*XMMVAL_SIZE], xmm2

        ; c[4] = |a|  (mask off the top bit of each element on a scratch copy)
        movaps  xmm2, xmm0
        andps   xmm2, [Pfp32Abs]
        movaps  [edx + 4*XMMVAL_SIZE], xmm2

        ; c[5] = sqrt(a)
        sqrtps  xmm2, xmm0
        movaps  [edx + 5*XMMVAL_SIZE], xmm2

        ; c[6] = min(a, b)
        movaps  xmm2, xmm0
        minps   xmm2, xmm1
        movaps  [edx + 6*XMMVAL_SIZE], xmm2

        ; c[7] = max(a, b)  (last use of 'a', so xmm0 may be overwritten)
        maxps   xmm0, xmm1
        movaps  [edx + 7*XMMVAL_SIZE], xmm0

        pop     ebp
        ret

;-----------------------------------------------------------------------
; extern "C" void SsePackedFpMath64(const XmmVal* a, const XmmVal* b, XmmVal c[8]);
;
; Description:  Element-wise math on the packed double-precision
;               floating-point values of *a and *b.
;
; In:           [ebp+8]  = pointer 'a'
;               [ebp+12] = pointer 'b'
;               [ebp+16] = pointer 'c' (eight 16-byte result slots)
; Out:          c[0] = a + b      c[4] = |a|
;               c[1] = a - b      c[5] = sqrt(a)
;               c[2] = a * b      c[6] = min(a, b)
;               c[3] = a / b      c[7] = max(a, b)
; Clobbers:     eax, ecx, edx, xmm0, xmm1, xmm2
; Alignment:    every memory operand is accessed with movapd/andpd and
;               must therefore be 16-byte aligned.
;
; Requires:     SSE2
;-----------------------------------------------------------------------

%define ARG_A           [ebp+8]
%define ARG_B           [ebp+12]
%define ARG_C           [ebp+16]
%define XMMVAL_SIZE     16                      ; byte stride between result slots

SsePackedFpMath64:
        push    ebp
        mov     ebp, esp

        ; Fetch the three pointer arguments, then load both operands.
        mov     eax, ARG_A                      ; eax = 'a'
        mov     ecx, ARG_B                      ; ecx = 'b'
        mov     edx, ARG_C                      ; edx = 'c'
        movapd  xmm0, [eax]                     ; xmm0 = *a (kept intact until the last step)
        movapd  xmm1, [ecx]                     ; xmm1 = *b

        ; c[0] = a + b
        movapd  xmm2, xmm0
        addpd   xmm2, xmm1
        movapd  [edx + 0*XMMVAL_SIZE], xmm2

        ; c[1] = a - b
        movapd  xmm2, xmm0
        subpd   xmm2, xmm1
        movapd  [edx + 1*XMMVAL_SIZE], xmm2

        ; c[2] = a * b
        movapd  xmm2, xmm0
        mulpd   xmm2, xmm1
        movapd  [edx + 2*XMMVAL_SIZE], xmm2

        ; c[3] = a / b
        movapd  xmm2, xmm0
        divpd   xmm2, xmm1
        movapd  [edx + 3*XMMVAL_SIZE], xmm2

        ; c[4] = |a|
        ; The mask is applied to the scratch copy in xmm2, not to xmm0:
        ; the stored value must be the masked one, and xmm0 has to keep the
        ; unmodified *a for the sqrt/min/max steps that follow.
        movapd  xmm2, xmm0
        andpd   xmm2, [Pfp64Abs]
        movapd  [edx + 4*XMMVAL_SIZE], xmm2

        ; c[5] = sqrt(a)
        sqrtpd  xmm2, xmm0
        movapd  [edx + 5*XMMVAL_SIZE], xmm2

        ; c[6] = min(a, b)
        movapd  xmm2, xmm0
        minpd   xmm2, xmm1
        movapd  [edx + 6*XMMVAL_SIZE], xmm2

        ; c[7] = max(a, b)  (last use of 'a', so xmm0 may be overwritten)
        maxpd   xmm0, xmm1
        movapd  [edx + 7*XMMVAL_SIZE], xmm0

        pop     ebp
        ret
build
g++ -c -m32 main.cpp -o main.o -std=c++11
nasm -f elf32 -o ssepackedfloatingpointarithmetic.o ssepackedfloatingpointarithmetic.asm
g++ -m32 -o ssepackedfloatingpointarithmetic ssepackedfloatingpointarithmetic.o main.o ../../commonfiles/xmmval.o