main.cpp
#include "../../commonfiles/xmmval.h"
#include <stdio.h>

extern "C" void SsePiAddI16(const XmmVal* a, const XmmVal* b, XmmVal c[2]);
extern "C" void SsePiSubI32(const XmmVal* a, const XmmVal* b, XmmVal* c);
extern "C" void SsePiMul32(const XmmVal* a, const XmmVal* b, XmmVal c[2]);

// Driver for SsePiAddI16: fills two vectors of eight signed 16-bit lanes,
// calls the assembly routine, and prints both result vectors.
// c[0] receives the wraparound sums, c[1] the saturated sums.
void SsePiAddI16Cpp(void)
{
    // Input lanes. Lanes 2, 3, 6 and 7 produce sums outside the signed
    // 16-bit range, so the two result vectors differ in those positions.
    static const short lhs[8] = { 10, 200, 30, -32766, 50, 60, 32000, -32000 };
    static const short rhs[8] = { 100, -200, 32760, -400, 500, -600, 1200, -950 };

    // The assembly routine accesses all three operands with movdqa, so
    // 16-byte alignment is requested for each of them.
    __attribute__ ((aligned(16))) XmmVal a;
    __attribute__ ((aligned(16))) XmmVal b;
    __attribute__ ((aligned(16))) XmmVal c[2];

    char buff[256];     // scratch buffer reused by every ToString_* call

    for (int lane = 0; lane < 8; ++lane)
    {
        a.i16[lane] = lhs[lane];
        b.i16[lane] = rhs[lane];
    }

    SsePiAddI16(&a, &b, c);

    printf("\nResults for SsePiAddI16\n");

    // First result vector
    printf("a:    %s\n", a.ToString_i16(buff, sizeof(buff)));
    printf("b:    %s\n", b.ToString_i16(buff, sizeof(buff)));
    printf("c[0]: %s\n", c[0].ToString_i16(buff, sizeof(buff)));
    printf("\n");

    // Second result vector; the inputs are repeated for side-by-side reading
    printf("a:    %s\n", a.ToString_i16(buff, sizeof(buff)));
    printf("b:    %s\n", b.ToString_i16(buff, sizeof(buff)));
    printf("c[1]: %s\n", c[1].ToString_i16(buff, sizeof(buff)));
}

// Driver for SsePiSubI32: fills two vectors of four signed 32-bit lanes,
// calls the assembly routine to compute a - b, and prints the operands
// and the result.
void SsePiSubI32Cpp(void)
{
    static const int minuend[4]    = { 800, 500, 1000, 900 };
    static const int subtrahend[4] = { 250, -2000, -40, 1200 };

    // Both sources are read with alignment-checked instructions.
    __attribute__ ((aligned(16))) XmmVal a;
    __attribute__ ((aligned(16))) XmmVal b;

    // Only 8-byte alignment is requested for the destination, so it may not
    // sit on a 16-byte boundary; the assembly routine stores it with movdqu.
    __attribute__ ((aligned(8))) XmmVal c;

    char buff[256];     // scratch buffer reused by every ToString_* call

    for (int lane = 0; lane < 4; ++lane)
    {
        a.i32[lane] = minuend[lane];
        b.i32[lane] = subtrahend[lane];
    }

    SsePiSubI32(&a, &b, &c);

    printf("\nResults for SsePiSubI32\n");
    printf("a: %s\n", a.ToString_i32(buff, sizeof(buff)));
    printf("b: %s\n", b.ToString_i32(buff, sizeof(buff)));
    printf("c: %s\n", c.ToString_i32(buff, sizeof(buff)));
}

// Driver for SsePiMul32: fills two vectors of four signed 32-bit lanes,
// calls the assembly routine, and prints both result vectors.
// c[0] holds the pmulld result (shown as 32-bit lanes) and c[1] holds the
// pmuldq result (shown as 64-bit lanes).
void SsePiMul32Cpp(void)
{
    static const int lhs[4] = { 10, 20, -30, -40 };
    static const int rhs[4] = { 100, -200, 300, -400 };

    // The assembly routine accesses all three operands with movdqa, so
    // 16-byte alignment is requested for each of them.
    __attribute__ ((aligned(16))) XmmVal a;
    __attribute__ ((aligned(16))) XmmVal b;
    __attribute__ ((aligned(16))) XmmVal c[2];

    char buff[256];     // scratch buffer reused by every ToString_* call

    for (int lane = 0; lane < 4; ++lane)
    {
        a.i32[lane] = lhs[lane];
        b.i32[lane] = rhs[lane];
    }

    SsePiMul32(&a, &b, c);

    printf("\nResults for SsePiMul32\n");

    // 32-bit products
    printf("a:    %s\n", a.ToString_i32(buff, sizeof(buff)));
    printf("b:    %s\n", b.ToString_i32(buff, sizeof(buff)));
    printf("c[0]: %s\n", c[0].ToString_i32(buff, sizeof(buff)));
    printf("\n");

    // 64-bit products; the inputs are repeated for side-by-side reading
    printf("a:    %s\n", a.ToString_i32(buff, sizeof(buff)));
    printf("b:    %s\n", b.ToString_i32(buff, sizeof(buff)));
    printf("c[1]: %s\n", c[1].ToString_i64(buff, sizeof(buff)));
}

// Program entry point: runs each packed-integer demonstration once, in
// order (add, subtract, multiply). Command-line arguments are not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    void (* const demos[])(void) = {
        SsePiAddI16Cpp,
        SsePiSubI32Cpp,
        SsePiMul32Cpp
    };

    for (unsigned i = 0; i < sizeof(demos) / sizeof(demos[0]); ++i)
        demos[i]();

    return 0;
}
ssepackedintegerfundamentals.asm
; Name:     ssepackedintegerfundamentals.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o ssepackedintegerfundamentals.o ssepackedintegerfundamentals.asm
;           g++ -m32 -o ssepackedintegerfundamentals ssepackedintegerfundamentals.o main.o ../../commonfiles/xmmval.o
;
; Source:   Modern x86 Assembly Language Programming p. 273

global  SsePiAddI16
global  SsePiSubI32
global  SsePiMul32

section .text

; extern "C" void SsePiAddI16(const XmmVal* a, const XmmVal* b, XmmVal c[2]);
;
; Description:  The following function demonstrates packed signed word
;               addition using wraparound and saturated modes.
;
; Requires:     SSE2

; Stack arguments, addressed through the frame pointer established below.
%define ADD16_SRC_A     [ebp+8]         ; const XmmVal* a
%define ADD16_SRC_B     [ebp+12]        ; const XmmVal* b
%define ADD16_DST_C     [ebp+16]        ; XmmVal c[2]

;-----------------------------------------------------------------------
; In:    three pointer arguments on the stack (see macros above);
;        every access uses movdqa, so all three must be 16-byte aligned
; Out:   c[0] = a + b per word, wraparound
;        c[1] = a + b per word, signed saturation
; Clobb: eax, ecx, edx, xmm0, xmm1, xmm2
;-----------------------------------------------------------------------
SsePiAddI16:
        push    ebp
        mov     ebp, esp

; Fetch the three pointer arguments
        mov     ecx, ADD16_SRC_A        ; ecx -> a
        mov     edx, ADD16_SRC_B        ; edx -> b
        mov     eax, ADD16_DST_C        ; eax -> c

; Load the operands; a is needed twice, once per addition mode
        movdqa  xmm1, [ecx]             ; xmm1 = a
        movdqa  xmm0, xmm1              ; xmm0 = a (copy for the saturated add)
        movdqa  xmm2, [edx]             ; xmm2 = b

; Same inputs, two overflow policies
        paddw   xmm1, xmm2              ; xmm1 = a + b, wraparound
        paddsw  xmm0, xmm2              ; xmm0 = a + b, saturated

; Store both result vectors
        movdqa  [eax], xmm1             ; c[0] = wraparound sums
        movdqa  [eax+16], xmm0          ; c[1] = saturated sums

        pop     ebp
        ret

; extern "C" void SsePiSubI32(const XmmVal* a, const XmmVal* b, XmmVal* c);
;
; Description:  The following function demonstrates packed signed
;               doubleword subtraction.
;
; Requires:     SSE2

; Stack arguments, addressed through the frame pointer established below.
%define SUB32_SRC_A     [ebp+8]         ; const XmmVal* a
%define SUB32_SRC_B     [ebp+12]        ; const XmmVal* b
%define SUB32_DST_C     [ebp+16]        ; XmmVal* c

;-----------------------------------------------------------------------
; In:    three pointer arguments on the stack (see macros above);
;        a and b must be 16-byte aligned, c may have any alignment
; Out:   *c = a - b per doubleword
; Clobb: eax, ecx, edx, xmm0
;-----------------------------------------------------------------------
SsePiSubI32:
        push    ebp
        mov     ebp, esp

; Fetch the three pointer arguments
        mov     edx, SUB32_SRC_A        ; edx -> a
        mov     eax, SUB32_SRC_B        ; eax -> b
        mov     ecx, SUB32_DST_C        ; ecx -> c

; Subtract b from a, four doublewords at a time
        movdqa  xmm0, [edx]             ; xmm0 = a (aligned load)
        psubd   xmm0, [eax]             ; xmm0 = a - b (memory operand must be aligned)

; The destination is not assumed to be aligned, hence the unaligned store
        movdqu  [ecx], xmm0             ; *c = a - b

        pop     ebp
        ret

; extern "C" void SsePiMul32(const XmmVal* a, const XmmVal* b, XmmVal c[2]);
;
; Description:  The following function demonstrates packed doubleword
;               multiplication.
;
; Requires:     SSE4.1

; Stack arguments, addressed through the frame pointer established below.
%define MUL32_SRC_A     [ebp+8]         ; const XmmVal* a
%define MUL32_SRC_B     [ebp+12]        ; const XmmVal* b
%define MUL32_DST_C     [ebp+16]        ; XmmVal c[2]

;-----------------------------------------------------------------------
; In:    three pointer arguments on the stack (see macros above);
;        every access uses movdqa, so all three must be 16-byte aligned
; Out:   c[0] = pmulld result (signed doubleword multiply, low result)
;        c[1] = pmuldq result (signed doubleword multiply, quadword result)
; Clobb: eax, ecx, edx, xmm0, xmm1, xmm2
;-----------------------------------------------------------------------
SsePiMul32:
        push    ebp
        mov     ebp, esp

; Fetch the three pointer arguments
        mov     ecx, MUL32_SRC_A        ; ecx -> a
        mov     edx, MUL32_SRC_B        ; edx -> b
        mov     eax, MUL32_DST_C        ; eax -> c

; Load the operands; a is needed twice, once per multiply form
        movdqa  xmm1, [ecx]             ; xmm1 = a
        movdqa  xmm2, [edx]             ; xmm2 = b
        movdqa  xmm0, xmm1              ; xmm0 = a (copy for the quadword multiply)

; Same inputs, two result widths
        pmulld  xmm1, xmm2              ; xmm1 = doubleword products (low result)
        pmuldq  xmm0, xmm2              ; xmm0 = quadword products

; Store both result vectors
        movdqa  [eax], xmm1             ; c[0] = pmulld result
        movdqa  [eax+16], xmm0          ; c[1] = pmuldq result

        pop     ebp
        ret
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o ssepackedintegerfundamentals.o ssepackedintegerfundamentals.asm
g++ -m32 -o ssepackedintegerfundamentals ssepackedintegerfundamentals.o main.o ../../commonfiles/xmmval.o