main.cpp
#include <stdio.h>
#include "../../commonfiles/ymmval.h"

extern "C" void AvxPiI16(YmmVal* a, YmmVal* b, YmmVal c[6]);
extern "C" void AvxPiI32(YmmVal* a, YmmVal* b, YmmVal c[5]);

// Fills two 256-bit operands with sixteen 16-bit test values each, hands
// them to the assembly routine AvxPiI16(), and prints one table row per
// lane showing both inputs followed by the six result vectors.
void AvxPiI16Cpp(void)
{
    // Per-lane input values for operand a.
    static const short aInit[16] =
    {
        10,     20,    3000,  4000,
        30000,  6000,  2000,  4000,     // lanes 4-5: add overflow, 6-7: sub overflow
        4000,   3600,  6000,  -20000,
        -25000, 8000,  3,     -15000    // lanes 12-13: add overflow, 14-15: sub overflow
    };

    // Per-lane input values for operand b (paired with aInit lane by lane).
    static const short bInit[16] =
    {
        1000,   2000,  30,     40,
        3000,   32000, -31000, -30000,
        -2500,  -1200, 9000,   -20000,
        -27000, 28700, -32766, 24000
    };

    // The operands and the result array are requested with 32-byte alignment.
    __attribute__((aligned(32)))  YmmVal a;
    __attribute__((aligned(32)))  YmmVal b;
    __attribute__((aligned(32)))  YmmVal c[6];

    for (int lane = 0; lane < 16; lane++)
    {
        a.i16[lane] = aInit[lane];
        b.i16[lane] = bInit[lane];
    }

    AvxPiI16(&a, &b, c);

    printf("\nResults for AvxPiI16()\n\n");
    printf("i        a       b   vpaddw vpaddsw  vpsubw vpsubsw  vpminsw vpmaxsw\n");
    printf("--------------------------------------------------------------------\n");

    const char* fs = "%7d ";

    for (int lane = 0; lane < 16; lane++)
    {
        // Row label, then the two inputs.
        printf("%2d ", lane);
        printf(fs, a.i16[lane]);
        printf(fs, b.i16[lane]);

        // One column per result vector, in the order AvxPiI16() stored them.
        for (int col = 0; col < 6; col++)
            printf(fs, c[col].i16[lane]);

        printf("\n");
    }
}

// Fills two 256-bit operands with eight 32-bit test values each, hands
// them to the assembly routine AvxPiI32(), and prints one table row per
// lane showing both inputs followed by the five result vectors.
void AvxPiI32Cpp(void)
{
    // Per-lane input values; b also serves as the per-lane shift count
    // for the two variable-shift columns.
    static const int aInit[8] = { 64, 1024, -2048, 8192, -256, 4096, 16, 512 };
    static const int bInit[8] = {  4,    5,     2,    5,    8,    7,  3,   6 };

    // The operands and the result array are requested with 32-byte alignment.
    __attribute__((aligned(32)))  YmmVal a;
    __attribute__((aligned(32)))  YmmVal b;
    __attribute__((aligned(32)))  YmmVal c[5];

    for (int lane = 0; lane < 8; lane++)
    {
        a.i32[lane] = aInit[lane];
        b.i32[lane] = bInit[lane];
    }

    AvxPiI32(&a, &b, c);

    printf("\nResults for AvxPiI32()\n\n");
    printf("i         a        b    vphaddd  vphsubd  vpmulld  vpsllvd  vpsravd\n");
    printf("-------------------------------------------------------------------\n");

    const char* fs = "%8d ";

    for (int lane = 0; lane < 8; lane++)
    {
        // Row label, then the two inputs.
        printf("%2d ", lane);
        printf(fs, a.i32[lane]);
        printf(fs, b.i32[lane]);

        // One column per result vector, in the order AvxPiI32() stored them.
        for (int col = 0; col < 5; col++)
            printf(fs, c[col].i32[lane]);

        printf("\n");
    }
}

// Program entry point: runs the 16-bit demo followed by the 32-bit demo.
int main(int argc, char* argv[])
{
    // Command-line arguments are not used by this program.
    (void)argc;
    (void)argv;

    AvxPiI16Cpp();
    AvxPiI32Cpp();

    return 0;
}
avxpackedintegerarithmetic.asm
; Name:     avxpackedintegerarithmetic.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o -std=c++11
;           nasm -f elf32 -o avxpackedintegerarithmetic.o avxpackedintegerarithmetic.asm
;           g++ -m32 -o avxpackedintegerarithmetic avxpackedintegerarithmetic.o main.o ../../commonfiles/ymmval.o
;
; Source:   Modern x86 Assembly Language Programming p. 406

global AvxPiI16
global AvxPiI32

section .text

; extern "C" void AvxPiI16(YmmVal* a, YmmVal* b, YmmVal c[6]);
;
; Description:  The following function illustrates use of various
;               packed 16-bit integer arithmetic instructions
;               using 256-bit wide operands.
;
; Convention:   32-bit, all arguments on the stack (as called from the
;               g++ -m32 driver).
; In:           [ebp+8]  = a   pointer to first 256-bit operand  (read)
;               [ebp+12] = b   pointer to second 256-bit operand (read)
;               [ebp+16] = c   pointer to six 256-bit result slots (written)
; Out:          c[0] = vpaddw    c[1] = vpaddsw   c[2] = vpsubw
;               c[3] = vpsubsw   c[4] = vpminsw   c[5] = vpmaxsw
; Clobbers:     eax, ecx, edx, ymm0-ymm7
; Alignment:    None required. Unaligned moves (vmovdqu) are used so that
;               a caller passing a pointer that is not 32-byte aligned does
;               not take a general-protection fault.
;
; Requires:     AVX2

; Function-prefixed macro names are used on purpose: single-letter macros
; such as 'a' would rewrite that token everywhere later in the file.
%define I16_ARG_A   [ebp+8]
%define I16_ARG_B   [ebp+12]
%define I16_ARG_C   [ebp+16]
%define YMM_BYTES   32                  ;size of one YmmVal result slot

AvxPiI16:
    push    ebp
    mov     ebp,esp

; Load argument values
    mov     eax,I16_ARG_A               ;eax = ptr to a
    mov     ecx,I16_ARG_B               ;ecx = ptr to b
    mov     edx,I16_ARG_C               ;edx = ptr to c

; Load a and b (no alignment requirement)
    vmovdqu ymm0,[eax]                  ;ymm0 = a
    vmovdqu ymm1,[ecx]                  ;ymm1 = b

; Perform packed arithmetic operations on the sixteen word lanes
    vpaddw  ymm2,ymm0,ymm1              ;a + b, wraparound
    vpaddsw ymm3,ymm0,ymm1              ;a + b, signed saturation
    vpsubw  ymm4,ymm0,ymm1              ;a - b, wraparound
    vpsubsw ymm5,ymm0,ymm1              ;a - b, signed saturation
    vpminsw ymm6,ymm0,ymm1              ;signed min(a, b)
    vpmaxsw ymm7,ymm0,ymm1              ;signed max(a, b)

; Save results, one YmmVal slot per instruction
    vmovdqu [edx+0*YMM_BYTES],ymm2      ;c[0] = vpaddw result
    vmovdqu [edx+1*YMM_BYTES],ymm3      ;c[1] = vpaddsw result
    vmovdqu [edx+2*YMM_BYTES],ymm4      ;c[2] = vpsubw result
    vmovdqu [edx+3*YMM_BYTES],ymm5      ;c[3] = vpsubsw result
    vmovdqu [edx+4*YMM_BYTES],ymm6      ;c[4] = vpminsw result
    vmovdqu [edx+5*YMM_BYTES],ymm7      ;c[5] = vpmaxsw result

; Clear the upper ymm halves before returning to compiler-generated code
    vzeroupper
    pop     ebp
    ret

; extern "C" void AvxPiI32(YmmVal* a, YmmVal* b, YmmVal c[5]);
;
; Description:  The following function illustrates use of various
;               packed 32-bit integer arithmetic instructions
;               using 256-bit wide operands.
;
; Convention:   32-bit, all arguments on the stack (as called from the
;               g++ -m32 driver).
; In:           [ebp+8]  = a   pointer to first 256-bit operand  (read)
;               [ebp+12] = b   pointer to second 256-bit operand (read);
;                              also the per-lane shift counts
;               [ebp+16] = c   pointer to five 256-bit result slots (written)
; Out:          c[0] = vphaddd   c[1] = vphsubd   c[2] = vpmulld
;               c[3] = vpsllvd   c[4] = vpsravd
; Clobbers:     eax, ecx, edx, ymm0-ymm6
; Alignment:    None required. Unaligned moves (vmovdqu) are used so that
;               a caller passing a pointer that is not 32-byte aligned does
;               not take a general-protection fault.
;
; Requires:     AVX2

; Function-prefixed macro names are used on purpose: single-letter macros
; such as 'a' would rewrite that token everywhere later in the file.
%define I32_ARG_A   [ebp+8]
%define I32_ARG_B   [ebp+12]
%define I32_ARG_C   [ebp+16]
%define YMM_BYTES   32                  ;size of one YmmVal result slot

AvxPiI32:
    push    ebp
    mov     ebp,esp

; Load argument values
    mov     eax,I32_ARG_A               ;eax = ptr to a
    mov     ecx,I32_ARG_B               ;ecx = ptr to b
    mov     edx,I32_ARG_C               ;edx = ptr to c

; Load a and b (no alignment requirement)
    vmovdqu ymm0,[eax]                  ;ymm0 = a
    vmovdqu ymm1,[ecx]                  ;ymm1 = b

; Perform packed arithmetic operations on the eight doubleword lanes
    vphaddd ymm2,ymm0,ymm1              ;horizontal add of adjacent pairs
    vphsubd ymm3,ymm0,ymm1              ;horizontal sub of adjacent pairs
    vpmulld ymm4,ymm0,ymm1              ;signed mul (low 32 bits)
    vpsllvd ymm5,ymm0,ymm1              ;a << b, logical, count per lane
    vpsravd ymm6,ymm0,ymm1              ;a >> b, arithmetic, count per lane

; Save results, one YmmVal slot per instruction
    vmovdqu [edx+0*YMM_BYTES],ymm2      ;c[0] = vphaddd result
    vmovdqu [edx+1*YMM_BYTES],ymm3      ;c[1] = vphsubd result
    vmovdqu [edx+2*YMM_BYTES],ymm4      ;c[2] = vpmulld result
    vmovdqu [edx+3*YMM_BYTES],ymm5      ;c[3] = vpsllvd result
    vmovdqu [edx+4*YMM_BYTES],ymm6      ;c[4] = vpsravd result

; Clear the upper ymm halves before returning to compiler-generated code
    vzeroupper
    pop     ebp
    ret
build
# Compile the C++ driver to a 32-bit object file (C++11 mode).
g++ -c -m32 main.cpp -o main.o -std=c++11
# Assemble the NASM source into a 32-bit ELF object file.
nasm -f elf32 -o avxpackedintegerarithmetic.o avxpackedintegerarithmetic.asm
# Link both objects into the 32-bit executable. ../../commonfiles/ymmval.o is
# not built here and must already exist.
g++ -m32 -o avxpackedintegerarithmetic avxpackedintegerarithmetic.o main.o ../../commonfiles/ymmval.o