main.cpp
#include <stdio.h>
#include "../../commonfiles/ymmval.h"

// Both routines are implemented in avxpackedintegerunpack.asm. That code
// reads a and b and writes c with vmovdqa, so every pointer passed here must
// reference 32-byte aligned storage.

// Stores the vpunpckldq result of *a and *b in c[0] and the vpunpckhdq
// result in c[1].
extern "C" void AvxPiUnpackDQ(YmmVal* a, YmmVal* b, YmmVal c[2]);
// Stores the vpackssdw (doubleword-to-word pack with signed saturation)
// result of *a and *b in *c.
extern "C" void AvxPiPackDW(YmmVal* a, YmmVal* b, YmmVal* c);

// Demonstrates AvxPiUnpackDQ(): fills two operands with easily recognizable
// doubleword patterns, runs the assembly routine and prints the inputs next
// to both unpack results, one doubleword index per row.
void AvxPiUnpackDQCpp(void)
{
    __attribute__((aligned(32)))  YmmVal a;
    __attribute__((aligned(32)))  YmmVal b;
    __attribute__((aligned(32)))  YmmVal c[2];

    // a receives 0x0000, 0x1111, ... 0x7777 and b continues the sequence
    // with 0x8888, 0x9999, ... 0xffff.
    for (int k = 0; k < 8; k++)
    {
        a.i32[k] = 0x1111 * k;
        b.i32[k] = 0x1111 * (k + 8);
    }

    AvxPiUnpackDQ(&a, &b, c);

    // Title, column captions and separator line.
    printf("\nResults for AvxPiUnpackDQ()\n\n"
           "i   a           b           vpunpckldq  vpunpckhdq\n"
           "--------------------------------------------------\n");

    // One row per doubleword: index, a, b, c[0] (low unpack), c[1] (high unpack).
    for (int row = 0; row < 8; row++)
    {
        printf("%-2d  "
               "0x%04X      "
               "0x%04X      "
               "0x%04X      "
               "0x%04X      "
               "\n",
               row, a.u32[row], b.u32[row], c[0].u32[row], c[1].u32[row]);
    }
}

// Demonstrates AvxPiPackDW(): loads two operands with signed doublewords,
// runs the assembly routine and prints the inputs as doublewords and the
// packed result as words.
void AvxPiPackDWCpp(void)
{
    // Several inputs lie outside the 16-bit signed range (for example
    // 300000, -200000, 32768 and -32769), so the pack has to saturate them.
    static const int a_init[8] = { 10, -200000, 300000, -4000, 9000, 80000, 200, -32769 };
    static const int b_init[8] = { 32768, 6500, 42000, -68000, 25000, 500000, -7000, 12500 };

    char text[256];
    __attribute__((aligned(32)))  YmmVal a;
    __attribute__((aligned(32)))  YmmVal b;
    __attribute__((aligned(32)))  YmmVal c;

    for (int k = 0; k < 8; k++)
    {
        a.i32[k] = a_init[k];
        b.i32[k] = b_init[k];
    }

    AvxPiPackDW(&a, &b, &c);
    printf("\nResults for AvxPiPackDW()\n\n");

    // The same scratch buffer is handed to every ToString call, so each
    // formatted string is printed before the next one is produced.
    // NOTE(review): the final bool argument presumably selects the upper
    // half of the value -- confirm against ymmval.h.
    printf("a lo %s\n", a.ToString_i32(text, sizeof(text), false));
    printf("a hi %s\n\n", a.ToString_i32(text, sizeof(text), true));

    printf("b lo %s\n", b.ToString_i32(text, sizeof(text), false));
    printf("b hi %s\n\n", b.ToString_i32(text, sizeof(text), true));

    printf("c lo %s\n", c.ToString_i16(text, sizeof(text), false));
    printf("c hi %s\n", c.ToString_i16(text, sizeof(text), true));
}

// Program entry point: runs the unpack demonstration followed by the pack
// demonstration. The command-line arguments are not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    AvxPiUnpackDQCpp();
    AvxPiPackDWCpp();

    return 0;
}
avxpackedintegerunpack.asm
; Name:     avxpackedintegerunpack.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o -std=c++11
;           nasm -f elf32 -o avxpackedintegerunpack.o avxpackedintegerunpack.asm
;           g++ -m32 -o avxpackedintegerunpack avxpackedintegerunpack.o main.o ../../commonfiles/ymmval.o
;
; Source:   Modern x86 Assembly Language Programming p. 412

global AvxPiUnpackDQ
global AvxPiPackDW

section .text

; extern "C" void AvxPiUnpackDQ(YmmVal* a, YmmVal* b, YmmVal c[2]);
;
; Description:  The following function demonstrates use of the
;               vpunpckldq and vpunpckhdq instructions using
;               256-bit wide operands.
;
; Requires:     AVX2

; Stack-frame locations of the three pointer arguments (valid once ebp
; has been set up by the prolog below).
%define src_a   [ebp+8]
%define src_b   [ebp+12]
%define dst_c   [ebp+16]

AvxPiUnpackDQ:
; Prolog: establish the frame used by the argument macros
    push    ebp
    mov     ebp,esp

; Fetch the pointer arguments and load both 256-bit sources.
; vmovdqa is the aligned move, so a, b and c must be 32-byte aligned.
    mov     edx,dst_c                       ;edx -> c[0]
    mov     eax,src_a                       ;eax -> a
    mov     ecx,src_b                       ;ecx -> b
    vmovdqa ymm0,[eax]                      ;ymm0 = *a
    vmovdqa ymm1,[ecx]                      ;ymm1 = *b

; Interleave the doublewords of a and b; each result is stored as soon
; as it has been produced
    vpunpckldq ymm2,ymm0,ymm1               ;low doublewords interleaved
    vmovdqa    [edx],ymm2                   ;c[0] = low result
    vpunpckhdq ymm3,ymm0,ymm1               ;high doublewords interleaved
    vmovdqa    [edx+32],ymm3                ;c[1] = high result

; Epilog: clear the upper ymm halves before returning to the caller
    vzeroupper
    pop     ebp
    ret

; extern "C" void AvxPiPackDW(YmmVal* a, YmmVal* b, YmmVal* c);
;
; Description:  The following function demonstrates use of the
;               vpackssdw instruction using 256-bit wide operands.
;
; Requires:     AVX2

; Stack-frame locations of the three pointer arguments (valid once ebp
; has been set up by the prolog below).
%define src_a   [ebp+8]
%define src_b   [ebp+12]
%define dst_c   [ebp+16]

AvxPiPackDW:
; Prolog: establish the frame used by the argument macros
    push    ebp
    mov     ebp,esp

; Fetch the pointer arguments and load both 256-bit sources.
; vmovdqa is the aligned move, so a, b and c must be 32-byte aligned.
    mov     edx,dst_c                       ;edx -> c
    mov     eax,src_a                       ;eax -> a
    mov     ecx,src_b                       ;ecx -> b
    vmovdqa ymm0,[eax]                      ;ymm0 = *a
    vmovdqa ymm1,[ecx]                      ;ymm1 = *b

; Narrow the signed doublewords of a and b to signed words; values that
; do not fit in 16 bits are saturated
    vpackssdw ymm2,ymm0,ymm1                ;ymm2 = packed words
    vmovdqa   [edx],ymm2                    ;*c = packed result

; Epilog: clear the upper ymm halves before returning to the caller
    vzeroupper
    pop     ebp
    ret
build
g++ -c -m32 main.cpp -o main.o -std=c++11
nasm -f elf32 -o avxpackedintegerunpack.o avxpackedintegerunpack.asm
g++ -m32 -o avxpackedintegerunpack avxpackedintegerunpack.o main.o ../../commonfiles/ymmval.o