main.cpp
#include <stdio.h>
#include "../../commonfiles/xmmval.h"
#include "../../commonfiles/ymmval.h"
#include <stdlib.h>

// Assembly routines implemented in avxgather.asm (C linkage, AVX2 required).
//
// AvxGatherFloat: for each of the eight lanes, loads x[indices.i32[lane]] into
// des when the lane's mask entry has its most significant bit set; other lanes
// of des are left as they were. des, indices and mask are accessed with
// aligned 256-bit moves and are all written back; the mask comes back zeroed.
extern "C" void AvxGatherFloat(YmmVal* des, YmmVal* indices, YmmVal* mask, const float* x);
// AvxGatherI64: same idea for four 64-bit integer lanes. The four 32-bit
// indices are read from (and written back to) an XmmVal with aligned 128-bit
// moves; des and mask use aligned 256-bit moves.
extern "C" void AvxGatherI64(YmmVal* des, XmmVal* indices, YmmVal* mask, const Int64* x);

// Prints one row per element for the single-precision gather test.
//
// msg      heading printed on its own line before the rows
// des      destination vector; the eight r32 elements are printed as floats
// indices  gather indices; the eight i32 elements are printed as decimals
// mask     gather control mask; the eight i32 elements are printed as
//          8-digit hexadecimal values
void AvxGatherFloatPrint(const char* msg, YmmVal& des, YmmVal& indices, YmmVal& mask)
{
    const int num_elements = 8;

    printf("\n%s\n", msg);

    for (int i = 0; i < num_elements; i++)
    {
        printf("ElementID: %d  ", i);
        printf("des: %8.1f  ", des.r32[i]);
        printf("indices: %4d  ", indices.i32[i]);
        // %X consumes an unsigned int; convert the signed element explicitly
        // so the argument type matches the conversion specifier.
        printf("mask: 0x%08X\n", (unsigned int)mask.i32[i]);
    }
}

// Prints one row per element for the 64-bit integer gather test.
//
// msg      heading printed on its own line before the rows
// des      destination vector; the four i64 elements are printed as decimals
// indices  gather indices; the four i32 elements are printed as decimals
// mask     gather control mask; the four i64 elements are printed as
//          16-digit hexadecimal values
void AvxGatherI64Print(const char* msg, YmmVal& des, XmmVal& indices, YmmVal& mask)
{
    const int num_elements = 4;

    printf("\n%s\n", msg);

    for (int i = 0; i < num_elements; i++)
    {
        printf("ElementID: %d  ", i);
        // The casts pin the argument types to exactly what %lld / %llX
        // consume, independent of how Int64 is typedef'd.
        printf("des: %8lld  ", (long long)des.i64[i]);
        printf("indices: %4d  ", indices.i32[i]);
        printf("mask: 0x%016llX\n", (unsigned long long)mask.i64[i]);
    }
}

// Test driver for AvxGatherFloat().
//
// Builds a 15-element float array of pseudo-random values (fixed seed), sets
// every destination element to -1.0, picks eight indices and a per-element
// control mask, then prints the operands before and after the assembly call.
void AvxGatherFloatCpp(void)
{
    const int merge_no = 0;
    const int merge_yes = 0x80000000;
    const int n = 15;
    const int num_elements = 8;

    // Per-element gather index and control-mask setting.
    const int element_index[num_elements] = { 2, 1, 6, 5, 4, 13, 11, 9 };
    const int element_mask[num_elements] =
    {
        merge_yes, merge_yes, merge_no, merge_yes,
        merge_yes, merge_no, merge_yes, merge_yes
    };

    float x[n];
    __attribute__((aligned(32))) YmmVal des;
    __attribute__((aligned(32))) YmmVal indices;
    __attribute__((aligned(32))) YmmVal mask;

    // Fill the source array with reproducible values in [0, 999]
    srand(22);
    for (int k = 0; k < n; k++)
    {
        x[k] = (float)(rand() % 1000);
    }

    // Set up destination, indices and mask one element at a time
    for (int e = 0; e < num_elements; e++)
    {
        des.r32[e] = -1.0f;
        indices.i32[e] = element_index[e];
        mask.i32[e] = element_mask[e];
    }

    printf("\nResults for AvxGatherFloat()\n");
    printf("Test array\n");
    for (int k = 0; k < n; k++)
    {
        printf("x[%02d]: %6.1f\n", k, x[k]);
    }
    printf("\n");

    const char* before_msg = "Values BEFORE call to AvxGatherFloat()";
    const char* after_msg = "Values AFTER call to AvxGatherFloat()";

    AvxGatherFloatPrint(before_msg, des, indices, mask);
    AvxGatherFloat(&des, &indices, &mask, x);
    AvxGatherFloatPrint(after_msg, des, indices, mask);
}

void AvxGatherI64Cpp(void)
{
    const Int64 merge_no = 0;
    const Int64 merge_yes = 0x8000000000000000LL;
    const int n = 15;
    Int64 x[n];
    __attribute__((aligned(32))) YmmVal des;
    __attribute__((aligned(16))) XmmVal indices;
    __attribute__((aligned(32))) YmmVal mask;

    // Initialize the test array
    srand(36);
    for (int i = 0; i < n; i++)
        x[i] = (Int64)(rand() % 1000);

    // Load des with initial values
    for (int i = 0; i < 4; i++)
        des.i64[i] = -1;

    // Initialize the indices and mask elements
    indices.i32[0] = 2;
    indices.i32[1] = 7;
    indices.i32[2] = 9;
    indices.i32[3] = 12;

    mask.i64[0] = merge_yes;
    mask.i64[1] = merge_yes;
    mask.i64[2] = merge_no;
    mask.i64[3] = merge_yes;

    printf("\nResults for AvxGatherI64()\n");
    printf("Test array\n");
    for (int i = 0; i < n; i++)
        printf("x[%02d]: %8lld\n", i, x[i]);
    printf("\n");

    const char* s1 = "Values BEFORE call to AvxGatherI64()";
    const char* s2 = "Values AFTER call to AvxGatherI64()";

    AvxGatherI64Print(s1, des, indices, mask);
    AvxGatherI64(&des, &indices, &mask, x);
    AvxGatherI64Print(s2, des, indices, mask);
}

// Program entry point: runs the float gather demo followed by the 64-bit
// integer gather demo. Always returns 0.
int main(int argc, char* argv[])
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    AvxGatherFloatCpp();
    AvxGatherI64Cpp();

    return 0;
}
avxgather.asm
; Name:     avxgather.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o avxgather.o avxgather.asm
;           g++ -m32 -o avxgather avxgather.o main.o
;
; Source:   Modern x86 Assembly Language Programming p. 463

; Export both routines so they can be linked against the extern "C"
; declarations in main.cpp.
global AvxGatherFloat
global AvxGatherI64

; Everything that follows is assembled into the text (code) section.
section .text

; extern "C" void AvxGatherFloat(YmmVal* des, YmmVal* indices, YmmVal* mask, const float* x);
;
; Description:  The following function demonstrates use of the
;               vgatherdps instruction.
;
;               des     in/out  eight floats; lanes whose mask entry has
;                               its most significant bit set are replaced
;                               with x[index], other lanes are kept
;               indices in/out  eight signed 32-bit indices into x
;                               (written back unchanged)
;               mask    in/out  eight 32-bit control values; written back
;                               as all zeros after the gather
;               x       in      base address of the float array
;
;               des, indices and mask are accessed with aligned 256-bit
;               moves (vmovaps/vmovdqa).
;
; Requires:     AVX2

; Stack-frame offsets of the four pointer arguments relative to ebp
; (after the prolog: [ebp] = saved ebp, [ebp+4] = return address).
%define des     [ebp+8]
%define indices [ebp+12]
%define mask    [ebp+16]
%define x       [ebp+20]

AvxGatherFloat:
    push    ebp
    mov     ebp,esp
    push    ebx                                ;ebx is used below; restored before ret

; Load argument values. The contents of des are loaded into ymm0
; prior to execution of the vgatherdps instruction in order to
; demonstrate the conditional effects of the control mask.
    mov        eax,des                         ;eax = ptr to des
    mov        ebx,indices                     ;ebx = ptr to indices
    mov        ecx,mask                        ;ecx = ptr to mask
    mov        edx,x                           ;edx = ptr to x
    vmovaps    ymm0,[eax]                      ;ymm0 = des (initial values)
    vmovdqa    ymm1,[ebx]                      ;ymm1 = indices
    vmovdqa    ymm2,[ecx]                      ;ymm2 = mask

; Perform the gather operation and save the results.
; Each index is scaled by 4 (the size of a float) and added to the
; base address in edx.
    vgatherdps ymm0,[edx+ymm1*4],ymm2          ;ymm0 = gathered elements
    vmovaps    [eax],ymm0                      ;save des
    vmovdqa    [ebx],ymm1                      ;save indices (unchanged)
    vmovdqa    [ecx],ymm2                      ;save mask (all zeros)

; Clear the upper halves of the ymm registers before returning to
; the caller, then restore the saved registers.
    vzeroupper
    pop     ebx
    pop     ebp
    ret

; extern "C" void AvxGatherI64(YmmVal* des, XmmVal* indices, YmmVal* mask, const Int64* x);
;
; Description:  The following function demonstrates use of the vpgatherdq
;               instruction.
;
;               des     in/out  four 64-bit integers; lanes whose mask
;                               entry has its most significant bit set are
;                               replaced with x[index], other lanes are kept
;               indices in/out  four signed 32-bit indices into x, held in
;                               a 128-bit XmmVal (written back unchanged)
;               mask    in/out  four 64-bit control values; written back
;                               as all zeros after the gather
;               x       in      base address of the Int64 array
;
;               des and mask are accessed with aligned 256-bit moves,
;               indices with aligned 128-bit moves (vmovdqa).
;
; Requires:     AVX2

; Stack-frame offsets of the four pointer arguments relative to ebp
; (same layout as AvxGatherFloat; the macros are simply redefined).
%define des     [ebp+8]
%define indices [ebp+12]
%define mask    [ebp+16]
%define x       [ebp+20]

AvxGatherI64:
    push    ebp
    mov     ebp,esp
    push    ebx                                ;ebx is used below; restored before ret

; Load argument values. Note that the indices are loaded
; into register XMM1.
    mov        eax,des                         ;eax = ptr to des
    mov        ebx,indices                     ;ebx = ptr to indices
    mov        ecx,mask                        ;ecx = ptr to mask
    mov        edx,x                           ;edx = ptr to x
    vmovdqa    ymm0,[eax]                      ;ymm0 = des (initial values)
    vmovdqa    xmm1,[ebx]                      ;xmm1 = indices
    vmovdqa    ymm2,[ecx]                      ;ymm2 = mask

; Perform the gather and save the results.  Note that the
; scale factor matches the size of the gathered elements.
    vpgatherdq ymm0,[edx+xmm1*8],ymm2          ;ymm0 = gathered elements
    vmovdqa    [eax],ymm0                      ;save des
    vmovdqa    [ebx],xmm1                      ;save indices (unchanged)
    vmovdqa    [ecx],ymm2                      ;save mask (all zeros)

; Clear the upper halves of the ymm registers before returning to
; the caller, then restore the saved registers.
    vzeroupper
    pop     ebx
    pop     ebp
    ret
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o avxgather.o avxgather.asm
g++ -m32 -o avxgather avxgather.o main.o ../../commonfiles/xmmval.o ../../commonfiles/ymmval.o