main.cpp
#include <stdio.h>
#include "../../commonfiles/xmmval.h"
#include "../../commonfiles/ymmval.h"
#include <stdlib.h>
// Gather routines implemented in avxgather.asm. extern "C" gives them C
// linkage so the unmangled assembly labels resolve at link time.
//
// AvxGatherFloat: loads *des, *indices and *mask into YMM registers, executes
// vgatherdps using x as the base address, then stores all three registers
// back through the same pointers (so *mask is overwritten as well).
extern "C" void AvxGatherFloat(YmmVal* des, YmmVal* indices, YmmVal* mask, const float* x);
// AvxGatherI64: same pattern using vpgatherdq; the indices come from a
// 128-bit XmmVal while des and mask are 256-bit YmmVal objects.
extern "C" void AvxGatherI64(YmmVal* des, XmmVal* indices, YmmVal* mask, const Int64* x);
// Print the eight 32-bit elements of a float gather test case as a table.
//
// msg     - heading, printed on its own line with a blank line before it
// des     - destination values, read as floats through r32[]
// indices - gather indices, read as 32-bit integers through i32[]
// mask    - per-element control mask, read through i32[] and shown in hex
//
// None of the arguments are modified.
void AvxGatherFloatPrint(const char* msg, YmmVal& des, YmmVal& indices, YmmVal& mask)
{
    printf("\n%s\n", msg);

    for (int i = 0; i < 8; i++)
    {
        printf("ElementID: %d ", i);
        printf("des: %8.1f ", des.r32[i]);

        // The casts make every argument's type match its conversion
        // specifier exactly: %d takes int, %X takes unsigned int. Passing
        // the signed mask element straight to %X is a type mismatch.
        printf("indices: %4d ", static_cast<int>(indices.i32[i]));
        printf("mask: 0x%08X\n", static_cast<unsigned int>(mask.i32[i]));
    }
}
// Print the four elements of a 64-bit integer gather test case as a table.
//
// msg     - heading, printed on its own line with a blank line before it
// des     - destination values, read as 64-bit integers through i64[]
// indices - gather indices, read as 32-bit integers through i32[]
// mask    - per-element control mask, read through i64[] and shown in hex
//
// None of the arguments are modified.
void AvxGatherI64Print(const char* msg, YmmVal& des, XmmVal& indices, YmmVal& mask)
{
    printf("\n%s\n", msg);

    for (int i = 0; i < 4; i++)
    {
        printf("ElementID: %d ", i);

        // Explicit casts keep the arguments in step with the conversion
        // specifiers regardless of how Int64 is typedef'd: %lld takes
        // long long, %d takes int and %llX takes unsigned long long.
        printf("des: %8lld ", static_cast<long long>(des.i64[i]));
        printf("indices: %4d ", static_cast<int>(indices.i32[i]));
        printf("mask: 0x%016llX\n", static_cast<unsigned long long>(mask.i64[i]));
    }
}
// Driver for AvxGatherFloat(): builds a 15-element float array, sets up the
// destination, index and mask operands, and prints the operands before and
// after the assembly routine runs.
void AvxGatherFloatCpp(void)
{
    const int n = 15;
    const int num_lanes = 8;

    // Mask values: only the most significant bit of each element is set or
    // cleared. Per the instruction's documentation that bit selects whether
    // the element is gathered.
    const int merge_no = 0;
    const int merge_yes = 0x80000000;

    // Per-element gather index and mask selection. Every index is < n.
    const int lane_index[num_lanes] = { 2, 1, 6, 5, 4, 13, 11, 9 };
    const int lane_mask[num_lanes] = {
        merge_yes, merge_yes, merge_no, merge_yes,
        merge_yes, merge_no, merge_yes, merge_yes
    };

    float x[n];

    // The assembly routine uses aligned 256-bit loads and stores, so each
    // operand must sit on a 32-byte boundary.
    __attribute__((aligned(32))) YmmVal des;
    __attribute__((aligned(32))) YmmVal indices;
    __attribute__((aligned(32))) YmmVal mask;

    // Fixed seed so the test array is the same on every run.
    srand(22);
    for (int k = 0; k < n; k++)
        x[k] = (float)(rand() % 1000);

    // des starts at -1.0 so unchanged elements are easy to spot in the output.
    for (int lane = 0; lane < num_lanes; lane++)
    {
        des.r32[lane] = -1.0f;
        indices.i32[lane] = lane_index[lane];
        mask.i32[lane] = lane_mask[lane];
    }

    printf("\nResults for AvxGatherFloat()\n");
    printf("Test array\n");
    for (int k = 0; k < n; k++)
        printf("x[%02d]: %6.1f\n", k, x[k]);
    printf("\n");

    AvxGatherFloatPrint("Values BEFORE call to AvxGatherFloat()", des, indices, mask);
    AvxGatherFloat(&des, &indices, &mask, x);
    AvxGatherFloatPrint("Values AFTER call to AvxGatherFloat()", des, indices, mask);
}
void AvxGatherI64Cpp(void)
{
const Int64 merge_no = 0;
const Int64 merge_yes = 0x8000000000000000LL;
const int n = 15;
Int64 x[n];
__attribute__((aligned(32))) YmmVal des;
__attribute__((aligned(16))) XmmVal indices;
__attribute__((aligned(32))) YmmVal mask;
// Initialize the test array
srand(36);
for (int i = 0; i < n; i++)
x[i] = (Int64)(rand() % 1000);
// Load des with initial values
for (int i = 0; i < 4; i++)
des.i64[i] = -1;
// Initialize the indices and mask elements
indices.i32[0] = 2;
indices.i32[1] = 7;
indices.i32[2] = 9;
indices.i32[3] = 12;
mask.i64[0] = merge_yes;
mask.i64[1] = merge_yes;
mask.i64[2] = merge_no;
mask.i64[3] = merge_yes;
printf("\nResults for AvxGatherI64()\n");
printf("Test array\n");
for (int i = 0; i < n; i++)
printf("x[%02d]: %8lld\n", i, x[i]);
printf("\n");
const char* s1 = "Values BEFORE call to AvxGatherI64()";
const char* s2 = "Values AFTER call to AvxGatherI64()";
AvxGatherI64Print(s1, des, indices, mask);
AvxGatherI64(&des, &indices, &mask, x);
AvxGatherI64Print(s2, des, indices, mask);
}
// Program entry point: runs the float gather demo, then the 64-bit integer
// gather demo. Command-line arguments are not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    AvxGatherFloatCpp();
    AvxGatherI64Cpp();

    return 0;
}
avxgather.asm
; Name: avxgather.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o avxgather.o avxgather.asm
; g++ -m32 -o avxgather avxgather.o main.o
;
; Source: Modern x86 Assembly Language Programming p. 463
global AvxGatherFloat
global AvxGatherI64
section .text
; extern "C" void AvxGatherFloat(YmmVal* des, YmmVal* indices, YmmVal* mask, const float* x);
;
; Description:  Demonstrates the vgatherdps instruction. The three vector
;               operands are loaded from memory, the gather is executed
;               with x as the base address and a scale of 4 (the size of
;               a float), and all three registers are then written back
;               through the same pointers.
;
; Notes:        des, indices and mask are accessed with aligned 256-bit
;               moves, so each must be 32-byte aligned.
;
; Requires:     AVX2

%define des     [ebp+8]
%define indices [ebp+12]
%define mask    [ebp+16]
%define x       [ebp+20]

AvxGatherFloat:
        push ebp
        mov ebp,esp
        push esi                            ;preserve esi for the caller

; Fetch the four pointer arguments from the stack frame.
        mov edx,x                           ;edx = ptr to x (gather base)
        mov ecx,mask                        ;ecx = ptr to mask
        mov esi,indices                     ;esi = ptr to indices
        mov eax,des                         ;eax = ptr to des

; des is preloaded into the destination register so that the effect of
; the control mask on each element is visible in the result.
        vmovaps ymm4,[eax]                  ;ymm4 = des (initial values)
        vmovdqa ymm5,[esi]                  ;ymm5 = indices
        vmovdqa ymm6,[ecx]                  ;ymm6 = mask

; Gather: element address = edx + index * 4.
        vgatherdps ymm4,[edx+ymm5*4],ymm6   ;ymm4 = gathered elements

; Write the registers back in the order des, indices, mask.
        vmovaps [eax],ymm4                  ;save des
        vmovdqa [esi],ymm5                  ;save indices (unchanged)
        vmovdqa [ecx],ymm6                  ;save mask (all zeros)

        vzeroupper
        pop esi
        pop ebp
        ret
; extern "C" void AvxGatherI64(YmmVal* des, XmmVal* indices, YmmVal* mask, const Int64* x);
;
; Description:  Demonstrates the vpgatherdq instruction. Four doubleword
;               indices are taken from a 128-bit register and used to
;               gather four quadwords, so the scale factor is 8 (the size
;               of each gathered element). All three registers are written
;               back through the same pointers afterwards.
;
; Notes:        des and mask are accessed with aligned 256-bit moves
;               (32-byte alignment); indices with an aligned 128-bit move
;               (16-byte alignment).
;
; Requires:     AVX2

%define des     [ebp+8]
%define indices [ebp+12]
%define mask    [ebp+16]
%define x       [ebp+20]

AvxGatherI64:
        push ebp
        mov ebp,esp
        push edi                            ;preserve edi for the caller

; Fetch the four pointer arguments from the stack frame.
        mov edx,x                           ;edx = ptr to x (gather base)
        mov ecx,mask                        ;ecx = ptr to mask
        mov edi,indices                     ;edi = ptr to indices
        mov eax,des                         ;eax = ptr to des

; Load the operands; the indices occupy only an XMM register.
        vmovdqa ymm3,[eax]                  ;ymm3 = des (initial values)
        vmovdqa xmm4,[edi]                  ;xmm4 = indices
        vmovdqa ymm5,[ecx]                  ;ymm5 = mask

; Gather: element address = edx + index * 8.
        vpgatherdq ymm3,[edx+xmm4*8],ymm5   ;ymm3 = gathered elements

; Write the registers back in the order des, indices, mask.
        vmovdqa [eax],ymm3                  ;save des
        vmovdqa [edi],xmm4                  ;save indices (unchanged)
        vmovdqa [ecx],ymm5                  ;save mask (all zeros)

        vzeroupper
        pop edi
        pop ebp
        ret
build
# Compile the C++ driver as 32-bit code (object file only).
g++ -c -m32 main.cpp -o main.o
# Assemble the gather routines into a 32-bit ELF object.
nasm -f elf32 -o avxgather.o avxgather.asm
# Link the driver and the assembly object together with the XmmVal/YmmVal
# objects; ../../commonfiles/xmmval.o and ymmval.o are not built here and
# must already exist.
g++ -m32 -o avxgather avxgather.o main.o ../../commonfiles/xmmval.o ../../commonfiles/ymmval.o