main.cpp
#include <stdio.h>
#include "../../commonfiles/ymmval.h"
#include <math.h>
extern "C" void AvxPermuteInt32(YmmVal* des, YmmVal* src, YmmVal* ind);
extern "C" void AvxPermuteFloat(YmmVal* des, YmmVal* src, YmmVal* ind);
extern "C" void AvxPermuteFloatIl(YmmVal* des, YmmVal* src, YmmVal* ind);
// Exercises AvxPermuteInt32(): loads eight 32-bit integers into src and eight
// element selectors into ind, runs the assembly routine, then prints des, ind
// and src side by side for every element position.
void AvxPermuteInt32Cpp(void)
{
    enum { kCount = 8 };
    static const int kSrcValues[kCount] = { 10, 20, 30, 40, 50, 60, 70, 80 };
    static const int kSelectors[kCount] = {  3,  7,  0,  4,  6,  6,  1,  2 };

    // 32-byte alignment: the assembly routine uses aligned 256-bit moves
    // (vmovdqa) on these operands.
    __attribute__((aligned(32))) YmmVal des, src, ind;

    for (int k = 0; k < kCount; k++)
    {
        src.i32[k] = kSrcValues[k];
        ind.i32[k] = kSelectors[k];
    }

    AvxPermuteInt32(&des, &src, &ind);

    printf("\nResults for AvxPermuteInt32()\n");

    // One output row per element: result, selector, original value.
    for (int k = 0; k < kCount; k++)
    {
        printf("des[%d]: %5d ", k, des.i32[k]);
        printf("ind[%d]: %5d ", k, ind.i32[k]);
        printf("src[%d]: %5d ", k, src.i32[k]);
        printf("\n");
    }
}
// Exercises AvxPermuteFloat(): loads eight single-precision values into src
// and eight element selectors into ind, runs the assembly routine, then
// prints des, ind and src side by side for every element position.
void AvxPermuteFloatCpp(void)
{
    enum { kCount = 8 };
    static const float kSrcValues[kCount] =
        { 800.0f, 700.0f, 600.0f, 500.0f, 400.0f, 300.0f, 200.0f, 100.0f };

    // Each selector names one of the eight src elements, so the values in
    // ind must be between 0 and 7.
    static const int kSelectors[kCount] = { 3, 7, 0, 4, 6, 6, 1, 2 };

    // 32-byte alignment: the assembly routine uses aligned 256-bit moves
    // (vmovdqa / vmovaps) on these operands.
    __attribute__((aligned(32))) YmmVal des, src, ind;

    for (int k = 0; k < kCount; k++)
    {
        src.r32[k] = kSrcValues[k];
        ind.i32[k] = kSelectors[k];
    }

    AvxPermuteFloat(&des, &src, &ind);

    printf("\nResults for AvxPermuteFloat()\n");

    // One output row per element: result, selector, original value.
    for (int k = 0; k < kCount; k++)
    {
        printf("des[%d]: %8.1f ", k, des.r32[k]);
        printf("ind[%d]: %5d ", k, ind.i32[k]);
        printf("src[%d]: %8.1f ", k, src.r32[k]);
        printf("\n");
    }
}
// Exercises AvxPermuteFloatIl(): fills src with the square roots of
// 10, 20, ..., 80 and ind with per-lane selectors, runs the assembly routine,
// then prints des, ind and src grouped by 128-bit lane.
void AvxPermuteFloatIlCpp(void)
{
    enum { kCount = 8, kLaneSize = 4 };

    // Elements 0-3 belong to the lower lane, elements 4-7 to the upper lane.
    static const float kRadicands[kCount] =
        { 10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f };

    // Selectors are relative to the element's own lane (values 0-3 here).
    static const int kSelectors[kCount] = { 3, 2, 2, 0, 1, 3, 3, 2 };

    // 32-byte alignment: the assembly routine uses aligned 256-bit moves
    // on these operands.
    __attribute__((aligned(32))) YmmVal des, src, ind;

    for (int k = 0; k < kCount; k++)
    {
        src.r32[k] = sqrt(kRadicands[k]);
        ind.i32[k] = kSelectors[k];
    }

    AvxPermuteFloatIl(&des, &src, &ind);

    printf("\nResults for AvxPermuteFloatIl()\n");

    for (int k = 0; k < kCount; k++)
    {
        // Emit a heading in front of the first element of each lane.
        switch (k)
        {
        case 0:
            printf("Lower lane\n");
            break;
        case kLaneSize:
            printf("Upper lane\n");
            break;
        default:
            break;
        }

        printf("des[%d]: %8.4f ", k, des.r32[k]);
        printf("ind[%d]: %5d ", k, ind.i32[k]);
        printf("src[%d]: %8.4f ", k, src.r32[k]);
        printf("\n");
    }
}
// Program entry point: runs the three permute demonstrations in order.
// The command-line arguments are not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    // Demonstrations, listed in the order they are executed.
    void (*const demos[])(void) =
    {
        AvxPermuteInt32Cpp,
        AvxPermuteFloatCpp,
        AvxPermuteFloatIlCpp
    };
    const int demoCount = (int)(sizeof(demos) / sizeof(demos[0]));

    for (int k = 0; k < demoCount; k++)
    {
        demos[k]();
    }

    return 0;
}
avxpermute.asm
; Name: avxpermute.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o avxpermute.o avxpermute.asm
; g++ -m32 -o avxpermute avxpermute.o main.o
;
; Source: Modern x86 Assembly Language Programming p. 458
global AvxPermuteInt32
global AvxPermuteFloat
global AvxPermuteFloatIl
section .text
; extern "C" void AvxPermuteInt32(YmmVal* des, YmmVal* src, YmmVal* ind);
;
; Purpose:  Permutes the eight doublewords at *src with vpermd, using the
;           doubleword selectors at *ind, and writes the 256-bit result
;           to *des.
;
; Notes:    The three pointer arguments are read from the stack relative
;           to ebp. ind and des are accessed with vmovdqa and therefore
;           must be 32-byte aligned.
;
; Requires: AVX2
%define des [ebp+8]
%define src [ebp+12]
%define ind [ebp+16]
AvxPermuteInt32:
        push ebp
        mov ebp,esp

; Fetch the pointer arguments
        mov edx,ind                     ;edx = ptr to ind
        mov eax,src                     ;eax = ptr to src
        mov ecx,des                     ;ecx = ptr to des

; Doubleword permutation: the first source operand holds the selectors,
; the second (memory) operand holds the data being permuted.
        vmovdqa ymm1,[edx]              ;ymm1 = selectors
        vpermd ymm0,ymm1,[eax]          ;ymm0 = permuted src
        vmovdqa [ecx],ymm0              ;store result to des

; Clear the upper halves of the ymm registers before returning
        vzeroupper
        pop ebp
        ret
; extern "C" void AvxPermuteFloat(YmmVal* des, YmmVal* src, YmmVal* ind);
;
; Purpose:  Permutes the eight single-precision values at *src with
;           vpermps, using the doubleword selectors at *ind, and writes
;           the 256-bit result to *des.
;
; Notes:    The three pointer arguments are read from the stack relative
;           to ebp. ind is loaded with vmovdqa and des is stored with
;           vmovaps, so both must be 32-byte aligned.
;
; Requires: AVX2
%define des [ebp+8]
%define src [ebp+12]
%define ind [ebp+16]
AvxPermuteFloat:
        push ebp
        mov ebp,esp

; Fetch the pointer arguments
        mov edx,ind                     ;edx = ptr to ind
        mov eax,src                     ;eax = ptr to src
        mov ecx,des                     ;ecx = ptr to des

; Single-precision permutation: the first source operand holds the
; selectors, the second (memory) operand holds the data being permuted.
        vmovdqa ymm1,[edx]              ;ymm1 = selectors
        vpermps ymm0,ymm1,[eax]         ;ymm0 = permuted src
        vmovaps [ecx],ymm0              ;store result to des

; Clear the upper halves of the ymm registers before returning
        vzeroupper
        pop ebp
        ret
; extern "C" void AvxPermuteFloatIl(YmmVal* des, YmmVal* src, YmmVal* ind);
;
; Description:  The following function demonstrates use of the
;               vpermilps instruction, which permutes single-precision
;               values within each 128-bit lane of a 256-bit operand.
;
; Arguments:    des = ptr to the 256-bit result
;               src = ptr to the eight single-precision values to permute
;               ind = ptr to the eight doubleword selectors
;
; Notes:        src is loaded and des is stored with vmovaps, so both
;               must be 32-byte aligned.
;
; Requires:     AVX (vpermilps, vmovaps and vzeroupper are all AVX
;               instructions; AVX2 is not needed by this function)
%define des [ebp+8]
%define src [ebp+12]
%define ind [ebp+16]
AvxPermuteFloatIl:
        push ebp
        mov ebp,esp

; Load argument values
        mov eax,des                     ;eax = ptr to des
        mov ecx,src                     ;ecx = ptr to src
        mov edx,ind                     ;edx = ptr to ind

; Perform in-lane SPFP permutation. Note that the second source
; operand of vpermilps specifies the indices. The data is loaded with
; vmovaps (not vmovdqa) because it is floating-point, matching the
; vmovaps store below.
        vmovaps ymm1,[ecx]              ;ymm1 = src
        vpermilps ymm0,ymm1,[edx]       ;ymm0 = in-lane permuted src
        vmovaps [eax],ymm0              ;save result

        vzeroupper
        pop ebp
        ret
build
g++ -c -m32 main.cpp -o main.o -std=c++11
nasm -f elf32 -o avxpermute.o avxpermute.asm
g++ -m32 -o avxpermute avxpermute.o main.o ../../commonfiles/ymmval.o