avxfma.h
#pragma once
#include "../../commonfiles/miscdefs.h"
// These functions are defined in AvxFma.cpp
extern void AvxFmaSmooth5Cpp(float* y, const float*x, Uint32 n, const float* sm5_mask);
extern bool AvxFmaInitX(float* x, Uint32 n);
// These functions are defined in AvxFma_.asm
extern "C" void AvxFmaSmooth5a(float* y, const float*x, Uint32 n, const float* sm5_mask);
extern "C" void AvxFmaSmooth5b(float* y, const float*x, Uint32 n, const float* sm5_mask);
extern "C" void AvxFmaSmooth5c(float* y, const float*x, Uint32 n, const float* sm5_mask);
// next function is removed because of windows-only functions. In time I need to figure out
// how to reconstruct this function for g++ on Linux
// These functions are defined in AvxFmaTimed.cpp
//extern void AvxFmaTimed(void);
main.cpp
#include "avxfma.h"
#include <stdio.h>
#include <malloc.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include <stdlib.h>
float* aligned_malloc(int n, int alignment){
void* ptrx;
int rc = posix_memalign(&ptrx,alignment,n*sizeof(float));
return (float*)ptrx;
}
bool AvxFmaInitX(float* x, Uint32 n)
{
const float degtorad = (float)(M_PI / 180.0);
const float t_start = 0;
const float t_step = 0.002f;
const Uint32 m = 3;
const float amp[m] = {1.0f, 0.80f, 1.20f};
const float freq[m] = {5.0f, 10.0f, 15.0f};
const float phase[m] = {0.0f, 45.0f, 90.0f};
float t = t_start;
srand(97);
for (Uint32 i = 0; i < n; i++, t += t_step)
{
float x_total = 0;
for (Uint32 j = 0; j < m; j++)
{
float omega = 2.0f * (float)M_PI * freq[j];
float x_temp1 = amp[j] * sin(omega * t + phase[j] * degtorad);
float noise = (float)((rand() % 300) - 150) / 10.0f;
float x_temp2 = x_temp1 + x_temp1 * noise / 100.0f;
x_total += x_temp2;
}
x[i] = x_total;
}
return true;
}
void AvxFmaSmooth5Cpp(float* y, const float*x, Uint32 n, const float* sm5_mask)
{
for (Uint32 i = 2; i < n - 2; i++)
{
float sum = 0;
sum += x[i - 2] * sm5_mask[0];
sum += x[i - 1] * sm5_mask[1];
sum += x[i + 0] * sm5_mask[2];
sum += x[i + 1] * sm5_mask[3];
sum += x[i + 2] * sm5_mask[4];
y[i] = sum;
}
}
void AvxFma(void)
{
const Uint32 n = 512;
float* x = aligned_malloc(n, 32);
float* y_cpp = aligned_malloc(n, 32);
float* y_a = aligned_malloc(n, 32);
float* y_b = aligned_malloc(n, 32);
float* y_c = aligned_malloc(n, 32);
const float sm5_mask[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };
printf("Results for AvxFma\n");
if (!AvxFmaInitX(x, n))
{
printf("Data initialization failed\n");
return;
}
AvxFmaSmooth5Cpp(y_cpp, x, n, sm5_mask);
AvxFmaSmooth5a(y_a, x, n, sm5_mask);
AvxFmaSmooth5b(y_b, x, n, sm5_mask);
AvxFmaSmooth5b(y_c, x, n, sm5_mask);
FILE* fp;
const char* fn = "AvxFmaRawData.csv";
fp=fopen(fn, "wr");
if (fp == NULL)
{
printf("File open failed (%s)\n", fn);
return;
}
fprintf(fp, "i, x, y_cpp, y_a, y_b, y_c, ");
fprintf(fp, "diff_ab, diff_ac, diff_bc\n");
Uint32 num_diff_ab = 0, num_diff_ac = 0, num_diff_bc = 0;
for (Uint32 i = 2; i < n - 2; i++)
{
bool diff_ab = false, diff_ac = false, diff_bc = false;
if (y_a[i] != y_b[i])
{
diff_ab = true;
num_diff_ab++;
}
if (y_a[i] != y_c[i])
{
diff_ac = true;
num_diff_ac++;
}
if (y_b[i] != y_c[i])
{
diff_bc = true;
num_diff_bc++;
}
const char* fs1 = "%15.8f, ";
fprintf(fp, "%4d, ", i);
fprintf(fp, fs1, x[i]);
fprintf(fp, fs1, y_cpp[i]);
fprintf(fp, fs1, y_a[i]);
fprintf(fp, fs1, y_b[i]);
fprintf(fp, fs1, y_c[i]);
fprintf(fp, "%d, %d, %d, ", diff_ab, diff_ac, diff_bc);
fprintf(fp, "\n");
}
printf("\nRaw data saved to file %s\n", fn);
printf("\nNumber of data point differences between\n");
printf(" y_a and y_b: %u\n", num_diff_ab);
printf(" y_a and y_c: %u\n", num_diff_ac);
printf(" y_b and y_c: %u\n", num_diff_bc);
fclose(fp);
free(x);
free(y_cpp);
free(y_a);
free(y_b);
free(y_c);
}
int main(int argc, char* argv[])
{
try
{
AvxFma();
//AvxFmaTimed();
}
catch (...)
{
printf("Unexpected exception has occurred!\n");
printf("File: %s (main)\n", __FILE__);
}
return 0;
}
avxfma.asm
; Name: avxfma.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o avxfma.o avxfma.asm
; g++ -m32 -o avxfma avxfma.o main.o
;
; Source: Modern x86 Assembly Language Programming p. 470
global AvxFmaSmooth5a
global AvxFmaSmooth5b
global AvxFmaSmooth5c
section .text
; void AvxFmaSmooth5a(float* y, const float*x, Uint32 n, const float* sm5_mask);
;
; Description: The following function applies a weighted-average
; smoothing transformation to the input array x using
; scalar SPFP multiplication and addition.
;
; Requires: AVX
%define y [ebp+8]
%define x [ebp+12]
%define n [ebp+16]
%define sm5_mask [ebp+20]
AvxFmaSmooth5a:
push ebp
mov ebp,esp
push esi
push edi
; Load argument values
mov edi,y ;edi = ptr to y
mov esi,x ;esi = ptr to x
mov ecx,n ;ecx = n
mov eax,sm5_mask ;eax = ptr to sm5_mask
add esi,8 ;adjust pointers and
add edi,8 ;counter to skip first 2
sub ecx,4 ;and last 2 elements
align 16
; Apply smoothing operator to each element of x
.@1:
vxorps xmm6,xmm6,xmm6 ;x_total=0
; Compute x_total += x[i-2]*sm5_mask[0]
vmovss xmm0,dword [esi-8]
vmulss xmm1,xmm0,dword [eax]
vaddss xmm6,xmm6,xmm1
; Compute x_total += x[i-1]*sm5_mask[1]
vmovss xmm2,dword [esi-4]
vmulss xmm3,xmm2,dword [eax+4]
vaddss xmm6,xmm6,xmm3
; Compute x_total += x[i]*sm5_mask[2]
vmovss xmm0,dword [esi]
vmulss xmm1,xmm0,dword[eax+8]
vaddss xmm6,xmm6,xmm1
; Compute x_total += x[i+1]*sm5_mask[3]
vmovss xmm2,dword[esi+4]
vmulss xmm3,xmm2,dword[eax+12]
vaddss xmm6,xmm6,xmm3
; Compute x_total += x[i+2]*sm5_mask[4]
vmovss xmm0,dword[esi+8]
vmulss xmm1,xmm0,dword[eax+16]
vaddss xmm6,xmm6,xmm1
; Save x_total
vmovss dword[edi],xmm6
add esi,4
add edi,4
sub ecx,1
jnz .@1
pop edi
pop esi
pop ebp
ret
; void AvxFmaSmooth5b(float* y, const float*x, Uint32 n, const float* sm5_mask);
;
; Description: The following function applies a weighted-average
; smoothing transformation to the input array x using
; scalar SPFP fused-multiply-add operations.
;
; Requires: AVX2, FMA
%define y [ebp+8]
%define x [ebp+12]
%define n [ebp+16]
%define sm5_mask [ebp+20]
AvxFmaSmooth5b:
push ebp
mov ebp,esp
push esi
push edi
; Load argument values
mov edi,y ;edi = ptr to y
mov esi,x ;esi = ptr to x
mov ecx,n ;ecx = n
mov eax,sm5_mask ;eax = ptr to sm5_mask
add esi,8 ;adjust pointers and
add edi,8 ;counter to skip first 2
sub ecx,4 ;and last 2 elements
align 16
; Apply smoothing operator to each element of x
.@1:
vxorps xmm6,xmm6,xmm6 ;set x_total1 = 0
vxorps xmm7,xmm7,xmm7 ;set x_total2 = 0
; Compute x_total1 = x[i-2] * sm5_mask[0] + x_total1
vmovss xmm0,dword[esi-8]
vfmadd231ss xmm6,xmm0,dword[eax]
; Compute x_total2 = x[i-1] * sm5_mask[1] + x_total2
vmovss xmm2,dword[esi-4]
vfmadd231ss xmm7,xmm2,dword[eax+4]
; Compute x_total1 = x[i] * sm5_mask[2] + x_total1
vmovss xmm0,dword[esi]
vfmadd231ss xmm6,xmm0,dword[eax+8]
; Compute x_total2 = x[i+1] * sm5_mask[3] + x_total2
vmovss xmm2,dword[esi+4]
vfmadd231ss xmm7,xmm2,dword[eax+12]
; Compute x_total1 = x[i+2] * sm5_mask[4] + x_total1
vmovss xmm0,dword[esi+8]
vfmadd231ss xmm6,xmm0,dword[eax+16]
; Compute final x_total and save result
vaddss xmm5,xmm6,xmm7
vmovss dword[edi],xmm5
add esi,4
add edi,4
sub ecx,1
jnz .@1
pop edi
pop esi
pop ebp
ret
; void AvxFmaSmooth5c(float* y, const float*x, Uint32 n, const float* sm5_mask);
;
; Description: The following function applies a weighted-average
; smoothing transformation to the input array x using
; scalar SPFP fused-multiply-add operations.
;
; Requires: AVX2, FMA
%define y [ebp+8]
%define x [ebp+12]
%define n [ebp+16]
%define sm5_mask [ebp+20]
AvxFmaSmooth5c:
push ebp
mov ebp,esp
push esi
push edi
; Load argument values
mov edi,y ;edi = ptr to y
mov esi,x ;esi = ptr to x
mov ecx,n ;ecx = n
mov eax,sm5_mask ;eax = ptr to sm5_mask
add esi,8 ;adjust pointers and
add edi,8 ;counter to skip first 2
sub ecx,4 ;and last 2 elements
align 16
; Apply smoothing operator to each element of x, save result to y
.@1:
vxorps xmm6,xmm6,xmm6 ;set x_total = 0
; Compute x_total = x[i-2] * sm5_mask[0] + x_total
vmovss xmm0,dword[esi-8]
vfmadd231ss xmm6,xmm0,dword[eax]
; Compute x_total = x[i-1] * sm5_mask[1] + x_total
vmovss xmm0,dword[esi-4]
vfmadd231ss xmm6,xmm0,dword[eax+4]
; Compute x_total = x[i] * sm5_mask[2] + x_total
vmovss xmm0,dword[esi]
vfmadd231ss xmm6,xmm0,dword[eax+8]
; Compute x_total = x[i+1] * sm5_mask[3] + x_total
vmovss xmm0,dword[esi+4]
vfmadd231ss xmm6,xmm0,dword[eax+12]
; Compute x_total = x[i+2] * sm5_mask[4] + x_total
vmovss xmm0,dword[esi+8]
vfmadd231ss xmm6,xmm0,dword[eax+16]
; Save result
vmovss dword[edi],xmm6
add esi,4
add edi,4
sub ecx,1
jnz .@1
pop edi
pop esi
pop ebp
ret
build
g++ -c -m32 main.cpp -o main.o -std=c++11
nasm -f elf32 -o avxfma.o avxfma.asm
g++ -m32 -o avxfma avxfma.o main.o