avxfma.h
#pragma once
#include "../../commonfiles/miscdefs.h"

// These functions are defined in AvxFma.cpp
extern void AvxFmaSmooth5Cpp(float* y, const float*x, Uint32 n, const float* sm5_mask);
extern bool AvxFmaInitX(float* x, Uint32 n);

// These functions are defined in AvxFma_.asm
extern "C" void AvxFmaSmooth5a(float* y, const float*x, Uint32 n, const float* sm5_mask);
extern "C" void AvxFmaSmooth5b(float* y, const float*x, Uint32 n, const float* sm5_mask);
extern "C" void AvxFmaSmooth5c(float* y, const float*x, Uint32 n, const float* sm5_mask);

// next function is removed because of windows-only functions.  In time I need to figure out
// how to reconstruct this function for g++ on Linux
// These functions are defined in AvxFmaTimed.cpp
//extern void AvxFmaTimed(void);
main.cpp
#include "avxfma.h"
#include <stdio.h>
#include <malloc.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include <stdlib.h>

float* aligned_malloc(int n, int alignment){
    void* ptrx;
    int rc = posix_memalign(&ptrx,alignment,n*sizeof(float));
    return (float*)ptrx;
}

bool AvxFmaInitX(float* x, Uint32 n)
{
    const float degtorad = (float)(M_PI / 180.0);
    const float t_start = 0;
    const float t_step = 0.002f;
    const Uint32 m = 3;
    const float amp[m] = {1.0f, 0.80f, 1.20f};
    const float freq[m] = {5.0f, 10.0f, 15.0f};
    const float phase[m] = {0.0f, 45.0f, 90.0f};
    float t = t_start;

    srand(97);

    for (Uint32 i = 0; i < n; i++, t += t_step)
    {
        float x_total = 0;

        for (Uint32 j = 0; j < m; j++)
        {
            float omega = 2.0f * (float)M_PI * freq[j];
            float x_temp1 = amp[j] * sin(omega * t + phase[j] * degtorad);
            float noise = (float)((rand() % 300) - 150) / 10.0f;
            float x_temp2 = x_temp1 + x_temp1 * noise / 100.0f;

            x_total += x_temp2;
        }

        x[i] = x_total;
    }

    return true;
}

void AvxFmaSmooth5Cpp(float* y, const float*x, Uint32 n, const float* sm5_mask)
{
    for (Uint32 i = 2; i < n - 2; i++)
    {
        float sum = 0;

        sum += x[i - 2] * sm5_mask[0];
        sum += x[i - 1] * sm5_mask[1];
        sum += x[i + 0] * sm5_mask[2];
        sum += x[i + 1] * sm5_mask[3];
        sum += x[i + 2] * sm5_mask[4];
        y[i] = sum;
    }
}

void AvxFma(void)
{
    const Uint32 n = 512;
    float* x = aligned_malloc(n, 32);
    float* y_cpp = aligned_malloc(n, 32);
    float* y_a = aligned_malloc(n, 32);
    float* y_b = aligned_malloc(n, 32);
    float* y_c = aligned_malloc(n, 32);
    const float sm5_mask[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };

    printf("Results for AvxFma\n");

    if (!AvxFmaInitX(x, n))
    {
        printf("Data initialization failed\n");
        return;
    }

    AvxFmaSmooth5Cpp(y_cpp, x, n, sm5_mask);
    AvxFmaSmooth5a(y_a, x, n, sm5_mask);
    AvxFmaSmooth5b(y_b, x, n, sm5_mask);
    AvxFmaSmooth5b(y_c, x, n, sm5_mask);

    FILE* fp;
    const char* fn = "AvxFmaRawData.csv";

    fp=fopen(fn, "wr");
    if (fp == NULL)
    {
        printf("File open failed (%s)\n", fn);
        return;
    }

    fprintf(fp, "i, x, y_cpp, y_a, y_b, y_c, ");
    fprintf(fp, "diff_ab, diff_ac, diff_bc\n");

    Uint32 num_diff_ab = 0, num_diff_ac = 0, num_diff_bc = 0;

    for (Uint32 i = 2; i < n - 2; i++)
    {
        bool diff_ab = false, diff_ac = false, diff_bc = false;

        if (y_a[i] != y_b[i])
        {
            diff_ab = true;
            num_diff_ab++;
        }

        if (y_a[i] != y_c[i])
        {
            diff_ac = true;
            num_diff_ac++;
        }

        if (y_b[i] != y_c[i])
        {
            diff_bc = true;
            num_diff_bc++;
        }

        const char* fs1 = "%15.8f, ";
        fprintf(fp, "%4d, ", i);
        fprintf(fp, fs1, x[i]);
        fprintf(fp, fs1, y_cpp[i]);
        fprintf(fp, fs1, y_a[i]);
        fprintf(fp, fs1, y_b[i]);
        fprintf(fp, fs1, y_c[i]);
        fprintf(fp, "%d, %d, %d, ", diff_ab, diff_ac, diff_bc);
        fprintf(fp, "\n");
    }
    
    printf("\nRaw data saved to file %s\n", fn);
    printf("\nNumber of data point differences between\n");
    printf("  y_a and y_b: %u\n", num_diff_ab);
    printf("  y_a and y_c: %u\n", num_diff_ac);
    printf("  y_b and y_c: %u\n", num_diff_bc);

    fclose(fp);
    free(x);
    free(y_cpp);
    free(y_a);
    free(y_b);
    free(y_c);

}

int main(int argc, char* argv[])
{
    try
    {
        AvxFma();
        //AvxFmaTimed();
    }

    catch (...)
    {
        printf("Unexpected exception has occurred!\n");
        printf("File: %s (main)\n", __FILE__);
    }

    return 0;
}
avxfma.asm
; Name:     avxfma.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o avxfma.o avxfma.asm
;           g++ -m32 -o avxfma avxfma.o main.o
;
; Source:   Modern x86 Assembly Language Programming p. 470

global AvxFmaSmooth5a
global AvxFmaSmooth5b
global AvxFmaSmooth5c

section .text

; void AvxFmaSmooth5a(float* y, const float*x, Uint32 n, const float* sm5_mask);
;
; Description:  The following function applies a weighted-average
;               smoothing transformation to the input array x using
;               scalar SPFP multiplication and addition.
;
; Requires:     AVX

%define y           [ebp+8]
%define x           [ebp+12]
%define n           [ebp+16]
%define sm5_mask    [ebp+20]

AvxFmaSmooth5a:
    push    ebp
    mov     ebp,esp
    push    esi
    push    edi

; Load argument values
    mov     edi,y                           ;edi = ptr to y
    mov     esi,x                           ;esi = ptr to x
    mov     ecx,n                           ;ecx = n
    mov     eax,sm5_mask                    ;eax = ptr to sm5_mask

    add     esi,8                           ;adjust pointers and
    add     edi,8                           ;counter to skip first 2
    sub     ecx,4                           ;and last 2 elements

align 16
; Apply smoothing operator to each element of x
.@1:
    vxorps  xmm6,xmm6,xmm6               ;x_total=0

; Compute x_total += x[i-2]*sm5_mask[0]
    vmovss  xmm0,dword [esi-8]
    vmulss  xmm1,xmm0,dword [eax]
    vaddss  xmm6,xmm6,xmm1

; Compute x_total += x[i-1]*sm5_mask[1]
    vmovss  xmm2,dword [esi-4]
    vmulss  xmm3,xmm2,dword [eax+4]
    vaddss  xmm6,xmm6,xmm3

; Compute x_total += x[i]*sm5_mask[2]
    vmovss  xmm0,dword [esi]
    vmulss  xmm1,xmm0,dword[eax+8]
    vaddss  xmm6,xmm6,xmm1

; Compute x_total += x[i+1]*sm5_mask[3]
    vmovss  xmm2,dword[esi+4]
    vmulss  xmm3,xmm2,dword[eax+12]
    vaddss  xmm6,xmm6,xmm3

; Compute x_total += x[i+2]*sm5_mask[4]
    vmovss  xmm0,dword[esi+8]
    vmulss  xmm1,xmm0,dword[eax+16]
    vaddss  xmm6,xmm6,xmm1

; Save x_total
    vmovss  dword[edi],xmm6

    add     esi,4
    add     edi,4
    sub     ecx,1
    jnz     .@1

    pop     edi
    pop     esi
    pop     ebp
    ret

; void AvxFmaSmooth5b(float* y, const float*x, Uint32 n, const float* sm5_mask);
;
; Description:  The following function applies a weighted-average
;               smoothing transformation to the input array x using
;               scalar SPFP fused-multiply-add operations.
;
; Requires:     AVX2, FMA

%define y           [ebp+8]
%define x           [ebp+12]
%define n           [ebp+16]
%define sm5_mask    [ebp+20]

AvxFmaSmooth5b:
    push    ebp
    mov     ebp,esp
    push    esi
    push    edi

; Load argument values
    mov     edi,y                           ;edi = ptr to y
    mov     esi,x                           ;esi = ptr to x
    mov     ecx,n                           ;ecx = n
    mov     eax,sm5_mask                    ;eax = ptr to sm5_mask

    add     esi,8                           ;adjust pointers and
    add     edi,8                           ;counter to skip first 2
    sub     ecx,4                           ;and last 2 elements

align 16
; Apply smoothing operator to each element of x
.@1:
    vxorps  xmm6,xmm6,xmm6               ;set x_total1 = 0
    vxorps  xmm7,xmm7,xmm7               ;set x_total2 = 0

; Compute x_total1 = x[i-2] * sm5_mask[0] + x_total1
    vmovss      xmm0,dword[esi-8]
    vfmadd231ss xmm6,xmm0,dword[eax]

; Compute x_total2 = x[i-1] * sm5_mask[1] + x_total2
    vmovss      xmm2,dword[esi-4]
    vfmadd231ss xmm7,xmm2,dword[eax+4]

; Compute x_total1 = x[i] * sm5_mask[2] + x_total1
    vmovss      xmm0,dword[esi]
    vfmadd231ss xmm6,xmm0,dword[eax+8]

; Compute x_total2 = x[i+1] * sm5_mask[3] + x_total2
    vmovss      xmm2,dword[esi+4]
    vfmadd231ss xmm7,xmm2,dword[eax+12]

; Compute x_total1 = x[i+2] * sm5_mask[4] + x_total1
    vmovss      xmm0,dword[esi+8]
    vfmadd231ss xmm6,xmm0,dword[eax+16]

; Compute final x_total and save result
    vaddss  xmm5,xmm6,xmm7
    vmovss  dword[edi],xmm5

    add     esi,4
    add     edi,4
    sub     ecx,1
    jnz     .@1

    pop     edi
    pop     esi
    pop     ebp
    ret

; void AvxFmaSmooth5c(float* y, const float*x, Uint32 n, const float* sm5_mask);
;
; Description:  The following function applies a weighted-average
;               smoothing transformation to the input array x using
;               scalar SPFP fused-multiply-add operations.
;
; Requires:     AVX2, FMA

%define y           [ebp+8]
%define x           [ebp+12]
%define n           [ebp+16]
%define sm5_mask    [ebp+20]

AvxFmaSmooth5c:
    push    ebp
    mov     ebp,esp
    push    esi
    push    edi

; Load argument values
    mov     edi,y                           ;edi = ptr to y
    mov     esi,x                           ;esi = ptr to x
    mov     ecx,n                           ;ecx = n
    mov     eax,sm5_mask                    ;eax = ptr to sm5_mask

    add     esi,8                           ;adjust pointers and
    add     edi,8                           ;counter to skip first 2
    sub     ecx,4                           ;and last 2 elements

align 16

; Apply smoothing operator to each element of x, save result to y
.@1:
    vxorps xmm6,xmm6,xmm6               ;set x_total = 0

; Compute x_total = x[i-2] * sm5_mask[0] + x_total
    vmovss      xmm0,dword[esi-8]
    vfmadd231ss xmm6,xmm0,dword[eax]

; Compute x_total = x[i-1] * sm5_mask[1] + x_total
    vmovss      xmm0,dword[esi-4]
    vfmadd231ss xmm6,xmm0,dword[eax+4]

; Compute x_total = x[i] * sm5_mask[2] + x_total
    vmovss      xmm0,dword[esi]
    vfmadd231ss xmm6,xmm0,dword[eax+8]

; Compute x_total = x[i+1] * sm5_mask[3] + x_total
    vmovss      xmm0,dword[esi+4]
    vfmadd231ss xmm6,xmm0,dword[eax+12]

; Compute x_total = x[i+2] * sm5_mask[4] + x_total
    vmovss      xmm0,dword[esi+8]
    vfmadd231ss xmm6,xmm0,dword[eax+16]

; Save result
    vmovss  dword[edi],xmm6

    add     esi,4
    add     edi,4
    sub     ecx,1
    jnz     .@1

    pop     edi
    pop     esi
    pop     ebp
    ret
build
g++ -c -m32 main.cpp -o main.o -std=c++11
nasm -f elf32 -o avxfma.o avxfma.asm
g++ -m32 -o avxfma avxfma.o main.o