main.cpp
#include <stdio.h>
#include <memory.h>
#include <stdlib.h>
#include <inttypes.h>

extern "C" bool AvxPfpColMeans(const double* x, int nrows, int ncols, double* col_means);

double* aligned_malloc(int n, int alignment){
    void* ptrx;
    int rc = posix_memalign(&ptrx,alignment,n*sizeof(double));
    return (double*)ptrx;
}

bool AvxPfpColMeansCpp(const double* x, int nrows, int ncols, double* col_means)
{
    // Make sure nrows and ncols are valid
    if ((nrows <= 0) || (ncols <= 0))
        return false;

    // Make sure col_means is properly aligned
    if (((uintptr_t)col_means & 0x1f) != 0)
        return false;

    // Calculate column means
    memset(col_means, 0, ncols * sizeof(double));

    for (int i = 0; i < nrows; i++)
    {
        for (int j = 0; j < ncols; j++)
            col_means[j] += x[i * ncols + j];
    }

    for (int j = 0; j < ncols; j++)
        col_means[j] /= nrows;

    return true;
}

int main(int argc, char* argv[])
{
    const int nrows = 13;
    const int ncols = 11;
	double* x = (double*)malloc(nrows * ncols * sizeof(double));
    double* col_means1 = (double*)aligned_malloc(ncols, 32);
    double* col_means2 = (double*)aligned_malloc(ncols, 32);

    srand(47);
    rand();

    for (int i = 0; i < nrows; i++)
    {
        for (int j = 0; j < ncols; j++)
            x[i * ncols + j] = rand() % 511;
    }

    bool rc1 = AvxPfpColMeansCpp(x, nrows, ncols, col_means1);
    bool rc2 = AvxPfpColMeans(x, nrows, ncols, col_means2);
	
    printf("Results for sample program AvxPackedFloatingPointColMeans\n");

    if (rc1 != rc2)
    {
        printf("Bad return code (rc1 = %d, rc2 = %d)\n", rc1, rc2);
        return 1;
    }

    printf("\nTest Matrix\n");
    for (int i = 0; i < nrows; i++)
    {
        printf("row %2d: ", i);
        for (int j = 0; j < ncols; j++)
            printf("%5.0lf ", x[i * ncols + j]);
        printf("\n");
    }
    printf("\n");

    for (int j = 0; j < ncols; j++)
    {
        printf("col_means1[%2d]: %12.4lf  ", j, col_means1[j]);
        printf("col_means2[%2d]: %12.4lf  ", j, col_means2[j]);
        printf("\n");
    }
    
    free(x);
    free(col_means1);
    free(col_means2);
	
    return 0;
}
avxpackedfloatingpointcolmeans.asm
; Name:     avxpackedfloatingpointcolmeans.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o avxpackedfloatingpointcolmeans.o avxpackedfloatingpointcolmeans.asm
;           g++ -m32 -o avxpackedfloatingpointcolmeans avxpackedfloatingpointcolmeans.o main.o
;
; Source:   Modern x86 Assembly Language Programming p. 396

global AvxPfpColMeans

section .text

; extern "C" bool AvxPfpColMeans(const double* x, int nrows, int ncols, double* col_means)
;
; Description:  The following function computes the mean value of each
;               column in a matrix of DPFP values.
;
; Requires:     AVX

AvxPfpColMeans:
    push    ebp
    mov     ebp,esp
    push    ebx
    push    esi
    push    edi

; Load and validate arguments
    mov     esi,dword[ebp+8]        ;esi = ptr to x

    xor     eax,eax
    mov     edx,[ebp+12]            ;edx = nrows
    test    edx,edx
    jle     .badArg                 ;jump if nrows <= 0

    mov     ecx,[ebp+16]            ;ecx = ncols
    test    ecx,ecx
    jle     .badArg                 ;jump if ncols <= 0

    mov     edi,dword[ebp+20]       ;edi = ptr to col_means
    test    edi,1fh
    jnz     .badArg                 ;jump if col_means not aligned

; Set col_means to zero
    mov     ebx,ecx                 ;ebx = ncols
    shl     ecx,1                   ;ecx = num dowrds in col_means
    rep     stosd                   ;set col_means to zero

; Compute the sum of each column in x
.lp1:
    mov     edi,dword[ebp+20]       ;edi = ptr to col_means
    xor     ecx,ecx                 ;ecx = col_index

.lp2:
    mov     eax,ecx                 ;eax = col_index
    add     eax,4
    cmp     eax,ebx                 ;4 or more columns remaining?
    jg      .@1                     ;jump if col_index + 4 > ncols

; Update col_means using next four columns
    vmovupd ymm0,[esi]              ;load next 4 cols of cur row
    vaddpd  ymm1,ymm0,[edi]         ;add to col_means
    vmovapd [edi],ymm1              ;save updated col_means
    add     ecx,4                   ;col_index += 4
    add     esi,32                  ;update x ptr
    add     edi,32                  ;update col_means ptr
    jmp     .nextColSet
.@1:
    sub     eax,2
    cmp     eax,ebx                 ;2 or more columns remaining?
    jg      .@2                     ;jump if col_index + 2 > ncols

; Update col_means using next two columns
    vmovupd xmm0,[esi]              ;load next 2 cols of cur row
    vaddpd  xmm1,xmm0,[edi]         ;add to col_meanss
    vmovapd [edi],xmm1              ;save updated col_meanss
    add     ecx,2                   ;col_index += 2
    add     esi,16                  ;update x ptr
    add     edi,16                  ;update col_means ptr
    jmp     .nextColSet

; Update col_means using next column (or last column in the current row)
.@2:
    vmovsd  xmm0,qword[esi]         ;load x from last column
    vaddsd  xmm1,xmm0,qword[edi]    ;add to col_means
    vmovsd  qword[edi],xmm1         ;save updated col_means
    add     ecx,1                   ;col_index += 1
    add     esi,8                   ;update x ptr
.nextColSet:
    cmp     ecx,ebx                 ;more columns in current row?
    jl      .lp2                    ;jump if yes
    dec     edx                     ;nrows -= 1
    jnz     .lp1                    ;jump if more rows

; Compute the final col_means
    mov       eax,[ebp+12]          ;eax = nrows
    vcvtsi2sd xmm2,xmm2,eax         ;xmm2 = DPFP nrows
    mov       edx,[ebp+16]          ;edx = ncols
    mov       edi,dword[ebp+20]     ;edi = ptr to col_means
.@3:
    vmovsd  xmm0,qword[edi]         ;xmm0 = col_means[i]
    vdivsd  xmm1,xmm0,xmm2          ;compute final mean
    vmovsd  [edi],xmm1              ;save col_mean[i]
    add     edi,8                   ;update col_means ptr
    dec     edx                     ;ncols -= 1
    jnz     .@3                     ;repeat until done
    mov     eax,1                   ;set success return code
    vzeroupper
.badArg:
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    ret
build
g++ -g -c -m32 main.cpp -o main.o
nasm -f elf32 -o avxpackedfloatingpointcolmeans.o avxpackedfloatingpointcolmeans.asm
g++ -m32 -o avxpackedfloatingpointcolmeans avxpackedfloatingpointcolmeans.o main.o