main.cpp
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>

extern "C" __attribute__((aligned(32))) double CcEpsilon = 1.0e-12;
extern "C" bool AvxPfpCorrCoef(const double* x, const double* y, int n, double sums[5], double* rho);

bool AvxPfpCorrCoefCpp(const double* x, const double* y, int n, double sums[5], double* rho)
{
    double sum_x = 0, sum_y = 0;
    double sum_xx = 0, sum_yy = 0, sum_xy = 0;

    // Make sure x and y are properly aligned to a 32-byte boundary
    if (((uintptr_t)x & 0x1f) != 0)
        return false;
    if (((uintptr_t)y & 0x1f) != 0)
        return false;

    // Make sure n is valid
    if ((n < 4) || ((n & 3) != 0))
        return false;

    // Calculate and save sum variables
    for (int i = 0; i < n; i++)
    {
        sum_x += x[i];
        sum_y += y[i];
        sum_xx += x[i] * x[i];
        sum_yy += y[i] * y[i];
        sum_xy += x[i] * y[i];
    }

    sums[0] = sum_x;
    sums[1] = sum_y;
    sums[2] = sum_xx;
    sums[3] = sum_yy;
    sums[4] = sum_xy;

    // Calculate rho
    double rho_num = n * sum_xy - sum_x * sum_y;
    double rho_den = sqrt(n * sum_xx - sum_x * sum_x) * sqrt(n * sum_yy - sum_y * sum_y);

    if (rho_den >= CcEpsilon)
    {
        *rho = rho_num / rho_den;
        return true;
    }
    else
    {
        *rho = 0;
        return false;
    }
}

int main(int argc, char* argv[])
{
    const int n = 100;
    __attribute__ ((aligned(32))) double x[n];
	__attribute__ ((aligned(32))) double y[n];
    double sums1[5], sums2[5];
    double rho1, rho2;

    srand(17);
	printf("       x              y\n");
    for (int i = 0; i < n; i++)
    {
		// replacing 1000 by 100 shows major differences in the results between Cpp and the assembler
		// code.
        x[i] = rand() / 100000;
        y[i] = x[i] + ((rand() % 6000) - 3000);
		printf("%12.2lf , %12.2lf\n", x[i], y[i]);
    }

    bool rc1 = AvxPfpCorrCoefCpp(x, y, n, sums1, &rho1);
    bool rc2 = AvxPfpCorrCoef(x, y, n, sums2, &rho2);

    printf("\nResults for AvxPackedFloatingPointCorrCoef\n\n");

    if (!rc1 || !rc2)
    {
        printf("Invalid return code (rc1: %d, rc2: %d)\n", rc1, rc2);
        return 1;
    }
    printf("rho1: %.8lf  rho2: %.8lf\n", rho1, rho2);
    printf("\n");
	printf("sum_x : %12.2lf %12.2lf difference: %12.2lf\n", sums1[0], sums2[0], sums1[0]-sums2[0]);
	printf("sum_y : %12.2lf %12.2lf difference: %12.2lf\n", sums1[1], sums2[1], sums1[1]-sums2[1]);
	printf("sum_xx: %12.2lf %12.2lf difference: %12.2lf\n", sums1[2], sums2[2], sums1[2]-sums2[2]);
	printf("sum_yy: %12.2lf %12.2lf difference: %12.2lf\n", sums1[3], sums2[3], sums1[3]-sums2[3]);
	printf("sum_xy: %12.2lf %12.2lf difference: %12.2lf\n", sums1[4], sums2[4], sums1[4]-sums2[4]);
    return 0;
}
avxpackedfloatingpointcorrcoef.asm
; Name:     avxpackedfloatingpointcorrcoef.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o avxpackedfloatingpointcorrcoef.o avxpackedfloatingpointcorrcoef.asm
;           g++ -m32 -o avxpackedfloatingpointcorrcoef avxpackedfloatingpointcorrcoef.o main.o
;
; Source:   Modern x86 Assembly Language Programming p. 389

global AvxPfpCorrCoef

extern CcEpsilon

section .text

; extern "C" bool AvxPfpCorrCoef(const double* x, const double* y, int n, double sums[5], double* rho);
;
; Description:  The following function computes the correlation
;               coeficient for the specified x and y arrays.
;
; Requires:     AVX

%define x      [ebp+8]
%define y      [ebp+12]
%define n      [ebp+16]
%define sums   [ebp+20]
%define rho    [ebp+24]

AvxPfpCorrCoef:
    push    ebp
    mov     ebp,esp

; Load and validate argument values
    mov     eax,x                       ;eax = ptr to x
    test    eax,1fh
    jnz     .badArg                     ;jump if x is not aligned
    mov     edx,y                       ;edx = ptr to y
    test    edx,1fh
    jnz     .badArg                     ;jump if y is not aligned

    mov     ecx,n                       ;ecx = n
    cmp     ecx,4
    jl      .badArg                     ;jump if n < 4
    test    ecx,3                       ;is n evenly divisible by 4?
    jnz     .badArg                     ;jump if no
    shr     ecx,2                       ;ecx = num iterations

; Initialize sum variables to zero
    vxorpd  ymm3,ymm3,ymm3              ;ymm3 = packed sum_x
    vmovapd ymm4,ymm3                   ;ymm4 = packed sum_y
    vmovapd ymm5,ymm3                   ;ymm5 = packed sum_xx
    vmovapd ymm6,ymm3                   ;ymm6 = packed sum_yy
    vmovapd ymm7,ymm3                   ;ymm7 = packed sum_xy

; Calculate intermediate packed sum variables.
.@1:
    vmovapd ymm0,[eax]      ;ymm0 = packed x values
    vmovapd ymm1,[edx]      ;ymm1 = packed y values

    vaddpd  ymm3,ymm3,ymm0              ;update packed sum_x
    vaddpd  ymm4,ymm4,ymm1              ;update packed sum_y

    vmulpd  ymm2,ymm0,ymm1              ;ymm2 = packed xy values
    vaddpd  ymm7,ymm7,ymm2              ;update packed sum_xy

    vmulpd  ymm0,ymm0,ymm0              ;ymm0 = packed xx values
    vmulpd  ymm1,ymm1,ymm1              ;ymm1 = packed yy values
    vaddpd  ymm5,ymm5,ymm0              ;update packed sum_xx
    vaddpd  ymm6,ymm6,ymm1              ;update packed sum_yy

    add     eax,32                      ;update x ptr
    add     edx,32                      ;update y ptr
    dec     ecx                         ;update loop counter
    jnz     .@1                         ;repeat if not finished

; Calculate final sum variables
    vextractf128 xmm0,ymm3,1
    vaddpd  xmm1,xmm0,xmm3
    vhaddpd xmm3,xmm1,xmm1              ;xmm3[63:0] = sum_x

    vextractf128 xmm0,ymm4,1
    vaddpd  xmm1,xmm0,xmm4
    vhaddpd xmm4,xmm1,xmm1              ;xmm4[63:0] = sum_y

    vextractf128 xmm0,ymm5,1
    vaddpd  xmm1,xmm0,xmm5
    vhaddpd xmm5,xmm1,xmm1              ;xmm5[63:0] = sum_xx

    vextractf128 xmm0,ymm6,1
    vaddpd  xmm1,xmm0,xmm6
    vhaddpd xmm6,xmm1,xmm1              ;xmm6[63:0] = sum_yy

    vextractf128 xmm0,ymm7,1
    vaddpd  xmm1,xmm0,xmm7
    vhaddpd xmm7,xmm1,xmm1              ;xmm7[63:0] = sum_xy

; Save final sum variables
    mov     eax,sums                    ;eax = ptr to sums array
    vmovsd  [eax],xmm3                  ;save sum_x
    vmovsd  [eax+8],xmm4                ;save sum_y
    vmovsd  [eax+16],xmm5               ;save sum_xx
    vmovsd  [eax+24],xmm6               ;save sum_yy
    vmovsd  [eax+32],xmm7               ;save sum_xy

; Calculate rho numerator
; rho_num = n * sum_xy - sum_x * sum_y;
    vcvtsi2sd xmm2,xmm2,n  ;xmm2 = n
    vmulsd  xmm0,xmm2,xmm7              ;xmm0= = n * sum_xy
    vmulsd  xmm1,xmm3,xmm4              ;xmm1 = sum_x * sum_y
    vsubsd  xmm7,xmm0,xmm1              ;xmm7 = rho_num

; Calculate rho denominator
; t1 = sqrt(n * sum_xx - sum_x * sum_x)
; t2 = sqrt(n * sum_yy - sum_y * sum_y)
; rho_den = t1 * t2
    vmulsd  xmm0,xmm2,xmm5              ;xmm0 = n * sum_xx
    vmulsd  xmm3,xmm3,xmm3              ;xmm3 = sum_x * sum_x
    vsubsd  xmm3,xmm0,xmm3              ;xmm3 = n * sum_xx - sum_x * sum_x
    vsqrtsd xmm3,xmm3,xmm3              ;xmm3 = t1

    vmulsd  xmm0,xmm2,xmm6              ;xmm0 = n * sum_yy
    vmulsd  xmm4,xmm4,xmm4              ;xmm4 = sum_y * sum_y
    vsubsd  xmm4,xmm0,xmm4              ;xmm4 = n * sum_yy - sum_y * sum_y
    vsqrtsd xmm4,xmm4,xmm4              ;xmm4 = t2

    vmulsd  xmm0,xmm3,xmm4              ;xmm0 = rho_den

; Calculate final value of rho
    xor     eax,eax                     ;clear upper bits of eax
    vcomisd xmm0,[CcEpsilon]            ;is rho_den < CcEpsilon?
    setae   al                          ;set return code
    jb      .badRho                     ;jump if rho_den < CcEpsilon

    vdivsd  xmm1,xmm7,xmm0              ;xmm1 = rho
.svRho:
    mov     edx,rho                     ;edx = ptr to rho
    vmovsd  qword[edx],xmm1             ;save rho
    vzeroupper
.done:
    pop     ebp
    ret

; Error handlers
.badRho:
    vxorpd  xmm1,xmm1,xmm1              ;rho = 0
    jmp     .svRho
.badArg:
    xor     eax,eax                     ;eax = invalid arg ret code
    jmp     .done
build
g++ -c -m32 main.cpp -o main.o                                               ^
nasm -f elf32 -o avxpackedfloatingpointcorrcoef.o avxpackedfloatingpointcorrcoef.asm
g++ -m32 -o avxpackedfloatingpointcorrcoef avxpackedfloatingpointcorrcoef.o main.o