main.cpp
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>
extern "C" __attribute__((aligned(32))) double CcEpsilon = 1.0e-12;
extern "C" bool AvxPfpCorrCoef(const double* x, const double* y, int n, double sums[5], double* rho);
bool AvxPfpCorrCoefCpp(const double* x, const double* y, int n, double sums[5], double* rho)
{
double sum_x = 0, sum_y = 0;
double sum_xx = 0, sum_yy = 0, sum_xy = 0;
// Make sure x and y are properly aligned to a 32-byte boundary
if (((uintptr_t)x & 0x1f) != 0)
return false;
if (((uintptr_t)y & 0x1f) != 0)
return false;
// Make sure n is valid
if ((n < 4) || ((n & 3) != 0))
return false;
// Calculate and save sum variables
for (int i = 0; i < n; i++)
{
sum_x += x[i];
sum_y += y[i];
sum_xx += x[i] * x[i];
sum_yy += y[i] * y[i];
sum_xy += x[i] * y[i];
}
sums[0] = sum_x;
sums[1] = sum_y;
sums[2] = sum_xx;
sums[3] = sum_yy;
sums[4] = sum_xy;
// Calculate rho
double rho_num = n * sum_xy - sum_x * sum_y;
double rho_den = sqrt(n * sum_xx - sum_x * sum_x) * sqrt(n * sum_yy - sum_y * sum_y);
if (rho_den >= CcEpsilon)
{
*rho = rho_num / rho_den;
return true;
}
else
{
*rho = 0;
return false;
}
}
int main(int argc, char* argv[])
{
const int n = 100;
__attribute__ ((aligned(32))) double x[n];
__attribute__ ((aligned(32))) double y[n];
double sums1[5], sums2[5];
double rho1, rho2;
srand(17);
printf(" x y\n");
for (int i = 0; i < n; i++)
{
// replacing 1000 by 100 shows major differences in the results between Cpp and the assembler
// code.
x[i] = rand() / 100000;
y[i] = x[i] + ((rand() % 6000) - 3000);
printf("%12.2lf , %12.2lf\n", x[i], y[i]);
}
bool rc1 = AvxPfpCorrCoefCpp(x, y, n, sums1, &rho1);
bool rc2 = AvxPfpCorrCoef(x, y, n, sums2, &rho2);
printf("\nResults for AvxPackedFloatingPointCorrCoef\n\n");
if (!rc1 || !rc2)
{
printf("Invalid return code (rc1: %d, rc2: %d)\n", rc1, rc2);
return 1;
}
printf("rho1: %.8lf rho2: %.8lf\n", rho1, rho2);
printf("\n");
printf("sum_x : %12.2lf %12.2lf difference: %12.2lf\n", sums1[0], sums2[0], sums1[0]-sums2[0]);
printf("sum_y : %12.2lf %12.2lf difference: %12.2lf\n", sums1[1], sums2[1], sums1[1]-sums2[1]);
printf("sum_xx: %12.2lf %12.2lf difference: %12.2lf\n", sums1[2], sums2[2], sums1[2]-sums2[2]);
printf("sum_yy: %12.2lf %12.2lf difference: %12.2lf\n", sums1[3], sums2[3], sums1[3]-sums2[3]);
printf("sum_xy: %12.2lf %12.2lf difference: %12.2lf\n", sums1[4], sums2[4], sums1[4]-sums2[4]);
return 0;
}
avxpackedfloatingpointcorrcoef.asm
; Name: avxpackedfloatingpointcorrcoef.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o avxpackedfloatingpointcorrcoef.o avxpackedfloatingpointcorrcoef.asm
; g++ -m32 -o avxpackedfloatingpointcorrcoef avxpackedfloatingpointcorrcoef.o main.o
;
; Source: Modern x86 Assembly Language Programming p. 389
global AvxPfpCorrCoef
extern CcEpsilon
section .text
; extern "C" bool AvxPfpCorrCoef(const double* x, const double* y, int n, double sums[5], double* rho);
;
; Description: The following function computes the correlation
; coeficient for the specified x and y arrays.
;
; Requires: AVX
%define x [ebp+8]
%define y [ebp+12]
%define n [ebp+16]
%define sums [ebp+20]
%define rho [ebp+24]
AvxPfpCorrCoef:
push ebp
mov ebp,esp
; Load and validate argument values
mov eax,x ;eax = ptr to x
test eax,1fh
jnz .badArg ;jump if x is not aligned
mov edx,y ;edx = ptr to y
test edx,1fh
jnz .badArg ;jump if y is not aligned
mov ecx,n ;ecx = n
cmp ecx,4
jl .badArg ;jump if n < 4
test ecx,3 ;is n evenly divisible by 4?
jnz .badArg ;jump if no
shr ecx,2 ;ecx = num iterations
; Initialize sum variables to zero
vxorpd ymm3,ymm3,ymm3 ;ymm3 = packed sum_x
vmovapd ymm4,ymm3 ;ymm4 = packed sum_y
vmovapd ymm5,ymm3 ;ymm5 = packed sum_xx
vmovapd ymm6,ymm3 ;ymm6 = packed sum_yy
vmovapd ymm7,ymm3 ;ymm7 = packed sum_xy
; Calculate intermediate packed sum variables.
.@1:
vmovapd ymm0,[eax] ;ymm0 = packed x values
vmovapd ymm1,[edx] ;ymm1 = packed y values
vaddpd ymm3,ymm3,ymm0 ;update packed sum_x
vaddpd ymm4,ymm4,ymm1 ;update packed sum_y
vmulpd ymm2,ymm0,ymm1 ;ymm2 = packed xy values
vaddpd ymm7,ymm7,ymm2 ;update packed sum_xy
vmulpd ymm0,ymm0,ymm0 ;ymm0 = packed xx values
vmulpd ymm1,ymm1,ymm1 ;ymm1 = packed yy values
vaddpd ymm5,ymm5,ymm0 ;update packed sum_xx
vaddpd ymm6,ymm6,ymm1 ;update packed sum_yy
add eax,32 ;update x ptr
add edx,32 ;update y ptr
dec ecx ;update loop counter
jnz .@1 ;repeat if not finished
; Calculate final sum variables
vextractf128 xmm0,ymm3,1
vaddpd xmm1,xmm0,xmm3
vhaddpd xmm3,xmm1,xmm1 ;xmm3[63:0] = sum_x
vextractf128 xmm0,ymm4,1
vaddpd xmm1,xmm0,xmm4
vhaddpd xmm4,xmm1,xmm1 ;xmm4[63:0] = sum_y
vextractf128 xmm0,ymm5,1
vaddpd xmm1,xmm0,xmm5
vhaddpd xmm5,xmm1,xmm1 ;xmm5[63:0] = sum_xx
vextractf128 xmm0,ymm6,1
vaddpd xmm1,xmm0,xmm6
vhaddpd xmm6,xmm1,xmm1 ;xmm6[63:0] = sum_yy
vextractf128 xmm0,ymm7,1
vaddpd xmm1,xmm0,xmm7
vhaddpd xmm7,xmm1,xmm1 ;xmm7[63:0] = sum_xy
; Save final sum variables
mov eax,sums ;eax = ptr to sums array
vmovsd [eax],xmm3 ;save sum_x
vmovsd [eax+8],xmm4 ;save sum_y
vmovsd [eax+16],xmm5 ;save sum_xx
vmovsd [eax+24],xmm6 ;save sum_yy
vmovsd [eax+32],xmm7 ;save sum_xy
; Calculate rho numerator
; rho_num = n * sum_xy - sum_x * sum_y;
vcvtsi2sd xmm2,xmm2,n ;xmm2 = n
vmulsd xmm0,xmm2,xmm7 ;xmm0= = n * sum_xy
vmulsd xmm1,xmm3,xmm4 ;xmm1 = sum_x * sum_y
vsubsd xmm7,xmm0,xmm1 ;xmm7 = rho_num
; Calculate rho denominator
; t1 = sqrt(n * sum_xx - sum_x * sum_x)
; t2 = sqrt(n * sum_yy - sum_y * sum_y)
; rho_den = t1 * t2
vmulsd xmm0,xmm2,xmm5 ;xmm0 = n * sum_xx
vmulsd xmm3,xmm3,xmm3 ;xmm3 = sum_x * sum_x
vsubsd xmm3,xmm0,xmm3 ;xmm3 = n * sum_xx - sum_x * sum_x
vsqrtsd xmm3,xmm3,xmm3 ;xmm3 = t1
vmulsd xmm0,xmm2,xmm6 ;xmm0 = n * sum_yy
vmulsd xmm4,xmm4,xmm4 ;xmm4 = sum_y * sum_y
vsubsd xmm4,xmm0,xmm4 ;xmm4 = n * sum_yy - sum_y * sum_y
vsqrtsd xmm4,xmm4,xmm4 ;xmm4 = t2
vmulsd xmm0,xmm3,xmm4 ;xmm0 = rho_den
; Calculate final value of rho
xor eax,eax ;clear upper bits of eax
vcomisd xmm0,[CcEpsilon] ;is rho_den < CcEpsilon?
setae al ;set return code
jb .badRho ;jump if rho_den < CcEpsilon
vdivsd xmm1,xmm7,xmm0 ;xmm1 = rho
.svRho:
mov edx,rho ;edx = ptr to rho
vmovsd qword[edx],xmm1 ;save rho
vzeroupper
.done:
pop ebp
ret
; Error handlers
.badRho:
vxorpd xmm1,xmm1,xmm1 ;rho = 0
jmp .svRho
.badArg:
xor eax,eax ;eax = invalid arg ret code
jmp .done
build
g++ -c -m32 main.cpp -o main.o ^
nasm -f elf32 -o avxpackedfloatingpointcorrcoef.o avxpackedfloatingpointcorrcoef.asm
g++ -m32 -o avxpackedfloatingpointcorrcoef avxpackedfloatingpointcorrcoef.o main.o