main.cpp
#include <stddef.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
extern "C" double LsEpsilon = 1.0e-12;
extern "C" bool SsePfpLeastSquares(const double* x, const double* y, int n, double* m, double* b);
bool SsePfpLeastSquaresCpp(const double* x, const double* y, int n, double* m, double* b)
{
if (n < 2)
return false;
// Make sure x and y are properly aligned
if ((((uintptr_t)x & 0xf) != 0) || (((uintptr_t)y & 0xf) != 0))
return false;
double sum_x = 0, sum_y = 0.0, sum_xx = 0, sum_xy = 0.0;
for (int i = 0; i < n; i++)
{
sum_x += x[i];
sum_xx += x[i] * x[i];
sum_xy += x[i] * y[i];
sum_y += y[i];
}
double denom = n * sum_xx - sum_x * sum_x;
if (fabs(denom) >= LsEpsilon)
{
*m = (n * sum_xy - sum_x * sum_y) / denom;
*b = (sum_xx * sum_y - sum_x * sum_xy) / denom;
return true;
}
else
{
*m = *b = 0.0;
return false;
}
}
int main(int argc, char* argv[])
{
const int n = 11;
__attribute__ ((aligned(16))) double x[n] = {10, 13, 17, 19, 23, 7, 35, 51, 89, 92, 99};
__attribute__ ((aligned(16))) double y[n] = {1.2, 1.1, 1.8, 2.2, 1.9, 0.5, 3.1, 5.5, 8.4, 9.7, 10.4};
double m1, m2, b1, b2;
bool rc1 = SsePfpLeastSquaresCpp(x, y, n, &m1, &b1);
bool rc2 = SsePfpLeastSquares(x, y, n, &m2, &b2);
printf("\nResults from SsePackedFloatingPointLeastSquaresCpp\n");
printf(" rc: %12d\n", rc1);
printf(" slope: %12.8lf\n", m1);
printf(" intercept: %12.8lf\n", b1);
printf("\nResults from SsePackedFloatingPointLeastSquares\n");
printf(" rc: %12d\n", rc2);
printf(" slope: %12.8lf\n", m2);
printf(" intercept: %12.8lf\n", b2);
return 0;
}
ssepackedfloatingpointleastsquares.asm
; Name: ssepackedfloatingpointleastsquares.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o ssepackedfloatingpointleastsquares.o ssepackedfloatingpointleastsquares.asm
; g++ -m32 -o ssepackedfloatingpointleastsquares ssepackedfloatingpointleastsquares.o main.o
;
; Source: Modern x86 Assembly Language Programming p. 254
extern LsEpsilon
global SsePfpLeastSquares
section .data
align 16 ; must be aligned on 16 byte boundary
PackedFp64Abs: dq 0x7fffffffffffffff,0x7fffffffffffffff
section .text
; extern "C" bool SsePfpLeastSquares(const double* x, const double* y, int n, double* m, double* b);
;
; Description: The following function computes the slope and intercept
; of a least squares regression line.
;
; Returns 0 = error (invalid n or improperly aligned array)
; 1 = success
;
; Requires: SSE3
%define x [ebp+8]
%define y [ebp+12]
%define n [ebp+16]
%define m [ebp+20]
%define b [ebp+24]
SsePfpLeastSquares:
push ebp
mov ebp,esp
push ebx
; Load and validate arguments
xor eax,eax ;set error return code
mov ebx,x ;ebx = 'x'
test ebx,0fh
jnz .done ;jump if 'x' not aligned
mov edx,y ;edx ='y'
test edx,0fh
jnz .done ;jump if 'y' not aligned
mov ecx,n ;ecx = n
cmp ecx,2
jl .done ;jump if n < 2
; Initialize sum registers
cvtsi2sd xmm3,ecx ;xmm3 = DPFP n
mov eax,ecx
and ecx,0xfffffffe ;ecx = n / 2 * 2
and eax,1 ;eax = n % 2
xorpd xmm4,xmm4 ;sum_x (both qwords)
xorpd xmm5,xmm5 ;sum_y (both qwords)
xorpd xmm6,xmm6 ;sum_xx (both qwords)
xorpd xmm7,xmm7 ;sum_xy (both qwords)
; Calculate sum variables. Note that two values are processed each cycle.
.@1:
movapd xmm0,[ebx] ;load next two x values
movapd xmm1,[edx] ;load next two y values
movapd xmm2,xmm0 ;copy of x
addpd xmm4,xmm0 ;update sum_x
addpd xmm5,xmm1 ;update sum_y
mulpd xmm0,xmm0 ;calc x * x
addpd xmm6,xmm0 ;update sum_xx
mulpd xmm2,xmm1 ;calc x * y
addpd xmm7,xmm2 ;update sum_xy
add ebx,16 ;ebx = next x array value
add edx,16 ;edx = next x array value
sub ecx,2 ;adjust counter
jnz .@1 ;repeat until done
; Update sum variables with final x, y values if 'n' is odd
or eax,eax
jz .calcFinalSums ;jump if n is even
movsd xmm0,qword[ebx] ;load final x
movsd xmm1,qword[edx] ;load final y
movsd xmm2,xmm0
addsd xmm4,xmm0 ;update sum_x
addsd xmm5,xmm1 ;update sum_y
mulsd xmm0,xmm0 ;calc x * x
addsd xmm6,xmm0 ;update sum_xx
mulsd xmm2,xmm1 ;calc x * y
addsd xmm7,xmm2 ;update sum_xy
; Calculate final sum values
.calcFinalSums:
haddpd xmm4,xmm4 ;xmm4[63:0] = final sum_x
haddpd xmm5,xmm5 ;xmm5[63:0] = final sum_y
haddpd xmm6,xmm6 ;xmm6[63:0] = final sum_xx
haddpd xmm7,xmm7 ;xmm7[63:0] = final sum_xy
; Compute denom and make sure it's valid
; denom = n * sum_xx - sum_x * sum_x
movsd xmm0,xmm3 ;n
movsd xmm1,xmm4 ;sum_x
mulsd xmm0,xmm6 ;n * sum_xx
mulsd xmm1,xmm1 ;sum_x * sum_x
subsd xmm0,xmm1 ;xmm0 = denom
movsd xmm2,xmm0
andpd xmm2,[PackedFp64Abs] ;xmm2 = fabs(denom)
comisd xmm2,[LsEpsilon]
jb .badDenom ;jump if denom < fabs(denom)
; Compute and save slope
; slope = (n * sum_xy - sum_x * sum_y) / denom
movsd xmm1,xmm4 ;sum_x
mulsd xmm3,xmm7 ;n * sum_xy
mulsd xmm1,xmm5 ;sum_x * sum_y
subsd xmm3,xmm1 ;slope_numerator
divsd xmm3,xmm0 ;xmm3 = final slope
mov edx,m ;edx = 'm'
movsd qword[edx],xmm3 ;save slope
; Compute and save intercept
; intercept = (sum_xx * sum_y - sum_x * sum_xy) / denom
mulsd xmm6,xmm5 ;sum_xx * sum_y
mulsd xmm4,xmm7 ;sum_x * sum_xy
subsd xmm6,xmm4 ;intercept_numerator
divsd xmm6,xmm0 ;xmm6 = final intercept
mov edx,b ;edx = 'b'
movsd qword[edx],xmm6 ;save intercept
mov eax,1 ;success return code
.done:
pop ebx
pop ebp
ret
; Set 'm' and 'b' to 0.0
.badDenom:
xor eax,eax ;set error code
mov edx,m ;eax = 'm'
mov [edx],eax
mov [edx+4],eax ;*m = 0.0
mov edx,b ;edx = 'b'
mov [edx],eax
mov [edx+4],eax ;*b = 0.0
jmp .done
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o ssepackedfloatingpointleastsquares.o ssepackedfloatingpointleastsquares.asm
g++ -m32 -o ssepackedfloatingpointleastsquares ssepackedfloatingpointleastsquares.o main.o