main.cpp
#include <stdio.h>
#include <math.h>
extern "C" void AvxSfpQuadEqu(const double coef[3], double roots[2], double epsilon, int* dis);
void AvxSfpQuadEquCpp(const double coef[3], double roots[2], double epsilon, int* dis)
{
double a = coef[0];
double b = coef[1];
double c = coef[2];
double delta = b * b - 4.0 * a * c;
double temp = 2.0 * a;
if (fabs(a) < epsilon)
{
*dis = 9999;
return;
}
if (fabs(delta) < epsilon)
{
roots[0] = -b / temp;
roots[1] = -b / temp;
*dis = 0;
}
else if (delta > 0)
{
roots[0] = (-b + sqrt(delta)) / temp;
roots[1] = (-b - sqrt(delta)) / temp;
*dis = 1;
}
else
{
// roots[0] contains real part, roots[1] contains imaginary part
// complete answer is (r0, +r1), (r0, -r1)
roots[0] = -b / temp;
roots[1] = sqrt(-delta) / temp;
*dis = -1;
}
}
int main(int argc, char* argv[])
{
const int n = 4;
const double coef[n * 3] =
{
2.0, 8.0, -15.0, // real roots (b * b > 4 * a * c)
1.0, 6.0, 9.0, // identical roots (b * b = 4 * a * c)
3.0, 2.0, 4.0, // complex roots (b * b < 4 * a * c)
1.0e-13, 7.0, -5.0, // invalid value for a
};
const double epsilon = 1.0e-12;
printf("Results for AvxScalarFloatingPointQuadEqu\n");
for (int i = 0; i < n * 3; i += 3)
{
double roots1[2], roots2[2];
const double* coef2 = &coef[i];
int dis1, dis2;
AvxSfpQuadEquCpp(coef2, roots1, epsilon, &dis1);
AvxSfpQuadEqu(coef2, roots2, epsilon, &dis2);
printf("\na: %lf, b: %lf c: %lf\n", coef2[0], coef2[1], coef2[2]);
if (dis1 != dis2)
{
printf("Discriminant compare error\b");
printf(" dis1/dis2: %d/%d\n", dis1, dis2);
}
switch (dis1)
{
case 1:
printf("Distinct real roots\n");
printf(" C++ roots: %lf %lf\n", roots1[0], roots1[1]);
printf(" AVX roots: %lf %lf\n", roots2[0], roots2[1]);
break;
case 0:
printf("Identical roots\n");
printf(" C++ root: %lf\n", roots1[0]);
printf(" AVX root: %lf\n", roots2[0]);
break;
case -1:
printf("Complex roots\n");
printf(" C++ roots: (%lf %lf) ", roots1[0], roots1[1]);
printf("(%lf %lf)\n", roots1[0], -roots1[1]);
printf(" AVX roots: (%lf %lf) ", roots2[0], roots2[1]);
printf("(%lf %lf)\n", roots2[0], -roots2[1]);
break;
case 9999:
printf("Coefficient 'a' is invalid\n");
break;
default:
printf("Invalid discriminant value: %d\n", dis1);
return 1;
}
}
return 0;
}
avxscalarfloatingpointquadequ.asm
; Name: avxscalarfloatingpointquadequ.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o avxscalarfloatingpointquadequ.o avxscalarfloatingpointquadequ.asm
; g++ -m32 -o avxscalarfloatingpointquadequ avxscalarfloatingpointquadequ.o main.o
;
; Source: Modern x86 Assembly Language Programming p. 361
global AvxSfpQuadEqu
section .data
align 16
FpNegateMask: dq 8000000000000000h,0 ;mask to negate DPFP value
FpAbsMask: dq 7FFFFFFFFFFFFFFFh,-1 ;mask to compute fabs()
r8_0p0: dq 0.0
r8_2p0: dq 2.0
r8_4p0: dq 4.0
section .text
; extern "C" void AvxSfpQuadEqu(const double coef[3], double roots[2], double epsilon, int* dis);
;
; Description: The following function calculates the roots of a
; quadratic equation using the quadratic formula.
;
; Requires: AVX
%define coef [ebp+8]
%define roots [ebp+12]
%define epsilon [ebp+16]
%define dis [ebp+24]
AvxSfpQuadEqu:
push ebp
mov ebp,esp
; Load argument values
mov eax,coef ;eax = ptr to coeff array
mov ecx,roots ;ecx = ptr to roots array
mov edx,dis ;edx = ptr to dis
vmovsd xmm0,[eax] ;xmm0 = a
vmovsd xmm1,[eax+8] ;xmm1 = b
vmovsd xmm2,[eax+16] ;xmm2 = c
vmovsd xmm7,epsilon ;xmm7 = epsilon
; Make sure coefficient a is valid
vandpd xmm6,xmm0,[FpAbsMask] ;xmm2 = fabs(a)
vcomisd xmm6,xmm7
jb .error ;jump if fabs(a) < epsilon
; Compute intermediate values
vmulsd xmm3,xmm1,xmm1 ;xmm3 = b * b
vmulsd xmm4,xmm0,[r8_4p0] ;xmm4 = 4 * a
vmulsd xmm4,xmm4,xmm2 ;xmm4 = 4 * a * c
vsubsd xmm3,xmm3,xmm4 ;xmm3 = b * b - 4 * a * c
vmulsd xmm0,xmm0,[r8_2p0] ;xmm0 = 2 * a
vxorpd xmm1,xmm1,[FpNegateMask] ;xmm1 = -b
; Test delta to determine root type
vandpd xmm2,xmm3,[FpAbsMask] ;xmm2 = fabs(delta)
vcomisd xmm2,xmm7
jb .identicalRoots ;jump if fabs(delta) < epsilon
vcomisd xmm3,[r8_0p0]
jb .complexRoots ;jump if delta < 0.0
; Distinct real roots
; r1 = (-b + sqrt(delta)) / 2 * a, r2 = (-b - sqrt(delta)) / 2 * a
vsqrtsd xmm3,xmm3,xmm3 ;xmm3 = sqrt(delta)
vaddsd xmm4,xmm1,xmm3 ;xmm4 = -b + sqrt(delta)
vsubsd xmm5,xmm1,xmm3 ;xmm5 = -b - sqrt(delta)
vdivsd xmm4,xmm4,xmm0 ;xmm4 = final r1
vdivsd xmm5,xmm5,xmm0 ;xmm5 = final r2
vmovsd [ecx],xmm4 ;save r1
vmovsd [ecx+8],xmm5 ;save r2
mov dword[edx],1 ;*dis = 1
jmp .done
; Identical roots
; r1 = r2 = -b / 2 * a
.identicalRoots:
vdivsd xmm4,xmm1,xmm0 ;xmm4 = -b / 2 * a
vmovsd [ecx],xmm4 ;save r1
vmovsd [ecx+8],xmm4 ;save r2
mov dword[edx],0 ;*dis = 0
jmp .done
; Complex roots
; real = -b / 2 * a, imag = sqrt(-delta) / 2 * a
; final roots: r1 = (real, imag), r2 = (real, -imag)
.complexRoots:
vdivsd xmm4,xmm1,xmm0 ;xmm4 = -b / 2 * a
vxorpd xmm3,xmm3,[FpNegateMask] ;xmm3 = -delta
vsqrtsd xmm3,xmm3,xmm3 ;xmm3 = sqrt(-delta)
vdivsd xmm5,xmm3,xmm0 ;xmm5 = sqrt(-delta) / 2 * a
vmovsd [ecx],xmm4 ;save real part
vmovsd [ecx+8],xmm5 ;save imaginary part
mov dword[edx],-1 ;*dis = -1
.done:
pop ebp
ret
.error:
mov dword[edx],9999 ;*dis = 9999 (error code)
pop ebp
ret
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o avxscalarfloatingpointquadequ.o avxscalarfloatingpointquadequ.asm
g++ -m32 -o avxscalarfloatingpointquadequ avxscalarfloatingpointquadequ.o main.o