main.cpp
#include <stdio.h>
#include <math.h>

extern "C" void AvxSfpQuadEqu(const double coef[3], double roots[2], double epsilon, int* dis);

void AvxSfpQuadEquCpp(const double coef[3], double roots[2], double epsilon, int* dis)
{
    double a = coef[0];
    double b = coef[1];
    double c = coef[2];
    double delta = b * b - 4.0 * a * c;
    double temp = 2.0 * a;

    if (fabs(a) < epsilon)
    {
        *dis = 9999;
        return;
    }

    if (fabs(delta) < epsilon)
    {
        roots[0] = -b / temp;
        roots[1] = -b / temp;
        *dis = 0;
    }
    else if (delta > 0)
    {
        roots[0] = (-b + sqrt(delta)) / temp;
        roots[1] = (-b - sqrt(delta)) / temp;
        *dis = 1;
    }
    else
    {
        // roots[0] contains real part, roots[1] contains imaginary part
        // complete answer is (r0, +r1), (r0, -r1)
        roots[0] = -b / temp;
        roots[1] = sqrt(-delta) / temp;
        *dis = -1;
    }
}

int main(int argc, char* argv[])
{
    const int n = 4;
    const double coef[n * 3] =
    {
        2.0, 8.0, -15.0,        // real roots (b * b > 4 * a * c)
        1.0, 6.0, 9.0,          // identical roots (b * b = 4 * a * c)
        3.0, 2.0, 4.0,          // complex roots (b * b < 4 * a * c)
        1.0e-13, 7.0, -5.0,     // invalid value for a
    };

    const double epsilon = 1.0e-12;

    printf("Results for AvxScalarFloatingPointQuadEqu\n");

    for (int i = 0; i < n * 3; i += 3)
    {
        double roots1[2], roots2[2];
        const double* coef2 = &coef[i];
        int dis1, dis2;

        AvxSfpQuadEquCpp(coef2, roots1, epsilon, &dis1);
        AvxSfpQuadEqu(coef2, roots2, epsilon, &dis2);

        printf("\na: %lf, b: %lf c: %lf\n", coef2[0], coef2[1], coef2[2]);

        if (dis1 != dis2)
        {
            printf("Discriminant compare error\b");
            printf("  dis1/dis2: %d/%d\n", dis1, dis2);
        }

        switch (dis1)
        {
            case 1:
                printf("Distinct real roots\n");
                printf("  C++ roots: %lf %lf\n", roots1[0], roots1[1]);
                printf("  AVX roots: %lf %lf\n", roots2[0], roots2[1]);
                break;

            case 0:
                printf("Identical roots\n");
                printf("  C++ root: %lf\n", roots1[0]);
                printf("  AVX root: %lf\n", roots2[0]);
                break;

            case -1:
                printf("Complex roots\n");
                printf("  C++ roots: (%lf %lf) ", roots1[0], roots1[1]);
                printf("(%lf %lf)\n", roots1[0], -roots1[1]);
                printf("  AVX roots: (%lf %lf) ", roots2[0], roots2[1]);
                printf("(%lf %lf)\n", roots2[0], -roots2[1]);
                break;

            case 9999:
                printf("Coefficient 'a' is invalid\n");
                break;

            default:
                printf("Invalid discriminant value: %d\n", dis1);
                return 1;
        }
    }

    return 0;
}
avxscalarfloatingpointquadequ.asm
; Name:     avxscalarfloatingpointquadequ.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o avxscalarfloatingpointquadequ.o avxscalarfloatingpointquadequ.asm
;           g++ -m32 -o avxscalarfloatingpointquadequ avxscalarfloatingpointquadequ.o main.o
;
; Source:   Modern x86 Assembly Language Programming p. 361

global AvxSfpQuadEqu

section .data
align 16

    FpNegateMask:    dq 8000000000000000h,0   ;mask to negate DPFP value
    FpAbsMask:       dq 7FFFFFFFFFFFFFFFh,-1  ;mask to compute fabs()
    r8_0p0:          dq 0.0
    r8_2p0:          dq 2.0
    r8_4p0:          dq 4.0

section .text


; extern "C" void AvxSfpQuadEqu(const double coef[3], double roots[2], double epsilon, int* dis);
;
; Description:  The following function calculates the roots of a
;               quadratic equation using the quadratic formula.
;
; Requires:     AVX

%define coef    [ebp+8]
%define roots   [ebp+12]
%define epsilon [ebp+16]
%define dis     [ebp+24]

AvxSfpQuadEqu:
    push    ebp
    mov     ebp,esp

; Load argument values
    mov     eax,coef                    ;eax = ptr to coeff array
    mov     ecx,roots                   ;ecx = ptr to roots array
    mov     edx,dis                     ;edx = ptr to dis
    vmovsd  xmm0,[eax]                  ;xmm0 = a
    vmovsd  xmm1,[eax+8]                ;xmm1 = b
    vmovsd  xmm2,[eax+16]               ;xmm2 = c
    vmovsd  xmm7,epsilon                ;xmm7 = epsilon

; Make sure coefficient a is valid
    vandpd  xmm6,xmm0,[FpAbsMask]       ;xmm2 = fabs(a)
    vcomisd xmm6,xmm7
    jb      .error                      ;jump if fabs(a) < epsilon

; Compute intermediate values
    vmulsd  xmm3,xmm1,xmm1              ;xmm3 = b * b
    vmulsd  xmm4,xmm0,[r8_4p0]          ;xmm4 = 4 * a
    vmulsd  xmm4,xmm4,xmm2              ;xmm4 = 4 * a * c
    vsubsd  xmm3,xmm3,xmm4              ;xmm3 = b * b - 4 * a * c
    vmulsd  xmm0,xmm0,[r8_2p0]          ;xmm0 = 2 * a
    vxorpd  xmm1,xmm1,[FpNegateMask]    ;xmm1 = -b

; Test delta to determine root type
    vandpd  xmm2,xmm3,[FpAbsMask]       ;xmm2 = fabs(delta)
    vcomisd xmm2,xmm7
    jb      .identicalRoots             ;jump if fabs(delta) < epsilon
    vcomisd xmm3,[r8_0p0]
    jb      .complexRoots               ;jump if delta < 0.0

; Distinct real roots
; r1 = (-b + sqrt(delta)) / 2 * a, r2 = (-b - sqrt(delta)) / 2 * a
    vsqrtsd xmm3,xmm3,xmm3              ;xmm3 = sqrt(delta)
    vaddsd  xmm4,xmm1,xmm3              ;xmm4 = -b + sqrt(delta)
    vsubsd  xmm5,xmm1,xmm3              ;xmm5 = -b - sqrt(delta)
    vdivsd  xmm4,xmm4,xmm0              ;xmm4 = final r1
    vdivsd  xmm5,xmm5,xmm0              ;xmm5 = final r2
    vmovsd  [ecx],xmm4                  ;save r1
    vmovsd  [ecx+8],xmm5                ;save r2
    mov     dword[edx],1                ;*dis = 1
    jmp     .done

; Identical roots
; r1 = r2 = -b / 2 * a
.identicalRoots:
    vdivsd  xmm4,xmm1,xmm0              ;xmm4 = -b / 2 * a
    vmovsd  [ecx],xmm4                  ;save r1
    vmovsd  [ecx+8],xmm4                ;save r2
    mov     dword[edx],0                ;*dis = 0
    jmp     .done

; Complex roots
;  real = -b / 2 * a, imag = sqrt(-delta) / 2 * a
;  final roots: r1 = (real, imag), r2 = (real, -imag)
.complexRoots:
    vdivsd  xmm4,xmm1,xmm0              ;xmm4 = -b / 2 * a
    vxorpd  xmm3,xmm3,[FpNegateMask]    ;xmm3 = -delta
    vsqrtsd xmm3,xmm3,xmm3              ;xmm3 = sqrt(-delta)
    vdivsd  xmm5,xmm3,xmm0              ;xmm5 = sqrt(-delta) / 2 * a
    vmovsd  [ecx],xmm4                  ;save real part
    vmovsd  [ecx+8],xmm5                ;save imaginary part
    mov     dword[edx],-1               ;*dis = -1

.done:
    pop     ebp
    ret

.error:
    mov     dword[edx],9999              ;*dis = 9999 (error code)
    pop     ebp
    ret
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o avxscalarfloatingpointquadequ.o avxscalarfloatingpointquadequ.asm
g++ -m32 -o avxscalarfloatingpointquadequ avxscalarfloatingpointquadequ.o main.o