Ellipsoid Calculations

This section examines a sample program that calculates the volume and surface area of an ellipsoid using the scalar floating-point capabilities of x86-AVX. The example is converted to c++/nasm on Linux with Qt5 from the original source code from the book.

more on Ellipsoids : https://en.wikipedia.org/wiki/Ellipsoid

Formulas for Volume (V) and Surface (A) calculation:

The assembly code differs from the one in the book.  The calculation of 4π, 4π/3 and p-1 are made before the loop instead of in the loop.

The PROLOGUE and EPILOGUE macros are used to assemble and compile correctly against the default -fPIC (position independent code) of g++.  You can omit these by building the project by hand instead of the IDE.  In the latter case you have to remove -fPIC from g++ on the commandline.

Download available on GITHUB

TEMPLATE = app

CONFIG += console c++11
CONFIG -= app_bundle
CONFIG -= qt

SOURCES += main.cpp
LIBS += -lm

QMAKE_EXTRA_COMPILERS += nasm
NASMEXTRAFLAGS = -f elf64 -g -F dwarf

OTHER_FILES += $$NASM_SOURCES
nasm.output = ${QMAKE_FILE_BASE}.o
nasm.commands = nasm $$NASMEXTRAFLAGS -o ${QMAKE_FILE_BASE}.o ${QMAKE_FILE_NAME}
nasm.input = NASM_SOURCES

NASM_SOURCES += avx64calcellipsoid_.asm

// build:
// nasm -f elf64 -g -F dwarf -o avx64calcellipsoid_.o avx64calcellipsoid_.asm
// g++ -c -pipe -O2 -std=gnu++11 -Wall -W -fPIC  -I. -I/home/agguro/Programs/Qt/64bits/5.11.0/gcc_64/mkspecs/linux-g++ -o main.o main.cpp
// g++ -Wl,-O1 -o Avx64CalcEllipsoid avx64calcellipsoid_.o main.o   -lm

#include <stdio.h>
#include <math.h>

extern "C" bool Avx64CalcEllipsoid_(const double* a, const double* b, const double* c, int n, double p, double* sa, double* vol);

bool Avx64CalcEllipsoid(const double* a, const double* b, const double* c, int n, double p, double* sa, double* vol)
{
    if (n <= 0)
        return false;

    for (int i = 0; i < n; i++)
    {
        double a_p = pow(a[i], p);
        double b_p = pow(b[i], p);
        double c_p = pow(c[i], p);

        double temp1 = (a_p * b_p + a_p * c_p + b_p * c_p) / 3;

        sa[i] = 4 * M_PI * pow(temp1, 1.0 / p);
        vol[i] = 4 * M_PI * a[i] * b[i] * c[i] / 3;
    }
    return true;
}

int main(void)
{
    const int n = 8;
    const  double p = 1.6075;
    const double a[n] = { 1, 2, 6, 3, 4,  5, 5, 2};
    const double b[n] = { 1, 2, 1, 7, 2,  6, 5, 7};
    const double c[n] = { 1, 2, 7, 4, 3, 11, 5, 9};
    double sa1[n], vol1[n];
    double sa2[n], vol2[n];

    Avx64CalcEllipsoid(a, b, c, n, p, sa1, vol1);
    Avx64CalcEllipsoid_(a, b, c, n, p, sa2, vol2);

    printf("Results for Avx64CalcEllipsoid\n\n");

    for (int i = 0; i < n; i++)
    {
        printf("\na, b, c: %6.2lf %6.2lf %6.2lf\n", a[i], b[i], c[i]);
        printf("  sa1, vol1: %14.8lf %14.8lf\n", sa1[i], vol1[i]);
        printf("  sa2, vol2: %14.8lf %14.8lf\n", sa2[i], vol2[i]);
    }

    return 0;
}
; Name:         avx64calcellipsoid_.asm
;
; Source:       Modern x86 Assembly Language Programming p.590

bits 64
align 16

global Avx64CalcEllipsoid_
    
extern pow
extern  _GLOBAL_OFFSET_TABLE_
    
section .bss

section .data
    r8_1p0   dq 1.0
    r8_3p0   dq 3.0
    r8_4p0   dq 4.0
    r8_pi    dq 3.14159265358979323846

section .text

;extern "C" bool Avx64CalcEllipsoid_(const double* a, const double* b, const double* c, int n, double p, double* sa, double* vol);
;
;Description:  The following function calculates the surface area
;              and volume of an ellipsoid
;
;Requires:     x86-64, AVX
;registers: rdi  :const double* a
;           rsi  :const double* b
;           rdx  :const double* c
;	    ecx  :int n (isn't saved because of single use)
;	    xmm0 :double p
;	    r8   :double* sa  (result)
;	    r9   :double* vol (result)

%define STK_TOTAL 0x60
;stack must be kept aligned to 16 bytes (0x10).
;12 local variables = 12 x 8 = 96 (or 0x60) bytes needed. We don't need padding because we have 6 non-volatile
;registers pushed on the stack. 6 x 8 = 48 (or 0x30) which is also aligned to 16 bytes.

%macro PROLOGUE 0
        push	r12
        push	r13
        push	r14
        push	r15
        push	rbx
        push    rbp 
        mov     rbp,rsp 
        push    rbx 
        call    .get_GOT 
    .get_GOT: 
        pop     rbx 
        add     rbx,_GLOBAL_OFFSET_TABLE_+$$-.get_GOT wrt ..gotpc
        sub	rsp,STK_TOTAL
%endmacro

%macro EPILOGUE 0
        add	rsp,STK_TOTAL
        mov     rbx,[rbp-8] 
        mov     rsp,rbp 
        pop     rbp
        pop	rbx
        pop	r15
        pop	r14
        pop	r13
        pop	r12
        ret
%endmacro

Avx64CalcEllipsoid_:
    PROLOGUE
    mov	    [rsp],rdi		    ;pointer to a[]
    mov	    [rsp+0x8],rsi	    ;pointer to b[]
    mov	    [rsp+0x10],rdx	    ;pointer to c[]
    movsd   [rsp+0x18],xmm0	    ;p
    mov	    [rsp+0x20],r8	    ;pointer to surface[]
    mov	    [rsp+0x28],r9	    ;pointer to volume[]

    ;first calculate constant values outside the loop
    ;calculate 4*pi
    vmovsd  xmm0,qword[r8_4p0]
    vmulsd  xmm0,xmm0,qword[r8_pi]
    vmovsd  [rsp+0x30],xmm0		;save 4*pi
    ;calculate 4*pi/3
    vdivsd  xmm0,xmm0,qword[r8_3p0]	;4*pi/3 in xmm2 for volume
    vmovsd  [rsp+0x38],xmm0		;save 4*pi/3
    ;calculate 1/p
    vmovsd  xmm0,qword[r8_1p0]		;1 in xmm0
    vmovsd  xmm1,[rsp+0x18]		;p in xmm1
    vdivsd  xmm0,xmm0,xmm1		;1/p
    vmovsd  [rsp+0x40],xmm0		;save 1/p

    test    ecx,ecx		    ;is n <= 0
    jle	    .error
    mov	    r15d,ecx
    xor	    r11,r11
.repeat:
    ;*** Calculate the ellipsoid's volume ***
    ;vol = (4*pi*a*b*c)/3
    ;xmm0 = 4*pi/3
    vmovsd  xmm0,[rsp+0x38]		;load 4*pi/3
    mov	    r12,qword[rsp]		;a[]
    mov	    r13,qword[rsp+0x8]		;b[]
    mov	    r14,qword[rsp+0x10]		;c[]
    vmulsd  xmm0,xmm0,[r12+r11]		;a*4*pi/3
    vmulsd  xmm0,xmm0,[r13+r11]		;a*b*4*pi/3
    vmulsd  xmm0,xmm0,[r14+r11]		;a*b*c*pi/3
    mov	    r9,qword[rsp+0x28]		;vol[]
    vmovsd  qword[r9+r11],xmm0		;save ellipsoid volume
    mov	    r8,qword[rsp+0x20]		;vol[]
    vmovsd  qword[r8+r11],xmm0
    
    ;*** Calculate surface  ***
    ;surface = 4*pi*(((a^p*b^p+a^p*c^p+b^p*c^p)/3)^(1/p))
    movsd   xmm0,[r12+r11]		;a
    vmovsd  xmm1,qword[rsp+0x18]	;p
    call    pow
    vmovsd  [rsp+0x48],xmm0		;a^p
    
    vmovsd  xmm0,[r13+r11]		;b
    vmovsd  xmm1,qword[rsp+0x18]	;p
    call    pow
    vmovsd  [rsp+0x50],xmm0		;b^p
    
    vmovsd  xmm0,[r14+r11]		;c
    vmovsd  xmm1,qword[rsp+0x18]	;p
    call    pow
    vmovsd  [rsp+0x58],xmm0		;c^p
    
    vmovsd  xmm0,qword[rsp+0x48]	;a^p
    vmovsd  xmm1,qword[rsp+0x50]	;b^p
    vmulsd  xmm0,xmm0,xmm1		;a^p*b^p
    vmovsd  xmm1,qword[rsp+0x50]	;b^p
    vmovsd  xmm2,qword[rsp+0x58]	;c^p
    vmulsd  xmm1,xmm1,xmm2		;b^p*c^p
    vmovsd  xmm2,qword[rsp+0x48]	;a^p
    vmovsd  xmm3,qword[rsp+0x58]	;c^p
    vmulsd  xmm2,xmm2,xmm3		;a^p*c^p
    vaddsd  xmm0,xmm0,xmm1		;a^p*b^p+b^p*c^p
    vaddsd  xmm0,xmm0,xmm2		;a^p*b^p+b^p*c^p+a^p*c^p    
    vdivsd  xmm0,xmm0,qword[r8_3p0]	;(a^p*b^p+b^p*c^p+a^p*c^p)/3
    
    vmovsd  xmm1,[rsp+0x40]		;1/p
    call    pow				;((a^p*b^p+b^p*c^p+a^p*c^p)/3)^(1/p)
    
    vmovsd  xmm1,[rsp+0x30]		;4*pi
    vmulsd  xmm0,xmm0,xmm1		;4*pi*(((a^p*b^p+b^p*c^p+a^p*c^p)/3)^(1/p))
    mov	    r8,qword[rsp+0x20]		;sa[]
    vmovsd  qword[r8+r11],xmm0		;save 
    
    add	    r11,8
    sub	    r15d,1
    jnz	    .repeat
    mov	    rax,1
    jmp	    .done
.error:
    xor	    rax,rax
.done:    
    EPILOGUE
If you should decide to build it from the commandline then you should use the next commands.

nasm -f elf64 -g -F dwarf -o avx64calcellipsoid_.o avx64calcellipsoid_.asm
g++ -c -pipe -O2 -std=gnu++11 -Wall -W -fPIC -I. -o main.o main.cpp
g++ -Wl,-O1 -o Avx64CalcEllipsoid avx64calcellipsoid_.o main.o -lm