Ellipsoid Calculations
This section examines a sample program that calculates the volume and surface area of an ellipsoid using the scalar floating-point capabilities of x86-AVX. The example is converted to c++/nasm on Linux with Qt5 from the original source code from the book.
more on Ellipsoids : https://en.wikipedia.org/wiki/Ellipsoid
Formulas for Volume (V) and Surface (A) calculation:
The assembly code differs from the one in the book. The calculation of 4π, 4π/3 and p-1 are made before the loop instead of in the loop.
The PROLOGUE and EPILOGUE macros are used to assemble and compile correctly against the default -fPIC (position independent code) of g++. You can omit these by building the project by hand instead of the IDE. In the latter case you have to remove -fPIC from g++ on the commandline.
Download available on GITHUB
TEMPLATE = app
CONFIG += console c++11 CONFIG -= app_bundle CONFIG -= qt SOURCES += main.cpp LIBS += -lm QMAKE_EXTRA_COMPILERS += nasm NASMEXTRAFLAGS = -f elf64 -g -F dwarf OTHER_FILES += $$NASM_SOURCES nasm.output = ${QMAKE_FILE_BASE}.o nasm.commands = nasm $$NASMEXTRAFLAGS -o ${QMAKE_FILE_BASE}.o ${QMAKE_FILE_NAME} nasm.input = NASM_SOURCES NASM_SOURCES += avx64calcellipsoid_.asm
// build: // nasm -f elf64 -g -F dwarf -o avx64calcellipsoid_.o avx64calcellipsoid_.asm // g++ -c -pipe -O2 -std=gnu++11 -Wall -W -fPIC -I. -I/home/agguro/Programs/Qt/64bits/5.11.0/gcc_64/mkspecs/linux-g++ -o main.o main.cpp // g++ -Wl,-O1 -o Avx64CalcEllipsoid avx64calcellipsoid_.o main.o -lm #include <stdio.h> #include <math.h> extern "C" bool Avx64CalcEllipsoid_(const double* a, const double* b, const double* c, int n, double p, double* sa, double* vol); bool Avx64CalcEllipsoid(const double* a, const double* b, const double* c, int n, double p, double* sa, double* vol) { if (n <= 0) return false; for (int i = 0; i < n; i++) { double a_p = pow(a[i], p); double b_p = pow(b[i], p); double c_p = pow(c[i], p); double temp1 = (a_p * b_p + a_p * c_p + b_p * c_p) / 3; sa[i] = 4 * M_PI * pow(temp1, 1.0 / p); vol[i] = 4 * M_PI * a[i] * b[i] * c[i] / 3; } return true; } int main(void) { const int n = 8; const double p = 1.6075; const double a[n] = { 1, 2, 6, 3, 4, 5, 5, 2}; const double b[n] = { 1, 2, 1, 7, 2, 6, 5, 7}; const double c[n] = { 1, 2, 7, 4, 3, 11, 5, 9}; double sa1[n], vol1[n]; double sa2[n], vol2[n]; Avx64CalcEllipsoid(a, b, c, n, p, sa1, vol1); Avx64CalcEllipsoid_(a, b, c, n, p, sa2, vol2); printf("Results for Avx64CalcEllipsoid\n\n"); for (int i = 0; i < n; i++) { printf("\na, b, c: %6.2lf %6.2lf %6.2lf\n", a[i], b[i], c[i]); printf(" sa1, vol1: %14.8lf %14.8lf\n", sa1[i], vol1[i]); printf(" sa2, vol2: %14.8lf %14.8lf\n", sa2[i], vol2[i]); } return 0; }
; Name: avx64calcellipsoid_.asm ; ; Source: Modern x86 Assembly Language Programming p.590 bits 64 align 16 global Avx64CalcEllipsoid_ extern pow extern _GLOBAL_OFFSET_TABLE_ section .bss section .data r8_1p0 dq 1.0 r8_3p0 dq 3.0 r8_4p0 dq 4.0 r8_pi dq 3.14159265358979323846 section .text ;extern "C" bool Avx64CalcEllipsoid_(const double* a, const double* b, const double* c, int n, double p, double* sa, double* vol); ; ;Description: The following function calculates the surface area ; and volume of an ellipsoid ; ;Requires: x86-64, AVX ;registers: rdi :const double* a ; rsi :const double* b ; rdx :const double* c ; ecx :int n (isn't saved because of single use) ; xmm0 :double p ; r8 :double* sa (result) ; r9 :double* vol (result) %define STK_TOTAL 0x60 ;stack must be kept aligned to 16 bytes (0x10). ;12 local variables = 12 x 8 = 96 (or 0x60) bytes needed. We don't need padding because we have 6 non-volatile ;registers pushed on the stack. 6 x 8 = 48 (or 0x30) which is also aligned to 16 bytes. %macro PROLOGUE 0 push r12 push r13 push r14 push r15 push rbx push rbp mov rbp,rsp push rbx call .get_GOT .get_GOT: pop rbx add rbx,_GLOBAL_OFFSET_TABLE_+$$-.get_GOT wrt ..gotpc sub rsp,STK_TOTAL %endmacro %macro EPILOGUE 0 add rsp,STK_TOTAL mov rbx,[rbp-8] mov rsp,rbp pop rbp pop rbx pop r15 pop r14 pop r13 pop r12 ret %endmacro Avx64CalcEllipsoid_: PROLOGUE mov [rsp],rdi ;pointer to a[] mov [rsp+0x8],rsi ;pointer to b[] mov [rsp+0x10],rdx ;pointer to c[] movsd [rsp+0x18],xmm0 ;p mov [rsp+0x20],r8 ;pointer to surface[] mov [rsp+0x28],r9 ;pointer to volume[] ;first calculate constant values outside the loop ;calculate 4*pi vmovsd xmm0,qword[r8_4p0] vmulsd xmm0,xmm0,qword[r8_pi] vmovsd [rsp+0x30],xmm0 ;save 4*pi ;calculate 4*pi/3 vdivsd xmm0,xmm0,qword[r8_3p0] ;4*pi/3 in xmm2 for volume vmovsd [rsp+0x38],xmm0 ;save 4*pi/3 ;calculate 1/p vmovsd xmm0,qword[r8_1p0] ;1 in xmm0 vmovsd xmm1,[rsp+0x18] ;p in xmm1 vdivsd xmm0,xmm0,xmm1 ;1/p vmovsd [rsp+0x40],xmm0 ;save 1/p test ecx,ecx ;is n <= 0 jle .error mov r15d,ecx xor r11,r11 .repeat: ;*** Calculate the ellipsoid's volume *** ;vol = (4*pi*a*b*c)/3 ;xmm0 = 4*pi/3 vmovsd xmm0,[rsp+0x38] ;load 4*pi/3 mov r12,qword[rsp] ;a[] mov r13,qword[rsp+0x8] ;b[] mov r14,qword[rsp+0x10] ;c[] vmulsd xmm0,xmm0,[r12+r11] ;a*4*pi/3 vmulsd xmm0,xmm0,[r13+r11] ;a*b*4*pi/3 vmulsd xmm0,xmm0,[r14+r11] ;a*b*c*pi/3 mov r9,qword[rsp+0x28] ;vol[] vmovsd qword[r9+r11],xmm0 ;save ellipsoid volume mov r8,qword[rsp+0x20] ;vol[] vmovsd qword[r8+r11],xmm0 ;*** Calculate surface *** ;surface = 4*pi*(((a^p*b^p+a^p*c^p+b^p*c^p)/3)^(1/p)) movsd xmm0,[r12+r11] ;a vmovsd xmm1,qword[rsp+0x18] ;p call pow vmovsd [rsp+0x48],xmm0 ;a^p vmovsd xmm0,[r13+r11] ;b vmovsd xmm1,qword[rsp+0x18] ;p call pow vmovsd [rsp+0x50],xmm0 ;b^p vmovsd xmm0,[r14+r11] ;c vmovsd xmm1,qword[rsp+0x18] ;p call pow vmovsd [rsp+0x58],xmm0 ;c^p vmovsd xmm0,qword[rsp+0x48] ;a^p vmovsd xmm1,qword[rsp+0x50] ;b^p vmulsd xmm0,xmm0,xmm1 ;a^p*b^p vmovsd xmm1,qword[rsp+0x50] ;b^p vmovsd xmm2,qword[rsp+0x58] ;c^p vmulsd xmm1,xmm1,xmm2 ;b^p*c^p vmovsd xmm2,qword[rsp+0x48] ;a^p vmovsd xmm3,qword[rsp+0x58] ;c^p vmulsd xmm2,xmm2,xmm3 ;a^p*c^p vaddsd xmm0,xmm0,xmm1 ;a^p*b^p+b^p*c^p vaddsd xmm0,xmm0,xmm2 ;a^p*b^p+b^p*c^p+a^p*c^p vdivsd xmm0,xmm0,qword[r8_3p0] ;(a^p*b^p+b^p*c^p+a^p*c^p)/3 vmovsd xmm1,[rsp+0x40] ;1/p call pow ;((a^p*b^p+b^p*c^p+a^p*c^p)/3)^(1/p) vmovsd xmm1,[rsp+0x30] ;4*pi vmulsd xmm0,xmm0,xmm1 ;4*pi*(((a^p*b^p+b^p*c^p+a^p*c^p)/3)^(1/p)) mov r8,qword[rsp+0x20] ;sa[] vmovsd qword[r8+r11],xmm0 ;save add r11,8 sub r15d,1 jnz .repeat mov rax,1 jmp .done .error: xor rax,rax .done: EPILOGUE
If you should decide to build it from the commandline then you should use the next commands.
nasm -f elf64 -g -F dwarf -o avx64calcellipsoid_.o avx64calcellipsoid_.asm
g++ -c -pipe -O2 -std=gnu++11 -Wall -W -fPIC -I. -o main.o main.cpp
g++ -Wl,-O1 -o Avx64CalcEllipsoid avx64calcellipsoid_.o main.o -lm