main.cpp
#include <stdio.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include <stddef.h>

// Comment out the line below to disable display of PDATA layout information
#define DISPLAY_PDATA_INFO

// This structure must agree with the PDATA structure that's defined
// in file ssescalarfloatingpointparallelograms.asm.
typedef struct
{
    double A;           // Length of left and right
    double B;           // Length of top and bottom
    double Alpha;       // Angle alpha in degrees
    double Beta;        // Angle beta in degrees
    double H;           // Height of parallelogram
    double Area;        // Parallelogram area
    double P;           // Length of diagonal P
    double Q;           // Length of diagonal Q
    bool BadValue;      // Set to true if A, B, or Alpha is invalid
    char Pad[7];        // Reserved for future use
} PDATA;

// Implemented in assembly; fills in Beta, H, Area, P, Q and BadValue
// for each of the n array elements.
extern "C" bool SseSfpParallelograms(PDATA* pdata, int n);

// Degrees-to-radians factor, read by the assembly routine. Defined inside
// a linkage block (rather than with an initialized 'extern') so that it is
// an ordinary definition with C linkage and does not trigger a warning.
extern "C"
{
    double DegToRad = M_PI / 180.0;
}

// Size of the assembly-side PDATA structure, defined in the assembly file.
extern "C" int SizeofPdataX86;

const bool PrintPdataInfo = true;

// Stores the three input values of a parallelogram. The remaining members
// are left for SseSfpParallelograms to fill in.
void SetPdata(PDATA* pdata, double a, double b, double alpha)
{
    pdata->A = a;
    pdata->B = b;
    pdata->Alpha = alpha;
}

// Builds a set of test parallelograms (including deliberately invalid
// ones), runs the assembly routine over them and prints the results.
int main(int argc, char* argv[])
{
#ifdef DISPLAY_PDATA_INFO
    // Cross-check the C++ and assembly views of the structure layout.
    // All of these values are size_t, so they are printed with %zu.
    size_t spd1 = sizeof(PDATA);
    size_t spd2 = SizeofPdataX86;

    if (spd1 != spd2)
        printf("PDATA size discrepancy [%zu, %zu]", spd1, spd2);
    else
    {
        printf("sizeof(PDATA): %zu\n", spd1);
        printf("Offset of A: %zu\n", offsetof(PDATA, A));
        printf("Offset of B: %zu\n", offsetof(PDATA, B));
        printf("Offset of Alpha: %zu\n", offsetof(PDATA, Alpha));
        printf("Offset of Beta: %zu\n", offsetof(PDATA, Beta));
        printf("Offset of H %zu\n", offsetof(PDATA, H));
        printf("Offset of Area: %zu\n", offsetof(PDATA, Area));
        printf("Offset of P: %zu\n", offsetof(PDATA, P));
        printf("Offset of Q: %zu\n", offsetof(PDATA, Q));
        printf("Offset of BadValue %zu\n", offsetof(PDATA, BadValue));
        printf("Offset of Pad %zu\n", offsetof(PDATA, Pad));
    }
#endif

    const int n = 10;
    PDATA pdata[n];

    // Create some test parallelograms
    SetPdata(&pdata[0], -1.0, 1.0, 60.0);       // invalid: A <= 0
    SetPdata(&pdata[1], 1.0, -1.0, 60.0);       // invalid: B <= 0
    SetPdata(&pdata[2], 1.0, 1.0, 181.0);       // invalid: Alpha >= 180
    SetPdata(&pdata[3], 1.0, 1.0, 90.0);
    SetPdata(&pdata[4], 3.0, 4.0, 90.0);
    SetPdata(&pdata[5], 2.0, 3.0, 30.0);
    SetPdata(&pdata[6], 3.0, 2.0, 60.0);
    SetPdata(&pdata[7], 4.0, 2.5, 120.0);
    SetPdata(&pdata[8], 5.0, 7.125, 135.0);
    SetPdata(&pdata[9], 8.0, 8.0, 165.0);

    SseSfpParallelograms(pdata, n);

    for (int i = 0; i < n; i++)
    {
        PDATA* p = &pdata[i];
        printf("\npdata[%d] - BadValue = %d\n", i, p->BadValue);
        printf(" A: %12.6lf B: %12.6lf\n", p->A, p->B);
        printf(" Alpha: %12.6lf Beta: %12.6lf\n", p->Alpha, p->Beta);
        printf(" H: %12.6lf Area: %12.6lf\n", p->H, p->Area);
        printf(" P: %12.6lf Q: %12.6lf\n", p->P, p->Q);
    }

    return 0;
}
ssescalarfloatingpointparallelograms.asm
; Name:     ssescalarfloatingpointparallelograms.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o -std=c++11
;           nasm -f elf32 -o ssescalarfloatingpointparallelograms.o ssescalarfloatingpointparallelograms.asm
;           g++ -m32 -o ssescalarfloatingpointparallelograms ssescalarfloatingpointparallelograms.o main.o
;
; Source:   Modern x86 Assembly Language Programming p. 228
;
; Remark:   4 lines are added to this source code. Calling the sin and cos
;           functions from the math library changes the content of ecx.
;           We need to save ecx somewhere before calling sin or cos and
;           restore ecx when sin or cos returns with the result.

global SizeofPdataX86
global SseSfpParallelograms

extern sin
extern cos
extern DegToRad

; This structure must agree with the PDATA structure that's defined
; in file main.cpp.

struc PDATA
    .A:         resq 1              ;length of left and right
    .B:         resq 1              ;length of top and bottom
    .Alpha:     resq 1              ;angle alpha in degrees
    .Beta:      resq 1              ;angle beta in degrees
    .H:         resq 1              ;height
    .Area:      resq 1              ;area
    .P:         resq 1              ;length of diagonal P
    .Q:         resq 1              ;length of diagonal Q
    .BadVal:    resb 1              ;non-zero if A, B or Alpha is invalid
    .Pad:       resb 7              ;padding
    .size:      equ $-PDATA
endstruc

section .data

; Constant values used by function
r8_2p0          dq 2.0
r8_180p0        dq 180.0
r8_MinusOne     dq -1.0

; Structure size exported to the C++ side, which declares this symbol as
; a 32-bit 'int', so it is emitted as a doubleword.
SizeofPdataX86  dd PDATA.size

section .text
; extern "C" bool SseSfpParallelograms(PDATA* pdata, int n);
;
; Description:  The following function calculates area and length
;               values for parallelograms. For every element of the
;               array it validates A, B and Alpha, then fills in Beta,
;               H, Area, P, Q and BadVal. Elements that fail validation
;               get -1.0 in all computed members and BadVal = 1.
;
; Returns:      0   n <= 0
;               1   n > 0
;
; Local stack:  [ebp-8]     x87 FPU transfer location
;               [ebp-16]    Alpha in radians
;               [ebp-20]    saved ebx
;               [esp]       argument slot for the sin/cos calls
;
; Requires SSE2

SseSfpParallelograms:

%define pdata   [ebp+8]                     ; pointer
%define n       dword[ebp+12]               ; value

        push ebp
        mov ebp,esp
        sub esp,16                          ;allocate space for local vars
        push ebx                            ;ebx is callee-saved

; Load arguments and validate n
        xor eax,eax                         ;set error code
        mov ebx,pdata                       ;ebx = pdata
        mov ecx,n                           ;ecx = n
        test ecx,ecx
        jle .done                           ;jump if n <= 0

; Reserve the sin/cos argument area once, before the loop, so that esp
; is the same in every iteration and 'pop ebx' at .done finds the saved
; value. 20 bytes (8 for the double argument plus 12 of padding) leave
; esp 16-byte aligned at each call instruction: esp is 12 mod 16 on
; entry, and 4 mod 16 after the pushes and the 16 bytes of locals.
        sub esp,20

; Initialize constant values. They are reloaded on every pass because
; xmm6 is overwritten with Beta below and the xmm registers are not
; preserved across the sin/cos calls.
.loop1: movsd xmm6,qword[r8_180p0]          ;xmm6 = 180.0
        xorpd xmm7,xmm7                     ;xmm7 = 0.0

; Load and validate A and B (jp catches unordered compares, i.e. NaN)
        movsd xmm0,qword[ebx+PDATA.A]       ;xmm0 = A
        movsd xmm1,qword[ebx+PDATA.B]       ;xmm1 = B
        comisd xmm0,xmm7
        jp .invalidValue
        jbe .invalidValue                   ;jump if A <= 0.0
        comisd xmm1,xmm7
        jp .invalidValue
        jbe .invalidValue                   ;jump if B <= 0.0

; Load and validate Alpha
        movsd xmm2,qword[ebx+PDATA.Alpha]
        comisd xmm2,xmm7
        jp .invalidValue
        jbe .invalidValue                   ;jump if Alpha <= 0.0
        comisd xmm2,xmm6
        jae .invalidValue                   ;jump if Alpha >= 180.0

; Compute Beta
        subsd xmm6,xmm2                     ;Beta = 180.0 - Alpha
        movsd qword[ebx+PDATA.Beta],xmm6    ;save Beta

; Compute sin(Alpha)
        mulsd xmm2,[DegToRad]               ;convert Alpha to radians
        movsd qword[ebp-16],xmm2            ;save value for later
        movsd qword[esp],xmm2               ;copy Alpha onto stack
        mov n,ecx                           ;park loop counter in the n slot
        call sin
        mov ecx,n                           ;restore loop counter
        fstp qword[ebp-8]                   ;save sin(Alpha)

; Compute parallelogram Height and Area
        movsd xmm0,qword[ebx+PDATA.A]       ;A
        mulsd xmm0,[ebp-8]                  ;A * sin(Alpha)
        movsd qword[ebx+PDATA.H],xmm0       ;save height
        mulsd xmm0,[ebx+PDATA.B]            ;A * sin(Alpha) * B
        movsd qword[ebx+PDATA.Area],xmm0    ;save area

; Compute cos(Alpha)
        movsd xmm0,qword[ebp-16]            ;xmm0 = Alpha in radians
        movsd [esp],xmm0                    ;copy Alpha onto stack
        mov n,ecx                           ;park loop counter in the n slot
        call cos
        mov ecx,n                           ;restore loop counter
        fstp qword[ebp-8]                   ;save cos(Alpha)

; Compute 2.0 * A * B * cos(Alpha)
        movsd xmm0,qword[r8_2p0]
        movsd xmm1,qword[ebx+PDATA.A]
        movsd xmm2,qword[ebx+PDATA.B]
        mulsd xmm0,xmm1                     ;2 * A
        mulsd xmm0,xmm2                     ;2 * A * B
        mulsd xmm0,[ebp-8]                  ;2 * A * B * cos(Alpha)

; Compute A * A + B * B
        movsd xmm3,xmm1
        movsd xmm4,xmm2
        mulsd xmm3,xmm3                     ;A * A
        mulsd xmm4,xmm4                     ;B * B
        addsd xmm3,xmm4                     ;A * A + B * B
        movsd xmm4,xmm3                     ;A * A + B * B

; Compute P and Q
        subsd xmm3,xmm0
        sqrtsd xmm3,xmm3                    ;xmm3 = P
        movsd qword[ebx+PDATA.P],xmm3
        addsd xmm4,xmm0
        sqrtsd xmm4,xmm4                    ;xmm4 = Q
        movsd qword[ebx+PDATA.Q],xmm4
        mov byte[ebx+PDATA.BadVal],0        ;set BadVal to false

.nextItem:
        add ebx,PDATA.size                  ;ebx = next element in array
        dec ecx
        jnz .loop1                          ;repeat loop until done

        add esp,20                          ;release sin/cos argument area
        mov eax,1                           ;n > 0: set success return code

.done:  pop ebx
        mov esp,ebp
        pop ebp
        ret

; Set structure members to known values for display purposes
.invalidValue:
        movsd xmm0,[r8_MinusOne]
        movsd qword[ebx+PDATA.Beta],xmm0
        movsd qword[ebx+PDATA.H],xmm0
        movsd qword[ebx+PDATA.Area],xmm0
        movsd qword[ebx+PDATA.P],xmm0
        movsd qword[ebx+PDATA.Q],xmm0
        mov byte[ebx+PDATA.BadVal],1
        jmp .nextItem
build
# Compile the C++ driver as 32-bit code
g++ -c -m32 main.cpp -o main.o -std=c++11
# Assemble the SSE routine as a 32-bit ELF object
nasm -f elf32 -o ssescalarfloatingpointparallelograms.o ssescalarfloatingpointparallelograms.asm
# Link both objects into the executable
g++ -m32 -o ssescalarfloatingpointparallelograms ssescalarfloatingpointparallelograms.o main.o