main.cpp
#include <stdio.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include <stddef.h>

// Comment out the line below to disable display of PDATA information
#define DISPLAY_PDATA_INFO

// Input/output record for one parallelogram.
// This structure must agree field-for-field with the PDATA struc that's
// defined in file ssescalarfloatingpointparallelograms.asm, because the
// assembly code addresses the members by fixed byte offsets.
// A, B and Alpha are inputs (see SetPdata); the remaining members are
// written by SseSfpParallelograms.
typedef struct
{
    double A;               // Input: length of left and right sides
    double B;               // Input: length of top and bottom sides
    double Alpha;           // Input: angle alpha in degrees
    double Beta;            // Output: angle beta in degrees (180 - Alpha)
    double H;               // Output: height of parallelogram
    double Area;            // Output: parallelogram area
    double P;               // Output: length of diagonal P
    double Q;               // Output: length of diagonal Q
    bool BadValue;          // Output: set to true if A, B, or Alpha is invalid
    char Pad[7];            // Reserved; mirrors the 7 .Pad bytes of the assembly struc
} PDATA;

// Symbols shared with the assembly module. The extern "C" block disables
// C++ name mangling so the linker sees the same names the .asm file uses.
extern "C"
{
    // Implemented in assembly: fills in the output members of each of the
    // n elements of pdata.
    bool SseSfpParallelograms(PDATA* pdata, int n);

    // Degrees-to-radians conversion factor. Defined here (no 'extern'
    // keyword, so the initializer does not trigger a compiler warning)
    // and referenced by the assembly code.
    double DegToRad = M_PI / 180.0;

    // Size of the assembly-side PDATA struc; defined in the assembly
    // module, so it is only declared here.
    extern int SizeofPdataX86;
}

// Not referenced by the code in this file.
const bool PrintPdataInfo = true;

// Stores the three input values of a parallelogram into *pdata.
//   pdata - record to initialize (must not be null)
//   a     - length of the left and right sides
//   b     - length of the top and bottom sides
//   alpha - angle alpha in degrees
// The remaining members of *pdata are not touched.
void SetPdata(PDATA* pdata, double a, double b, double alpha)
{
    PDATA& record = *pdata;

    record.Alpha = alpha;
    record.B = b;
    record.A = a;
}

// Test driver: optionally prints the PDATA layout, builds ten test
// parallelograms (the first three have an invalid A, B or Alpha), runs
// the assembly routine over them and prints every resulting record.
// Always returns 0.
int main(int argc, char* argv[])
{

#ifdef DISPLAY_PDATA_INFO
    // Compare the compiler's idea of the structure size with the size
    // exported by the assembly module.
    size_t spd1 = sizeof(PDATA);
    size_t spd2 = SizeofPdataX86;

    // size_t values (including offsetof results) are printed with %zu;
    // %d expects an int and is a format/argument type mismatch.
    if (spd1 != spd2)
        printf("PDATA size discrepancy [%zu, %zu]\n", spd1, spd2);
    else
    {
        printf("sizeof(PDATA):      %zu\n", spd1);
        printf("Offset of A:        %zu\n", offsetof(PDATA, A));
        printf("Offset of B:        %zu\n", offsetof(PDATA, B));
        printf("Offset of Alpha:    %zu\n", offsetof(PDATA, Alpha));
        printf("Offset of Beta:     %zu\n", offsetof(PDATA, Beta));
        printf("Offset of H         %zu\n", offsetof(PDATA, H));
        printf("Offset of Area:     %zu\n", offsetof(PDATA, Area));
        printf("Offset of P:        %zu\n", offsetof(PDATA, P));
        printf("Offset of Q:        %zu\n", offsetof(PDATA, Q));
        printf("Offset of BadValue  %zu\n", offsetof(PDATA, BadValue));
        printf("Offset of Pad       %zu\n", offsetof(PDATA, Pad));
    }
#endif

    const int n = 10;
    PDATA pdata[n];

    // Create some test parallelograms.
    // Elements 0-2 are deliberately invalid (negative A, negative B,
    // Alpha above 180 degrees) to exercise the BadValue path.
    SetPdata(&pdata[0], -1.0, 1.0, 60.0);
    SetPdata(&pdata[1], 1.0, -1.0, 60.0);
    SetPdata(&pdata[2], 1.0, 1.0, 181.0);
    SetPdata(&pdata[3], 1.0, 1.0, 90.0);
    SetPdata(&pdata[4], 3.0, 4.0, 90.0);
    SetPdata(&pdata[5], 2.0, 3.0, 30.0);
    SetPdata(&pdata[6], 3.0, 2.0, 60.0);
    SetPdata(&pdata[7], 4.0, 2.5, 120.0);
    SetPdata(&pdata[8], 5.0, 7.125, 135.0);
    SetPdata(&pdata[9], 8.0, 8.0, 165.0);

    // Fills in Beta, H, Area, P, Q and BadValue for every element.
    SseSfpParallelograms(pdata, n);

    for (int i = 0; i < n; i++)
    {
        PDATA* p = &pdata[i];
        printf("\npdata[%d] - BadValue = %d\n", i, p->BadValue);
        printf("  A:      %12.6lf  B:    %12.6lf\n", p->A, p->B);
        printf("  Alpha:  %12.6lf  Beta: %12.6lf\n", p->Alpha, p->Beta);
        printf("  H:      %12.6lf  Area: %12.6lf\n", p->H, p->Area);
        printf("  P:      %12.6lf  Q:    %12.6lf\n", p->P, p->Q);
    }

    return 0;
}
ssescalarfloatingpointparallelograms.asm
; Name:		ssescalarfloatingpointparallelograms.asm
;
; Build:	g++ -c -m32 main.cpp -o main.o -std=c++11
;		nasm -f elf32 -o ssescalarfloatingpointparallelograms.o ssescalarfloatingpointparallelograms.asm
;		g++ -m32 -o ssescalarfloatingpointparallelograms ssescalarfloatingpointparallelograms.o main.o
;
; Source:	Modern x86 Assembly Language Programming p. 228
;
; Remark:	4 lines are added to this source code. Calling the sin and cos
;		functions from the math library changes the content of ecx.
;		We need to save ecx somewhere before calling sin or cos and restore
;		ecx back when sin or cos returns with the result.

global	SizeofPdataX86
global	SseSfpParallelograms

extern sin
extern cos
extern DegToRad

; Layout of one parallelogram record.
; This structure must agree with the PDATA structure that's defined
; in file main.cpp, because the code below addresses the members as
; [ebx+PDATA.<member>]. Each resq reserves one 8-byte quadword and each
; resb one byte, which gives the offsets noted on the right.
struc PDATA
	.A:       resq 1	; offset  0: length of left and right sides (input)
	.B:       resq 1	; offset  8: length of top and bottom sides (input)
	.Alpha:   resq 1	; offset 16: angle alpha in degrees (input)
	.Beta:    resq 1	; offset 24: angle beta in degrees (output)
	.H:       resq 1	; offset 32: height (output)
	.Area:    resq 1	; offset 40: area (output)
	.P:       resq 1	; offset 48: length of diagonal P (output)
	.Q:       resq 1	; offset 56: length of diagonal Q (output)
	.BadVal:  resb 1	; offset 64: 1 if A, B or Alpha is invalid, else 0 (output)
	.Pad:     resb 7	; offset 65: reserved bytes
	.size:	equ $-PDATA	; bytes reserved by the fields above (8*8 + 1 + 7 = 72)
endstruc

section .data

; Floating-point constants used by SseSfpParallelograms
	r8_2p0         dq 2.0
	r8_180p0       dq 180.0
	r8_MinusOne    dq -1.0

; Size of the PDATA struc, exported so the C++ side can compare it with
; sizeof(PDATA). main.cpp declares this symbol as a 32-bit int, so it is
; emitted as a doubleword (dd) rather than a quadword.
	SizeofPdataX86 dd PDATA.size

section .text
; extern "C" bool SseSfpParallelograms(PDATA* pdata, int n);
;
; Description:  Calculates Beta, height, area and the lengths of both
;               diagonals for each of the n parallelograms in the array
;               pdata. An element whose A, B or Alpha fails validation
;               (A <= 0, B <= 0, Alpha <= 0, Alpha >= 180, or NaN) gets
;               -1.0 in every output member and BadVal = 1.
;
; Returns:      0 = n <= 0
;               1 = n > 0
;
; Local stack:  [ebp-8]   x87 FPU transfer location
;               [ebp-16]  Alpha in radians
;               [esp]     argument slot for sin/cos (while in the loop)
;
; Notes:        ECX and all XMM registers may be changed by sin/cos, so
;               ECX is parked in the caller-provided slot of argument n
;               around each call and every XMM value is reloaded from
;               memory afterwards.
;
; Requires SSE2

SseSfpParallelograms:
%define pdata [ebp+8]                       ; pointer
%define n dword[ebp+12]                     ; value

	push ebp
	mov ebp,esp
	sub esp,16                          ;allocate space for local vars
	push ebx

; Load arguments and validate n
	xor eax,eax                         ;set error code
	mov ebx,pdata                       ;ebx = pdata
	mov ecx,n                           ;ecx = n
	test ecx,ecx
	jle .done                           ;jump if n <= 0

; Reserve the sin/cos argument slot ONCE, before the loop, so that ESP is
; the same on every iteration and the saved EBX is on top of the stack
; again after the matching 'add esp,20' below. 20 bytes (instead of 8)
; also makes ESP a multiple of 16 at each call: return address, EBP,
; 16 bytes of locals and EBX account for 28 bytes, and 28 + 20 = 48.
	sub esp,20

; Initialize constant values (reloaded on every iteration because the
; previous iteration's sin/cos calls may have changed xmm6/xmm7)
.loop1:
	movsd xmm6,qword[r8_180p0]          ;xmm6 = 180.0
	xorpd xmm7,xmm7                     ;xmm7 = 0.0

; Load and validate A and B (jp catches unordered compares, i.e. NaN)
	movsd xmm0,qword[ebx+PDATA.A]       ;xmm0 = A
	movsd xmm1,qword[ebx+PDATA.B]       ;xmm1 = B
	comisd xmm0,xmm7
	jp .invalidValue
	jbe .invalidValue                   ;jump if A <= 0.0
	comisd xmm1,xmm7
	jp .invalidValue
	jbe .invalidValue                   ;jump if B <= 0.0

; Load and validate Alpha
	movsd xmm2,qword[ebx+PDATA.Alpha]
	comisd xmm2,xmm7
	jp .invalidValue
	jbe .invalidValue                   ;jump if Alpha <= 0.0
	comisd xmm2,xmm6
	jae .invalidValue                   ;jump if Alpha >= 180.0

; Compute Beta
	subsd xmm6,xmm2                     ;Beta = 180.0 - Alpha
	movsd qword[ebx+PDATA.Beta],xmm6    ;save Beta

; Compute sin(Alpha)
	mulsd xmm2,qword[DegToRad]          ;convert Alpha to radians
	movsd qword[ebp-16],xmm2            ;save value for later
	movsd qword[esp],xmm2               ;copy Alpha onto stack
	mov n,ecx                           ;park loop counter across the call
	call sin
	mov ecx,n                           ;restore loop counter
	fstp qword[ebp-8]                   ;save sin(Alpha)

; Compute parallelogram Height and Area
	movsd xmm0,qword[ebx+PDATA.A]       ;A
	mulsd xmm0,qword[ebp-8]             ;A * sin(Alpha)
	movsd qword[ebx+PDATA.H],xmm0       ;save height
	mulsd xmm0,qword[ebx+PDATA.B]       ;A * sin(Alpha) * B
	movsd qword[ebx+PDATA.Area],xmm0    ;save area

; Compute cos(Alpha)
	movsd xmm0,qword[ebp-16]            ;xmm0 = Alpha in radians
	movsd qword[esp],xmm0               ;copy Alpha onto stack
	mov n,ecx                           ;park loop counter across the call
	call cos
	mov ecx,n                           ;restore loop counter
	fstp qword[ebp-8]                   ;save cos(Alpha)

; Compute 2.0 * A * B * cos(Alpha)
	movsd xmm0,qword[r8_2p0]
	movsd xmm1,qword[ebx+PDATA.A]
	movsd xmm2,qword[ebx+PDATA.B]
	mulsd xmm0,xmm1                     ;2 * A
	mulsd xmm0,xmm2                     ;2 * A * B
	mulsd xmm0,qword[ebp-8]             ;2 * A * B * cos(Alpha)

; Compute A * A + B * B
	movsd xmm3,xmm1
	movsd xmm4,xmm2
	mulsd xmm3,xmm3                     ;A * A
	mulsd xmm4,xmm4                     ;B * B
	addsd xmm3,xmm4                     ;A * A + B * B
	movsd xmm4,xmm3                     ;A * A + B * B

; Compute P and Q
	subsd xmm3,xmm0
	sqrtsd xmm3,xmm3                    ;xmm3 = P
	movsd qword[ebx+PDATA.P],xmm3
	addsd xmm4,xmm0
	sqrtsd xmm4,xmm4                    ;xmm4 = Q
	movsd qword[ebx+PDATA.Q],xmm4
	mov byte[ebx+PDATA.BadVal],0        ;set BadVal to false

.nextItem:
	add ebx,PDATA.size                  ;ebx = next element in array
	dec ecx
	jnz .loop1                          ;repeat loop until done

	add esp,20                          ;release sin/cos argument slot
	mov eax,1                           ;n > 0: report success

.done:
	pop ebx
	mov esp,ebp
	pop ebp
	ret

; Set structure members to known values for display purposes
.invalidValue:
	movsd xmm0,qword[r8_MinusOne]
	movsd qword[ebx+PDATA.Beta],xmm0
	movsd qword[ebx+PDATA.H],xmm0
	movsd qword[ebx+PDATA.Area],xmm0
	movsd qword[ebx+PDATA.P],xmm0
	movsd qword[ebx+PDATA.Q],xmm0
	mov byte[ebx+PDATA.BadVal],1
	jmp .nextItem
build
g++ -c -m32 main.cpp -o main.o -std=c++11
nasm -f elf32 -o ssescalarfloatingpointparallelograms.o ssescalarfloatingpointparallelograms.asm
g++ -m32 -o ssescalarfloatingpointparallelograms ssescalarfloatingpointparallelograms.o main.o