main.cpp
#include "../../commonfiles/xmmval.h"
#include <stdio.h>
extern "C" void SsePiAddI16(const XmmVal* a, const XmmVal* b, XmmVal c[2]);
extern "C" void SsePiSubI32(const XmmVal* a, const XmmVal* b, XmmVal* c);
extern "C" void SsePiMul32(const XmmVal* a, const XmmVal* b, XmmVal c[2]);
// Demo driver for SsePiAddI16: fills two vectors of eight signed 16-bit
// lanes, calls the assembly routine and prints both result vectors.
// c[0] receives the wraparound sums and c[1] the saturated sums; lanes
// 2, 3, 6 and 7 are chosen so that a + b leaves the signed 16-bit range,
// which makes the two results differ in those lanes.
void SsePiAddI16Cpp(void)
{
    // Per-lane test inputs, lane 0 first.
    static const short a_lanes[8] = { 10, 200, 30, -32766, 50, 60, 32000, -32000 };
    static const short b_lanes[8] = { 100, -200, 32760, -400, 500, -600, 1200, -950 };

    // The assembly routine uses aligned moves on all three operands.
    __attribute__ ((aligned(16))) XmmVal a;
    __attribute__ ((aligned(16))) XmmVal b;
    __attribute__ ((aligned(16))) XmmVal c[2];
    char buff[256];

    for (int lane = 0; lane < 8; ++lane)
    {
        a.i16[lane] = a_lanes[lane];
        b.i16[lane] = b_lanes[lane];
    }

    SsePiAddI16(&a, &b, c);

    printf("\nResults for SsePiAddI16\n");

    // Wraparound result.
    printf("a: %s\n", a.ToString_i16(buff, sizeof(buff)));
    printf("b: %s\n", b.ToString_i16(buff, sizeof(buff)));
    printf("c[0]: %s\n", c[0].ToString_i16(buff, sizeof(buff)));

    printf("\n");

    // Saturated result (inputs repeated for easy comparison).
    printf("a: %s\n", a.ToString_i16(buff, sizeof(buff)));
    printf("b: %s\n", b.ToString_i16(buff, sizeof(buff)));
    printf("c[1]: %s\n", c[1].ToString_i16(buff, sizeof(buff)));
}
// Demo driver for SsePiSubI32: fills two vectors of four signed 32-bit
// lanes, calls the assembly routine to compute a - b and prints the
// operands and the result.
void SsePiSubI32Cpp(void)
{
    // Per-lane test inputs, lane 0 first.
    static const int a_lanes[4] = { 800, 500, 1000, 900 };
    static const int b_lanes[4] = { 250, -2000, -40, 1200 };

    __attribute__ ((aligned(16))) XmmVal a;
    __attribute__ ((aligned(16))) XmmVal b;
    // Only 8-byte alignment is requested for the destination, so it may not
    // sit on a 16-byte boundary; the assembly routine stores to it with an
    // unaligned move.
    __attribute__ ((aligned(8))) XmmVal c;
    char buff[256];

    for (int lane = 0; lane < 4; ++lane)
    {
        a.i32[lane] = a_lanes[lane];
        b.i32[lane] = b_lanes[lane];
    }

    SsePiSubI32(&a, &b, &c);

    printf("\nResults for SsePiSubI32\n");
    printf("a: %s\n", a.ToString_i32(buff, sizeof(buff)));
    printf("b: %s\n", b.ToString_i32(buff, sizeof(buff)));
    printf("c: %s\n", c.ToString_i32(buff, sizeof(buff)));
}
// Demo driver for SsePiMul32: fills two vectors of four signed 32-bit
// lanes, calls the assembly routine and prints both result vectors.
// c[0] is printed as four 32-bit values (the pmulld result) and c[1] as
// 64-bit values (the pmuldq result).
void SsePiMul32Cpp(void)
{
    // Per-lane test inputs, lane 0 first.
    static const int a_lanes[4] = { 10, 20, -30, -40 };
    static const int b_lanes[4] = { 100, -200, 300, -400 };

    // The assembly routine uses aligned moves on all three operands.
    __attribute__ ((aligned(16))) XmmVal a;
    __attribute__ ((aligned(16))) XmmVal b;
    __attribute__ ((aligned(16))) XmmVal c[2];
    char buff[256];

    for (int lane = 0; lane < 4; ++lane)
    {
        a.i32[lane] = a_lanes[lane];
        b.i32[lane] = b_lanes[lane];
    }

    SsePiMul32(&a, &b, c);

    printf("\nResults for SsePiMul32\n");

    // 32-bit products.
    printf("a: %s\n", a.ToString_i32(buff, sizeof(buff)));
    printf("b: %s\n", b.ToString_i32(buff, sizeof(buff)));
    printf("c[0]: %s\n", c[0].ToString_i32(buff, sizeof(buff)));

    printf("\n");

    // 64-bit products (inputs repeated for easy comparison).
    printf("a: %s\n", a.ToString_i32(buff, sizeof(buff)));
    printf("b: %s\n", b.ToString_i32(buff, sizeof(buff)));
    printf("c[1]: %s\n", c[1].ToString_i64(buff, sizeof(buff)));
}
// Program entry point: runs the three packed-integer demos in order
// (add, subtract, multiply) and returns 0. The command-line arguments
// are not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    // Demos in the order their output should appear.
    void (*const demos[])(void) = {
        SsePiAddI16Cpp,
        SsePiSubI32Cpp,
        SsePiMul32Cpp
    };
    const unsigned demo_count = sizeof(demos) / sizeof(demos[0]);

    for (unsigned i = 0; i < demo_count; ++i)
        demos[i]();

    return 0;
}
ssepackedintegerfundamentals.asm
; Name: ssepackedintegerfundamentals.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o ssepackedintegerfundamentals.o ssepackedintegerfundamentals.asm
; g++ -m32 -o ssepackedintegerfundamentals ssepackedintegerfundamentals.o main.o ../../commonfiles/xmmval.o
;
; Source: Modern x86 Assembly Language Programming p. 273
global SsePiAddI16
global SsePiSubI32
global SsePiMul32
section .text
;-----------------------------------------------------------------------
; extern "C" void SsePiAddI16(const XmmVal* a, const XmmVal* b, XmmVal c[2]);
;
; Purpose:  Adds the eight packed signed words of a and b in two modes:
;             c[0] = a + b using wraparound arithmetic  (paddw)
;             c[1] = a + b using signed saturation      (paddsw)
;
; ABI:      32-bit, arguments passed on the stack (built with -m32).
; In:       [ebp+8] = a, [ebp+12] = b, [ebp+16] = c
;           All three pointers must be 16-byte aligned (movdqa is used).
; Out:      c[0] and c[1] are written; there is no return value.
; Clobbers: eax, ecx, edx, xmm0, xmm1, xmm2
;
; Requires: SSE2
;-----------------------------------------------------------------------
SsePiAddI16:
.arg_a      equ     8                   ;frame offset of pointer a
.arg_b      equ     12                  ;frame offset of pointer b
.arg_c      equ     16                  ;frame offset of pointer c
.c1_off     equ     16                  ;byte offset of c[1] within c

        push    ebp
        mov     ebp, esp

; Fetch the three pointer arguments
        mov     ecx, [ebp + .arg_a]     ;ecx -> a
        mov     edx, [ebp + .arg_b]     ;edx -> b
        mov     eax, [ebp + .arg_c]     ;eax -> c

; Load the operands; a is needed twice, once per addition mode
        movdqa  xmm0, [ecx]             ;xmm0 = a (wraparound accumulator)
        movdqa  xmm2, [edx]             ;xmm2 = b
        movdqa  xmm1, xmm0              ;xmm1 = a (saturated accumulator)

; Packed word additions
        paddw   xmm0, xmm2              ;xmm0 = a + b, wraparound
        paddsw  xmm1, xmm2              ;xmm1 = a + b, saturated

; Store both results
        movdqa  [eax], xmm0             ;c[0] = wraparound sums
        movdqa  [eax + .c1_off], xmm1   ;c[1] = saturated sums

        pop     ebp
        ret
;-----------------------------------------------------------------------
; extern "C" void SsePiSubI32(const XmmVal* a, const XmmVal* b, XmmVal* c);
;
; Purpose:  Packed signed doubleword subtraction: *c = *a - *b for each
;           of the four 32-bit lanes.
;
; ABI:      32-bit, arguments passed on the stack (built with -m32).
; In:       [ebp+8] = a, [ebp+12] = b, [ebp+16] = c
;           a and b must be 16-byte aligned (movdqa is used).
;           c may be unaligned (movdqu is used for the store).
; Out:      *c is written; there is no return value.
; Clobbers: eax, ecx, edx, xmm0, xmm1
;
; Requires: SSE2
;-----------------------------------------------------------------------
SsePiSubI32:
.arg_a      equ     8                   ;frame offset of pointer a
.arg_b      equ     12                  ;frame offset of pointer b
.arg_c      equ     16                  ;frame offset of pointer c

        push    ebp
        mov     ebp, esp

; Fetch the three pointer arguments
        mov     ecx, [ebp + .arg_a]     ;ecx -> a
        mov     edx, [ebp + .arg_b]     ;edx -> b
        mov     eax, [ebp + .arg_c]     ;eax -> c

; Load both operands with aligned moves, then subtract lane by lane
        movdqa  xmm0, [ecx]             ;xmm0 = a
        movdqa  xmm1, [edx]             ;xmm1 = b
        psubd   xmm0, xmm1              ;xmm0 = a - b

; The destination is not guaranteed to be 16-byte aligned
        movdqu  [eax], xmm0             ;*c = a - b (unaligned store)

        pop     ebp
        ret
;-----------------------------------------------------------------------
; extern "C" void SsePiMul32(const XmmVal* a, const XmmVal* b, XmmVal c[2]);
;
; Purpose:  Packed signed doubleword multiplication in two forms:
;             c[0] = low 32 bits of a[i] * b[i] for all four lanes (pmulld)
;             c[1] = full 64-bit products of doubleword lanes 0 and 2
;                    (pmuldq)
;
; ABI:      32-bit, arguments passed on the stack (built with -m32).
; In:       [ebp+8] = a, [ebp+12] = b, [ebp+16] = c
;           All three pointers must be 16-byte aligned (movdqa is used).
; Out:      c[0] and c[1] are written; there is no return value.
; Clobbers: eax, ecx, edx, xmm0, xmm1, xmm2
;
; Requires: SSE4.1
;-----------------------------------------------------------------------
SsePiMul32:
.arg_a      equ     8                   ;frame offset of pointer a
.arg_b      equ     12                  ;frame offset of pointer b
.arg_c      equ     16                  ;frame offset of pointer c
.c1_off     equ     16                  ;byte offset of c[1] within c

        push    ebp
        mov     ebp, esp

; Fetch the three pointer arguments
        mov     ecx, [ebp + .arg_a]     ;ecx -> a
        mov     edx, [ebp + .arg_b]     ;edx -> b
        mov     eax, [ebp + .arg_c]     ;eax -> c

; Load the operands; a is needed twice, once per multiply form
        movdqa  xmm0, [ecx]             ;xmm0 = a (for 32-bit products)
        movdqa  xmm2, [edx]             ;xmm2 = b
        movdqa  xmm1, xmm0              ;xmm1 = a (for 64-bit products)

; Packed signed doubleword multiplications
        pmulld  xmm0, xmm2              ;xmm0 = low dword of each product
        pmuldq  xmm1, xmm2              ;xmm1 = qword products, lanes 0 and 2

; Store both results
        movdqa  [eax], xmm0             ;c[0] = pmulld result
        movdqa  [eax + .c1_off], xmm1   ;c[1] = pmuldq result

        pop     ebp
        ret
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o ssepackedintegerfundamentals.o ssepackedintegerfundamentals.asm
g++ -m32 -o ssepackedintegerfundamentals ssepackedintegerfundamentals.o main.o ../../commonfiles/xmmval.o