main.h
//modifications
#pragma once
#include "../../commonfiles/mat4x4.h"

// SsePackedFloatingPointMatrix4x4.cpp function declarations
//extern void SsePfpMatrix4x4MultiplyCpp(Mat4x4 m_des, Mat4x4 m_src1, Mat4x4 m_src2);
//extern void SsePfpMatrix4x4TransformVectorsCpp(Vec4x1* v_des, Mat4x4 m_src, Vec4x1* v_src, int num_vec);

// SsePackedFloatingPointMatrix4x4_.asm function declarations
extern "C" void SsePfpMatrix4x4Multiply(Mat4x4 m_des, Mat4x4 m_src1, Mat4x4 m_src2);
extern "C" void SsePfpMatrix4x4TransformVectors(Vec4x1* v_des, Mat4x4 m_src, Vec4x1* v_src, int num_vec);
main.cpp
#include "main.h"

// The functions Mat4x4Mul and Mat4x4MulVec are defined in
// the file CommonFiles\Mat4x4.cpp
/*
void SsePfpMatrix4x4MultiplyCpp(Mat4x4 m_des, Mat4x4 m_src1, Mat4x4 m_src2)
{
    Mat4x4Mul(m_des, m_src1, m_src2);
}

void SsePfpMatrix4x4TransformVectorsCpp(Vec4x1* v_des, Mat4x4 m_src, Vec4x1* v_src, int num_vec)
{
    for (int i= 0; i < num_vec; i++)
        Mat4x4MulVec(v_des[i], m_src, v_src[i]);
}
*/

void SsePfpMatrix4x4MultiplyCpp(void)
{
	__attribute__ ((aligned(16))) Mat4x4 m_src1;
	__attribute__ ((aligned(16))) Mat4x4 m_src2;
	__attribute__ ((aligned(16))) Mat4x4 m_des1;
	__attribute__ ((aligned(16))) Mat4x4 m_des2;

    Mat4x4SetRow(m_src1, 0, 10.5, 11, 12, -13.625);
    Mat4x4SetRow(m_src1, 1, 14, 15, 16, 17.375);
    Mat4x4SetRow(m_src1, 2, 18.25, 19, 20.125, 21);
    Mat4x4SetRow(m_src1, 3, 22, 23.875, 24, 25);

    Mat4x4SetRow(m_src2, 0, 7, 1, 4, 8);
    Mat4x4SetRow(m_src2, 1, 14, -5, 2, 9);
    Mat4x4SetRow(m_src2, 2, 10, 9, 3, 6);
    Mat4x4SetRow(m_src2, 3, 2, 11, -14, 13);

    //SsePfpMatrix4x4MultiplyCpp(m_des1, m_src1, m_src2);
	Mat4x4Mul(m_des1, m_src1, m_src2);
    SsePfpMatrix4x4Multiply(m_des2, m_src1, m_src2);

    printf("\nResults for SsePfpMatrix4x4Multiply()\n");
    Mat4x4Printf(m_src1, "\nMatrix m_src1\n");
    Mat4x4Printf(m_src2, "\nMatrix m_src2\n");
    Mat4x4Printf(m_des1, "\nMatrix m_des1\n");
    Mat4x4Printf(m_des2, "\nMatrix m_des2\n");
}

void SsePfpMatrix4x4TransformVectorsCpp(void)
{
    const int n = 8;
	__attribute__ ((aligned(16))) Mat4x4 m_src;
	__attribute__ ((aligned(16))) Vec4x1 v_src[n];
	__attribute__ ((aligned(16))) Vec4x1 v_des1[n];
	__attribute__ ((aligned(16))) Vec4x1 v_des2[n];

    Vec4x1Set(v_src[0], 10, 10, 10, 1);
    Vec4x1Set(v_src[1], 10, 11, 10, 1);
    Vec4x1Set(v_src[2], 11, 10, 10, 1);
    Vec4x1Set(v_src[3], 11, 11, 10, 1);
    Vec4x1Set(v_src[4], 10, 10, 12, 1);
    Vec4x1Set(v_src[5], 10, 11, 12, 1);
    Vec4x1Set(v_src[6], 11, 10, 12, 1);
    Vec4x1Set(v_src[7], 11, 11, 12, 1);

    // m_src = scale(2, 3, 4)
    Mat4x4SetRow(m_src, 0, 2, 0, 0, 0);
    Mat4x4SetRow(m_src, 1, 0, 3, 0, 0);
    Mat4x4SetRow(m_src, 2, 0, 0, 7, 0);
    Mat4x4SetRow(m_src, 3, 0, 0, 0, 1);

    //SsePfpMatrix4x4TransformVectorsCpp(v_des1, m_src, v_src, n);
	for (int i= 0; i < n; i++)
		Mat4x4MulVec(v_des1[i], m_src, v_src[i]);
	
    SsePfpMatrix4x4TransformVectors(v_des2, m_src, v_src, n);

    printf("\nResults for SsePfpMatrix4x4TransformVectors()\n");
    Mat4x4Printf(m_src, "Matrix m_src\n");
    printf("\n");

    for (int i = 0; i < n; i++)
    {
        const char* fmt = "%4s %4d: %12.6f %12.6f %12.6f %12.6f\n";
        printf(fmt, "v_src  ", i, v_src[i][0],  v_src[i][1],  v_src[i][2],  v_src[i][3]);
        printf(fmt, "v_des1 ", i, v_des1[i][0], v_des1[i][1], v_des1[i][2], v_des1[i][3]);
        printf(fmt, "v_des2 ", i, v_des2[i][0], v_des2[i][1], v_des2[i][2], v_des2[i][3]);
        printf("\n");
    }
}

int main(int argc, char* argv[])
{
    SsePfpMatrix4x4MultiplyCpp();
    SsePfpMatrix4x4TransformVectorsCpp();

    //SsePfpMatrix4x4MultiplyTimed();
    //SsePfpMatrix4x4TransformVectorsTimed();
    return 0;
}
ssepackedfloatingpointmatrix4x4.asm
; Name:     ssepackedfloatingpointmatrix4x4.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o ssepackedfloatingpointmatrix4x4.o ssepackedfloatingpointmatrix4x4.asm
;           g++ -m32 -o ssepackedfloatingpointmatrix4x4 ssepackedfloatingpointmatrix4x4.o main.o ../../commonfiles/mat4x4.o
;
; Source:   Modern x86 Assembly Language Programming p. 260

global SsePfpMatrix4x4Multiply
global SsePfpMatrix4x4TransformVectors

; Mat4x4Transpose macro
;
; Description:  This macro computes the transpose of a 4x4
;               single-precision floating-point matrix.
;
;   Input Matrix                    Output Matrtix
;   xmm0    a3 a2 a1 a0             xmm4    d0 c0 b0 a0
;   xmm1    b3 b2 b1 b0             xmm5    d1 c1 b1 a1
;   xmm2    c3 c2 c1 c0             xmm6    d2 c2 b2 a2
;   xmm3    d3 d2 d1 d0             xmm7    d3 c3 b3 a3
;
; Note:     The row of a 4x4 matrix is reversed when loaded into an
;           XMM register due to x86 little-endian ordering.
;
; Requires: SSE

%MACRO Mat4x4Transpose 0
    movaps   xmm4,xmm0
    unpcklps xmm4,xmm1                  ;xmm4 = b1 a1 b0 a0
    unpckhps xmm0,xmm1                  ;xmm0 = b3 a3 b2 a2
    movaps   xmm5,xmm2
    unpcklps xmm5,xmm3                  ;xmm5 = d1 c1 d0 c0
    unpckhps xmm2,xmm3                  ;xmm2 = d3 c3 d2 c2

    movaps   xmm1,xmm4
    movlhps  xmm4,xmm5                  ;xmm4 = d0 c0 b0 a0
    movhlps  xmm5,xmm1                  ;xmm5 = d1 c1 b1 a1
    movaps   xmm6,xmm0
    movlhps  xmm6,xmm2                  ;xmm6 = d2 c2 b2 a2
    movaps   xmm7,xmm2
    movhlps  xmm7,xmm0                  ;xmm7 = d3 c3 b2 a3
%ENDMACRO

section .text

; extern "C" void SsePfpMatrix4x4Multiply(Mat4x4 m_des, Mat4x4 m_src1, Mat4x4 m_src2);
;
; Description:  The following function computes the product of two
;               4x4 single-precision floating-point matrices.
;
; Requires: SSE4.1

%define m_des   [ebp+8]
%define m_src1  [ebp+12]
%define m_src2  [ebp+16]

SsePfpMatrix4x4Multiply:
    push     ebp
    mov      ebp,esp
    push     ebx
; Compute transpose of m_src2 (m_src2_T)
    mov      ebx,m_src2                 ;ebx = m_src2
    movaps   xmm0,[ebx]
    movaps   xmm1,[ebx+16]
    movaps   xmm2,[ebx+32]
    movaps   xmm3,[ebx+48]              ;xmm3:xmm0 = m_src2
    Mat4x4Transpose                     ;xmm7:xmm4 = m_src2_T
; Perform initializations for matrix product
    mov      edx,m_des                  ;edx = m_des
    mov      ebx,m_src1                 ;ebx = m_src1
    mov      ecx,4                      ;ecx = number of rows
    xor      eax,eax                    ;eax = offset into arrays
; Repeat loop until matrix product is calculated.
align 16
.@1:
    movaps   xmm0,[ebx+eax]             ;xmm0 = row i of m_src1
; Compute dot product of m_src1 row i and m_src2_T row 0
    movaps   xmm1,xmm0
    dpps     xmm1,xmm4,11110001b        ;xmm1[31:0] = dot product
    insertps xmm3,xmm1,00000000b        ;xmm3[31:0] = xmm1[31:0]
; Compute dot product of m_src1 row i and m_src2_T row 1
    movaps   xmm2,xmm0
    dpps     xmm2,xmm5,11110001b        ;xmm2[31:0] = dot product
    insertps xmm3,xmm2,00010000b        ;xmm3[63:32] = xmm2[31:0]
; Compute dot product of m_src1 row i and m_src2_T row 2
    movaps   xmm1,xmm0
    dpps     xmm1,xmm6,11110001b        ;xmm1[31:0] = dot product
    insertps xmm3,xmm1,00100000b        ;xmm3[95:64] = xmm1[31:0]
; Compute dot product of m_src1 row i and m_src2_T row 3
    movaps   xmm2,xmm0
    dpps     xmm2,xmm7,11110001b        ;xmm2[31:0] = dot product
    insertps xmm3,xmm2,00110000b        ;xmm3[127:96] = xmm2[31:0]
; Save m_des.row i and update loop variables
    movaps   [edx+eax],xmm3             ;save current row result
    add      eax,16                     ;set array offset to next row
    dec      ecx
    jnz      .@1
    pop      ebx
    pop      ebp
    ret

; extern void SsePfpMatrix4x4TransformVectors(Vec4x1* v_des, Mat4x4 m_src, Vec4x1* v_src, int num_vec);
;
; Description:  The following function applies a transformation matrix
;               to an array 4x1 single-precision floating-point vectors.
;
; Requires:     SSE4.1

%define v_des   [ebp+8]
%define m_src   [ebp+12]
%define v_src   [ebp+16]
%define num_vec [ebp+20]

SsePfpMatrix4x4TransformVectors:
    push     ebp
    mov      ebp,esp
    push     esi
    push     edi
; Make sure num_vec is valid
    mov      ecx,num_vec                ;ecx = num_vec
    test     ecx,ecx
    jle      .done                      ;jump if num_vec <= 0
; Load m_src into xmm3:xmm0
    mov      eax,m_src                  ;eax = pointer to m_src
    movaps   xmm0,[eax]                 ;xmm0 = row 0
    movaps   xmm1,[eax+16]              ;xmm1 = row 1
    movaps   xmm2,[eax+32]              ;xmm2 = row 2
    movaps   xmm3,[eax+48]              ;xmm3 = row 3
; Initialize pointers to v_src and v_des
    mov      esi,v_src                  ;esi = pointer to v_src
    mov      edi,v_des                  ;edi = pointer to v_des
    xor      eax,eax                    ;eax = array offset
; Compute v_des[i] = m_src * v_src[i]
align 16
.@1:
    movaps   xmm4,[esi+eax]             ;xmm4 = vector v_src[i]
; Compute dot product of m_src row 0 and v_src[i]
    movaps   xmm5,xmm4
    dpps     xmm5,xmm0,11110001b        ;xmm5[31:0] = dot product
    insertps xmm7,xmm5,00000000b        ;xmm7[31:0] = xmm5[31:0]
; Compute dot product of m_src row 1 and v_src[i]
    movaps   xmm6,xmm4
    dpps     xmm6,xmm1,11110001b        ;xmm6[31:0] = dot product
    insertps xmm7,xmm6,00010000b        ;xmm7[63:32] = xmm6[31:0]
; Compute dot product of m_src row 2 and v_src[i]
    movaps   xmm5,xmm4
    dpps     xmm5,xmm2,11110001b        ;xmm5[31:0] = dot product
    insertps xmm7,xmm5,00100000b        ;xmm7[95:64] = xmm5[31:0]
; Compute dot product of m_src row 3 and v_src[i]
    movaps   xmm6,xmm4
    dpps     xmm6,xmm3,11110001b        ;xmm6[31:0] = dot product
    insertps xmm7,xmm6,00110000b        ;xmm7[127:96] = xmm6[31:0]
; Save v_des[i] and update loop variables
    movaps   [edi+eax],xmm7             ;save transformed vector
    add      eax,16
    dec      ecx
    jnz      .@1
.done:
    pop      edi
    pop      esi
    pop      ebp
    ret
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o ssepackedfloatingpointmatrix4x4.o ssepackedfloatingpointmatrix4x4.asm
g++ -m32 -o ssepackedfloatingpointmatrix4x4 ssepackedfloatingpointmatrix4x4.o main.o ../../commonfiles/mat4x4.o