main.cpp
#include <stdio.h>
#include "../../commonfiles/miscdefs.h"

// Routines implemented in avxgprmulxshiftx.asm (both require BMI2).
//
// AvxGprMulx: returns the 64-bit unsigned product of a and b computed with
// mulx. flags[0] receives the status-flag byte read with lahf before the
// multiply and flags[1] the byte read after it.
extern "C" Uint64 AvxGprMulx(Uint32 a, Uint32 b, Uint8 flags[2]);
// AvxGprShiftx: shifts x by count bits three ways and stores the outcomes as
// results[0] = sarx (arithmetic right), results[1] = shlx (logical left),
// results[2] = shrx (logical right).
extern "C" void AvxGprShiftx(Int32 x, Uint32 count, Int32 results[3]);

void AvxGprMulxCpp(void)
{
    const int n = 3;
    Uint32 a[n] = {64, 3200, 100000000};
    Uint32 b[n] = {1001, 12, 250000000};

    printf("Results for AvxGprMulx()\n");
    for (int i = 0; i < n; i++)
    {
        Uint8 flags[2];
        Uint64 c = AvxGprMulx(a[i], b[i], flags);

        printf("Test case %d\n", i);
        printf("  a: %u  b: %u  c: %llu\n", a[i], b[i], c);
        printf("  status flags before mulx: 0x%02X\n", flags[0]);
        printf("  status flags after mulx:  0x%02X\n", flags[1]);
    }
}

void AvxGprShiftxCpp(void)
{
    const int n = 4;
    Int32 x[n] = { 0x00000008, 0x80000080, 0x00000040, 0xfffffc10 };
    Uint32 count[n] = { 2, 5, 3, 4 };

    printf("\nResults for AvxGprShiftx()\n");
    for (int i = 0; i < n; i++)
    {
        Int32 results[3];

        AvxGprShiftx(x[i], count[i], results);
        printf("Test case %d\n", i);
        printf("  x:    0x%08X (%11d) count: %u\n", x[i], x[i], count[i]);
        printf("  sarx: 0x%08X (%11d)\n", results[0], results[0]);
        printf("  shlx: 0x%08X (%11d)\n", results[1], results[1]);
        printf("  shrx: 0x%08X (%11d)\n", results[2], results[2]);
    }
}

// Program entry point: runs the mulx demonstration followed by the
// sarx/shlx/shrx demonstration, then exits with status 0.
int main(int argc, char* argv[])
{
    // Command-line arguments are not used by this demo.
    (void)argc;
    (void)argv;

    AvxGprMulxCpp();
    AvxGprShiftxCpp();

    return 0;
}
avxgprmulxshiftx.asm
; Name:     avxgprmulxshiftx.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o avxgprmulxshiftx.o avxgprmulxshiftx.asm
;           g++ -m32 -o avxgprmulxshiftx avxgprmulxshiftx.o main.o
;
; Source:   Modern x86 Assembly Language Programming p. 482

global AvxGprMulx
global AvxGprShiftx

section .text

; extern "C" Uint64 AvxGprMulx(Uint32 a, Uint32 b, Uint8 flags[2]);
;
; Description:  The following function demonstrates use of the
;               flagless unsigned integer multiply instruction mulx.
;
; Requires:     BMI2 (mulx is a BMI2 instruction).

; Symbolic names for the three stack arguments. The offsets are relative to
; ebp and are only valid once the prologue below has executed.
%define a     [ebp+8]
%define b     [ebp+12]
%define flags [ebp+16]

; AvxGprMulx
;   In:   a, b   = 32-bit unsigned factors
;         flags  = pointer to a 2-byte array
;   Out:  edx:eax = 64-bit product of a and b
;         flags[0] = status-flag byte (from lahf) before mulx
;         flags[1] = status-flag byte (from lahf) after mulx
AvxGprMulx:
    push    ebp
    mov     ebp,esp
; Save copy of status flags before mulx
    mov     ecx,flags
    lahf
    mov     byte[ecx],ah
; Perform flagless multiplication.  The mulx instruction below computes
; the product of explicit source operand [ebp+8] and implicit source
; operand edx. The 64-bit result is saved to the register pair edx:eax
; (edx = high dword, eax = low dword).
    mov     edx,b                    ;edx = b
    mulx    edx,eax,a                ;edx:eax = [ebp+8] * edx
; Save copy of status flags after mulx.  lahf overwrites ah, which is part
; of eax (the low dword of the product), so eax is preserved on the stack
; around the flag capture and restored before returning.
    push    eax
    lahf
    mov     byte[ecx+1],ah
    pop     eax
    pop     ebp
    ret

; extern "C" void AvxGprShiftx(Int32 x, Uint32 count, Int32 results[3]);
;
; Description:  The following function demonstrates use of the flagless
;               shift instructions sarx, shlx, and shrx.
;
; Requires:     BMI2 (sarx, shlx, and shrx are BMI2 instructions).

%define x       [ebp+8]
%define count   [ebp+12]
%define results [ebp+16]

; AvxGprShiftx
;   In:   x       = 32-bit value to shift (read from the stack at [ebp+8])
;         count   = shift bit count (read from the stack at [ebp+12])
;         results = pointer to a 3-dword array (read from [ebp+16])
;   Out:  results[0] = sarx result, results[1] = shlx result,
;         results[2] = shrx result
AvxGprShiftx:
    push    ebp
    mov     ebp,esp
; Load argument values and perform shifts.  Note that each shift
; instruction requires three operands: DesOp, SrcOp, and CountOp.
; The source operand is re-read from the stack for every shift, so each
; result is computed from the original value of x rather than from the
; previous shift's output in eax.
    mov     ecx,count        ;ecx = shift bit count
    mov     edx,results      ;edx = ptr to results
    sarx    eax,x,ecx        ;shift arithmetic right
    mov     [edx],eax
    shlx    eax,x,ecx        ;shift logical left
    mov     [edx+4],eax
    shrx    eax,x,ecx        ;shift logical right
    mov     [edx+8],eax
    pop     ebp
    ret
build
g++ -c -m32 main.cpp -o main.o
nasm -f elf32 -o avxgprmulxshiftx.o avxgprmulxshiftx.asm
g++ -m32 -o avxgprmulxshiftx avxgprmulxshiftx.o main.o