main.cpp
#include <stdio.h>
#include "../../commonfiles/ymmval.h"

// Prototypes for the routines implemented in avxblend.asm. They are declared
// with C linkage so the unmangled symbol names match the NASM `global` labels.
// Each routine takes a destination pointer followed by three source pointers;
// the drivers in this file pass the blend control values as src3.
// The zero-argument C++ functions of the same names defined in this file are
// ordinary C++ overloads that act as test drivers for these routines.
extern "C" void AvxBlendFloat(YmmVal* des, YmmVal* src1, YmmVal* src2, YmmVal* src3);
extern "C" void AvxBlendByte(YmmVal* des, YmmVal* src1, YmmVal* src2, YmmVal* src3);

void AvxBlendFloat(void)
{
    char buff[256];
    const Uint32 select1 = 0x00000000;
    const Uint32 select2 = 0x80000000;
    __attribute__((aligned(32))) YmmVal des, src1, src2, src3;

    src1.r32[0] = 100.0f;       src2.r32[0] = -1000.0f;
    src1.r32[1] = 200.0f;       src2.r32[1] = -2000.0f;
    src1.r32[2] = 300.0f;       src2.r32[2] = -3000.0f;
    src1.r32[3] = 400.0f;       src2.r32[3] = -4000.0f;
    src1.r32[4] = 500.0f;       src2.r32[4] = -5000.0f;
    src1.r32[5] = 600.0f;       src2.r32[5] = -6000.0f;
    src1.r32[6] = 700.0f;       src2.r32[6] = -7000.0f;
    src1.r32[7] = 800.0f;       src2.r32[7] = -8000.0f;

    src3.u32[0] = select2;
    src3.u32[1] = select2;
    src3.u32[2] = select1;
    src3.u32[3] = select2;
    src3.u32[4] = select1;
    src3.u32[5] = select1;
    src3.u32[6] = select2;
    src3.u32[7] = select1;

    AvxBlendFloat(&des, &src1, &src2, &src3);

    printf("\nResults for AvxBlendFloat()\n");
    printf("src1 lo: %s\n", src1.ToString_r32(buff, sizeof(buff), false));
    printf("src1 hi: %s\n", src1.ToString_r32(buff, sizeof(buff), true));
    printf("src2 lo: %s\n", src2.ToString_r32(buff, sizeof(buff), false));
    printf("src2 hi: %s\n", src2.ToString_r32(buff, sizeof(buff), true));
    printf("\n");
    printf("src3 lo: %s\n", src3.ToString_x32(buff, sizeof(buff), false));
    printf("src3 hi: %s\n", src3.ToString_x32(buff, sizeof(buff), true));
    printf("\n");
    printf("des lo:  %s\n", des.ToString_r32(buff, sizeof(buff), false));
    printf("des hi:  %s\n", des.ToString_r32(buff, sizeof(buff), true));
}

void AvxBlendByte(void)
{
    char buff[256];
	__attribute__((aligned(32))) YmmVal des, src1, src2, src3;

    // Control values required to perform doubleword blend
    // using vpblendvb instruction
    const Uint32 select1 = 0x00000000;      // select src1
    const Uint32 select2 = 0x80808080;      // select src2

    src1.i32[0] = 100;          src2.i32[0] = -1000;
    src1.i32[1] = 200;          src2.i32[1] = -2000;
    src1.i32[2] = 300;          src2.i32[2] = -3000;
    src1.i32[3] = 400;          src2.i32[3] = -4000;
    src1.i32[4] = 500;          src2.i32[4] = -5000;
    src1.i32[5] = 600;          src2.i32[5] = -6000;
    src1.i32[6] = 700;          src2.i32[6] = -7000;
    src1.i32[7] = 800;          src2.i32[7] = -8000;

    src3.u32[0] = select1;
    src3.u32[1] = select1;
    src3.u32[2] = select2;
    src3.u32[3] = select1;
    src3.u32[4] = select2;
    src3.u32[5] = select2;
    src3.u32[6] = select1;
    src3.u32[7] = select2;

    AvxBlendByte(&des, &src1, &src2, &src3);

    printf("\nResults for AvxBlendByte() - doublewords\n");
    printf("src1 lo: %s\n", src1.ToString_i32(buff, sizeof(buff), false));
    printf("src1 hi: %s\n", src1.ToString_i32(buff, sizeof(buff), true));
    printf("src2 lo: %s\n", src2.ToString_i32(buff, sizeof(buff), false));
    printf("src2 hi: %s\n", src2.ToString_i32(buff, sizeof(buff), true));
    printf("\n");
    printf("src3 lo: %s\n", src3.ToString_x32(buff, sizeof(buff), false));
    printf("src3 hi: %s\n", src3.ToString_x32(buff, sizeof(buff), true));
    printf("\n");
    printf("des lo:  %s\n", des.ToString_i32(buff, sizeof(buff), false));
    printf("des hi:  %s\n", des.ToString_i32(buff, sizeof(buff), true));
}

// Program entry point: runs the single-precision blend demo followed by the
// byte blend demo. Always returns 0.
int main(int argc, char* argv[])
{
    // Command-line arguments are accepted but not used.
    (void)argc;
    (void)argv;

    AvxBlendFloat();
    AvxBlendByte();

    return 0;
}
avxblend.asm
; Name:		avxblend.asm
;
; Build:	g++ -c -m32 main.cpp -o main.o
;			nasm -f elf32 -o avxblend.o avxblend.asm
;			g++ -m32 -o avxblend avxblend.o main.o
;
; Source:	Modern x86 Assembly Language Programming p. 453

global AvxBlendFloat
global AvxBlendByte

section .text

; extern "C" void AvxBlendFloat(YmmVal* des, YmmVal* src1, YmmVal* src2, YmmVal* src3);
;
; Description:  The following function demonstrates use of the vblendvps
;               instruction using YMM registers.
;
; Arguments:    32-bit cdecl, addressed relative to ebp after the prologue
;               (arguments are pushed right to left, so they appear in
;               prototype order above the return address):
;                 [ebp+8]   des   - receives the blend result
;                 [ebp+12]  src1  - elements taken where the control MSB is 0
;                 [ebp+16]  src2  - elements taken where the control MSB is 1
;                 [ebp+20]  src3  - per-doubleword control values
;
; Notes:        All four pointers must be 32-byte aligned because aligned
;               256-bit moves (vmovaps/vmovdqa) are used.
;               Clobbers eax, ecx, edx and ymm0-ymm3.
;
; Requires:     AVX

%define des  [ebp+8]
%define src1 [ebp+12]
%define src2 [ebp+16]
%define src3 [ebp+20]

AvxBlendFloat:
    push      ebp
    mov       ebp,esp

; Load argument values
    mov       eax,src1                  ;eax = ptr to src1
    mov       ecx,src2                  ;ecx = ptr to src2
    mov       edx,src3                  ;edx = ptr to src3

    vmovaps   ymm1,[eax]                ;ymm1 = src1
    vmovaps   ymm2,[ecx]                ;ymm2 = src2
    vmovdqa   ymm3,[edx]                ;ymm3 = src3 (blend control)

; Perform variable SPFP blend
    vblendvps ymm0,ymm1,ymm2,ymm3       ;ymm0 = blend result
    mov       eax,des                   ;eax = ptr to des
    vmovaps   [eax],ymm0                ;save blend result

    vzeroupper                          ;avoid AVX/SSE transition penalty
    pop       ebp
    ret

; extern "C" void AvxBlendByte(YmmVal* des, YmmVal* src1, YmmVal* src2, YmmVal* src3);
;
; Description:  The following function demonstrates use of the vpblendvb
;               instruction.
;
; Arguments:    32-bit cdecl, addressed relative to ebp after the prologue
;               (arguments are pushed right to left, so they appear in
;               prototype order above the return address):
;                 [ebp+8]   des   - receives the blend result
;                 [ebp+12]  src1  - bytes taken where the control byte MSB is 0
;                 [ebp+16]  src2  - bytes taken where the control byte MSB is 1
;                 [ebp+20]  src3  - per-byte control values
;
; Notes:        All four pointers must be 32-byte aligned because aligned
;               256-bit moves (vmovdqa) are used.
;               Clobbers eax, ecx, edx and ymm0-ymm3.
;
; Requires:     AVX2

%define des  [ebp+8]
%define src1 [ebp+12]
%define src2 [ebp+16]
%define src3 [ebp+20]

AvxBlendByte:
    push      ebp
    mov       ebp,esp

; Load argument values
    mov       eax,src1                  ;eax = ptr to src1
    mov       ecx,src2                  ;ecx = ptr to src2
    mov       edx,src3                  ;edx = ptr to src3

    vmovdqa   ymm1,[eax]                ;ymm1 = src1
    vmovdqa   ymm2,[ecx]                ;ymm2 = src2
    vmovdqa   ymm3,[edx]                ;ymm3 = src3 (blend control)

; Perform variable byte blend
    vpblendvb ymm0,ymm1,ymm2,ymm3       ;ymm0 = blend result
    mov       eax,des                   ;eax = ptr to des
    vmovdqa   [eax],ymm0                ;save blend result

    vzeroupper                          ;avoid AVX/SSE transition penalty
    pop       ebp
    ret
build
# Compile the C++ driver as a 32-bit object file.
gcc -m32 -c main.cpp -o main.o
# Assemble the blend routines as a 32-bit ELF object file.
nasm -f elf32 -o avxblend.o avxblend.asm
# Link the 32-bit executable. ../../commonfiles/ymmval.o is linked in but is
# not built by these commands, so it must already exist.
g++ -m32 -o avxblend avxblend.o main.o ../../commonfiles/ymmval.o