This program is still to be converted. Because the Windows source uses ThreadTimer::SetThreadAffinityMask(), this will take some time.
ssepackedintegerthreshold.asm
; Name:     ssepackedintegerthreshold.asm
;
; Build:    g++ -c -m32 main.cpp -o main.o
;           nasm -f elf32 -o ssepackedintegerthreshold.o ssepackedintegerthreshold.asm
;           g++ -m32 -o ssepackedintegerthreshold ssepackedintegerthreshold.o main.o ../../commonfiles/xmmval.o
;
; Source:   Modern x86 Assembly Language Programming p. 279
;
; Remark:   Until ImageBuffer.cpp is converted, this example is of little use.

extern NUM_PIXELS_MAX

global SsePiThreshold
global SsePiCalcMean

; Image threshold data structure (see SsePackedIntegerThreshold.h)
;
; Parameter/result block shared by SsePiThreshold and SsePiCalcMean.
; The layout below is 32 bytes in total; it presumably has to match the
; C/C++ declaration in the header named above (not visible here - verify
; whenever either side changes).
struc ITD
    .PbSrc:             resd 1      ;+0  pointer to the source pixels (read by both routines)
    .PbMask:            resd 1      ;+4  pointer to the mask (written by SsePiThreshold, read by SsePiCalcMean)
    .NumPixels:         resd 1      ;+8  pixel count, validated against NUM_PIXELS_MAX
    .Threshold:         resb 1      ;+12 threshold value (one byte)
    .Pad:               resb 3      ;+13 padding so the next field starts at +16
    .NumMaskedPixels:   resd 1      ;+16 result: count stored by SsePiCalcMean
    .SumMaskedPixels:   resd 1      ;+20 result: sum stored by SsePiCalcMean
    .MeanMaskedPixels:  resq 1      ;+24 result: double stored by SsePiCalcMean (-1.0 if no pixel was masked)
endstruc

; Constants used by the SSE routines below. They are only ever read, so
; they live in .rodata rather than in writable .data. PixelScale and
; CountPixelsMask are loaded with movdqa and therefore must be 16-byte
; aligned: the section alignment is requested explicitly (NASM's ELF
; default for .rodata/.data is smaller) in addition to the align directive.
section .rodata align=16
align 16

PixelScale:      times 16 db 0x80            ;uint8 to int8 scale value
CountPixelsMask: times 16 db 0x01            ;mask to count pixels
R8_MinusOne:     dq -1.0                     ;invalid mean value
                
section .text

; extern "C" bool SsePiThreshold(ITD* itd);
;
; Description:  The following function performs image thresholding
;               of an 8 bits-per-pixel grayscale image. For every source
;               pixel the corresponding mask byte is set to 0xff when the
;               pixel is greater than itd->Threshold and to 0x00 otherwise.
;
; ABI:          32-bit cdecl; the single argument is read from [ebp+8].
;
; Inputs:       itd->PbSrc      source pixels, 16-byte aligned
;               itd->PbMask     destination mask, 16-byte aligned
;               itd->NumPixels  non-zero, multiple of 16, <= NUM_PIXELS_MAX
;               itd->Threshold  threshold value (one byte)
;
; Returns:      0 = NULL pointer, invalid size or unaligned image buffer
;               1 = success
;
; Clobbers:     eax, ecx, edx, xmm0-xmm2, flags (esi and edi are preserved)
;
; Requires:     SSSE3

%define itd [ebp+8]

SsePiThreshold:
    push    ebp
    mov     ebp,esp
    push    esi
    push    edi
; Load and verify the argument values in ITD structure.
; eax stays 0 (the error code) until every check has passed.
    xor     eax,eax                         ;set error return code
    mov     edx,itd                         ;edx = 'itd'
    test    edx,edx
    jz      .done                           ;jump if itd == NULL
    mov     ecx,[edx+ITD.NumPixels]         ;ecx = NumPixels
    test    ecx,ecx
    jz      .done                           ;jump if num_pixels == 0
    cmp     ecx,[NUM_PIXELS_MAX]
    ja      .done                           ;jump if num_pixels too big
    test    ecx,0fh
    jnz     .done                           ;jump if num_pixels % 16 != 0
    shr     ecx,4                           ;ecx = number of packed pixels
    mov     esi,[edx+ITD.PbSrc]             ;esi = PbSrc
    test    esi,esi
    jz      .done                           ;jump if PbSrc == NULL (would pass the alignment test)
    test    esi,0fh
    jnz     .done                           ;jump if misaligned
    mov     edi,[edx+ITD.PbMask]            ;edi = PbMask
    test    edi,edi
    jz      .done                           ;jump if PbMask == NULL (would pass the alignment test)
    test    edi,0fh
    jnz     .done                           ;jump if misaligned
; Initialize packed threshold.
; pcmpgtb is a signed compare, so both the threshold and the pixels are
; biased by 0x80 first; that keeps the ordering of the unsigned values.
    movzx   eax,byte[edx+ITD.Threshold]     ;eax = threshold
    movd    xmm1,eax                        ;xmm1[7:0] = threshold
    pxor    xmm0,xmm0                       ;all-zero shuffle control
    pshufb  xmm1,xmm0                       ;broadcast byte 0: xmm1 = packed threshold
    movdqa  xmm2,[PixelScale]
    psubb   xmm1,xmm2                       ;xmm1 = scaled threshold
; Create the mask image, 16 pixels per iteration
.maskLoop:
    movdqa  xmm0,[esi]                      ;load next packed pixel
    psubb   xmm0,xmm2                       ;xmm0 = scaled image pixels
    pcmpgtb xmm0,xmm1                       ;0xff where pixel > threshold
    movdqa  [edi],xmm0                      ;save packed threshold mask
    add     esi,16
    add     edi,16
    dec     ecx
    jnz     .maskLoop                       ;repeat until done
    mov     eax,1                           ;set return code
.done:
    pop     edi
    pop     esi
    pop     ebp
    ret

; extern "C" bool SsePiCalcMean(ITD* itd);
;
; Description:  The following function calculates the mean value of all
;               above-threshold image pixels using the mask created by
;               the function SsePiThreshold.
;
; ABI:          32-bit cdecl; the single argument is read from [ebp+8].
;
; Inputs:       itd->PbSrc      source pixels, 16-byte aligned
;               itd->PbMask     mask bytes, 16-byte aligned
;               itd->NumPixels  non-zero, multiple of 16, <= NUM_PIXELS_MAX
;
; Outputs:      itd->NumMaskedPixels   number of mask bytes with bit 0 set
;               itd->SumMaskedPixels   32-bit sum of (pixel AND mask)
;               itd->MeanMaskedPixels  sum / count as a double, or -1.0
;                                      when no pixel is masked
;
; Returns:      0 = NULL pointer, invalid image size or unaligned image buffer
;               1 = success
;
; Clobbers:     eax, ecx, edx, xmm0-xmm7, flags (ebx, esi, edi are preserved)
;
; Notes:        The sum and the count are treated as unsigned 32-bit values
;               when the mean is computed, so a sum of 2^31 or more (about
;               8.4 million masked 0xff pixels) still gives a positive mean.
;               The sum itself is only 32 bits wide; whether it can wrap
;               depends on NUM_PIXELS_MAX, which is defined elsewhere.
;
; Requires:     SSSE3

%define itd [ebp+8]

; High dword of the double 2^52 (0x4330000000000000). OR-ing a 32-bit
; unsigned value into the low dword of that bit pattern gives the double
; 2^52 + value; subtracting 2^52 again yields the value exactly.
U32_TO_R8_BIAS_HI equ 0x43300000

SsePiCalcMean:
    push    ebp
    mov     ebp,esp
    push    ebx
    push    esi
    push    edi

; Load and verify the argument values in ITD structure
    mov     eax,itd                             ;eax = 'itd'
    test    eax,eax
    jz      .error                              ;jump if itd == NULL
    mov     ecx,[eax+ITD.NumPixels]             ;ecx = NumPixels
    test    ecx,ecx
    jz      .error                              ;jump if num_pixels == 0
    cmp     ecx,[NUM_PIXELS_MAX]
    ja      .error                              ;jump if num_pixels too big
    test    ecx,0x0f
    jnz     .error                              ;jump if num_pixels % 16 != 0
    shr     ecx,4                               ;ecx = number of packed pixels
    mov     edi,[eax+ITD.PbMask]                ;edi = PbMask
    test    edi,edi
    jz      .error                              ;jump if PbMask == NULL (would pass the alignment test)
    test    edi,0x0f
    jnz     .error                              ;jump if PbMask not aligned
    mov     esi,[eax+ITD.PbSrc]                 ;esi = PbSrc
    test    esi,esi
    jz      .error                              ;jump if PbSrc == NULL (would pass the alignment test)
    test    esi,0x0f
    jnz     .error                              ;jump if PbSrc not aligned
; Initialize values for mean calculation
    xor     edx,edx                             ;edx = update counter
    pxor    xmm7,xmm7                           ;xmm7 = packed zero
    pxor    xmm2,xmm2                           ;xmm2 = sum_masked_pixels (8 words)
    pxor    xmm3,xmm3                           ;xmm3 = sum_masked_pixels (8 words)
    pxor    xmm4,xmm4                           ;xmm4 = sum_masked_pixels (4 dwords)
    pxor    xmm6,xmm6                           ;xmm6 = num_masked_pixels (16 bytes)
    xor     ebx,ebx                             ;ebx = num_masked_pixels (1 dword)
; Register usage for processing loop
; esi = PbSrc, edi = PbMask, eax = itd
; ebx = num_pixels_masked, ecx = NumPixels / 16, edx = update counter
;
; xmm0 = packed pixel, xmm1 = packed mask
; xmm3:xmm2 = sum_masked_pixels (16 words)
; xmm4 = sum_masked_pixels (4 dwords)
; xmm5 = scratch register
; xmm6 = packed num_masked_pixels
; xmm7 = packed zero
.pixelLoop:
    movdqa    xmm0,[esi]                        ;load next packed pixel
    movdqa    xmm1,[edi]                        ;load next packed mask
; Update num_masked_pixels (byte values) and sum_masked_pixels (word values)
    movdqa    xmm5,[CountPixelsMask]            ;reloaded each pass: xmm5 is a temp of the update helper
    pand      xmm5,xmm1                         ;1 in every masked lane, 0 elsewhere
    paddb     xmm6,xmm5                         ;update num_masked_pixels
    pand      xmm0,xmm1                         ;set non-masked pixels to zero
    movdqa    xmm1,xmm0
    punpcklbw xmm0,xmm7
    punpckhbw xmm1,xmm7                         ;xmm1:xmm0 = masked pixels (words)
    paddw     xmm2,xmm0
    paddw     xmm3,xmm1                         ;xmm3:xmm2 = sum_masked_pixels
; Flush the narrow accumulators every 255 iterations: a byte counter can
; reach at most 255 and a word sum at most 255*255 by then, so neither
; the bytes in xmm6 nor the words in xmm3:xmm2 can overflow.
    inc     edx
    cmp     edx,255
    jb      .noUpdate
    call    SsePiCalcMeanUpdateSums             ;also resets edx to 0
.noUpdate:
    add     esi,16
    add     edi,16
    dec     ecx
    jnz     .pixelLoop                          ;repeat loop until done
; Main processing loop is finished. If necessary, perform final update
; of sum_masked_pixels in xmm4 & num_masked_pixels in ebx.
    test    edx,edx
    jz      .finish
    call    SsePiCalcMeanUpdateSums
; Compute and save final sum_masked_pixels & num_masked_pixels
.finish:
    phaddd  xmm4,xmm7
    phaddd  xmm4,xmm7                           ;xmm4[31:0] = sum of the 4 dwords
    movd    edx,xmm4                            ;edx = final sum_mask_pixels
    mov     [eax+ITD.SumMaskedPixels],edx       ;save final sum_masked_pixels
    mov     [eax+ITD.NumMaskedPixels],ebx       ;save final num_masked_pixels
; Compute mean of masked pixels
    test    ebx,ebx                             ;is num_mask_pixels zero?
    jz      .noMean                             ;if yes, skip calc of mean
; Convert both values as UNSIGNED 32-bit integers. cvtsi2sd with a 32-bit
; source is a signed conversion and would turn a sum >= 2^31 negative.
    movd    xmm0,edx                            ;xmm0[31:0] = sum, upper bits zero
    movd    xmm1,ebx                            ;xmm1[31:0] = count, upper bits zero
    mov     edx,U32_TO_R8_BIAS_HI
    movd    xmm5,edx
    psllq   xmm5,32                             ;xmm5[63:0] = 2^52 as a double
    por     xmm0,xmm5                           ;xmm0 = 2^52 + sum
    por     xmm1,xmm5                           ;xmm1 = 2^52 + count
    subsd   xmm0,xmm5                           ;xmm0 = (double)sum_masked_pixels
    subsd   xmm1,xmm5                           ;xmm1 = (double)num_masked_pixels
    divsd   xmm0,xmm1                           ;xmm0 = mean_masked_pixels
    jmp     .saveMean
.noMean:
    movsd   xmm0,[R8_MinusOne]                  ;use -1.0 for no mean
.saveMean:
    movsd   [eax+ITD.MeanMaskedPixels],xmm0     ;save mean
    mov     eax,1                               ;set return code
.done:
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    ret
.error:
    xor     eax,eax                             ;set error return code
    jmp     .done

; void SsePiCalcMeanUpdateSums
;
; Description:  Register-interface helper called from SsePiCalcMean (it
;               takes no stack arguments). It folds the narrow running
;               totals into the wide ones - word sums into the dword sums
;               in xmm4, byte counts into ebx - and then clears the narrow
;               accumulators and the update counter so that they cannot
;               overflow.
;
; On entry:
;   xmm3:xmm2 = packed word sum_masked_pixels
;   xmm4      = packed dword sum_masked_pixels
;   xmm6      = packed byte num_masked_pixels
;   xmm7      = packed zero
;   ebx       = num_masked_pixels
;
; On exit:
;   xmm4, ebx = updated totals
;   xmm2, xmm3, xmm6, edx = 0
;
; Temp registers:
;   xmm0, xmm1, xmm5, edx

SsePiCalcMeanUpdateSums:

; Zero-extend the 16 word sums to dwords (interleave with xmm7 = 0) and
; add each group of four to the dword totals in xmm4.
    movdqa      xmm0,xmm2
    punpcklwd   xmm0,xmm7       ;xmm0 = low  4 words of xmm2 as dwords
    punpckhwd   xmm2,xmm7       ;xmm2 = high 4 words of xmm2 as dwords
    paddd       xmm4,xmm0
    paddd       xmm4,xmm2
    movdqa      xmm1,xmm3
    punpcklwd   xmm1,xmm7       ;xmm1 = low  4 words of xmm3 as dwords
    punpckhwd   xmm3,xmm7       ;xmm3 = high 4 words of xmm3 as dwords
    paddd       xmm4,xmm1
    paddd       xmm4,xmm3       ;xmm4 = packed sum_masked_pixels

; Zero-extend the 16 byte counts to words, reduce them to one word and
; add that word to the running count in ebx.
    movdqa      xmm5,xmm6
    punpckhbw   xmm6,xmm7       ;xmm6 = high 8 counts as words
    punpcklbw   xmm5,xmm7       ;xmm5 = low  8 counts as words
    paddw       xmm6,xmm5       ;xmm6 = 8 partial counts
    phaddw      xmm6,xmm7       ;8 -> 4 partial counts
    phaddw      xmm6,xmm7       ;4 -> 2 partial counts
    phaddw      xmm6,xmm7       ;xmm6[15:0] = final word sum, xmm6[31:16] = 0
    movd        edx,xmm6
    add         ebx,edx         ;ebx = num_masked_pixels

; Clear the narrow accumulators and the caller's update counter
    pxor        xmm6,xmm6
    pxor        xmm3,xmm3
    pxor        xmm2,xmm2
    xor         edx,edx
    ret
build
nasm -f elf32 -o ssepackedintegerthreshold.o ssepackedintegerthreshold.asm