; NOTE: This program is still to be converted. The Windows source depends on ImageBuffer.cpp, ImageBuffer.h and gdilib, so the conversion will take some time.
; File: avxpackedintegerthreshold.asm
; Name:     avxpackedintegerthreshold.asm
;
; Build:    nasm -f elf32 -o avxpackedintegerthreshold.o avxpackedintegerthreshold.asm
;
; Source:   Modern x86 Assembly Language Programming p. 425
;
; Remark:   Until ImageBuffer.cpp is converted, this example is of little use.

extern NUM_PIXELS_MAX
global AvxPiThreshold
global AvxPiCalcMean

; ITD - image threshold descriptor shared with the C/C++ caller
; (see AvxPackedIntegerThreshold.h). Field order and sizes below fix
; the byte offsets used by the functions in this file.
struc ITD
    .PbSrc:             resd 1      ;+0   address loaded as the pixel source
    .PbMask:            resd 1      ;+4   address loaded as the mask buffer
    .NumPixels:         resd 1      ;+8   pixel count (validated by callers)
    .Threshold:         resb 1      ;+12  single-byte threshold value
    .Pad:               resb 3      ;+13  filler up to the next dword
    .NumMaskedPixels:   resd 1      ;+16  written by AvxPiCalcMean
    .SumMaskedPixels:   resd 1      ;+20  written by AvxPiCalcMean
    .MeanMaskedPixels:  resq 1      ;+24  8-byte scalar stored by AvxPiCalcMean
endstruc

; Macro AvxPiCalcMeanUpdateSums
;
; Description:  Folds the packed word partial sums held in ymm3:ymm2 into
;               the packed dword totals in ymm4, then clears the word
;               accumulators and the update counter so the caller can
;               keep accumulating without overflowing a word.
;
; On entry:
;   ymm3:ymm2 = packed word sum_masked_pixels
;   ymm4      = packed dword sum_masked_pixels
;   ymm7      = packed zero
;
; On exit:
;   ymm4      = previous ymm4 plus the zero-extended words of ymm2 and ymm3
;   ymm2,ymm3 = zero
;   edx       = 0 (update counter)
;
; Clobbers:     ymm0, ymm1, ymm5, ymm6, flags

%macro AvxPiCalcMeanUpdateSums 0

; Low words: interleaving with the zero register widens each word to a
; dword; add both accumulators together and fold them into ymm4
    vpunpcklwd ymm0,ymm2,ymm7
    vpunpcklwd ymm1,ymm3,ymm7
    vpaddd     ymm0,ymm0,ymm1
    vpaddd     ymm4,ymm4,ymm0

; High words: same treatment
    vpunpckhwd ymm5,ymm2,ymm7
    vpunpckhwd ymm6,ymm3,ymm7
    vpaddd     ymm5,ymm5,ymm6
    vpaddd     ymm4,ymm4,ymm5

; Start a fresh accumulation window
    vpxor      ymm2,ymm2,ymm2            ;clear word sums (first half)
    vpxor      ymm3,ymm3,ymm3            ;clear word sums (second half)
    xor        edx,edx                   ;clear update counter
%endmacro


; Constant values used by the functions in this file
section .data

; PixelScale is fetched with a 256-bit load in AvxPiThreshold, so it is
; placed on a 32-byte boundary (NASM's align macro also requests matching
; section alignment through SECTALIGN).
;
; The value must be 80h: subtracting 80h (mod 256) from an unsigned byte
; maps 0..255 onto -128..127 while preserving order, which is what lets the
; signed compare vpcmpgtb rank uint8 pixels against a uint8 threshold.
; A scale of 0 would leave the bytes unbiased and misorder values >= 80h.
                align 32, db 0
PixelScale      times 32 db 80h         ;uint8 to int8 scale value
R8_MinusOne     dq -1.0                 ;invalid mean value

section .text

; extern "C" bool AvxPiThreshold(ITD* itd);
;
; Description:  Builds a byte mask for an 8 bits-per-pixel grayscale image.
;               For every source byte the matching mask byte is set to
;               0FFh when (pixel - PixelScale) > (Threshold - PixelScale)
;               under a signed byte compare, and to 00h otherwise.
;               Pixels are processed 32 at a time.
;
; ABI:          32-bit cdecl; [ebp+8] = itd
;
; Validation:   NumPixels must be non-zero, not above [NUM_PIXELS_MAX] and
;               a multiple of 32; PbSrc and PbMask must be 32-byte aligned.
;
; Returns:      eax = 0  invalid size or unaligned image buffer
;               eax = 1  success
;
; Clobbers:     ecx, edx, ymm0-ymm3, ymm7, flags (ebp, esi, edi preserved)
;
; Requires:     AVX2

AvxPiThreshold:
    push    ebp
    mov     ebp,esp
    push    esi
    push    edi

; Load and verify the argument values in ITD structure.
; eax stays 0 (failure) until the whole image has been processed.
    mov     edx,[ebp+8]                     ;edx = 'itd'
    xor     eax,eax                         ;set error return code
    mov     ecx,[edx+ITD.NumPixels]         ;ecx = NumPixels
    test    ecx,ecx
    jz      .done                           ;jump if num_pixels == 0
    cmp     ecx,[NUM_PIXELS_MAX]
    ja      .done                           ;jump if num_pixels too big
    test    ecx,1fh
    jnz     .done                           ;jump if num_pixels % 32 != 0
    shr     ecx,5                           ;ecx = number of 32-pixel blocks

    mov     esi,[edx+ITD.PbSrc]             ;esi = PbSrc
    test    esi,1fh
    jnz     .done                           ;jump if misaligned
    mov     edi,[edx+ITD.PbMask]            ;edi = PbMask
    test    edi,1fh
    jnz     .done                           ;jump if misaligned

; Initialize packed threshold.
; PixelScale is read with an unaligned-tolerant load because nothing in
; this function can guarantee the constant's placement in memory.
    vpbroadcastb ymm0,[edx+ITD.Threshold]   ;ymm0 = packed threshold
    vmovdqu  ymm7,[PixelScale]              ;ymm7 = uint8 to int8 SF
    vpsubb   ymm2,ymm0,ymm7                 ;ymm2 = scaled threshold

; Create the mask image (ecx > 0 here, so the loop runs at least once)
.nextBlock:
    vmovdqa  ymm0,[esi]                     ;load next 32 pixels
    vpsubb   ymm1,ymm0,ymm7                 ;ymm1 = scaled image pixels
    vpcmpgtb ymm3,ymm1,ymm2                 ;0FFh where pixel > threshold
    vmovdqa  [edi],ymm3                     ;save packed threshold mask

    add     esi,32
    add     edi,32
    dec     ecx
    jnz     .nextBlock                      ;repeat until done
    mov     eax,1                           ;set return code

; Clear the upper ymm halves before returning to the caller, as
; AvxPiCalcMean does. The failure paths never execute an AVX instruction,
; so they can skip this.
    vzeroupper

.done:
    pop     edi
    pop     esi
    pop     ebp
    ret

; extern "C" bool AvxPiCalcMean(ITD* itd);
;
; Description:  Calculates the mean value of all masked image pixels,
;               using the mask bytes found at PbMask (the kind of mask
;               AvxPiThreshold writes). On success it stores
;               NumMaskedPixels, SumMaskedPixels and MeanMaskedPixels in
;               the ITD; the mean is -1.0 when no pixel is masked.
;
; ABI:          32-bit cdecl; [ebp+8] = itd
;
; Validation:   NumPixels must be non-zero, not above [NUM_PIXELS_MAX] and
;               a multiple of 32; PbMask and PbSrc must be 32-byte aligned.
;
; Returns:      eax = 0  invalid image size or unaligned image buffer
;               eax = 1  success
;
; Clobbers:     ecx, edx, ymm0-ymm7, flags (ebp, ebx, esi, edi preserved)
;
; Requires:     AVX2, POPCNT
;
; NOTE(review): the pixel sum is accumulated in 32 bits. Whether it can
;               wrap depends on the value of NUM_PIXELS_MAX, which is
;               defined outside this file - confirm the limit.

AvxPiCalcMean:
    push    ebp
    mov     ebp,esp
    push    ebx
    push    esi
    push    edi

; Load and verify the argument values in ITD structure
    mov     eax,[ebp+8]                     ;eax = 'itd'
    mov     ecx,[eax+ITD.NumPixels]         ;ecx = NumPixels
    test    ecx,ecx
    jz      .error                          ;jump if num_pixels == 0
    cmp     ecx,[NUM_PIXELS_MAX]
    ja      .error                          ;jump if num_pixels too big
    test    ecx,1fh
    jnz     .error                          ;jump if num_pixels % 32 != 0
    shr     ecx,5                           ;ecx = number of 32-pixel blocks

    mov     edi,[eax+ITD.PbMask]            ;edi = PbMask
    test    edi,1fh
    jnz     .error                          ;jump if PbMask not aligned
    mov     esi,[eax+ITD.PbSrc]             ;esi = PbSrc
    test    esi,1fh
    jnz     .error                          ;jump if PbSrc not aligned

; Initialize values for mean calculation
    xor     edx,edx                         ;edx = update counter
    vpxor   ymm7,ymm7,ymm7                  ;ymm7 = packed zero
    vmovdqa ymm2,ymm7                       ;ymm2 = sum_masked_pixels (16 words)
    vmovdqa ymm3,ymm7                       ;ymm3 = sum_masked_pixels (16 words)
    vmovdqa ymm4,ymm7                       ;ymm4 = sum_masked_pixels (8 dwords)
    xor     ebx,ebx                         ;ebx = num_masked_pixels (1 dword)

; Register usage for processing loop
; esi = PbSrc, edi = PbMask, eax = scratch register
; ebx = num_masked_pixels, ecx = NumPixels / 32, edx = update counter
;
; ymm0 = packed pixel, ymm1 = packed mask
; ymm3:ymm2 = sum_masked_pixels (32 words)
; ymm4 = sum_masked_pixels (8 dwords)
; ymm5 = scratch register
; ymm6 = scratch register
; ymm7 = packed zero

.pixelLoop:
    vmovdqa     ymm0,[esi]                  ;load next packed pixel
    vmovdqa     ymm1,[edi]                  ;load next packed mask

; Update num_masked_pixels: vpmovmskb gathers the top bit of each of the
; 32 mask bytes, popcnt counts how many of them are set
    vpmovmskb   eax,ymm1
    popcnt      eax,eax
    add         ebx,eax

; Update sum_masked_pixels (word values)
    vpand       ymm6,ymm0,ymm1              ;set non-masked pixels to zero
    vpunpcklbw  ymm0,ymm6,ymm7
    vpunpckhbw  ymm1,ymm6,ymm7              ;ymm1:ymm0 = masked pixels (words)
    vpaddw      ymm2,ymm2,ymm0
    vpaddw      ymm3,ymm3,ymm1              ;ymm3:ymm2 = sum_masked_pixels

; Flush the word sums into the dword sums in ymm4 every 255 blocks.
; Each block adds at most 255 to a word lane, and 255 * 255 = 65025
; still fits in 16 bits, so the word sums cannot overflow before a flush.
    inc     edx
    cmp     edx,255
    jb      .noUpdate

    AvxPiCalcMeanUpdateSums

.noUpdate:
    add     esi,32
    add     edi,32
    dec     ecx
    jnz     .pixelLoop                      ;repeat loop until done

; Main processing loop is finished. If blocks were accumulated since the
; last flush (edx != 0), fold them into ymm4 now.
    test    edx,edx
    jz      .reduce

    AvxPiCalcMeanUpdateSums

; Reduce the 8 dword sums in ymm4 to one value (ymm7 is still zero)
.reduce:
    vextracti128 xmm0,ymm4,1
    vpaddd       xmm1,xmm0,xmm4             ;4 dwords
    vphaddd      xmm2,xmm1,xmm7             ;2 dwords
    vphaddd      xmm3,xmm2,xmm7             ;1 dword
    vmovd        edx,xmm3                   ;edx = final sum_masked_pixels

    mov     eax,[ebp+8]                     ;eax = 'itd'
    mov     [eax+ITD.SumMaskedPixels],edx   ;save final sum_masked_pixels
    mov     [eax+ITD.NumMaskedPixels],ebx   ;save final num_masked_pixels

; Compute mean of masked pixels
    test    ebx,ebx                         ;is num_masked_pixels zero?
    jz      .noMean                         ;if yes, skip calc of mean

; vcvtsi2sd treats its integer source as signed, but the sum is an
; unsigned 32-bit quantity. When bit 31 is set the conversion yields
; (sum - 2^32); adding 2^32 back restores the unsigned value exactly.
    vcvtsi2sd xmm0,xmm0,edx                 ;xmm0 = (signed) sum_masked_pixels
    test    edx,edx
    jns     .sumConverted                   ;bit 31 clear: already correct
    mov     ecx,41f00000h                   ;upper dword of 2^32 as a double
    vmovd   xmm5,ecx
    vpsllq  xmm5,xmm5,32                    ;xmm5 = 4294967296.0
    vaddsd  xmm0,xmm0,xmm5                  ;xmm0 = unsigned sum_masked_pixels
.sumConverted:
    vcvtsi2sd xmm1,xmm1,ebx                 ;xmm1 = num_masked_pixels
    vdivsd  xmm0,xmm0,xmm1                  ;xmm0 = mean_masked_pixels
    jmp     .saveMean
.noMean:
    vmovsd  xmm0,[R8_MinusOne]              ;use -1.0 for no mean
.saveMean:
    vmovsd  [eax+ITD.MeanMaskedPixels],xmm0 ;save mean
    mov     eax,1                           ;set return code
    vzeroupper
.done:
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    ret

; Validation failures land here before any AVX instruction has run
.error:
    xor     eax,eax                         ;set error return code
    jmp     .done
; Build:
;   nasm -f elf32 -o avxpackedintegerthreshold.o avxpackedintegerthreshold.asm