This program is still to be converted. Because the Windows source uses ThreadTimer::SetThreadAffinityMask(), this will take some time.
ssepackedintegerhistogram.asm
; Name: ssepackedintegerhistogram.asm
;
; Build: g++ -c -m32 main.cpp -o main.o
; nasm -f elf32 -o ssepackedintegerhistogram.o ssepackedintegerhistogram.asm
; g++ -m32 -o ssepackedintegerhistogram ssepackedintegerhistogram.o main.o ../../commonfiles/xmmval.o
;
; Source: Modern x86 Assembly Language Programming p. 279
;
; Remark: Until ImageBuffer.cpp is converted, this example is of little use.
extern NUM_PIXELS_MAX
global SsePiHistogram
section .text
; extern bool SsePiHistogram(Uint32* histo, const Uint8* pixel_buff, Uint32 num_pixels);
;
; Description: The following function builds an image histogram.
;
; Returns: 0 = invalid argument value
; 1 = success
;
; Requires: SSE4.1
SsePiHistogram:
push ebp
mov ebp,esp
and esp,0FFFFFFF0H ;align ESP to 16 byte boundary
sub esp,1024 ;allocate histo2
mov edx,esp ;edx = histo2
push ebx
push esi
push edi
; Make sure num_pixels is valid
xor eax,eax ;set error return code
mov ecx,[ebp+16] ;ecx = num_pixels
cmp ecx,[NUM_PIXELS_MAX]
ja .done ;jump if num_pixels too big
test ecx,1fh
jnz .done ;jump if num_pixels % 32 != 0
; Make sure histo & pixel_buff are properly aligned
mov ebx,[ebp+8] ;ebx = histo
test ebx,0fh
jnz .done ;jump if misaligned
mov esi,[ebp+12] ;esi = pixel_buff
test esi,0fh
jnz .done ;jump if misaligned
; Initialize the histogram buffers (set all entries to zero)
mov edi,ebx ;edi = histo
mov ecx,256
rep stosd ;initialize histo
mov edi,edx ;edi = histo2
mov ecx,256
rep stosd ;initialize histo2
; Perform processing loop initializations
mov edi,edx ;edi = histo2
mov ecx,[ebp+16] ;ecx = number of pxiels
shr ecx,5 ;ecx = number of pixel blocks
; Build the histograms
; Register usage: ebx = histo, edi = histo2, esi = pixel_buff
align 16 ;align jump target
.@1:
movdqa xmm0,[esi] ;load pixel block
movdqa xmm2,[esi+16] ;load pixel block
movdqa xmm1,xmm0
movdqa xmm3,xmm2
; Process pixels 0 - 3
pextrb eax,xmm0,0 ;extract & count pixel 0
add dword[ebx+eax*4],1
pextrb edx,xmm1,1 ;extract & count pixel 1
add dword[edi+edx*4],1
pextrb eax,xmm0,2 ;extract & count pixel 2
add dword[ebx+eax*4],1
pextrb edx,xmm1,3 ;extract & count pixel 3
add dword[edi+edx*4],1
; Process pixels 4 - 7
pextrb eax,xmm0,4 ;extract & count pixel 4
add dword[ebx+eax*4],1
pextrb edx,xmm1,5 ;extract & count pixel 5
add dword[edi+edx*4],1
pextrb eax,xmm0,6 ;extract & count pixel 6
add dword[ebx+eax*4],1
pextrb edx,xmm1,7 ;extract & count pixel 7
add dword[edi+edx*4],1
; Process pixels 8 - 11
pextrb eax,xmm0,8 ;extract & count pixel 8
add dword[ebx+eax*4],1
pextrb edx,xmm1,9 ;extract & count pixel 9
add dword[edi+edx*4],1
pextrb eax,xmm0,10 ;extract & count pixel 10
add dword[ebx+eax*4],1
pextrb edx,xmm1,11 ;extract & count pixel 11
add dword[edi+edx*4],1
; Process pixels 12 - 15
pextrb eax,xmm0,12 ;extract & count pixel 12
add dword[ebx+eax*4],1
pextrb edx,xmm1,13 ;extract & count pixel 13
add dword[edi+edx*4],1
pextrb eax,xmm0,14 ;extract & count pixel 14
add dword[ebx+eax*4],1
pextrb edx,xmm1,15 ;extract & count pixel 15
add dword[edi+edx*4],1
; Process pixels 16 - 19
pextrb eax,xmm2,0 ;extract & count pixel 16
add dword[ebx+eax*4],1
pextrb edx,xmm3,1 ;extract & count pixel 17
add dword[edi+edx*4],1
pextrb eax,xmm2,2 ;extract & count pixel 18
add dword[ebx+eax*4],1
pextrb edx,xmm3,3 ;extract & count pixel 19
add dword[edi+edx*4],1
; Process pixels 20 - 23
pextrb eax,xmm2,4 ;extract & count pixel 20
add dword[ebx+eax*4],1
pextrb edx,xmm3,5 ;extract & count pixel 21
add dword[edi+edx*4],1
pextrb eax,xmm2,6 ;extract & count pixel 22
add dword[ebx+eax*4],1
pextrb edx,xmm3,7 ;extract & count pixel 23
add dword[edi+edx*4],1
; Process pixels 24 - 27
pextrb eax,xmm2,8 ;extract & count pixel 24
add dword[ebx+eax*4],1
pextrb edx,xmm3,9 ;extract & count pixel 25
add dword[edi+edx*4],1
pextrb eax,xmm2,10 ;extract & count pixel 26
add dword[ebx+eax*4],1
pextrb edx,xmm3,11 ;extract & count pixel 27
add dword[edi+edx*4],1
; Process pixels 28 - 31
pextrb eax,xmm2,12 ;extract & count pixel 28
add dword[ebx+eax*4],1
pextrb edx,xmm3,13 ;extract & count pixel 29
add dword[edi+edx*4],1
pextrb eax,xmm2,14 ;extract & count pixel 30
add dword[ebx+eax*4],1
pextrb edx,xmm3,15 ;extract & count pixel 31
add dword[edi+edx*4],1
add esi,32 ;esi =next pixel block
sub ecx,1 ;update counter
jnz .@1 ;repeat loop if not done
; Add histo2 to histo for final histogram. Note that each loop iteration
; adds 8 histogram entries.
mov ecx,32 ;ecx = number of iterations
xor eax,eax ;eax = offset for histo arrays
.@2:
movdqa xmm0,[ebx+eax] ;load histo counts
movdqa xmm1,[ebx+eax+16]
paddd xmm0,[edi+eax] ;add counts from histo2
paddd xmm1,[edi+eax+16]
movdqa [ebx+eax],xmm0 ;save final histo counts
movdqa [ebx+eax+16],xmm1
add eax,32 ;update array offset
sub ecx,1 ;update counter
jnz .@2 ;repeat loop if not done
mov eax,1 ;set success return code
.done:
pop edi
pop esi
pop ebx
mov esp,ebp
pop ebp
ret
build
nasm -f elf32 -o ssepackedintegerhistogram.o ssepackedintegerhistogram.asm