     ```; Name:         optimizingstep1.asm;; 1st optimization step : n = Array length
;
; The bubble sort algorithm can be easily optimized by observing that
; the n-th pass finds the n-th largest element and puts it into its
; final place. So, the inner loop can avoid looking at the last n-1
; items when running for the n-th time.
;
; source: http://en.wikipedia.org/wiki/Bubble_sort#Optimizing_bubble_sort

BubbleSort:
xor       r8,r8                         ; number of iterations
xor       r9,r9                         ; number of swaps (informative)
mov       r10, array.length             ; *** ADDED ***
.repeat:
mov       rdx, FALSE                    ; isSwapped = false
mov       rcx,1                         ; i = 1
mov       rsi,array                     ; point to start of the array
.for:
lodsq                                   ; RBX = array[i]
mov       rbx, rax
lodsq                                   ; RAX = array[i+1]
cmp       rax, rbx                      ; if array[i+1] >= array[i]
jge       .next
xor       rax, rbx                      ; then swap both values
xor       rbx, rax
xor       rax, rbx
mov       qword [rsi-datasize*2], rbx   ; and store swapped values in array
mov       qword [rsi-datasize], rax
mov       rdx,TRUE                      ; isSwapped = true
inc       r9                            ; increment number of swaps
.next:
inc       r8                            ; increment number of iterations
sub       rsi,datasize                  ; adjust pointer in array
inc       rcx                           ; i++
cmp       rcx,r10                       ; if i <= arrayLength-1 *** MODIFIED ***
jle       .for                          ; next comparison
.until:
dec       r10                           ; *** ADDED ***
cmp       rdx,TRUE                      ; if isSwapped == true
je        .repeat                       ; then repeat sort algorithm```

Here below the results of this optimization

```Bubblesort Algorithm - Agguro 2012
Optimization step: n-th pass finds the n-th largest elements
The UNSORTED array:
-------------------
9223372036854775807
15421441199845202
75
15
0
-854
-7854
-48545825
-9223372036854775808

The SORTED array:
-----------------
-9223372036854775808
-48545825
-7854
-854
0
15
75
15421441199845202
8030874076713219394

Number of iterations:                   45
Number of swaps     :                   37```     ```; Name:         optimizingstep2.asm;; 2nd optimization step : no check after last swap
;
; it can happen that more than one element is placed in their final position
; on a single pass. In particular, after every pass, all elements after the
; last swap are sorted, and do not need to be checked again. This allows us to
; skip over a lot of the elements, resulting in about a worst case 50% improvement
; in comparison count (though no improvement in swap counts), and adds very little
; complexity because the new code subsumes the "swapped" variable.
;
; source: http://en.wikipedia.org/wiki/Bubble_sort#Optimizing_bubble_sort

BubbleSort:
xor       r8,r8                         ; number of iterations
xor       r9,r9                         ; number of swaps (informative)
mov       r10, array.length             ; r10 = n = arrayLength
.repeat:
mov       r11, 0                        ; r11 = newn = 0     ADDED
mov       rcx,1                         ; i = 1
mov       rsi,array                     ; point to start of the array
.for:
lodsq                                   ; RBX = array[i]
mov       rbx, rax
lodsq                                   ; RAX = array[i+1]
cmp       rax, rbx                      ; if array[i+1] >= array[i]
jge       .next
xor       rax, rbx                      ; then swap both values
xor       rbx, rax
xor       rax, rbx
mov       r11, rcx                      ; newn = i    ADDED
mov       qword [rsi-datasize*2], rbx   ; and store swapped values in array
mov       qword [rsi-datasize], rax
inc       r9                            ; increment number of swaps
.next:
inc       r8                            ; increment number of iterations
sub       rsi,datasize                  ; adjust pointer in array
inc       rcx                           ; i++
cmp       rcx, r10                      ; if i <= arrayLength-1
jle       .for
mov       r10, r11                      ; n = newn   ADDED
.until:
dec       r10
cmp       r10, 0                        ; if r10 > 0
jg        .repeat                       ; then repeat sort algorithm```

And finally the results with this optimization

```Bubblesort Algorithm - Agguro 2012
Optimization step: no check after last swap
The UNSORTED array:
-------------------
9223372036854775807
15421441199845202
75
15
0
-854
-7854
-48545825
-9223372036854775808

The SORTED array:
-----------------
-9223372036854775808
-48545825
-7854
-854
0
15
75
15421441199845202
8030874076713219394

Number of iterations:                   38
Number of swaps     :                   37```     ```; Name:         optimizingstep0.asm;; The non-optimized bubblesort algorithm
;
; source: http://en.wikipedia.org/wiki/Bubble_sort

BubbleSort:
xor       r8,r8                         ; number of iterations
xor       r9,r9                         ; number of swaps
.repeat:
mov       rdx, FALSE                    ; isSwapped = false
mov       rcx,1                         ; i = 1
mov       rsi,array                     ; point to start of the array
.for:
lodsq                                   ; RBX = array[i]
mov       rbx, rax
lodsq                                   ; RAX = array[i+1]
cmp       rax, rbx                      ; if array[i+1] >= array[i]
jge       .next
xor       rax, rbx                      ; if less then swap both values
xor       rbx, rax
xor       rax, rbx
mov       qword [rsi-datasize*2], rbx   ; and store swapped values in array
mov       qword [rsi-datasize], rax
mov       rdx,TRUE                      ; isSwapped = true
inc       r9                            ; increment number of swaps
.next:
inc       r8                            ; increment number of iterations
sub       rsi,datasize                  ; adjust pointer in array
inc       rcx                           ; i++
cmp       rcx,array.length-1            ; if i <= arrayLength-1
jle       .for                          ; next comparison
.until:
cmp       rdx,TRUE                      ; if isSwapped == true
je        .repeat                       ; then repeat sort algorithm```

Her below the results of this optimization

```Bubblesort Algorithm - Agguro 2012
The UNSORTED array:
-------------------
9223372036854775807
15421441199845202
75
15
0
-854
-7854
-48545825
-9223372036854775808

The SORTED array:
-----------------
-9223372036854775808
-48545825
-7854
-854
0
15
75
15421441199845202
9223372036854775807

Number of iterations:                   72
Number of swaps     :                   36```     ```; Name:         bubblesort.asm
;
; Build:        nasm "-felf64" bubblesort.asm -l bubblesort.lst -o bubblesort.o
;               ld -s -melf_x86_64 -o bubblesort bubblesort.o
;
; Description:  Demonstration of the BubbleSort Algorithm, with or without optimzing steps.
;               The program needs to be re-assembled and linked when changing to an
;               optimization step.
;               Source : https://en.wikipedia.org/wiki/Bubble_sort

bits 64

[list -]
%include "unistd.inc"
[list +]

; here you can choose between the 3 different steps of optimization
; O : [default] no optimization
; 1 : n-th pass finds the n-th largest elements
; 2 : no check after last swap

%define OPTIMIZE_STEP 2                 ; select 0, 1 or 2

%define TRUE      1
%define FALSE     0

%macro STRING 1
.start:     db %1
.length:    equ \$-.start
%endmacro```
```%macro ARRAY 1-*
%rep  %0
dq  %1
%rotate 1
%endrep
%endmacro

section .bss
; 64 bit integers have a range of -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
; both included, so we need a buffer of 20 bytes at least to store them
Buffer:
sign           resb      1              ; ascii sign
decimal        resb     19              ; 20 bytes to store a 64 bits number + sign

section .data

; The list is intentionally sorted in descending order to demonstrate the wurst case scenario
; and determine the swaps and iterations.

datasize:      equ       8
array:         ARRAY     9223372036854775807, 15421441199845202, 75, 15, 0, -854, -7854, -48545825, -9223372036854775808
.length        equ       (\$-array)/datasize

title:         STRING    {"Bubblesort Algorithm - Agguro 2012",10}
%if OPTIMIZE_STEP == 1
opt1:          STRING    {"Optimization step: n-th pass finds the n-th largest elements",10}
%elif OPTIMIZE_STEP == 2
opt2:          STRING    {"Optimization step: no check after last swap",10}
%endif
unsorted:      STRING    {"The UNSORTED array:",10,"-------------------",10}
sorted:        STRING    {10,"The SORTED array:",10,"-----------------",10}
iterations:    STRING    {10,"Number of iterations: "}
swaps:         STRING    {"Number of swaps     : "}
lf:            STRING    {10}

section .text
global _start

_start:
pushfq
push      rax
push      rbx
push      rcx
push      rdx
push      rsi
push      r9
push      r8
; clear the screen and print title
; if the terminal you use don't support this, modify the bytes in .data
mov       rsi, title
mov       rdx, title.length
call      Print.string
%if OPTIMIZE_STEP == 1
mov       rsi, opt1
mov       rdx, opt1.length
call      Print.string
%elif OPTIMIZE_STEP == 2
mov       rsi, opt2
mov       rdx, opt2.length
call      Print.string
%endif
; display the unsorted Array on screen
mov       rsi, unsorted
mov       rdx, unsorted.length
call      Print.string
; print the Array elements
call      ShowArray

; Here the Bubblesort algorithm starts. Depending the value of OPTIMIZE_STEP an optimized or
; non-optimized version is included. On error the default (0) is used.
%if OPTIMIZE_STEP == 1
%include "optimizingstep1.asm"
%elif OPTIMIZE_STEP == 2
%include "optimizingstep2.asm"
%elif
%warning "no OPTIMIZE_STEP defined, defaulting to 0"
%include "optimizingstep0.asm"
%endif
; End of the BubbleSort algorithm.

; all we need to do is to display the sorted Array and restore the used registers
mov       rsi, sorted
mov       rdx, sorted.length
call      Print.string
call      ShowArray
; show number of iterations
mov       rsi, iterations
mov       rdx, iterations.length
call      Print.string
mov       rax, r8                       ; iterations in RAX
call      Convert                       ; RAX contains the number to convert
call      Print.integer
call      ClearBuffer                   ; clear buffer for next use
; show number of swaps
mov       rax, r9                       ; number of swaps in RAX
mov       rsi, swaps
mov       rdx, swaps.length
call      Print.string
call      Convert                       ; RAX contains the number to convert
call      Print.integer
call      Print.linefeed
; restore used registers
pop       r8
pop       r9
pop       rsi
pop       rdx
pop       rcx
pop       rbx
pop       rax
popfq
Exit:
syscall   exit, 0
ShowArray:
push      rcx
push      rsi
mov       rcx, array.length             ; show all integers
mov       rsi, array                    ; start of the array
.nextInteger:
lodsq                                   ; get integer
call      Convert                       ; RAX contains the number to convert
call      Print.integer
call      ClearBuffer                   ; clear buffer for next use
loop      .nextInteger
pop       rsi
pop       rcx
ret
Convert:
push      rax
push      rbx
push      rdx
push      rdi
push      rcx
mov       rdi,sign
mov       byte[rdi]," "                 ; default no sign
cmp       rax, 0
jge       .noSign
mov       byte[rdi],"-"                 ; number is zero
neg       rax                           ; make positive
.noSign:
mov       rdi, decimal                  ; address of buffer in RDI
add       rdi, 18                       ; 0..18 = 19 bytes of storage
; start conversion of absolute value of RAX
.repeat:
xor       rdx, rdx                      ; remainder will be in RDX
mov       rbx, 10
div       rbx                           ; RDX = remainder of division
or        dl,"0"                        ; make remainder decimal ASCII
mov       byte[rdi],dl                  ; and store
dec       rdi                           ; go to previous position
cmp       rax, 0                        ; RAX = quotient of division, if zero stop
; we can stop when AL < 10 however this will add more code to store AL
jnz       .repeat
; align integers to the right
mov       dl ,byte[sign]                ; copy sign character in [RDI] just before the number
mov       byte[rdi], dl
dec       rdi                           ; point to position before the sign character
mov       rcx, rdi                      ; calculate remaining bytes
sub       rcx, sign
cmp       rcx, 0
jle       .end
inc       rcx
mov       al," "                        ; fill remaining bytes with spaces
std
.fill:
stosb
loop      .fill
.end:
pop       rcx
pop       rdi
pop       rdx
pop       rbx
pop       rax
ret

; **** Clear the integer buffer
ClearBuffer:
push      rax
push      rcx
push      rdi
mov       rdi, Buffer
mov       rcx, 20                       ; 20 bytes to clear (1 added to clear last too)
xor       rax, rax
cld                                     ; begin at lowest address
.repeat:
stosb                                   ; erase [RDI] and increment RSI pointer
loop      .repeat
pop       rdi
pop       rcx
pop       rax
ret
; *** Print routines
Print:
.integer:
push      rdx
push      rsi
mov       rsi,Buffer
mov       rdx, 20                       ; 20 bytes to display
call      Print.string
call      Print.linefeed
pop       rsi
pop       rdx
ret
.linefeed:
mov       rsi, lf
mov       rdx, lf.length
.string:
push      rax
push      rdi
push      rcx                           ; even not used, RCX is changed after syscall
syscall   write, stdout
pop       rcx
pop       rdi
pop       rax
ret```