main.cpp
#include <stdio.h>
#include <inttypes.h>
#include "../../commonfiles/miscdefs.h"

// Assembly routine (see callingconvention2.asm).  Adds up and multiplies the
// n elements of a and of b.  Returns false, without writing any output, when
// n <= 0; otherwise stores the results through sum_a, sum_b, prod_a and
// prod_b and returns true.
extern "C" bool Cc2(const Int64* a, const Int64* b, Int32 n, Int64 * sum_a, Int64* sum_b, Int64* prod_a, Int64* prod_b);

// Test driver: feeds two fixed six-element arrays to Cc2() and prints the
// inputs followed by the sums and products the routine reports.
int main(int argc, char* argv[])
{
    const int32_t n = 6;
    Int64 a[n] = { 2, -2, -6, 7, 12, 5 };
    Int64 b[n] = { 3, 5, -7, 8, 4, 9 };
    Int64 sum_a, sum_b, prod_a, prod_b;

    printf("\nResults for CallingConvention2\n");

    // Guard clause: there is nothing to print when Cc2() reports failure.
    if (!Cc2(a, b, n, &sum_a, &sum_b, &prod_a, &prod_b))
    {
        printf("Invalid return code from Cc2()\n");
        return 0;
    }

    // Echo the input arrays, one a/b pair per row.
    printf("               a                b\n");
    for (int row = 0; row != n; ++row)
        printf("        %8" PRId64 "         %8" PRId64 "\n", a[row], b[row]);

    // Results written by Cc2() through the four output pointers.
    printf("\n");
    printf("sum_a:  %8" PRId64 " sum_b:  %8" PRId64 "\n", sum_a, sum_b);
    printf("prod_a: %8" PRId64 " prod_b: %8" PRId64 "\n", prod_a, prod_b);

    return 0;
}
callingconvention2.asm
; Name:     callingconvention2.asm
;
; Build:    g++ -m64 -c main.cpp -o main.o
;           nasm -f elf64 -o callingconvention2.o callingconvention2.asm
;           g++ -m64 -o callingconvention2 callingconvention2.o main.o
;
; Remark:   GCC also has non-volatile (callee-saved) registers, so this example
;           has been converted as well.  I expanded it with some useless but
;           interesting stack operations.  Keep in mind that this program only
;           illustrates how to work with the stack.  I would never program this
;           way in real life, but it makes a nice example.
;
; Source:   Modern x86 Assembly Language Programming p.528

global Cc2

bits 64

section .text

; extern "C" bool Cc2(const Int64* a, const Int64* b, Int32 n, Int64* sum_a, Int64* sum_b, Int64* prod_a, Int64* prod_b);
;
; Description:  The following function illustrates how to initialize and
;               use a stack frame pointer.  It also demonstrates use
;               of several non-volatile general purpose registers.

; Named expressions for constant values.
;
; NUM_PUSHREG   = number of prolog non-volatile register pushes
; STK_LOCAL1    = number of 64-bit slots in the LocalVars1 area
; STK_ARG       = number of 64-bit slots in the saved-arguments area
; STK_LOCAL2    = number of 128-bit slots in the LocalVars2 area
; STK_PAD       = extra 64-bit slots (0 or 1, i.e. 0 or 8 bytes) needed to 16-byte align RSP
; STK_TOTAL     = total size in bytes of local stack
; RBP_RA        = number of bytes between RBP and ret addr on stack
; LOCAL1        = start LocalVars1 space on stack
; LOCAL2        = start LocalVars2 space on stack
; ARGS          = start arguments space on stack

; Stack-frame layout constants.  The first four values are slot counts, not
; byte counts; they are scaled to bytes where STK_TOTAL, RBP_RA and the
; address macros are built.
NUM_PUSHREG     equ 6           ; registers pushed by the prolog
STK_LOCAL1      equ 4           ; 64-bit slots: LocalVar1A .. LocalVar1D
STK_ARG         equ 7           ; 64-bit slots: one per saved argument
STK_LOCAL2      equ 2           ; 128-bit slots: LocalVar2A/B and LocalVar2C/D

; One padding qword when pushes + slots add up to an even number of qwords,
; none otherwise; this is what keeps RSP 16-byte aligned after the prolog.
STK_PAD         equ ((NUM_PUSHREG + STK_LOCAL1 + STK_ARG + 2*STK_LOCAL2) & 1) ^ 1

; Bytes reserved below the pushed registers.
STK_TOTAL       equ 8 * (STK_LOCAL1 + 2*STK_LOCAL2 + STK_ARG + STK_PAD)

; Bytes from RBP up to the return address.
RBP_RA          equ 8 * (NUM_PUSHREG + STK_LOCAL1 + STK_ARG + STK_PAD)

; Address expressions for use inside [ ]; RBP points at the start of LocalVars1.
%define LOCAL2  rbp-16*STK_LOCAL2       ; start of LocalVars2 (below RBP)
%define LOCAL1  rbp                     ; start of LocalVars1
%define ARGS    rbp+8*STK_LOCAL1        ; start of the saved arguments

;-----------------------------------------------------------------------
; bool Cc2(const Int64* a, const Int64* b, Int32 n,
;          Int64* sum_a, Int64* sum_b, Int64* prod_a, Int64* prod_b)
;
; ABI:    System V AMD64
; In:     rdi = a        rsi = b        edx = n
;         rcx = sum_a    r8  = sum_b    r9  = prod_a
;         prod_b is passed on the stack, just above the return address
; Out:    eax = 1 and the four results stored through the pointers,
;         or eax = 0 with nothing stored when n <= 0
; Frame:  rbp is the frame pointer (see LOCAL2 / LOCAL1 / ARGS)
; Note:   the stack traffic is deliberate; the routine exists to show
;         how a frame is built and used, not to be efficient.
;-----------------------------------------------------------------------
Cc2:
        ; ---- Prolog ----------------------------------------------------
        ; Preserve the non-volatile registers; the number of pushes must
        ; match NUM_PUSHREG.  r14 and r15 are not used by the body.
        push    rbp
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15

        sub     rsp, STK_TOTAL                  ; reserve LocalVars2, LocalVars1, ARGS (+ pad)
        lea     rbp, [rsp + STK_LOCAL2*2*8]     ; frame pointer = start of LocalVars1

        ; ---- Fill the local areas (demonstration only) -----------------
        pxor    xmm5, xmm5                      ; xmm5 = 0
        movdqa  [LOCAL2], xmm5                  ; LocalVar2A/2B = 0
        movdqa  [LOCAL2 + 16], xmm5             ; LocalVar2C/2D = 0

        mov     qword [LOCAL1], 0aah            ; LocalVar1A = 0xaa
        mov     qword [LOCAL1 + 8], 0bbh        ; LocalVar1B = 0xbb
        mov     qword [LOCAL1 + 16], 0cch       ; LocalVar1C = 0xcc
        mov     qword [LOCAL1 + 24], 0ddh       ; LocalVar1D = 0xdd

        ; ---- Copy every argument into ARGS (demonstration only) --------
        movsxd  rdx, edx                        ; n is a 32-bit argument: sign-extend it
        mov     rax, [rbp + RBP_RA + 8]         ; rax = prod_b, fetched from the caller's stack
        mov     [ARGS], rdi                     ; a
        mov     [ARGS + 8], rsi                 ; b
        mov     [ARGS + 16], rdx                ; n
        mov     [ARGS + 24], rcx                ; sum_a
        mov     [ARGS + 32], r8                 ; sum_b
        mov     [ARGS + 40], r9                 ; prod_a
        mov     [ARGS + 48], rax                ; prod_b

        ; ---- Reject an empty or negative element count -----------------
        test    edx, edx
        jle     .bad_count                      ; n <= 0 -> return false

        ; ---- Loop set-up -----------------------------------------------
        xor     ebx, ebx                        ; rbx = byte offset of the current element
        xor     r10d, r10d                      ; r10 = running sum of a
        xor     r11d, r11d                      ; r11 = running sum of b
        mov     r12d, 1                         ; r12 = running product of a
        mov     r13d, 1                         ; r13 = running product of b

        ; ---- Accumulate sums and products; edx counts down from n ------
.next_elem:
        mov     rax, [rdi + rbx]                ; rax = a[i]
        add     r10, rax
        imul    r12, rax
        mov     rax, [rsi + rbx]                ; rax = b[i]
        add     r11, rax
        imul    r13, rax
        add     rbx, 8                          ; step to the next 64-bit element
        dec     edx
        jnz     .next_elem

        ; ---- Park the results in ARGS, overwriting the saved pointers
        ;      (demonstration only) ---------------------------------------
        mov     [ARGS + 24], r10                ; sum of a
        mov     [ARGS + 32], r11                ; sum of b
        mov     [ARGS + 40], r12                ; product of a
        mov     [ARGS + 48], r13                ; product of b

        ; ---- Store the results through the caller's pointers -----------
        ; rcx, r8 and r9 still hold sum_a, sum_b and prod_a.
        mov     rax, [ARGS + 24]
        mov     [rcx], rax                      ; *sum_a
        mov     rax, [ARGS + 32]
        mov     [r8], rax                       ; *sum_b
        mov     rax, [ARGS + 40]
        mov     [r9], rax                       ; *prod_a
        mov     rax, [rbp + RBP_RA + 8]         ; reload prod_b: its ARGS slot now holds the product
        mov     rbx, [ARGS + 48]
        mov     [rax], rbx                      ; *prod_b
        mov     eax, 1                          ; return true

        ; ---- Epilog ----------------------------------------------------
.epilog:
        lea     rsp, [rbp + (STK_LOCAL1 + STK_ARG + STK_PAD)*8]  ; rsp -> saved r15
        pop     r15                             ; restore in reverse push order
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        pop     rbp
        ret

.bad_count:
        xor     eax, eax                        ; return false
        jmp     .epilog
build
g++ -m64 -c main.cpp -o main.o
nasm -f elf64 -o callingconvention2.o callingconvention2.asm
g++ -m64 -o callingconvention2 callingconvention2.o main.o