3.7. LOOPS: SEVERAL ITERATORS
; store 32-bit word at $a0:
sw $a3, 0($a0)
; add 0x1C (28) to $a1 at each iteration:
addiu $a1, 0x1C
; jump to loop body if i<cnt:
bnez $v1, loc_8
; add 0xC (12) to $a0 at each iteration:
addiu $a0, 0xC ; branch delay slot
locret_24:
jr $ra
or $at, $zero ; branch delay slot, NOP
3.7.3 Intel C++ 2011 case
Compiler optimizations can also be weird, but nevertheless, still correct. Here is what the Intel C++
compiler 2011 does:
Listing 3.17: Optimizing Intel C++ 2011 x64
f PROC
; parameter 1: rcx = a1
; parameter 2: rdx = a2
; parameter 3: r8 = cnt
.B1.1::
test r8, r8
jbe exit
.B1.2::
cmp r8, 6
jbe just_copy
.B1.3::
cmp rcx, rdx
jbe .B1.5
.B1.4::
mov r10, r8
mov r9, rcx
shl r10, 5
lea rax, QWORD PTR [r8*4]
sub r9, rdx
sub r10, rax
cmp r9, r10
jge just_copy2
.B1.5::
cmp rdx, rcx
jbe just_copy
.B1.6::
mov r9, rdx
lea rax, QWORD PTR [r8*8]
sub r9, rcx
lea r10, QWORD PTR [rax+r8*4]
cmp r9, r10
jl just_copy
just_copy2::
; R8 = cnt
; RDX = a2
; RCX = a1
xor r10d, r10d
xor r9d, r9d
xor eax, eax
.B1.8::
mov r11d, DWORD PTR [rax+rdx]