3.7. LOOPS: SEVERAL ITERATORS
; R8=cnt
test r8, r8 ; cnt==0? exit then
je SHORT $LN1@f
npad 11
$LL3@f:
mov eax, DWORD PTR [rdx]
lea rcx, QWORD PTR [rcx+12]
lea rdx, QWORD PTR [rdx+28]
mov DWORD PTR [rcx-12], eax
dec r8
jne SHORT $LL3@f
$LN1@f:
ret 0
f ENDP
Now there are 3 iterators: the cnt variable and two indices, which are increased by 12 and 28 at each
iteration. We can rewrite this code in C/C++:
#include <stdio.h>
/*
 * Copy cnt ints from a2 into a1 using two independent strides.
 *
 * On iteration i (0 <= i < cnt) the element a2[i*7] is stored into a1[i*3].
 * Instead of multiplying i on every pass, two running indices are kept and
 * advanced by 3 and 7 respectively.
 *
 * a1  - destination array; indices 0, 3, ..., (cnt-1)*3 are written
 * a2  - source array; indices 0, 7, ..., (cnt-1)*7 are read
 * cnt - number of elements to copy; 0 means nothing is touched
 */
void f(int *a1, int *a2, size_t cnt)
{
	size_t i;
	size_t idx1=0, idx2=0; // running indices into a1 and a2

	// copy from one array to another in some weird scheme
	for (i=0; i<cnt; i++)
	{
		a1[idx1]=a2[idx2];
		idx1+=3;
		idx2+=7;
	}
}
So, at the cost of updating 3 iterators at each iteration instead of one, we can remove two multiplication
operations.
3.7.2 Two iterators
GCC 4.9 does even more, leaving only 2 iterators:
Listing 3.14: Optimizing GCC 4.9 x64
; RDI=a1
; RSI=a2
; RDX=cnt
f:
; Copies cnt 32-bit values: each pass loads a DWORD from [RSI] and stores it
; at [RDI], then RSI advances by 28 bytes and RDI by 12 bytes.
; Only two iterators remain (RSI and RDI); there is no separate counter,
; the loop stops when RSI reaches the end address precomputed in RDX.
test rdx, rdx ; cnt==0? exit then
je .L1
; calculate the end address in "a2" (a2+cnt*28) and leave it in RDX
lea rax, [0+rdx*4]
; RAX=RDX*4=cnt*4
sal rdx, 5
; RDX=RDX<<5=cnt*32
sub rdx, rax
; RDX=RDX-RAX=cnt*32-cnt*4=cnt*28
add rdx, rsi
; RDX=RDX+RSI=a2+cnt*28
.L3:
mov eax, DWORD PTR [rsi]
add rsi, 28
add rdi, 12
; RDI has already been advanced, hence the -12 to reach this pass's slot
mov DWORD PTR [rdi-12], eax
; stop once the source pointer reaches the precomputed end address
cmp rsi, rdx
jne .L3
.L1:
; "rep ret" is the two-byte return form emitted by GCC
rep ret
