1.29. SIMD
mov [ebp+var_18], ecx
mov ecx, [ebp+var_14]
xor eax, eax
xor edx, edx
nop
loc_8048520: ; CODE XREF: f(int,int ,int ,int *)+9B
movdqu xmm1, xmmword ptr [edi+eax]
movdqu xmm0, xmmword ptr [esi+eax]
add edx, 1
paddd xmm0, xmm1
movdqa xmmword ptr [ebx+eax], xmm0
add eax, 10h
cmp edx, ecx
jb short loc_8048520
mov ecx, [ebp+var_18]
mov eax, [ebp+var_10]
cmp ecx, eax
jz short loc_80484D8
loc_8048547: ; CODE XREF: f(int,int ,int ,int )+73
lea edx, ds:0[eax4]
add esi, edx
add edi, edx
add ebx, edx
lea esi, [esi+0]
loc_8048558: ; CODE XREF: f(int,int ,int ,int *)+CC
mov edx, [edi]
add eax, 1
add edi, 4
add edx, [esi]
add esi, 4
mov [ebx], edx
add ebx, 4
cmp ecx, eax
jg short loc_8048558
add esp, 0Ch
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
retn
loc_8048578: ; CODE XREF: f(int,int ,int ,int *)+52
cmp eax, esi
jnb loc_80484C1
jmp loc_80484F8
_Z1fiPiSS endp
Almost the same, however, not as meticulously as Intel C++.
Memory copy example
Let’s revisit the simple memcpy() example (1.16.2 on page 195):
#include <stdio.h>
void my_memcpy (unsigned char dst, unsigned char src, size_t cnt)
{
size_t i;
for (i=0; i<cnt; i++)
dst[i]=src[i];
};
And that’s what optimizations GCC 4.9.1 did: