CHAPTER 25. SIMD
add eax, 1
add edi, 4
add edx, [esi]
add esi, 4
mov [ebx], edx
add ebx, 4
cmp ecx, eax
jg short loc_8048558
add esp, 0Ch
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
retn
loc_8048578: ; CODE XREF: f(int,int ,int ,int *)+52
cmp eax, esi
jnb loc_80484C1
jmp loc_80484F8
_Z1fiPiSS endp
The result is almost the same, although not as meticulous as what Intel C++ produced.
25.1.2 Memory copy example
Let’s revisit the simple memcpy() example (14.2 on page 183):
#include <stdio.h>
/*
 * Copy cnt bytes from src to dst, one byte at a time.
 *
 *   dst - destination buffer; receives bytes at indices 0 .. cnt-1
 *   src - source buffer; read at indices 0 .. cnt-1
 *   cnt - number of bytes to copy; when 0 the loop body never runs
 *
 * Bytes are copied in ascending index order. Both parameters must be
 * pointers: with plain `unsigned char` values the subscripts below
 * would not compile.
 */
void my_memcpy (unsigned char* dst, unsigned char* src, size_t cnt)
{
	size_t i;

	for (i=0; i<cnt; i++)
		dst[i]=src[i];
}
And here is what optimizing GCC 4.9.1 produced:
Listing 25.1: Optimizing GCC 4.9.1 x64
my_memcpy:
; RDI = destination address
; RSI = source address
; RDX = size of block
test rdx, rdx
je .L41
lea rax, [rdi+16]
cmp rsi, rax
lea rax, [rsi+16]
setae cl
cmp rdi, rax
setae al
or cl, al
je .L13
cmp rdx, 22
jbe .L13
mov rcx, rsi
push rbp
push rbx
neg rcx
and ecx, 15
cmp rcx, rdx
cmova rcx, rdx
xor eax, eax
test rcx, rcx
je .L4
movzx eax, BYTE PTR [rsi]
cmp rcx, 1