movdqu xmm0, xmmword ptr [esi+edi*4] ; ar2+i*4 is not 16-byte aligned, so load it to XMM0
paddd xmm1, xmm0
movdqa xmmword ptr [eax+edi*4], xmm1 ; ar3+i*4
add edi, 4
cmp edi, ecx
jb short loc_ED
jmp short loc_127
loc_109: ; CODE XREF: f(int,int *,int *,int *)+E3
mov ebx, [esp+10h+ar1]
mov esi, [esp+10h+ar2]
loc_111: ; CODE XREF: f(int,int *,int *,int *)+125
movdqu xmm0, xmmword ptr [ebx+edi*4]
paddd xmm0, xmmword ptr [esi+edi*4]
movdqa xmmword ptr [eax+edi*4], xmm0
add edi, 4
cmp edi, ecx
jb short loc_111
loc_127: ; CODE XREF: f(int,int *,int *,int *)+107
; f(int,int *,int *,int *)+164
cmp ecx, edx
jnb short loc_15B
mov esi, [esp+10h+ar1]
mov edi, [esp+10h+ar2]
loc_133: ; CODE XREF: f(int,int *,int *,int *)+13F
mov ebx, [esi+ecx*4]
add ebx, [edi+ecx*4]
mov [eax+ecx*4], ebx
inc ecx
cmp ecx, edx
jb short loc_133
jmp short loc_15B
loc_143: ; CODE XREF: f(int,int *,int *,int *)+17
; f(int,int *,int *,int *)+3A ...
mov esi, [esp+10h+ar1]
mov edi, [esp+10h+ar2]
xor ecx, ecx
loc_14D: ; CODE XREF: f(int,int *,int *,int *)+159
mov ebx, [esi+ecx*4]
add ebx, [edi+ecx*4]
mov [eax+ecx*4], ebx
inc ecx
cmp ecx, edx
jb short loc_14D
loc_15B: ; CODE XREF: f(int,int *,int *,int *)+A
; f(int,int *,int *,int *)+129 ...
xor eax, eax
pop ecx
pop ebx
pop esi
pop edi
retn
loc_162: ; CODE XREF: f(int,int *,int *,int *)+8C
; f(int,int *,int *,int *)+9F
xor ecx, ecx
jmp short loc_127
?f@@YAHHPAH00@Z endp
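For reference, here is a minimal C sketch using SSE2 intrinsics, an assumed reconstruction from the listing rather than the original source (the argument names ar1/ar2/ar3 follow the comments in the listing); a compiler could turn it into code very much like the above:

#include <emmintrin.h> // SSE2 intrinsics

// Sum two int arrays into a third; the vectorized body handles
// 4 elements per iteration, the scalar tail the remaining 0..3.
int f(int sz, int *ar1, int *ar2, int *ar3)
{
    int i = 0;
    for (; i + 4 <= sz; i += 4)
    {
        __m128i a = _mm_loadu_si128((__m128i *)(ar1 + i)); // MOVDQU: unaligned load
        __m128i b = _mm_loadu_si128((__m128i *)(ar2 + i)); // MOVDQU
        __m128i s = _mm_add_epi32(a, b);                   // PADDD: 4 x 32-bit adds
        _mm_storeu_si128((__m128i *)(ar3 + i), s);         // store 16 bytes
    }
    for (; i < sz; i++) // scalar tail, as at loc_133/loc_14D
        ar3[i] = ar1[i] + ar2[i];
    return 0;
}

Note that the listing stores through MOVDQA (aligned), which implies an earlier check (not shown here) established that ar3+i*4 is 16-byte aligned; the sketch plays it safe with an unaligned store.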
The SSE2-related instructions are:
- MOVDQU (Move Unaligned Double Quadword) just loads 16 bytes from memory into an XMM register.
- PADDD (Add Packed Integers) adds 4 pairs of 32-bit numbers and leaves the result in the first operand. By the way, no exception is raised on overflow and no flags are set: the low 32 bits of each result are simply stored (a small demonstration follows below).
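A self-contained demonstration of that wraparound behavior, assuming an SSE2-capable x86 compiler:

#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
    // 0x7FFFFFFF + 1 wraps to 0x80000000 in each of the 4 lanes;
    // PADDD raises no exception and sets no flags.
    __m128i a = _mm_set1_epi32(0x7FFFFFFF);
    __m128i b = _mm_set1_epi32(1);
    __m128i r = _mm_add_epi32(a, b); // compiles to PADDD
    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%08X\n", (unsigned)out[0]); // prints 80000000
    return 0;
}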