; Assemble with NASM and -f elf64

%define q(w,x,y,z) ((((z) << 6) | ((y) << 4) | ((x) << 2) | (w)))
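; q packs four 2-bit lane indices (lowest destination lane first) into the
; immediate byte used by vpshufd below. For example,
; q(2,3,2,3) = (3 << 6) | (2 << 4) | (3 << 2) | 2 = 0xEE, and
; q(1,1,1,1) = 0x55.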

; Sums up the 32-bit integers in an array.
;
; 1st argument (rdi) = Pointer to start of array.
; 2nd argument (rsi) = Length of array (number of elements, not bytes).
;
; This function may read up to 28 bytes past the end of the array, similar
; to glibc's AVX2 memchr implementation.
;
; Unlike the "Sum over array (Optimized)" C++ example also available in CE,
; this function makes no assumptions about the alignment or size of the array.
;
; Compared to the ASM clang generates at `-O3 -mavx2` for the "Sum over array"
; C++ example, this code is generally faster when the length of the array is
; not a multiple of the unrolled size (i.e., the number of elements processed
; by one iteration of the unrolled loop), and about the same speed when the
; length is such a multiple. This is because clang's ASM does not use vector
; operations to reduce the array as far as possible: it uses a scalar loop to
; process up to 31 leftover elements after its unrolled loop, even though
; vector instructions could shrink the residual to at most 7 elements. This
; code always uses vector instructions to add up elements; it has no scalar
; cleanup loop at all.
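;
; Example call site (a hypothetical sketch, not part of this file; it assumes
; the System V AMD64 calling convention described above, with the array
; address already in rbx and the element count in r12):
;
;         mov     rdi, rbx            ; 1st argument: pointer to the array
;         mov     rsi, r12            ; 2nd argument: number of elements
;         call    sum_array_avx2
;         ; eax now holds the 32-bit sum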
sum_array_avx2:
        ; If the length (rsi) is not zero, skip past
        ; this return 0 statement.
        test    rsi, rsi
        jnz     .continue
        xor     eax, eax
        ret
.continue:
        ; Zero out the first accumulator register. This register
        ; is always needed no matter what branch we take.
        vpxor   xmm0, xmm0
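        ; Note: the VEX-encoded 128-bit vpxor also zeroes the upper 128 bits
        ; of ymm0, so the full ymm accumulator starts out as zero.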

        ; Copy rsi to rdx and rcx. rdx will hold the residual number of
        ; elements, and rcx the number of elements covered by the residual
        ; vector adds (needed because we unroll the add loop 4x and want to
        ; avoid summing the leftover whole vectors with scalar instructions).
        mov     rdx, rsi
        mov     rcx, rsi

        ; Get residual number of elements. We can use a 32-bit
        ; instruction here because `x & 7` always clears the
        ; upper 32 bits of x anyway, which is what the 32-bit
        ; version of `and` does. We can use 32-bit instructions
        ; with this register from now on.
        ; edx = len - 8 * (len / 8)
        and     edx, 8 - 1
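        ; For example, with len = 45: edx = 45 & 7 = 5 residual elements.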

        ; Mask out the bits representing the residual element count to get
        ; the number of elements processed by whole-vector adds. There are
        ; 8 32-bit integers in a ymm register.
        ; rcx = 8 * (len / 8)
        ;
        ; `and` sets the ZF; if there are no vector adds at all, jump to the
        ; label that handles the final residual elements (the one reached
        ; after the unrolled loop and the residual vector adds). We jump to
        ; .residual_gt0 rather than to the zero check before it because we
        ; already handled len = 0 above: len != 0 with rcx = 0 implies the
        ; residual count in edx is nonzero.
        and     rcx, -8
        jz      .residual_gt0

        ; If we got here, we need to zero out 2 more registers.
        vpxor   xmm1, xmm1
        vpxor   xmm2, xmm2

        ; rsi = 32 * (len / 32)
        ; This effectively sets rsi to the number of elements we can
        ; process with our 4x unrolled loop. If 0, we skip the unrolled loop.
        and     rsi, -(4*8)
        jz      .lt32

        ; It is always true that rcx >= rsi here. rcx - rsi = the number of
        ; elements (0, 8, 16, or 24) covered by the residual vector adds that
        ; run after the main unrolled loop.
        sub     rcx, rsi
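        ; Continuing the len = 45 example: rcx was 40 and rsi is 32, so rcx
        ; now holds 8, i.e. one ymm load's worth of residual vector adds.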

        ; If we got here, we need to zero out the last register,
        ; because we need it in the unrolled loop.
        vpxor   xmm3, xmm3

        ; Point rdi to the next element after the elements processed by the
        ; unrolled loop.
        lea     rdi, [rdi + 4*rsi]
        ; We negate rsi here (unrolled length) and add to it until it becomes
        ; 0. We use a negative offset to reuse the ZF set by `add`, as opposed
        ; to having an extra `cmp` instruction.
        neg     rsi
.loop:
        ; [<end pointer> + <negative offset> + <local offset>]
        vpaddd  ymm0, ymm0, [rdi + 4*rsi + 0*(4*8)]
        vpaddd  ymm1, ymm1, [rdi + 4*rsi + 1*(4*8)]
        vpaddd  ymm2, ymm2, [rdi + 4*rsi + 2*(4*8)]
        vpaddd  ymm3, ymm3, [rdi + 4*rsi + 3*(4*8)]
        add     rsi, 32
        ; If the negative offset isn't 0, we can keep iterating.
        jnz     .loop
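        ; Addressing example: on the final iteration rsi = -32, so the four
        ; loads above read [rdi-128], [rdi-96], [rdi-64], and [rdi-32], i.e.
        ; the last 128 bytes of the region covered by the unrolled loop.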
        ; This addition only needs to happen when we do the main unrolled loop.
        vpaddd  ymm2, ymm3
.lt32:
        ; Skip the right number of residual vector adds based on rcx. The
        ; value in rcx here is always 0, 8, 16, or 24, so checking ecx is
        ; enough.
        test    ecx, ecx
        jz      .residual
        cmp     ecx, 8
        je      .r1
        cmp     ecx, 16
        je      .r2
        ; Add up the remaining whole vectors. The adds are ordered from the
        ; highest block to the lowest so the jumps above can enter the
        ; sequence at any point and fall through to the end.
        vpaddd  ymm2, ymm2, [rdi + 2*(4*8)]
.r2:
        vpaddd  ymm1, ymm1, [rdi + 1*(4*8)]
.r1:
        vpaddd  ymm0, ymm0, [rdi + 0*(4*8)]
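        ; For example, rcx = 16 jumps to .r2 above, so two vector adds run
        ; and elements [0, 16) relative to rdi are consumed here.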
.residual:
        ; Sum up ymm0-2 into ymm0.
        vpaddd  ymm1, ymm2
        vpaddd  ymm0, ymm1

        ; Skip to the end if the number of residual elements is zero.
        test    edx, edx
        jz      .hsum
.residual_gt0:
        ; Multiply the residual count by 32 (the size in bytes of one LUT row).
        shl     edx, 5
        ; rdx is never 0 here, so we subtract one row length: the first row of
        ; the table (which would be all zeros) is omitted because it is never
        ; used. A residual count of 1 therefore selects the first stored row.
        vmovdqa ymm4, [mask_lut + rdx - 32]
        ; Zero the elements past the bounds of the array based on the mask in
        ; ymm4. rdi points just past the region handled by the unrolled loop
        ; (or to the start of the array if that loop was skipped), so we add
        ; sizeof(int)*rcx to get a pointer to the first actual residual
        ; element.
        ;
        ; This reads up to 28 bytes past the end of the array.
        vpand   ymm4, ymm4, [rdi + 4*rcx]
        vpaddd  ymm0, ymm4
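        ; For example, a residual count of 3 gives edx = 96, so the vmovdqa
        ; above reads mask_lut + 64, the third row (-1, -1, -1, 0, ...): the
        ; first 3 integers of the final (partial) load are kept and the other
        ; 5 are zeroed before being added to ymm0.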
.hsum:
        ; Horizontal reduction of 32-bit integers in ymm0.
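        ; Writing ymm0 as 32-bit lanes [a, b, c, d, e, f, g, h]:
        ;   vextracti128 + vpaddd        -> xmm0 = [a+e, b+f, c+g, d+h]
        ;   vpshufd q(2,3,2,3) + vpaddd  -> lane 0 = a+c+e+g, lane 1 = b+d+f+h
        ;   vpshufd q(1,1,1,1) + vpaddd  -> lane 0 = sum of all eight lanes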
        vextracti128    xmm1, ymm0, 1
        vpaddd  xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, q(2,3,2,3)
        vpaddd  xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, q(1,1,1,1)
        vpaddd  xmm0, xmm1, xmm0
        vmovd   eax, xmm0
        ret

; Lookup table for masking residual elements.
align 32
mask_lut:       dd \
        -1,  0,  0,  0,  0,  0,  0,  0, \
        -1, -1,  0,  0,  0,  0,  0,  0, \
        -1, -1, -1,  0,  0,  0,  0,  0, \
        -1, -1, -1, -1,  0,  0,  0,  0, \
        -1, -1, -1, -1, -1,  0,  0,  0, \
        -1, -1, -1, -1, -1, -1,  0,  0, \
        -1, -1, -1, -1, -1, -1, -1,  0
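
; Example build against a C driver (hypothetical file names; the C prototype
; would be roughly `int sum_array_avx2(const int32_t *data, size_t len);`):
;   nasm -f elf64 sum_array_avx2.asm -o sum_array_avx2.o
;   cc main.c sum_array_avx2.o -o main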