; Assemble with NASM and -f elf64

%define q(w,x,y,z) ((((z) << 6) | ((y) << 4) | ((x) << 2) | (w)))
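; q packs four 2-bit lane indices (lowest destination lane first) into the
; immediate byte used by vpshufd below. For example,
; q(2,3,2,3) = (3 << 6) | (2 << 4) | (3 << 2) | 2 = 0xEE, and
; q(1,1,1,1) = 0x55.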

; Sums up the 32-bit integers in an array.
;
; 1st argument (rdi) = Pointer to start of array.
; 2nd argument (rsi) = Length of array (number of elements, not bytes).
;
; This function may read up to 28 bytes past the end of the array, similar
; to glibc's AVX2 memchr implementation.
;
; Unlike the "Sum over array (Optimized)" C++ example also available in CE,
; this function makes no assumptions about the alignment or size of the array.
;
; Compared to the ASM clang generates at `-O3 -mavx2` for the "Sum over array"
; C++ example, this code is generally faster when the length of the array is
; not a multiple of the unrolled size (i.e., the number of elements processed
; by one iteration of the unrolled loop), and about the same speed when the
; length is such a multiple. This is because clang's ASM does not use vector
; operations to reduce the array as far as possible: it uses a scalar loop to
; process up to 31 leftover elements after its unrolled loop, even though
; vector instructions could shrink the residual to at most 7 elements. This
; code always uses vector instructions to add up elements; it has no scalar
; cleanup loop at all.
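;
; Example call site (a hypothetical sketch, not part of this file; it assumes
; the System V AMD64 calling convention described above, with the array
; address already in rbx and the element count in r12):
;
;         mov     rdi, rbx            ; 1st argument: pointer to the array
;         mov     rsi, r12            ; 2nd argument: number of elements
;         call    sum_array_avx2
;         ; eax now holds the 32-bit sum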
sum_array_avx2:
        ; If the length (rsi) is not zero, skip past
        ; this return 0 statement.
        test    rsi, rsi
        jnz     .continue
        xor     eax, eax
        ret
.continue:
        ; Zero out the first accumulator register. This register
        ; is always needed no matter what branch we take.
        vpxor   xmm0, xmm0
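        ; Note: the VEX-encoded 128-bit vpxor also zeroes the upper 128 bits
        ; of ymm0, so the full ymm accumulator starts out as zero.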

        ; Copy rsi to rdx and rcx. rdx will hold the residual number of
        ; elements, and rcx the number of elements covered by the residual
        ; vector adds (needed because we unroll the add loop 4x and want to
        ; avoid summing the leftover whole vectors with scalar instructions).
        mov     rdx, rsi
        mov     rcx, rsi

        ; Get residual number of elements. We can use a 32-bit
        ; instruction here because `x & 7` always clears the
        ; upper 32 bits of x anyway, which is what the 32-bit
        ; version of `and` does. We can use 32-bit instructions
        ; with this register from now on.
        ; edx = len - 8 * (len / 8)
        and     edx, 8 - 1
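        ; For example, with len = 45: edx = 45 & 7 = 5 residual elements.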

        ; Mask out the bits representing the residual element count to get
        ; the number of elements processed by whole-vector adds. There are
        ; 8 32-bit integers in a ymm register.
        ; rcx = 8 * (len / 8)
        ;
        ; `and` sets the ZF; if there are no vector adds at all, jump to the
        ; label that handles the final residual elements (the one reached
        ; after the unrolled loop and the residual vector adds). We jump to
        ; .residual_gt0 rather than to the zero check before it because we
        ; already handled len = 0 above: len != 0 with rcx = 0 implies the
        ; residual count in edx is nonzero.
        and     rcx, -8
        jz      .residual_gt0

        ; If we got here, we need to zero out 2 more registers.
        vpxor   xmm1, xmm1
        vpxor   xmm2, xmm2

        ; rsi = 32 * (len / 32)
        ; This effectively sets rsi to the number of elements we can
        ; process with our 4x unrolled loop. If 0, we skip the unrolled loop.
        and     rsi, -(4*8)
        jz      .lt32

        ; It is always true that rcx >= rsi here. rcx - rsi = the number of
        ; elements (0, 8, 16, or 24) covered by the residual vector adds that
        ; run after the main unrolled loop.
        sub     rcx, rsi
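        ; Continuing the len = 45 example: rcx was 40 and rsi is 32, so rcx
        ; now holds 8, i.e. one ymm load's worth of residual vector adds.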

        ; If we got here, we need to zero out the last register,
        ; because we need it in the unrolled loop.
        vpxor   xmm3, xmm3

        ; Point rdi to the next element after the elements processed by the
        ; unrolled loop.
        lea     rdi, [rdi + 4*rsi]
        ; We negate rsi here (unrolled length) and add to it until it becomes
        ; 0. We use a negative offset to reuse the ZF set by `add`, as opposed
        ; to having an extra `cmp` instruction.
        neg     rsi
.loop:
        ; [<end pointer> + <negative offset> + <local offset>]
        vpaddd  ymm0, ymm0, [rdi + 4*rsi + 0*(4*8)]
        vpaddd  ymm1, ymm1, [rdi + 4*rsi + 1*(4*8)]
        vpaddd  ymm2, ymm2, [rdi + 4*rsi + 2*(4*8)]
        vpaddd  ymm3, ymm3, [rdi + 4*rsi + 3*(4*8)]
        add     rsi, 32
        ; If the negative offset isn't 0, we can keep iterating.
        jnz     .loop
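        ; Addressing example: on the final iteration rsi = -32, so the four
        ; loads above read [rdi-128], [rdi-96], [rdi-64], and [rdi-32], i.e.
        ; the last 128 bytes of the region covered by the unrolled loop.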
        ; This addition only needs to happen when we do the main unrolled loop.
        vpaddd  ymm2, ymm3
.lt32:
        ; Skip the right number of residual vector adds based on rcx. The
        ; value in rcx here is always 0, 8, 16, or 24, so checking ecx is
        ; enough.
        test    ecx, ecx
        jz      .residual
        cmp     ecx, 8
        je      .r1
        cmp     ecx, 16
        je      .r2
        ; Add up the remaining whole vectors. The adds are ordered from the
        ; highest block to the lowest so the jumps above can enter the
        ; sequence at any point and fall through to the end.
        vpaddd  ymm2, ymm2, [rdi + 2*(4*8)]
.r2:
        vpaddd  ymm1, ymm1, [rdi + 1*(4*8)]
.r1:
        vpaddd  ymm0, ymm0, [rdi + 0*(4*8)]
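        ; For example, rcx = 16 jumps to .r2 above, so two vector adds run
        ; and elements [0, 16) relative to rdi are consumed here.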
.residual:
        ; Sum up ymm0-2 into ymm0.
        vpaddd  ymm1, ymm2
        vpaddd  ymm0, ymm1

        ; Skip to the end if the number of residual elements is zero.
        test    edx, edx
        jz      .hsum
.residual_gt0:
        ; Multiply the residual count by 32 (the size in bytes of one LUT row).
        shl     edx, 5
        ; rdx is never 0 here, so we subtract one row length: the first row of
        ; the table (which would be all zeros) is omitted because it is never
        ; used. A residual count of 1 therefore selects the first stored row.
        vmovdqa ymm4, [mask_lut + rdx - 32]
        ; Zero the elements past the bounds of the array based on the mask in
        ; ymm4. rdi points just past the region handled by the unrolled loop
        ; (or to the start of the array if that loop was skipped), so we add
        ; sizeof(int)*rcx to get a pointer to the first actual residual
        ; element.
        ;
        ; This reads up to 28 bytes past the end of the array.
        vpand   ymm4, ymm4, [rdi + 4*rcx]
        vpaddd  ymm0, ymm4
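        ; For example, a residual count of 3 gives edx = 96, so the vmovdqa
        ; above reads mask_lut + 64, the third row (-1, -1, -1, 0, ...): the
        ; first 3 integers of the final (partial) load are kept and the other
        ; 5 are zeroed before being added to ymm0.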
.hsum:
        ; Horizontal reduction of 32-bit integers in ymm0.
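        ; Writing ymm0 as 32-bit lanes [a, b, c, d, e, f, g, h]:
        ;   vextracti128 + vpaddd        -> xmm0 = [a+e, b+f, c+g, d+h]
        ;   vpshufd q(2,3,2,3) + vpaddd  -> lane 0 = a+c+e+g, lane 1 = b+d+f+h
        ;   vpshufd q(1,1,1,1) + vpaddd  -> lane 0 = sum of all eight lanes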
        vextracti128    xmm1, ymm0, 1
        vpaddd  xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, q(2,3,2,3)
        vpaddd  xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, q(1,1,1,1)
        vpaddd  xmm0, xmm1, xmm0
        vmovd   eax, xmm0
        ret

; Lookup table for masking residual elements.
align 32
mask_lut:       dd \
        -1,  0,  0,  0,  0,  0,  0,  0, \
        -1, -1,  0,  0,  0,  0,  0,  0, \
        -1, -1, -1,  0,  0,  0,  0,  0, \
        -1, -1, -1, -1,  0,  0,  0,  0, \
        -1, -1, -1, -1, -1,  0,  0,  0, \
        -1, -1, -1, -1, -1, -1,  0,  0, \
        -1, -1, -1, -1, -1, -1, -1,  0
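
; Example build against a C driver (hypothetical file names; the C prototype
; would be roughly `int sum_array_avx2(const int32_t *data, size_t len);`):
;   nasm -f elf64 sum_array_avx2.asm -o sum_array_avx2.o
;   cc main.c sum_array_avx2.o -o main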