checksum: Interleave lo/hi sums while folding into 128-bit sums, drop TODO
I left a TODO and never checked whether interleaving helps -- it actually seems to slightly improve CPI on AMD Naples (two 128-bit FMA units glued together).

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
parent
364cc313ea
commit
74f29d3148
1 changed file with 3 additions and 3 deletions
|
@@ -217,9 +217,9 @@ static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init)
|
||||||
_mm256_unpacklo_epi32(b, zero));
|
_mm256_unpacklo_epi32(b, zero));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Fold four 256bit sums into one 128-bit sum. TODO */
|
/* Fold four 256-bit sums into one 128-bit sum. */
|
||||||
sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_a_lo),
|
sum256 = _mm256_add_epi64(_mm256_add_epi64(sum_a_hi, sum_b_lo),
|
||||||
_mm256_add_epi64(sum_b_hi, sum_b_lo));
|
_mm256_add_epi64(sum_b_hi, sum_a_lo));
|
||||||
sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
|
sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 0),
|
||||||
_mm256_extracti128_si256(sum256, 1));
|
_mm256_extracti128_si256(sum256, 1));
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue