author     Linus Torvalds <torvalds@linux-foundation.org>  2024-06-15 12:21:44 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-06-19 12:35:18 -0700
commit     4b8fa1173cdc33a1cb53e4cac869b1b7aac917a9 (patch)
tree       ff42efaf4e6f811e8013644c6b8c6981b518c147 /arch/x86
parent     6ba59ff4227927d3a8530fc2973b80e94b54d58f (diff)
x86-64: word-at-a-time: improve byte count calculations
This switches x86-64 over to using 'tzcount' instead of the integer multiply trick to turn the bytemask information into actual byte counts. We even had a comment saying that a fast bit count instruction is better than a multiply, but x86 bit counting has traditionally been "questionably fast", and so avoiding it was the right thing back in the day.

Now, on any half-way modern core, using bit counting is cheaper and smaller than the large constant multiply, so let's just switch over.

Note that as part of switching over to counting bits, we also do it at a different point. We used to create the byte count from the final byte mask, but once you use the 'tzcount' instruction (aka 'bsf' on older CPUs), you can actually count the trailing zeroes using a value we have available earlier.

In fact, we can just use the very first mask of bits that tells us whether we have any zero bytes at all. The zero bytes in the word will have the high bit set, so just doing 'tzcount' on that value and dividing by 8 will give the number of bytes that precede the first NUL character, which is exactly what we want.

Note also that the input value to the tzcount is by definition not zero, since that is the condition that we already used to check the whole "do we have any zero bytes at all". So we don't need to worry about the legacy instruction behavior of pre-tzcount days when 'bsf' didn't have a result for zero input.

The 32-bit code continues to use the simple bit op trick that is faster even on newer cores, but particularly on the older 32-bit-only ones.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
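To make the trick above concrete, here is a minimal user-space sketch (not the kernel code itself): it assumes a 64-bit unsigned long and uses __builtin_ctzl() as a stand-in for tzcount/bsf, and it rebuilds by hand the same intermediate mask the kernel's has_zero() produces, where every zero byte in the word gets its 0x80 bit set.

#include <assert.h>
#include <stdio.h>

#define ONES  0x0101010101010101ul
#define HIGHS 0x8080808080808080ul

/* Mask with 0x80 set at every byte position that held a zero byte. */
static unsigned long has_zero_bits(unsigned long a)
{
	return ((a - ONES) & ~a) & HIGHS;
}

/* New approach: count trailing zeroes of that mask, divide by 8. */
static long find_zero_tzcount(unsigned long bits)
{
	return __builtin_ctzl(bits) >> 3;	/* bits must be nonzero */
}

int main(void)
{
	/* Little-endian word holding "abc\0    ": the NUL is byte 3. */
	unsigned long word = 0x2020202000636261ul;
	unsigned long bits = has_zero_bits(word);

	assert(bits != 0);	/* the "do we have any zero bytes" check */
	printf("first NUL at byte %ld\n", find_zero_tzcount(bits));	/* prints 3 */
	return 0;
}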
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/word-at-a-time.h  57
1 file changed, 23 insertions(+), 34 deletions(-)
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index e8d7d4941c4c..422a47746657 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -5,45 +5,12 @@
#include <linux/bitops.h>
#include <linux/wordpart.h>
-/*
- * This is largely generic for little-endian machines, but the
- * optimal byte mask counting is probably going to be something
- * that is architecture-specific. If you have a reliably fast
- * bit count instruction, that might be better than the multiply
- * and shift, for example.
- */
struct word_at_a_time {
const unsigned long one_bits, high_bits;
};
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
-#ifdef CONFIG_64BIT
-
-/*
- * Jan Achrenius on G+: microoptimized version of
- * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
- * that works for the bytemasks without having to
- * mask them first.
- */
-static inline long count_masked_bytes(unsigned long mask)
-{
- return mask*0x0001020304050608ul >> 56;
-}
-
-#else /* 32-bit case */
-
-/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
-static inline long count_masked_bytes(long mask)
-{
- /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
- long a = (0x0ff0001+mask) >> 23;
- /* Fix the 1 for 00 case */
- return a & mask;
-}
-
-#endif
-
/* Return nonzero if it has a zero */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
@@ -57,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
return bits;
}
+#ifdef CONFIG_64BIT
+
+/* Keep the initial has_zero() value for both bitmask and size calc */
+#define create_zero_mask(bits) (bits)
+
+static inline unsigned long zero_bytemask(unsigned long bits)
+{
+ bits = (bits - 1) & ~bits;
+ return bits >> 7;
+}
+
+#define find_zero(bits) (__ffs(bits) >> 3)
+
+#else
+
+/* Create the final mask for both bytemask and size */
static inline unsigned long create_zero_mask(unsigned long bits)
{
bits = (bits - 1) & ~bits;
@@ -66,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits)
/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
static inline unsigned long find_zero(unsigned long mask)
{
- return count_masked_bytes(mask);
+ /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+ long a = (0x0ff0001+mask) >> 23;
+ /* Fix the 1 for 00 case */
+ return a & mask;
}
+#endif
+
/*
* Load an unaligned word from kernel space.
*
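For context, this is roughly how a caller chains these helpers together, in the style of the kernel's strlen()/strnlen() implementations in lib/string.c. The sketch below is an approximation for illustration, not code from this patch; it assumes <asm/word-at-a-time.h> is included and that the source buffer is word-aligned and safely readable a word at a time.

#include <linux/types.h>
#include <asm/word-at-a-time.h>

static inline size_t word_at_a_time_strlen(const char *src)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	size_t res = 0;

	for (;;) {
		unsigned long v = *(const unsigned long *)(src + res);
		unsigned long bits;

		if (has_zero(v, &bits, &constants)) {
			bits = prep_zero_mask(v, bits, &constants);
			bits = create_zero_mask(bits);	/* identity on x86-64 after this patch */
			return res + find_zero(bits);
		}
		res += sizeof(unsigned long);
	}
}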