Btrfs: heuristic: add Shannon entropy calculation

Byte distribution check in heuristic will filter edge data cases and some time fail to classify input data. Let's fix that by adding Shannon entropy calculation, that will cover classification of most other data types. As Shannon entropy needs log2 with some precision to work, let's use ilog2(N) and for increased precision, by do ilog2(pow(N, 4)). Shannon entropy has been slightly changed to avoid signed numbers and division. The calculation is direct by the formula, successor of precalculated table or chains of if-else. The accuracy errors of ilog2 are compensated by @ENTROPY_LVL_ACEPTABLE 70 -> 65 @ENTROPY_LVL_HIGH 85 -> 80 Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com> Reviewed-by: David Sterba <dsterba@suse.com> [ update comments ] Signed-off-by: David Sterba <dsterba@suse.com>
author: Timofey Titovets <nefelim4ag@gmail.com> 2017-10-08 16:11:59 +0300
committer: David Sterba <dsterba@suse.com> 2017-11-01 20:45:36 +0100
commit: 19562430c6213925faba3baa9e8cb224ddd47ee6 (patch)
tree: 46dd57e47b8378576f60e777887ca7bac729a9c9 /fs
parent: 858177d38d4681dad6efc015b99e4c786a34aca5 (diff)
1 files changed, 84 insertions, 1 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c551d8a979f4..b35ce16b3df3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
 #include <linux/sort.h>
+#include <linux/log2.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -1223,6 +1224,59 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
 	return 1;
 }
 
+/*
+ * Shannon Entropy calculation
+ *
+ * Pure byte distribution analysis fails to determine compressiability of data.
+ * Try calculating entropy to estimate the average minimum number of bits
+ * needed to encode the sampled data.
+ *
+ * For convenience, return the percentage of needed bits, instead of amount of
+ * bits directly.
+ *
+ * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
+ *			    and can be compressible with high probability
+ *
+ * @ENTROPY_LVL_HIGH - data are not compressible with high probability
+ *
+ * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
+ */
+#define ENTROPY_LVL_ACEPTABLE		(65)
+#define ENTROPY_LVL_HIGH		(80)
+
+/*
+ * For increasead precision in shannon_entropy calculation,
+ * let's do pow(n, M) to save more digits after comma:
+ *
+ * - maximum int bit length is 64
+ * - ilog2(MAX_SAMPLE_SIZE)	-> 13
+ * - 13 * 4 = 52 < 64		-> M = 4
+ *
+ * So use pow(n, 4).
+ */
+static inline u32 ilog2_w(u64 n)
+{
+	return ilog2(n * n * n * n);
+}
+
+static u32 shannon_entropy(struct heuristic_ws *ws)
+{
+	const u32 entropy_max = 8 * ilog2_w(2);
+	u32 entropy_sum = 0;
+	u32 p, p_base, sz_base;
+	u32 i;
+
+	sz_base = ilog2_w(ws->sample_size);
+	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
+		p = ws->bucket[i].count;
+		p_base = ilog2_w(p);
+		entropy_sum += p * (sz_base - p_base);
+	}
+
+	entropy_sum /= ws->sample_size;
+	return entropy_sum * 100 / entropy_max;
+}
+
 /* Compare buckets by size, ascending */
 static int bucket_comp_rev(const void *lv, const void *rv)
 {
@@ -1396,7 +1450,7 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
 	struct heuristic_ws *ws;
 	u32 i;
 	u8 byte;
-	int ret = 1;
+	int ret = 0;
 
 	ws = list_entry(ws_list, struct heuristic_ws, list);
 
@@ -1431,6 +1485,35 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
 		goto out;
 	}
 
+	i = shannon_entropy(ws);
+	if (i <= ENTROPY_LVL_ACEPTABLE) {
+		ret = 4;
+		goto out;
+	}
+
+	/*
+	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
+	 * needed to give green light to compression.
+	 *
+	 * For now just assume that compression at that level is not worth the
+	 * resources because:
+	 *
+	 * 1. it is possible to defrag the data later
+	 *
+	 * 2. the data would turn out to be hardly compressible, eg. 150 byte
+	 * values, every bucket has counter at level ~54. The heuristic would
+	 * be confused. This can happen when data have some internal repeated
+	 * patterns like "abbacbbc...". This can be detected by analyzing
+	 * pairs of bytes, which is too costly.
+	 */
+	if (i < ENTROPY_LVL_HIGH) {
+		ret = 5;
+		goto out;
+	} else {
+		ret = 0;
+		goto out;
+	}
+
 out:
 	__free_workspace(0, ws_list, true);
 	return ret;
author	Timofey Titovets <nefelim4ag@gmail.com>	2017-10-08 16:11:59 +0300
committer	David Sterba <dsterba@suse.com>	2017-11-01 20:45:36 +0100
commit	19562430c6213925faba3baa9e8cb224ddd47ee6 (patch)
tree	46dd57e47b8378576f60e777887ca7bac729a9c9 /fs
parent	858177d38d4681dad6efc015b99e4c786a34aca5 (diff)