From ae57258dfd95460f0b3e25624536633e17f65c87 Mon Sep 17 00:00:00 2001
From: Nick Cleaton <nick@cleaton.net>
Date: Sat, 11 Jan 2020 14:58:48 +0000
Subject: [PATCH 1/5] Add input byte permutation for the buzhash chunker

https://github.com/borgbackup/borg/issues/3687
---
 src/borg/_chunker.c                |  9 +++---
 src/borg/archive.py                |  7 ++--
 src/borg/chunker.pyx               | 30 +++++++++++++----
 src/borg/crypto/key.py             | 43 +++++++++++++++++++++++-
 src/borg/item.pyx                  |  4 ++-
 src/borg/testsuite/chunker.py      | 52 +++++++++++++++++++++++-------
 src/borg/testsuite/chunker_slow.py | 43 ++++++++++++++----------
 7 files changed, 143 insertions(+), 45 deletions(-)

diff --git a/src/borg/_chunker.c b/src/borg/_chunker.c
index 75599c5b15..45e1f1cb55 100644
--- a/src/borg/_chunker.c
+++ b/src/borg/_chunker.c
@@ -68,13 +68,13 @@ static uint32_t table_base[] =
 size_t pagemask;
 
 static uint32_t *
-buzhash_init_table(uint32_t seed)
+buzhash_init_table(uint32_t seed, unsigned char *permutation)
 {
     int i;
     uint32_t *table = malloc(1024);
     for(i = 0; i < 256; i++)
     {
-        table[i] = table_base[i] ^ seed;
+        table[i] = table_base[permutation[i]] ^ seed;
     }
     return table;
 }
@@ -112,13 +112,14 @@ typedef struct {
 } Chunker;
 
 static Chunker *
-chunker_init(size_t window_size, uint32_t chunk_mask, size_t min_size, size_t max_size, uint32_t seed)
+chunker_init(size_t window_size, uint32_t chunk_mask, size_t min_size, size_t max_size, uint32_t seed,
+             unsigned char *permutation)
 {
     Chunker *c = calloc(sizeof(Chunker), 1);
     c->window_size = window_size;
     c->chunk_mask = chunk_mask;
     c->min_size = min_size;
-    c->table = buzhash_init_table(seed);
+    c->table = buzhash_init_table(seed, permutation);
     c->buf_size = max_size;
     c->data = malloc(c->buf_size);
     c->fh = -1;
diff --git a/src/borg/archive.py b/src/borg/archive.py
index fa0c7d7e63..dfeb3f040e 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -320,7 +320,7 @@ def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS):
         self.packer = msgpack.Packer()
         self.chunks = []
         self.key = key
-        self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed)
+        self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed, permutation=self.key.chunk_permutation)
 
     def add(self, item):
         self.buffer.write(self.packer.pack(item.as_dict()))
@@ -1178,7 +1178,7 @@ def __init__(self, *, metadata_collector, cache, key,
         self.hard_links = {}
         self.stats = Statistics()  # threading: done by cache (including progress)
         self.cwd = os.getcwd()
-        self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
+        self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, permutation=key.chunk_permutation)
 
     @contextmanager
     def create_helper(self, path, st, status=None, hardlinkable=True):
@@ -2102,7 +2102,8 @@ def create_target(self, archive, target_name=None):
             cache=self.cache, key=self.key,
             add_item=target.add_item, write_checkpoint=target.write_checkpoint,
             checkpoint_interval=self.checkpoint_interval, rechunkify=target.recreate_rechunkify).process_file_chunks
-        target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed)
+        target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed,
+                                     permutation=self.key.chunk_permutation)
         return target
 
     def create_target_archive(self, name):
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 68f9c010e2..89f4b8c357 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -10,14 +10,19 @@ cdef extern from "_chunker.c":
     ctypedef int uint32_t
     ctypedef struct _Chunker "Chunker":
         pass
-    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
+    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size,
+                           uint32_t seed, unsigned char *permutation)
     void chunker_set_fd(_Chunker *chunker, object f, int fd)
     void chunker_free(_Chunker *chunker)
     object chunker_process(_Chunker *chunker)
-    uint32_t *buzhash_init_table(uint32_t seed)
+    uint32_t *buzhash_init_table(uint32_t seed, unsigned char *permutation)
     uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
     uint32_t c_buzhash_update  "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
 
+# The identity permutation of the input bytes (every byte maps to itself),
+# used to maintain backward compatibility with interfaces defined before
+# input byte permutations were introduced.
+null_permutation = bytes(range(256))
 
 class ChunkerFixed:
     """
@@ -94,13 +99,14 @@ cdef class Chunker:
     """
     cdef _Chunker *chunker
 
-    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
+    def __cinit__(self, int seed, unsigned char *permutation, int chunk_min_exp, int chunk_max_exp,
+                  int hash_mask_bits, int hash_window_size):
         min_size = 1 << chunk_min_exp
         max_size = 1 << chunk_max_exp
         # see chunker_process, first while loop condition, first term must be able to get True:
         assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
         hash_mask = (1 << hash_mask_bits) - 1
-        self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
+        self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff, permutation)
 
     def chunkify(self, fd, fh=-1):
         """
@@ -127,7 +133,8 @@ cdef class Chunker:
 def get_chunker(algo, *params, **kw):
     if algo == 'buzhash':
         seed = kw['seed']
-        return Chunker(seed, *params)
+        perm = kw.get('permutation') or null_permutation
+        return Chunker(seed, perm, *params)
     if algo == 'fixed':
         return ChunkerFixed(*params)
     raise TypeError('unsupported chunker algo %r' % algo)
@@ -143,17 +150,26 @@ def max_chunk_size(algo, *params):
 
 
 def buzhash(data, unsigned long seed):
+    return buzhash_perm(data, seed, null_permutation)
+
+
+def buzhash_perm(data, unsigned long seed, unsigned char *permutation):
     cdef uint32_t *table
     cdef uint32_t sum
-    table = buzhash_init_table(seed & 0xffffffff)
+    table = buzhash_init_table(seed & 0xffffffff, permutation)
     sum = c_buzhash(<const unsigned char *> data, len(data), table)
     free(table)
     return sum
 
 
 def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
+    return buzhash_update_perm(sum, remove, add, len, seed, null_permutation)
+
+
+def buzhash_update_perm(uint32_t sum, unsigned char remove, unsigned char add, size_t len,
+                        unsigned long seed, unsigned char *permutation):
     cdef uint32_t *table
-    table = buzhash_init_table(seed & 0xffffffff)
+    table = buzhash_init_table(seed & 0xffffffff, permutation)
     sum = c_buzhash_update(sum, remove, add, len, table)
     free(table)
     return sum
diff --git a/src/borg/crypto/key.py b/src/borg/crypto/key.py
index e263add8de..a4644bcaf4 100644
--- a/src/borg/crypto/key.py
+++ b/src/borg/crypto/key.py
@@ -161,6 +161,10 @@ class KeyBase:
     # type: int
     chunk_seed = None
 
+    # The input byte permutation for the buzhash chunker
+    # type: bytes
+    chunk_permutation = None
+
     # Whether this *particular instance* is encrypted from a practical point of view,
     # i.e. when it's using encryption with a empty passphrase, then
     # that may be *technically* called encryption, but for all intents and purposes
@@ -266,6 +270,7 @@ class PlaintextKey(KeyBase):
     STORAGE = KeyBlobStorage.NO_STORAGE
 
     chunk_seed = 0
+    chunk_permutation = None
     logically_encrypted = False
 
     def __init__(self, repository):
@@ -345,6 +350,37 @@ def id_hash(self, data):
         return hmac_sha256(self.id_key, data)
 
 
+def _derive_byte_permutation(key_material):
+    """
+    Derive a 256-byte permutation table from the key material
+
+    There are 256! possible permutations of a byte-indexed table, and
+    we want to make an unbiased choice. Since 256! is just under 2^1684
+    (its leading hex digits are 0xFF578F...) we derive 1684 pseudorandom
+    bits from the key material and treat them as one large integer. There
+    is only about a 1 in 390 chance that it is >= 256!; then we try again.
+    """
+    for attempt in range(10):
+        context = b"chunker input byte permutation, attempt %d" % attempt
+        key = hkdf_hmac_sha512(key_material, None, context, 211)
+        pool = int.from_bytes(key, "big")
+        pool >>= 4  # 211 bytes is 1688 bits, 4 bits more than we want
+        perm = list(range(256))
+        for i in range(256):
+            pool, offset = divmod(pool, 256-i)
+            j = i + offset
+            tmp = perm[i]
+            perm[i] = perm[j]
+            perm[j] = tmp
+
+        if pool == 0:
+            # the pool value was less than 256!, we have an unbiased choice
+            return bytes(perm)
+
+    # we're very unlikely to fall through to here. Just accept the biased permutation
+    return bytes(perm)
+
+
 class AESKeyBase(KeyBase):
     """
     Common base class shared by KeyfileKey and PassphraseKey
@@ -388,7 +424,7 @@ def decrypt(self, id, data, decompress=True):
 
     def init_from_random_data(self, data=None):
         if data is None:
-            data = os.urandom(100)
+            data = os.urandom(132)
         self.enc_key = data[0:32]
         self.enc_hmac_key = data[32:64]
         self.id_key = data[64:96]
@@ -396,6 +432,9 @@ def init_from_random_data(self, data=None):
         # Convert to signed int32
         if self.chunk_seed & 0x80000000:
             self.chunk_seed = self.chunk_seed - 0xffffffff - 1
+        if len(data) >= 132:
+            chunk_key = data[100:132]
+            self.chunk_permutation = _derive_byte_permutation(chunk_key)
 
     def init_ciphers(self, manifest_data=None):
         self.cipher = self.CIPHERSUITE(mac_key=self.enc_hmac_key, enc_key=self.enc_key, header_len=1, aad_offset=1)
@@ -620,6 +659,7 @@ def _load(self, key_data, passphrase):
             self.enc_hmac_key = key.enc_hmac_key
             self.id_key = key.id_key
             self.chunk_seed = key.chunk_seed
+            self.chunk_permutation = key.get('chunk_permutation')
             self.tam_required = key.get('tam_required', tam_required(self.repository))
             return True
         return False
@@ -660,6 +700,7 @@ def _save(self, passphrase):
             enc_hmac_key=self.enc_hmac_key,
             id_key=self.id_key,
             chunk_seed=self.chunk_seed,
+            chunk_permutation=self.chunk_permutation,
             tam_required=self.tam_required,
         )
         data = self.encrypt_key_file(msgpack.packb(key.as_dict()), passphrase)
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index 7a3c4a8ac7..bd9b5e9f63 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -318,7 +318,8 @@ class Key(PropDict):
     If a Key shall be serialized, give as_dict() method output to msgpack packer.
     """
 
-    VALID_KEYS = {'version', 'repository_id', 'enc_key', 'enc_hmac_key', 'id_key', 'chunk_seed', 'tam_required'}  # str-typed keys
+    VALID_KEYS = {'version', 'repository_id', 'enc_key', 'enc_hmac_key', 'id_key', 'chunk_seed',
+                  'chunk_permutation', 'tam_required'}  # str-typed keys
 
     __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
 
@@ -328,6 +329,7 @@ class Key(PropDict):
     enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes)
     id_key = PropDict._make_property('id_key', bytes)
     chunk_seed = PropDict._make_property('chunk_seed', int)
+    chunk_permutation = PropDict._make_property('chunk_permutation', bytes)
     tam_required = PropDict._make_property('tam_required', bool)
 
 
diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py
index c49e5be03c..a33f6ac46e 100644
--- a/src/borg/testsuite/chunker.py
+++ b/src/borg/testsuite/chunker.py
@@ -1,6 +1,6 @@
 from io import BytesIO
 
-from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_update
+from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_perm, buzhash_update, buzhash_update_perm
 from ..constants import *  # NOQA
 from . import BaseTestCase
 
@@ -8,6 +8,18 @@
 #       See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
 
 
+null_permutation = bytes(range(256))
+
+
+def permutation_invert_case():
+    perm = list(range(256))
+    for up in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+        low = up.lower()
+        perm[ord(low)] = ord(up)
+        perm[ord(up)] = ord(low)
+    return bytes(perm)
+
+
 class ChunkerFixedTestCase(BaseTestCase):
 
     def test_chunkify_just_blocks(self):
@@ -26,20 +38,21 @@ def test_chunkify_header_and_blocks(self):
 class ChunkerTestCase(BaseTestCase):
 
     def test_chunkify(self):
+        np = null_permutation
         data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
-        parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
+        parts = [bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
+        self.assert_equal([bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(1, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
 
     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
@@ -48,6 +61,21 @@ def test_buzhash(self):
         # Test with more than 31 bytes to make sure our barrel_shift macro works correctly
         self.assert_equal(buzhash(b'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz', 0), 566521248)
 
+    def test_permutation(self):
+        p = permutation_invert_case()
+
+        # a non-null permutation should spoil these test cases copied from the methods above
+        self.assert_not_equal([bytes(c) for c in Chunker(2, p, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
+        self.assert_not_equal(buzhash_perm(b'abcdefghijklmnop', 0, p), 3795437769)
+
+        # inverting the case of the input should compensate for the permutation
+        self.assert_equal([bytes(c) for c in Chunker(0, p, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'FOOBARBOOBAZ' * 3))], [b'FOOBA', b'RBOOBAZ', b'FOOBA', b'RBOOBAZ', b'FOOBA', b'RBOOBAZ'])
+        self.assert_equal([bytes(c) for c in Chunker(2, p, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'FOOBARBOOBAZ' * 3))], [b'FOOBARBOOBAZ', b'FOOBARBOOBAZ', b'FOOBARBOOBAZ'])
+        self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 0, p), 3795437769)
+        self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 1, p), 3795400502)
+        self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 1, p),
+                          buzhash_update_perm(buzhash_perm(b'xABCDEFGHIJKLMNO', 1, p), ord('x'), ord('P'), 16, 1, p))
+
     def test_small_reads(self):
         class SmallReadFile:
             input = b'a' * (20 + 1)
diff --git a/src/borg/testsuite/chunker_slow.py b/src/borg/testsuite/chunker_slow.py
index 2739a735ad..6892dccabf 100644
--- a/src/borg/testsuite/chunker_slow.py
+++ b/src/borg/testsuite/chunker_slow.py
@@ -20,20 +20,29 @@ def twist(size):
 
         data = twist(100000)
 
-        runs = []
-        for winsize in (65, 129, HASH_WINDOW_SIZE, 7351):
-            for minexp in (4, 6, 7, 11, 12):
-                for maxexp in (15, 17):
-                    if minexp >= maxexp:
-                        continue
-                    for maskbits in (4, 7, 10, 12):
-                        for seed in (1849058162, 1234567653):
-                            fh = BytesIO(data)
-                            chunker = Chunker(seed, minexp, maxexp, maskbits, winsize)
-                            chunks = [blake2b_256(b'', c) for c in chunker.chunkify(fh, -1)]
-                            runs.append(blake2b_256(b'', b''.join(chunks)))
-
-        # The "correct" hash below matches the existing chunker behavior.
-        # Future chunker optimisations must not change this, or existing repos will bloat.
-        overall_hash = blake2b_256(b'', b''.join(runs))
-        self.assert_equal(overall_hash, unhexlify("b559b0ac8df8daaa221201d018815114241ea5c6609d98913cd2246a702af4e3"))
+        null_permutation = bytes(range(256))
+        reverse_permutation = bytes(reversed(range(256)))
+
+        # The hashes below match the existing chunker behavior. Future chunker optimisations
+        # must not change this, or existing repos will bloat.
+        tests = ( (null_permutation,
+                   unhexlify("b559b0ac8df8daaa221201d018815114241ea5c6609d98913cd2246a702af4e3")),
+                  (reverse_permutation,
+                   unhexlify("6e56c9a94c29b4564c158131914ab21b34e6897002b38e71b0843be68158c00f")))
+
+        for permutation, expected_result in tests:
+            runs = []
+            for winsize in (65, 129, HASH_WINDOW_SIZE, 7351):
+                for minexp in (4, 6, 7, 11, 12):
+                    for maxexp in (15, 17):
+                        if minexp >= maxexp:
+                            continue
+                        for maskbits in (4, 7, 10, 12):
+                            for seed in (1849058162, 1234567653):
+                                fh = BytesIO(data)
+                                chunker = Chunker(seed, permutation, minexp, maxexp, maskbits, winsize)
+                                chunks = [blake2b_256(b'', c) for c in chunker.chunkify(fh, -1)]
+                                runs.append(blake2b_256(b'', b''.join(chunks)))
+
+            overall_hash = blake2b_256(b'', b''.join(runs))
+            self.assert_equal(overall_hash, expected_result)

From 5691a5e8a3b634599b8eb159c59fe0465ef2a5e4 Mon Sep 17 00:00:00 2001
From: Nick Cleaton <nick@cleaton.net>
Date: Sun, 19 Jan 2020 13:19:30 +0000
Subject: [PATCH 2/5] update docs for chunker input permutation

---
 docs/internals/data-structures.rst | 8 ++++++--
 docs/internals/security.rst        | 4 ++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst
index caaf758108..24fdc10c0f 100644
--- a/docs/internals/data-structures.rst
+++ b/docs/internals/data-structures.rst
@@ -624,8 +624,9 @@ can be used to tune the chunker parameters, the default is:
 - HASH_MASK_BITS = 21 (target chunk size ~= 2^21 B = 2 MiB)
 - HASH_WINDOW_SIZE = 4095 [B] (`0xFFF`)
 
-The buzhash table is altered by XORing it with a seed randomly generated once
-for the repository, and stored encrypted in the keyfile. This is to prevent
+The buzhash table is altered by XORing it with a seed and shuffling its
+elements. The XOR seed and shuffle pattern are randomly generated once for
+the repository, and stored encrypted in the keyfile. This is to prevent
 chunk size based fingerprinting attacks on your encrypted repo contents (to
 guess what files you have based on a specific set of chunk sizes).
 
@@ -901,6 +902,9 @@ id_key
 chunk_seed
   the seed for the buzhash chunking table (signed 32 bit integer)
 
+chunk_permutation
+  the permutation for shuffling the buzhash table (256 bytes)
+
 These fields are packed using msgpack_. The utf-8 encoded passphrase
 is processed with PBKDF2_ (SHA256_, 100000 iterations, random 256 bit salt)
 to derive a 256 bit key encryption key (KEK).
diff --git a/docs/internals/security.rst b/docs/internals/security.rst
index 72688bc45d..6cd1619268 100644
--- a/docs/internals/security.rst
+++ b/docs/internals/security.rst
@@ -407,8 +407,8 @@ buzhash chunker
 +++++++++++++++
 
 The buzhash chunker chunks according to the input data, the chunker's
-parameters and the secret chunker seed (which all influence the chunk boundary
-positions).
+parameters, the secret chunker seed and the secret input byte permutation
+(which all influence the chunk boundary positions).
 
 Small files below some specific threshold (default: 512 KiB) result in only one
 chunk (identical content / size as the original file), bigger files result in

From a038f213fd5af0493bb2c398a2ca5c3f4ab6f362 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 2 Apr 2020 18:13:33 +0200
Subject: [PATCH 3/5] fixup: adjust SELFTEST_COUNT

---
 src/borg/selftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/borg/selftest.py b/src/borg/selftest.py
index 80707c0cc2..49ad0fbf1c 100644
--- a/src/borg/selftest.py
+++ b/src/borg/selftest.py
@@ -30,7 +30,7 @@
     ChunkerTestCase,
 ]
 
-SELFTEST_COUNT = 37
+SELFTEST_COUNT = 38
 
 
 class SelfTestResult(TestResult):

From 3606c6732d9ba9c2938dc0f21b0cf60bc2e02aba Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 2 Apr 2020 18:42:37 +0200
Subject: [PATCH 4/5] fixup: fix pep8 issue

---
 src/borg/testsuite/chunker_slow.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/borg/testsuite/chunker_slow.py b/src/borg/testsuite/chunker_slow.py
index 6892dccabf..abc0fc04f2 100644
--- a/src/borg/testsuite/chunker_slow.py
+++ b/src/borg/testsuite/chunker_slow.py
@@ -25,10 +25,10 @@ def twist(size):
 
         # The hashes below match the existing chunker behavior. Future chunker optimisations
         # must not change this, or existing repos will bloat.
-        tests = ( (null_permutation,
-                   unhexlify("b559b0ac8df8daaa221201d018815114241ea5c6609d98913cd2246a702af4e3")),
-                  (reverse_permutation,
-                   unhexlify("6e56c9a94c29b4564c158131914ab21b34e6897002b38e71b0843be68158c00f")))
+        tests = ((null_permutation,
+                  unhexlify("b559b0ac8df8daaa221201d018815114241ea5c6609d98913cd2246a702af4e3")),
+                 (reverse_permutation,
+                  unhexlify("6e56c9a94c29b4564c158131914ab21b34e6897002b38e71b0843be68158c00f")))
 
         for permutation, expected_result in tests:
             runs = []

From 6647331268a8c69d321dd84d0db27fa33f685559 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 2 Apr 2020 19:06:44 +0200
Subject: [PATCH 5/5] fixup: python can do the swap in a 1-liner

---
 src/borg/crypto/key.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/borg/crypto/key.py b/src/borg/crypto/key.py
index a4644bcaf4..f689ef7439 100644
--- a/src/borg/crypto/key.py
+++ b/src/borg/crypto/key.py
@@ -369,9 +369,7 @@ def _derive_byte_permutation(key_material):
         for i in range(256):
             pool, offset = divmod(pool, 256-i)
             j = i + offset
-            tmp = perm[i]
-            perm[i] = perm[j]
-            perm[j] = tmp
+            perm[i], perm[j] = perm[j], perm[i]
 
         if pool == 0:
             # the pool value was less than 256!, we have an unbiased choice