Add input byte permutation for the buzhash chunker
ncleaton authored and ThomasWaldmann committed Dec 7, 2020
1 parent c2118f1 commit ae57258
Showing 7 changed files with 143 additions and 45 deletions.
9 changes: 5 additions & 4 deletions src/borg/_chunker.c
@@ -68,13 +68,13 @@ static uint32_t table_base[] =
size_t pagemask;

static uint32_t *
buzhash_init_table(uint32_t seed)
buzhash_init_table(uint32_t seed, unsigned char *permutation)
{
int i;
uint32_t *table = malloc(1024);
for(i = 0; i < 256; i++)
{
table[i] = table_base[i] ^ seed;
table[i] = table_base[permutation[i]] ^ seed;
}
return table;
}
@@ -112,13 +112,14 @@ typedef struct {
} Chunker;

static Chunker *
chunker_init(size_t window_size, uint32_t chunk_mask, size_t min_size, size_t max_size, uint32_t seed)
chunker_init(size_t window_size, uint32_t chunk_mask, size_t min_size, size_t max_size, uint32_t seed,
unsigned char *permutation)
{
Chunker *c = calloc(sizeof(Chunker), 1);
c->window_size = window_size;
c->chunk_mask = chunk_mask;
c->min_size = min_size;
c->table = buzhash_init_table(seed);
c->table = buzhash_init_table(seed, permutation);
c->buf_size = max_size;
c->data = malloc(c->buf_size);
c->fh = -1;
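The core of the C change is that the per-chunker hash table is now built by reading the 256-entry base table through a key-derived byte permutation. A minimal Python sketch of the same construction, assuming a 256-entry `table_base` sequence like the one in `_chunker.c`:

```python
def buzhash_init_table(seed: int, permutation: bytes, table_base) -> list:
    # Mirror of the patched C loop: each slot reads the base table through
    # the permutation before XOR-ing in the (32-bit) seed.
    return [table_base[permutation[i]] ^ (seed & 0xffffffff) for i in range(256)]
```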
7 changes: 4 additions & 3 deletions src/borg/archive.py
@@ -320,7 +320,7 @@ def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS):
self.packer = msgpack.Packer()
self.chunks = []
self.key = key
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed)
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed, permutation=self.key.chunk_permutation)

def add(self, item):
self.buffer.write(self.packer.pack(item.as_dict()))
@@ -1178,7 +1178,7 @@ def __init__(self, *, metadata_collector, cache, key,
self.hard_links = {}
self.stats = Statistics() # threading: done by cache (including progress)
self.cwd = os.getcwd()
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, permutation=key.chunk_permutation)

@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):
@@ -2102,7 +2102,8 @@ def create_target(self, archive, target_name=None):
cache=self.cache, key=self.key,
add_item=target.add_item, write_checkpoint=target.write_checkpoint,
checkpoint_interval=self.checkpoint_interval, rechunkify=target.recreate_rechunkify).process_file_chunks
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed)
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed,
permutation=self.key.chunk_permutation)
return target

def create_target_archive(self, name):
30 changes: 23 additions & 7 deletions src/borg/chunker.pyx
@@ -10,14 +10,19 @@ cdef extern from "_chunker.c":
ctypedef int uint32_t
ctypedef struct _Chunker "Chunker":
pass
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size,
uint32_t seed, unsigned char *permutation)
void chunker_set_fd(_Chunker *chunker, object f, int fd)
void chunker_free(_Chunker *chunker)
object chunker_process(_Chunker *chunker)
uint32_t *buzhash_init_table(uint32_t seed)
uint32_t *buzhash_init_table(uint32_t seed, unsigned char *permutation)
uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)

# The identity permutation of input bytes, useful for maintaining
# backward compatibility with interfaces defined before input byte
# permutations were introduced.
null_permutation = bytes(range(256))

class ChunkerFixed:
"""
@@ -94,13 +99,14 @@ cdef class Chunker:
"""
cdef _Chunker *chunker

def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
def __cinit__(self, int seed, unsigned char *permutation, int chunk_min_exp, int chunk_max_exp,
int hash_mask_bits, int hash_window_size):
min_size = 1 << chunk_min_exp
max_size = 1 << chunk_max_exp
# see chunker_process, first while loop condition, first term must be able to get True:
assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
hash_mask = (1 << hash_mask_bits) - 1
self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff, permutation)

def chunkify(self, fd, fh=-1):
"""
@@ -127,7 +133,8 @@ cdef class Chunker:
def get_chunker(algo, *params, **kw):
if algo == 'buzhash':
seed = kw['seed']
return Chunker(seed, *params)
perm = kw.get('permutation') or null_permutation
return Chunker(seed, perm, *params)
if algo == 'fixed':
return ChunkerFixed(*params)
raise TypeError('unsupported chunker algo %r' % algo)
@@ -143,17 +150,26 @@ def max_chunk_size(algo, *params):


def buzhash(data, unsigned long seed):
return buzhash_perm(data, seed, null_permutation)


def buzhash_perm(data, unsigned long seed, unsigned char *permutation):
cdef uint32_t *table
cdef uint32_t sum
table = buzhash_init_table(seed & 0xffffffff)
table = buzhash_init_table(seed & 0xffffffff, permutation)
sum = c_buzhash(<const unsigned char *> data, len(data), table)
free(table)
return sum


def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
return buzhash_update_perm(sum, remove, add, len, seed, null_permutation)


def buzhash_update_perm(uint32_t sum, unsigned char remove, unsigned char add, size_t len,
unsigned long seed, unsigned char *permutation):
cdef uint32_t *table
table = buzhash_init_table(seed & 0xffffffff)
table = buzhash_init_table(seed & 0xffffffff, permutation)
sum = c_buzhash_update(sum, remove, add, len, table)
free(table)
return sum
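
The `buzhash` and `buzhash_update` wrappers above delegate to the new `_perm` variants with the identity permutation, so callers that never pass a permutation keep getting the old hashes and the old chunk boundaries. A small usage sketch (the chunker parameters below are illustrative values only):

```python
from borg.chunker import buzhash, buzhash_perm, get_chunker

null_permutation = bytes(range(256))

# the identity permutation reproduces the legacy hash value
assert buzhash(b'abcdefghijklmnop', 0) == buzhash_perm(b'abcdefghijklmnop', 0, null_permutation)

# get_chunker falls back to the identity permutation when the kwarg is
# missing or None, per the `kw.get('permutation') or null_permutation` line
chunker = get_chunker('buzhash', 19, 23, 21, 4095, seed=0)
```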
43 changes: 42 additions & 1 deletion src/borg/crypto/key.py
@@ -161,6 +161,10 @@ class KeyBase:
# type: int
chunk_seed = None

# The input byte permutation for the buzhash chunker
# type: bytes
chunk_permutation = None

# Whether this *particular instance* is encrypted from a practical point of view,
# i.e. when it's using encryption with a empty passphrase, then
# that may be *technically* called encryption, but for all intents and purposes
@@ -266,6 +270,7 @@ class PlaintextKey(KeyBase):
STORAGE = KeyBlobStorage.NO_STORAGE

chunk_seed = 0
chunk_permutation = None
logically_encrypted = False

def __init__(self, repository):
@@ -345,6 +350,37 @@ def id_hash(self, data):
return hmac_sha256(self.id_key, data)


def _derive_byte_permutation(key_material):
"""
Derive a 256-byte permutation table from the key material

There are 256! possible permutations of a byte-indexed table, and
we want to make an unbiased choice. Since 256! is just under 2^1684
(it's 0xFF578F....) we derive 1684 pseudorandom bits from the key
material and treat it as a single large integer. There's only a 1 in
350 chance that this integer is >= 256!, in which case we try again.
"""
for attempt in range(10):
context = b"chunker input byte permutation, attempt %d" % attempt
key = hkdf_hmac_sha512(key_material, None, context, 211)
pool = int.from_bytes(key, "big")
pool >>= 4 # 211 bytes is 1688 bits, 4 bits more than we want
perm = list(range(256))
for i in range(256):
pool, offset = divmod(pool, 256-i)
j = i + offset
tmp = perm[i]
perm[i] = perm[j]
perm[j] = tmp

if pool == 0:
# the pool value was less than 256!, we have an unbiased choice
return bytes(perm)

# we're very unlikely to fall through to here. Just accept the biased permutation
return bytes(perm)


class AESKeyBase(KeyBase):
"""
Common base class shared by KeyfileKey and PassphraseKey
@@ -388,14 +424,17 @@ def decrypt(self, id, data, decompress=True):

def init_from_random_data(self, data=None):
if data is None:
data = os.urandom(100)
data = os.urandom(132)
self.enc_key = data[0:32]
self.enc_hmac_key = data[32:64]
self.id_key = data[64:96]
self.chunk_seed = bytes_to_int(data[96:100])
# Convert to signed int32
if self.chunk_seed & 0x80000000:
self.chunk_seed = self.chunk_seed - 0xffffffff - 1
if len(data) >= 132:
chunk_key = data[100:132]
self.chunk_permutation = _derive_byte_permutation(chunk_key)

def init_ciphers(self, manifest_data=None):
self.cipher = self.CIPHERSUITE(mac_key=self.enc_hmac_key, enc_key=self.enc_key, header_len=1, aad_offset=1)
@@ -620,6 +659,7 @@ def _load(self, key_data, passphrase):
self.enc_hmac_key = key.enc_hmac_key
self.id_key = key.id_key
self.chunk_seed = key.chunk_seed
self.chunk_permutation = key.get('chunk_permutation')
self.tam_required = key.get('tam_required', tam_required(self.repository))
return True
return False
@@ -660,6 +700,7 @@ def _save(self, passphrase):
enc_hmac_key=self.enc_hmac_key,
id_key=self.id_key,
chunk_seed=self.chunk_seed,
chunk_permutation=self.chunk_permutation,
tam_required=self.tam_required,
)
data = self.encrypt_key_file(msgpack.packb(key.as_dict()), passphrase)
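The `_derive_byte_permutation` docstring above estimates roughly a 1-in-350 chance that the 1684-bit pool exceeds 256! and the derivation has to retry. A rough sanity check of that figure, plus a simple validity test for a derived table; this is only a sketch, not part of the commit:

```python
import math

# retry probability: P(1684-bit pool >= 256!) = 1 - 256! / 2**1684
p_retry = 1 - math.factorial(256) / 2**1684
print(p_retry)  # a few thousandths, on the order of the quoted 1 in 350

def is_byte_permutation(table: bytes) -> bool:
    # a valid table contains every byte value 0..255 exactly once
    return len(table) == 256 and set(table) == set(range(256))
```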
4 changes: 3 additions & 1 deletion src/borg/item.pyx
@@ -318,7 +318,8 @@ class Key(PropDict):
If a Key shall be serialized, give as_dict() method output to msgpack packer.
"""

VALID_KEYS = {'version', 'repository_id', 'enc_key', 'enc_hmac_key', 'id_key', 'chunk_seed', 'tam_required'} # str-typed keys
VALID_KEYS = {'version', 'repository_id', 'enc_key', 'enc_hmac_key', 'id_key', 'chunk_seed',
'chunk_permutation', 'tam_required'} # str-typed keys

__slots__ = ("_dict", ) # avoid setting attributes not supported by properties

@@ -328,6 +329,7 @@ class Key(PropDict):
enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes)
id_key = PropDict._make_property('id_key', bytes)
chunk_seed = PropDict._make_property('chunk_seed', int)
chunk_permutation = PropDict._make_property('chunk_permutation', bytes)
tam_required = PropDict._make_property('tam_required', bool)


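Older key files (and the PlaintextKey class) simply have no `chunk_permutation` entry; the `.get()` call in `_load` then yields `None`, and `get_chunker` substitutes the identity permutation. A hypothetical dict-based sketch of that fallback path (the dict itself is illustrative, the names mirror the diff):

```python
old_key_dict = {'chunk_seed': 42}  # key material written before this commit

chunk_permutation = old_key_dict.get('chunk_permutation')  # -> None
perm = chunk_permutation or bytes(range(256))              # identity fallback
assert perm == bytes(range(256))
```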
52 changes: 40 additions & 12 deletions src/borg/testsuite/chunker.py
@@ -1,13 +1,25 @@
from io import BytesIO

from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_update
from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_perm, buzhash_update, buzhash_update_perm
from ..constants import * # NOQA
from . import BaseTestCase

# Note: these tests are part of the self test, do not use or import py.test functionality here.
# See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT


null_permutation = bytes(range(256))


def permutation_invert_case():
perm = list(range(256))
for up in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
low = up.lower()
perm[ord(low)] = ord(up)
perm[ord(up)] = ord(low)
return bytes(perm)


class ChunkerFixedTestCase(BaseTestCase):

def test_chunkify_just_blocks(self):
@@ -26,20 +38,21 @@ def test_chunkify_header_and_blocks(self):
class ChunkerTestCase(BaseTestCase):

def test_chunkify(self):
np = null_permutation
data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
parts = [bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
self.assert_equal(len(parts), 2)
self.assert_equal(b''.join(parts), data)
self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(1, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])

def test_buzhash(self):
self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
@@ -48,6 +61,21 @@ def test_buzhash(self):
# Test with more than 31 bytes to make sure our barrel_shift macro works correctly
self.assert_equal(buzhash(b'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz', 0), 566521248)

def test_permutation(self):
p = permutation_invert_case()

# a non-null permutation should spoil these test cases copied from the methods above
self.assert_not_equal([bytes(c) for c in Chunker(2, p, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
self.assert_not_equal(buzhash_perm(b'abcdefghijklmnop', 0, p), 3795437769)

# inverting the case of the input should compensate for the permutation
self.assert_equal([bytes(c) for c in Chunker(0, p, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'FOOBARBOOBAZ' * 3))], [b'FOOBA', b'RBOOBAZ', b'FOOBA', b'RBOOBAZ', b'FOOBA', b'RBOOBAZ'])
self.assert_equal([bytes(c) for c in Chunker(2, p, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'FOOBARBOOBAZ' * 3))], [b'FOOBARBOOBAZ', b'FOOBARBOOBAZ', b'FOOBARBOOBAZ'])
self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 0, p), 3795437769)
self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 1, p), 3795400502)
self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 1, p),
buzhash_update_perm(buzhash_perm(b'xABCDEFGHIJKLMNO', 1, p), ord('x'), ord('P'), 16, 1, p))

def test_small_reads(self):
class SmallReadFile:
input = b'a' * (20 + 1)
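The case-inversion test works because of a simple identity: with `table[i] = table_base[permutation[i]] ^ seed`, hashing data through the permuted table equals hashing the permutation-applied data through the identity table. A hedged sketch of that invariant (the helper below is ours, not part of the test suite):

```python
def apply_permutation(data: bytes, perm: bytes) -> bytes:
    # map every input byte through the 256-byte permutation table
    return bytes(perm[b] for b in data)

# For any data, seed and 256-byte permutation perm:
#   buzhash_perm(data, seed, perm) == buzhash(apply_permutation(data, perm), seed)
# which is why swapping the case of the input compensates for the
# upper/lower-case-swapping permutation in test_permutation above.
```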
43 changes: 26 additions & 17 deletions src/borg/testsuite/chunker_slow.py
@@ -20,20 +20,29 @@ def twist(size):

data = twist(100000)

runs = []
for winsize in (65, 129, HASH_WINDOW_SIZE, 7351):
for minexp in (4, 6, 7, 11, 12):
for maxexp in (15, 17):
if minexp >= maxexp:
continue
for maskbits in (4, 7, 10, 12):
for seed in (1849058162, 1234567653):
fh = BytesIO(data)
chunker = Chunker(seed, minexp, maxexp, maskbits, winsize)
chunks = [blake2b_256(b'', c) for c in chunker.chunkify(fh, -1)]
runs.append(blake2b_256(b'', b''.join(chunks)))

# The "correct" hash below matches the existing chunker behavior.
# Future chunker optimisations must not change this, or existing repos will bloat.
overall_hash = blake2b_256(b'', b''.join(runs))
self.assert_equal(overall_hash, unhexlify("b559b0ac8df8daaa221201d018815114241ea5c6609d98913cd2246a702af4e3"))
null_permutation = bytes(range(256))
reverse_permutation = bytes(reversed(range(256)))

# The hashes below match the existing chunker behavior. Future chunker optimisations
# must not change this, or existing repos will bloat.
tests = ( (null_permutation,
unhexlify("b559b0ac8df8daaa221201d018815114241ea5c6609d98913cd2246a702af4e3")),
(reverse_permutation,
unhexlify("6e56c9a94c29b4564c158131914ab21b34e6897002b38e71b0843be68158c00f")))

for permutation, expected_result in tests:
runs = []
for winsize in (65, 129, HASH_WINDOW_SIZE, 7351):
for minexp in (4, 6, 7, 11, 12):
for maxexp in (15, 17):
if minexp >= maxexp:
continue
for maskbits in (4, 7, 10, 12):
for seed in (1849058162, 1234567653):
fh = BytesIO(data)
chunker = Chunker(seed, permutation, minexp, maxexp, maskbits, winsize)
chunks = [blake2b_256(b'', c) for c in chunker.chunkify(fh, -1)]
runs.append(blake2b_256(b'', b''.join(chunks)))

overall_hash = blake2b_256(b'', b''.join(runs))
self.assert_equal(overall_hash, expected_result)
