From e25a9b7be91c92c8296ebc5446ee58c2cab01937 Mon Sep 17 00:00:00 2001 From: Pete Date: Tue, 28 Feb 2023 11:28:33 -0800 Subject: [PATCH] Implement a `Tokenizer` and `MemMapDataset` (#10) * Implement `Tokenizer` and `MMapDataset` * Support concatenation of `MemMapDataset`s * Add script for generating memmap file * minor improvements * smaller test fixtures * add test * Add validation final array * remove duplicate job * clean up progress * add "files" word * Add `Tokenizer.vocab_size()` method * add "-j/--workers" argument * clean up --- .gitattributes | 1 + .github/workflows/main.yml | 14 +-- CHANGELOG.md | 4 + Makefile | 2 +- conftest.py | 55 ++++++++++ dolma/aliases.py | 7 ++ dolma/data/__init__.py | 4 + dolma/data/memmap_dataset.py | 95 ++++++++++++++++++ dolma/data/tokenizer.py | 125 +++++++++++++++++++++++ dolma/util.py | 13 +++ requirements.txt | 4 + scripts/prepare_changelog.py | 2 +- scripts/prepare_memmap_dataset.py | 155 +++++++++++++++++++++++++++++ test_fixtures/c4-sample.01.json.gz | Bin 0 -> 7471 bytes test_fixtures/c4-sample.02.json.gz | Bin 0 -> 9889 bytes test_fixtures/c4-sample.03.json.gz | Bin 0 -> 8205 bytes tests/data/__init__.py | 0 tests/data/memmap_dataset_test.py | 55 ++++++++++ tests/data/tokenizer_test.py | 44 ++++++++ tests/hello_test.py | 2 - tests/util_test.py | 9 ++ 21 files changed, 581 insertions(+), 10 deletions(-) create mode 100644 .gitattributes create mode 100644 conftest.py create mode 100644 dolma/aliases.py create mode 100644 dolma/data/__init__.py create mode 100644 dolma/data/memmap_dataset.py create mode 100644 dolma/data/tokenizer.py create mode 100644 dolma/util.py create mode 100644 scripts/prepare_memmap_dataset.py create mode 100644 test_fixtures/c4-sample.01.json.gz create mode 100644 test_fixtures/c4-sample.02.json.gz create mode 100644 test_fixtures/c4-sample.03.json.gz create mode 100644 tests/data/__init__.py create mode 100644 tests/data/memmap_dataset_test.py create mode 100644 tests/data/tokenizer_test.py delete mode 100644 tests/hello_test.py create mode 100644 tests/util_test.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..47e403f03 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +test_fixtures/*.json.gz binary diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6cfd48d0d..d043de74e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,10 +17,11 @@ env: # Change this to invalidate existing cache. CACHE_PREFIX: v0 PYTHONPATH: ./ + TOKENIZERS_PARALLELISM: 'false' jobs: checks: - name: Python ${{ matrix.python }} - ${{ matrix.task.name }} + name: ${{ matrix.task.name }} (py ${{ matrix.python }}) runs-on: [ubuntu-latest] timeout-minutes: 15 strategy: @@ -38,11 +39,6 @@ jobs: name: Test run: pytest -v --color=yes --durations=5 tests/ - - python: '3.10' - task: - name: Lint - run: flake8 . - - python: '3.10' task: name: Type check @@ -62,6 +58,12 @@ jobs: isort --check . black --check . + - python: '3.10' + task: + name: Prepare mmap dataset + run: | + python scripts/prepare_memmap_dataset.py test_fixtures/*.json.gz -o /tmp/out.npy --validate + steps: - uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index a4af7e839..13dbc3676 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,3 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## Unreleased + +### Added + +- Added `Tokenizer` and `MemMapDataset` classes. diff --git a/Makefile b/Makefile index 8f39af955..cfbcfe76a 100644 --- a/Makefile +++ b/Makefile @@ -4,4 +4,4 @@ run-checks : black --check . flake8 . mypy . - CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ dolma/ + CUDA_VISIBLE_DEVICES='' pytest -v --color=yes tests/ diff --git a/conftest.py b/conftest.py new file mode 100644 index 000000000..8adb29d9a --- /dev/null +++ b/conftest.py @@ -0,0 +1,55 @@ +from typing import List + +import pytest + +from dolma.data.tokenizer import Tokenizer + +TEST_MODEL = "gpt2" + +LOREM_IPSUM_1 = """ +Lorem ipsum dolor sit amet, consectetur adipiscing elit, +sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. +Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip +ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit +esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat +non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +""" + +LOREM_IPSUM_2 = """ +Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque +laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi +architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia +voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores +eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem +ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius +modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. +Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit +laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure +reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, +vel illum qui dolorem eum fugiat quo voluptas nulla pariatur? +""" + + +@pytest.fixture(scope="module") +def pretrained_tokenizer_name() -> str: + return TEST_MODEL + + +@pytest.fixture(scope="function") +def tokenizer() -> Tokenizer: + return Tokenizer.from_pretrained(TEST_MODEL) + + +@pytest.fixture(scope="module") +def eos_token_id(tokenizer: Tokenizer) -> int: + return tokenizer.eos_token_id + + +@pytest.fixture(scope="module") +def lorem_ipsum() -> str: + return LOREM_IPSUM_1.replace("\n", " ").strip() + + +@pytest.fixture(scope="module") +def lorem_ipsum_docs() -> List[str]: + return [text.replace("\n", " ").strip() for text in (LOREM_IPSUM_1, LOREM_IPSUM_2)] diff --git a/dolma/aliases.py b/dolma/aliases.py new file mode 100644 index 000000000..dcf75db2b --- /dev/null +++ b/dolma/aliases.py @@ -0,0 +1,7 @@ +from os import PathLike +from typing import Union + +__all__ = ["PathOrStr"] + + +PathOrStr = Union[str, PathLike] diff --git a/dolma/data/__init__.py b/dolma/data/__init__.py new file mode 100644 index 000000000..84adeec6a --- /dev/null +++ b/dolma/data/__init__.py @@ -0,0 +1,4 @@ +from .memmap_dataset import MemMapDataset +from .tokenizer import Tokenizer, TruncationDirection + +__all__ = ["MemMapDataset", "Tokenizer", "TruncationDirection"] diff --git a/dolma/data/memmap_dataset.py b/dolma/data/memmap_dataset.py new file mode 100644 index 000000000..bfd881fd4 --- /dev/null +++ b/dolma/data/memmap_dataset.py @@ -0,0 +1,95 @@ +from typing import List, Optional, Tuple, cast + +import numpy as np +import torch +from torch.utils.data import Dataset + +from ..aliases import PathOrStr + +__all__ = ["MemMapDataset"] + + +class MemMapDataset(Dataset[torch.LongTensor]): + """ + A PyTorch :class:`~torch.utils.data.Dataset` backed by one or more numpy memory-mapped arrays + of token IDs. Token IDs are chunked together into contiguous blocks of ``chunk_size`` + to create instances. + + If the length of a memory-mapped array is not a multiple of ``chunk_size`` the + remainder of the tokens will be ignored. + + No special tokens are added to the input IDs so it's assumed that if you want + EOS tokens between documents, for example, those will already by in the memory-mapped array. + + :param paths: Paths to memory-mapped token arrays. + :param chunk_size: The number of tokens to chunk together into a single instance. + Generally this should correspond to your model's maximum input length. + :param memmap_dtype: The numpy datatype of the memory-mapped array. + """ + + def __init__(self, *paths: PathOrStr, chunk_size: int = 1024, memmap_dtype=np.uint16): + if not paths: + raise ValueError("At least one path is required") + self._memmap_paths = paths + self._chunk_size = chunk_size + self._mmaps: Optional[List[np.memmap]] = None + self._mmap_offsets: Optional[List[Tuple[int, int]]] = None + self._num_instances: Optional[int] = None + self.dtype = memmap_dtype + + @property + def memmaps(self) -> List[np.memmap]: + if self._mmaps is None: + self._mmaps = [] + for path in self._memmap_paths: + mmap = np.memmap(path, mode="r", dtype=self.dtype) + self._mmaps.append(mmap) + return self._mmaps + + @property + def offsets(self) -> List[Tuple[int, int]]: + if self._mmap_offsets is None: + start_offset = 0 + self._mmap_offsets = [] + for mmap in self.memmaps: + length = mmap.shape[0] // self._chunk_size + end_offset = start_offset + length + self._mmap_offsets.append((start_offset, end_offset)) + start_offset += length + return self._mmap_offsets + + def __len__(self) -> int: + if self._num_instances is None: + self._num_instances = self.offsets[-1][1] + return self._num_instances + + def __getitem__(self, index: int) -> torch.LongTensor: + pos_index = index if index >= 0 else len(self) + index + + # The index of the memmap array within 'self.memmaps' + memmap_index: Optional[int] = None + # The 'index' relative to the corresponding memmap array. + memmap_local_index: Optional[int] = None + for i, (offset_start, offset_end) in enumerate(self.offsets): + if offset_start <= pos_index < offset_end: + memmap_index = i + memmap_local_index = pos_index - offset_start + + if memmap_index is None or memmap_local_index is None: + raise IndexError(f"{index} is out of bounds for dataset of size {len(self)}") + + memmap = self.memmaps[memmap_index] + index_start = memmap_local_index * self._chunk_size + index_stop = (memmap_local_index + 1) * self._chunk_size + data = memmap[index_start:index_stop].astype(np.int_) + return cast(torch.LongTensor, torch.tensor(data, dtype=torch.long)) + + def __add__(self, other: "MemMapDataset") -> "MemMapDataset": + """ + Concatenate one :class:`MemMapDataset` with another. + """ + if not isinstance(other, MemMapDataset): + raise NotImplementedError(f"Expected another MemMapDataset but got {type(other)}") + return MemMapDataset( + *(self._memmap_paths + other._memmap_paths), chunk_size=self._chunk_size, memmap_dtype=self.dtype + ) diff --git a/dolma/data/tokenizer.py b/dolma/data/tokenizer.py new file mode 100644 index 000000000..0bcad0fb3 --- /dev/null +++ b/dolma/data/tokenizer.py @@ -0,0 +1,125 @@ +from contextlib import contextmanager +from typing import Generator, List, Optional, Union + +from tokenizers import Tokenizer as BaseTokenizer + +from ..util import StrEnum + +__all__ = ["Tokenizer", "TruncationDirection"] + + +class TruncationDirection(StrEnum): + right = "right" + left = "left" + + +class Tokenizer: + """ + A :class:`Tokenizer` is a light-weight wrapper around :class:`tokenizers.Tokenizer`. + + :param base_tokenizer: The :class:`tokenizers.Tokenizer` to use. + :param eos_token_id: The EOS token ID. If not set we default to using the last token + in the vocabulary, which is usually correct for GPT tokenizers. + :param truncate_to: Truncate when tokenizer to this number of token IDs. + :param truncate_direction: The direction to truncate in. "right" means truncate the tokens + on the right. "left" means truncate the tokens on the left. If ``truncate_to`` is null, + this setting has no effect. + """ + + def __init__( + self, + base_tokenizer: BaseTokenizer, + eos_token_id: Optional[int] = None, + truncate_to: Optional[int] = None, + truncate_direction: Union[str, TruncationDirection] = TruncationDirection.right, + ): + self.base_tokenizer = base_tokenizer + self.eos_token_id = eos_token_id if eos_token_id is not None else base_tokenizer.get_vocab_size() - 1 + self.truncate_to = truncate_to + self.truncate_direction = TruncationDirection(truncate_direction) + + @property + def vocab_size(self) -> int: + return self.base_tokenizer.get_vocab_size() + + @classmethod + def from_pretrained(cls, identifier: str, **kwargs) -> "Tokenizer": + """ + Initialize a tokenizer from a pretrained tokenizer on the HuggingFace Hub. + + :param identifier: The identifier of a model on the Hub that contains a + ``tokenizer.json`` file. + """ + base_tokenizer = BaseTokenizer.from_pretrained(identifier) + return cls(base_tokenizer, **kwargs) + + def add_special_tokens(self, input_ids: List[int]) -> List[int]: + """ + Add special tokens in-place (if not already present) to the given token IDs. + """ + if not input_ids or input_ids[-1] != self.eos_token_id: + input_ids.append(self.eos_token_id) + return input_ids + + def num_special_tokens_to_add(self, is_pair: bool = False) -> int: + return 2 if is_pair else 1 + + @contextmanager + def _truncation( + self, truncate_to: Optional[int], direction: Union[str, TruncationDirection] = TruncationDirection.right + ) -> Generator["Tokenizer", None, None]: + """ + A context manager to temporarily enable/disable truncation. + """ + truncation = self.base_tokenizer.truncation + + try: + if truncate_to is not None: + self.base_tokenizer.enable_truncation(truncate_to, direction=str(direction)) + else: + self.base_tokenizer.no_truncation() + yield self + finally: + if truncation is None: + self.base_tokenizer.no_truncation() + else: + self.base_tokenizer.enable_truncation(**truncation) + + def encode(self, input: str, add_special_tokens: bool = True) -> List[int]: + """ + Encode a string into token IDs. + """ + truncate_to = self.truncate_to + if truncate_to is not None and add_special_tokens: + truncate_to -= self.num_special_tokens_to_add(False) + + with self._truncation(truncate_to, direction=self.truncate_direction): + input_ids = self.base_tokenizer.encode(input).ids + + if add_special_tokens: + input_ids = self.add_special_tokens(input_ids) + + return input_ids + + def encode_batch(self, inputs: List[str], add_special_tokens: bool = True) -> List[List[int]]: + """ + Encode a batch of strings into token IDs. + """ + truncate_to = self.truncate_to + if truncate_to is not None and add_special_tokens: + truncate_to -= self.num_special_tokens_to_add(False) + + with self._truncation(truncate_to, direction=self.truncate_direction): + batch_encoding = self.base_tokenizer.encode_batch(inputs) + + all_input_ids = [] + for encoding in batch_encoding: + input_ids = encoding.ids + if add_special_tokens: + input_ids = self.add_special_tokens(input_ids) + all_input_ids.append(input_ids) + + return all_input_ids + + def decode(self, token_ids: List[int]) -> str: + return self.base_tokenizer.decode(token_ids) diff --git a/dolma/util.py b/dolma/util.py new file mode 100644 index 000000000..733d29434 --- /dev/null +++ b/dolma/util.py @@ -0,0 +1,13 @@ +from enum import Enum + +__all__ = ["StrEnum"] + + +class StrEnum(str, Enum): + """ + This is equivalent to Python's :class:`enum.StrEnum` since version 3.11. + We include this here for compatibility with older version of Python. + """ + + def __str__(self) -> str: + return self.value diff --git a/requirements.txt b/requirements.txt index 12c6d5d5e..cfb9338a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,5 @@ +numpy torch +tokenizers +click +rich diff --git a/scripts/prepare_changelog.py b/scripts/prepare_changelog.py index 163aface9..768fb5caf 100644 --- a/scripts/prepare_changelog.py +++ b/scripts/prepare_changelog.py @@ -4,7 +4,7 @@ from dolma.version import VERSION -def main(): +def main() -> None: changelog = Path("CHANGELOG.md") with changelog.open() as f: diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py new file mode 100644 index 000000000..f5f145dd5 --- /dev/null +++ b/scripts/prepare_memmap_dataset.py @@ -0,0 +1,155 @@ +""" +To test this, run: + +$ python scripts/prepare_memmap_dataset.py test_fixtures/*.json.gz -o /tmp/out.npy +""" + +import concurrent.futures +import gzip +import json +import os +from collections import defaultdict +from pathlib import Path +from typing import Dict, Generator, List, Optional, Tuple + +import click +import numpy as np +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + TaskProgressColumn, + TimeElapsedColumn, +) + +from dolma.data import Tokenizer + + +def get_progress() -> Progress: + return Progress( + "[progress.description]{task.description}", + MofNCompleteColumn(), + "files", + BarColumn(), + TaskProgressColumn(), + TimeElapsedColumn(), + ) + + +def tokenize_file(tokenizer: Tokenizer, path: Path) -> Generator[List[int], None, None]: + with gzip.open(path, "rt", encoding="UTF8") as f: + for line in f: + text = json.loads(line)["text"] + yield tokenizer.encode(text, add_special_tokens=True) + + +def count_tokens(tokenizer: Tokenizer, path: Path) -> Tuple[Path, int, int]: + num_tokens = 0 + num_docs = 0 + for token_ids in tokenize_file(tokenizer, path): + num_tokens += len(token_ids) + num_docs += 1 + return path, num_tokens, num_docs + + +def fill_memmap( + tokenizer: Tokenizer, path: Path, memmap_path: Path, num_tokens: int, offset: int, dtype: np.dtype +): + memmap = np.memmap(memmap_path, mode="r+", dtype=dtype, offset=offset * dtype.itemsize, shape=(num_tokens,)) + index = 0 + for token_ids in tokenize_file(tokenizer, path): + memmap[index : index + len(token_ids)] = token_ids + index += len(token_ids) + memmap.flush() + + +@click.command() +@click.argument( + "src", + nargs=-1, + type=click.Path(exists=True, dir_okay=False, path_type=Path), +) +@click.option( + "-o", + "--output", + type=click.Path(exists=False, dir_okay=False, path_type=Path), + help="Specify the output path.", + prompt="Output file", +) +@click.option( + "--tokenizer", "tokenizer_id", type=str, help="Name of path of a pretrained tokenizer", default="gpt2" +) +@click.option("--dtype", "dtype_str", default="uint16") +@click.option("--validate/--no-validate", default=False) +@click.option("-j", "--workers", "max_workers", type=int, default=None, help="Defaults to number of CPUs") +def main( + src: Tuple[Path], + output: Path, + tokenizer_id: str, + dtype_str: str, + validate: bool, + max_workers: Optional[int] = None, +): + tokenizer = Tokenizer.from_pretrained(tokenizer_id, truncate_to=None) + dtype = np.dtype(dtype_str) + dtype_max = np.iinfo(dtype).max + + # Tokenize all documents to determine how many tokens are in each file. + src_to_num_tokens: Dict[Path, int] = defaultdict(int) + total_docs = 0 + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = [] + for path in src: + future = executor.submit(count_tokens, tokenizer, path) + futures.append(future) + with get_progress() as progress: + for future in progress.track( + concurrent.futures.as_completed(futures), description="Counting tokens...", total=len(futures) + ): + path, num_tokens, num_docs = future.result() + src_to_num_tokens[path] = num_tokens + total_docs += num_docs + + total_tokens = sum(src_to_num_tokens.values()) + print(f"Counted {total_tokens:,d} tokens over {total_docs:,d} documents") + + # Initialize memmap file. + memmap = np.memmap(output, mode="w+", dtype=dtype, shape=(total_tokens,)) + if validate: + # Fill with max value so that we can check later that all values in the array + # have been populated with actual token IDs. + memmap[:] = dtype_max + memmap.flush() + del memmap + + # Now tokenizer all documents again and populate the memmap array. + # We do this in parallel. + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = [] + offset = 0 + for path in sorted(src): + future = executor.submit(fill_memmap, tokenizer, path, output, src_to_num_tokens[path], offset, dtype) + futures.append(future) + offset += src_to_num_tokens[path] + with get_progress() as progress: + for future in progress.track( + concurrent.futures.as_completed(futures), description="Filling memmap array...", total=len(futures) + ): + future.result() + + print(f"Done! File written to {output}") + + if validate: + print("Validating...") + memmap = np.memmap(output, mode="r", dtype=dtype, shape=(total_tokens,)) + # Should have an EOS token for every document. + assert (memmap == tokenizer.eos_token_id).sum() == total_docs + assert memmap[-1] == tokenizer.eos_token_id + # Make sure all entries have been filled with actual token IDs. + assert (memmap < tokenizer.vocab_size).all() + print("All good!") + + +if __name__ == "__main__": + os.environ["TOKENIZERS_PARALLELISM"] = "false" + main() diff --git a/test_fixtures/c4-sample.01.json.gz b/test_fixtures/c4-sample.01.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..e573e25525b3f96b012aee288d355b75197dc4c8 GIT binary patch literal 7471 zcmV+~9nj(*iwFqM{`X@517kETb75_8Y-KJmF)nIzZ*BmUTibFRM{<4lSCsf5k+Czl zkz9h%!x{uZ5)uG%3COe@e5vWKnJJ>XtGB8f!{LS_^bz~w2>ToS5&jc>iOw<3YTKCAGMLi4(%5@&Vgq>cx)cVeu)<#>q(ic2->WrJuL+sMT$oVFsK`t7iU-$+ht@VhI6dO;54I4;Q zV6W0a2-}H3sc?jZr7H$^8&5`-|;7nyI9CYOy=q9?_)iX%E4*iZf zGq6tz5u&Xe^k14LbS62RIjjtgZPVG~Y)7c8}%(UMD?z7ERQu1 zSSN^xMueE#+E;5)0Rw#S5!s@%H*n>OYh^_o$6`72X&G9TE}3^$zQ<}8P*3VYw!bXk zK4W!!Afe`C(buW7@aSYxA0^xvuNQDyjrd_vaZH@(II00t7yHFRKqf?U)4G zl6?A$`Cz+ZX{e@KU`&9jD?9}&&8g+kcj6orzkFBo3XH9tz?iHkpmui1JF22Vm=W4S3sZzDgZ!-lqU zBhCsm@!QY=c}yAoE#qEHEXT%SUaaxRS7H40k$EzCy0!qsC9bEPMjz zC;n_4pC?6#Z6E{I{;5Y?Ru-h1ac{u|P^^2vZiitUH7On!$!`XpO6Z7=R z1o}w#CJ^Fy7;?tWgUN$FR!BU{u4~iL?(S-}+PQ%~Dx7zxajUc|f22WoQ$W%b{xIdz zQ(1a?@bvKD$=(6hDsAVm*umcZ^XcA`>B0W>{^8NnCr5iv{_VlP|I^=xELXdf?1Ivy zHL$71iB(6MuiC1|m(!VL3eYDA`=Ot872Xu#Zvdvu0jlMd_*m(&YIKZU=nra!G7*PB z8gupoR)VFvr3RUgH0>vvmS*TXoI^0MM7-Fh0|f9%Ti)Uy$X)Q$fIsrC@G0>%JfYrA zY?NFe`8V~m+IO91z=id-@m+G&909ak1Kb7!biUR&*389^5l0SLf(abrl+bz=TGOq> z6Bo#L30~k4gk?Ys#;%vN)(w8uEg{%nJ~o#chEWkuylNtD2v$(wL{Qayj;{$5JLWY- z?2}W?zP|qXPY@+kHy&#ESx+qA{6pli?wJeDbG1A$K(JHfVP9@!Wc< zw?oYGTr;AQs^{mIr_9ul1!g#t1Kf<8NXiN3Z#~1sH@Bg>b){PObpvdIS6l@`InE@u zD?|lpv}3WR-ewhCEbDyuC&G1!W}ZVE+^-(|Fe z?i^Fy8VL%P|3D@;=<%xqO-$cN07*uRj1PND6BZUH5w_)gMj%nkr~!rp9Q$naNNZmI z0*cVKAbpk^nBYqh8+xvp(kz}We4=vfjHyBW)-`b3&`>K-E8>E2O8Q31AxTI4_J>5K zDXXd88r9NN4KG}mHjob$HB;Bm=ce-5`*r1KqBM9HG|>FenNK}X^sm=gfN&swjsu8x zYWoTK1S~EgsYN-BIu=~w45Y1-eDe_S@koMWqIp*gh84d!q@Ip>WKJ?3#E7JVwt}ZL zSWmUmCz`BY8zvI$0b`w+*f*5Gf9~jF@CrTNAp4er6BUFoL{yPJM2I9{xpFh$QCKC0 z_-kLcBCQAqylA0x)y%}WuGle5L3I2a;3-q0+N}idnkK9%=FAnAn?X@)Vhy7Jloski zT!VObzUV8UD?v9 ze+VFM9+~yNf}iY|UlB#r5j;KG#T0@}%ZS*>NUg3=Ll{2t%h^Okg*Owd7D)MWW?^lh zjHqdXGBw$bBI^!(I!RzDdP&ZL%H;J*&Wk7h_2PWnS;Qb#q94G@6yZJ>UN8d;&9Q3q zDeqmkwqcYaW)?g&eFR_PpOK7)+&WRwe45)x$f#h0MtMf-%O%7hClD3*`{%=_`5>poyiaj@0waT(2^<~A0 z!LgLB=v9)L>NTm#gpOJ>T}zOfhJQuEcRvb#V!_Ihmx?otUuHIhHLtPJd^ z_w1WAkrjj+z|efi@i;0A@=fLZKvwbGEhk`@;_dHPy)2gBktAr|=fcbliy)MGKE=kx z4VJAy$UqclexP=)O1XYN!_%EcE2R0$Hhkpc#HfrzU?bm0RxJ!67B|&;inQn)8w)cg z4`%M#iTz%9;8?u^M{m6X2z6gvd}J;hGA)ykbZiu6 zlcY2kQKhVA~1*V&$E+u66l@{73T9aZTwgx86T|@_Ztds8mf-dJn2oP4AdDB-q*%lPy>? zja@IHg8|Wi>S1EabzWuv@bFNxFp?gk*H&2yLE`A8d0}q>wl=dgiY~?G{^8>TIpVN{ z)RK7qt{vp1{llkwCP}OBt|HY4Qhkm_Sc>6Trh&Hb)AVCL+y~GqpWA|$Ad3N|@9)3; zLoLS_l1D&9e#Qe%%!%dajg>df@fQ_1Z?+PCG`A7Lu>KE4(YF~Qzx6~d)HjN!%24Ol z=41(;15bXlv!#CX(~fNV#x-#*916yViGLgqA?lKfeNN3&r-}Zcht+Qc;pf1ttV~l{ zRP`N$=!J{G)`>pX(5`i8x}_|UGxQ4ez>^!&!#>v1ymxIsV|HuU)nEo)(5VFH8{snSq4BWzJH#l2ag(G< zn=SzQ@p1+J$SR*ryp2ik01>E8+yrpt=h~IYF7yLK22K}dY?um_V)94PX@{(w2^D)w z3GN%JlntAxl;M@^o_(8J>S7{hy^5!jj`f_VL1*RzO)dp#AU01^U)NMD?0=(eveD8~ zd;iq4`v{^Z>_N?zV|-3_6Cn^1?+Z0k!qY@&Zy~5*)@zozJ^4iUi7AKx*X6a zfdpN6cwE3SzT`F@o8ml*ZfCyVWC&zCK%K*ide+S7*v5`*Pk8vV>?h~JNuFyVLftxP z)(h&|re~iaS58$=e`C96Jv8tlDhn^Y8rE2qc7*4BY&*sIcz#h5Gk{JCO?)Pl4<)i!>2o7A-P%_AnZQY zfZZRS(~I%pmq+8K-*SZ?MUd>AlQEravW+G;jiet(sfSRVjLB;M@Y%BrPXu^7BwK7X zCrS*`ZgR@G|MJDd3{(qt?iVNrtw?8kTZCamB+M!*Z@0a%Bl0c7feIUl_g@@ z*~&N&fW$4o%DapauhN}-Et(ZZu?>b~;@Sw|<%ISgQP-0#!ax8!Ehu>rg$2Zbb9UmA z*4~A78uDEh@Za6AtbL;e;zY3u)qPk^S>Swu$L7w|l51%f6@>?C8LtqSS(qdyDgqU| zf?lou)3M8kaE%%ej#bCz0S#*3gqTH2f`BWn=uu##E>nLeg^8r9k*aQyX%wF%&SUDi zMd%NBs_Ysv65W}x{vT;dTV|Cw78$xxetx%!j)EbrVcrE1znFHQuuWh?1IK~JcV|8n zwR*q~s;27O5@P@P_5S|R-m{~p2X{({(|iRNY zi(Y#VCiTwt4Rb#+dX9>NZ6;-fzcg|Mf_zV1y+QqiBG`iy`KA&ylAB#^aWdaXRVYb{ z>LpG~BSJg~shXU#=?08p0)V=5XGqt$(S~$wq-P1DRP_q}Bry)fWN32#XUWgU3owg% z7MQwRcgtEYUDl>^atHeK*)w=AaDF2fUe8rBJmS}y)iXC`_ zv~|RDlC~peEHh)SnDUlh@Ru!js&>%5G|DRHTA3P%;(e9+;Lvt~SF&xw-le9n+`N2K z*SVX_W}Q-CHk4MOLH-gRzm^ZAztQ~u+kbQ782pK1vRp1;mQyGz3AW$Y_t?AMJ2-m!>}db- z-M$B2>Stk2;-=C8TLh^wa+QwGv#Os`yt$10^>ecZ^V#cDQ^Z8B#EVbY}I%>l7 z13Sqbnh49KkmYZjoQ7$i=vXXWL7CX_K(a-s1m8%{frn!QotOg5l@Dc(M4W@S;s#V= ziL#0vejX%{5G1eC;2c=$-6egQ*P9blofg<2Rh48*S957&E=>bM<#nu!`>OKlW<-`GrvH#xy z`Wt;EMl{0i_~hgm)%!dC@d|&udVi6xaHZMURy||mcxW;nXh(ttc`kX}=a%|ubST2{ zf^e-wfK>@QJOU(xocDUtZirv|;slJl^5s{{%ZhpC{@B&B5>iaZ z9j8o3(^{+ZM~=c(P=da|`T;3@)@kEx{M%x|>-}@4P;^yhCNGSj_+6-3F>Q!zJo z%v720Lu;S<|M_pM@XF5mD!?nvB^I5dYEgHCB?)VsEXz64Ry*b~g^e1N?sEayl=8g; z`(|@~#lRPRt+mvaR&*vp^PctW{f;rS5sP7uX$OtK^wUL;3R4`#HE@g$EA2+d1E$?# zQ14W}n2vA%0qs0ExIQ>II@mwj-@9Y?vkv)AuX0>d4vIr0R^{aP-~LM{wz4i=Y{|7F zV&eDT{&VZinOm@W(!s$$cz-5S7;_Fvu@D-2FvYs4awledMD8c38Pg2HeZ5jX{&67pXR+7kjl$bnh_o-XnY>SbVkf{Qh7y*(9*^P6*}}6!-_q}-s|B@?J^8(Ij7bqeo z=Hr7)(6)R5eZx`B%_~YC^VyE&>gmZLcv`Q0gQPQvldVk-WPHix&X@c})c4Pw%uU&* zEob-1aUDGe3+W{IE2~fTU$o|Zx3sQAl)Ytx;(G&TU3mEL#oL$9_K=z0HabX92;<>p zyuN^RRPhe~{+t4E%;Iy-Pc%%0?d~1!&P9Jy{!_{WIoY%op{GbYf6%=+ygt}Fdi><; zLf}{vfkuMl1+MW1vMdJNlfjLHgi)BBFZlpW^&|`sQG|XmdZ;d(>NlM3i zjSQ}gj#<=8gN=BDGP%X5%7tUCfOM7aZ0mHyPqMpEHehDU*aO0KYcYWo+ynO>^aLNY ze1~EKFmqpVC&wyrhIT$Hd>zcm>089x8iDs3@FTZiE$20)ioYC??O2s675s6MDSJ7j z3=7{it!ow#rOeBBVQWZkytk(ztBUO!)|wp^v&c8J8Qc;lMR~a*^Zt0m-;BTaxWk~B zCs2*Wp||_e0(mE@0x^O3_@F8hZ`gOZJd9@RmX8FdK(s`s!#JE@Mz(nD7HMG^IX0v?+=N=WKkbN7v`-blq3I&KfKfMP zzY+)E&G9!ySw)hQrd(_4n<@C(bmmKxKvM?HbO@jOh{2QVJ>(%2HD6b&jJ^oz5H=lY^FJAGoGD-ROE0G;t ziBYJn9)7-LOPPKn_4Rdbc`@kTN@1ZpIsTbwjil|q%oeWxKOS&CV8)#r`q%H zfKtf6rr@oZ;DI*ZD7elP3}A; z)otF~7)R_$p|m9r(bzlMcr=e9ETpkY8`|X>>Xyk+!qt(V`wqBsyRI-OSXSZ&}vd7)BlBWOF z=Cs$NPQ_a(vZuF>ZC8#UHMaJ}WayqqNwB3bN+XldStihgo`dnMu>>Tc31Fb*FB2wm zjB%Gu^FzJ$jw;!X3vn!fQcoQ7ay;$l!s&P(ygpQXZrmJL&D23oJyHy*8|AIEeL=+~ z2=@6if38h4GBa9)E|Bbnw=8N^ywDob}KOigy5Wx6*bbLcSatr;6*hMQalf2X5Z%(LrOY?Y(~R$Kl&$87>N=~%s( zs*{XOb@(>N!$c;JW0&Nl9qQJ^5NNr%k?~guX%+fA4c2Pc35`?L>;%@2ELIW)Q1I1Cd`ToT!7U zB_yin`B>ZccboiGEF+XqNRG>ik?dE>*sW}qM~9EQuRD=RFB|0V=ezGWv&m^)BhE9k z(wv6{F5h0C9&Lsk^4z#w>CN<40s^o3_X!&?_$oFw{<(EW%Y4Hu&327sK$mXhN^l%$ t$U69DWWcX&w546A)+#H~<)=9*-$QQppI`53oh@CD{{^F&X2C)}008TW%*_A* literal 0 HcmV?d00001 diff --git a/test_fixtures/c4-sample.02.json.gz b/test_fixtures/c4-sample.02.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..3547cde469d31d30702e1f7a0a037b5203f9b8b5 GIT binary patch literal 9889 zcmV;SCSKVeiwFqM{`X@517kETb75_8Y-KJmGA?R!Z*BmUeA{juN4Dkrd__uw8QVZr zk(5MP(mWWHOk1>dgGgKGhF+pFqpDKO%*f6Q%jJUu^gqng05fj`%=}0HNq@<#wRS{S zQKA}W9&CtJnQ_^%_u6Y;qW{^`AL(#3{_S3LueKsn~!+43)`|;XG^@$XG7S!uE<GP!+x(vbC#y zcHM++g9o$PHP>ZREV8pc^RBCQnd`IDcxzRzM|_6q`<|0fTT&;YEC1k5E)LA*C zF|e}2=f3Yl*Uz-qu~~NE8WRqi&bwmT<3IoYKl-dH8}HUW$uq}xa0VC`L*=2aLz9K& zC!Y^IExb8&J8lnB#>Oj?qk$O70E*@!#Tuh3_DUe!rOC$7=0x$giMZ9BqKA&a&FOMg{??LZY18WEnRn zaXFmWT7#_04weD2P{VOJ+Ar5rFDnkS-M9ho$ZU962b#9XE}A&@P}W|mvCGCD`^W=S zWQ5F@63Bp)%dc(aXREHvog`Rc^_{)2q1gr*`!<9^n&Ubu6J%*~%EA}fbH6Sd?NG*d zUbnu3{hW>Ku^&d9O?=wt+{thVEj=vJ_dEyAPBrTKs5HYQCLM9sZgx;Ln11%MY}=4` ze3Prv@xVZa>|y{o;eq$Qab@2-o+`)w86Q_Z%nrtET{mE~)-}U{;>yOGFi6wfaQo!l zu&F#>>T9jGhbooY1uUw?H^dYi)Ti+V^A9F{9D1?Cq2pXmPQQO_OL*_w5oek6-QRsv zVrKh5w%ZiGPP#5Z&G2mlD3bkdXtdi+Ncbm_uK)(U7FE`5SN5T)fds%&e1gN+xVF#g zK)mg^!>yY@gL4TCv*k{oy}&jD*Y;vFcKHTVmH2<*p%-0TROQNHr`f`4#UrT%Utv=& zONuVXDXw=Eywr;eEu(HrmzR1v$1pWNH_$srLOda3|Rk|3o+n-D5blUcstW&Y$R)T!NU zn&;2MgQ4}6a%sthB^x|>@)T!XjGeg@SnmjIr9NVR`!&xF4Boc-w%P%r0QOw<##aKM zGXfjz$Z&EoDQj>3$Vr>BwE>Iq~&cDvcZ#Bw4SWmN@ivfm_}{H>7+T*|H&*^edxV*N6qL_a;dxLimAU33{3 zSF6H;`{bV|tudWVkAhY-ro;zJ-#~7aUk!%u2dXg{IE7hz!QCXjV5`9{yot&DNOUM% z7XGUPCx!5`2d6VIl9^bE#)NP-V_&*{=s(xNThfT?@$vr0<_%aDrFY2zI!*O+s8&h9 z(Wk@f0)l0?6I^jP)4eFfZRnsRGX36I0K8IE(nzx8D5jU}g$)T?M`SvyvP%FvL|j#n z3T8X>vx313C{R_cFdJ#4DG--}HF zrI9#q?^fEwz1q6QW+1eQU+W^=o zkv%I{U!C&vy-|lvqWrL-Y3U_Awe(5$+b}{Qmudf#o!^=R|NQE{7VwNMT%rE~(+R?4 zn1-=43qnc*+ucA{4e!fUWmf@+c2@a6Aa1O^D0^zJowwd9@JM)Ud2ZADlNq$pY zb7l(4zdzpJ^3X5jB0@v1$FVl$dh_u<6uvBMPN}lKQLqL^H0UA7>m`?d+weKqr?Hn$ z^9WHaa2WgKxqGp0t``q*e7Ye_&~L&v0tUTYWpv6macc*d_HrO;TkkTH079*DZNBS( zWiNn=wKw`&4&M-@hrtegQw}lcpqW8K0N$X2wnkZ}l&jm_vV|`W@=CM}^tP8-xvH0@ zi1m_oV?Z$?muwivS=_C0zxo?49iZS-fK)b#vqMlndbm8O2L(3E&Jf|4WBK4J?g@vY z9hIH@?ZOU@58!adeiXAZ*VyD61cp3;^nwAT7zn6!ki2TDBYYuAY?7FpLBw_ z1@k+IO$jpCHOK?cXcE2{8F-5)mz2AV7UdL^hCMJxlDLc+f@nY@LZT0jGnl}L|C7n- zS(92ax+QS6G#u(Z`RQj*{|B*c7J=~)&J1I=!L~{hhO4u%T8fXQ9UA#_KWqg}kQbN? z4&<-krJyJ5@`c|hNNV^QhJ(3d+>JI zbqPxK!RDj0Fbi|bO8@*61SVL?dK7mh>s2rb!-KUbxF9jCtVO&svToZvK>lrvieOy-2 zshD>9niM>Jcd%S8GFrel+@aDuL$rliR_`!sV-l6^oh}ZZkCHZN;0$te2wV&2!3p%) zw~(a}<~u1v9FQkce7=}j=KI#ipm$=j$tz)XFOy%#ycCop;|xWNUl{quY!$|Iy&^RR zm{Ql3FhQd-Ed*Wvct29vC39o_$NSWze&iB{L+db2hcA@|lUlBDbaud>#`W4VM{qX^ z#~P&%ZLjW{9}(yGyCB)1uEx;lVEfD|>_DSumWC3Sw!GaeGar=xit!V31YXPsJ7Xa7 z@Uea8|KJ`H4q6PHBN{KvgRcA*JM+-!`?LFFSKWVhe=`j2v!kQ!cDvXl8}2+kp6?cz z{3zPO(aEC+r%#Wx>N)o_FZ;RD)^kieXWDvRHuGtW#b&50%u+yyG0lVHlc)3JNAm}# zS0~5MPM$nFefZb=|Nfu;E#=0~!2B8l?i#fJqA5zJz>`VWvgd))$QfiQYh)WcLA2Nc zDR`uPKvX(!!&Y$`Y%fJ-F9P=8kk{u*cEr&&jNpaze_8PvSCntGB9V$A9H;R%e7`Zn*_X@r&#>H1*Ty`P* z_SJvA`p#%~S`K(>YsqS&!RbTA$-QAqO}+r-rx);jxsHy^TFkME{jR>C#LMx~wDUJvK#B+4E%5DZ&ZYN zG-rI0{rvvo&jmqW_1}rZLm}9D&V&taBI=<#=V3ah>R{$X#9!`n@Tc2`t z`!LP&aINWTJ`UPb!I-jkw6;c?n+K+UEQxKoaN--RQo8WD@TQ$Gu~{m`bbHB~!*XXL zOtjGGX~;<6mz&;gd_xbY=vq(BB5k+4EBxt} zKG2OrsBs**92^j8jDd3FF+9&Ql82No?X^uBMnk+&Qd#j6Q~3y#pk9zw#jupFtka2< z%2VA!4qyTyy0e7N3R1I#xBNEeZLwzGtZi6|)7`+ui7S0RuW}9)ZQG_4xfxMZ6o!3_ zUf@HSpU*aFj%oMMZ3Mu=Q37&5BwoU!FvvT5(UYbtTLILmyl>$d3O|{o88#OiAY>ma zP& ziE);^sX=n~V6(9IrQc_2hY(c801!2x0>q5mmqOiUr*b61OwdwB&PquUL79iJTDt-Y zK85F3T)3uTo$4l3qj&-WFPNJzLd|OUh7Vn03s9x!8@GmKTw*U9lLEbM+K=#Jg#)Mv z===_EFN0LX3VZP+`N6xf`Q(-}z<;-M4fgr6tl>W(jdvX|s^6Hm`^DAusDJ$mZ&jDw zo%D}~>W3VTK5#Q4f1qurN5@Z(9-JP1|M=eB1c@=^P{McHx?WzNn>8w=k5rTsg(Z!d; zP|3%`_Gxr4Hl6`@9U7iF^?rDTJ5}pWAG{dM5is~_%&&J$b|G@Le?u2MBgGfkW<`pv5OPb_R9(w0?aS{8`yiDfsOccGQfH}6 z*VR3g zY05!(5$ALkcbFYcz}5QD;$#&G)(mP*0hEIm#WudpDa6UG__=an8Gz{AsBxS1s zn74kgtYZ%vvtxS7Tb5LvA=#2)Hx{!6C8|ZqFF<%>`b(KbwRDiJwM-DKl#9D5dkZ{m z&^P|Z749(4dtv&vi(MY_#LPf6%=nEe?m(3-^ku@=n8QE+{y)Jf``)^ethOU-?-u|3 z`~SkjM8d+do{D$1Nk$j3$PfpKCdr*5CZqwb%bX){9e6utMPmw~+8Z1bLrm_{ont?* zl)+)h+we#&eaOFI6g@RT(zXhE-Qv7ckSsGkNJ~9H3sEQ??A?u}cj|h8$Pw zv>@sW%X`L~rZ-Ge@1?^eE){tb**WIP4P2FsBGx#WP6DWX5-Q&)`A#+4s40uaBNXSK zi|i$KsuY{#Ra#o+2Y}kHX*`XXxj1nGm3U*ziW)?eF`d?aFN<}5@#rqxl!ZZ;BLN0{ zlkNlI3)nXV_rXsh>pTk+ace)rb%Jyp{nWYGF!h?p+hxCuJlbzRSe;TeDK0gE^(6!X z;@)I`rDbppPovUEGD_ioN0uS6u%7_NphTi^46Gyirb*f3$Z-5hY%x(}&XW*HQ;S=` zW|s&NAl8b|)yQ)(hH5Ovyi@L1Q`x_deG)VbHovvjH~7!L|M%>5VDN{iHn42pH>I1u z^xdC+4e65KoK;K6)%}p2#i{x>SemDUE8!PO%)R%7Hdg~di>d!&V{KYAbnkAsOGEZq z{A$LGqPBUX)QV+RVMRo5amwf#ltk$KzJSR3y!Tava|PU0fCBV;Gu)XwTf)JlHCLLy3+jRLH@dB;G9BHE|cAr$~rJd;vHwc zXI$yP==h+c6E3f<)il3eoAIt<{ng_&;vkmBv==KuBzb10EM-vbTUKnW z_^SPqkla9v;G5s#6witqt=1n{58N{d#1L~WEX7bsWOzCu=yz$rS&w8ii$esIZa|K# zE9+1|4%H6nq5>tta*riUCAw3G)ur#2SQ)$z7s-T$%06n7OlSwx-(=^G70@@-rZ**v z76vFZOcy!xFhxv~gg9#U>^tcT-JrbPk^*&mxF%G^2l|am=y4ej0unT9jH^+Ux&~(5 z+dC5-Y+8Nnar%K4MlU1l77KCqG{Yr*eH&s5WZeE&iQNEXvuI}g);dY|FD)ygS+ew&=$vYbdzT8tR2IFwB?8rooft+8 zj^mhm0z;;T2!%Aj+p^!x_H`y>Ra=eDvvJ**FYP?JDccpDbruvHBO4F}#=4mN#G2Oy zc2a$7A*G!azEvAiT>0SH=B!lzw1m#DnB9wMb?xtZH;cA`ZPe5d+NX;>w*+|C2?mYy zM`<4b&M@`((U;1*DC^uG`onwH@HqbYC*x?w5@|_&aF&l@4Vmd(hKc6WMmjYSt3HJ= zRT~+- zR}ddW(q;bT(c%5oz#9Mv{H)B%fAmL7!f6_TJ0+vlwh8G&zz>h@A$zWcAJWy|_NM_woMkKEGz=;%v40$N%np$DRYFM#fad zT9qoU3vomM*iI&Q?wZs%!8tx3iMjZ%%sY!k!Iah{xm84|#)Gqc8HW|G3TWun({mNz z#25rffkZ!KDT;uu+^2Hr$60oV1g1{?x@8=qgN{JJyFzdU#5Ul`+W6B_)_!(Dkg$ZK z-J8IQ3`M2|^ueM5*S4|<1vYfUR-p)p3C(7f#*=*irEV}6+iB5@-XARi0yODIM~{zA zkL+`n-qU7Sbni{9^CZ@}iFJNsM=*D*6%ODIr8#CX{^7~9liN!35mKqN9Rb%Fq4hljGy>vy+o!P$QVwqvNX$cq)kCHv3B;v)=y;Po6wk zz>nJGD)pp7wUsI^Va?VCcJc_%KYhx_$4+jP8TN6i*yM+UIBVXVlV|oZ%fG*MQJe2M zf{2>2s$=yA#9|ZFEdTANvB@4hp0OMS`u4H;auwNGPEvihd_FeQ#~1I`UY)bN=S){QbN47nkRk+1sD?s=}x4)cq*+xnYBx-8xqbt2 z&v~%(s91NX_d~j}(}&L<-R?Dc0SZo%QJH@JK=eIknVoAbb&TY8X8ssA8)#uiS;sv`}%6+Au9{GcAnk)5@ZODl8|5_Y#7FSrtc82QY5UsX5_1z7G-Yj8)fQ!!Wxu5LHs*U+8sfd7gE|8(rr4uOef5jQTqgs)<3lH2gaQK#E|vA=S< zDPrekiq}VVgnkJ#0(6XoKKC-X5vLji1n%QuRX~ z^X1l}!KlWJGI8+rNO@wtif$O%+I^0(C$mu)S!vVME^Vt>tCi>D{WJDgcb<{CYXd&8 zJ7L%9!rGk{qExo^T}kJqN0W9=$fN0$^z45?^uQ*A6?{_Y_9?X3_spYhJNheYyRneb zutkF_v7!Q@s#_FYZ2VcpBfFPfsI%RK;_S8QKxg_2!IjeF>S1w*gGSl0VU&#)yd4~S z7mN7LmRU3l<=YO5nsOEgAuR7nsx!tEf((SQWxxX2rC21*TKc#7?TKB$i zGSN66C2Q}akEvpu*&jjH&Cp`rqgL0RhaO+II|fkkWmZ_`(*X04aVFDGpXV*bLaIAs^t5HEeyfC*h}6A3IEK#1WaYrYiya@@r? z^q02_@l2<6o&`XK;rq@gHG)L+Ft zCMm_{V!c)W1lNvJUR#8jM06*O8kpf6t6{0% z9LU3Bygt&>=cZP7$NtBs&yJyyx7#oNg2{8Wx3F9T2F+0LxgV6_W@C#|+V&$5`#^r_ zRdMQWN>f|4ms>kC9)BL+WxXrUUU8E*?9c?4f3X0UAHRV-WzPoV94oXN;is7VVUOvp z{%A^=EgG;;!#}~Ix8`WOj)g4zpcN)$*23; zTT4Qx`vvM!%Rx|;$EDch;&p)WcOu}5#V_ch`zb_}YJ3Vy?xJV+*i>s@VUq zzlOFS!Vpyy2HZ>6)9k8@GKn&ybWfz^-P)8e5eMht-l2hZv6G>6T$0M*q(O1RNtKIW zW&@`%dD6@bH0TugsF2O$FC!!}tJEmRvm6Vx#poK{BsOaWmh+3+@y*`mrb}^* z#XXyFCo-V;XQ_BSZdqV$ImV<%JZ&{~Q3(GUmCMj&OZjcEn zqRK@)>}@0^oex?Hbzww`np+FA65z`IGsRhzK4n}9{go#UCZn({7Mz|g^~#3S0|4bd z5S5);DG1PzyTqNQEUaNI;tIV|cwynbD_C~OLgW+-SxdZDlyY7mA!5^{bjJIt%5~FK z{$wj0;Vx&Q)|eO0qLxF1lE_uxnXqFIJ<-*4zGaAIzYEZ$;}k?sySVdkKQ=GS9Q6Yl z|3VE+zoKEOt&~*WVy!yef!aHZ$;bpWvT#FXaB1v-t+>&o^s*43Kp=?$KfxGcC$rtI zPDX{GDCbXi#NVyq2^T@pU$k1BN>V1hpylYtrM6asz=P@65B6stmB}2iu9y#dV5KU= z!d(-)A=HJ`@z+uO=ibKJaK1R*_{%obKO1TQ)6F_1dQU4CiQIGl-;aW}ES7yc_x|b)Z_6e_s&qc(Xbfm5Fy1 zXb%t|PNQ^o51i#1m~~8oq^dZ@_Yj^i2H~Pe_3mKU_*)YD=MlytTD-RkFbPewClyc3Ye-)HhQmDN}FPqmh%Ym`dcVFC4o zegoB7HJj;WYecc@B=LtErp_TffJwIm-?2??Q&V{6td&O0MawA(_C#e*P6a7Y? zFpP~gu8(#F>($6YqjhlcdZYbRTC8qQWhga+#vhufW8bctIA+XH6SkLSP#TH9XJR$I zVJ7Lvysfi4|H8Jhk5}lbu`H$^sjLheJ?y6&*f=I2(ADc))WbTLbyUh>{mksx$E>B3F+oUy2B>wPlZ%xKjv)G;j5+TTgxQzKDBi}f*y6eg*dVv z@h<4blg@EH?(Riv=hoEImKuq~O*XMxSy;ib*s??+axJoFTsD_^s1 z?Pn!Ap;chxU3Cv|J!JJ^FtgEh1k1~)qnL&1i}*VQOMlqSjWg8m$qAWUWGg?zxfStl zHLdw3-uPZq-|dG^s<2jr>OHLINB!^t8{s{V;3Biz@hSpP?p)Zu<%~ujV?$r08|X*; zMXnWckGkj^4|NwZHJ!0saC8@qYA z@wqxtv7a>w{JmWc7BoV&%jutPKh@@BJ*peP0X{4Z)$== zAG&V0hgbXcq7*fl$GNqM1LkFP*g~*)b;_+rwD|2`PTU zrF);>oT_-E$`8XDgDog69(e4-Iev0<{P+l)nd_M63eM-{5dVb4=VrlQyAJjE>iF@q Tlc(SSBmn;p%qZk(Nmc*=I0vR@ literal 0 HcmV?d00001 diff --git a/test_fixtures/c4-sample.03.json.gz b/test_fixtures/c4-sample.03.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..225668a9ac3a83c3c816db1281898f0c2527b88d GIT binary patch literal 8205 zcmV+oAoAZIiwFqM{`X@517kETb75_8Y-KJmGcIa#Z*BmcT-$Qo#&&(rSHP-@ORAa~ zawyqHrYptBvLwr~E=Qz~$|=9}06jBp0%#Zw%xHWnRsM(MB~{65KFnW|wf61?FeGg& z@q^l^LtN9Q`y2~E@UPveQwH-b$(yZvi;WOTYkRZ9oNM{`i{&D_!j$q^!KH2afVEg_CxdOR zX}dIX?yIV=U3bXVoNi~^Y=-buWA0mW7PJ7C9${(OvNgEVbigW>xAQY>m^~ za-%%G0O53BQiDVXT&HW26&rkzQv@=Z971rl4G<;uu_;Y%MbO8h$J{E+INeYFfTyEM zy?FQR$yE~R9BSHOg)LuCx*GPbEFswUR#y;gX>%^&RhvBvO^f+G7XTroaawOehxt}E zH+`^C8(b@ka$RNXj;s2)hfJ|RS=NW#mLKII9B-wjs&lqoi4wapFWzOJ-Rwwo`1GHKVAOoy?^_MzbByj z8kTcSor0DGETwZ5Rtx)sC7O`K+B<%0Lt}FX860?m2j4@^_!#r4!L3_oA)7C$hk|)x zZX!>=!%+lLHAtf5XprC@hlpdxZ;R|3zqgoCamW{nH>S+})(aw;y7mXYmsB>I6vxl~ z9Hw+l5gJ7-E-pe0zfonMtlzEm5uDg%g;j9i(p}qOM!s1?I$2>3i);{3`Wgz-mtCOs z8-kJNhPdeI{Dd3|6krwbB?J#oSvKD{@*b5gEX9nJ` zXw>38w!kNBrmx`%Ofb;cB%J6S#?>scm$kwG0N3U899E~tt8FdE&NH~NlCz~$Z?=y?(X06SXe`+(zx zjkR@1pBFq09CnnBQ(pzZgVS9lqZ+7^2K4M&)3!#0|>|_L?p*YB8PZut;e=0 zqisCj)cFP@2x4h)7g$o^iAl#NTmp+PvX{_%+gJfQmG1VifpNnjVOIFBFqltg)G*e7 za7eRI2yPV(We>ILfTACOFz@I*e{W}loks(Bjn#6l90W(tl2}8`G4+A_0_cRvQNchc zY9R9eK{bD0@m0F64jp8TXR|k>eu{I3Jg|W7#_q&M#H_Ey#PRgX6nEho9wAK;NqOBd*XkD!^kXE8jmTkuHV_0ymZ1*5 zu(`PFXs|bgvBOs-TZWe#8A8-;kroPBtn3l>&lJD!nwiN%U7IzFKYMWl$3Q0?FnUjr( zr-M|YHuW5WOv0K!5*(=ZM%H|h{SXgb6iggE>|l(PTKTt*k7snX z>E~ej+`>X?wV&WBiHvY2YG3)*?kUWYx`-vj*IKg&C+;W)&Z{~^D~;brL=@g9fJxp4 z|MC&~a8nm9rw+;2u%}MxMx1IQVL9uW%o?bH@hz(q3;`DaSzr=VViHob-E>!Qnme&# zI#9!DdvNmr`6K}oZgd<6UI?H3dA6{6uzKY1*V`5rJ{0K2fC(0m1 zK!Xr{2yx+d;@!B;acMVp35S&C!iuQJjOE5X3QN9a)2p_}6at~0wVaZVxo(j?_l$}< zwQwPvGpNte3mA&Jv-K_ak6p)?m3dat=wXsYpYz;S2GLmS2rVwZjszhNvk*O1^8e(R z79^irOdOMNT?nyi-DaSRvPzgXKydJ1sNe>Z(~heZhJVLy#+;6*A|cS%h6zwQpfc zEF++#HEg--t<%Va$_-&sGGZFRmg|Aod>Lt2!$ikH;$#A0E?PKbx=p_?3u259V z)*3jZ@Mubcp=7W~Nl|uQK_VQ0nAIOrh>3&NA5D4q#o{F1AmHIFwe7oBg0eBriSdlS zaG{aC*bntC3OI}mG$fZYj&-znI>opQK>QyXhHxrTnhT9Ujst;#ucho$1U-?4&M})( zw(&7$Bug6u$slkf5`^$Y!|5B~IZ_5y0j;v_hUeFJqg#|W@Hjr-DDdW6(<&CzU+AmR zXN*z!pP2|`q?^D3FS7S0IWG~&P1j{$`QXZcWB4!AkXOia^ur^E5eX2rlCxIf=54=P z*_5mUQ|Z8340BAiT`PYrB0tqPa_C*HM)+sD__>F}WCVCC88qa&BAevzA$a%;nvOyo zU9qiJrfu<+)2tbcj?vDMqjA#=NOcrTGd(7WY6Q+8V#U7Z46fTJ?7 z8-eu=pqU*Bc!HrR=8TN9WByhMppwK1Dg2{A=OeU+`glRik$k|`?{A~uHJ~$$A&Z=+ z>{dX$KF-ukh*=&JKca{!xdH9>ZK3H|i7^4=@|4m|8kS}~caryD8RItiipI)WZR~N@ zHk_ZF(4gQ@!@M#ZR%-yN%-Uja^Ulv9j5&lc_v?8abuPk~6I|xQ0iN7W?Jgf*ot>SZ zEzduDbd=h?@{CRQHpads-Lb+vpfSS31HmiMrS}6KFJ>fNS)Py zx6~lLh&xP&#AQg4Aua`zFexkv5Rvrq^zTPRW zut$ofDecZD4Y3nt!@0gXKm>S>jE4+4W}des{qUG|PVUTt8*^l~USuyNKB(YXs6#b= z-wV%|)EZVng@@r}KlOpYy3b;?KxeQw3WHvvOvb2El-RnCHVMhZWMbnaySEkwDl|v~ z5wLw-Mu>l0f$%~% zwXjc>DeMu$G3^O=)<7<;utNwo$#li2mqp2994{r5l%Dd~VjXCDb|MQsHpQDHABBpX zDGlLjrEoO?lOQ%mZA@9@vU`tM0olA#UGzbdn)~G<@XK?P=lGM9Jf3&8) zO(;a)3+u-Br8h-l{c?7y)*#u*Tg-vt;2|t|TZ&2n?#QmE+M`;s$ez(OyKT%=6cmCM z^X2q?osdgTnOlh64*MHwKjTBRA&s=aNw=YZ-=R8}@fZF3KS} z3v+}K`6pr%*P*qVtnBk^$+ZQR$f$NB#m@Gs>dkS>duT*oSl%hEW<}zeRZAuvw zgC09*#vLpf+PN-Oi40iVs^5&pFVG_1nSCCT6OfP*(W4aMs0HZ+#Xn@P%mGM70ccdK zo>J;H$tbs`X%0tzSGeuOCWk8gR3IBhzbP+nVy;SdrWH0GxIi>y)<@^RDOJX;-8s9* z7Mrc_yzWny7u8K`kfXQxBx=S`H;aN3P+$A~nCSVfvBY6&X9ZX16EWh>) z*`>LrAKu`6EL}<=xpb8yPGrwr8z^|y7U`p@l+afs7cPca(9Me3|9m>zf|yEuuT>`s z&BU8*<(R>?pJ1zAgAV@is4TzP>7N+RunHl0^)10J}9Y%jfBA0Q47JTJQK#OW%dj(_|rs@ zgu9ZNE4Ymlld7e8c{sv|0X;7F4X}j@Q_);m(-imRJ4wr}@F9eYPY}a6kbA^2<$=r= zgI`8on(Ea|KVUgNFejd?s8-DMtbDc7j2Ru_kws4IYGR2aMSg3@Dn8ZdJE^HGh>F3; zi;uu5a?4oVx2Eo~%4r^8Fw&ufo^)jDC&ok?1xUy#fvWO>L+B|jJrJ-+lc|frd|(5^ z!09-}X>oo6&>f(&k}4BITPl}i_vm(9WdS?SQ!wW6-E@U9tJ}v25|J1d)iV9%f3hO1 zZ&I{JS*Wk&$QNUGSw%^itnP)IOXrNO!btcM#7R5FL}zbfKG9JppsVS`hbDh4?pXj# z5B)GGyY{gO5gG+Qq32o@Z6Q*)zgAHn7Eh2&vNF4L*R@34z*>c!ET9uHKol zmsY?S9@Ks(_4UXn5t1|g87ZDd09>tW)ZWyKuzyjD0*n}CsP-_STM;&62ig(&h>5~6 zl9(Q_yokU}^R2NrY0BvA&%J8|*9XE(C7)>=Wj{0+5JFh6{UmjwrK*v(>BX%>D6xh( zt?aYv9s3s`DJ$;F_rb(=y~3BJiK#*18G;a99J9eG7q&d=UQu$xMff;`?lDpb!phjQ zc8J|n-d}5JS2aG-4MGf2k97h4eh@exn<#2}gCns72*ldQG!>g=r2;5XVA{8+XT!9j zuO-ls4#4agnEw=h&-VJB%|Knj7jVe?`g;kWG>>3oWHZZ-fl0tUyIh_)8{yRI4B zNSCkEmKa9in()`WlDwhuyoBaBRM&kX)~dkI4N!nWgOu;e#zvw%(PWN{MS?8qp>Ax9 zQY57vAP7wa%l0*8C?5Vs=IImWjN^!Gg5?4@AQ>@ffLBDAlBGyR%?&MNCqeRLGuCwh zizy$?G-TE{D(UCtVTI9SgHehfVKmQ8ZdX*01_XyHeDg_l0HZF@1=iV1b{~O?R@pzV z<%Q^>0&O!zJWpcZ4Ob&*aXX#IvCC%@y2f9d2Qk0RlZkQ5yjksmv^hCkt61lOn zC}6n;{)`y_g>u&UJ>XL?^gGvefSqsswx*@+6YrPCDdW1>=L(VTW6>1L#AzRneSA={ z&~GI)(<*NmvK2C^lKc((X-p@olIlbPT+GdJx&g}~m`WEErLDEXDDn{Vs39#aa^>HC z{jVY62%P%u*Z+=#v(VK z?N`bKq%n$+SVMdtvQ0p@N^GDx|0urlp6Pw0ji;AogVQnUZ@ zo^(D}%yLU0E3GF0i{BL@Svw-NHC-9-*YH{hz9ARVGKN+k3sw=!Qm8BQ;&2Yafi%1KUL{em6 zogG%WCy+}^oHQBySikw|+hp zLUL2C{;7uxy#My~%Ll47!EI^?DJSjX{^*s#4@(A2(cP3Rsur4qsS%gKq4VmLfT&Rv1RQ!>KIki2@%JpK|zK_68Jl2g=Lr6GPY#1 z%cq&)x2mcY%WWo?u&j&sb#mgl#8u5r;p$A>{p1kuU;XvLO!5rg=)5HC_(rp$uiXaQ zpw@C$Cb^hJkd~cVdKmo&hmgIS-EMP4AkGUiSa{Kw@e@g}Qd6Sth}BMdA79pEGMkMp zT^}30Z@p#LREk^<GZFJrYKWNF8N1&t@ zdiv_w)i-axzC35Q60uItQ)*H!_)=yOZD&J>94aDa*zu+mn7I$Dz{z8IpU)r90sPl< zUlv@3WtG?o%4i%%#pI8vvNkAVq*d`ol7?vQ&{&3TGb`%Wa;2uzp8vXwI0v8gK|GHH zp?Lg$w8{sAU}Bl76h1$Z(#jf1M~Eb7%^&Y6`H+u;FOiN&}TU4f=l)BD)UA>Kv;^n*FL@d90xy>#6$N!H`EN;=uvL9vo- z7(UNS)8HW4?dvD&!Fj%WcKQ8xS2$0|ZGtLHcru|VoJ3KKiHyG8LE)v_R*i3H5KZ62 zz6c`1n0kS=YeeE!L@8GJXH!oViy}uaz>)MVDFivgjLUNs8zVl2!78 zlh6h^REZTM1jka5P4#Cr78W~3Qz^g)=6YR08^|`BShF%9M6L9XlTHd?9<;ziV`T1T z&xQ=%o6$wnSlOd4Fe>gm{W994sItzk7wNp@_GvWJWeVbrZuu5~i3&QoI`;I<>!;s+ zfBEvwYdKH*Ve3}Xta3c!I8h2=W$1LXtuPhlEN;zo`|4P=*dX?-d5foBd%ZQLmx4~Z z37pQ;Lgv$D%!g&nhw0~W?)fO~#=-@lAPn5IZ(@BY zW;^zF6<9W1o-zsDQ)yz?S^Sm>B*>G4y_JSC?c%6)CK1=T&RBsjX)~!+}f`Bo*}%I< zftvT9Jp1|qh5xN-m`AnQ4-fd;^Yq0c`_4!iyl;w+mFLl_Y5lVOj{KaLNe@RO;0;;p+aI)HNa$vBe_o zO}5;zCF$g+=5R6@;tsujNT^soJYRl(9LYWV?%7xGo?HPgFWHyjVo7kCo>Av!c|kp- zWG}C5!=JoMogC`_7*=UkZmua!R)|;X_jZ6it;;SI>aqH+ub`xjRBwhx zO3%DNTWEMb)QMW48I#vg6Yx^u?I2em=MT(p0I{YsO-nT0wZ}!?}UO~aT*smOT zxq#)m_cmT01Z$8EAS*GwtVe51!Nxpb$F7J8Asg;{$0iUa7~+L;Xoj|(&v?(AML1d@ zT%;RrPlH5P6TwIsR9m9At}M2aa)hCEyr|CLFs{F#q6{HuXAwjxi=CjT&JLVPJ*gX3lGS9xUEq;Fsyky&i=aZ(0e*odC z|6mKBebquI9YQcfgRNcYe@5ck5-bUXRMWOwL^I=UJ_Z#}kMgACD4SlcX87345HD1S zv#VLiEaA+yR%%?KuZNp*^4-Qr2kgp$ees*2-pi|qbP7hVII=%*7jt~_u-G1tkO$1d z=ixT+bZo%yO26*RwK&ln;eiHB;e_rh%%c{ClT@KQ8T3gqPI|uc-1}=LP3Jpb_Mqht z??9`wtL0zLPfyQJ&yLWF)sgH{kSM#-7B8N}SN;IK`3mOC63Uasi^Z!2)MRG|BJywv z1AYaW03g4BY~|L~lRRgO4@sjA7P7Z3YqD}x=ryx8KQO&|f&Xr@J`CFUumyVb6~wXV zuTR*ZpuacU=3@O%%d=0VR$IpJehet~(32HSe{jrrDJ7w^B10>R5Xq<&9lXPW#B`FbU zLh`kKJ|h9%0i%Q|XP3Vr{i$-<4se1yNX|lX9pS4II4@JGNIY{&@w_ zIw{JyE?otSH_7gLU1``&E%t3pGY#Rgc0lMK9nm(lo%|6JMwyT+TM<(Co!r=N{n~0> zt{h&zacYvH%~NkIJw<7w*0G3g!ZjB6q)T-#`4d3#FGdu$lN(YtX5>f3!lN61Sq(sE*igq7Vr#CG0?l|CAlK_;WIm2$fxQeI!{mZ(>ab;W2JP$vLKNO7n2ZUk(ut?;rW&QX%3Cs=V<;oa6xzEu zXiV(2l6;C>s%5vKF72){X~bIM8Vp+1S(n6^ltK)R+xXNnBmI!wjU^6UU0fyGIU9kJ z$>l+*RN=^L&9zEqHZ;68P*SeJW2`nK2NS1vS2rkUV*}8DpmJYR1_mCtCaM6{EVft7 zSTEIXRHp6*%0lI(oM)-IsZ;l(m121$micKPNftt;X)QyC?iKqBddZ-@wtH5QU(3D! z*yJPk?P5}fDV