diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 0924e93..e2aacd0 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,7 +14,7 @@ jobs: - name: Setup python3.11 uses: actions/setup-python@v4 with: - python-version: 3.11 + python-version: '3.11' - run: python -m pip install poetry==1.6 - run: poetry install - run: poetry run ruff simdjson_schemaful tests @@ -28,7 +28,7 @@ jobs: - name: Setup python3.8 uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.8' - run: python -m pip install poetry==1.6 - run: poetry install - run: poetry run tox -m mypy diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2d5843b..a43833b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -36,3 +36,16 @@ jobs: - run: tox -m ${{ env.TOXENV }} env: FORCE_COLOR: 1 + + test-readme: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup python3.10 + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - run: python -m pip install tox + - run: tox -m readme + env: + FORCE_COLOR: 1 diff --git a/CHANGES.md b/CHANGES.md new file mode 100644 index 0000000..825c32f --- /dev/null +++ b/CHANGES.md @@ -0,0 +1 @@ +# Changelog diff --git a/CHANGES.rst b/CHANGES.rst deleted file mode 100644 index a5693d9..0000000 --- a/CHANGES.rst +++ /dev/null @@ -1,2 +0,0 @@ -Changelog -========= diff --git a/Makefile b/Makefile index 946b2e1..51e5c73 100644 --- a/Makefile +++ b/Makefile @@ -59,6 +59,9 @@ test: test-tox: poetry run tox -r +test-readme-tox: + poetry run tox -m readme + test-docker-linux: docker run --rm -v $(shell pwd):/mnt -w /mnt --name=$(PROJECT_NAME)_test $(PYTHON_IMAGE) tox diff --git a/README.md b/README.md new file mode 100644 index 0000000..970279e --- /dev/null +++ b/README.md @@ -0,0 +1,224 @@ +# pysimdjson-schemaful + +Schema-aware [pysimdjson](https://github.com/TkTech/pysimdjson) loader for +efficient parsing of large excessive 
JSON inputs. + +When working with external APIs you have zero influence on, you may face the +following unfortunate edge-case (as we did): + +* Particular endpoint responds with a relatively massive JSON-body, say, ≥ 1 MB. +* The amount of data you really need is several magnitudes smaller, e.g., 1 KB. +* There is no server-side filtering available. + +In such a case it may be very excessive in terms of memory, cpu time and delay to +deserialize and, subsequently, validate the whole response, even when using +fast JSON-deserialization libraries, such as +[orjson](https://github.com/ijl/orjson). + +In our particular case we needed less than 0.1% of ~5 MB responses, which we +validated with [pydantic](https://github.com/pydantic/pydantic). +First, we compared several combinations of deserializers and validators: + +* `json` + `pydantic v1` (`Model.parse_obj(json.loads(data))`) +* `orjson` + `pydantic v1` (`Model.parse_obj(orjson.loads(data))`) +* `pysimdjson` + `pydantic v1` (`Model.parse_obj(simdjson.loads(data))`) +* `pydantic v2` (`Model.model_validate_json(data)`) + +To our surprise internal `pydantic v2` parser appeared to be ~2-3 times slower +than `json` + `pydantic v1`. The fastest was `orjson` + `pydantic v1` +(~2-3 times faster than `json` and a bit faster than full `simdjson` parsing). +Such a speed-up, however, still comes with excessive memory spending +(as a complete python dict object is created and populated on deserialization). + +Thus, we ended up using `pysimdjson` with its fast lazy parsing and manually +iterated over nested JSON objects/arrays and extracted only required keys. It is +ugly, tedious and hard to maintain of course. However, it showed to be several +times faster than `orjson` and decreased memory consumption. 
+ + +## Table of Contents + +* [The crux](#the-crux) +* [When to use?](#when-to-use) +* [Installation](#installation) +* [Usage](#usage) + * [Basic](#basic) + * [Reusing parser](#reusing-parser) + * [Pydantic v1](#pydantic-v1) + * [Pydantic v2](#pydantic-v2) +* [Benchmarks (TBD)](#benchmarks) + +## The crux +This package aims to automate the manual labour of lazy loading with pysimdjson. + +Simply feed the JSON-schema in and the input data will be traversed +and loaded with pysimdjson accordingly. + +Supports +* `pydantic>=1,<3` +* `python>=3.8,<3.12` +* `simdjson>=2,<6` (with caveats) + +Does not support complex schemas (yet), e.g. +* `anyOf` (`Union[Model1, Model2]`) +* `additionalProperties` (`dict[str, Model]`) +* ... + +In such cases it will fully (not lazily) load the underlying objects. + +## When to use? + +* [ ] Input JSON data is large relative to what is needed in there, i.e., +selectivity is small. +* [ ] Other deserialization methods appear to be slower and/or more memory +consuming. + +If you can check all the boxes, then, this package may prove useful to you. +**Never** use it as a default deserialization method: run some benchmarks for +your particular case first, otherwise, it may and will disappoint you. 
+ +## Installation + +```bash +pip install pysimdjson-schemaful +``` + +If you need pydantic support +```bash +pip install "pysimdjson-schemaful[pydantic]" +``` + +## Usage + +### Basic + + +```python +import json +from simdjson_schemaful import loads + +schema = { + "type": "array", + "items": { + "$ref": "#/definitions/Model" + }, + "definitions": { + "Model": { + "type": "object", + "properties": { + "key": {"type": "integer"}, + } + } + } +} + +data = json.dumps([ + {"key": 0, "other": 1}, + {"missing": 2}, +]) + +parsed = loads(data, schema=schema) + +assert parsed == [ + {"key": 0}, + {}, +] +``` + +### Reusing parser + +With re-used simdjson parser **(recommended when used in a single thread, +otherwise better consult pysimdjson project on thread-safety)**: + + +```python +from simdjson import Parser + +parser = Parser() +parsed = loads(data, schema=schema, parser=parser) + +assert parsed == [ + {"key": 0}, + {}, +] +``` + +### Pydantic v1 + +With model (call `BaseModel.parse_raw_simdjson`): + + +```python +import json +from simdjson_schemaful.pydantic.v1 import BaseModel + +class Model(BaseModel): + key: int + +data = json.dumps({"key": 0, "other": 1}) + +obj = Model.parse_raw_simdjson(data) +``` + +With type (call `parse_raw_simdjson_as`): + + +```python +import json +from typing import List +from simdjson_schemaful.pydantic.v1 import BaseModel, parse_raw_simdjson_as + +class Model(BaseModel): + key: int + +Type = List[Model] + +data = json.dumps([ + {"key": 0, "other": 1}, + {"key": 1, "another": 2}, +]) + +obj1, obj2 = parse_raw_simdjson_as(Type, data) +``` + +### Pydantic v2 + +With model (call `BaseModel.model_validate_simdjson`): + + +```python +import json +from simdjson_schemaful.pydantic.v2 import BaseModel + +class Model(BaseModel): + key: int + +data = json.dumps({"key": 0, "other": 1}) + +obj = Model.model_validate_simdjson(data) +``` + +With type adapter (call `TypeAdapter.validate_simdjson`): + + +```python +import json +from typing import List 
+from simdjson_schemaful.pydantic.v2 import BaseModel, TypeAdapter + +class Model(BaseModel): + key: int + +adapter = TypeAdapter(List[Model]) + +data = json.dumps([ + {"key": 0, "other": 1}, + {"key": 1, "another": 2}, +]) + +obj1, obj2 = adapter.validate_simdjson(data) +``` + +## Benchmarks + +TBD diff --git a/README.rst b/README.rst deleted file mode 100644 index df70d77..0000000 --- a/README.rst +++ /dev/null @@ -1,6 +0,0 @@ -pysimdjson-schemaful -==================== - -Schema-aware pysimdjson loader for efficient parsing of large excessive inputs. - - diff --git a/pyproject.toml b/pyproject.toml index 8a3dd9d..7cc159f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "Topic :: Utilities", "Typing :: Typed", ] -readme = "README.rst" +readme = "README.md" packages = [ {include = "simdjson_schemaful"} ] @@ -43,15 +43,15 @@ pysimdjson = [ # version 4 does not build for 3.11 for some reason {version=">=3,!=4.*,<6", python=">=3.11,<3.12"}, ] +pydantic = { version = ">=1,<3", optional = true } -[tool.poetry.group.pydantic] -optional = true - -[tool.poetry.group.pydantic.dependencies] -pydantic = ">=1,<3" +[tool.poetry.extras] +pydantic = ["pydantic"] [tool.poetry.group.dev.dependencies] +attrs = "^23.1.0" black = "^23.10.0" +markdown-pytest = "^0.3.0" mypy = "^1.0" orjson = "^3.9.9" pre-commit = "^3.3.1" diff --git a/tox.ini b/tox.ini index fba2c3e..dfc8386 100644 --- a/tox.ini +++ b/tox.ini @@ -40,3 +40,19 @@ deps = commands = pydantic1: mypy simdjson_schemaful --exclude simdjson_schemaful/pydantic/v2.py pydantic2: mypy simdjson_schemaful --exclude simdjson_schemaful/pydantic/v1.py + + +[testenv:readme-pydantic{1,2}] +labels = readme +basepython = python3.10 + +deps = + pytest + attrs~=23.1 + markdown-pytest~=0.3.0 + pydantic1: pydantic~=1.0 + pydantic2: pydantic~=2.0 + +commands = + pydantic1: pytest README.md -k "not test_pydantic_v2" + pydantic2: pytest README.md -k "not test_pydantic_v1"