diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 0924e93..e2aacd0 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -14,7 +14,7 @@ jobs:
- name: Setup python3.11
uses: actions/setup-python@v4
with:
- python-version: 3.11
+ python-version: '3.11'
- run: python -m pip install poetry==1.6
- run: poetry install
- run: poetry run ruff simdjson_schemaful tests
@@ -28,7 +28,7 @@ jobs:
- name: Setup python3.8
uses: actions/setup-python@v4
with:
- python-version: 3.8
+ python-version: '3.8'
- run: python -m pip install poetry==1.6
- run: poetry install
- run: poetry run tox -m mypy
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2d5843b..a43833b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -36,3 +36,16 @@ jobs:
- run: tox -m ${{ env.TOXENV }}
env:
FORCE_COLOR: 1
+
+ test-readme:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup python3.10
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+ - run: python -m pip install tox
+ - run: tox -m readme
+ env:
+ FORCE_COLOR: 1
diff --git a/CHANGES.md b/CHANGES.md
new file mode 100644
index 0000000..825c32f
--- /dev/null
+++ b/CHANGES.md
@@ -0,0 +1 @@
+# Changelog
diff --git a/CHANGES.rst b/CHANGES.rst
deleted file mode 100644
index a5693d9..0000000
--- a/CHANGES.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Changelog
-=========
diff --git a/Makefile b/Makefile
index 946b2e1..51e5c73 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,9 @@ test:
test-tox:
poetry run tox -r
+test-readme-tox:
+ poetry run tox -m readme
+
test-docker-linux:
docker run --rm -v $(shell pwd):/mnt -w /mnt --name=$(PROJECT_NAME)_test $(PYTHON_IMAGE) tox
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..970279e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,224 @@
+# pysimdjson-schemaful
+
+Schema-aware [pysimdjson](https://github.com/TkTech/pysimdjson) loader for
+efficient parsing of large excessive JSON inputs.
+
+When working with external APIs you have zero influence on, you may face the
+following unfortunate edge-case (as we did):
+
+* Particular endpoint responds with a relatively massive JSON-body, say, ≥ 1 MB.
+* The amount of data you really need is several magnitudes smaller, e.g., 1 KB.
+* There is no server-side filtering available.
+
+In such a case it may be very excessive in terms of memory, CPU time and delay to
+deserialize and, subsequently, validate the whole response, even when using
+fast JSON-deserialization libraries, such as
+[orjson](https://github.com/ijl/orjson).
+
+In our particular case we needed less than 0.1% of ~5 MB responses, which we
+validated with [pydantic](https://github.com/pydantic/pydantic).
+First, we compared several combinations of deserializers and validators:
+
+* `json` + `pydantic v1` (`Model.parse_raw(json.loads(data))`)
+* `orjson` + `pydantic v1` (`Model.parse_raw(orjson.loads(data))`)
+* `pysimdjson` + `pydantic v1` (`Model.parse_raw(simdjson.loads(data))`)
+* `pydantic v2` (`Model.model_validate_json(data)`)
+
+To our surprise internal `pydantic v2` parser appeared to be ~2-3 times slower
+than `json` + `pydantic v1`. The fastest was `orjson` + `pydantic v1`
+(~2-3 times faster than `json` and a bit faster than full `simdjson` parsing).
+Such a speed-up, however, still comes with excessive memory spending
+(as a complete python dict object is created and populated on deserialization).
+
+Thus, we ended up using `pysimdjson` with its fast lazy parsing and manually
+iterated over nested JSON objects/arrays and extracted only required keys. It is
+ugly, tedious and hard to maintain of course. However, it showed to be several
+times faster than `orjson` and decreased memory consumption.
+
+
+## Table of Contents
+
+* [The crux](#the-crux)
+* [When to use?](#when-to-use)
+* [Installation](#installation)
+* [Usage](#usage)
+  * [Basic](#basic)
+  * [Reusing parser](#reusing-parser)
+  * [Pydantic v1](#pydantic-v1)
+  * [Pydantic v2](#pydantic-v2)
+* [Benchmarks (TBD)](#benchmarks)
+
+## The crux
+This package aims to automate the manual labour of lazy loading with pysimdjson.
+
+Simply feed the JSON-schema in and the input data will be traversed
+and loaded with pysimdjson accordingly.
+
+Supports
+* `pydantic>=1,<3`
+* `python>=3.8,<3.12`
+* `simdjson>=2,<6` (with caveats)
+
+Does not support complex schemas (yet), e.g.
+* `anyOf` (`Union[Model1, Model2]`)
+* `additionalProperties` (`dict[str, Model]`)
+* ...
+
+In such cases it will fully (not lazily) load the underlying objects.
+
+## When to use?
+
+* [ ] Input JSON data is large relatively to what is needed in there, i.e.,
+selectivity is small.
+* [ ] Other deserialization methods appear to be slower and/or more memory
+consuming.
+
+If you can check all the boxes, then, this package may prove useful to you.
+**Never** use it as a default deserialization method: run some benchmarks for
+your particular case first, otherwise, it may and will disappoint you.
+
+## Installation
+
+```bash
+pip install pysimdjson-schemaful
+```
+
+If you need pydantic support
+```bash
+pip install "pysimdjson-schemaful[pydantic]"
+```
+
+## Usage
+
+### Basic
+
+
+```python
+import json
+from simdjson_schemaful import loads
+
+schema = {
+ "type": "array",
+ "items": {
+ "$ref": "#/definitions/Model"
+ },
+ "definitions": {
+ "Model": {
+ "type": "object",
+ "properties": {
+ "key": {"type": "integer"},
+ }
+ }
+ }
+}
+
+data = json.dumps([
+ {"key": 0, "other": 1},
+ {"missing": 2},
+])
+
+parsed = loads(data, schema=schema)
+
+assert parsed == [
+ {"key": 0},
+ {},
+]
+```
+
+### Reusing parser
+
+With re-used simdjson parser **(recommended when used in a single thread,
+otherwise better consult pysimdjson project on thread-safety)**:
+
+
+```python
+from simdjson import Parser
+
+parser = Parser()
+parsed = loads(data, schema=schema, parser=parser)
+
+assert parsed == [
+ {"key": 0},
+ {},
+]
+```
+
+### Pydantic v1
+
+With model (call `BaseModel.parse_raw_simdjson`):
+
+
+```python
+import json
+from simdjson_schemaful.pydantic.v1 import BaseModel
+
+class Model(BaseModel):
+ key: int
+
+data = json.dumps({"key": 0, "other": 1})
+
+obj = Model.parse_raw_simdjson(data)
+```
+
+With type (call `parse_raw_simdjson_as`):
+
+
+```python
+import json
+from typing import List
+from simdjson_schemaful.pydantic.v1 import BaseModel, parse_raw_simdjson_as
+
+class Model(BaseModel):
+ key: int
+
+Type = List[Model]
+
+data = json.dumps([
+ {"key": 0, "other": 1},
+ {"key": 1, "another": 2},
+])
+
+obj1, obj2 = parse_raw_simdjson_as(Type, data)
+```
+
+### Pydantic v2
+
+With model (call `BaseModel.model_validate_simdjson`):
+
+
+```python
+import json
+from simdjson_schemaful.pydantic.v2 import BaseModel
+
+class Model(BaseModel):
+ key: int
+
+data = json.dumps({"key": 0, "other": 1})
+
+obj = Model.model_validate_simdjson(data)
+```
+
+With type adapter (call `TypeAdapter.validate_simdjson`)
+
+
+```python
+import json
+from typing import List
+from simdjson_schemaful.pydantic.v2 import BaseModel, TypeAdapter
+
+class Model(BaseModel):
+ key: int
+
+adapter = TypeAdapter(List[Model])
+
+data = json.dumps([
+ {"key": 0, "other": 1},
+ {"key": 1, "another": 2},
+])
+
+obj1, obj2 = adapter.validate_simdjson(data)
+```
+
+## Benchmarks
+
+TBD
diff --git a/README.rst b/README.rst
deleted file mode 100644
index df70d77..0000000
--- a/README.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-pysimdjson-schemaful
-====================
-
-Schema-aware pysimdjson loader for efficient parsing of large excessive inputs.
-
-
diff --git a/pyproject.toml b/pyproject.toml
index 8a3dd9d..7cc159f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
"Topic :: Utilities",
"Typing :: Typed",
]
-readme = "README.rst"
+readme = "README.md"
packages = [
{include = "simdjson_schemaful"}
]
@@ -43,15 +43,15 @@ pysimdjson = [
# version 4 does not build for 3.11 for some reason
{version=">=3,!=4.*,<6", python=">=3.11,<3.12"},
]
+pydantic = { version = ">=1,<3", optional = true }
-[tool.poetry.group.pydantic]
-optional = true
-
-[tool.poetry.group.pydantic.dependencies]
-pydantic = ">=1,<3"
+[tool.poetry.extras]
+pydantic = ["pydantic"]
[tool.poetry.group.dev.dependencies]
+attrs = "^23.1.0"
black = "^23.10.0"
+markdown-pytest = "^0.3.0"
mypy = "^1.0"
orjson = "^3.9.9"
pre-commit = "^3.3.1"
diff --git a/tox.ini b/tox.ini
index fba2c3e..dfc8386 100644
--- a/tox.ini
+++ b/tox.ini
@@ -40,3 +40,19 @@ deps =
commands =
pydantic1: mypy simdjson_schemaful --exclude simdjson_schemaful/pydantic/v2.py
pydantic2: mypy simdjson_schemaful --exclude simdjson_schemaful/pydantic/v1.py
+
+
+[testenv:readme-pydantic{1,2}]
+labels = readme
+basepython = python3.10
+
+deps =
+ pytest
+ attrs~=23.1
+ markdown-pytest~=0.3.0
+ pydantic1: pydantic~=1.0
+ pydantic2: pydantic~=2.0
+
+commands =
+ pydantic1: pytest README.md -k "not test_pydantic_v2"
+ pydantic2: pytest README.md -k "not test_pydantic_v1"