-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #19 from prrao87/srsly
Use srsly for JSON serialization
- Loading branch information
Showing
10 changed files
with
234 additions
and
331 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -132,5 +132,6 @@ dmypy.json | |
.DS_Store | ||
|
||
# data | ||
*.jsonl | ||
data/*.json | ||
data/*.jsonl | ||
*/*/meili_data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,46 +1,26 @@ | ||
import json | ||
import zipfile | ||
from typing import Any, Iterator | ||
""" | ||
Run `pip install srsly` to use this script | ||
This script converts the JSON data file from https://www.kaggle.com/datasets/zynicide/wine-reviews | ||
to a gzip-compressed, line-delimited JSON (.jsonl.gz) file for use downstream with the databases in question. | ||
Full credit to the original author, @zynicide, on Kaggle, for the data. | ||
""" | ||
from pathlib import Path | ||
from typing import Any | ||
|
||
import srsly | ||
|
||
JsonBlob = dict[str, Any] | ||
|
||
|
||
def read_data(filename: str) -> list[JsonBlob]: | ||
with open(filename) as f: | ||
data = json.load(f) | ||
for idx, item in enumerate(data, 1): | ||
item["id"] = idx | ||
return data | ||
|
||
|
||
def chunk_iterable(item_list: list[JsonBlob], chunksize: int) -> Iterator[tuple[JsonBlob, ...]]: | ||
""" | ||
Break a large iterable into an iterable of smaller iterables of size `chunksize` | ||
""" | ||
for i in range(0, len(item_list), chunksize): | ||
yield tuple(item_list[i : i + chunksize]) | ||
|
||
|
||
def write_chunked_data(item_list: list[JsonBlob], output_name: str, chunksize: int = 5000) -> None: | ||
""" | ||
Write data to a zip file in chunks so that we don't dump all data into a single huge JSON file | ||
""" | ||
zipfilename = f"{output_name}-jsonl.zip" | ||
with zipfile.ZipFile( | ||
zipfilename, | ||
"w", | ||
compression=zipfile.ZIP_DEFLATED, | ||
compresslevel=5, | ||
) as zipf: | ||
chunked_data = chunk_iterable(item_list, chunksize) | ||
for num, chunk in enumerate(chunked_data, 1): | ||
filename = f"{output_name}-{num}.jsonl" | ||
chunk_json = "\n".join(json.dumps(item) for item in chunk) | ||
# Write the JSONL data into the specified filename *inside* the ZIP file | ||
zipf.writestr(filename, data=chunk_json) | ||
def convert_to_jsonl(filename: str) -> None: | ||
data = srsly.read_json(filename) | ||
# Add an `id` field to the start of each dict item so we have a primary key for indexing | ||
new_data = [{"id": idx, **item} for idx, item in enumerate(data, 1)] | ||
srsly.write_gzip_jsonl(f"{Path(filename).stem}.jsonl.gz", new_data) | ||
|
||
|
||
if __name__ == "__main__": | ||
# Download the JSON data file from https://www.kaggle.com/datasets/zynicide/wine-reviews | ||
data = read_data("winemag-data-130k-v2.json") | ||
write_chunked_data(data, "winemag-data-130k-v2") | ||
# Download the JSON data file from https://www.kaggle.com/datasets/zynicide/wine-reviews | ||
convert_to_jsonl("winemag-data-130k-v2.json") |
Binary file renamed
BIN
+18.4 MB
data/winemag-data-130k-v2-jsonl.zip → data/winemag-data-130k-v2.jsonl.gz
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.