Skip to content

Commit

Permalink
fix: process docs through pipeline in batches
Browse files Browse the repository at this point in the history
Fixes a memory issue and prevents null results by processing documents through the pipeline in batches.
  • Loading branch information
paluchasz committed Dec 11, 2024
1 parent 06e355d commit cb515d0
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion kazu/training/evaluate_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pathlib import Path

import hydra
import tqdm
from hydra.utils import instantiate
from omegaconf import DictConfig

Expand All @@ -19,6 +20,7 @@
from kazu.steps.ner.tokenized_word_processor import TokenizedWordProcessor
from kazu.training.config import PredictionConfig
from kazu.training.modelling_utils import (
chunks,
create_wrapper,
doc_yielder,
get_label_list_from_model,
Expand Down Expand Up @@ -69,10 +71,16 @@ def main(cfg: DictConfig) -> None:
documents = move_entities_to_metadata(documents)
print("Predicting with the KAZU pipeline")
start = time.time()
pipeline(documents)
docs_in_batch = 10
for documents_batch in tqdm.tqdm(
chunks(documents, docs_in_batch), total=len(documents) // docs_in_batch
):
pipeline(documents_batch)
print(f"Predicted {len(documents)} documents in {time.time() - start:.2f} seconds.")

Path(cfg.predictions_dir).mkdir(parents=True, exist_ok=True)
save_out_predictions(Path(cfg.predictions_dir), documents)

print("Calculating metrics")
metrics, _ = calculate_metrics(0, documents, label_list)
with open(Path(prediction_config.path) / "test_metrics.json", "w") as file:
Expand Down

0 comments on commit cb515d0

Please sign in to comment.