Skip to content

Commit

Permalink
Allow translation to read from gzip
Browse files (browse the repository at this point in the history)
  • Loading branch information
Waino committed Dec 9, 2024
1 parent 18458d5 commit a98e9f8
Showing 1 changed file with 10 additions and 5 deletions.
15 changes: 10 additions & 5 deletions mammoth/utils/misc.py
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-

import torch
import random
import gzip
import inspect
import numpy as np
from itertools import islice, repeat
from io import StringIO
import os
import random
import torch
from io import StringIO
from itertools import islice, repeat


def check_path(path, exist_ok=False, log=print):
Expand All @@ -33,7 +34,11 @@ def split_corpus(path, shard_size, default=None):
def _split_corpus(path, shard_size):
"""Yield io's with `shard_size` lines each."""
# FIXME: this is a horrible, ugly kludge
with open(path, "rt") as f:
if path.endswith('.gz'):
open_func = gzip.open
else:
open_func = open
with open_func(path, "rt") as f:
if shard_size <= 0:
yield f
else:
Expand Down

0 comments on commit a98e9f8

Please sign in to comment.