diff --git a/onmt/transforms/filtering.py b/onmt/transforms/filtering.py index fa7f8321..46807af1 100644 --- a/onmt/transforms/filtering.py +++ b/onmt/transforms/filtering.py @@ -1,5 +1,6 @@ from onmt.transforms import register_transform from .transform import Transform, ObservableStats +import regex as re class FilterTooLongStats(ObservableStats): """Runing statistics for FilterTooLongTransform.""" @@ -48,42 +49,6 @@ def _repr_args(self): """Return str represent key arguments for class.""" return '{}={}, {}={}'.format('src_seq_length', self.src_seq_length, 'tgt_seq_length', self.tgt_seq_length) - -@register_transform(name='filtertoolong_opusfilter') -class FilterTooLongTransformOP(Transform): - """Filter out sentence that are too long.""" - - def __init__(self, opts): - super().__init__(opts) - import opusfilter as op - - @classmethod - def add_options(cls, parser): - """Avalilable options relate to this Transform.""" - group = parser.add_argument_group("Transform/Filter") - group.add("--src_seq_length_op", "-src_seq_length_op", type=int, default=200, help="Maximum source sequence length.") - group.add("--tgt_seq_length_op", "-tgt_seq_length_op", type=int, default=200, help="Maximum target sequence length.") - - def _parse_opts(self): - self.src_seq_length = self.opts.src_seq_length - self.tgt_seq_length = self.opts.tgt_seq_length - - def apply(self, example, is_train=False, stats=None, **kwargs): - """Return None if too long else return as is.""" - lf_src = op.filters.LengthFilter(max_length=self.src_seq_length) - length_src = lf_src.get_length(example['src'], 0) - lf_tgt = op.filters.LengthFilter(max_length=self.src_seq_length) - length_tgt = lf_tgt.get_length(example['tgt'], 0) - if lf_src.accept([length_src]) and lf_tgt.accept([length_tgt]): - return example - else: - return None - - def _repr_args(self): - """Return str represent key arguments for class.""" - return '{}={}, {}={}'.format('src_seq_length_op', self.src_seq_length, 'tgt_seq_length_op', self.tgt_seq_length) - - # Filters inspired by OpusFilter https://github.com/Helsinki-NLP/OpusFilter/blob/aca40bd064d9b087c5216de0568d7fb91a31d142/opusfilter/filters.py @register_transform(name='filterwordratio') @@ -118,35 +83,4 @@ def apply(self, example, **kwargs): def _repr_args(self): """Return str represent key arguments for class.""" - return '{}={}'.format('word_ratio_threshold', self.word_ratio_threshold) - -@register_transform(name='filterwordratio_opusfilter') -class FilterWordRatioOP(Transform): - """Filter out sentence based on word length ratio""" - - def __init__(self, opts): - super().__init__(opts) - - @classmethod - def add_options(cls, parser): - """Avalilable options relate to this Transform.""" - group = parser.add_argument_group("Transform/Filter") - group.add("--word_ratio_threshold_op", "-word_ratio_threshold_op", type=int, default=3, help="Threshold for discarding sentences based on word ratio.") - group.add("--word_ratio_unit_op", "-word_ratio_unit_op", type=str, default="word", choices=[('word', 'char', 'character')], help="Unit for discarding sentences based on char/character/word ratio.") - - def _parse_opts(self): - self.word_ratio_threshold = self.opts.word_ratio_threshold - self.word_ratio_unit = self.opts.word_ratio_unit - - def apply(self, example, **kwargs): - ratiofilter = op.filters.LengthRatioFilter(threshold=self.opts.word_ratio_threshold, unit=self.opts.word_ratio_unit) - score = ratiofilter.score([example['src'],example['tgt']]) - accept = ratiofilter.accept(next(score)) - if accept: - return example - else: - return None - - def _repr_args(self): - """Return str represent key arguments for class.""" - return '{}={}'.format('word_ratio_threshold_op', self.word_ratio_threshold) \ No newline at end of file + return '{}={}'.format('word_ratio_threshold', self.word_ratio_threshold) \ No newline at end of file