Skip to content

Commit

Permalink
removed implementations with opusfilter
Browse files Browse the repository at this point in the history
  • Loading branch information
onadegibert committed Sep 26, 2023
1 parent 429f50a commit e1b7c64
Showing 1 changed file with 2 additions and 68 deletions.
70 changes: 2 additions & 68 deletions onmt/transforms/filtering.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from onmt.transforms import register_transform
from .transform import Transform, ObservableStats
import regex as re

class FilterTooLongStats(ObservableStats):
"""Runing statistics for FilterTooLongTransform."""
Expand Down Expand Up @@ -48,42 +49,6 @@ def _repr_args(self):
"""Return str represent key arguments for class."""
return '{}={}, {}={}'.format('src_seq_length', self.src_seq_length, 'tgt_seq_length', self.tgt_seq_length)


@register_transform(name='filtertoolong_opusfilter')
class FilterTooLongTransformOP(Transform):
    """Filter out examples that are too long, using OpusFilter's LengthFilter.

    The maximum lengths come from the ``src_seq_length_op`` and
    ``tgt_seq_length_op`` options registered in :meth:`add_options`.
    """

    def __init__(self, opts):
        super().__init__(opts)
        # Imported only to fail fast when the optional dependency is missing;
        # a function-local import is not visible from other methods.
        import opusfilter  # noqa: F401

    @classmethod
    def add_options(cls, parser):
        """Available options related to this Transform."""
        group = parser.add_argument_group("Transform/Filter")
        group.add("--src_seq_length_op", "-src_seq_length_op", type=int, default=200, help="Maximum source sequence length.")
        group.add("--tgt_seq_length_op", "-tgt_seq_length_op", type=int, default=200, help="Maximum target sequence length.")

    def _parse_opts(self):
        """Copy the length limits registered by this transform onto the instance."""
        # Read the ``*_op`` options declared in add_options, not the
        # un-suffixed names that this class never registers.
        self.src_seq_length = self.opts.src_seq_length_op
        self.tgt_seq_length = self.opts.tgt_seq_length_op

    def apply(self, example, is_train=False, stats=None, **kwargs):
        """Return None if source or target is too long, else return the example as is.

        ``is_train`` and ``stats`` are accepted for interface compatibility
        and are not used.
        """
        # Local import so the name is actually in scope in this method.
        from opusfilter.filters import LengthFilter

        lf_src = LengthFilter(max_length=self.src_seq_length)
        # The target side must be checked against the target limit.
        lf_tgt = LengthFilter(max_length=self.tgt_seq_length)
        # NOTE(review): get_length presumably expects a plain string segment;
        # confirm the type of example['src'] / example['tgt'] at this point.
        length_src = lf_src.get_length(example['src'], 0)
        length_tgt = lf_tgt.get_length(example['tgt'], 0)
        if lf_src.accept([length_src]) and lf_tgt.accept([length_tgt]):
            return example
        return None

    def _repr_args(self):
        """Return str representing the key arguments of this class."""
        return '{}={}, {}={}'.format('src_seq_length_op', self.src_seq_length, 'tgt_seq_length_op', self.tgt_seq_length)


# Filters inspired by OpusFilter https://github.com/Helsinki-NLP/OpusFilter/blob/aca40bd064d9b087c5216de0568d7fb91a31d142/opusfilter/filters.py

@register_transform(name='filterwordratio')
Expand Down Expand Up @@ -118,35 +83,4 @@ def apply(self, example, **kwargs):

def _repr_args(self):
"""Return str represent key arguments for class."""
return '{}={}'.format('word_ratio_threshold', self.word_ratio_threshold)

@register_transform(name='filterwordratio_opusfilter')
class FilterWordRatioOP(Transform):
    """Filter out examples based on source/target length ratio, using OpusFilter.

    The threshold and length unit come from the ``word_ratio_threshold_op``
    and ``word_ratio_unit_op`` options registered in :meth:`add_options`.
    """

    def __init__(self, opts):
        super().__init__(opts)

    @classmethod
    def add_options(cls, parser):
        """Available options related to this Transform."""
        group = parser.add_argument_group("Transform/Filter")
        group.add("--word_ratio_threshold_op", "-word_ratio_threshold_op", type=int, default=3, help="Threshold for discarding sentences based on word ratio.")
        # ``choices`` must be a flat collection of allowed values; a list
        # wrapping one tuple would reject every string passed by the user.
        group.add("--word_ratio_unit_op", "-word_ratio_unit_op", type=str, default="word", choices=['word', 'char', 'character'], help="Unit for discarding sentences based on char/character/word ratio.")

    def _parse_opts(self):
        """Copy the ratio settings registered by this transform onto the instance."""
        # Read the ``*_op`` options declared in add_options.
        self.word_ratio_threshold = self.opts.word_ratio_threshold_op
        self.word_ratio_unit = self.opts.word_ratio_unit_op

    def apply(self, example, **kwargs):
        """Return the example if its length ratio is accepted, else None."""
        # Local import: no module-level ``op`` name exists in this file.
        from opusfilter.filters import LengthRatioFilter

        ratiofilter = LengthRatioFilter(threshold=self.word_ratio_threshold, unit=self.word_ratio_unit)
        # score() iterates over sentence pairs, so pass a single (src, tgt)
        # pair rather than a two-item list that would be read as two pairs.
        scores = ratiofilter.score([(example['src'], example['tgt'])])
        if ratiofilter.accept(next(scores)):
            return example
        return None

    def _repr_args(self):
        """Return str representing the key arguments of this class."""
        return '{}={}'.format('word_ratio_threshold_op', self.word_ratio_threshold)
return '{}={}'.format('word_ratio_threshold', self.word_ratio_threshold)

0 comments on commit e1b7c64

Please sign in to comment.