diff --git a/configs/config_all.yaml b/configs/config_all.yaml index d251d24a2..46c3c502e 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -632,19 +632,16 @@ process: - video_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of videos between documents. consider_text: false # whether to consider text hash together with video hash when applying deduplication. - ray_video_deduplicator: # the simple video deduplicator that can run on multi-nodes using md5 hashing exact matching method - redis_host: 'redis_host' # the host of the redis instance - redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port + redis_address: 'redis://localhost:6379' # the address of the redis instance - ray_image_deduplicator: # the simple image deduplicator that can deduplicate samples at document-level using exact matching of images between documents. - redis_host: 'redis_host' # the host of the redis instance - redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port + redis_address: 'redis://localhost:6379' # the address of the redis instance method: phash # hash method for image. One of [phash, dhash, whash, ahash] - ray_document_deduplicator: # the simple document deduplicator that can run on multi-nodes using md5 hashing exact matching method - redis_host: 'redis_host' # the host of the redis instance - redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port + redis_address: 'redis://localhost:6379' # the address of the redis instance lowercase: false # whether to convert text to lower case ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations - ray_redis_minhash_deduplicator: # the document deduplicator that can run on multi-nodes using minhashLSH algorithm - redis_address: 'redis://localhost:6379' # the address of the redis instance + redis_address: 'redis://localhost:6379' # the address of the redis instance tokenization: space # tokenization method for text. One of [space, punctuation, character, sentencepiece] window_size: 5 # window size of shingling num_permutations: 256 # number of permutations in minhash computing diff --git a/data_juicer/ops/deduplicator/ray_basic_deduplicator.py b/data_juicer/ops/deduplicator/ray_basic_deduplicator.py index dad317d17..3fb902386 100644 --- a/data_juicer/ops/deduplicator/ray_basic_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_basic_deduplicator.py @@ -19,23 +19,20 @@ class RayBasicDeduplicator(Filter): EMPTY_HASH_VALUE = 'EMPTY' def __init__(self, - redis_host: str = 'localhost', - redis_port: PositiveInt = 6380, + redis_address: str = 'redis://localhost:6379', *args, **kwargs): """ Initialization. - :param redis_host: the hostname of redis server - :param redis_port: the port of redis server + :param redis_address: the address of redis server :param args: extra args :param kwargs: extra args """ super().__init__(*args, **kwargs) - self.redis_host = redis_host - self.redis_port = redis_port + self.redis_address = redis_address # TODO: add a barrier to ensure that flushdb is performed before # the operator is called - r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0) + r = redis.from_url(url=redis_address) r.flushdb(0) def calculate_hash(self, sample, context=False): @@ -44,7 +41,7 @@ def calculate_hash(self, sample, context=False): def compute_stats_single(self, sample, context=False): # init redis client - r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0) + r = redis.from_url(url=self.redis_address) # compute hash md5_value = self.calculate_hash(sample, context) # check existing diff --git a/data_juicer/ops/deduplicator/ray_document_deduplicator.py b/data_juicer/ops/deduplicator/ray_document_deduplicator.py index ce5cced4e..667f86e38 100644 --- a/data_juicer/ops/deduplicator/ray_document_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_document_deduplicator.py @@ -17,24 +17,21 @@ class RayDocumentDeduplicator(RayBasicDeduplicator): """ def __init__(self, - redis_host: str = 'localhost', - redis_port: PositiveInt = 6380, + redis_address: str = 'redis://localhost:6379', lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs): """ Initialization method. - :param redis_host: the hostname of redis server - :param redis_port: the port of redis server + :param redis_address: the address of redis server :param lowercase: Whether to convert sample text to lower case :param ignore_non_character: Whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations :param args: extra args :param kwargs: extra args. """ - super().__init__(redis_host=redis_host, - redis_port=redis_port, + super().__init__(redis_address=redis_address, *args, **kwargs) self.lowercase = lowercase diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index 7ca0d10f2..7610dc30d 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -36,20 +36,17 @@ class RayImageDeduplicator(RayBasicDeduplicator): """ def __init__(self, - redis_host: str = 'localhost', - redis_port: PositiveInt = 6380, + redis_address: str = 'redis://localhost:6379', method: str = 'phash', *args, **kwargs): """ Initialization. - :param redis_host: the hostname of redis server - :param redis_port: the port of redis server + :param redis_address: the address of redis server :param args: extra args :param kwargs: extra args """ - super().__init__(redis_host=redis_host, - redis_port=redis_port, + super().__init__(redis_address=redis_address, *args, **kwargs) if method not in HASH_METHOD: diff --git a/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py b/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py index 203fcf059..14ce9fc28 100644 --- a/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py @@ -128,9 +128,7 @@ def clean(self): @OPERATORS.register_module(OP_NAME) class RayRedisMinhashDeduplicator(Deduplicator): """ - A basic exact matching deduplicator for RAY. - Although its functionality is deduplication, - it is implemented as Filter sub-class. + A MinhashLSH deduplicator based on RAY and Redis. """ def __init__( diff --git a/data_juicer/ops/deduplicator/ray_video_deduplicator.py b/data_juicer/ops/deduplicator/ray_video_deduplicator.py index 902ca1979..342abf7a1 100644 --- a/data_juicer/ops/deduplicator/ray_video_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_video_deduplicator.py @@ -21,19 +21,16 @@ class RayVideoDeduplicator(RayBasicDeduplicator): """ def __init__(self, - redis_host: str = 'localhost', - redis_port: PositiveInt = 6380, + redis_address: str = 'redis://localhost:6379', *args, **kwargs): """ Initialization. - :param redis_host: the hostname of redis server - :param redis_port: the port of redis server + :param redis_address: the address of redis server :param args: extra args :param kwargs: extra args """ - super().__init__(redis_host=redis_host, - redis_port=redis_port, + super().__init__(redis_address=redis_address, *args, **kwargs) diff --git a/docs/Operators.md b/docs/Operators.md index f1a20c9ef..4282f18df 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -13,7 +13,7 @@ The operators in Data-Juicer are categorized into 5 types. | [ Formatter ]( #formatter ) | 9 | Discovers, loads, and canonicalizes source data | | [ Mapper ]( #mapper ) | 58 | Edits and transforms samples | | [ Filter ]( #filter ) | 44 | Filters out low-quality samples | -| [ Deduplicator ]( #deduplicator ) | 8 | Detects and removes duplicate samples | +| [ Deduplicator ]( #deduplicator ) | 9 | Detects and removes duplicate samples | | [ Selector ]( #selector ) | 4 | Selects top samples based on ranking | diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md index b1194f250..5adc44e41 100644 --- a/docs/Operators_ZH.md +++ b/docs/Operators_ZH.md @@ -13,7 +13,7 @@ Data-Juicer 中的算子分为以下 5 种类型。 | [ Formatter ]( #formatter ) | 9 | 发现、加载、规范化原始数据 | | [ Mapper ]( #mapper ) | 58 | 对数据样本进行编辑和转换 | | [ Filter ]( #filter ) | 44 | 过滤低质量样本 | -| [ Deduplicator ]( #deduplicator ) | 8 | 识别、删除重复样本 | +| [ Deduplicator ]( #deduplicator ) | 9 | 识别、删除重复样本 | | [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 | 下面列出所有具体算子,每种算子都通过多个标签来注明其主要功能。