fix #38 while adding new test cases to catch this error in the future

delftdata · Oct 20, 2021 · f970409
1 parent 1031db4
commit f970409
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 8 deletions.
diff --git a/tests/test_algorithms.py b/tests/test_algorithms.py
@@ -27,18 +27,27 @@ def test_cupid(self):
         cu_matcher = Cupid()
         matches_cu_matcher = cu_matcher.get_matches(d1, d2)
         assert len(matches_cu_matcher) > 0  # Check that it actually produced output
+        cu_matcher = Cupid(parallelism=2)
+        matches_cu_matcher = cu_matcher.get_matches(d1, d2)
+        assert len(matches_cu_matcher) > 0  # Check that it actually produced output
 
     def test_distribution_based(self):
         # Test the Distribution based matcher
         distribution_based_matcher = DistributionBased()
         matches_db_matcher = distribution_based_matcher.get_matches(d1, d2)
         assert len(matches_db_matcher) > 0  # Check that it actually produced output
+        distribution_based_matcher = DistributionBased(process_num=2)
+        matches_db_matcher = distribution_based_matcher.get_matches(d1, d2)
+        assert len(matches_db_matcher) > 0  # Check that it actually produced output
 
     def test_jaccard_levenshtein(self):
         # Test the Jaccard Levenshtein matcher
         jl_matcher = JaccardLevenMatcher()
         matches_jl_matcher = jl_matcher.get_matches(d1, d2)
         assert len(matches_jl_matcher) > 0  # Check that it actually produced output
+        jl_matcher = JaccardLevenMatcher(threshold_leven=0.5, process_num=2)
+        matches_jl_matcher = jl_matcher.get_matches(d1, d2)
+        assert len(matches_jl_matcher) > 0  # Check that it actually produced output
 
     def test_similarity_flooding(self):
         # Test the Similarity flooding matcher

diff --git a/valentine/algorithms/jaccard_levenshtein/jaccard_leven.py b/valentine/algorithms/jaccard_levenshtein/jaccard_leven.py
@@ -46,19 +46,21 @@ def get_matches(self,
                                                               self.__threshold_leven,
                                                               target_id,
                                                               source_id):
-                matches.update(self.__process_jaccard_leven(combination))
+                matches.update(self.process_jaccard_leven(combination))
         else:
             with get_context("spawn").Pool(self.__process_num) as process_pool:
-                matches = dict(process_pool.map(self.__process_jaccard_leven,
-                                                self.__get_column_combinations(source_input,
-                                                                               target_input,
-                                                                               self.__threshold_leven,
-                                                                               target_id,
-                                                                               source_id)))
+                matches = {}
+                list_of_matches = process_pool.map(self.process_jaccard_leven,
+                                                   self.__get_column_combinations(source_input,
+                                                                                  target_input,
+                                                                                  self.__threshold_leven,
+                                                                                  target_id,
+                                                                                  source_id))
+                [matches.update(match) for match in list_of_matches]
         matches = {k: v for k, v in matches.items() if v > 0.0}  # Remove the pairs with zero similarity
         return matches
 
-    def __process_jaccard_leven(self, tup: tuple):
+    def process_jaccard_leven(self, tup: tuple):
 
         source_data, target_data, threshold, target_id, target_table_name, target_table_unique_identifier, \
             target_column_name, target_column_unique_identifier, source_table_name, source_table_unique_identifier, \