Skip to content

Commit

Permalink
fix #38 while adding new test cases to catch this error in the future
Browse files Browse the repository at this point in the history
  • Loading branch information
kPsarakis committed Oct 20, 2021
1 parent 1031db4 commit f970409
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
9 changes: 9 additions & 0 deletions tests/test_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,27 @@ def test_cupid(self):
cu_matcher = Cupid()
matches_cu_matcher = cu_matcher.get_matches(d1, d2)
assert len(matches_cu_matcher) > 0 # Check that it actually produced output
cu_matcher = Cupid(parallelism=2)
matches_cu_matcher = cu_matcher.get_matches(d1, d2)
assert len(matches_cu_matcher) > 0 # Check that it actually produced output

def test_distribution_based(self):
    """Check that the Distribution based matcher produces matches.

    The matcher is exercised twice: once with its default constructor
    arguments and once with ``process_num=2``.
    """
    # NOTE(review): process_num=2 presumably selects the multi-process
    # code path -- confirm against DistributionBased.
    for options in ({}, {"process_num": 2}):
        # Build each matcher right before it is used so construction and
        # matching happen in the same order as two back-to-back stanzas.
        db_matcher = DistributionBased(**options)
        found = db_matcher.get_matches(d1, d2)
        assert len(found) > 0  # Check that it actually produced output

def test_jaccard_levenshtein(self):
    """Check that the Jaccard Levenshtein matcher produces matches.

    The matcher is exercised twice: once with its default constructor
    arguments and once with ``threshold_leven=0.5`` and ``process_num=2``.
    """
    # NOTE(review): process_num=2 presumably routes get_matches through its
    # process-pool branch rather than the sequential one -- confirm.
    for options in ({}, {"threshold_leven": 0.5, "process_num": 2}):
        # Build each matcher right before it is used so construction and
        # matching happen in the same order as two back-to-back stanzas.
        jl_matcher = JaccardLevenMatcher(**options)
        found = jl_matcher.get_matches(d1, d2)
        assert len(found) > 0  # Check that it actually produced output

def test_similarity_flooding(self):
# Test the Similarity flooding matcher
Expand Down
18 changes: 10 additions & 8 deletions valentine/algorithms/jaccard_levenshtein/jaccard_leven.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,21 @@ def get_matches(self,
self.__threshold_leven,
target_id,
source_id):
matches.update(self.__process_jaccard_leven(combination))
matches.update(self.process_jaccard_leven(combination))
else:
with get_context("spawn").Pool(self.__process_num) as process_pool:
matches = dict(process_pool.map(self.__process_jaccard_leven,
self.__get_column_combinations(source_input,
target_input,
self.__threshold_leven,
target_id,
source_id)))
matches = {}
list_of_matches = process_pool.map(self.process_jaccard_leven,
self.__get_column_combinations(source_input,
target_input,
self.__threshold_leven,
target_id,
source_id))
[matches.update(match) for match in list_of_matches]
matches = {k: v for k, v in matches.items() if v > 0.0} # Remove the pairs with zero similarity
return matches

def __process_jaccard_leven(self, tup: tuple):
def process_jaccard_leven(self, tup: tuple):

source_data, target_data, threshold, target_id, target_table_name, target_table_unique_identifier, \
target_column_name, target_column_unique_identifier, source_table_name, source_table_unique_identifier, \
Expand Down

0 comments on commit f970409

Please sign in to comment.