-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanagrams.py
89 lines (69 loc) · 2.15 KB
/
anagrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sys
import re
def sort_chars(words):
'''
Sorts the chars for easy comparison.
'''
joined = "".join(words)
sorted_chars = sorted(joined)
return "".join(sorted_chars)
def extract_bigrams(words):
'''
Gets all unique bigrams.
(Will still have reversed duplicates)
'''
words = set(words)
bigrams = set()
stopped = set()
for word in words:
stopped.add(word)
without = words - stopped
only = [word] * len(without)
bigrams |= set(zip(only, without))
return bigrams
def find_matches(bigrams, minlen=2):
'''
Uses dict to find all matches, only displays lists of matches with len >=
minlen.
'''
all_matches = {}
picked = {}
for bigram in bigrams:
hashed = sort_chars(bigram)
if hashed not in all_matches:
all_matches[hashed] = set()
picked[hashed] = set()
bigram_set = set(bigram)
if not picked[hashed].isdisjoint(bigram_set):
continue
picked[hashed] |= bigram_set
all_matches[hashed].add(tuple(bigram_set))
return [value for key, value in all_matches.items()
if len(value) >= minlen]
def print_matches(matches):
'''
Pretty-print the matches.
'''
for match in matches:
print ", ".join(sorted([" ".join(sorted(pair)) for pair in match]))
def main(text, wordlen=4, minpairlen=2):
text = re.sub(r'[^a-z0-9]+', ' ', text.lower())
words = sorted([word for word in set(re.split(r'\s+', text))
if len(word) >= wordlen])
bigrams = extract_bigrams(words)
matches = find_matches(bigrams, minlen=minpairlen)
print_matches(matches)
USAGE = '''Reads text from stdin, and prints out anagrams.
Usage: cat text | python anagrams.py [minimum word length] [minimum pair count]
'''
if __name__ == '__main__':
kwargs = {}
if len(sys.argv) > 1:
argv1 = sys.argv[1].lower()
if argv1 == '-h' or argv1 == '--help':
print USAGE,
sys.exit(1)
kwargs['wordlen'] = int(argv1)
if len(sys.argv) > 2:
kwargs['minpairlen'] = int(sys.argv[2])
main(sys.stdin.read(), **kwargs)