-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalizza_libro1.py
147 lines (103 loc) · 3.42 KB
/
analizza_libro1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import random
import string
def process_file(filename, skip_header):
    """Makes a histogram that contains the words from a file.

    filename: string, path of the text file to read
    skip_header: boolean, whether to skip the Gutenberg header

    returns: map from each word to the number of times it appears.

    Reading stops at the Project Gutenberg end-of-book marker, if one is
    present, so the text after it is not counted.
    """
    hist = {}

    # The context manager closes the file even if processing raises.
    with open(filename) as fp:
        if skip_header:
            skip_gutenberg_header(fp)

        for line in fp:
            # The marker may or may not be preceded by whitespace, so
            # ignore leading whitespace when looking for it.
            if line.lstrip().startswith('*** END OF THE PROJECT'):
                break
            process_line(line, hist)

    return hist
def skip_gutenberg_header(fp):
    """Advances fp just past the line that ends the Gutenberg header.

    Lines are consumed up to and including the first one that begins with
    the start-of-book marker. If no such line exists, fp is read to the end.

    fp: open file object
    """
    marker = '*** START OF THE PROJECT'
    lines = iter(fp)
    while True:
        current = next(lines, None)
        # Stop at end of input or right after the marker line.
        if current is None or current.startswith(marker):
            return
def process_line(line, hist):
    """Adds the words in the line to the histogram.

    Modifies hist.

    line: string
    hist: histogram (map from word to frequency)

    Em dashes are treated as word separators. Each whitespace-separated
    token is stripped of leading and trailing ASCII punctuation and
    converted to lowercase. Tokens with nothing left after stripping
    (for example '--' or '***') are skipped instead of being counted as
    an empty word.
    """
    # TODO: rewrite using Counter

    # Em dashes join words without surrounding spaces, so replace them
    # with spaces before splitting.
    line = line.replace('—', ' ')
    strippables = string.punctuation + string.whitespace

    for word in line.split():
        # remove punctuation and convert to lowercase
        word = word.strip(strippables).lower()

        # A token made only of punctuation strips to '', which is not a word.
        if not word:
            continue

        # update the histogram
        hist[word] = hist.get(word, 0) + 1
def most_common(hist):
    """Builds the word-frequency pairs ordered from most to least frequent.

    Pairs with equal frequency are ordered by word, descending.

    hist: map from word to frequency

    returns: list of (frequency, word) pairs
    """
    pairs = [(freq, word) for word, freq in hist.items()]
    return sorted(pairs, reverse=True)
def print_most_common(hist, num=10):
    """Prints the most common words in a histogram and their frequencies.

    hist: histogram (map from word to frequency)
    num: number of words to print
    """
    ranked = most_common(hist)
    print('The most common words are:')
    for count, token in ranked[:num]:
        # Same layout as printing word, tab, frequency separated by spaces.
        print(token, count, sep=' \t ')
def subtract(d1, d2):
    """Returns a dictionary with the keys of d1 that are missing from d2.

    Every value in the result is None; keys keep the order they have in d1.

    d1, d2: dictionaries
    """
    # TODO: reimplement using Counter
    return dict.fromkeys(key for key in d1 if key not in d2)
def total_words(hist):
    """Returns the sum of all frequencies stored in a histogram."""
    frequencies = hist.values()
    return sum(frequencies)
def different_words(hist):
    """Returns how many distinct words (keys) a histogram holds."""
    distinct_count = len(hist)
    return distinct_count
def random_word(hist):
    """Chooses a random word from a histogram.

    The probability of each word is proportional to its frequency.

    hist: histogram (map from word to frequency)

    returns: one of the keys of hist

    raises: IndexError if the histogram is empty or all frequencies are zero.
    """
    words = list(hist)
    freqs = list(hist.values())

    # Nothing can be drawn when no word has a positive frequency.
    if sum(freqs) <= 0:
        raise IndexError('Cannot choose from an empty histogram')

    # Weighted sampling avoids building a list with one entry per word
    # occurrence on every call.
    return random.choices(words, weights=freqs, k=1)[0]
def main():
    """Prints word statistics for the book in 'thethames.txt'.

    Reports the total and distinct word counts, the 20 most frequent words,
    the words that are absent from 'words.txt', and 100 random words drawn
    in proportion to their frequency.
    """
    book_hist = process_file('thethames.txt', skip_header=True)
    print('Total number of words:', total_words(book_hist))
    print('Number of different words:', different_words(book_hist))

    ranked = most_common(book_hist)
    print('The most common words are:')
    for count, token in ranked[:20]:
        print(token, count, sep='\t')

    # The word list is a plain file, so there is no header to skip.
    word_list = process_file('words.txt', skip_header=False)
    unlisted = subtract(book_hist, word_list)
    print("The words in the book that aren't in the word list are:")
    for token in unlisted:
        print(token, end=' ')

    print("\n\nHere are some random words from the book")
    for _ in range(100):
        print(random_word(book_hist), end=' ')
# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()