# (Removed: GitHub page-scrape residue — UI banner text and the rendered
#  line-number gutter. The actual source file begins below.)
"""
04/01/2020
Program: Zipfs_Readability_Score
Written by: Gabriel Dutra
The code's main goal is to utilize the slope of a Zipfian distribution from a given text to analyze and quantify
the text's complexity through the metric of Flesch's Readability Scores.
"""
import matplotlib.pyplot as plt
import math
import numpy as np
import textatistic
# Prints the Zipfian distribution of a text
def zipf(text):
    """Plot the Zipfian distribution of a text file.

    Reads the file, counts word frequencies, fits a least-squares line to
    the log10(frequency) vs log10(rank) data, prints the slope and R^2,
    and shows two scatter plots (raw distribution and log-log fit).

    Parameters
    ----------
    text : str
        Path to a UTF-8 encoded plain-text file.
    """
    # Context manager guarantees the file is closed even if reading fails.
    with open(text, "rt", encoding="UTF-8") as file:
        contents = file.read().lower()
    # Strip punctuation that would distort the frequency counts — one
    # C-level pass via str.translate instead of ten chained .replace calls.
    # Bug fix: the original also deleted newlines, which fused the last word
    # of each line with the first word of the next; str.split() already
    # treats newlines as separators, so they are left intact here.
    contents = contents.translate(str.maketrans("", "", ",.!?'\"-—;"))
    words = contents.split()
    # O(n) frequency count (the original nested loop recounted the whole
    # word list once per word, which is O(n^2)).
    dict_w = {}
    for w in words:
        dict_w[w] = dict_w.get(w, 0) + 1
    # Sort from the most frequent word to the least frequent.
    sorted_pairs = sorted(dict_w.items(), reverse=True, key=lambda kv: kv[1])
    word_list = [w for w, _ in sorted_pairs]
    raw_counts = [c for _, c in sorted_pairs]
    # Log10 of frequencies and of ranks (ranks start at 1) as numpy arrays.
    count_list = np.log10(raw_counts)
    rank_list = np.log10(np.arange(1, len(raw_counts) + 1))
    # Closed-form least-squares line of best fit on the log-log data.
    denominator = rank_list.dot(rank_list) - rank_list.mean() * rank_list.sum()
    m = (rank_list.dot(count_list) - count_list.mean() * rank_list.sum()) / denominator
    b = (count_list.mean() * rank_list.dot(rank_list)
         - rank_list.mean() * rank_list.dot(count_list)) / denominator
    best_fit = m * rank_list + b
    # Coefficient of determination: 1 - SS_residual / SS_total.
    residual = count_list - best_fit
    total = count_list - count_list.mean()
    r_squared = 1 - residual.dot(residual) / total.dot(total)
    # Prints the result of the slope and R squared
    print("The R^2 is: ", r_squared)
    print("The slope is: ", m)
    # Raw word-frequency distribution. Bug fix: plot the raw counts — the
    # original plotted already-logged values under a raw-frequency label.
    plt.scatter(word_list, raw_counts)
    plt.ylabel("Word frequency")
    plt.xlabel("Word rank")
    plt.title("Frequency vs Rank")
    plt.show()
    # Log-log plot of the Zipfian distribution with the fitted line.
    plt.scatter(rank_list, count_list, s=1)
    plt.plot(rank_list, best_fit, color="red")
    plt.ylabel("Word frequency (logged)")
    plt.xlabel("Word rank (logged)")
    plt.title("log(frequency) vs log(rank)")
    plt.show()
# Function that prints the readability score of a given text
def reading_ease(txt):
    """Print the Flesch readability score of a text file.

    Parameters
    ----------
    txt : str
        Path to a UTF-8 encoded plain-text file.
    """
    # Context manager closes the file even if scoring raises.
    with open(txt, "rt", encoding="UTF-8") as file:
        text = file.read().lower()
    print()
    print(txt)
    # Uses the library textatistic, which contains dictionaries from PyHyphen.
    print("The readability score is: ", textatistic.flesch_score(text))
# Call functions by using any sample text encoded in UTF-8.
# Guard so importing this module does not trigger file I/O and plotting.
if __name__ == "__main__":
    zipf("sample.txt")
    reading_ease("sample.txt")