-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_preprocessor.py
156 lines (115 loc) · 4.96 KB
/
text_preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
# Load each artist's lyrics into a list holding one whitespace-stripped line
# per entry (blank lines are kept as empty strings).
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default, which differs between operating systems.
with open('rhcp.txt', 'r', encoding='utf-8') as file:
    rhcp = [line.strip() for line in file]

with open('madonna.txt', 'r', encoding='utf-8') as file:
    madonna = [line.strip() for line in file]
class TextProcessor:
    """Tokenize, filter, count and visualise a collection of texts.

    The methods form a small pipeline that callers drive step by step:
    tokenize -> filter -> frequency distribution -> word cloud -> plot.
    """

    def __init__(self, texts, language='english', font_path=None):
        """
        Initializes the TextProcessor with given texts and language.

        Args:
            texts (list): List of text data (e.g., lyrics, articles). A single
                string is treated as a one-element list.
            language (str): Language for stopword filtering (default is English).
            font_path (str): Path to the font for the word cloud (optional).
        """
        # Iterating a bare string would hand the tokenizers one character at
        # a time, so wrap it to keep the "list of texts" contract.
        if isinstance(texts, str):
            texts = [texts]
        self.texts = texts
        self.language = language
        self.font_path = font_path
        # Stored as a set so the per-word membership test in filter_words is
        # a hash lookup.
        # NOTE(review): stopwords.words() presumably needs the NLTK
        # "stopwords" corpus to be installed locally - confirm for deployment.
        self.stop_words = set(stopwords.words(self.language))

    def tokenize_sentences(self):
        """
        Tokenizes each text into sentences.

        Returns:
            list: List of lists; one list of sentences per input text.
        """
        return [sent_tokenize(text) for text in self.texts]

    def tokenize_words(self):
        """
        Tokenizes each text into words.

        Returns:
            list: List of lists; one list of word tokens per input text.
        """
        return [word_tokenize(text) for text in self.texts]

    def filter_words(self, words):
        """
        Filters out stopwords and non-alphabetic tokens from tokenized words.

        The stopword comparison is case-insensitive, but the surviving words
        keep their original casing.

        Args:
            words (list): List of lists of tokenized words.

        Returns:
            list: List of lists of filtered words, one per input list.
        """
        stop_words = self.stop_words
        return [
            [
                word
                for word in word_list
                if word.lower() not in stop_words and word.isalpha()
            ]
            for word_list in words
        ]

    def get_frequency_distribution(self, filtered_words):
        """
        Calculates the frequency distribution of words across all texts.

        Args:
            filtered_words (list): List of lists of filtered words.

        Returns:
            FreqDist: Frequency distribution over the flattened word lists.
        """
        flat_filtered_words = [
            word for word_list in filtered_words for word in word_list
        ]
        return FreqDist(flat_filtered_words)

    @staticmethod
    def _random_hue_color(word, font_size, position, orientation, random_state=None, **kwargs):
        """
        Returns a fully saturated HSL color string with a random hue.

        The signature matches what WordCloud expects of a ``color_func``.
        The hue is drawn from ``random_state`` when one is supplied, so a
        seeded word cloud gets reproducible colours; otherwise it falls back
        to NumPy's global random generator.

        Args:
            word, font_size, position, orientation: Supplied by WordCloud;
                unused here.
            random_state: Object with a ``randint(a, b)`` method using
                inclusive bounds (e.g. ``random.Random``), or None.

        Returns:
            str: Color in the form ``"hsl(<hue>, 100%, 50%)"`` with hue in 0-359.
        """
        if random_state is not None:
            # Inclusive upper bound, hence 359 rather than 360.
            hue = random_state.randint(0, 359)
        else:
            # np.random.randint excludes the upper bound.
            hue = np.random.randint(0, 360)
        return "hsl({}, 100%, 50%)".format(hue)

    def generate_word_cloud(self, freq_dist, width=800, height=600):
        """
        Generates a word cloud from the frequency distribution.

        Args:
            freq_dist (FreqDist): Frequency distribution of words.
            width (int): Width of the word cloud image.
            height (int): Height of the word cloud image.

        Returns:
            WordCloud: WordCloud object.
        """
        word_freq = dict(freq_dist)
        # Create the word cloud
        wordcloud = WordCloud(
            font_path=self.font_path,
            width=width,
            height=height,
            prefer_horizontal=0.5,
            background_color="black",  # Set background color
            color_func=self._random_hue_color,  # Random hue per word
            random_state=42,  # Fixed seed
        ).generate_from_frequencies(word_freq)
        return wordcloud

    def plot_word_cloud(self, wordcloud, file_path="wordcloud.png"):
        """
        Plots the generated word cloud and saves it as a PNG image.

        Args:
            wordcloud (WordCloud): WordCloud object to be plotted.
            file_path (str): Destination of the saved PNG
                (default "wordcloud.png").
        """
        plt.figure(figsize=(10, 14))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')  # Hide the axes
        # Save before show(): the image is written from the figure while it
        # is still the current one.
        plt.savefig(file_path, format='png')
        plt.show()
# Both word clouds are rendered with the same Kanit font file.
KANIT_FONT = '/Users/saramaras/Documents/github/text_classification/Kanit/Kanit-Regular.ttf'

# --- Madonna: tokenize, filter, count, then render and save the cloud ---
madonna_processor = TextProcessor(
    texts=madonna,
    language='english',
    font_path=KANIT_FONT,
)
madonna_sentences = madonna_processor.tokenize_sentences()
madonna_words = madonna_processor.tokenize_words()
filtered_madonna_words = madonna_processor.filter_words(words=madonna_words)
freq_dist_madonna = madonna_processor.get_frequency_distribution(
    filtered_words=filtered_madonna_words
)
wordcloud_madonna = madonna_processor.generate_word_cloud(
    freq_dist=freq_dist_madonna,
    width=600,
    height=600,
)
madonna_processor.plot_word_cloud(
    wordcloud=wordcloud_madonna,
    file_path="madonna_wordcloud.png",
)

# --- Red Hot Chili Peppers: the same pipeline on the second corpus ---
rhcp_processor = TextProcessor(
    texts=rhcp,
    language='english',
    font_path=KANIT_FONT,
)
rhcp_sentences = rhcp_processor.tokenize_sentences()
rhcp_words = rhcp_processor.tokenize_words()
filtered_rhcp_words = rhcp_processor.filter_words(words=rhcp_words)
freq_dist_rhcp = rhcp_processor.get_frequency_distribution(
    filtered_words=filtered_rhcp_words
)
wordcloud_rhcp = rhcp_processor.generate_word_cloud(
    freq_dist=freq_dist_rhcp,
    width=600,
    height=600,
)
rhcp_processor.plot_word_cloud(
    wordcloud=wordcloud_rhcp,
    file_path="rhcp_wordcloud.png",
)