-
Notifications
You must be signed in to change notification settings - Fork 0
/
Text Analytics.py
359 lines (220 loc) · 8.6 KB
/
Text Analytics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#!/usr/bin/env python
# coding: utf-8
# The project is centered on creating a tool for language translation and text analysis using Python, featuring a range of functions for handling and analyzing text data. Here's an overview of its primary capabilities:
#
# - Text Processing:
# - The project encompasses routines for reading texts from files, eliminating punctuation, segmenting texts into sentences, and generating lists of words from these sentences.
# - A function named read_file retrieves text content from a file, returning it as a string.
# - The remove_punc2 function strips punctuation from the text, yielding a cleaned version.
#
# - Language Translation:
# - The tool includes a feature for translating English words into Latin.
# - The translate function employs a rule-based method to convert English words into Latin, relocating consonants before the first vowel to the end and appending 'ay' as a suffix.
#
# - Text Analysis:
# - There are several functions designed for text analysis and manipulation.
# - The split_sentences function breaks a text string into individual sentences based on punctuation marks like '.', '?', and '!'.
# - The starts_with_vowel function determines whether a string begins with a vowel.
#
# - Social Network Analysis:
# - The tool also provides functionality for analyzing social network dynamics.
# - The list_textfiles function compiles a list of '.txt' files within a specific directory.
# - Social relationships are parsed from a file and mapped in a dictionary (edge_dict).
# - Using edge_dict, the following2 function identifies the users followed by a specific individual.
#
# - Performance Evaluation:
# - Execution times for particular functions are measured using the %timeit magic command.
# - In summary, this project integrates language translation, text processing, and social network analysis tools, offering a comprehensive suite for processing and examining textual and social network data, alongside translating English to Latin.
# In[1]:
# Importing the data
# NOTE(review): hard-coded absolute path — only runs on the author's machine.
# Fix: use a `with` block so the file handle is closed even if read() raises.
with open('/Users/sowmya/Downloads/austen-emma-excerpt.txt') as infile:
    print(infile)          # prints the file object's repr, not its contents
    text = infile.read()   # whole excerpt as one string, used by later cells
print(text.count("e"))     # occurrences of the letter 'e'
print(text.count('an'))    # occurrences of the substring 'an'
# In[3]:
#The program processes a text to tally how many times the letter 'e' appears within it.
#Afterward, it outputs the original text along with the count of 'e' instances.
# Fix: the original char-by-char loop ran `'e' in x` / `x.count('e')` on
# single-character strings — equivalent to one str.count call on the text.
print(text)
nE = text.count('e')
print(nE)
# In[4]:
print(text.count(' an '))  # ' an ' with surrounding spaces: standalone-word matches only
# In[5]:
#This code calculates how many times the letter 'e' appears in the text string, saves that number in the counts variable, and then displays the count.
# Fix: removed the unused `item_to_count` alias and replaced the manual
# character loop with str.count (identical result).
counts = text.count('e')
print(counts)
# In[6]:
def remove_punc2(text):
    """Return *text* with all punctuation characters removed.

    Fixes: builds the result with a single ``str.join`` pass instead of
    quadratic ``+=`` string concatenation, uses a set for O(1) membership
    tests, and drops the redundant duplicated '>' from the original
    punctuation list (behavior unchanged).
    """
    punctuation = set('!@#$%^&*()_-+={}[]:;"\'|<>,.?/~`»¿')
    return ''.join(ch for ch in text if ch not in punctuation)
# In[7]:
# enumerate yields (index, character) pairs.
for pair in enumerate("Python"):
    print(pair)
# In[8]:
# Unpack the pairs and print only the indices (0..5).
for idx, _ch in enumerate("Python"):
    print(idx)
# In[11]:
def end_of_sentence_marker(character):
    """Return True if *character* is a sentence terminator ('.', '?' or '!')."""
    # Tuple membership uses exact equality, matching the original if/elif chain.
    return character in ('.', '?', '!')
# In[12]:
# Sanity checks on the sentence-terminator predicate.
print(end_of_sentence_marker('?') == True)
# In[13]:
print(end_of_sentence_marker("a"))  # False: 'a' is not a terminator
# In[14]:
# Repeat of the enumerate demo: (index, character) pairs.
for pair in enumerate('Python'):
    print(pair)
# In[15]:
# Indices only.
for idx, _ch in enumerate("Python"):
    print(idx)
# In[16]:
# The split_sentences function takes a text string as input and splits it into a list of sentences.
def split_sentences(text):
    """Split *text* into sentences, each keeping its terminating mark.

    Text after the final terminator (or text with no terminator) is dropped,
    matching the original implementation.
    """
    sentences = []
    begin = 0
    for position, ch in enumerate(text):
        # Inlined terminator test — identical to end_of_sentence_marker.
        if ch in ('.', '?', '!'):
            sentences.append(text[begin:position + 1])
            begin = position + 1
    return sentences
# In[17]:
splitedSentences = split_sentences("This is a sentence. Should we seperate it from this one?")
# In[18]:
#The code takes each sentence from the splitedSentences list, cleans it by trimming spaces, removing punctuation, and changing to lowercase.
#Then it splits the sentence into words and prints the list of words for each sentence.
# Fixes: the enumerate index was never used, and the `wordList = []`
# initialization was dead (immediately overwritten by split()).
for sent in splitedSentences:
    cleaned = remove_punc2(sent.strip()).lower()
    print(cleaned.split(' '))
# In[23]:
# the code reads a file containing follower and followee names, extracts the pairs of names,
# and stores them in a list named edges. It then prints the first 10 pairs of follower and followee names.
# Fix: the original iterated over open(...) without ever closing the handle;
# `with` closes it deterministically.
edges = []  # In twitterName.txt we have list of names in the format as 'follower','followee'
with open('/Users/sowmya/Downloads/twitterName.txt') as fh:
    for line in fh:
        follower, followee = line.strip().split(';')
        edges.append((follower, followee))
print(edges[:10])
# In[24]:
def following(user, edges):
    """Return every followee of *user* among the (follower, followee) pairs.

    Linear scan over *edges*; order of the result follows the input order.
    """
    return [followee for follower, followee in edges if follower == user]
# List every account "@Fox" follows, via the O(n) list scan.
print(following("@Fox", edges)) # The User Fox(follower) is following 6 People
# In[25]:
# IPython magic: benchmark the list-scan lookup (compare with following2 below).
get_ipython().run_line_magic('timeit', 'following("@Fox", edges)')
# In[32]:
# Build an adjacency dict mapping follower -> list of followees, so lookups
# become O(1) instead of a linear scan over the edge list.
# Fixes: file handle now closed via `with`; setdefault replaces the
# look-before-you-leap membership test/branch.
edge_dict = {}
with open("/Users/sowmya/Downloads/twitterName.txt") as fh:
    for line in fh:
        name_a, name_b = line.strip().split(';')
        edge_dict.setdefault(name_a, []).append(name_b)
# In[33]:
edge_dict  # bare expression: displayed by the notebook, no effect as a script
# In[34]:
def following2(user, edges):
    """Return the followee list for *user* from the adjacency dict *edges*.

    Raises KeyError for an unknown user — same as a plain dict lookup.
    """
    followees = edges[user]
    return followees
# IPython magic: benchmark the O(1) dict-based lookup against the list scan above.
get_ipython().run_line_magic('timeit', 'following2("@Fox", edge_dict)')
# In[35]:
# Rebuild the edge list with every pair duplicated 1000 times, to stress the
# linear-scan `following` lookup in the next cell.
# Fix: file handle now closed deterministically via `with`.
edges = []
with open("/Users/sowmya/Downloads/twitterName.txt") as fh:
    for line in fh:
        name_a, name_b = line.strip().split(';')
        # repeatedly add edges to the network (1000 times)
        edges.extend((name_a, name_b) for _ in range(1000))
# In[36]:
# Benchmark the list scan over the 1000x-inflated edge list.
get_ipython().run_line_magic('timeit', 'following("@Fox", edges)')
# The code imports a file with names of followers and followees, constructs a edge_dict dictionary to represent their social network connections, and calculates how long it takes to run the following2 function for the user "@Fox" using this dictionary.
# In[37]:
# Rebuild the adjacency dict with each followee inserted 1000 times, mirroring
# the inflated edge list above.
# Fixes: file handle closed via `with`; setdefault + list-repeat replaces the
# per-iteration membership test (same resulting dict).
edge_dict = {}
with open("/Users/sowmya/Downloads/twitterName.txt") as fh:
    for line in fh:
        name_a, name_b = line.strip().split(';')
        edge_dict.setdefault(name_a, []).extend([name_b] * 1000)
get_ipython().run_line_magic('timeit', 'following2("@Fox", edge_dict)')
# The code changes an English word to Latin by shifting the initial consonants to the end and appending 'ay' to it.
# In[38]:
# English to Latin
def translate(word):
    """Convert *word* to (pig) Latin.

    The run of consonants before the first vowel is moved to the end of the
    word and 'ay' is appended, e.g. 'Practice' -> 'acticePray'.

    Fix: a word containing no vowel previously returned the word doubled
    ('xyz' -> 'xyzxyzay', because the consonant accumulator ended up holding
    the whole word while the slice start stayed 0); it now simply gets the
    'ay' suffix.  Vowel-containing words (and '') behave exactly as before.
    """
    vowels = 'aeiouAEIOU'
    for i, char in enumerate(word):
        if char in vowels:
            # word[:i] is exactly the run of leading consonants
            return word[i:] + word[:i] + 'ay'
    # no vowel at all (also covers the empty string)
    return word + 'ay'

translate('Practice')
# In[39]:
#Method 1
def starts_with_vowel(strings):
    """Return True if *strings* begins with a vowel, False otherwise.

    Fix: an empty string previously raised IndexError on strings[0];
    it now returns False.
    """
    vowels = 'aeiouAEIOU'
    if not strings:
        return False
    return strings[0] in vowels

starts_with_vowel('Amazing')
starts_with_vowel('Jack')
# In[40]:
#Method 2
def starts_with_vowel(word):
    """Return True if WORD starts with a vowel, False otherwise."""
    # str.startswith accepts a tuple of candidate prefixes.
    return word.startswith(tuple("aeiouAEIOU"))
# In[41]:
def add_suffix(word, suffix):
    """Return *word* with *suffix* concatenated onto its end."""
    return word + suffix

add_suffix('luck', 'ily')
# In[42]:
word = 'quick'
add_suffix(word, 'ly')
# Translate a word by rotating its leading consonants to the end and adding a
# suffix once the word starts with a vowel.
# In[43]:
def translate(word, suffix):
    """Rotate leading consonants of *word* to its end, then append *suffix*.

    e.g. translate('JkcEEEE', 'Amazing') -> 'EEEEJkcAmazing'.

    Fix: the original recursed once per rotation and never terminated for a
    word with no vowel (RecursionError); this iterative version performs at
    most len(word) rotations and raises ValueError if no vowel is found
    (which also covers the empty string, where the original raised IndexError).
    """
    vowels = 'aeiouAEIOU'
    for _ in range(len(word)):
        if word[0] in vowels:
            return word + suffix
        # rotate: move the leading consonant to the end
        word = word[1:] + word[0]
    raise ValueError('word contains no vowel: ' + repr(word))

translate('JkcEEEE','Amazing')
# In[ ]: