rank_tokens_tf_idf.py
from nltk.corpus import brown, wordnet, stopwords, names
import nltk
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd

"""
You may need to download the NLTK data used below:
>>> nltk.download('brown')
>>> nltk.download('stopwords')
>>> nltk.download('universal_tagset')
>>> nltk.download('names')
>>> nltk.download('wordnet')
>>> nltk.download('omw-1.4')  # may be required by WordNetLemmatizer on recent NLTK versions
"""
def dummy(doc):
    """Pass the pre-tokenized document through unchanged (used to bypass
    TfidfVectorizer's own tokenizer and preprocessor)."""
    return doc


def contain_unwanted_char(word):
    """:param word: a token as a string
    :return: True if the token contains punctuation or digits, False otherwise
    """
    for char in word:
        if char in string.punctuation:
            return True
        elif char in string.digits:
            return True
    return False
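
# A quick illustrative check of the helper above (not part of the pipeline),
# assuming the standard string.punctuation / string.digits constants:
"""
>>> contain_unwanted_char("isn't")
True
>>> contain_unwanted_char('1961')
True
>>> contain_unwanted_char('walk')
False
"""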
def clean(tokens):
    """:param tokens: list of (already lowercased) tokens
    :return: the tokens with stop words, proper names and tokens containing
             punctuation or digits removed
    """
    # build the lookup sets once; calling stopwords.words() / names.words() for every token is very slow
    stop_words = set(stopwords.words('english'))
    name_list = set(names.words())
    clean_tokens = []
    for token in tokens:
        if token in stop_words:
            continue
        elif contain_unwanted_char(token):
            continue
        elif token in name_list:
            continue
        else:
            clean_tokens.append(token)
    return clean_tokens
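
# A small illustrative run of clean(), assuming NLTK's default English stopword list:
"""
>>> clean(['be', 'run', "isn't", '1961', 'walk'])
['run', 'walk']
"""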
lemmatizer = WordNetLemmatizer()

# PREPARE THE CORPUS
# retrieve the words tagged as verbs from two groups of (imaginative) Brown categories: romance and non-romance
romance_tagged = [word for word, tag in brown.tagged_words(tagset="universal", categories='romance')
                  if tag == 'VERB']
non_romance_tagged = [word for word, tag in brown.tagged_words(tagset="universal",
                                                               categories=['adventure', 'humor', 'mystery', 'science_fiction'])
                      if tag == 'VERB']

# lemmatize the words of the two documents; the Universal tag 'VERB' maps to the WordNet pos 'v'
romance_docs = []
non_romance_docs = []
for word in romance_tagged:
    romance_docs.append(lemmatizer.lemmatize(word.lower(), pos='v'))  # lemmatization won't work unless the token is lowercased
for word in non_romance_tagged:
    non_romance_docs.append(lemmatizer.lemmatize(word.lower(), pos='v'))
"""
As can be seen, only verbs are extracted, tagged as 'VERB' in Brown, so when lemmatizer is employed, I can
specify the pos is 'v' with confidence. This step can be generalized, in case multiple tags are extracted,
romance_tagged = [(word, tag) for word, tag in brown.tagged_words(tagset="universal", categories='romance')
if tag == 'VERB' OR tag == 'NOUN']
....by adding this function:
def get_wordnet_pos(universal_tag):
'''Binding the built-in pos tag of Brown to Wordnet tag.
From: https://coling.epfl.ch/TP/TP-tagging.html
'''
if universal_tag == 'VERB':
return wordnet.VERB
elif universal_tag == 'ADJ':
return wordnet.ADJ
elif universal_tag == 'ADV':
return wordnet.ADV
else:
return wordnet.NOUN
and then specify accordingly in lemmatize(), for example:
for word, tag in non_romance_tagged:
non_romance_docs.append(lemmatizer.lemmatize(word.lower(), pos=get_wordnet_pos(universal_tag)))
The get_wordnet_pos() will get correct pos, which is important for lemmatizer to work. Then lemmatizer can transform
'abandonning' to 'abandon', for example; otherwise, it treats everything as noun and 'abandonning' stays the same.
"""
# Build the corpus with clean tokens inside each document: remove stop words, proper names
# and tokens containing punctuation or digits (the tokens are already lowercase)
corpus = {}
corpus['romance'] = clean(romance_docs)
corpus['non_romance'] = clean(non_romance_docs)
# PREPARE THE TOKENS TO RANK
total_words = []
total_words.extend(corpus['romance'])
total_words.extend(corpus['non_romance'])
unique_tokens = sorted(set(total_words))  # deduplicate and sort the vocabulary
"""
See how much we clean our data:
>>> print('Romance raw:', len(romance_docs))
>>> print('Romance processed:', len(corpus['romance']))
>>> print('Non romance raw', len(non_romance_docs))
>>> print('Non romance processed:', len(corpus['non_romance']))
>>> print('Total tokens in the corpus:', len(total_words))
>>> print('Total unique tokens in the corpus:', len(unique_tokens))
Romance raw: 12784
Romance processed: 8801
Non romance raw 28782
Non romance processed: 20601
Total tokens in the corpus: 29402
Total unique tokens in the corpus: 2608
"""
# Set up the tf-idf vectorizer; the corpus is already tokenized and cleaned, so the built-in
# preprocessing and tokenization are bypassed with the dummy() pass-through
tf_idf = TfidfVectorizer(vocabulary=unique_tokens,
                         analyzer='word',
                         tokenizer=dummy,
                         preprocessor=dummy,
                         token_pattern=None)

# Apply the tf-idf algorithm to build the tf-idf matrix (one row per document)
tfs = tf_idf.fit_transform(corpus.values())
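
# Given the counts above (2 documents, 2608 vocabulary entries), the matrix should have this shape:
"""
>>> print(tfs.shape)
(2, 2608)
"""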
# Use pandas to export the tf-idf matrix to a .csv file for inspection
feature_names = tf_idf.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
corpus_index = list(corpus.keys())
df = pd.DataFrame(tfs.T.todense(), index=feature_names, columns=corpus_index)
os.makedirs('output', exist_ok=True)  # make sure the output directory exists
df.to_csv(r'output/tf_idf_romance_matrix_v4.csv', header=True)
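
# A minimal sketch of how the ranked tokens could be read off the DataFrame built above
# (not part of the export itself): sort each category's tf-idf column and look at the
# highest-scoring verbs.
"""
>>> for category in df.columns:
...     print(category, df[category].sort_values(ascending=False).head(10).index.tolist())
"""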