# -*- coding: utf-8 -*-
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import reuters
import HTMLParser
import os
import re
from utils import *
def extractSentenceWords(doc, remove_url=True, remove_punc="utf-8", min_length=1):
    # remove_punc doubles as a flag and as the encoding used to decode/encode
    # byte strings; pass a false value to keep unicode punctuation marks
    if remove_punc:
        # ensure doc_u is in unicode
        if not isinstance(doc, unicode):
            encoding = remove_punc
            doc_u = doc.decode(encoding)
        else:
            doc_u = doc
        # remove unicode punctuation marks, keep ascii punctuation marks
        doc_u = doc_u.translate(unicode_punc_tbl)
        if not isinstance(doc, unicode):
            doc = doc_u.encode(encoding)
        else:
            doc = doc_u

    if remove_url:
        re_url = r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
        doc = re.sub( re_url, "", doc )

    # split into sentence fragments on punctuation, dashes and CJK quotes
    sentences = re.split( r"\s*[,;:`\"()?!{}]\s*|--+|\s*-\s+|''|\.\s|\.$|\.\.+|“|”", doc )
    wc = 0
    wordsInSentences = []

    for sentence in sentences:
        if sentence == "":
            continue
        if not re.search( "[A-Za-z0-9]", sentence ):
            continue

        # split each sentence into words, stripping operators and possessives
        words = re.split( r"\s+\+|^\+|\+?[\-*\/&%=<>\[\]~\|\@\$]+\+?|\'\s+|\'s\s+|\'s$|\s+\'|^\'|\'$|\$|\\|\s+", sentence )
        words = filter( lambda w: w, words )

        if len(words) >= min_length:
            wordsInSentences.append(words)
            wc += len(words)

    #print "%d words extracted" %wc
    return wordsInSentences, wc
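# A minimal usage sketch of extractSentenceWords (the input string is made up
# for illustration, and the result assumes utils.unicode_punc_tbl keeps ASCII
# punctuation intact, as the comment above states):
#   sents, wc = extractSentenceWords("Hello there! Visit http://example.com now.")
#   # sents == [['Hello', 'there'], ['Visit', 'now']], wc == 4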
def load_20news(setName):
    # setName: "train", "test" or "all", as accepted by fetch_20newsgroups
    newsgroups_subset = fetch_20newsgroups(subset=setName, remove=('headers', 'footers')) #, 'quotes'

    totalLineNum = 0
    readDocNum = 0
    print "Loading 20 newsgroups %s data..." %setName

    setDocNum = len(newsgroups_subset.data)
    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []
    catNum = len(newsgroups_subset.target_names)
    cats_docsWords = [ [] for i in xrange(catNum) ]
    cats_docNames = [ [] for i in xrange(catNum) ]
    emptyFileNum = 0

    for d, text in enumerate(newsgroups_subset.data):
        # in-place progress counter
        if d % 50 == 49 or d == setDocNum - 1:
            print "\r%d %d\r" %( d + 1, totalLineNum ),

        text = text.encode("utf-8")
        lines = text.split("\n")
        if len(text) == 0 or len(lines) == 0:
            emptyFileNum += 1
            continue

        readDocNum += 1
        totalLineNum += len(lines)

        catID = newsgroups_subset.target[d]
        text = " ".join(lines)
        wordsInSentences, wc = extractSentenceWords(text)
        filename = os.path.basename( newsgroups_subset.filenames[d] )
        orig_docs_words.append(wordsInSentences)
        orig_docs_name.append(filename)
        orig_docs_cat.append(catID)
        cats_docsWords[catID].append(wordsInSentences)
        cats_docNames[catID].append(filename)

    print "Done. %d docs read, %d empty docs skipped, %d lines in total" %( readDocNum,
                emptyFileNum, totalLineNum )
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
                cats_docsWords, cats_docNames, newsgroups_subset.target_names
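# A minimal usage sketch of load_20news (variable names are illustrative):
#   setDocNum, docs_words, docs_name, docs_cat, \
#       cats_docsWords, cats_docNames, cat_names = load_20news("train")
#   # docs_words[i] is one document: a list of sentences, each a list of
#   # word strings; docs_cat[i] indexes into cat_names.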
def load_reuters(setName):
    html = HTMLParser.HTMLParser()
    doc_ids = reuters.fileids()
    cat2all_ids = {}
    cat2train_ids = {}
    cat2test_ids = {}
    cat2all_num = {}
    cand_docNum = 0

    for doc_id in doc_ids:
        # only choose docs belonging to exactly one category
        if len( reuters.categories(doc_id) ) == 1:
            cat = reuters.categories(doc_id)[0]
            cand_docNum += 1
            if doc_id.startswith("train"):
                cat2set_ids = cat2train_ids
            else:
                cat2set_ids = cat2test_ids

            if cat in cat2set_ids:
                cat2set_ids[cat].append(doc_id)
            else:
                cat2set_ids[cat] = [ doc_id ]

            # both train and test doc_ids are put in cat2all_ids
            if cat in cat2all_ids:
                cat2all_ids[cat].append(doc_id)
            else:
                cat2all_ids[cat] = [ doc_id ]

            if cat in cat2all_num:
                cat2all_num[cat] += 1
            else:
                cat2all_num[cat] = 1

    print "Total %d docs, %d single-category docs in %d categories" %( len(doc_ids),
                cand_docNum, len(cat2all_num) )

    # keep only the most frequent categories
    sorted_cats = sorted( cat2all_num.keys(), key=lambda cat: cat2all_num[cat],
                            reverse=True )
    catNum = 10
    cats_docsWords = [ [] for i in xrange(catNum) ]
    cats_docNames = [ [] for i in xrange(catNum) ]
    topN_cats = sorted_cats[:catNum]

    print "Top %d categories (train/test docs):" %catNum
    keptAllDocNum = 0
    keptTrainDocNum = 0
    keptTestDocNum = 0

    for cat in topN_cats:
        print "%s: %d/%d" %( cat, len(cat2train_ids[cat]), len(cat2test_ids[cat]) )
        keptTrainDocNum += len(cat2train_ids[cat])
        keptTestDocNum += len(cat2test_ids[cat])
        keptAllDocNum += len(cat2train_ids[cat]) + len(cat2test_ids[cat])

    print "Total %d docs kept, %d in train, %d in test" %( keptAllDocNum,
                keptTrainDocNum, keptTestDocNum )

    if setName == "train":
        cat2set_ids = cat2train_ids
        setDocNum = keptTrainDocNum
    elif setName == "test":
        cat2set_ids = cat2test_ids
        setDocNum = keptTestDocNum
    elif setName == "all":
        cat2set_ids = cat2all_ids
        setDocNum = keptAllDocNum
    else:
        raise Exception("Unknown set name %s" %setName)

    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []
    readDocNum = 0
    totalLineNum = 0
    emptyFileNum = 0

    for cat_id, cat in enumerate(topN_cats):
        for doc_id in cat2set_ids[cat]:
            # in-place progress counter
            if readDocNum % 50 == 49 or readDocNum == setDocNum - 1:
                print "\r%d %d\r" %( readDocNum + 1, totalLineNum ),

            # unescape HTML entities in the raw SGML text
            text = html.unescape( reuters.raw(doc_id) )
            text = text.encode("utf-8")
            lines = text.split("\n")
            if len(text) == 0 or len(lines) == 0:
                emptyFileNum += 1
                continue

            readDocNum += 1
            totalLineNum += len(lines)

            text = " ".join(lines)
            wordsInSentences, wc = extractSentenceWords(text)
            filename = doc_id
            orig_docs_words.append(wordsInSentences)
            orig_docs_name.append(filename)
            orig_docs_cat.append(cat_id)
            cats_docsWords[cat_id].append(wordsInSentences)
            cats_docNames[cat_id].append(filename)

    print "Done. %d docs read, %d empty docs skipped, %d lines in total" %( readDocNum,
                emptyFileNum, totalLineNum )
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
                cats_docsWords, cats_docNames, topN_cats
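# Example driver: a sketch, assuming the NLTK Reuters corpus has been
# downloaded beforehand via nltk.download("reuters") (the 20 newsgroups
# data is fetched automatically by scikit-learn on first use).
if __name__ == "__main__":
    for loader, corpusName in ( (load_20news, "20news"), (load_reuters, "reuters") ):
        setDocNum = loader("train")[0]
        print "%s train set: %d docs" %( corpusName, setDocNum )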