-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWikipedia.py
44 lines (34 loc) · 948 Bytes
/
Wikipedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os, sys, json
class Wikipedia:
    """Iterator over JSON records stored one per line under ``data/articles``.

    Every file found (recursively) below ``self.data + 'articles'`` is read in
    sorted path order, and each line is decoded with ``json.loads``.  A ``.``
    is written to stdout each time the integer percentage of files completed
    increases, and a newline is written once the last file is exhausted.

    The object is a single-pass iterator: it works with ``for`` loops on both
    Python 2 (``next``) and Python 3 (``__next__``).
    """

    # Not referenced inside this class; kept because external code may read it.
    directory = 'data/'
    # Base directory that contains the ``articles`` folder.
    data = 'data/'

    def __init__(self):
        """Collect the article files and open the first one, if any."""
        self.files = []
        for root, dirs, files in os.walk(self.data + 'articles'):
            for name in files:
                self.files.append(os.path.join(root, name))
        self.files.sort()
        self.fileId = 0
        self.progress = 0
        # With no files there is nothing to open; iteration then ends
        # immediately instead of failing with IndexError.
        self.currentFile = open(self.files[0]) if self.files else None

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def next(self):
        """Return the next decoded JSON record.

        Raises:
            StopIteration: when every file has been fully read.
            ValueError: if a line is not valid JSON (propagated from
                ``json.loads``).
        """
        # Loop rather than advance once, so that empty files are skipped
        # instead of ending the whole iteration early.
        while self.currentFile is not None:
            line = next(self.currentFile, None)
            if line is not None:
                return json.loads(line)
            self._advance()
        raise StopIteration

    # Python 3 iterator protocol.
    __next__ = next

    def _advance(self):
        """Close the exhausted file and open the next one, reporting progress."""
        self.currentFile.close()
        self.currentFile = None
        self.fileId += 1
        if self.fileId >= len(self.files):
            # Terminate the line of progress dots.
            sys.stdout.write('\n')
            sys.stdout.flush()
            return
        self.currentFile = open(self.files[self.fileId])
        # Show progress: one dot per increase of the integer percentage.
        progress = int(100 * float(self.fileId) / len(self.files))
        if progress > self.progress:
            self.progress = progress
            sys.stdout.write('.')
            sys.stdout.flush()