-
Notifications
You must be signed in to change notification settings - Fork 0
/
final.py
71 lines (55 loc) · 1.79 KB
/
final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
import os
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
def docfile(file1):
    """Return the distinct stemmed words of a text file as one string.

    The file is read as bytes and decoded as UTF-8 (undecodable bytes are
    dropped), tokenized with NLTK's ``word_tokenize``, and every token is
    lower-cased and stripped of all characters outside ``a-z``.  English
    stop words and words whose cleaned length is not between 3 and 9
    characters are discarded; the remaining words are Porter-stemmed.

    Args:
        file1: Path of the file to read.

    Returns:
        The unique stems in sorted order, joined by single spaces.  The
        result is an empty string when no word survives the filters.
    """
    with open(file1, "rb") as f:
        # Lenient decode: bytes that are not valid UTF-8 are skipped rather
        # than raising, so a stray byte cannot abort the whole run.
        contents = f.read().decode('utf-8', 'ignore')

    # Remove special characters and digits: keep only lower-case a-z.
    cleaned = [
        re.sub(r'[^a-z]', '', token.lower())
        for token in word_tokenize(contents)
    ]

    # Remove stop words.  The list is loaded once and turned into a set so
    # each membership test is O(1) instead of re-reading and scanning the
    # whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))

    # Stemming.  The length filter is applied to the cleaned word *before*
    # stemming; the set drops repeated stems.
    stemmer = PorterStemmer()
    stems = {
        stemmer.stem(word)
        for word in cleaned
        if word not in stop_words and 2 < len(word) < 10
    }

    # Sorted output keeps the result deterministic.
    return ' '.join(sorted(stems))
# Input directories handed to solve() for the training and test splits
# (presumably the 20 Newsgroups "bydate" corpus, judging by the path names).
# NOTE(review): these are absolute paths on one specific machine; they must
# be edited before the script can run anywhere else.
path_train="/home/hoangntbn/Desktop/20192/project2/20news-bydate/20news-bydate-train"
path_test="/home/hoangntbn/Desktop/20192/project2/20news-bydate/20news-bydate-test"
# Short alias for os.path.join, used when building directory and file paths.
FJoin = os.path.join
def solve(path):
    """Build the labelled, pre-processed text for one corpus directory.

    Every entry directly under ``path`` is treated as a category directory.
    A category's label is its index in the *sorted* listing of ``path``, so
    the same category name maps to the same label on every run and in every
    tree that holds the same category names (``os.listdir`` alone returns
    entries in arbitrary order).

    Args:
        path: Directory whose sub-directories each hold the documents of
            one category.

    Returns:
        A string with one line per processed document, formatted as
        ``<label>###<space-separated stems>`` and terminated by a newline.
    """
    lines = []
    for label, category in enumerate(sorted(os.listdir(path))):
        category_dir = FJoin(path, category)
        # Sorted so that the choice of processed file is reproducible.
        for name in sorted(os.listdir(category_dir)):
            words = docfile(FJoin(category_dir, name))
            lines.append(str(label) + "###" + words + "\n")
            # NOTE(review): the original code has a `break` at this point,
            # but its indentation was lost, so its loop level is uncertain.
            # It is kept at the innermost level, which processes only the
            # first file of each category.  If this was a debugging
            # leftover, delete it to process every document.
            break
    # Join once at the end instead of growing a string in the loop, which
    # is quadratic in the total output size.
    return "".join(lines)
# Pre-process both corpus splits into their labelled one-line-per-document
# text form.
contents_train=solve(path_train)
contents_test=solve(path_test)

# Write each split to a file in the current working directory.  The context
# managers guarantee the files are flushed and closed even if a write fails.
with open("train.txt", "w+") as file:
    file.write(contents_train)
with open("test.txt", "w+") as file:
    file.write(contents_test)