-
Notifications
You must be signed in to change notification settings - Fork 152
/
Copy pathTextMain.py
174 lines (161 loc) · 7.89 KB
/
TextMain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# coding: utf-8
from __future__ import division
__author__ = 'LiNing'
import os
import shutil
import re
import math
import nltk
import jieba
import jieba.analyse
import pymongo
import datetime
import numpy as np
try:
import cPickle as pickle
except ImportError:
import pickle
try:
import simplejson as json
except ImportError:
import json
from TextConfig import *
from TextProcess import *
from SendMail import *
from TextMining import *
def MakeAllWordsDict(*para):
    """Build corpus-wide term statistics and labelled training samples.

    Queries ``posts`` for published documents dated between 2014-01-01 and
    the current time that carry the content, source, keyword, country and
    importance fields.  Every document whose content is not None and whose
    country is a key of ``Country_Number_Map`` is segmented with ``TextSeg``
    and folded into the statistics; every other document is reported on
    stdout and skipped.

    Args:
        *para: exactly ten positional values, in this order:
            posts -- object whose ``find(query)`` yields dict-like
                documents (presumably a pymongo collection; confirm
                against the caller's configuration).
            time_col, content_col, source_col, t_status_col, keyword_col,
            country_col, imp_col -- field names used in the query.
            limit_number -- accepted for signature compatibility; the
                ``.sort(...).limit(...)`` that used it is disabled, so it
                is currently unused.
            lag -- passed through unchanged to ``TextSeg``.

    Returns:
        A tuple ``(all_words_tf_dict, all_words_idf_dict, train_datas)``:
            all_words_tf_dict -- term -> total number of occurrences.
            all_words_idf_dict -- term -> ``log(N / (1 + df))`` where N is
                the number of training samples and df the number of
                samples containing the term.
            train_datas -- list of ``(segments, country_number)`` tuples.

    Raises:
        ValueError: if ``para`` does not contain exactly ten values.
    """
    (posts,
     time_col, content_col, source_col, t_status_col, keyword_col,
     country_col, imp_col, limit_number,
     lag) = para

    all_words_tf_dict = {}
    all_words_idf_dict = {}
    train_datas = []

    start_time = datetime.datetime(2014, 1, 1)
    end_time = datetime.datetime.now()
    query = {
        time_col: {"$gte": start_time, "$lte": end_time},
        content_col: {"$exists": 1},
        source_col: {"$exists": 1},
        t_status_col: 1,  # published documents only
        keyword_col: {"$exists": 1},
        country_col: {"$exists": 1},
        imp_col: {"$exists": 1},
    }

    for post in posts.find(query):
        content = post[content_col]
        # Guard clause: skip documents that cannot be used for training.
        # The country lookup is only evaluated when content is present,
        # exactly as in the original short-circuiting condition.
        if content is None or post[country_col] not in Country_Number_Map:
            print('{"_id":ObjectId("%s")} None or Not in Map' % post["_id"])
            continue

        textseg_list = TextSeg(content, lag)
        # Term frequency: every occurrence counts.
        for term in textseg_list:
            all_words_tf_dict[term] = all_words_tf_dict.get(term, 0) + 1
        # Document frequency: each term counts once per document.
        for term in set(textseg_list):
            all_words_idf_dict[term] = all_words_idf_dict.get(term, 0) + 1
        train_datas.append((textseg_list, Country_Number_Map[post[country_col]]))

    train_datas_count = len(train_datas)
    # Single pre-formatted argument so the line prints identically under
    # the Python 2 print statement and the Python 3 print function.
    print("number of all the train datas: %s" % train_datas_count)

    # Turn the raw document frequencies into smoothed IDF weights in place.
    # The quotient relies on true division, which the module enables via
    # ``from __future__ import division``.
    for term in all_words_idf_dict:
        all_words_idf_dict[term] = math.log(
            train_datas_count / (1 + all_words_idf_dict[term]))

    return all_words_tf_dict, all_words_idf_dict, train_datas
def DataSendMail(*para):
    """Mail a report of the documents listed under ``id_dict["NotPass"]``.

    Args:
        *para: exactly six positional values, in this order:
            smtp_server, from_addr, passwd, to_addr -- forwarded unchanged
                to ``send_mail``.
            sendmail_flag -- when falsy, nothing is sent.
            id_dict -- mapping with a "NotPass" entry that maps document
                ids to the detail printed under each id.

    Returns:
        None.  ``send_mail`` is called only when ``sendmail_flag`` is truthy
        and the report body is non-empty.

    Raises:
        ValueError: if ``para`` does not contain exactly six values.
        KeyError: if ``id_dict`` has no "NotPass" entry.
    """
    smtp_server, from_addr, passwd, to_addr, sendmail_flag, id_dict = para

    rejected = id_dict["NotPass"]
    # One "<id> NotPass / <detail>" paragraph per rejected document.  The
    # pieces are concatenated with an empty separator; calling
    # ``text.join(piece)`` instead would interleave ``text`` between the
    # characters of ``piece``.
    report = ''.join(
        '{"_id":ObjectId("%s")} NotPass\n%s\n' % (doc_id, rejected[doc_id])
        for doc_id in rejected
    )

    if not sendmail_flag or report == '':
        return

    # NOTE(review): the subject is spelled 'Waring...' in the original; it is
    # kept as-is because recipients may filter on it -- confirm before fixing.
    send_mail(smtp_server, from_addr, passwd, to_addr, 'Waring...', report, [])
def DataUpdateAndSave(*para):
    """Write the accepted mining results back to their documents.

    For every id under ``id_dict["Pass"]`` the keyword, country and
    importance fields are set from elements 0, 1 and 2 of the stored result,
    the document is marked with ``"t_status": 1``, and a confirmation line is
    printed.

    Args:
        *para: exactly five positional values, in this order:
            posts -- object exposing ``update(spec, document)`` (presumably
                a pymongo collection; confirm against the caller).
            keyword_col, country_col, imp_col -- names of the fields to set.
            id_dict -- mapping with a "Pass" entry that maps document ids to
                an indexable ``(keywords, country, importance)`` result.

    Returns:
        None.

    Raises:
        ValueError: if ``para`` does not contain exactly five values.
        KeyError: if ``id_dict`` has no "Pass" entry.
    """
    posts, keyword_col, country_col, imp_col, id_dict = para

    accepted = id_dict["Pass"]
    for id_data in accepted:
        result = accepted[id_data]
        # NOTE(review): the status field name is hard-coded here, whereas
        # the query side receives it as ``t_status_col`` -- confirm both
        # always refer to the same field.
        new_values = {
            keyword_col: result[0],
            country_col: result[1],
            imp_col: result[2],
            "t_status": 1,
        }
        posts.update({"_id": id_data}, {"$set": new_values})
        # An HTTP-based alternative (POSTing the same four fields with
        # ``requests``) was sketched here in the original but never enabled.

        # Single pre-formatted argument: identical output under the
        # Python 2 print statement and the Python 3 print function.
        print('{"_id":ObjectId("%s")} Update' % id_data)
def Main():
    """Run the text-mining pipeline, reusing cached corpus statistics if present.

    When both ``all_words_dict_file`` and ``train_datas_file`` exist, the
    term statistics and training samples are loaded from them.  Otherwise
    they are rebuilt with ``MakeAllWordsDict``, written to those two files
    as JSON, and ``Classifier_Dir`` is removed so that nothing derived from
    the previous training set survives.  ``MakeTextMining`` is then run on
    the result.

    The cache is never invalidated automatically: whenever the training set
    changes, delete ``all_words_dict_file`` and ``train_datas_file`` by hand
    before running.

    All configuration names used here (``posts``, the ``*_col`` field names,
    file and directory paths, word sets, ``test_speedup`` ...) are expected
    to be provided by the module's star imports.

    Returns:
        None.
    """
    cache_ready = (os.path.exists(all_words_dict_file)
                   and os.path.exists(train_datas_file))
    if cache_ready:
        # The cache files are JSON (pickle was the earlier format), so they
        # are opened in text mode, which works on both Python 2 and 3.
        # NOTE(review): after a JSON round-trip the (segments, label) tuples
        # in train_datas come back as lists -- confirm MakeTextMining accepts
        # both shapes.
        with open(all_words_dict_file, "r") as fp_json:
            all_words_tf_dict, all_words_idf_dict = json.load(fp_json)
        with open(train_datas_file, "r") as fp_json:
            train_datas = json.load(fp_json)
    else:
        if not os.path.exists(Datas_Dir):
            os.makedirs(Datas_Dir)
        all_para = (posts,
                    time_col, content_col, source_col, t_status_col,
                    keyword_col, country_col, imp_col, limit_number,
                    lag)
        all_words_tf_dict, all_words_idf_dict, train_datas = MakeAllWordsDict(*all_para)
        # json.dump emits text, so the file must be opened in text mode: a
        # binary-mode handle raises TypeError on Python 3.
        with open(all_words_dict_file, "w") as fp_json:
            json.dump((all_words_tf_dict, all_words_idf_dict), fp_json)
        with open(train_datas_file, "w") as fp_json:
            json.dump(train_datas, fp_json)
        # Remove files produced from the previous training set.
        if os.path.exists(Classifier_Dir):
            shutil.rmtree(Classifier_Dir)

    all_para = (posts,
                time_col, content_col, source_col, t_status_col,
                keyword_col, country_col, imp_col, limit_number,
                lag, stopwords_set, blackwords_set, writewords_set,
                all_words_tf_dict, all_words_idf_dict, train_datas, test_speedup)
    id_dict = MakeTextMining(*all_para)
    # Alternative entry points taking the same arguments:
    # MakeTextMining_ClassifyTest(*all_para)  # compare classifier performance
    # MakeTextMining_Calendar(*all_para)  # calendar

    # Follow-up steps, currently disabled:
    # all_para = (smtp_server, from_addr, passwd, to_addr, sendmail_flag, id_dict)
    # DataSendMail(*all_para)
    # all_para = (posts, keyword_col, country_col, imp_col, id_dict)
    # DataUpdateAndSave(*all_para)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    Main()