-
Notifications
You must be signed in to change notification settings - Fork 3
/
tools.py
136 lines (106 loc) · 3.96 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# _*_ coding: utf-8 _*_
"""
tools.py by xianhu
"""
import re
import jieba
import random
import operator
import functools
from collections import defaultdict
from config_code.config_dm_gender import config_gender_tags
from config_code.config_dm_age import config_age_tags
def get_config_oneword(file_name, is_update=False):
"""
获取单行只有一个单词的配置文件,例如停用词等,返回一个列表
"""
with open(file_name, "r") as file_in:
words_list = [line.strip() for line in file_in if line.strip()]
if is_update:
with open(file_name, "w") as file_out:
file_out.write("\n".join([word for word in sorted(set(words_list))]))
return words_list
# 停用词
stop_words = set(get_config_oneword("./config_data/app_stopwords.txt", is_update=True))
stop_words.update(get_config_oneword("/Users/allen/我的坚果云/5-Project/文本资源/stop_words.txt", is_update=False))
def check_word(word):
"""
检查word的状态
"""
if (len(word) < 2) or (word in stop_words):
return False
if re.search("[\u4e00-\u9fa5]", word) or re.search("[a-z]", word):
return True
return False
def segment_app_data():
"""
给APP信息分词,首先要读取标签数据
"""
# 获取全部的label数据:性别、年龄
all_label = set(functools.reduce(operator.add, functools.reduce(operator.add, config_gender_tags.values())))
all_label.update(functools.reduce(operator.add, config_age_tags.values()))
assert len(stop_words & all_label) == 0
# 加入jieba分词
jieba.set_dictionary("/Users/allen/我的坚果云/5-Project/文本资源/segment_words.txt")
for label in all_label:
jieba.add_word(label, 300)
num = 0
file_out = open("./config_data/app_segment.txt", "w")
for line in open("./backup/app_info.txt", "r", encoding="utf-8"):
frags = [item.strip() for item in line.strip().split("\t")]
assert len(frags) == 7, frags
num += 1
if num % 1000 == 0:
print("now:", num)
pkg_name = frags[0].strip()
soft_game, app_name, app_classify, app_tags, content, and_ios = [item.lower() for item in frags[1:]]
# 处理content
words = filter(lambda w: check_word(w), [w.strip() for w in jieba.cut(app_name+" "+content)])
# 处理tags
tags = filter(lambda w: check_word(w), [w.strip() for w in app_tags.split(",")])
# 输出
file_out.write("\t".join([pkg_name, soft_game, app_name, app_classify, ",".join(tags), ",".join(words), and_ios])+"\n")
return
def get_train_split(train_file, weights_dict):
"""
分割训练集
"""
count_data = defaultdict(int)
line_nums_data = defaultdict(list)
line_num = 0
for line in open(train_file, "r"):
frags = line.strip().split("\t")
assert len(frags) and (frags[0].strip() in weights_dict), frags
count_data[frags[0].strip()] += 1
line_nums_data[frags[0].strip()].append(line_num)
line_num += 1
max_index = 10000000
for key in weights_dict:
count = count_data[key] / weights_dict[key]
max_index = count if count < max_index else max_index
line_nums_set = set()
for key in weights_dict:
count = int(weights_dict[key] * max_index)
line_num_list = random.shuffle(line_nums_data[key])
line_nums_set.update(line_num_list[:count])
file_out = open(train_file + "_shuffle.txt", "w")
line_num = 0
for line in open(train_file, "r"):
frags = line.strip().split("\t")
assert len(frags) and (frags[0].strip() in weights_dict), frags
if line_num in line_nums_set:
file_out.write(line)
line_num += 1
return
if __name__ == '__main__':
segment_app_data()
exit()
get_train_split("age_train.txt", weights_dict={
"16-19": 0.15,
"20-25": 0.2,
"26-30": 0.3,
"31-35": 0.15,
"36-45": 0.15,
">45": 0.05,
})
exit()