-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathsentence_clip.py
75 lines (59 loc) · 2.31 KB
/
sentence_clip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf-8 -*-
"""
@Time : 2022/4/25 10:25
@Author : 吴双
@File : clip_msra.py
@Software: PyCharm
"""
from utils.paths import msra_ner_cn_path, ontonote4ner_cn_path
def create_cliped_file(fp, clip_len):
    """Split over-long examples of a token-per-line tagging file.

    Every line of ``fp`` is copied unchanged to ``fp + '_clip'``. Lines are
    expected to look like ``<token> <tag>`` with blank lines separating
    examples. While copying, an extra blank line (an example boundary) is
    inserted after the current line when the running example is already
    longer than ``clip_len`` lines AND the line is a safe place to cut:

    * a sentence-final mark tagged ``O``;
    * a clause mark, or the second of two consecutive ``…`` tokens,
      tagged ``O``;
    * any token whose tag starts with ``e``/``E``.

    After writing, the output file is read back and a short report is
    printed: every example longer than 200 lines, then the maximum example
    length.

    Args:
        fp: Path of the UTF-8 encoded input file.
        clip_len: Number of lines an example must exceed before a cut is
            made at the next eligible line.

    Returns:
        None. The clipped copy is written to ``fp + '_clip'`` and the report
        goes to stdout.
    """
    sentence_end = ('。', '!', '?')
    clause_end = (',', ';')
    fp_out = fp + '_clip'

    # Context managers guarantee both handles are closed, even on error.
    with open(fp, 'r', encoding='utf-8') as f, \
            open(fp_out, 'w', encoding='utf-8') as f_out:
        now_example_len = 0
        last_token = ''
        for line in f:
            line_split = line.strip().split()
            f_out.write(line)
            now_example_len += 1

            if not line_split:
                # An existing boundary: the blank line was copied above and a
                # second one is emitted, exactly as this script always did.
                f_out.write('\n')
                now_example_len = 0
                last_token = ''
                continue

            token = line_split[0]
            # A line carrying only one field has no tag; treat it as
            # "never a cut point" instead of raising IndexError.
            tag = line_split[1] if len(line_split) > 1 else ''

            # `now_example_len > 1` keeps the look-behind inside the current
            # example (the previous line cannot be a boundary).
            repeated_ellipsis = (
                now_example_len > 1 and last_token == '…' and token == '…'
            )
            punctuation_cut = tag == 'O' and (
                token in sentence_end
                or token in clause_end
                or repeated_ellipsis
            )
            entity_end_cut = tag[:1].lower() == 'e'

            if now_example_len > clip_len and (punctuation_cut or entity_end_cut):
                f_out.write('\n')
                now_example_len = 0
            last_token = token

    # Re-read what was actually written so the report reflects the file.
    cliped_examples = [[]]
    with open(fp_out, 'r', encoding='utf-8') as f_check:
        for line in f_check:
            stripped = line.strip()
            if stripped:
                cliped_examples[-1].append(stripped)
            else:
                cliped_examples.append([])

    check = 0
    max_length = 0
    for example in cliped_examples:
        if len(example) > 200:
            print(len(example), ''.join(row.split(' ')[0] for row in example))
            check = 1
        max_length = max(max_length, len(example))
    print('最长的句子有:{}'.format(max_length))
    if check == 0:
        print('没句子超过200的长度')
# Clip every corpus split at import/run time: files under msra_ner_cn_path
# use a threshold of 210 lines, files under ontonote4ner_cn_path use 180.
for _path_template, _corpus_dir, _clip_len in (
    ('{}/train_dev.char.bmes', msra_ner_cn_path, 210),
    ('{}/test.char.bmes', msra_ner_cn_path, 210),
    ('{}/train.char.bmes', ontonote4ner_cn_path, 180),
    ('{}/dev.char.bmes', ontonote4ner_cn_path, 180),
    ('{}/test.char.bmes', ontonote4ner_cn_path, 180),
):
    create_cliped_file(_path_template.format(_corpus_dir), _clip_len)