-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: do_sentence_segmentation.py
74 lines (65 loc) · 2 KB
/
do_sentence_segmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""Script for sentence segmentation.
Copied and modified from https://github.com/eric-haibin-lin/text-proc.git
"""
import argparse
import glob
import io
import logging
import multiprocessing
import os
import time
import nltk
from nltk.tokenize import sent_tokenize
# Command-line interface.  The input/output suffixes identify pipeline
# stages: each file matched by --data is read as <file><input_suffix>
# and its segmented version is written to <file><output_suffix>.
parser = argparse.ArgumentParser(
    description='Sentence segmentation for BERT documents.')
parser.add_argument(
    '--data',
    type=str,
    default='./*/*.compact',
    help='Input files. Default is "./*/*.compact"')
parser.add_argument(
    '--input_suffix',
    type=str,
    default='.2',
    help='Suffix for input files. Default is ".2"')
parser.add_argument(
    '--output_suffix',
    type=str,
    default='.3',
    help='Suffix for output files. Default is ".3"')
parser.add_argument(
    '--nworker',
    type=int,
    default=72,
    help='Number of workers for parallel processing.')
args = parser.parse_args()
# Download the sentence-tokenizer model used by sent_tokenize.
# NOTE(review): recent NLTK releases also require the 'punkt_tab'
# resource for sent_tokenize -- confirm against the installed version.
nltk.download('punkt')
# Resolve the input glob once at module level so worker processes
# inherit the same file list.
input_files = sorted(glob.glob(os.path.expanduser(args.data)))
num_files = len(input_files)
num_workers = args.nworker
logging.basicConfig(level=logging.INFO)
logging.info('Number of input files to process = %d', num_files)
def process_one_file(one_input):
    """Separate paragraphs into sentences, for one file.

    Reads ``one_input + args.input_suffix`` and writes one sentence per
    line to ``one_input + args.output_suffix``.  A lone-newline input
    line (document separator in the BERT corpus format) is copied
    through as an empty output line, and a blank line is emitted after
    each input paragraph's sentences.

    Parameters
    ----------
    one_input : str
        Path prefix shared by the input and output file of this pair.
    """
    input_filename = one_input + args.input_suffix
    output_filename = one_input + args.output_suffix
    logging.info('Processing %s => %s', input_filename, output_filename)
    with io.open(input_filename, 'r', encoding='utf-8') as fin:
        with io.open(output_filename, 'w', encoding='utf-8') as fout:
            for line in fin:
                if len(line) == 1:
                    # A bare newline marks a document boundary; keep it.
                    fout.write(u'\n')
                sents = sent_tokenize(line)
                for sent in sents:
                    sent_str = sent.strip()
                    # Guard restored (it was commented out): skip
                    # sentences that are empty after stripping so no
                    # stray blank lines -- which read as spurious
                    # document boundaries downstream -- are emitted.
                    if sent_str:
                        fout.write('%s\n' % sent_str)
                fout.write(u'\n')
if __name__ == '__main__':
    # Fan the files out across a process pool; CPU-bound tokenization
    # benefits from true parallelism.  The context manager terminates
    # the pool's workers once the map completes (the original leaked
    # the pool: it was never closed or joined).
    tic = time.time()
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(process_one_file, input_files)
    toc = time.time()
    logging.info('Processed %s in %.2f sec', args.data, toc - tic)