forked from modelscope/data-juicer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_formatter.py
159 lines (137 loc) · 6.19 KB
/
text_formatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import os
from multiprocessing import Pool
import pdfplumber
from datasets import Dataset, concatenate_datasets, load_dataset
from docx import Document
from loguru import logger
from data_juicer.utils.cache_utils import DATA_JUICER_CACHE_HOME
from data_juicer.utils.file_utils import find_files_with_suffix
from .formatter import FORMATTERS, LocalFormatter, add_suffixes, unify_format
def extract_txt_from_docx(fn, tgt_path):
    """
    Extract text from a docx file and save it as a txt file.

    :param fn: path to the input docx file.
    :param tgt_path: directory in which the extracted txt file is saved;
        the output file keeps the source basename with a ``.txt`` suffix.
    """
    doc = Document(fn)
    # keep only paragraphs that contain non-whitespace text
    text = [para.text for para in doc.paragraphs if para.text.strip()]
    base_fn = os.path.basename(fn).lower().replace('.docx', '.txt')
    # write explicitly as UTF-8 so non-ASCII content survives on every
    # platform (the default locale encoding may fail on Windows)
    with open(os.path.join(tgt_path, base_fn), 'w', encoding='utf-8') as f:
        f.write('\n'.join(text))
def extract_txt_from_pdf(fn, tgt_path):
    """
    Extract text from a pdf file and save it as a txt file.

    :param fn: path to the input pdf file.
    :param tgt_path: directory in which the extracted txt file is saved;
        the output file keeps the source basename with a ``.txt`` suffix.
    """
    with pdfplumber.open(fn) as pdf:
        text = []
        for page in pdf.pages:
            # remove tables from each page extracted by pdfplumber
            tables = page.find_tables()
            for table in tables:
                page = page.outside_bbox(table.bbox)
            # extract_text() returns None for pages without a text layer
            # (e.g. scanned images) -- treat those as empty text
            page_text = page.extract_text() or ''
            # remove the trailing page number, if any, from each page
            page_num = str(page.page_number)
            if page_text.rstrip().endswith(page_num):
                page_text = page_text.rstrip()[:-len(page_num)]
            if page_text.strip():
                text.append(page_text)
    base_fn = os.path.basename(fn).lower().replace('.pdf', '.txt')
    # write explicitly as UTF-8 so non-ASCII content survives on every
    # platform (the default locale encoding may fail on Windows)
    with open(os.path.join(tgt_path, base_fn), 'w', encoding='utf-8') as f:
        f.write('\n'.join(text))
@FORMATTERS.register_module()
class TextFormatter(LocalFormatter):
    """
    The class is used to load and format text-type files.

    e.g. `['.txt', '.pdf', '.cpp', '.docx']`
    """

    SUFFIXES = [
        '.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c',
        '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H',
        '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77',
        '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl',
        '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps',
        '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1',
        '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command',
        '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst',
        '.m', '.smali'
    ]

    def __init__(self,
                 dataset_path,
                 suffixes=None,
                 add_suffix=False,
                 **kwargs):
        """
        Initialization method.

        :param dataset_path: a dataset file or a dataset directory
        :param suffixes: files with specified suffixes to be processed
        :param add_suffix: whether to add the file suffix to dataset meta
            info
        :param kwargs: extra args
        """
        super().__init__(
            dataset_path=dataset_path,
            suffixes=suffixes if suffixes else self.SUFFIXES,
            type='text',
            add_suffix=add_suffix,
            **kwargs,
        )
        self.dataset_path = dataset_path
        self.add_suffix = add_suffix

    def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
        """
        Load a dataset from local text-type files.

        :param num_proc: number of processes when loading the dataset
        :param global_cfg: the global cfg used in consequent processes,
        :return: unified_format_dataset.
        """
        # extract text to cache directory
        extracted_dataset_path = os.path.join(
            DATA_JUICER_CACHE_HOME,
            os.path.basename(os.path.abspath(self.dataset_path)))

        for file_type in self.data_files:
            # only docx/pdf files need a text-extraction pass; all other
            # suffixes are already plain text
            if file_type not in ('.docx', '.pdf'):
                continue

            extracted_filetype_path = os.path.join(extracted_dataset_path,
                                                   file_type.strip('.'))
            # exist_ok avoids the check-then-create race of the previous
            # os.path.exists + os.makedirs pattern
            os.makedirs(extracted_filetype_path, exist_ok=True)
            logger.info('Extracting text from {} files...'.format(
                file_type.strip('.')))

            extract_func = extract_txt_from_docx \
                if file_type == '.docx' else extract_txt_from_pdf
            # context manager guarantees worker cleanup even if scheduling
            # raises; close()+join() still runs before __exit__ terminates
            # the pool so all extraction tasks finish
            with Pool(num_proc) as pool:
                for data_file in self.data_files[file_type]:
                    # NOTE: extraction errors are silently ignored here
                    # (async results are never collected) -- best-effort
                    pool.apply_async(func=extract_func,
                                     args=(
                                         data_file,
                                         extracted_filetype_path,
                                     ))
                pool.close()
                pool.join()
            logger.info(f'Extracted text files are stored in directory '
                        f'{extracted_filetype_path}')

            # look for extracted txt files
            self.data_files[file_type] = find_files_with_suffix(
                extracted_filetype_path, '.txt')['.txt']

        # load text dataset, one text file as one sample
        datasets = load_dataset('text',
                                data_files={
                                    key.strip('.'): self.data_files[key]
                                    for key in self.data_files
                                },
                                sample_by='document',
                                num_proc=num_proc,
                                **self.kwargs)
        # whether to add file suffix to dataset meta info
        if self.add_suffix:
            logger.info('Add suffix info into dataset...')
            datasets = add_suffixes(datasets, num_proc)
        else:
            datasets = concatenate_datasets([ds for _, ds in datasets.items()])
        return unify_format(datasets,
                            text_keys=self.text_keys,
                            num_proc=num_proc,
                            global_cfg=global_cfg)