"""
Provides functions to clean and process JSONL files by removing duplicate sentences,
repeated phrases, and simple repetitions from the text fields.
Functions:
remove_duplicate_sentences(text: str) -> str:
Removes consecutive duplicate sentences from the given text.
remove_repeated_phrases(text: str, min_phrase_length: int = 2, max_phrase_length: int = 22, iterations: int = 3) -> str:
Removes repeated phrases of varying lengths from the given text.
remove_simple_repetitions(text: str, max_len: int = 22) -> str:
Removes simple repetitions of words or phrases up to a specified length from the given text.
clean_text(text: str) -> str:
Cleans the given text by removing simple repetitions, duplicate sentences, and repeated phrases.
process_jsonl_line(json_line: str) -> str:
Processes a single JSONL line, cleaning the "text" field if it exists.
process_jsonl_file(input_file: str, output_file: str) -> None:
Processes an entire JSONL file, cleaning the "text" fields in parallel using multiple processes.
Usage:
This script can be run as a standalone program to clean a JSONL file specified by the input_file and output_file paths.
Author: Tijmen de Haan
Date: 2024-09-24
"""
import json
import logging
import re
from nltk.tokenize import sent_tokenize
from multiprocessing import Pool, cpu_count
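
# sent_tokenize depends on NLTK's "punkt" sentence-tokenizer models, which
# ship separately from the library itself. A minimal first-run guard (a
# sketch; assumes the models may be absent on a fresh install):
try:
    sent_tokenize("NLTK data check.")
except LookupError:
    import nltk

    nltk.download("punkt")  # newer NLTK releases may want "punkt_tab" instead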

# Set up basic configuration for logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def remove_duplicate_sentences(text):
    """Drop sentences that are identical to the immediately preceding one."""
    sentences = sent_tokenize(text)
    unique_sentences = []
    last_sentence = None
    for sentence in sentences:
        if sentence != last_sentence:
            unique_sentences.append(sentence)
        last_sentence = sentence
    return " ".join(unique_sentences)


def remove_repeated_phrases(
    text, min_phrase_length=2, max_phrase_length=22, iterations=3
):
    """Collapse back-to-back repeats of multi-word phrases.

    Scans for a phrase of phrase_length words immediately followed by an
    identical phrase and deletes the second copy, trying longer phrases
    first. The whole scan repeats up to `iterations` times.
    """
    words = text.split()
    for _ in range(iterations):
        modified = False
        for phrase_length in range(max_phrase_length, min_phrase_length - 1, -1):
            i = 0
            while i <= len(words) - 2 * phrase_length:
                phrase = words[i : i + phrase_length]
                next_phrase = words[i + phrase_length : i + 2 * phrase_length]
                if phrase == next_phrase:
                    # Delete the duplicate copy and re-check at the same index,
                    # so runs of three or more copies collapse to one
                    del words[i + phrase_length : i + 2 * phrase_length]
                    modified = True
                else:
                    i += phrase_length  # Skip ahead to the end of the current phrase
        # Early exit if no changes were made in this pass
        if not modified:
            break
    return " ".join(words)


def remove_simple_repetitions(text, max_len=22):
    """Collapse an immediate repetition of a span of 1 to max_len words."""
    # The trailing piece must be a raw string so that \1 stays a regex
    # backreference rather than the control character "\x01".
    pattern = r"\b(\w+(?:\s+\w+){0," + str(max_len - 1) + r"})\s+\1\b"
    while re.search(pattern, text):
        text = re.sub(pattern, r"\1", text)
    return text
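
# For example, remove_simple_repetitions("really really really long")
# returns "really long"; the while loop keeps substituting until no
# immediate word-span repetition remains.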


def clean_text(text):
    """Apply all cleaning passes to a single text field."""
    text = remove_simple_repetitions(text)
    # Collapse runs of a single character repeated five or more times
    text = re.sub(r"(.)\1{4,}", r"\1", text)
    # Collapse a single word repeated four or more times
    text = re.sub(r"(\b\w+\b)( \1){3,}", r"\1", text)
    # Normalize runs of escaped LaTeX spacing commands and markup artifacts
    text = re.sub(r"(\\\\quad){2,}", "\\\\quad\\\\quad", text)
    text = re.sub(r"(\\\\qquad){2,}", "\\\\qquad\\\\qquad", text)
    text = re.sub(r"(\\\\!)+", "\\\\!", text)
    text = re.sub(r"(\[\\\\n\*\\\\\]){2,}", "[]", text)
    text = re.sub(r"(,\\\\){2,}", ",\\\\", text)
    text = re.sub(r"(cos\\\\phi\s*){2,}", "cos\\\\phi ", text)
    text = re.sub(r"(\\\\,){2,}", "\\\\,", text)
    text = remove_duplicate_sentences(text)
    try:
        text = remove_repeated_phrases(text)
    except RecursionError:
        logging.error(
            "RecursionError occurred while removing repeated phrases. Skipping this step."
        )
    return text
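
# For example, clean_text("x x x x x") returns "x": the simple-repetition
# pass collapses the run before the sentence- and phrase-level passes run.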


def process_jsonl_line(json_line):
    """Clean the "text" field of one JSONL record.

    Records without a "text" field pass through unchanged; lines that fail
    to parse are logged and yield None.
    """
    try:
        data = json.loads(json_line)
        if "text" in data:
            cleaned_text = clean_text(data["text"])
            data["text"] = cleaned_text
        return json.dumps(data)
    except json.JSONDecodeError as e:
        logging.error(f"JSONDecodeError: {e}")
        return None
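
# For example, process_jsonl_line('{"text": "Hi. Hi."}') returns
# '{"text": "Hi."}', while a malformed line such as '{"text":' is logged
# and yields None so that the caller can skip it.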


def process_jsonl_file(input_file, output_file):
    """Clean every line of input_file in parallel and write to output_file."""
    num_processes = cpu_count()
    logging.info(f"Processing {input_file} using {num_processes} processes.")
    # Read the whole file up front; each line is an independent JSON record,
    # so the cleaning work parallelizes cleanly across a process pool.
    with open(input_file, "r") as f:
        lines = f.readlines()
    with Pool(processes=num_processes) as pool:
        cleaned_lines = pool.map(process_jsonl_line, lines)
    with open(output_file, "w") as fout:
        for cleaned_line in cleaned_lines:
            if cleaned_line is not None:
                fout.write(cleaned_line + "\n")
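
# For very large corpora, a streaming variant (a sketch of an alternative,
# not what this script does) would bound memory by iterating lazily:
#
#     with open(input_file) as f, Pool() as pool, open(output_file, "w") as out:
#         for line in pool.imap(process_jsonl_line, f, chunksize=256):
#             if line is not None:
#                 out.write(line + "\n")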


if __name__ == "__main__":
    base_dir = "/home/tijmen/cosmosage/datasets/cpt/"
    input_file = base_dir + "cosmosage_cpt.jsonl"
    output_file = base_dir + "cosmosage_cpt_cleaned.jsonl"
    process_jsonl_file(input_file, output_file)