-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_transcript.py
executable file
·381 lines (317 loc) · 13.7 KB
/
clean_transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
#!/usr/bin/env python3
"""
Translator/Transcript Cleaner and Formatter
This script processes raw transcripts using OpenAI's API to correct grammar,
remove irrelevant content, and improve formatting. It can handle large transcripts
by splitting them into chunks and supports various languages.
Usage:
clean-transcript raw_transcript.txt -c "context" -m model -o clean_transcript.txt
"""
import argparse
import sys
import os
from openai import OpenAI
from language_codes import get_language_name
import re
from tenacity import retry, stop_after_attempt, wait_exponential
# Default model used by process_transcript() to clean each transcript chunk.
DEF_MODEL = 'gpt-4o'
# Default model used by create_context_summary() for the rolling summaries.
DEF_SUMMARY_MODEL = 'gpt-4o-mini'
# Default upper bound for one chunk; the chunker compares it against the
# chunk's UTF-8 encoded length (bytes, not characters).
DEF_MAX_CHUNK_SIZE = 3000
# Default sampling temperature passed to the API for chunk cleaning.
DEF_TEMPERATURE = 0.05
# Default max_tokens passed to the API for each cleaned-chunk response.
DEF_MAX_TOKENS = 4096
import logging
import colorlog
def setup_logging(verbose, debug=False):
    """Attach a colorized stream handler to the root logger and return it.

    Args:
        verbose: When truthy, log at INFO level and include timestamps.
        debug: When truthy, log at DEBUG level and include timestamps
            (takes precedence over ``verbose``).

    Returns:
        The root logger, with the new handler added.
    """
    if debug:
        level = logging.DEBUG
    elif verbose:
        level = logging.INFO
    else:
        level = logging.ERROR

    # Timestamps are only shown in the chattier (verbose/debug) modes.
    if verbose or debug:
        time_format = "%H:%M:%S"
        record_format = "%(log_color)s%(asctime)s:%(module)s:%(levelname)s: %(message)s"
    else:
        time_format = None
        record_format = "%(log_color)s%(module)s:%(levelname)s: %(message)s"

    level_colors = {
        'DEBUG': 'cyan',
        'INFO': 'green',
        'WARNING': 'yellow',
        'ERROR': 'red',
        'CRITICAL': 'red,bg_white',
    }

    root_logger = logging.getLogger()
    root_logger.setLevel(level)

    stream_handler = colorlog.StreamHandler()
    stream_handler.setFormatter(
        colorlog.ColoredFormatter(
            record_format,
            datefmt=time_format,
            reset=True,
            log_colors=level_colors,
            secondary_log_colors={},
            style='%',
        )
    )
    root_logger.addHandler(stream_handler)
    return root_logger
# Initialize the OpenAI client with the API key from the environment.
# The key is checked *before* the client is constructed: building the client
# without a key fails inside the constructor, which would bypass the friendly
# error message and exit code below.
if not os.getenv('OPENAI_API_KEY'):
    logging.error("OPENAI_API_KEY environment variable not set")
    sys.exit(1)
openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), reraise=True)
def _request_chat_completion(model, messages, temperature, max_tokens):
    """Send one chat-completion request, retrying failed attempts with backoff."""
    return openai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        n=1,
        stop=''
    )


def call_LLM(systemprompt, input_text, model='gpt-4o', temperature=0, max_tokens=1000):
    """Run a single system+user prompt through the chat API and return the reply text.

    Args:
        systemprompt (str): Content of the system message.
        input_text (str): Content of the user message.
        model (str): Model name passed to the API.
        temperature (float): Sampling temperature passed to the API.
        max_tokens (int): Maximum number of tokens for the API response.

    Returns:
        str: The reply with surrounding whitespace stripped, or '' when the
        API returns no choices or empty content (a warning is logged).

    The process exits with status 1 if the request still fails after the
    retry attempts are exhausted.
    """
    messages = [
        {"role": "system", "content": systemprompt},
        {"role": "user", "content": input_text}
    ]
    # The retry lives on the helper: catching the error here and exiting
    # inside a retried function would prevent any retry from happening.
    try:
        response = _request_chat_completion(model, messages, temperature, max_tokens)
    except Exception as e:
        logging.error(f"API error: {str(e)}")
        sys.exit(1)

    # Guard both an empty choices list and a None content before stripping.
    content = response.choices[0].message.content if response.choices else None
    if not content or not content.strip():
        logging.warning(f"Empty response from API: input_text='{input_text[:128]}...'")
        return ''
    return content.strip()
# CONTEXT --------------------------------------------------------------------------
def create_context_summary(input_text, model=DEF_SUMMARY_MODEL):
    """Ask the LLM for a brief summary of ``input_text``.

    Args:
        input_text (str): Text to summarize.
        model (str): Model name forwarded to call_LLM.

    Returns:
        str: Whatever call_LLM returns for the summary prompt.
    """
    # The prompt lines are deliberately flush-left so the model receives them
    # without leading indentation.
    summary_instructions = """
# Summary Editor
You are a Summary Editor, expert in editing texts into very brief, concise summaries.
Your role is to create a summary of the main points of the text, focussing only on the most salient information, in no more than three paragraphs.
Follow these guidelines:
- **Do not** use third-person references (e.g., "The speaker said...", etc).
- **Do not** add any new information or preambles. Just the summarized text.
- **Only** Output the summary paragraphs; *no* preambles or commentary.
- NEVER include **any** additional preamble to the summary paragraphs, such as 'Here is the detailed summary ...', etc.
- Your only task is to create summary paragraphs, and to output those paragraphs.
Examples:
- Incorrect: "Here is the brief summary of the main points from the text"
- Incorrect: "The speaker said that the brain's prefrontal cortex is responsible for decision-making."
"""
    return call_LLM(
        summary_instructions,
        input_text,
        model=model,
        temperature=0,
        max_tokens=1000,
    )
def add_and_before_last(s):
    """Rewrite a comma-separated list so "and" precedes its last item.

    Items are split on commas regardless of the whitespace around them, and
    empty items (e.g. from a trailing comma) are ignored, so "a,b,c",
    "a, b, c" and "a , b, c," all yield "a, b, and c".

    Args:
        s (str): Comma-separated items, e.g. "neuroscience, free will".

    Returns:
        str: "" for no items, the item itself for one, "a and b" for two,
        and "a, b, and c" for three or more.
    """
    items = [item.strip() for item in s.split(',')]
    items = [item for item in items if item]
    if not items:
        return ''
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return ' and '.join(items)
    return ', '.join(items[:-1]) + ', and ' + items[-1]
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), reraise=True)
def _request_transcript_completion(model, messages, max_tokens, temperature):
    """Send one chat-completion request for a transcript chunk, retrying failures with backoff."""
    return openai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1,
        n=1,
        stop='',
        frequency_penalty=0,
        presence_penalty=0)


def _generate_text_with_continuation(input_text, model, max_tokens, temperature, context, lang, context_summary):
    """
    Generate corrected and formatted text using OpenAI's API with retry mechanism.

    Args:
        input_text (str): The input text to process
        model (str): The OpenAI model to use
        max_tokens (int): Maximum number of tokens for the API response
        temperature (float): Temperature for text generation
        context (str | None): Comma-separated domain-specific context for the
            transcript; None or '' means no extra context
        lang (str | None): Language code of the input text; None is treated
            as English
        context_summary (str | None): Summary of previously processed text,
            embedded in the prompt when non-empty

    Returns:
        OpenAI API response object; the content of its first choice is
        always a string (set to '' when the API returned empty content).

    The process exits with status 1 if the request still fails after the
    retry attempts are exhausted, or if the response contains no choices.
    """
    # Normalize a missing context to '' so it is never interpolated as "None"
    # and can safely be extended below.
    context = f", with extensive knowledge in {add_and_before_last(context)}" if context else ''

    language_task = ''
    if lang is not None and lang != 'en':
        language = get_language_name(lang)
        # Language names containing 'Unknown' fall back to plain English
        # cleanup without a translation task.
        if 'Unknown' not in language:
            context += f", and you are an expert {language}-English translator."
            language_task = f', and accurately translate/interpret the text from {language} into English'
    logging.info(f"{context=}")

    summary_section = f"\n\n## Context Summary:\n\n{context_summary}\n\n" if context_summary else ''

    # The prompt lines are deliberately flush-left so the model receives them
    # without leading indentation.
    systemprompt = f"""
# Translation/Transcription Correction and Formatting Editor
You are an expert translation/transcription editor{context}.
Your task is to review and correct text -- which could be transcriptions, or other text -- focusing on domain-specific terms and concepts{language_task}. Follow these guidelines:
1. **Grammar and Clarity**:
- Fix poor grammar **only where necessary** for clarity.
- Remove hesitation words (e.g., "um", "uh") and repetitions.
2. **Relevance**:
- Some text/transcripts may contain sentences or paragraphs requesting that viewers subscribe to their channel, or join their Patreon. These sentences and paragraphs must be removed.
- Some text/transcripts may contain sentences or paragraphs that are promotions for products or services unrelated to the topic of the text. These sentences and paragraphs must be removed.
3. **Transcription Formatting**:
- Do **not** use third-person references (e.g., "The speaker said...").
- Create logical sentences that are properly capitalized and punctuated.
- Create logical paragraphs from the sentences that are neither too long nor too short.
4. **Content Integrity**:
- Do **not** change the meaning of the text.
- Do **not** add any new information or preambles. Just the corrected text.
- Reformat the text as needed without altering the original content.
5. **Examples**:
- Correct: "The brain's prefrontal cortex is responsible for decision-making."
- Incorrect: "The speaker said that the brain's prefrontal cortex is responsible for decision-making."
{summary_section}
## Input/Output
Your only goal is to reformat the text for readability and clarity while preserving the original meaning and context. Output only the corrected and formatted text. Do not include any additional preamble, commentary or explanations.
Input: Raw transcript text
Output: Corrected and formatted translation/transcript, in English
"""
    messages = [
        {"role": "system", "content": systemprompt},
        {"role": "user", "content": input_text}
    ]

    # The retry lives on the helper: catching the error here and exiting
    # inside a retried function would prevent any retry from happening.
    try:
        response = _request_transcript_completion(model, messages, max_tokens, temperature)
    except Exception as e:
        logging.critical(f"Error in API call: {str(e)}")
        sys.exit(1)

    if not response.choices:
        # Callers read response.choices[0]; there is nothing usable to return.
        logging.debug(f'{response=}')
        logging.critical("Error in API call: response contained no choices")
        sys.exit(1)

    content = response.choices[0].message.content
    if not content or not content.strip():
        logging.warning(f"Empty response from API: input_text='{input_text[:128]}...'")
        logging.debug(f'{response=}')
        response.choices[0].message.content = ''
    logging.debug(f'{response.choices[0].message.content[:80]=}')
    return response
from create_sentences import create_sentences
def _get_chunk_with_complete_sentences(text, max_chunk_size):
    """
    Extract a chunk of text with complete sentences up to max_chunk_size.

    The chunk is the longest run of leading sentences whose space-joined
    UTF-8 encoding fits in max_chunk_size bytes. The remainder is rebuilt
    from the sentences that were not consumed, rather than by slicing the
    original text, because the chunk is a re-joined copy whose length does
    not correspond to an offset in ``text`` (and byte counts must not be
    used as character indices).

    If the very first sentence is larger than max_chunk_size it is returned
    as a chunk on its own, so the caller always makes progress and no text
    is dropped.

    Args:
        text (str): Input text to chunk.
        max_chunk_size (int): Maximum size of the chunk, in UTF-8 bytes.

    Returns:
        tuple: (chunk, remaining_text)
    """
    # NOTE(review): assumes create_sentences yields sentence strings in
    # document order -- confirm against the create_sentences module.
    sentences = [s.strip() for s in create_sentences(text, max_sentence_length=max_chunk_size-1)]
    sentences = [s for s in sentences if s]

    taken = []
    used_bytes = 0
    for sentence in sentences:
        # +1 accounts for the separating space, as in the size budget.
        cost = len(sentence.encode('utf-8')) + 1
        if used_bytes + cost > max_chunk_size:
            break
        taken.append(sentence)
        used_bytes += cost

    if not taken and sentences:
        # A single sentence larger than the budget: emit it alone instead of
        # returning an empty chunk with the text unchanged.
        logging.warning(
            f"Sentence exceeds max_chunk_size and will be sent as its own chunk: {sentences[0][:128]}..."
        )
        taken.append(sentences[0])

    chunk = ' '.join(taken)
    remaining_text = ' '.join(sentences[len(taken):])
    return chunk, remaining_text
def process_transcript(input_text, *, model=DEF_MODEL, max_chunk_size=DEF_MAX_CHUNK_SIZE, temperature=DEF_TEMPERATURE, context='', language='en', max_tokens=DEF_MAX_TOKENS):
    """
    Process the entire transcript by splitting it into chunks and generating corrected text.

    After each chunk (except the last) a short summary of the most recent
    output paragraphs is generated and passed along as context for the next
    chunk.

    Args:
        input_text (str): The full input transcript
        model (str): OpenAI model to use
        max_chunk_size (int): Maximum size of each chunk
        temperature (float): Temperature for text generation
        context (str): Domain-specific context
        language (str): Language code of the input text
        max_tokens (int): Maximum number of tokens for API response

    Returns:
        str: Processed and corrected transcript. If the chunker stops making
        progress, an error is logged and the text generated so far is
        returned.
    """
    input_text = input_text.rstrip()
    total_length = len(input_text)
    processed_length = 0
    generated_text = ""
    remaining_text = input_text
    iterations = 0
    iteration_limit = int((total_length / max_chunk_size) * 2)
    context_summary = None

    while remaining_text:
        initial_length = len(remaining_text)

        # Get the next chunk with complete sentences
        chunk, remaining_text = _get_chunk_with_complete_sentences(remaining_text, max_chunk_size)

        # Without this guard a chunker that consumes nothing would make the
        # loop call the API forever on the same text.
        consumed = initial_length - len(remaining_text)
        if consumed <= 0:
            logging.error(
                f"Chunking made no progress; stopping with {len(remaining_text)} characters unprocessed"
            )
            break
        processed_length += consumed

        content = ''
        if chunk:
            response = _generate_text_with_continuation(chunk, model, max_tokens, temperature, context, language, context_summary)
            content = response.choices[0].message.content
            # Separate consecutive chunk outputs when the previous one ended
            # on punctuation.
            if generated_text and generated_text[-1] in '.,?!`"':
                generated_text += ' '
            generated_text += content

        percent = processed_length / total_length
        logging.info(f"Progress: {percent:.1%} Iteration: {iterations}/{iteration_limit}")

        # Done once nothing remains; the loop never cuts off tail text.
        if not remaining_text:
            break

        # Only summarize when another chunk will use the summary and there
        # is actual output to summarize.
        if content.strip():
            paragraphs = content.strip().split('\n\n')
            context_summary = create_context_summary('\n\n'.join(paragraphs[-7:]))

        iterations += 1
        if iterations > iteration_limit:
            logging.error('Too many iterations!')

    return generated_text
def main():
    """
    Parse command-line arguments and orchestrate the transcript cleaning process.

    Reads the input file, runs process_transcript() on its contents, and
    writes the result to the --output file or to stdout. Files are read and
    written as UTF-8 so that non-English transcripts behave the same on
    every platform. Exits with status 1 on file errors and 130 on Ctrl-C.
    """
    # Set up the signal handler
    import signal

    def signal_handler(sig, frame):
        # Reset terminal colors before exiting on Ctrl-C.
        print('\033[0m^C\n')
        sys.exit(130)

    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser(
        description="Fix and clean up transcripts using OpenAI API.",
        epilog="Example: clean-transcript raw_transcript.txt -c \"neuroscience, free will\" -m gpt-4o -o clean_transcript.txt"
    )
    parser.add_argument("input_file",
                        help="Path to the raw text/transcript file")
    parser.add_argument('-L', '--input-language', default=None,
                        help='Define the language of the text. If this is specified, then the text is translated into English (def: None))')
    parser.add_argument("-c", "--context", default=None,
                        help="Domain-specific context for the transcript (default: none)")
    parser.add_argument("-m", "--model", default=DEF_MODEL,
                        help=f"OpenAI model to use (default: {DEF_MODEL})")
    parser.add_argument("-M", "--max-tokens", type=int, default=DEF_MAX_TOKENS,
                        help=f"Maximum tokens (default: {DEF_MAX_TOKENS})")
    parser.add_argument("-s", "--max-chunk-size", type=int, default=DEF_MAX_CHUNK_SIZE,
                        help=f"Maximum chunk size for processing (default: {DEF_MAX_CHUNK_SIZE})")
    parser.add_argument("-t", "--temperature", type=float, default=DEF_TEMPERATURE,
                        help=f"Temperature for text generation, 0.0 - 1.0 (default: {DEF_TEMPERATURE})")
    parser.add_argument("-o", "--output",
                        help="Output file path (default: stdout)")
    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help='Enable verbose output')
    parser.add_argument('-d', '--debug', default=False, action='store_true',
                        help='Enable debug output')
    args = parser.parse_args()

    # Set up logging based on verbose/debug options
    setup_logging(args.verbose, args.debug)

    try:
        with open(args.input_file, 'r', encoding='utf-8') as file:
            input_text = file.read()
    except (OSError, UnicodeDecodeError) as e:
        logging.error(f"Error reading input file: {str(e)}")
        sys.exit(1)

    generated_text = process_transcript(
        input_text,
        model=args.model,
        max_tokens=args.max_tokens,
        temperature=args.temperature,
        context=args.context,
        language=args.input_language,
        max_chunk_size=args.max_chunk_size,
    )

    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as file:
                file.write(generated_text)
        except OSError as e:
            logging.error(f"Error writing to output file: {str(e)}")
            sys.exit(1)
    else:
        print(generated_text)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
#fin