forked from Ayanaminn/N46Whisper
-
Notifications
You must be signed in to change notification settings - Fork 1
/
summay_everything.py
79 lines (65 loc) · 3.07 KB
/
summay_everything.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# 使用 internLM2 总结文件,格式化输出到目标位置
import re
import os
from pathlib import Path
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['HF_HOME'] = './temp/hf-cache'
from whispertranslator.llm import InternLM2
from lmdeploy import GenerationConfig
def split_text(text, max_word_count):
    """Split *text* into chunks holding at most *max_word_count* words each.

    The text is first cut into fragments at every whitespace character that
    directly follows a comma or a period. Fragments are then packed greedily,
    in order, into chunks; inside a chunk the fragments are joined by a single
    space. A fragment is never cut, so a single fragment that is longer than
    the limit becomes a chunk of its own that exceeds the limit.

    Args:
        text: The text to split.
        max_word_count: Maximum number of words per chunk.

    Returns:
        A list of stripped, non-empty chunk strings. Empty or whitespace-only
        input yields an empty list.
    """
    def count_words(fragment):
        # A "word" is a maximal run of word characters between word boundaries.
        return len(re.findall(r'\b\w+\b', fragment))

    # Split after commas and periods; the punctuation stays with its fragment.
    fragments = re.split(r'(?<=[,.])\s', text)

    chunks = []
    pending = []        # fragments of the chunk currently being built
    pending_words = 0   # word count of `pending`

    for fragment in fragments:
        fragment_words = count_words(fragment)
        # Close the current chunk only when it already contains words and the
        # next fragment would push it over the limit. Word-less leftovers
        # (e.g. a lone "...") are carried into the next chunk instead of
        # being discarded.
        if pending_words > 0 and pending_words + fragment_words > max_word_count:
            chunks.append(' '.join(pending).strip())
            pending = []
            pending_words = 0
        pending.append(fragment)
        pending_words += fragment_words

    # Flush the tail, but never emit an empty chunk.
    tail = ' '.join(pending).strip()
    if tail:
        chunks.append(tail)

    return chunks
# Model handle shared by the translation and summary passes below.
internLM2 = InternLM2(session_len=8096)

# Sampling settings passed to every infer() call in this script.
gen_config = GenerationConfig(
    top_k=20,
    top_p=0.3,
    temperature=0.1,
)

# System prompt for the translation pass. It asks the model to act as an
# English expert, translate the text into Chinese, rephrase freely so that it
# reads naturally, and return only the translation.
translator_system_prompt = """
你是一个英文专家,请你把下列文字翻译成中文,你可以修改这段话的叙述方式让他更符合中文,只返回给我翻译结果:
"""

# System prompt for the summary pass. It asks for the topic of the text,
# summarized as a list of points.
summary_system_prompt = """
总结下列文字的主题,分点阐述概括:
"""
def _translate_chunk(chunk):
    """Translate one chunk with InternLM2 and return a single output line.

    The model is queried with the translator system prompt. All spaces are
    removed from the reply. If the reply looks like a repetition loop (its
    last four characters occur more than ten times in it), the query is
    issued once more and the second reply gets the same cleanup.

    Args:
        chunk: Source text of one chunk.

    Returns:
        The last line of the cleaned reply, ignoring trailing blank lines.
    """
    def infer_once():
        reply = internLM2.infer(translator_system_prompt, f"{chunk}", gen_config)
        # Work on a local string rather than mutating the response object.
        return reply.text.replace(" ", "")  # remove spaces

    text = infer_once()
    if text.count(text[-4:]) > 10:
        print("出现重复!")
        # The retry goes through the same cleanup as the first attempt.
        text = infer_once()

    # Strip first so that a trailing newline in the reply does not turn the
    # "last line" into an empty string.
    return text.strip().split('\n')[-1]


if __name__ == "__main__":
    # Only the two values below need to be edited.
    src_path = ""
    export_dir = ""
    # Only the two values above need to be edited.

    # Every file in this script is read and written as UTF-8, so the result
    # does not depend on the platform's locale encoding.
    with open(src_path, 'r', encoding='utf-8') as file:
        full_text = file.read()
    new_paragraphs = split_text(full_text, max_word_count=200)

    export_path = Path(export_dir)
    # Create the export directory if it does not exist yet.
    export_path.mkdir(parents=True, exist_ok=True)
    translate_filename = export_path / (
        os.path.basename(src_path) + '_translate_new' + '.txt'
    )

    translated_lines = []
    with open(translate_filename, 'w', encoding='utf-8') as file:
        for chunk in new_paragraphs:
            chunk = chunk.replace("\n", ".")
            translated = _translate_chunk(chunk)
            print(chunk, '\n', translated)
            # Each translated chunk is written to the file as it is produced.
            file.write(translated + '\n')
            translated_lines.append(translated)

    # Same text that was just written to the file; kept in memory so the file
    # does not have to be read back.
    content = ''.join(line + '\n' for line in translated_lines)

    summary_text = internLM2.infer(
        summary_system_prompt,
        content.replace(' ', '').replace('\n', ''),
        gen_config,
    ).text
    print("总结结果:", summary_text)

    # Rewrite the output file with the summary placed above the translation.
    content = summary_text + '\n\n' + content
    with open(translate_filename, "w", encoding='utf-8') as file:
        file.write(content)