-
Notifications
You must be signed in to change notification settings - Fork 2
/
telegram_export_chat_parser.py
58 lines (53 loc) · 2.07 KB
/
telegram_export_chat_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import fnmatch
from bs4 import BeautifulSoup
import lxml
from tqdm import tqdm
import json
# Directory that is scanned for exported chat pages (*.html files).
chatdir = 'dump'
# Ordered (needle, replacement) pairs used by myclean(). They are applied
# one after another, so the order of the entries is significant.
_CLEAN_REPLACEMENTS = (
    (')', ' '),
    ('?', '? '),
    ('😂', ''),
    ('😅', ''),
    ('🙈', ''),
    ('🤗', ''),
    ('🤣', ''),
    ('@', ''),
    ('😬', ''),
    ('❤', ''),
    ('хД', ''),
    ('ДДД', ''),
    ('😁', ''),
    ('😹', ''),
    ('👋', ''),
    ('🥺', ''),
    ('☀', ''),
    ('🍀', ''),
)


def myclean(s: str) -> str:
    """Remove chat noise from a message and normalise its spacing.

    Every needle in ``_CLEAN_REPLACEMENTS`` is substituted in order:
    ``)`` becomes a space, ``?`` gets a trailing space, and the listed
    emoji, ``@`` and laughter tokens are deleted. Runs of spaces are then
    squeezed to a single space and surrounding whitespace is stripped.

    Args:
        s: Raw message text.

    Returns:
        The cleaned text; an empty string if nothing but noise was present.
    """
    for needle, replacement in _CLEAN_REPLACEMENTS:
        s = s.replace(needle, replacement)
    # The substitutions above leave gaps behind (a deleted emoji between two
    # words, the padding added after '?'), so squeeze runs of spaces. Only
    # U+0020 is split on, which keeps newlines and tabs inside the message.
    s = ' '.join(chunk for chunk in s.split(' ') if chunk)
    return s.strip()
def _message_text(node):
    """Return the stripped text of the first ``div.text`` inside *node*, or '' if absent."""
    if node is None:
        return ''
    body = node.find("div", {"class": "text"})
    return body.text.strip() if body is not None else ''


def _reply_pair(message, divs_by_id):
    """Return ``(replied_to_text, reply_text)`` for *message*, or None if it is not a usable reply.

    *divs_by_id* maps an element id to the first ``div`` carrying that id in
    the same document. A reply whose link is missing, or whose target is not
    in this document, yields None or an empty replied-to text respectively.
    """
    reply = message.find("div", {"class": "reply_to details"})
    if reply is None:
        return None
    link = reply.a
    href = link.get('href') if link is not None else None
    if not href:
        # No anchor / no href: there is nothing to resolve the reply against.
        return None
    target = divs_by_id.get(href.replace('#go_to_', ''))
    return _message_text(target), _message_text(message)


# Walk every exported HTML page in `chatdir`, pair each reply with the message
# it answers, and append the pairs to output.jsonl as one JSON object per line.
with open('output.jsonl', 'a', encoding='utf-8') as dataset:
    for file in os.listdir(chatdir):
        if not fnmatch.fnmatch(file, '*.html'):
            continue
        with open(os.path.join(chatdir, file), 'r', encoding='UTF-8') as f:
            data = f.read().strip()
        soup = BeautifulSoup(data, 'html.parser')

        # Index divs by id once per file so resolving a reply target is a dict
        # lookup instead of a full-document search for every message.
        # setdefault keeps the first div for a duplicated id, like soup.find.
        divs_by_id = {}
        for div in soup.find_all('div', id=True):
            divs_by_id.setdefault(div['id'], div)

        messages = soup.find_all('div', class_='message')
        for message in tqdm(messages, desc=f'Обработка файла: {file}'):
            pair = _reply_pair(message, divs_by_id)
            if pair is None:
                continue
            old_message, new_message = pair
            # Very short messages are skipped (length measured on raw text).
            if len(new_message) > 7 and len(old_message) > 7:
                old_message = myclean(old_message)
                new_message = myclean(new_message)
                # Cleaning can erase a message entirely (e.g. emoji only);
                # an empty question or answer is useless in the dataset.
                if old_message and new_message:
                    dataset.write(json.dumps({'question': old_message, 'answer': new_message}) + '\n')