-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
135 lines (103 loc) · 4.21 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# -*- coding: utf-8 -*-
"""preprocessing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/19wrGvNf6svZFeyQ1M4IxUB2PbA8eu4tA
"""
import re
import os
import pandas as pd
def katalk_msg_parse(file_path, encoding="utf-8-sig"):
    """Parse a KakaoTalk chat export (smartphone format) into a DataFrame.

    Each message line has the form ``<date> 오전/오후 H:MM, <user> : <text>``.
    Lines that do not start a new message are appended (newline-separated) to
    the text of the most recent message. Date-separator lines, join/leave
    notices and blank lines are skipped.

    Args:
        file_path: Path of the exported chat text file.
        encoding: Text encoding used to read the file. The default,
            ``utf-8-sig``, reads UTF-8 with or without a byte-order mark.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date_time``, ``user_name``
        and ``text`` (one row per message). The columns are present even when
        no message was found.
    """
    # Message line of the smartphone export.
    katalk_msg_pattern = re.compile(
        r"[0-9]{4}[년.] [0-9]{1,2}[월.] [0-9]{1,2}[일.] 오\S [0-9]{1,2}:[0-9]{1,2},.*:"
    )
    # Message line of the PC export (not used), kept for reference:
    # r"\[(.*?)\] \[(오\S) ([0-9]{1,2}:[0-9]{1,2})\] (.*)"
    # Date-separator line, e.g. "<year>년 <month>월 <day>일 <weekday>요일".
    date_info = re.compile(r"[0-9]{4}년 [0-9]{1,2}월 [0-9]{1,2}일 \S요일")
    # Timestamp followed directly by ':' instead of ', <user> :'.
    in_out_info = re.compile(
        r"[0-9]{4}[년.] [0-9]{1,2}[월.] [0-9]{1,2}[일.] 오\S [0-9]{1,2}:[0-9]{1,2}:.*"
    )

    my_katalk_data = []
    with open(file_path, encoding=encoding) as chat_file:
        for line in chat_file:
            if line == '\n' or date_info.match(line) or in_out_info.match(line):
                continue

            if katalk_msg_pattern.match(line):
                # Split on the FIRST comma only, so commas inside the message
                # text are preserved.
                date_time, _, remainder = line.partition(",")
                user_name, separator, text = remainder.partition(" : ")
                if separator:
                    my_katalk_data.append({
                        'date_time': date_time,
                        'user_name': user_name.strip(),
                        'text': text.strip(),
                    })
                    continue
                # No " : " separator: not a well-formed message header, so
                # fall through and treat the line as a continuation.

            if my_katalk_data:
                my_katalk_data[-1]['text'] += "\n" + line.strip()

    return pd.DataFrame(my_katalk_data, columns=['date_time', 'user_name', 'text'])
# Put the path of the directory where the chat file is stored here.
f_path = '/content/drive/MyDrive/kakaotalk_data/'
# Put the name of the chat file here.
f_name = 'KakaoTalkChats.txt'
# NOTE(review): in the visible code, f_path / f_name are referenced only by the
# disabled call on the next line; the active call passes a hard-coded path.
#df = katalk_msg_parse(os.path.join(f_path, f_name))
df = katalk_msg_parse('/content/drive/MyDrive/kakaotalk_data/정말 그냥 우당탕탕 12 카카오톡 대화.txt')
# Bare expression: presumably left over from a notebook cell that displayed the frame.
df
import re
# Keep only the first line of every message: each "\n<rest of that line>"
# segment is removed.
continuation_lines = re.compile(r'\n.*')
df['text'] = df['text'].apply(lambda message: continuation_lines.sub('', message))
# Drop rows whose text starts with '파일:' (presumably file-attachment notices).
is_file_message = df['text'].str.startswith('파일:')
df = df.loc[~is_file_message]
df
!pip install emoji
import re
import emoji
def extract_emojis(text):
    """Return the ``:name:`` shortcode of every emoji that occurs in *text*.

    The previous implementation looped over the individual characters of the
    demojized string, so it could only ever collect literal ``":"``
    characters. Emoji are now located with ``emoji.emoji_list`` and each one
    is converted to its shortcode with ``emoji.demojize``.

    Args:
        text: The message text to scan.

    Returns:
        A list of shortcode strings in order of appearance; empty when *text*
        contains no emoji.
    """
    return [emoji.demojize(found['emoji']) for found in emoji.emoji_list(text)]
# Regex fragments that remove_nonverbal() strips from a message, applied in the
# order of nonverbal_list. They are raw strings so that regex escapes such as
# \^, \. and \( are not read as (invalid) Python string escapes.
mimetic = r"[ㅋㅎㅠㅜ!?~]+"            # runs of ㅋ/ㅎ/ㅠ/ㅜ and of ! ? ~
punctuations = r"[,.]{2,}"             # two or more consecutive commas/periods
emo_type1_facial1 = r"[;:]{1}[\^\'-]?[)(DPpboOX]"                   # e.g. :) ;-D :'(
emo_type1_facial2 = r"[>ㅜㅠㅡ@\^][ㅁㅇ0oO\._\-]*[\^ㅜㅠㅡ@<];*"    # e.g. ^_^ >_< @_@
emo_type3 = r"\(.+?\)"                 # any parenthesised span, shortest match
nonverbal_list = [mimetic, punctuations, emo_type1_facial1, emo_type1_facial2, emo_type3]


def remove_nonverbal(text):
    """Strip every pattern in ``nonverbal_list`` from *text*.

    The patterns are applied one after another, in list order, each to the
    result of the previous substitution.

    Args:
        text: The message text to clean.

    Returns:
        The text with all matches removed and surrounding whitespace stripped.
    """
    for pattern in nonverbal_list:
        text = re.sub(pattern, '', text)
    return text.strip()
# Take an explicit copy before adding columns: `df` was produced by boolean
# filtering, and assigning new columns to such a slice triggers pandas'
# SettingWithCopyWarning.
df = df.copy()
# 'nonverbal' = the text after remove_nonverbal(), directly followed by the
# space-joined result of extract_emojis() for the same text.
emoji_suffix = df['text'].apply(extract_emojis).apply(' '.join)
df['nonverbal'] = df['text'].apply(remove_nonverbal) + emoji_suffix
df['nonverbal_count'] = df['nonverbal'].apply(len)
# Keep only the rows whose 'nonverbal' string is non-empty.
df = df[df['nonverbal_count'] > 0]
df
#import re
#df['nonverbal'] =df['nonverbal'].apply(lambda x: re.sub(r'\n.*', '', x))
#df = df[~df['nonverbal'].str.startswith('파일:')]
#df
# Matches a value that consists solely of one of these placeholders:
# '동영상' (video), '사진' (photo), '이모티콘' (emoticon) or '사진 N장' (N photos,
# N being one or two digits).
audio_visual_text = '^동영상$|^사진$|^이모티콘$|^사진 [0-9]{1,2}장$'
# Take an explicit copy so the column assignments below never write into a
# filtered slice of an earlier frame (avoids pandas' SettingWithCopyWarning).
df = df.copy()
mask = df['nonverbal'].str.contains(audio_visual_text)
# Flag placeholder rows with 1 and every other row with 0.
df.loc[mask, 'audio_visual'] = 1
df.loc[~mask, 'audio_visual'] = 0
# Only placeholder rows get a value (0) here; other rows stay unset (NaN).
df.loc[mask, 'msg_len'] = 0
df.loc[mask, 'msg_word_count'] = 0
df[df['audio_visual']==1]
# Continue with the rows that are not media placeholders.
filtered_df = df[df['audio_visual'] != 1]
filtered_df
import re
# Matches any token that starts with "http" or "www".
url_pattern = r'http\S+|www\S+'
# Drop every row whose original text contains such a token.
has_url = filtered_df['text'].str.contains(url_pattern)
filtered_df = filtered_df.loc[~has_url]
filtered_df
#import re
#filtered_df['nonverbal'] =filtered_df['nonverbal'].apply(lambda x: re.sub(r'\n.*', '', x))
#filtered_df = filtered_df[~filtered_df['nonverbal'].str.startswith('파일:')]
#filtered_df
# Helper columns that are not written to the output; the cleaned 'nonverbal'
# column takes over the name 'text'.
remove_columns = ['text','nonverbal_count', 'audio_visual','msg_len', 'msg_word_count']
final_df = (
    filtered_df
    .drop(columns=remove_columns)
    .rename(columns={'nonverbal': 'text'})
)
final_df
# Tab-separated export without the index column.
final_df.to_csv('output17.txt', sep='\t', index=False)
#remove_columns2 = ['date_time', 'user_name']
#final_data2 = final_df.drop(remove_columns2, axis=1)
#final_data2 = final_data2.rename(columns={'text': 'document'})
#final_data2['label'] = '0'
#final_data2
#final_data2.to_excel(os.path.join(f_path, "이채연.xlsx"), index=False)