-
Notifications
You must be signed in to change notification settings - Fork 1
/
findModifiable.py
78 lines (60 loc) · 1.66 KB
/
findModifiable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import emoji
import re
from emoji import unicode_codes
from collections import Counter
import pandas
import sys
# Load the hand-coded emoji table. Each row supplies an emoji ('Emoji'), its
# 'final' label and its 'final_categories' value.
codings = pandas.read_csv("final_codings.csv", encoding='utf-8')

# e_to_f: emoji -> 'final' label; e_to_c: emoji -> 'final_categories' value.
e_to_f = {}
e_to_c = {}
for _, row in codings.iterrows():
    key = row['Emoji'].strip()  # the emoji cell may carry stray whitespace
    e_to_f[key], e_to_c[key] = row['final'], row['final_categories']

# Build one alternation over every coded emoji. Longest strings go first so
# that a longer sequence is tried before any shorter emoji that is a prefix
# of it (regex alternation takes the first alternative that matches).
toPatternize = list(e_to_f)
emojis = sorted(toPatternize, key=len, reverse=True)
pattern3 = u'(' + u'|'.join(map(re.escape, emojis)) + u')'
codingRE = re.compile(pattern3)
def func(message):
    """Classify ``message`` as u'H', u'U' or u'C' from its coded emoji.

    Every emoji matched by ``codingRE`` is tallied under its ``e_to_f``
    label, then the H and U tallies are compared.

    Returns:
        u'H' when H labels outnumber U labels, u'U' when U labels outnumber
        H labels, and u'C' otherwise (including when nothing matched).
    """
    tally = Counter(e_to_f[found] for found in codingRE.findall(message))
    balance = tally[u'H'] - tally[u'U']
    if balance == 0:
        return u'C'
    return u'H' if balance > 0 else u'U'
# Second matcher, built from the values of the emoji package's alias table
# (presumably the emoji strings themselves) rather than from the coded table.
# NOTE(review): EMOJI_ALIAS_UNICODE is looked up on emoji.unicode_codes as-is;
# confirm the installed emoji release still exposes that name.
emojiss = unicode_codes.EMOJI_ALIAS_UNICODE
emojis = list(emojiss.values())
# Longest first, so longer sequences are tried before their shorter prefixes.
emojis.sort(key=len, reverse=True)
pattern3 = u'(' + u'|'.join(map(re.escape, emojis)) + u')'
ree = re.compile(pattern3)
def func2(message):
    """Return the dominant coded category among the emoji in ``message``.

    Every emoji matched by ``ree`` casts one vote for its category in
    ``e_to_c``; an emoji with no entry there votes for u'CONTEXT NEEDED'.

    Returns:
        The category with the most votes, or u'CONTEXT NEEDED' when the
        message contains no emoji or the two leading categories are tied.
    """
    ambiguous = u'CONTEXT NEEDED'
    votes = Counter(e_to_c.get(found, ambiguous)
                    for found in ree.findall(message))

    # Only the two front-runners matter for deciding a winner or a tie.
    ranked = votes.most_common(2)
    if not ranked:
        # No emoji matched, so there is no evidence for any category.
        # Indexing the empty ranking would raise IndexError.
        return ambiguous
    if len(ranked) > 1 and ranked[0][1] == ranked[1][1]:
        # A tie for first place is ambiguous however many categories there
        # are; requiring more than two would let a two-way tie slip through
        # and return an arbitrary one of the tied categories.
        return ambiguous
    return ranked[0][0]
# Script driver: read one CSV of messages and print, for each classifier,
# how many messages received each label.
# Usage: pass the CSV's file name (inside the data directory below) as the
# first command-line argument.
if len(sys.argv) < 2:
    # Exit with a readable message instead of a bare IndexError on argv[1].
    sys.exit("usage: %s <csv-file-name>" % sys.argv[0])

path = "/data/06333/aroraish/flat/"
path += sys.argv[1]
data = pandas.read_csv(path, encoding='utf-8')

# Single-argument print(...) gives the same output under the Python 2 print
# statement and the Python 3 print function, so the script parses on both.
print(len(data[u'message']))
mg = data[u'message'].apply(func).value_counts()
print(mg)
mg = data[u'message'].apply(func2).value_counts()
print(mg)