-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalyze.py
127 lines (99 loc) · 3.45 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
''' Some basic stats '''
import os
import re
import json
import nltk
import string
from nltk.collocations import *
from collections import defaultdict
# Read every season into a dict that maps each season to a list of episodes (each episode is a list of lines)
def read_seasons(data_dir='./data', num_seasons=7):
    """Load every episode transcript, grouped by season.

    Expects one sub-directory per season, named ``s1`` .. ``s<num_seasons>``,
    under ``data_dir``; every file inside a season directory is read as one
    episode.

    Args:
        data_dir: Directory that holds the per-season folders. Defaults to
            the previously hard-coded ``./data``.
        num_seasons: How many seasons to read, starting at season 1.
            Defaults to the previously hard-coded 7.

    Returns:
        A dict mapping ``'season<N>'`` to a list of episodes, where each
        episode is the list of its lines that are not empty or
        whitespace-only.
    """
    seasons = {}
    for number in range(1, num_seasons + 1):
        season_dir = os.path.join(data_dir, 's' + str(number))
        episodes = []
        # os.listdir returns names in arbitrary order; sorting keeps the
        # episode indices reproducible from run to run.
        for filename in sorted(os.listdir(season_dir)):
            # The context manager closes the handle right away instead of
            # leaving it to the garbage collector.
            with open(os.path.join(season_dir, filename)) as handle:
                raw_lines = handle.read().splitlines()
            # Build a real list (filter() is lazy on Python 3) because
            # callers index into each episode.
            episodes.append([text for text in raw_lines if text.strip()])
        seasons['season' + str(number)] = episodes
    return seasons
# Regex identifying a character name
character = re.compile(r'^(\w+):')
# NOTE(review): \W+ matches a line made up solely of NON-word characters; an
# upper-case speaker line would presumably need something like [A-Z]+.
# Left unchanged because nothing visible in this file uses it -- confirm
# before relying on it.
character_up = re.compile(r'^(\W+)$')
# Matches one parenthesised stage direction. The body is [^)]* rather than a
# greedy .* so that two directions on the same line are matched separately
# and the dialogue between them survives. Nested parentheses are not handled.
parenthetical = re.compile(r'\([^)]*\)')
# Characters that remove_punctuation replaces with spaces.
punctuation = string.punctuation
# There are several ways of identifying who is speaking
# Currently only dealing with Buffy: <line> constructs
# TODO: fix character_up for season 7
# TODO: A few episodes contain lines like BUFFYhere is her line
def character_in_line(line):
    """Return the lower-cased speaker name that opens ``line``, if any.

    A speaker tag is a run of word characters immediately followed by a
    colon at the very start of the line (surrounding whitespace is ignored).
    Returns None when the line does not start with such a tag.
    """
    # The pattern is anchored with ^ and compiled without MULTILINE, so
    # match() finds exactly what search() would.
    found = character.match(line.strip())
    if found is None:
        return None
    return found.group(1).lower()
def remove_actions(line):
    """Return ``line`` with each parenthetical action replaced by a space.

    The actions are whatever the module-level ``parenthetical`` pattern
    finds in the incoming text; every occurrence of each found string is
    swapped for a single space.
    """
    cleaned = line
    # findall() is evaluated once, against the untouched input.
    for stage_direction in parenthetical.findall(cleaned):
        cleaned = cleaned.replace(stage_direction, ' ')
    return cleaned
def remove_punctuation(line):
    """Return ``line`` with every punctuation character turned into a space.

    "Punctuation" means any character contained in the module-level
    ``punctuation`` string; all other characters are kept as they are.
    """
    # One pass over the text: substitute a space wherever the character is
    # one of the punctuation marks.
    return ''.join(' ' if ch in punctuation else ch for ch in line)
# Omit parenthetical phrases since these aren't dialogue
# ex: Buffy: I'm gonna kill them all. (turns to her task)
def get_line(episode, i):
    """Return the cleaned dialogue that starts on speaker line ``episode[i]``.

    The dialogue is everything after the first colon of ``episode[i]`` plus
    any continuation lines that follow it. Continuation stops at the first
    line of length 0 or 1, at the next line that starts with a speaker tag,
    or at the end of the episode.

    Args:
        episode: List of the episode's lines.
        i: Index of a line that begins with a ``Name:`` speaker tag.

    Returns:
        The dialogue with parenthetical actions and punctuation removed and
        every run of whitespace collapsed to a single space.
    """
    # Split on the FIRST colon only, so a colon inside the dialogue
    # ("It's 5:30") no longer truncates it. partition() yields '' instead of
    # raising when the line has no colon at all.
    pieces = [episode[i].partition(':')[2]]
    cursor = i + 1
    # Bound the scan: a speaker line can be the last line of the episode, and
    # episodes with blank lines stripped out have no empty line to stop at.
    while cursor < len(episode):
        follower = episode[cursor]
        # Stop at a blank/one-character line or when the next speaker begins,
        # so one speaker's dialogue does not swallow the next speaker's.
        if len(follower) <= 1 or character_in_line(follower):
            break
        pieces.append(follower)
        cursor += 1
    # Join with spaces so words at a line break are not fused together; the
    # final split/join collapses any doubled whitespace.
    text = remove_punctuation(remove_actions(' '.join(pieces)))
    return ' '.join(text.split())
def get_lines_in_episode(episode_path):
    """Yield the cleaned dialogue for the speaker lines of one episode file.

    Args:
        episode_path: Path of the episode transcript to read.

    Yields:
        The result of ``get_line`` for every line that starts with a speaker
        tag and is directly followed by a non-empty line.
    """
    # Read everything up front so the file is closed before the first yield.
    with open(episode_path) as handle:
        episode = handle.read().splitlines()
    total = len(episode)
    for i, line in enumerate(episode):
        if not character_in_line(line):
            continue
        # A speaker tag on the very last line has no follower; checking the
        # bound first avoids an IndexError on episode[i + 1].
        # NOTE(review): this condition skips speaker lines that are followed
        # by an empty line -- confirm that is the intended transcript format.
        if i + 1 < total and len(episode[i + 1]) > 0:
            yield get_line(episode, i)
# Association-measure objects; get_collocations passes their `pmi` attribute
# to the finders' nbest() as the scoring function.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
def get_collocations(dialogue, min_freq=1, top_n=20):
    """Rank the bigram and trigram collocations found in ``dialogue`` by PMI.

    Args:
        dialogue: Sequence of word tokens in spoken order. It is consumed
            twice, so pass a list rather than a one-shot iterator.
        min_freq: Threshold handed to each finder's ``apply_freq_filter``.
            Defaults to 1, the previously hard-coded value. Per the NLTK
            documentation the filter drops n-grams seen fewer than
            ``min_freq`` times, so 1 keeps every n-gram; raise it to stop
            one-off word pairs from dominating the PMI ranking.
        top_n: How many n-grams to return from each finder. Defaults to 20,
            the previously hard-coded value.

    Returns:
        A ``(bigrams, trigrams)`` tuple holding the ``nbest`` results of the
        bigram and trigram finders, each scored with ``pmi``.
    """
    bigram_finder = BigramCollocationFinder.from_words(dialogue)
    trigram_finder = TrigramCollocationFinder.from_words(dialogue)
    for finder in (bigram_finder, trigram_finder):
        finder.apply_freq_filter(min_freq)
    top_bigrams = bigram_finder.nbest(bigram_measures.pmi, top_n)
    top_trigrams = trigram_finder.nbest(trigram_measures.pmi, top_n)
    return top_bigrams, top_trigrams
def get_collocations_for_season(season):
    """Print the top bigram and trigram collocations for one season.

    Reads every episode file under ``./data/s<season>/``, pools the words of
    all dialogue lines, and prints the two rankings returned by
    ``get_collocations`` (one tab-indented n-gram per output line).

    Args:
        season: Season number; used to build the directory name.
    """
    season_dir = './data/s' + str(season) + '/'
    words = []
    # os.listdir order is arbitrary; sort so the pooled word sequence (and
    # therefore n-grams that span episode boundaries) is reproducible.
    for filename in sorted(os.listdir(season_dir)):
        for line in get_lines_in_episode(season_dir + filename):
            # Collect tokens directly rather than growing one huge string
            # and splitting it afterwards.
            words.extend(line.split())
    bi_collocations, tri_collocations = get_collocations(words)

    # Every print below takes exactly one string argument, which produces the
    # same output under the Python 2 print statement and the Python 3 print
    # function.
    print('top 20 bigram collocations by point-wise mutual information:')
    for colloc in bi_collocations:
        print('\t' + ' '.join(colloc))
    print('')
    print('top 20 trigram collocations by point-wise mutual information:')
    for colloc in tri_collocations:
        print('\t' + ' '.join(colloc))
# Module-level call: runs whenever this file is executed or imported and
# prints the collocations for season 1.
get_collocations_for_season(1)
# Distribution of characters in each season, by # of lines spoken
def character_line_distribution():
    """Count speaker lines per character per season and save them as JSON.

    Loads all seasons with ``read_seasons()``. For every line that starts
    with a speaker tag it prints the season, the episode index, the
    character and the cleaned dialogue, followed by a blank line, and adds
    one to that character's count for the season. The counts are written to
    ``character_counts_by_season.json`` in the current working directory as
    ``{season: {character: line_count}}``.
    """
    characters = {}
    # items() iterates the same pairs as iteritems() and exists on both
    # Python 2 and Python 3.
    for season, episodes in read_seasons().items():
        counts = defaultdict(int)
        characters[season] = counts
        for episode_index, episode in enumerate(episodes):
            for line_index, line in enumerate(episode):
                speaker = character_in_line(line)
                if not speaker:
                    continue
                spoken = get_line(episode, line_index)
                # One pre-joined string prints identically under the
                # Python 2 print statement and the Python 3 print function.
                print(' '.join([season, str(episode_index), speaker, spoken]))
                print('')
                counts[speaker] += 1
    # Save to json; the context manager flushes and closes the file.
    with open('character_counts_by_season.json', 'w') as out:
        json.dump(characters, out)