-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_data.py
94 lines (81 loc) · 3.23 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os, json, pandas as pd, numpy as np
from tqdm import tqdm
def load_subtitle_paths(path='data/imdb.subtitles.paths'):
    """Read the movie-to-subtitle-file index into a DataFrame.

    The file at ``path`` is parsed as header-less, tab-separated text and
    its two columns are labelled ``movie_id`` and ``subtitle_path``.
    """
    column_names = ['movie_id', 'subtitle_path']
    index_frame = pd.read_table(
        path,
        sep='\t',
        header=None,
        names=column_names,
    )
    return index_frame
def load_subtitles(path='clean_subtitles.txt'):
    """Load cleaned subtitle scripts into a DataFrame.

    Each non-empty line of the file is expected to look like
    ``<movie_id>\\t\\t<script>``; the first two double-tab separated fields
    become the ``movie_id`` and ``script`` columns.

    Parameters
    ----------
    path : str
        Location of the cleaned subtitles file.

    Returns
    -------
    pandas.DataFrame
        One row per subtitle line, columns ``movie_id`` and ``script``.

    Raises
    ------
    IndexError
        If a non-empty line does not contain the double-tab separator.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Loading subtitles...")
    # Close the file deterministically instead of leaking the handle.
    with open(path) as subtitle_file:
        # Skip empty lines rather than blindly dropping the last element:
        # this keeps the final record when the file has no trailing newline
        # and tolerates stray blank lines.
        raw_lines = [line for line in subtitle_file.read().split('\n') if line]
    records = []
    for line in tqdm(raw_lines):
        fields = line.split('\t\t')  # split once, reuse both fields
        records.append({'movie_id': fields[0], 'script': fields[1]})
    # Explicit columns keep the schema stable even when the file is empty.
    return pd.DataFrame(records, columns=['movie_id', 'script'])
def load_audio_info(path):
    """Load one audio-info JSON file and tag it with its soundtrack id.

    Parameters
    ----------
    path : str
        Path to a JSON file whose base name starts with the soundtrack id
        followed by ``.json``.

    Returns
    -------
    The decoded JSON object (it is assigned to by key, so it must be a
    JSON object / dict) with an extra ``'soundtrack_id'`` entry holding the
    part of the file name that precedes ``.json``.
    """
    # Use a context manager so the file handle is closed promptly.
    with open(path) as info_file:
        info = json.load(info_file)
    # Take the file name first, then cut at '.json'. Cutting the whole path
    # first picked the wrong component whenever a directory name itself
    # contained '.json'.
    file_name = os.path.basename(path)
    info['soundtrack_id'] = file_name.split('.json')[0]
    return info
# root_audio_path = '/data/corpora/soundtracks/audio_info/'
def load_audio_features(root_audio_path='/data/corpora/soundtracks/audio_info/'):
    """Load every audio-info file in a directory into one DataFrame.

    Parameters
    ----------
    root_audio_path : str
        Directory whose entries are all passed to ``load_audio_info``.
        The default is the corpus location recorded in this module's
        comments; it lets callers that pass no argument keep working.

    Returns
    -------
    pandas.DataFrame
        One row per file, built from the dicts returned by
        ``load_audio_info`` (so it includes a ``soundtrack_id`` column).
    """
    # os.path.join works with or without a trailing separator on the root,
    # unlike plain string concatenation. Sorting makes the row order
    # independent of the arbitrary order returned by os.listdir.
    all_audio_paths = [
        os.path.join(root_audio_path, file_name)
        for file_name in sorted(os.listdir(root_audio_path))
    ]
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Loading audio features...")
    infos = [load_audio_info(p) for p in tqdm(all_audio_paths)]
    return pd.DataFrame(infos)
def load_soundtracks(path='data/imdb.soundtracks.txt'):
    """Parse the movie-to-soundtracks listing.

    Each non-empty line is split on tabs; the first field is the movie id
    and the second is kept verbatim as the soundtrack id string.

    Parameters
    ----------
    path : str
        Location of the tab-separated soundtracks file.

    Returns
    -------
    list of dict
        One ``{'movie_id': ..., 'soundtrack_ids': ...}`` dict per line, in
        file order.

    Raises
    ------
    IndexError
        If a non-empty line contains no tab.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(path) as soundtrack_file:
        # Filter out empty lines instead of always discarding the last
        # element, so a file without a trailing newline keeps its final
        # record and blank lines no longer crash the parser.
        rows = [
            line.split('\t')
            for line in soundtrack_file.read().split('\n')
            if line
        ]
    return [{'movie_id': row[0], 'soundtrack_ids': row[1]} for row in rows]
def load_flat_soundtracks():
    """Return one (movie_id, soundtrack_id) row per soundtrack.

    Takes the records from ``load_soundtracks()`` and splits each
    ``soundtrack_ids`` string on single spaces, emitting a row for every
    resulting piece.
    """
    pairs = [
        [entry['movie_id'], single_id]
        for entry in load_soundtracks()
        for single_id in entry['soundtrack_ids'].split(' ')
    ]
    return pd.DataFrame(pairs, columns=['movie_id', 'soundtrack_id'])
# load metadata from imdb dump of title.basics.tsv.gz
#path = '/data/corpora/imdb/title.basics.tsv'
def load_titles_basic_metadata(path = 'data/title.basics.tsv'):
    """Read the tab-separated title metadata table at ``path``.

    The first line of the file is used as the header; ``low_memory`` is
    switched off so the file is parsed in a single pass.
    """
    read_options = {'sep': '\t', 'low_memory': False}
    titles = pd.read_csv(path, **read_options)
    return titles
def add_metadata_to_data(data):
    """Join title metadata onto ``data`` and add one-hot genre columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``movie_id`` column; it is inner-joined against the
        ``tconst`` column of ``load_titles_basic_metadata()``.

    Returns
    -------
    pandas.DataFrame
        The joined rows plus one ``genre_<name>`` column per genre found in
        the comma-separated ``genres`` column (1 where the row has the
        genre, 0 otherwise). The ``index`` column created by
        ``reset_index`` is kept in the result, as before.
    """
    titles_metadata = load_titles_basic_metadata()
    merged = data.merge(
        titles_metadata, left_on='movie_id', right_on='tconst', how='inner'
    )
    # The previous version also built an unused list of all genres with
    # np.concatenate, which was wasted work and raised on an empty join.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Loading metadata...")
    # One dict per row, keyed by 'genre_<name>'; iterate the values directly
    # instead of looking each row up by position.
    genre_rows = [
        dict(('genre_' + genre, 1) for genre in genres.split(','))
        for genres in tqdm(merged['genres'].tolist())
    ]
    # Genres absent from a row come out as NaN; turn them into 0.
    genre_df = pd.DataFrame(genre_rows).fillna(0)
    # Both frames get an 'index' column holding the row position, which is
    # then used as the explicit join key.
    merged = merged.reset_index()
    genre_df = genre_df.reset_index()
    return merged.merge(genre_df, on='index')
def pad_id(m):
    """Normalise a movie id to the ``tt`` + zero-padded-number form.

    Parameters
    ----------
    m : str
        A movie id, with or without the ``tt`` prefix.

    Returns
    -------
    str
        ``m`` unchanged when it is already 9 characters long; otherwise
        ``'tt'`` followed by the text after the last ``'tt'`` in ``m``,
        left-padded with zeros to at least 7 characters. Longer numbers are
        kept intact rather than truncated.
    """
    if len(m) == 9:
        return m
    prefix = 'tt'
    suffix = m.split(prefix)[-1]
    # str.rjust never shortens, so ids with more than 7 digits pass through.
    # The previous numpy-based padding raised ValueError for them because
    # the pad length went negative.
    return prefix + suffix.rjust(7, '0')
def fix_movie_ids(data):
    """Rewrite ``data.movie_id`` with every id passed through ``pad_id``.

    The column is replaced on the frame that was passed in, and that same
    frame is returned.
    """
    data.movie_id = list(map(pad_id, data.movie_id))
    return data
#data dump from imdb title.ratings.tsv
#'/data/corpora/imdb/title.ratings.tsv'
def load_ratings(path='data/title.ratings.tsv'):
    """Read the tab-separated ratings table at ``path`` into a DataFrame.

    The first line of the file supplies the column names.
    """
    ratings = pd.read_csv(path, delimiter='\t')
    return ratings
#path = '/data/corpora/imdb/tm/topics.50.txt'
def load_titles_tm(path = 'data/topics.50.txt'):
    """Read the tab-separated topic table at ``path`` into a DataFrame.

    The first line of the file supplies the column names; ``low_memory``
    is switched off so the file is parsed in a single pass.
    """
    topics = pd.read_csv(path, delimiter='\t', low_memory=False)
    return topics
def load_data(root_audio_path='/data/corpora/soundtracks/audio_info/'):
    """Assemble the full soundtrack dataset.

    Steps, in order: flatten the movie/soundtrack listing, merge in the
    audio features, normalise movie ids with ``fix_movie_ids``, merge in
    ratings and topic-model rows on ``movie_id`` == ``tconst``, then add
    title metadata and genre columns. Subtitle paths are not merged in.

    Parameters
    ----------
    root_audio_path : str
        Directory handed to ``load_audio_features``. Previously that
        function was called with no argument even though its parameter was
        required, so this call failed with TypeError.

    Returns
    -------
    pandas.DataFrame
        The merged dataset returned by ``add_metadata_to_data``.
    """
    data = load_flat_soundtracks()
    # No key is given, so pandas joins on the columns the two frames share
    # (presumably 'soundtrack_id' -- confirm against the audio-info schema).
    data = data.merge(load_audio_features(root_audio_path))
    data = fix_movie_ids(data)
    # NOTE(review): each merge below keeps the right-hand 'tconst' key, so
    # the result likely carries redundant tconst columns -- confirm whether
    # they should be dropped.
    data = data.merge(load_ratings(), left_on='movie_id', right_on='tconst')
    data = data.merge(load_titles_tm(), left_on='movie_id', right_on='tconst')
    data = add_metadata_to_data(data)
    return data