-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_imdb.py
162 lines (113 loc) · 4.94 KB
/
get_imdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import argparse
import imdb
import json
from argparse import RawTextHelpFormatter
from imdb import IMDbDataAccessError
# ---
# Command-line interface: one positional input file, an optional choice of
# output destination, and two switches controlling the 'imdb_failed' marker.
# The description is shown verbatim (RawTextHelpFormatter keeps its newlines).
_DESCRIPTION = '''
Provide me with the movies JSON and I will try to get the IMDb data of each movie based on its 'movie_title' field. I expect this JSON to have been generated by the 'process_ml100k.py' script.\n
Movies for which I couldn't get the IMDb data properly will have a field named 'imdb_failed', whose value is the exception thrown when reading the data, though this can be disabled.\n
After every sucessful retrieval or failure, I update the output file with the current state of things. Should the program stop mid-execution for one reason or another (e.g. a crash, an interruption, etc.), simply provide the ouput file as the input and I'll be able to continue from where I left. The 'imdb_failed' field will help me skip the ones I have failed before.
'''

parser = argparse.ArgumentParser(
    description=_DESCRIPTION,
    formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
    'input',
    help="JSON input file with the movies",
)

# An explicit output path and "write back to the input" cannot be combined.
parser_group = parser.add_mutually_exclusive_group(required=False)
parser_group.add_argument(
    '-o', '--output',
    help="Place the output into a file. Defaults to \"<input>.out\"",
)
parser_group.add_argument(
    '-o2i', '--output-to-input',
    action="store_true",
    help="Set it to make the input file also the output file",
)

parser.add_argument(
    '-r', '--restore',
    action="store_true",
    help="Set it to remove all 'imdb_failed' fields from movies in the input file before execution (the input file will be altered)",
)
parser.add_argument(
    '--disable-imdb-failed',
    action="store_true",
    help="Set it to disable the creation of an 'imdb_failed' field on movies that failed",
)

args = parser.parse_args()
# ---
# Resolve the effective settings from the parsed command line.
input_filename = args.input

# Output precedence: overwrite the input when requested, otherwise the path
# given with -o/--output, otherwise "<input>.out".
if args.output_to_input:
    output_filename = args.input
elif args.output:
    output_filename = args.output
else:
    output_filename = input_filename + ".out"

# Failed movies get an 'imdb_failed' marker unless that was switched off.
use_error_field = not args.disable_imdb_failed
# ---
# Optional restore step: strip every 'imdb_failed' marker from the input file
# (rewriting it in place) so those movies are attempted again. When it runs,
# the loaded list stays in `data` so the file need not be read a second time.
data = None
if args.restore:
    print("Restoring movies with 'imdb_failed'...")
    with open(input_filename) as source_file:
        data = json.load(source_file)

    # Collect the flagged entries first, then drop the marker from each.
    flagged_movies = [entry for entry in data if 'imdb_failed' in entry]
    for entry in flagged_movies:
        del entry['imdb_failed']
    amount_restored = len(flagged_movies)

    with open(input_filename, 'w') as target_file:
        json.dump(data, target_file)
    print("{} movies restored!".format(amount_restored))
    print("---")
# ---
# Echo the effective configuration before any lookup starts.
settings_report = (
    "Input filename : \"{}\"".format(input_filename),
    "Final output filename : \"{}\"".format(output_filename),
    "Using the 'imdb_failed' field? {}".format("Yes" if use_error_field else "No"),
    "---",
)
for report_line in settings_report:
    print(report_line)
def _percent(part, whole):
    """Format ``part`` as a percentage of ``whole`` with two decimals.

    Returns '0.00' when ``whole`` is zero, so an empty movie list does not
    raise ZeroDivisionError in the progress and summary messages.
    """
    if not whole:
        return '0.00'
    return '{0:.2f}'.format((part * 100) / whole)


def _fetch_imdb_fields(ia, title):
    """Search for ``title`` and return the fields stored on each movie.

    Only the first search hit is used. Returns ``None`` when the search
    yields nothing; otherwise returns ``(director, cast, plot)`` where
    ``director`` is the name of the first listed director (``None`` if the
    list is empty), ``cast`` is the list of every cast member's name, and
    ``plot`` is the first plot entry (``None`` if the list is empty).

    Exceptions are not handled here: a KeyError raised while reading the
    'director', 'cast' or 'plot' fields, and any error raised by ``ia``
    itself, propagate to the caller.
    """
    found_movies = ia.search_movie(title)
    if not found_movies:
        return None
    imdb_movie = ia.get_movie(found_movies[0].movieID)

    # Read the fields in this fixed order so a KeyError always reports the
    # first one that is absent.
    first_director = next(iter(imdb_movie['director']), None)
    director_info = None if first_director is None else first_director['name']
    cast_info = [person['name'] for person in imdb_movie['cast']]
    plot_info = next(iter(imdb_movie['plot']), None)
    return director_info, cast_info, plot_info


def _save_progress(movies, path):
    """Write ``movies`` to ``path`` as JSON without truncating it first.

    The data goes to a sibling temporary file which then replaces ``path``,
    so an interruption in the middle of a save leaves the previous contents
    intact and the run can still be resumed from that file.
    """
    import os  # needed only by this helper

    partial_path = path + ".partial"
    with open(partial_path, 'w') as save_file:
        json.dump(movies, save_file)
    os.replace(partial_path, path)


# Main pass: look up every movie that has neither IMDb data nor a recorded
# failure, storing the result (or the failure reason) on the movie itself.
ia = imdb.IMDb()

# Reuse the list already loaded by the restore step when there is one.
if data is None:
    with open(input_filename) as input_file:
        data = json.load(input_file)

total_movies = len(data)
movies_read = 0
movies_failed = 0
output_written = False  # becomes True after the first save of this run

for movie in data:
    fail_message = None
    attempted = False
    movies_read += 1
    title = movie["movie_title"]

    if 'imdb_failed' in movie:
        print("Skipping movie (already has failed before): \"{}\"".format(title))
        movies_failed += 1
    elif 'movie_plot' in movie:
        print("Skipping movie (already has IMDb data): \"{}\"".format(title))
    else:
        attempted = True
        print("Retrieving IMDb data for \"{}\"...".format(title))
        try:
            imdb_fields = _fetch_imdb_fields(ia, title)
        except IMDbDataAccessError as e:
            # Without connectivity every remaining lookup would fail too, so
            # stop here; the non-zero status lets callers detect the abort.
            print("Faulty internet is inadmissible! I'm stopping right here.")
            print(repr(e))
            raise SystemExit(1)
        except KeyError as e:
            fail_message = "The IMDb data was missing a field. " + repr(e)
        else:
            if imdb_fields is None:
                fail_message = "Couldn't find the movie from its title."
            else:
                director_info, cast_info, plot_info = imdb_fields
                movie["movie_director"] = director_info
                movie["movie_cast"] = cast_info
                movie["movie_plot"] = plot_info
                print(" Success!")

    if fail_message:
        movies_failed += 1
        print(" Failure!", fail_message)
        print(" Failures so far: {} ({}% of seen movies)".format(
            movies_failed, _percent(movies_failed, movies_read)))
        if use_error_field:
            movie["imdb_failed"] = fail_message

    # Persist after every lookup attempt, and once up front so the output
    # file exists even when the leading movies are all skipped. Skipped
    # movies change nothing, so rewriting the file for them is pointless.
    if attempted or not output_written:
        print(" Saving progress...")
        _save_progress(data, output_filename)
        output_written = True

    movies_left = total_movies - movies_read
    print(" {} movies read, {} ({}%) left.".format(
        movies_read, movies_left, _percent(movies_left, total_movies)))

print("---")
print("All {} movies analysed.".format(total_movies))
print("{} ({}%) failed in retrieving the IMDb data.".format(
    movies_failed, _percent(movies_failed, total_movies)))