-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
84 lines (78 loc) · 2.09 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import requests
import pysnooper
from bs4 import BeautifulSoup
from collections import OrderedDict
imdb_url = 'https://www.imdb.com'
item_class = 'lister-item-content'
# @pysnooper.snoop()
def search_list(soup, num_items=10, op=None):
"""Basic function to collect links to movies.
Note: the only distinction of the search here
is of the interface of IMDB with respect to
actor, we want to collect the movies from
`known for` section (at least for now)"""
items = {}
n = 0
if op=='actor':
s = soup.find_all('a', class_='knownfor-ellipsis')
else:
s = soup.find_all('a')
for m in s:
if len(items) == num_items:
break
try:
if '/title/tt' in m.get('href'):
name = m.text.strip()
link = imdb_url+m.get('href')
if len(name) == 0:
continue
items[name] = link
n += 1
except Exception as e:
pass
return items
# @pysnooper.snoop()
def scrape_by_genre(key_word):
"""Colect movie names and links based on genre"""
link = imdb_url+f'/search/title/?genres={key_word}'
r = requests.get(link)
soup = BeautifulSoup(r.text, 'lxml')
items = search_list(soup)
return items
# @pysnooper.snoop()
def scrape_by_cast(person):
"""Colect movie names and links based on actor"""
link = imdb_url+f'/find?q={person}'
r = requests.get(link)
soup = BeautifulSoup(r.text, 'lxml')
actors = soup.find_all('a')
actor_link = None
for a in actors:
try:
if '/name/nm' in a.get('href'):
actor_link = imdb_url+a.get('href')
break
except Exception as e:
pass
if actor_link is None:
return None
r = requests.get(actor_link)
soup = BeautifulSoup(r.text, 'lxml')
items = search_list(soup)
return items
# @pysnooper.snoop()
def get_listing_items(num_items=5):
"""Colect movie names and links for popular ones"""
text = get_link('chart/moviemeter')
soup = BeautifulSoup(text, 'lxml')
items = search_list(soup)
return OrderedDict(items)
# @pysnooper.snoop()
def get_link(extension=None):
"""Get response from the base imdb url,
appends extension if applicable"""
if extension is None:
r = requests.get(imdb_url)
else:
r = requests.get(imdb_url+'/'+extension)
return r.text