-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathYelpCrawler.py
153 lines (114 loc) · 4.07 KB
/
YelpCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
__author__ = 'marc'
# -*- coding: utf-8 -*-
"""
Yelp API v2.0 code sample.
This program demonstrates the capability of the Yelp API version 2.0
by using the Search API to query for businesses by a search term and location,
and the Business API to query additional information about the top result
from the search query.
Please refer to http://www.yelp.com/developers/documentation for the API documentation.
This program requires the Python oauth2 library, which you can install via:
`pip install -r requirements.txt`.
Sample usage of the program:
`python sample.py --term="bars" --location="San Francisco, CA"`
"""
import json
import urllib
import urllib2
import oauth2
from pymongo import MongoClient
API_HOST = 'api.yelp.com'
DEFAULT_TERM = 'dinner'
DEFAULT_LOCATION = 'San Francisco, CA'
SEARCH_LIMIT = 3
SEARCH_PATH = '/v2/search/'
BUSINESS_PATH = '/v2/business/'
# OAuth credential placeholders that must be filled in by users.
lines = [line.strip() for line in open('config/yelp.cfg')]
CONSUMER_KEY = lines[0]
CONSUMER_SECRET = lines[1]
TOKEN = lines[2]
TOKEN_SECRET = lines[3]
def request(host, path, url_params=None):
"""Prepares OAuth authentication and sends the request to the API.
Args:
host (str): The domain host of the API.
path (str): The path of the API after the domain.
url_params (dict): An optional set of query parameters in the request.
Returns:
dict: The JSON response from the request.
Raises:
urllib2.HTTPError: An error occurs from the HTTP request.
"""
url_params = url_params or {}
url = 'http://{0}{1}?'.format(host, urllib.quote(path.encode('utf8')))
consumer = oauth2.Consumer(CONSUMER_KEY, CONSUMER_SECRET)
oauth_request = oauth2.Request(method="GET", url=url, parameters=url_params)
oauth_request.update(
{
'oauth_nonce': oauth2.generate_nonce(),
'oauth_timestamp': oauth2.generate_timestamp(),
'oauth_token': TOKEN,
'oauth_consumer_key': CONSUMER_KEY
}
)
token = oauth2.Token(TOKEN, TOKEN_SECRET)
oauth_request.sign_request(oauth2.SignatureMethod_HMAC_SHA1(), consumer, token)
signed_url = oauth_request.to_url()
conn = urllib2.urlopen(signed_url, None)
try:
response = json.loads(conn.read())
finally:
conn.close()
return response
def search(term, location):
"""Query the Search API by a search term and location.
Args:
term (str): The search term passed to the API.
location (str): The search location passed to the API.
Returns:
dict: The JSON response from the request.
"""
url_params = {
'term': term.replace(' ', '+'),
'location': location.replace(' ', '+'),
'limit': SEARCH_LIMIT
}
return request(API_HOST, SEARCH_PATH, url_params=url_params)
def get_business(business_id):
"""Query the Business API by a business ID.
Args:
business_id (str): The ID of the business to query.
Returns:
dict: The JSON response from the request.
"""
business_path = BUSINESS_PATH + business_id
return request(API_HOST, business_path)
def query_api(term, location):
"""Queries the API by the input values from the user.
Args:
term (str): The search term to query.
location (str): The location of the business to query.
"""
response = search(term, location)
businesses = response.get('businesses')
if not businesses:
return
business_id = businesses[0]['id']
return get_business(business_id)
def main():
client = MongoClient('localhost', 27017)
db = client.museums
collection = db.museum_locations
cursor = collection.find()
yelp = []
for document in cursor:
try:
print "processing: %s" % document['id']
yelp.append({'id': document['id'], 'yelp': query_api(document["location"]["address_components"][0]["short_name"], "Nederland")})
except:
continue
with open('data/yelp.json', 'w') as outfile:
json.dump(yelp, outfile)
if __name__ == '__main__':
main()