#!/usr/bin/python
# -*- coding: utf-8 -*-
#-------------------------------------------------------------------------------
# Name: module1
# Purpose:
#
# Author: lrvillan
#
# Created: 17/11/2012
# Copyright: (c) lrvillan 2012
# Licence: <your licence>
#-------------------------------------------------------------------------------
from google import search  # find at http://breakingcode.wordpress.com/2010/06/29/google-search-python/
import BeautifulSoup  # find at http://www.crummy.com/software/BeautifulSoup/
import urlparse
import urllib
import csv
import sys
NUM_URLS = 500
SEARCH_CRITERIA = r'(autos carros automoviles) venta jalisco -chocados -autoestereos site:.mx -renta -partes -seguro -foro -blog -"seccion amarilla"'
BASE_DOMAINS = ".com.mx .com .net .mx".split()

fields = {
    "imagen": "imagen".split(),
    "modelo": "modelo mod".split(),
    "año": "año".split(),
    "marca": "marca".split(),
    "precio": "precio".split(),
    "colores": "colores".split(),
    "descuento": "descuento promocion desc".split(),
    "bateria": "bateria pila".split(),
    "origen": "ciudad".split(),
    "kilometraje": "kilometraje km k.m.".split(),
    "descripcion": "descripcion".split(),
    "forma de pago": ["forma de pago", "metodo de pago", "credito", "contado"],
    "vendedor": "vendedor anunciante".split(),
    "telefono": "telefono tel celular cel".split(),  # TODO: add a regex for phone numbers (see the hedged sketch below)
    "contacto": "contacto".split(),  # TODO: look for an email address (see the hedged sketch below)
    "enlaces": "href".split()
}
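
# A hedged sketch for the two TODOs above: hypothetical helpers for spotting
# phone numbers and email addresses in page text. The patterns are assumptions
# (loose Mexican-style phone formats), and nothing calls these helpers yet.
import re

PHONE_RE = re.compile(r'\(?\d{2,3}\)?[\s.-]?\d{3,4}[\s.-]?\d{4}')
EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')

def find_contact_info(text):
    """Return (phones, emails) found in text; either list may be empty."""
    return PHONE_RE.findall(text), EMAIL_RE.findall(text)
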
TITLES = fields.keys()

def get_domain(url):
    """Reduce a URL to subdomain + base domain, e.g. "example.com.mx"."""
    host = urlparse.urlparse(url).hostname or ""
    domain = ""
    for base in BASE_DOMAINS:
        if base in host:
            domain = host[:host.index(base)]
            # if subdomains exist, keep only the label next to the base domain
            if "." in domain:
                domain = domain.split(".")[-1]
            domain = domain + base
            break
    return domain  # empty string when no known base domain matched
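# For example (hypothetical URL), get_domain("http://www.autos.example.com.mx/venta")
# would return "example.com.mx"; unknown TLDs fall through and return "".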

def not_duplicate(urls):
    """Keep only the first URL seen for each domain."""
    current_domains = []
    filtered_urls = []
    for url in urls:
        domain = get_domain(url)
        if domain not in current_domains:
            current_domains.append(domain)
            filtered_urls.append(url)
    return filtered_urls
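# For example (hypothetical URLs), ["http://a.example.com.mx/1",
# "http://example.com.mx/2"] keeps only the first entry, since both
# URLs reduce to the same domain "example.com.mx".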

def update_progress(progress):
    # progress is a percentage (0-100); '\r' redraws the same console line,
    # so write without a trailing newline instead of using print
    sys.stdout.write('\r[{0}] {1}%'.format('#' * (progress // 10), progress))
    sys.stdout.flush()
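# e.g. update_progress(40) redraws the current line as "[####] 40%"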

def gather_data(urls):
    """Download each URL and record which fields' keywords appear in the page."""
    gathered_data = []
    total = len(urls)
    for number, url in enumerate(urls):
        data = urllib.urlopen(url).read()
        #data = bs4.BeautifulSoup(data).body.get_text(strip=True).lower()
        data = data.lower()
        # TODO: the search below would improve if we converted unicode to ascii,
        # e.g. mapping chars like "ú" to "u" just for the search. Not for now.
        found_fields = []
        for field, words in fields.iteritems():
            for word in words:
                if word in data:
                    found_fields.append(field)
                    break  # one matching keyword is enough for this field
        gathered_data.append([url, found_fields])
        update_progress((number + 1) * 100 // total)
    sys.stdout.write('\n')  # finish the progress line
    return gathered_data

def format_row(row_data):
    pass  # unused placeholder; rows are built inline in send_to_csv

def send_to_csv(data):
    """Write one CSV row per URL, with YES/NO for each field found."""
    titles = ["url"] + TITLES  # copy, so the module-level TITLES is not mutated
    with open('cars_db.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, dialect='excel')
        csvwriter.writerow(titles)
        for url, found_fields in data:
            csv_row = [url]
            for title in titles[1:]:
                csv_row.append('YES' if title in found_fields else 'NO')
            csvwriter.writerow(csv_row)
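# Illustrative output (hypothetical URL; column order follows fields.keys()):
#   url,imagen,modelo,año,...
#   http://example.com.mx/anuncio,YES,NO,YES,...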

#def get_search_results(query):
#    import urllib2
#    import simplejson
#
#    # The request also includes the userip parameter which provides the end
#    # user's IP address. Doing so will help distinguish this legitimate
#    # server-side traffic from traffic which doesn't come from an end-user.
#    url = ('https://ajax.googleapis.com/ajax/services/search/web'
#           '?v=1.0&q=Paris%20Hilton&userip=USERS-IP-ADDRESS')
#
#    request = urllib2.Request(
#        url, None, {'Referer': 'ENTER-THE-URL-OF-YOUR-SITE-HERE'})
#    response = urllib2.urlopen(request)
#
#    # Process the JSON string.
#    results = simplejson.load(response)

def main():
    # get a list with the first NUM_URLS search results
    urls = []
    for url in search(SEARCH_CRITERIA, stop=NUM_URLS):
        urls.append(str(url))
    # filter out URLs that belong to an already-seen domain
    urls = not_duplicate(urls)
    # go to each web page and gather data
    data = gather_data(urls)
    # create a CSV file with the data just gathered
    send_to_csv(data)
    print "DONE"

if __name__ == '__main__':
    main()