#!/usr/bin/env python3
## StreetEasy Apartment Finder
import datetime
import math
import time
from urllib.request import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
# Run from the command line
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="StreetEasy Apartment Finder")
    parser.add_argument("--outfile", help="Example: test.csv")
    parser.add_argument("--area")
    parser.add_argument("--min_price", default="")
    parser.add_argument("--max_price", default="")
    parser.add_argument("--beds", default="", help="Examples: =1, <=1, >=1")
    parser.add_argument("--no_fee", default="")
    args = parser.parse_args()
    # Get area numbers by searching StreetEasy for your desired location and
    # looking in the address bar; the search bar also shows whether new
    # parameters are needed.
    price = args.min_price + "-" + args.max_price
    area = args.area
    # StreetEasy spells an exact match as "beds:1", so rewrite "=1" to ":1";
    # "<=1" and ">=1" pass through unchanged. startswith() also avoids the
    # IndexError the old args.beds[0] check hit when --beds was omitted.
    if args.beds.startswith("="):
        beds = ":" + args.beds[1:]
    else:
        beds = args.beds
    no_fee = args.no_fee  # "1" returns only no-fee listings; "0" excludes them
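    # Example invocation (all values are placeholders, not recommendations):
    #   python3 avails.py --outfile test.csv --area 303 --min_price 2000 \
    #       --max_price 3500 --beds "<=1" --no_fee 1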
    def create_search_url(price, area, beds, no_fee):
        # %7C is a URL-encoded pipe ("|"), StreetEasy's parameter separator
        start = 'http://streeteasy.com/for-rent/nyc/status:open'
        price_param = "%7Cprice:"
        area_param = "%7Carea:"
        beds_param = "%7Cbeds"
        fee_param = "%7Cno_fee:"
        page_param = "?page="
        url = start + price_param + price + area_param + area + beds_param + beds
        if no_fee in ('1', '0'):
            url = url + fee_param + no_fee + page_param
        else:
            url = url + page_param
        return url
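    # For instance, create_search_url("2000-3500", "303", "<=1", "1") returns
    # "http://streeteasy.com/for-rent/nyc/status:open%7Cprice:2000-3500%7Carea:303%7Cbeds<=1%7Cno_fee:1?page="
    # (placeholder values again).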
    search_url = create_search_url(price, area, beds, no_fee)
    print(search_url)
    print("Starting at " + str(datetime.datetime.now()))

    def create_page_url(page):
        return search_url + str(page)
    # Get the number of listings from the first search page
    url = create_page_url(1)
    r = urlopen(url)
    # getcode() returns an int, so compare against 404, not "404"
    if r.getcode() == 404:
        time.sleep(1)
    else:
        r = r.read()
        soup = BeautifulSoup(r, 'html.parser')
        num_results = soup.find_all('div', {'class': 'result-count first'})[0].text
        # 14 results per page, but the top 2 featured listings don't count
        # toward the total, so each page contributes 12; round up so a
        # partial final page isn't skipped.
        num_pages = math.ceil(int(num_results.replace(',', '')) / 12)
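    # e.g. 100 results -> math.ceil(100 / 12) = 9 pages; the old
    # int(100 / 12) = 8 would have dropped the listings on the last page.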
    # Create a list of links to each listing page
    links = []
    for page in range(1, num_pages + 1):
        print("Pages percent done: ", round((page / num_pages) * 100, 3))
        url = create_page_url(page)
        r = urlopen(url)
        if r.getcode() == 404:
            time.sleep(1)
        else:
            r = r.read()
            soup = BeautifulSoup(r, 'html.parser')
            listings = soup.find_all('div', {'class': 'left-two-thirds items item-rows listings'})[0]
            # Iterate over the rows actually present; this handles a final
            # page with fewer than 14 results without catching IndexError.
            for row in listings.find_all('div', {'class': 'details row'}):
                links.append(row.find('a', href=True)['href'])
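    # Each href is a path relative to streeteasy.com, which is why the loops
    # below prepend the domain when requesting each listing page.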
    # Visit each listing page and extract the date available
    avails = {}
    for i, link in enumerate(links):
        print("Getting date for (", i + 1, " of ", len(links), ") ", link)
        r = urlopen("http://streeteasy.com" + link)
        if r.getcode() == 404:
            time.sleep(1)
        else:
            r = r.read()
            soup = BeautifulSoup(r, 'html.parser')
            div_set = soup.find_all('div', {'class': 'right-two-fifths'})[0].find_all('h6')
            text_div_set = [x.text for x in div_set]
            # The label varies between "Available on" and "Listing Availability"
            if "Available on" in text_div_set:
                index = text_div_set.index("Available on")
                date = div_set[index].parent.text.split("\n")[2].strip()
                avails["http://streeteasy.com" + link] = date
            elif "Listing Availability" in text_div_set:
                index = text_div_set.index("Listing Availability")
                date = div_set[index].parent.text.split("\n")[2].strip()
                avails["http://streeteasy.com" + link] = date
            else:
                avails["http://streeteasy.com" + link] = np.nan
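    # Note: split("\n")[2] assumes the date sits on the third line of the
    # label's parent block in StreetEasy's current markup; if the page layout
    # changes, this index will need updating.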
    # Visit each listing page and extract the price
    prices = {}
    for i, link in enumerate(links):
        print("Getting rent for (", i + 1, " of ", len(links), ") ", link)
        r = urlopen("http://streeteasy.com" + link)
        if r.getcode() == 404:
            time.sleep(1)
        else:
            try:
                r = r.read()
                soup = BeautifulSoup(r, 'html.parser')
                div_set = soup.find_all('div', {'class': 'price '})[0]
                price = div_set.parent.text.split("\n")[3].strip()
                prices["http://streeteasy.com" + link] = price
            except IndexError:
                print("No rent listed.")
    # Visit each listing page and extract the description
    listingdesc = {}
    for i, link in enumerate(links):
        print("Getting description for (", i + 1, " of ", len(links), ") ", link)
        r = urlopen("http://streeteasy.com" + link)
        if r.getcode() == 404:
            time.sleep(1)
        else:
            try:
                r = r.read()
                soup = BeautifulSoup(r, 'html.parser')
                # The meta description tag holds a short summary of the listing
                div_set = soup.find_all('meta', {'name': 'description'})[0]
                listingdesc["http://streeteasy.com" + link] = div_set['content']
            except IndexError:
                print("Description error.")
    # Visit each listing page and extract the amenities
    amenities = {}
    for i, link in enumerate(links):
        print("Getting amenities for (", i + 1, " of ", len(links), ") ", link)
        r = urlopen("http://streeteasy.com" + link)
        if r.getcode() == 404:
            time.sleep(1)
        else:
            try:
                r = r.read()
                soup = BeautifulSoup(r, 'html.parser')
                div_set = soup.find_all('div', {'class': 'third'})
                text_div_set = [x.text.strip().split("\n") for x in div_set]
                # Flatten the nested lists and drop whitespace-only entries
                amenlist = [y.strip() for x in text_div_set for y in x if y.strip()]
                # Keep only title-cased entries; this filters out lowercase
                # googletag script fragments scraped from the same divs
                amenlist2 = [x for x in amenlist if x[0].istitle()]
                # Join into a single comma-separated string
                amenlist3 = ', '.join(amenlist2)
                amenities["http://streeteasy.com" + link] = amenlist3
            except IndexError:
                print("Amenities error.")
    # Create dataframes from the results
    df1 = pd.DataFrame.from_dict(avails, orient="index").reset_index()
    df1.columns = ['link', 'date_avail']
    df2 = pd.DataFrame.from_dict(prices, orient="index").reset_index()
    df2.columns = ['link', 'rent']
    df3 = pd.DataFrame.from_dict(listingdesc, orient="index").reset_index()
    df3.columns = ['link', 'listingdesc']
    df4 = pd.DataFrame.from_dict(amenities, orient="index").reset_index()
    df4.columns = ['link', 'amenities']
    df = pd.merge(df1, df2, on='link')
    df = pd.merge(df, df3, on='link')
    df = pd.merge(df, df4, on='link')
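    # df now holds one row per listing with columns link, date_avail, rent,
    # listingdesc, and amenities; record_added is appended below.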
    df = df.sort_values("date_avail", na_position="last")
    # Stamp each record with the date it was written to file
    df['record_added'] = datetime.datetime.now().date().strftime("%Y-%m-%d")
    df.to_csv(args.outfile, index=False)
    print("Ending at " + str(datetime.datetime.now()))