bird_scraper.py
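"""Scrape Wikipedia's list of North American birds and save each species
page to disk, with resumable progress tracking."""
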
import json
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

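# NOTE: fill in a descriptive User-Agent before running. Wikipedia's etiquette
# asks for one that identifies your script, and an empty string may get
# requests blocked.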
user_agent_string = ""

class WikiBirdScraper:
    def __init__(self):
        self.base_url = "https://en.wikipedia.org"
        self.birds_list_url = "/wiki/List_of_birds_of_North_America"
        self.output_dir = "bird_pages"
        self.headers = {
            'User-Agent': user_agent_string
        }
        # Progress checkpoint used to resume interrupted runs
        self.cache_file = os.path.join(self.output_dir, "scraping_progress.json")
    def setup_directory(self):
        """Create the output directory if it doesn't exist."""
        os.makedirs(self.output_dir, exist_ok=True)
    def get_bird_links(self):
        """Scrape the main bird list page and return all bird page URLs."""
        response = requests.get(
            urljoin(self.base_url, self.birds_list_url),
            headers=self.headers,
            timeout=30
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find('div', {'class': 'mw-parser-output'})
        bird_links = []

        # Species sections sit between the "Tinamous" h2 and the
        # "See also" h2, so walk the h2 headers and collect in between
        h2_headers = content.find_all('h2')
        start_collecting = False
        for h2 in h2_headers:
            header_text = h2.get_text().strip()
            print(f"Found header: {header_text}")  # Debug print
            if 'Tinamous' in header_text:
                start_collecting = True
                print("\nStarting to collect birds...")
            elif 'See also' in header_text:
                print("\nReached See also section, stopping...")
                break
            if start_collecting:
                # Get all ul elements until the next h2
                current = h2.find_next()
                while current and current.name != 'h2':
                    if current.name == 'ul':
                        for link in current.find_all('a'):
                            href = link.get('href', '')
                            # Keep article links only: skip namespaced pages
                            # (href contains ':'), other lists, and family pages
                            if (href and
                                    href.startswith('/wiki/') and
                                    ':' not in href and
                                    not href.startswith('/wiki/List_of') and
                                    not href.startswith('/wiki/Family_')):
                                bird_links.append({
                                    'href': href,
                                    'name': link.text,
                                    'section': header_text
                                })
                    current = current.find_next()

        if not bird_links:
            print("Warning: No birds found! Check if the page structure has changed.")
        else:
            total_birds = len(bird_links)
            print(f"\nFound {total_birds} birds in total")
            print("\nFirst 10 birds found:")
            for i, bird in enumerate(bird_links[:10], 1):
                print(f"{i}. {bird['name']} ({bird['href']}) - Section: {bird['section']}")
            if total_birds > 20:  # Only show the last 10 if there are more than 20 birds
                print("\nLast 10 birds found:")
                for i, bird in enumerate(bird_links[-10:], total_birds - 9):
                    print(f"{i}. {bird['name']} ({bird['href']}) - Section: {bird['section']}")
        return [link['href'] for link in bird_links]
    def download_bird_pages(self, limit=None):
        """Download individual bird pages.

        Args:
            limit (int, optional): Maximum number of pages to download.
                Defaults to None (all pages).
        """
        bird_links = self.get_bird_links()
        if limit:
            print(f"\nLimiting to first {limit} birds:")
            for i, link in enumerate(bird_links[:limit], 1):
                print(f"{i}. {link}")
            print()
            bird_links = bird_links[:limit]
        # Load existing metadata to skip birds that are already downloaded
        downloaded_birds = set()
        metadata = {}
        metadata_path = os.path.join(self.output_dir, 'metadata.json')
        if os.path.exists(metadata_path):
            try:
                with open(metadata_path, 'r') as f:
                    metadata = json.load(f)
                downloaded_birds = {data['original_link'] for data in metadata.values()
                                    if data.get('original_link')}
                print(f"Found {len(downloaded_birds)} previously downloaded birds")
            except Exception as e:
                print(f"Error loading existing metadata: {e}")

        # Load the scraping progress checkpoint if it exists
        last_processed_idx = -1
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r') as f:
                    progress = json.load(f)
                last_processed_idx = progress.get('last_processed_idx', -1)
                last_bird = progress.get('last_bird', '')
                print(f"Resuming from bird #{last_processed_idx + 1} (after {last_bird})")
            except Exception as e:
                print(f"Error loading scraping progress: {e}")

        # Skip already-downloaded birds and resume from the last position
        remaining_links = [(idx, link) for idx, link in enumerate(bird_links)
                           if idx > last_processed_idx and link not in downloaded_birds]
        print(f"Found {len(remaining_links)} new birds to download")
        for idx, link in remaining_links:
            try:
                # Construct the full URL and derive a filename from the article slug
                full_url = urljoin(self.base_url, link)
                filename = link.split('/')[-1] + '.html'
                filepath = os.path.join(self.output_dir, filename)

                # Download and save the page
                response = requests.get(full_url, headers=self.headers, timeout=30)
                response.raise_for_status()
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(response.text)

                # Record metadata for this download
                metadata[filename] = {
                    'url': full_url,
                    'original_link': link,
                    'downloaded_at': time.strftime('%Y-%m-%d %H:%M:%S')
                }
                print(f"Downloaded ({len(metadata)}/{len(bird_links)}): {filename}")

                # Checkpoint progress so an interrupted run can resume here
                with open(self.cache_file, 'w') as f:
                    json.dump({'last_processed_idx': idx, 'last_bird': link}, f)

                # Save metadata after each successful download
                with open(metadata_path, 'w') as f:
                    json.dump(metadata, f, indent=2)

                # Rate limiting - be nice to Wikipedia
                time.sleep(0.5)
            except Exception as e:
                # Continue with the next bird; don't advance the checkpoint
                print(f"Error downloading {link}: {e}")

        # Final save of metadata (redundant but safe)
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

def main():
    scraper = WikiBirdScraper()
    scraper.setup_directory()
    scraper.download_bird_pages()


if __name__ == "__main__":
    main()
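
# For a quick test run, pass a limit so only a few pages are fetched, e.g.
# scraper.download_bird_pages(limit=5). Deleting bird_pages/scraping_progress.json
# resets the resume checkpoint; metadata.json still prevents re-downloading
# pages that were already saved.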