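"""bird_parser.py

Parse previously downloaded Wikipedia bird pages into plain-text
descriptions and infobox images. Text is written to parsed_birds/text/,
images to parsed_birds/images/<bird_name>/, and progress is checkpointed
to parsing_metadata.json and parsing_progress.json after every bird so an
interrupted run can resume where it left off.
"""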
import json
import os
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
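
# Wikimedia asks clients to identify themselves with a descriptive
# User-Agent (tool name plus contact information); set this before running.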
user_agent_string = ""


class WikiBirdParser:
    def __init__(self):
        self.pages_dir = "bird_pages"
        self.output_dir = "parsed_birds"
        self.text_dir = os.path.join(self.output_dir, "text")
        self.images_base_dir = os.path.join(self.output_dir, "images")
        self.parsed_metadata = {}
        self.headers = {
            'User-Agent': user_agent_string
        }
        self.cache_file = os.path.join(self.output_dir, "parsing_progress.json")

    def setup_directories(self):
        """Create necessary directories if they don't exist"""
        for directory in [self.output_dir, self.text_dir, self.images_base_dir]:
            Path(directory).mkdir(parents=True, exist_ok=True)

    def clean_text(self, text):
        """Clean extracted text by removing extra whitespace and citations"""
        # Remove citation brackets [1], [2], etc.
        text = re.sub(r'\[\d+\]', '', text)
        # Remove extra whitespace
        text = ' '.join(text.split())
        return text

    def is_valid_bird_image(self, image, image_url, alt_text):
        """Check if the image is a valid bird image"""
        if not image or not image_url:
            return False
        # Skip status/range maps and icons
        if ('status' in image_url.lower() or
                'range' in image_url.lower() or
                'icon' in image_url.lower() or
                'map' in image_url.lower()):
            return False
        # Skip small icons/decorative images
        width = int(image.get('width', 0))
        if width and width < 100:
            return False
        return True
    def categorize_image(self, alt_text, caption_text=""):
        """Categorize image based on alt text and caption"""
        text = (alt_text + " " + caption_text).lower()
        # Match 'male'/'female' as whole words (plural allowed): a bare
        # substring test would find 'male' inside 'female' and mislabel
        # female-only images as both_genders.
        has_male = re.search(r'\bmales?\b', text) is not None
        has_female = re.search(r'\bfemales?\b', text) is not None
        if has_male and has_female:
            return 'both_genders'
        if has_male:
            return 'male'
        if has_female:
            return 'female'
        return 'unspecified'

    def download_image(self, image_url, image_path):
        """Download an image and return success status"""
        try:
            # A timeout keeps one stalled connection from hanging the whole
            # run (the 30s value is an arbitrary choice).
            img_response = requests.get(image_url, headers=self.headers, timeout=30)
            img_response.raise_for_status()
            with open(image_path, 'wb') as f:
                f.write(img_response.content)
            return True
        except Exception as e:
            print(f"Error downloading image {image_url}: {str(e)}")
            return False

    def parse_bird_page(self, filename):
        """Parse a single bird page for text and image"""
        bird_name = filename.replace('.html', '')

        # Skip if already parsed
        if bird_name in self.parsed_metadata:
            print(f"Bird {bird_name} already parsed. Skipping.")
            return True

        with open(os.path.join(self.pages_dir, filename), 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # Create bird-specific image directory
        bird_image_dir = os.path.join(self.images_base_dir, bird_name)
        Path(bird_image_dir).mkdir(exist_ok=True)

        # Extract main content
        content = soup.find('div', {'id': 'mw-content-text'})
        if not content:
            print(f"No content found for {bird_name}")
            return False

        # Get paragraphs
        paragraphs = []
        for p in content.find_all('p'):
            if p.text.strip():
                cleaned_text = self.clean_text(p.text)
                if cleaned_text:
                    paragraphs.append(cleaned_text)

        # Save text content
        text_filename = f"{bird_name}.txt"
        text_path = os.path.join(self.text_dir, text_filename)
        with open(text_path, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(paragraphs))

        # Find and download images - focusing only on infobox
        images_info = []
        infobox = soup.find('table', {'class': 'infobox'})
        if infobox:
            for img in infobox.find_all('img')[:2]:  # Only look at first two images
                # Get caption if available
                caption = ""
                parent_td = img.find_parent('td')
                caption_elem = parent_td.find_next_sibling('td') if parent_td else None
                if caption_elem:
                    caption = caption_elem.get_text(strip=True)
                image_url = urljoin('https:', img.get('src', ''))
                alt_text = img.get('alt', '')
                if self.is_valid_bird_image(img, image_url, alt_text):
                    image_info = {
                        'url': image_url,
                        'alt': alt_text,
                        'caption': caption,
                        'width': img.get('width', 'unknown'),
                        'height': img.get('height', 'unknown'),
                        'type': self.categorize_image(alt_text, caption)
                    }
                    images_info.append(image_info)
                    if len(images_info) >= 2:  # Stop after finding 2 valid images
                        break

        # Download and save images
        successful_images = []
        for i, image_info in enumerate(images_info):
            # Create descriptive filename based on type
            image_type = image_info['type']
            if image_type in ['male', 'female']:
                image_filename = f"{bird_name}_{image_type}.jpg"
            else:
                image_filename = f"{bird_name}_{i+1}.jpg"
            image_path = os.path.join(bird_image_dir, image_filename)

            # Check if image already exists
            if os.path.exists(image_path):
                print(f"Image {image_filename} already exists for {bird_name}")
                image_info['local_path'] = os.path.join(bird_name, image_filename)
                successful_images.append(image_info)
                continue

            if self.download_image(image_info['url'], image_path):
                image_info['local_path'] = os.path.join(bird_name, image_filename)
                successful_images.append(image_info)

        # Store metadata
        self.parsed_metadata[bird_name] = {
            'text_file': text_filename,
            'paragraphs': len(paragraphs),
            'images': successful_images,
            'total_images': len(successful_images),
            'has_male_female_pairs': (
                any(img['type'] == 'male' for img in successful_images)
                and any(img['type'] == 'female' for img in successful_images)
            )
        }

        # Save parsing metadata after each bird to allow resuming
        with open(os.path.join(self.output_dir, 'parsing_metadata.json'), 'w') as f:
            json.dump(self.parsed_metadata, f, indent=2)

        # Update progress cache
        with open(self.cache_file, 'w') as f:
            json.dump({
                'last_parsed_bird': bird_name,
                'total_parsed': len(self.parsed_metadata),
                'last_updated': time.strftime('%Y-%m-%d %H:%M:%S')
            }, f)

        return True

    def parse_all_birds(self):
        """Parse all downloaded bird pages"""
        self.setup_directories()

        # Load original metadata to get list of files
        with open(os.path.join(self.pages_dir, 'metadata.json'), 'r') as f:
            original_metadata = json.load(f)

        # Load existing parsing metadata if available
        if os.path.exists(os.path.join(self.output_dir, 'parsing_metadata.json')):
            with open(os.path.join(self.output_dir, 'parsing_metadata.json'), 'r') as f:
                self.parsed_metadata = json.load(f)
            print(f"Loaded existing metadata for {len(self.parsed_metadata)} birds")

        # Load parsing progress if available
        last_parsed_bird = None
        if os.path.exists(self.cache_file):
            with open(self.cache_file, 'r') as f:
                progress = json.load(f)
                last_parsed_bird = progress.get('last_parsed_bird')
                print(f"Last parsed bird: {last_parsed_bird}")

        # Get all HTML files to parse
        files_to_parse = [f for f in original_metadata.keys() if f.endswith('.html')]
        total_files = len(files_to_parse)
        print(f"Found {total_files} bird pages to parse")

        # Find starting point if resuming
        start_idx = 0
        if last_parsed_bird:
            last_filename = f"{last_parsed_bird}.html"
            if last_filename in files_to_parse:
                start_idx = files_to_parse.index(last_filename) + 1
                print(f"Resuming from file #{start_idx} (after {last_parsed_bird})")

        # Parse each file
        for i, filename in enumerate(files_to_parse[start_idx:], start_idx):
            print(f"Parsing ({i+1}/{total_files}): {filename}")
            success = self.parse_bird_page(filename)
            if not success:
                print(f"Failed to parse {filename}, continuing with next file")

        # Final save of parsing metadata
        with open(os.path.join(self.output_dir, 'parsing_metadata.json'), 'w') as f:
            json.dump(self.parsed_metadata, f, indent=2)
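
# Run directly once the download step has populated bird_pages/ with the
# saved HTML pages and their metadata.json index:
#   python bird_parser.py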
def main():
    parser = WikiBirdParser()
    parser.parse_all_birds()


if __name__ == "__main__":
    main()