-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_toc_reader.py
324 lines (265 loc) · 10.4 KB
/
pdf_toc_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
import fitz # PyMuPDF
from typing import List, Dict, Optional, Tuple, Any
from pathlib import Path
import json
import re
def read_pdf_toc(pdf_path: str | Path) -> List[Dict]:
    """
    Read and return the table of contents from a PDF file using PyMuPDF with page ranges.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of dictionaries, one per level-1 TOC entry. Each dictionary has
        the keys "title", "start_page", "end_page" and "children", where
        "children" is a list of dictionaries of the same shape for the
        directly nested entries. Page numbers are the ones reported by
        ``doc.get_toc()``.

    Raises:
        ValueError: If the PDF has no table of contents
    """
    doc = fitz.open(pdf_path)
    # One try/finally around everything so the document is closed on every
    # path, including a failure inside get_toc() itself.
    try:
        toc = doc.get_toc()
        if not toc:
            raise ValueError("No table of contents found in the PDF")

        # The very last section has no successor to take its end page from,
        # so it is assumed to run to the end of the document.
        last_page = doc.page_count

        def calculate_page_range(entries: List, start_idx: int) -> Tuple[Dict, int]:
            """
            Calculate the page range for a section and its children.

            Returns the processed entry and the index where processing stopped.
            """
            level, title, start_page = entries[start_idx][:3]

            # Initialize with the current entry's page as both start and end
            result = {
                "title": title,
                "start_page": start_page,
                "end_page": start_page,
                "children": [],
            }

            current_idx = start_idx + 1
            while current_idx < len(entries):
                child_level = entries[current_idx][0]

                # Same or shallower level: this section is finished
                if child_level <= level:
                    break

                if child_level == level + 1:
                    # Immediate child: recurse, then continue where it stopped
                    child_entry, current_idx = calculate_page_range(entries, current_idx)
                    result["children"].append(child_entry)
                    # A parent spans at least as far as its furthest child
                    result["end_page"] = max(result["end_page"], child_entry["end_page"])
                else:
                    # Entry that skips a level (e.g. level 3 directly under
                    # level 1): it is not attached to the tree, just stepped over
                    current_idx += 1

            if not result["children"]:
                # Leaf: it ends where the next section starts (inclusive, since
                # two sections may share a page), or at the last page of the
                # document when nothing follows.
                if current_idx < len(entries):
                    next_start = entries[current_idx][2]
                else:
                    next_start = last_page
                # Never report a range that ends before it starts, e.g. when
                # the following entry points to an earlier or unresolved page.
                result["end_page"] = max(start_page, next_start)

            return result, current_idx

        # Process the root level entries
        root_entries = []
        idx = 0
        while idx < len(toc):
            if toc[idx][0] == 1:  # Root level
                entry, idx = calculate_page_range(toc, idx)
                root_entries.append(entry)
            else:
                idx += 1
        return root_entries
    finally:
        doc.close()
def is_leaf_section(section_info: Dict, mappings: Dict) -> bool:
    """
    Tell whether a section has no subsections in the TOC hierarchy.

    A section counts as a parent when some entry in ``mappings`` has a
    "full_toc_path" that begins with this section's "full_toc_path" followed
    by "/".

    Args:
        section_info: The section's information dictionary
        mappings: The complete mappings dictionary

    Returns:
        bool: True if no entry in ``mappings`` lies below this section, False otherwise
    """
    # Linear scan over all entries; it stops at the first descendant found.
    for candidate in mappings.values():
        candidate_path = candidate["full_toc_path"]
        if candidate_path.startswith(f"{section_info['full_toc_path']}/"):
            return False
    return True
def print_toc_with_ranges(toc: List[Dict], level: int = 0) -> None:
    """
    Write the TOC tree to stdout, one line per entry, with its page range.

    Args:
        toc: List of TOC entries with page ranges
        level: Nesting depth of ``toc``; each entry is prefixed with this
            many indent units
    """
    # Every entry of this list sits at the same depth, so build the prefix once
    pad = " " * level
    for entry in toc:
        first, last = entry["start_page"], entry["end_page"]
        print(f"{pad}{entry['title']} (pages {first}-{last})")

        subsections = entry["children"]
        if subsections:
            print_toc_with_ranges(subsections, level + 1)
def sanitize_filename(name: str, max_length: int = 30) -> str:
    """
    Convert a string to a valid filename across operating systems.

    Args:
        name: The string to convert
        max_length: Maximum length of the resulting filename (the
            "unnamed_section" fallback is returned as-is, even if longer)

    Returns:
        A sanitized filename string; "unnamed_section" if nothing usable is left
    """
    # Replace reserved punctuation and ASCII control characters with
    # underscores. Control characters (newlines, tabs, NUL, ...) are not
    # allowed in Windows file names.
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", name)

    # Remove leading/trailing spaces and dots
    name = name.strip(". ")

    # Limit length; cutting may expose a new trailing dot/space, so strip again
    if len(name) > max_length:
        name = name[:max_length].rstrip(". ")

    # Ensure we have a valid name
    if not name:
        name = "unnamed_section"
    return name
def create_directory_structure(
    pdf_path: str | Path,
    toc: List[Dict],
    output_path: str | Path = ".",
    doc: Optional[fitz.Document] = None,
    skip_keywords: Optional[set[str]] = None,
    screenshot_dpi: int = 300,
) -> Path:
    """
    Create a directory structure matching the TOC and save page screenshots.

    One directory is created per TOC entry (named after the sanitized title).
    For every entry without children, each page in its range is rendered to
    ``page_<n>.jpg`` inside that directory. A ``directory_mapping.json`` file
    describing every created directory is written to the top directory.

    Args:
        pdf_path: Path to the PDF file
        toc: Table of contents data (entries with "title", "start_page",
            "end_page" and "children" keys)
        output_path: Where to create the directory structure
        doc: Optional open PDF document (to avoid reopening); it is left open
            for the caller to close
        skip_keywords: Keywords that exclude a section (and everything below
            it) when found in its title; matched case-insensitively. None or
            an empty set skips nothing
        screenshot_dpi: Resolution used to render the page screenshots

    Returns:
        Path to the created directory structure
    """
    pdf_path = Path(pdf_path)
    output_path = Path(output_path)

    # Titles are lower-cased before matching, so the keywords must be too.
    # Treating None as "no keywords" avoids iterating over None.
    lowered_keywords = {keyword.lower() for keyword in (skip_keywords or ())}

    # Use PDF filename as top directory name
    top_dir = output_path / sanitize_filename(pdf_path.stem)
    top_dir.mkdir(parents=True, exist_ok=True)

    # Maps each sanitized relative directory path to its section details
    path_mapping = {}

    # Open PDF if not provided; only a document opened here is closed here
    should_close_doc = False
    if doc is None:
        doc = fitz.open(pdf_path)
        should_close_doc = True

    try:

        def get_trimmed_pixmap(page: fitz.Page) -> fitz.Pixmap:
            """Get a trimmed pixmap of the page content."""
            # Get the content bounds by joining all content bboxes
            content_bbox = fitz.Rect()  # start with empty rectangle
            for item in page.get_bboxlog():
                # item[1] contains the bbox coordinates
                content_bbox |= item[1]  # join this bbox into result

            # If no content was found, use the full page
            if content_bbox.is_empty:
                content_bbox = page.bound()

            # Add small margin (5 points) around content
            margin = 5
            content_bbox += (-margin, -margin, margin, margin)

            # Ensure we stay within page bounds
            page_bbox = page.bound()
            content_bbox &= page_bbox  # intersect with page bounds

            # Scale from 72 units per inch up to screenshot_dpi
            zoom = screenshot_dpi / 72
            matrix = fitz.Matrix(zoom, zoom)

            # Create pixmap of just the content area
            return page.get_pixmap(
                matrix=matrix,
                clip=content_bbox,
            )

        def process_toc_item(
            item: Dict[str, Any],
            current_path: Path,
            parent_path: str = "",
            full_toc_path: str = "",
        ) -> None:
            """Process a TOC item and create corresponding directory structure"""
            # Skip this section, and with it all of its children
            title_lower = item["title"].lower()
            if any(keyword in title_lower for keyword in lowered_keywords):
                return

            # Sanitize the current directory name
            # NOTE: siblings whose sanitized names are equal share one
            # directory, and the later one overwrites the mapping entry.
            dir_name = sanitize_filename(item["title"])
            dir_path = current_path / dir_name

            # Relative path built from sanitized names (key of the mapping)
            if parent_path:
                rel_path = f"{parent_path}/{dir_name}"
            else:
                rel_path = dir_name

            # Full path built from the original, unsanitized titles
            if full_toc_path:
                full_unsanitized_path = f"{full_toc_path}/{item['title']}"
            else:
                full_unsanitized_path = item["title"]

            # Create directory
            dir_path.mkdir(exist_ok=True)

            # Add to mapping
            path_mapping[rel_path] = {
                "title": item["title"],
                "page_range": {"start": item["start_page"], "end": item["end_page"]},
                "full_toc_path": full_unsanitized_path,
            }

            if item["children"]:
                # Process children
                for child in item["children"]:
                    process_toc_item(child, dir_path, rel_path, full_unsanitized_path)
            else:
                # Leaf node: save page screenshots.
                # start_page/end_page are used as 1-based inclusive bounds.
                # Clamp the first index at 0 so a non-positive start page can
                # never become a negative index into the document.
                first_index = max(item["start_page"] - 1, 0)
                for page_num in range(first_index, item["end_page"]):
                    try:
                        page = doc[page_num]
                        pix = get_trimmed_pixmap(page)
                        image_path = dir_path / f"page_{page_num + 1}.jpg"
                        pix.save(str(image_path))
                    except Exception as e:
                        # Best effort: one bad page must not abort the export
                        print(f"Error processing page {page_num + 1}: {e}")

        # Process all root entries
        for item in toc:
            process_toc_item(item, top_dir)

        # Save mapping file
        mapping_path = top_dir / "directory_mapping.json"
        with open(mapping_path, "w", encoding="utf-8") as f:
            json.dump(path_mapping, f, indent=2, ensure_ascii=False)

        return top_dir
    finally:
        if should_close_doc:
            doc.close()
def process_pdf_and_create_structure(
    pdf_path: str | Path,
    output_path: str | Path = ".",
    skip_keywords: Optional[set[str]] = None,
) -> Path:
    """
    Process a PDF file, extract its table of contents, and create a directory structure.

    The extracted TOC and progress messages are printed to stdout.

    Args:
        pdf_path: Path to the PDF file
        output_path: Where to create the directory structure
        skip_keywords: Optional set of keywords to skip in section titles (case-insensitive)

    Returns:
        Path to the created directory structure

    Raises:
        ValueError: If the PDF has no table of contents or other validation errors
        Exception: For other errors during processing (printed, then re-raised)
    """
    # Section titles are lower-cased before matching, so lower-case the
    # keywords here; None becomes an empty set because the keywords are
    # iterated further down.
    if skip_keywords is None:
        keywords = set()
    else:
        keywords = {keyword.lower() for keyword in skip_keywords}

    try:
        toc = read_pdf_toc(pdf_path)
        print("Extracted ToC:")
        print("-" * 40)
        print_toc_with_ranges(toc)

        print("\nCreating directory structure...")
        doc = fitz.open(pdf_path)
        try:
            top_dir = create_directory_structure(
                pdf_path,
                toc,
                output_path=output_path,
                doc=doc,
                skip_keywords=keywords,
            )
        finally:
            # Close the document even when creating the structure fails
            doc.close()

        print("Done!")
        return top_dir
    except Exception as e:
        print(f"Error: {e}")
        raise