Skip to content

Commit

Permalink
Merge pull request #9 from YM162/development
Browse files Browse the repository at this point in the history
Fix for PDFs with unusual MediaBoxes
  • Loading branch information
YM162 authored Sep 20, 2023
2 parents 9090208 + b1fa97b commit 3d72951
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion gulagcleaner/extract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pdfrw import PdfReader, PdfWriter
from pdfrw.findobjs import wrap_object, find_objects
from pdfrw.objects import PdfName
from pdfrw.objects import PdfName, PdfArray

def find_iobj_pairs(first_page, second_page):
"""
Expand Down Expand Up @@ -86,6 +86,8 @@ def clean_pdf(pdf_path, output_path="", method="auto"):
newpage = page.copy()
newpage.Contents = [pdf.indirect_objects[iobj] for iobj in new_contents[i]]
newpage.Annots = []
newpage.MediaBox = PdfArray([0,0,float(newpage.MediaBox[2])-float(newpage.MediaBox[0]),float(newpage.MediaBox[3])-float(newpage.MediaBox[1])])
newpage.BleedBox = PdfArray([0,0,float(newpage.BleedBox[2])-float(newpage.BleedBox[0]),float(newpage.BleedBox[3])-float(newpage.BleedBox[1])])
newpages.append(newpage)

#Naive method. Just detects the pages with ads and crops them. THIS METHOD IS NOT RECOMMENDED AT ALL. It is very unreliable, and when copying text from the output PDF the ads and watermarks are copied as well, because we are just "hiding" them from the user, not truly removing them.
Expand Down

0 comments on commit 3d72951

Please sign in to comment.