From 270c47925fa8441b7e8cac1cf49f5d92999f261b Mon Sep 17 00:00:00 2001 From: YM162 Date: Tue, 19 Sep 2023 18:19:32 +0200 Subject: [PATCH] Fixed MediaBox and BleedBox for PDFs where the initial coordinates were not 0,0 (New method). --- gulagcleaner/extract.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gulagcleaner/extract.py b/gulagcleaner/extract.py index 0c384bf..8e8f44e 100644 --- a/gulagcleaner/extract.py +++ b/gulagcleaner/extract.py @@ -1,6 +1,6 @@ from pdfrw import PdfReader, PdfWriter from pdfrw.findobjs import wrap_object, find_objects -from pdfrw.objects import PdfName +from pdfrw.objects import PdfName, PdfArray def find_iobj_pairs(first_page, second_page): """ @@ -86,6 +86,8 @@ def clean_pdf(pdf_path, output_path="", method="auto"): newpage = page.copy() newpage.Contents = [pdf.indirect_objects[iobj] for iobj in new_contents[i]] newpage.Annots = [] + newpage.MediaBox = PdfArray([0,0,float(newpage.MediaBox[2])-float(newpage.MediaBox[0]),float(newpage.MediaBox[3])-float(newpage.MediaBox[1])]) + newpage.BleedBox = PdfArray([0,0,float(newpage.BleedBox[2])-float(newpage.BleedBox[0]),float(newpage.BleedBox[3])-float(newpage.BleedBox[1])]) newpages.append(newpage) #Naive method. Just detects the pages with ads and crops them. THIS METHOD IS NOT RECOMENDED AT ALL. It is very unreliable and when copying text from the outputed pdf the ads and watermaks are copied as well, because we are just "hiding" them from the user, not truly removing them.