Skip to content

Commit

Permalink
Added experimental auto-detection of extraction method.
Browse files Browse the repository at this point in the history
  • Loading branch information
YM162 committed Jun 28, 2023
1 parent 44b6e2b commit 9d241ce
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 21 deletions.
24 changes: 6 additions & 18 deletions gulagcleaner/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,10 @@ def main():
Available CLI arguments:
-h : Display help information.
-r : Replace the original file with the cleaned file.
-o : Use the old cleaning method (for files older than 18/05/2023).
-v : Display the version of the program.
'''
import sys
import os.path

# Check for the -h argument
if '-h' in sys.argv:
Expand All @@ -31,40 +29,33 @@ def main():
print("Optional arguments:")
print(" -h Show this help message.")
print(" -r Replace the original file with the cleaned file.")
print(" -o Use the old cleaning method (for files older than 18/05/2023).")
print(" -v Show the version of the program.")
return

# Check for the -v argument
if '-v' in sys.argv:
print("Current version: 0.6.4")
print("Current version: 0.7.0")
return

# Get the pdf_path argument
if len(sys.argv) < 2:
print('Usage: gulagcleaner [-h] [-r] [-o] [-v] <pdf_path>')
print('Usage: gulagcleaner [-h] [-r] [-v] <pdf_path>')
return
pdf_path = sys.argv[-1]

# Check if the file exists
if not exists(pdf_path):
print("File not found.")
return

# Check if the -r argument is present
if '-r' in sys.argv:
output_path = pdf_path
else:
output_path = pdf_path[:-4] + "_clean.pdf"

# Check if the -o argument is present
if '-o' in sys.argv:
method = "old"
pdf_path = decrypt_pdf(pdf_path)
intermediate = True
else:
method = "new"
intermediate = False
#We decrypt the PDF file
pdf_path = decrypt_pdf(pdf_path)

#Extract metadata
try:
Expand All @@ -80,15 +71,12 @@ def main():
print("Failed to extract metadata:", e)

# Call the cleaning function
return_msg = clean_pdf(pdf_path, output_path, method)
return_msg = clean_pdf(pdf_path, output_path)

if return_msg["Success"]:
print("Cleaning successful. File saved in", return_msg["return_path"])
else:
print("Error:", return_msg["Error"])

if intermediate:
os.remove(pdf_path)

if __name__ == "__main__":
print('Call from the "gulagcleaner" command.')
81 changes: 79 additions & 2 deletions gulagcleaner/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ def find_iobj_pairs(first_page, second_page):
else:
return (first_page,first_page.index(comunes[1]),first_page.index(comunes[0]))

def clean_pdf(pdf_path, output_path="", method="new"):
def clean_pdf(pdf_path, output_path="", method="auto"):
"""
De-embeds the PDF file and creates a new PDF file in the same folder with each embedded page in a new page.
Args:
pdf_path (str): The path to the PDF file.
output_path (str): The path to the output PDF file.
method (str, optional): Defines what strategy will be used to clean the pdf file. Can be "new", "old" or "naive".
method (str, optional): Defines what strategy will be used to clean the pdf file. Can be "new", "old", "naive" or "auto".
Default is "auto".
Returns:
Expand Down Expand Up @@ -92,6 +92,37 @@ def clean_pdf(pdf_path, output_path="", method="new"):
elif method=="naive":
#Not yet implemented.
newpages = []
for page in pdf.pages:
page_type = get_page_type(page)
if page_type == "banner_ads":
newpage = page.copy()
#TODO: Set MediaBox,BleedBox...etc


newpages.append(newpage)

if page_type == "watermark":
newpage = page.copy()
#TODO: Set MediaBox,BleedBox...etc



newpages.append(newpage)

if page_type == "full_page_ads":
continue

if page_type == "unknown":
newpages.append(page)

logo_dims = [(71,390),(37,203),(73,390)]
for logo in [image for image in find_objects(page,valid_subtypes=(PdfName.Image, PdfName.Dummy)) if (int(image.Height),int(image.Width)) in logo_dims]:
logo.Height = 0
logo.Width = 0
#TODO: We scale annotations to 0,0

elif method == "auto":
return auto_clean_pdf(pdf, pdf_path, output_path)

else:
return {
Expand All @@ -117,3 +148,49 @@ def clean_pdf(pdf_path, output_path="", method="new"):
"Error": str(e),
"Meta": {}
}


def auto_clean_pdf(pdf, pdf_path, output_path):
    """Guess which cleaning strategy fits the document and run it.

    The heuristics are tried in order and the first one that matches wins:

    1. "new": the first two pages that have more than one content stream
       share more than one of those streams.
    2. "old": some page has an XObject whose name contains
       "EmbeddedPdfPage".
    3. "naive": fallback when neither of the above matches.

    Args:
        pdf: The already opened PDF object; only ``pdf.pages`` is read here.
        pdf_path (str): The path to the PDF file, forwarded to ``clean_pdf``.
        output_path (str): The path to the output PDF file, forwarded to
            ``clean_pdf``.

    Returns:
        Whatever ``clean_pdf`` returns for the selected method.
    """
    # Test for the new method.
    content_list = []
    for page in pdf.pages:
        contents = page.Contents
        # NOTE(review): assumes a multi-stream /Contents is exposed as a
        # list-like array and a single stream as a non-list object - confirm
        # against the PDF library in use. Anything that is not a list cannot
        # hold several streams, so it is skipped instead of iterated.
        if not isinstance(contents, list):
            continue
        stream_ids = [stream.indirect for stream in contents]
        if len(stream_ids) > 1:
            content_list.append(stream_ids)

    # Two multi-stream pages are needed for the comparison; with fewer than
    # two, indexing content_list[1] would raise an IndexError.
    if len(content_list) > 1:
        shared = set(content_list[0]).intersection(content_list[1])
        if len(shared) > 1:
            return clean_pdf(pdf_path, output_path, method="new")

    # Test for the old method.
    for page in pdf.pages:
        resources = page.Resources
        xobjects = resources.XObject if resources is not None else None
        # Pages without resources or without XObjects simply do not match.
        if not xobjects:
            continue
        if any("EmbeddedPdfPage" in str(name) for name in xobjects):
            return clean_pdf(pdf_path, output_path, method="old")

    # We don't know what method to use, so we use the naive one.
    return clean_pdf(pdf_path, output_path, method="naive")

def get_page_type(page):
    """Classify a page by the pixel dimensions of the images it contains.

    Every image found on the page is reduced to a ``(height, width)`` pair
    and looked up in tables of known dimensions.

    Args:
        page: The page object to inspect; it is passed to ``find_objects``.

    Returns:
        str: One of, checked in this order:
            "banner_ads"    - both a horizontal and a vertical banner found.
            "full_page_ads" - a full-page sized image found.
            "watermark"     - a logo sized image found.
            "unknown"       - none of the above.
    """
    images = set()
    for image in find_objects(page, valid_subtypes=(PdfName.Image, PdfName.Dummy)):
        # Convert to int so the pairs compare equal to the integer tables
        # below, matching the int() conversion used for the logo check in
        # clean_pdf. Images without usable dimensions are ignored.
        try:
            images.add((int(image.Height), int(image.Width)))
        except (TypeError, ValueError):
            continue

    # There has to be a better way to do this, but this works for 99.9% of the cases.
    logo_dims = {(71, 390), (37, 203), (73, 390)}
    horizontal_banner_dims = {
        (247, 1414), (213, 1219), (215, 1219), (249, 1414), (217, 1240),
    }
    vertical_banner_dims = {
        (1753, 170), (1518, 248), (1520, 147), (1753, 177), (1751, 171),
        (1537, 147),
    }
    full_page_dims = {
        (842, 595), (1754, 1240), (2526, 1785), (1733, 1219), (3508, 2480),
        (2339, 1653),
    }

    # Each flag is checked against its own table.
    has_logo = not images.isdisjoint(logo_dims)
    has_horizontal_banner = not images.isdisjoint(horizontal_banner_dims)
    has_vertical_banner = not images.isdisjoint(vertical_banner_dims)
    has_full_page = not images.isdisjoint(full_page_dims)

    if has_horizontal_banner and has_vertical_banner:
        return "banner_ads"
    if has_full_page:
        return "full_page_ads"
    if has_logo:
        return "watermark"
    return "unknown"
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = gulagcleaner
version = 0.6.4
version = 0.7.0
author = YM162
author_email = [email protected]
description = Ad removal tool for PDFs written in python.
Expand Down

0 comments on commit 9d241ce

Please sign in to comment.