Added experimental auto-detection of extraction method.

YM162 · Jun 28, 2023 · 9d241ce · 9d241ce
1 parent 44b6e2b
commit 9d241ce
Show file tree

Hide file tree

Showing 3 changed files with 86 additions and 21 deletions.
diff --git a/gulagcleaner/command_line.py b/gulagcleaner/command_line.py
@@ -12,12 +12,10 @@ def main():
     Available CLI arguments:
     -h : Display help information.
     -r : Replace the original file with the cleaned file.
-    -o : Use the old cleaning method (for files older than 18/05/2023).
     -v : Display the version of the program.
 
     '''
     import sys
-    import os.path
 
     # Check for the -h argument
     if '-h' in sys.argv:
@@ -31,40 +29,33 @@ def main():
         print("Optional arguments:")
         print("  -h            Show this help message.")
         print("  -r            Replace the original file with the cleaned file.")
-        print("  -o            Use the old cleaning method (for files older than 18/05/2023).")
         print("  -v            Show the version of the program.")
         return
 
     # Check for the -v argument
     if '-v' in sys.argv:
-        print("Current version: 0.6.4")
+        print("Current version: 0.7.0")
         return
 
     # Get the pdf_path argument
     if len(sys.argv) < 2:
-        print('Usage: gulagcleaner [-h] [-r] [-o] [-v] <pdf_path>')
+        print('Usage: gulagcleaner [-h] [-r] [-v] <pdf_path>')
         return
     pdf_path = sys.argv[-1]
 
     # Check if the file exists
     if not exists(pdf_path):
         print("File not found.")
         return
-
+    
     # Check if the -r argument is present
     if '-r' in sys.argv:
         output_path = pdf_path
     else:
         output_path = pdf_path[:-4] + "_clean.pdf"
 
-     # Check if the -o argument is present
-    if '-o' in sys.argv:
-        method = "old"
-        pdf_path = decrypt_pdf(pdf_path)
-        intermediate = True
-    else:
-        method = "new"
-        intermediate = False
+    #We decrypt the PDF file
+    pdf_path = decrypt_pdf(pdf_path)
 
     #Extract metadata
     try:
@@ -80,15 +71,12 @@ def main():
         print("Failed to extract metadata:", e)         
 
     # Call the cleaning function
-    return_msg = clean_pdf(pdf_path, output_path, method)
+    return_msg = clean_pdf(pdf_path, output_path)
 
     if return_msg["Success"]:
         print("Cleaning successful. File saved in", return_msg["return_path"])
     else:
         print("Error:", return_msg["Error"])
 
-    if intermediate:
-        os.remove(pdf_path)
-
 if __name__ == "__main__":
     print('Call from the "gulagcleaner" command.')
diff --git a/gulagcleaner/extract.py b/gulagcleaner/extract.py
@@ -21,14 +21,14 @@ def find_iobj_pairs(first_page, second_page):
     else:
         return (first_page,first_page.index(comunes[1]),first_page.index(comunes[0]))
 
-def clean_pdf(pdf_path, output_path="", method="new"):
+def clean_pdf(pdf_path, output_path="", method="auto"):
     """
     De-embeds the PDF file and creates a new PDF file in the same folder with each embedded page in a new page.
 
     Args:
         pdf_path (str): The path to the PDF file.
         output_path (str): The path to the output PDF file.
-        method (str, optional): Defines what strategy will be used to clean the pdf file. Can be "new", "old" or "naive".
+        method (str, optional): Defines what strategy will be used to clean the pdf file. Can be "new", "old", "naive" or "auto".
-            Default is "new".
+            Default is "auto".
 
     Returns:
@@ -92,6 +92,37 @@ def clean_pdf(pdf_path, output_path="", method="new"):
         elif method=="naive":
             #Not yet implemented.
             newpages = []
+            for page in pdf.pages:
+                page_type = get_page_type(page)
+                if page_type == "banner_ads":
+                    newpage = page.copy()
+                    #TODO: Set MediaBox,BleedBox...etc
+
+
+                    newpages.append(newpage)
+
+                if page_type == "watermark":
+                    newpage = page.copy()
+                    #TODO: Set MediaBox,BleedBox...etc
+
+
+
+                    newpages.append(newpage)
+
+                if page_type == "full_page_ads":
+                    continue
+
+                if page_type == "unknown":
+                    newpages.append(page)
+
+                logo_dims = [(71,390),(37,203),(73,390)]
+                for logo in [image for image in find_objects(page,valid_subtypes=(PdfName.Image, PdfName.Dummy)) if (int(image.Height),int(image.Width)) in logo_dims]:
+                    logo.Height = 0
+                    logo.Width = 0
+                #TODO: We scale annotations to 0,0
+
+        elif method == "auto":
+            return auto_clean_pdf(pdf, pdf_path, output_path)
 
         else:
             return {
@@ -117,3 +148,49 @@ def clean_pdf(pdf_path, output_path="", method="new"):
             "Error": str(e),
             "Meta": {}
         }
+
+
+def auto_clean_pdf(pdf, pdf_path, output_path):
+    """
+    Guesses which cleaning method fits the PDF file and runs clean_pdf with it.
+
+    Args:
+        pdf: The already opened PDF file; only its pages are inspected here.
+        pdf_path (str): The path to the PDF file, forwarded to clean_pdf.
+        output_path (str): The path to the output PDF file, forwarded to clean_pdf.
+
+    Returns:
+        The value returned by clean_pdf for the detected method ("new", "old" or "naive").
+    """
+    #Test for the new method: we keep the pages that have more than one entry in Contents.
+    content_list = []
+    for page in pdf.pages:
+        streams = [stream.indirect for stream in page.Contents]
+        if len(streams)>1:
+            content_list.append(streams)
+
+    #We need two of those pages to compare them; with only one, content_list[1] would raise an IndexError.
+    if len(content_list)>1 and len(set(content_list[0]).intersection(content_list[1]))>1:
+        return clean_pdf(pdf_path, output_path, method="new")
+
+    #Test for the old method: some page has an XObject with "EmbeddedPdfPage" in its name.
+    if any("EmbeddedPdfPage" in str(name) for page in pdf.pages for name in page.Resources.XObject):
+        return clean_pdf(pdf_path, output_path, method="old")
+
+    #We don't know what method to use, so we use the naive one.
+    return clean_pdf(pdf_path, output_path, method="naive")
+
+def get_page_type(page):
+    """
+    Classifies a page from the dimensions of the images found on it.
+
+    Args:
+        page: The page to inspect; it is passed to find_objects to list its images.
+
+    Returns:
+        str: "banner_ads" if the page has both a horizontal and a vertical banner, "full_page_ads" if it
+            has a full page sized image, "watermark" if it has a logo (and none of the above),
+            "unknown" otherwise.
+    """
+    #Dimensions are cast with int(), as clean_pdf does for the logos, so they can match the integer tuples below.
+    #Images lacking a dimension cannot match any known size, so they are skipped.
+    images = {
+        (int(image.Height),int(image.Width))
+        for image in find_objects(page,valid_subtypes=(PdfName.Image, PdfName.Dummy))
+        if image.Height is not None and image.Width is not None
+    }
+
+    #There has to be a better way to do this, but this works for 99.9% of the cases.
+    logo_dims = [(71,390),(37,203),(73,390)]
+    horizontal_banner_dims = [(247,1414),(213,1219),(215,1219),(249,1414),(217,1240)]
+    vertical_banner_dims = [(1753,170),(1518,248),(1520,147),(1753,177),(1751,171),(1537,147)]
+    full_page_dims = [(842,595),(1754,1240),(2526,1785),(1733,1219),(3508,2480),(2339,1653)]
+
+    #Every kind of image is looked up in its own list of known (Height, Width) pairs.
+    has_logo = not images.isdisjoint(logo_dims)
+    has_horizontal_banner = not images.isdisjoint(horizontal_banner_dims)
+    has_vertical_banner = not images.isdisjoint(vertical_banner_dims)
+    has_full_page = not images.isdisjoint(full_page_dims)
+
+    if has_horizontal_banner and has_vertical_banner:
+        return "banner_ads"
+    elif has_full_page:
+        return "full_page_ads"
+    elif has_logo:
+        return "watermark"
+    else:
+        return "unknown"
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = gulagcleaner
-version = 0.6.4
+version = 0.7.0
 author = YM162
 author_email = [email protected]
 description = Ad removal tool for PDFs written in python.