diff --git a/gulagcleaner/command_line.py b/gulagcleaner/command_line.py index ce3082b..250925d 100644 --- a/gulagcleaner/command_line.py +++ b/gulagcleaner/command_line.py @@ -12,12 +12,10 @@ def main(): Available CLI arguments: -h : Display help information. -r : Replace the original file with the cleaned file. - -o : Use the old cleaning method (for files older than 18/05/2023). -v : Display the version of the program. ''' import sys - import os.path # Check for the -h argument if '-h' in sys.argv: @@ -31,18 +29,17 @@ def main(): print("Optional arguments:") print(" -h Show this help message.") print(" -r Replace the original file with the cleaned file.") - print(" -o Use the old cleaning method (for files older than 18/05/2023).") print(" -v Show the version of the program.") return # Check for the -v argument if '-v' in sys.argv: - print("Current version: 0.6.4") + print("Current version: 0.7.0") return # Get the pdf_path argument if len(sys.argv) < 2: - print('Usage: gulagcleaner [-h] [-r] [-o] [-v] ') + print('Usage: gulagcleaner [-h] [-r] [-v] ') return pdf_path = sys.argv[-1] @@ -50,21 +47,15 @@ def main(): if not exists(pdf_path): print("File not found.") return - + # Check if the -r argument is present if '-r' in sys.argv: output_path = pdf_path else: output_path = pdf_path[:-4] + "_clean.pdf" - # Check if the -o argument is present - if '-o' in sys.argv: - method = "old" - pdf_path = decrypt_pdf(pdf_path) - intermediate = True - else: - method = "new" - intermediate = False + #We decrypt the PDF file + pdf_path = decrypt_pdf(pdf_path) #Extract metadata try: @@ -80,15 +71,12 @@ def main(): print("Failed to extract metadata:", e) # Call the cleaning function - return_msg = clean_pdf(pdf_path, output_path, method) + return_msg = clean_pdf(pdf_path, output_path) if return_msg["Success"]: print("Cleaning successful. 
File saved in", return_msg["return_path"]) else: print("Error:", return_msg["Error"]) - if intermediate: - os.remove(pdf_path) - if __name__ == "__main__": print('Call from the "gulagcleaner" command.') diff --git a/gulagcleaner/extract.py b/gulagcleaner/extract.py index 8564385..0c384bf 100644 --- a/gulagcleaner/extract.py +++ b/gulagcleaner/extract.py @@ -21,14 +21,14 @@ def find_iobj_pairs(first_page, second_page): else: return (first_page,first_page.index(comunes[1]),first_page.index(comunes[0])) -def clean_pdf(pdf_path, output_path="", method="new"): +def clean_pdf(pdf_path, output_path="", method="auto"): """ De-embeds the PDF file and creates a new PDF file in the same folder with each embedded page in a new page. Args: pdf_path (str): The path to the PDF file. output_path (str): The path to the output PDF file. - method (str, optional): Defines what strategy will be used to clean the pdf file. Can be "new", "old" or "naive". + method (str, optional): Defines what strategy will be used to clean the pdf file. Can be "new", "old", "naive" or "auto". Default is "new". Returns: @@ -92,6 +92,37 @@ def clean_pdf(pdf_path, output_path="", method="new"): elif method=="naive": #Not yet implemented. 
# Pixel (height, width) signatures of the images that mark each kind of
# injected content. The values are empirical: they are the sizes observed in
# processed files, so an unseen size simply classifies the page as "unknown".
_LOGO_DIMS = frozenset({(71, 390), (37, 203), (73, 390)})
_HORIZONTAL_BANNER_DIMS = frozenset({
    (247, 1414), (213, 1219), (215, 1219), (249, 1414), (217, 1240),
})
_VERTICAL_BANNER_DIMS = frozenset({
    (1753, 170), (1518, 248), (1520, 147), (1753, 177), (1751, 171), (1537, 147),
})
_FULL_PAGE_DIMS = frozenset({
    (842, 595), (1754, 1240), (2526, 1785), (1733, 1219), (3508, 2480), (2339, 1653),
})


def auto_clean_pdf(pdf, pdf_path, output_path):
    """
    Picks a cleaning strategy by inspecting the PDF and delegates to clean_pdf.

    The checks run in order: "new", then "old", and finally "naive" as the
    fallback when neither of the first two layouts is recognised.

    Args:
        pdf: The already opened PDF whose ``pages`` are inspected.
        pdf_path (str): The path to the PDF file, forwarded to clean_pdf.
        output_path (str): The path to the output PDF file, forwarded to clean_pdf.

    Returns:
        Whatever clean_pdf returns for the selected method.
    """
    # Test for the new method: the first two pages that carry more than one
    # content entry must share more than one of those entries.
    multi_content_pages = []
    for page in pdf.pages:
        # A missing Contents entry is treated as "no content" instead of
        # raising (assumes missing entries read as None -- TODO confirm).
        content_ids = [content.indirect for content in (page.Contents or ())]
        if len(content_ids) > 1:
            multi_content_pages.append(content_ids)

    # Two such pages are required for the comparison; checking only for
    # "at least one" would raise IndexError on the [1] lookup below.
    if len(multi_content_pages) > 1:
        shared = set(multi_content_pages[0]).intersection(multi_content_pages[1])
        if len(shared) > 1:
            return clean_pdf(pdf_path, output_path, method="new")

    # Test for the old method: any XObject whose name mentions
    # "EmbeddedPdfPage" is enough to select it.
    for page in pdf.pages:
        resources = page.Resources
        xobjects = resources.XObject if resources is not None else None
        if any("EmbeddedPdfPage" in str(name) for name in (xobjects or ())):
            return clean_pdf(pdf_path, output_path, method="old")

    # We don't know what method to use, so we use the naive one.
    return clean_pdf(pdf_path, output_path, method="naive")


def _classify_image_dims(dims):
    """Maps a collection of (height, width) int tuples to a page type string."""
    has_logo = not _LOGO_DIMS.isdisjoint(dims)
    has_horizontal_banner = not _HORIZONTAL_BANNER_DIMS.isdisjoint(dims)
    has_vertical_banner = not _VERTICAL_BANNER_DIMS.isdisjoint(dims)
    has_full_page = not _FULL_PAGE_DIMS.isdisjoint(dims)

    # Order matters: banners win over a full-page image, which wins over a logo.
    if has_horizontal_banner and has_vertical_banner:
        return "banner_ads"
    if has_full_page:
        return "full_page_ads"
    if has_logo:
        return "watermark"
    return "unknown"


def get_page_type(page):
    """
    Classifies a page by the pixel dimensions of the images it contains.

    Args:
        page: The PDF page to inspect.

    Returns:
        str: "banner_ads" when both a horizontal and a vertical banner image
        are present, "full_page_ads" when a full-page image is present,
        "watermark" when only a logo image is present, "unknown" otherwise.
    """
    dims = set()
    for image in find_objects(page, valid_subtypes=(PdfName.Image, PdfName.Dummy)):
        # The logo-stripping code in clean_pdf converts Height/Width with
        # int() before comparing against these tables, so the same conversion
        # is applied here; comparing the raw values presumably never matches.
        try:
            dims.add((int(image.Height), int(image.Width)))
        except (TypeError, ValueError):
            # Image without usable dimensions: it cannot match any signature.
            continue

    return _classify_image_dims(dims)