From 5e126b308afd3f250ad6a36831a65d2dd492bd77 Mon Sep 17 00:00:00 2001 From: gongchandang49 <170948611+gongchandang49@users.noreply.github.com> Date: Sun, 1 Dec 2024 20:36:11 +0100 Subject: [PATCH] =?UTF-8?q?Add=20tolerance=20=C2=B7=20page=5Ftype.rs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gulagcleaner_rs/src/models/page_type.rs | 122 +++++++++--------------- 1 file changed, 47 insertions(+), 75 deletions(-) diff --git a/gulagcleaner_rs/src/models/page_type.rs b/gulagcleaner_rs/src/models/page_type.rs index ee05941..106ab0f 100644 --- a/gulagcleaner_rs/src/models/page_type.rs +++ b/gulagcleaner_rs/src/models/page_type.rs @@ -1,11 +1,10 @@ use std::{collections::HashSet, error::Error}; - + use lopdf::{Document, ObjectId}; - + use super::method::{get_images, get_xobjs}; - -#[derive(Default)] -#[derive(Debug)] + +#[derive(Default, Debug)] /// Represents the different methods used in the Gulag Cleaner application. pub enum PageType { BannerAds, @@ -14,9 +13,9 @@ pub enum PageType { #[default] Idk, } - -pub const LOGO_DIMS: [(i64, i64); 9] = [(71, 390), (37, 203), (73, 390),(23,130),(24,130),(19,109),(20,109),(72,391),(24,129)]; - + +pub const LOGO_DIMS: [(i64, i64); 9] = [(71, 390), (37, 203), (73, 390), (23, 130), (24, 130), (19, 109), (20, 109), (72, 391), (24, 129)]; + const HORIZONTAL_BANNER_DIMS: [(i64, i64); 12] = [ (247, 1414), (213, 1219), @@ -55,68 +54,44 @@ const FULL_PAGE_DIMS: [(i64, i64); 10] = [ (2339, 1653), (1785, 2526), (1109, 782), - (1109,784), - (1759,1241) + (1109, 784), + (1759, 1241), ]; - + +/// Check if dimension matches any dimension from a list within a tolerance of 10 units. +fn matches_with_tolerance(dims: &[(i64, i64)], images: &HashSet<(i64, i64)>) -> bool { + dims.iter().any(|&(w, h)| { + images.iter().any(|&(iw, ih)| { + (iw >= w - 10 && iw <= w + 10) && (ih >= h - 10 && ih <= h + 10) + }) + }) +} + impl PageType { /// Get the type of a page based on its content. - /// - /// This function takes a document and a page ID as input and returns the type of the page. - /// The page type is determined by analyzing the images present in the page. - /// It checks for the presence of specific image dimensions to identify different types of pages, - /// such as banner ads, full-page ads, watermarks, or unknown types. - /// - /// # Arguments - /// - /// * `doc` - A reference to the document containing the page. - /// * `page` - A reference to the ID of the page. - /// - /// # Returns - /// - /// A `Result` containing the `PageType` of the page if successful, or a `Box` if an error occurs. pub fn get_page_type(doc: &Document, page: &ObjectId) -> Result> { let xobjs = get_xobjs(doc, page)?; let images = get_images(doc, xobjs)?; - println!("{:?}", images); - // let has_logo = !LOGO_DIMS - // .iter() - // .collect::>() - // .intersection(&images.iter().collect::>()) - // .collect::>() - // .is_empty(); - - let has_horizontal_banner = !HORIZONTAL_BANNER_DIMS - .iter() - .collect::>() - .intersection(&images.iter().collect::>()) - .collect::>() - .is_empty(); - - let has_vertical_banner = !VERTICAL_BANNER_DIMS - .iter() - .collect::>() - .intersection(&images.iter().collect::>()) - .collect::>() - .is_empty(); - - let has_full_page = !FULL_PAGE_DIMS - .iter() - .collect::>() - .intersection(&images.iter().collect::>()) - .collect::>() - .is_empty(); - + let image_set: HashSet<(i64, i64)> = images.into_iter().collect(); + + let has_horizontal_banner = matches_with_tolerance(&HORIZONTAL_BANNER_DIMS, &image_set); + let has_vertical_banner = matches_with_tolerance(&VERTICAL_BANNER_DIMS, &image_set); + let has_full_page = matches_with_tolerance(&FULL_PAGE_DIMS, &image_set); + if has_horizontal_banner && has_vertical_banner { Ok(PageType::BannerAds) } else if has_full_page { Ok(PageType::FullPageAds) } else { let annots = doc.get_page_annotations(*page)?; - - let wuolah_annot = annots.iter().filter(|x| is_annots_wuolah(x,doc)).filter( |x| x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Integer(0) || x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Real(0.0)); - // For each wuolah annot, check if substrings are present in any of the URI - println!("{:?}", wuolah_annot); + + let wuolah_annot = annots + .iter() + .filter(|x| is_annots_wuolah(x, doc)) + .filter(|x| { + x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Integer(0) + || x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Real(0.0) + }); let mut bannercounter = 0; let mut hasfooter = false; for annot in wuolah_annot { @@ -136,35 +111,32 @@ impl PageType { } } } - println!("{}", bannercounter); if bannercounter == 1 { - return Ok(PageType::Watermark) + return Ok(PageType::Watermark); } if bannercounter > 1 { - return Ok(PageType::BannerAds) - } + return Ok(PageType::BannerAds); + } if hasfooter { - return Ok(PageType::Watermark) + return Ok(PageType::Watermark); } Ok(PageType::Idk) } } } - + fn is_annots_wuolah(annot: &&&lopdf::Dictionary, doc: &lopdf::Document) -> bool { match annot.get(b"A") { - Ok(x) => { - match doc.dereference(x).unwrap().1.as_dict().unwrap().get(b"URI") { - Ok(y) => { - let url = doc.dereference(y).unwrap().1.as_string().unwrap(); - if url.contains("track.wlh.es"){ - !(url.contains("apuntes")) - } else { - false - } - }, - Err(_) => false, + Ok(x) => match doc.dereference(x).unwrap().1.as_dict().unwrap().get(b"URI") { + Ok(y) => { + let url = doc.dereference(y).unwrap().1.as_string().unwrap(); + if url.contains("track.wlh.es") { + !(url.contains("apuntes")) + } else { + false + } } + Err(_) => false, }, Err(_) => false, }