From 5e126b308afd3f250ad6a36831a65d2dd492bd77 Mon Sep 17 00:00:00 2001
From: gongchandang49 <170948611+gongchandang49@users.noreply.github.com>
Date: Sun, 1 Dec 2024 20:36:11 +0100
Subject: [PATCH] =?UTF-8?q?Add=20tolerance=20=C2=B7=20page=5Ftype.rs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 gulagcleaner_rs/src/models/page_type.rs | 122 +++++++++---------------
 1 file changed, 47 insertions(+), 75 deletions(-)
diff --git a/gulagcleaner_rs/src/models/page_type.rs b/gulagcleaner_rs/src/models/page_type.rs
index ee05941..106ab0f 100644
--- a/gulagcleaner_rs/src/models/page_type.rs
+++ b/gulagcleaner_rs/src/models/page_type.rs
@@ -1,11 +1,10 @@
 use std::{collections::HashSet, error::Error};
-
+ 
 use lopdf::{Document, ObjectId};
-
+ 
 use super::method::{get_images, get_xobjs};
-
-#[derive(Default)]
-#[derive(Debug)]
+ 
+#[derive(Default, Debug)]
 /// Represents the different methods used in the Gulag Cleaner application.
 pub enum PageType {
     BannerAds,
@@ -14,9 +13,9 @@ pub enum PageType {
     #[default]
     Idk,
 }
-
-pub const LOGO_DIMS: [(i64, i64); 9] = [(71, 390), (37, 203), (73, 390),(23,130),(24,130),(19,109),(20,109),(72,391),(24,129)];
-
+ 
+pub const LOGO_DIMS: [(i64, i64); 9] = [(71, 390), (37, 203), (73, 390), (23, 130), (24, 130), (19, 109), (20, 109), (72, 391), (24, 129)];
+ 
 const HORIZONTAL_BANNER_DIMS: [(i64, i64); 12] = [
     (247, 1414),
     (213, 1219),
@@ -55,68 +54,44 @@ const FULL_PAGE_DIMS: [(i64, i64); 10] = [
     (2339, 1653),
     (1785, 2526),
     (1109, 782),
-    (1109,784),
-    (1759,1241)
+    (1109, 784),
+    (1759, 1241),
 ];
-
+ 
+/// Returns `true` if any size in `images` is within 10 units, on both axes, of a size in `dims`.
+fn matches_with_tolerance(dims: &[(i64, i64)], images: &HashSet<(i64, i64)>) -> bool {
+    // `abs_diff` yields a `u64`, so the comparison cannot overflow for any `i64` input.
+    const TOLERANCE: u64 = 10;
+    dims.iter().any(|&(w, h)| {
+        images.iter().any(|&(iw, ih)| iw.abs_diff(w) <= TOLERANCE && ih.abs_diff(h) <= TOLERANCE)
+    })
+}
+ 
 impl PageType {
     /// Get the type of a page based on its content.
-    ///
-    /// This function takes a document and a page ID as input and returns the type of the page.
-    /// The page type is determined by analyzing the images present in the page.
-    /// It checks for the presence of specific image dimensions to identify different types of pages,
-    /// such as banner ads, full-page ads, watermarks, or unknown types.
-    ///
-    /// # Arguments
-    ///
-    /// * `doc` - A reference to the document containing the page.
-    /// * `page` - A reference to the ID of the page.
-    ///
-    /// # Returns
-    ///
-    /// A `Result` containing the `PageType` of the page if successful, or a `Box<dyn Error>` if an error occurs.
     pub fn get_page_type(doc: &Document, page: &ObjectId) -> Result<PageType, Box<dyn Error>> {
         let xobjs = get_xobjs(doc, page)?;
         let images = get_images(doc, xobjs)?;
-        println!("{:?}", images);
-        // let has_logo = !LOGO_DIMS
-        //     .iter()
-        //     .collect::<HashSet<_>>()
-        //     .intersection(&images.iter().collect::<HashSet<_>>())
-        //     .collect::<Vec<_>>()
-        //     .is_empty();
-
-        let has_horizontal_banner = !HORIZONTAL_BANNER_DIMS
-            .iter()
-            .collect::<HashSet<_>>()
-            .intersection(&images.iter().collect::<HashSet<_>>())
-            .collect::<Vec<_>>()
-            .is_empty();
-
-        let has_vertical_banner = !VERTICAL_BANNER_DIMS
-            .iter()
-            .collect::<HashSet<_>>()
-            .intersection(&images.iter().collect::<HashSet<_>>())
-            .collect::<Vec<_>>()
-            .is_empty();
-
-        let has_full_page = !FULL_PAGE_DIMS
-            .iter()
-            .collect::<HashSet<_>>()
-            .intersection(&images.iter().collect::<HashSet<_>>())
-            .collect::<Vec<_>>()
-            .is_empty();
-
+        let image_set: HashSet<(i64, i64)> = images.into_iter().collect();
+ 
+        let has_horizontal_banner = matches_with_tolerance(&HORIZONTAL_BANNER_DIMS, &image_set);
+        let has_vertical_banner = matches_with_tolerance(&VERTICAL_BANNER_DIMS, &image_set);
+        let has_full_page = matches_with_tolerance(&FULL_PAGE_DIMS, &image_set);
+ 
         if has_horizontal_banner && has_vertical_banner {
             Ok(PageType::BannerAds)
         } else if has_full_page {
             Ok(PageType::FullPageAds)
         } else {
             let annots = doc.get_page_annotations(*page)?;
-            
-            let wuolah_annot =  annots.iter().filter(|x| is_annots_wuolah(x,doc)).filter( |x| x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Integer(0) || x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Real(0.0));
-            // For each wuolah annot, check if substrings are present in any of the URI
-            println!("{:?}", wuolah_annot);
+ 
+            let wuolah_annot = annots
+                .iter()
+                .filter(|x| is_annots_wuolah(x, doc))
+                .filter(|x| {
+                    x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Integer(0)
+                        || x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Real(0.0)
+                });
             let mut bannercounter = 0;
             let mut hasfooter = false;
             for annot in wuolah_annot {
@@ -136,35 +111,32 @@ impl PageType {
                     }
                 }
             }
-            println!("{}", bannercounter);
             if bannercounter == 1 {
-                return Ok(PageType::Watermark)
+                return Ok(PageType::Watermark);
             }
             if bannercounter > 1 {
-                return Ok(PageType::BannerAds)
-            } 
+                return Ok(PageType::BannerAds);
+            }
             if hasfooter {
-                return Ok(PageType::Watermark)
+                return Ok(PageType::Watermark);
             }
             Ok(PageType::Idk)
         }
     }
 }
-
+ 
 fn is_annots_wuolah(annot: &&&lopdf::Dictionary, doc: &lopdf::Document) -> bool {
     match annot.get(b"A") {
-        Ok(x) => {
-            match doc.dereference(x).unwrap().1.as_dict().unwrap().get(b"URI") {
-                Ok(y) => {
-                    let url = doc.dereference(y).unwrap().1.as_string().unwrap();
-                    if url.contains("track.wlh.es"){
-                        !(url.contains("apuntes"))
-                    } else {
-                        false
-                    }
-                },
-                Err(_) => false,
+        Ok(x) => match doc.dereference(x).unwrap().1.as_dict().unwrap().get(b"URI") {
+            Ok(y) => {
+                let url = doc.dereference(y).unwrap().1.as_string().unwrap();
+                if url.contains("track.wlh.es") {
+                    !(url.contains("apuntes"))
+                } else {
+                    false
+                }
             }
+            Err(_) => false,
         },
         Err(_) => false,
     }