Skip to content

Commit

Permalink
Merge pull request #33 from gongchandang49/main
Browse files Browse the repository at this point in the history
Update page_type.rs
  • Loading branch information
YM162 authored Dec 6, 2024
2 parents fd5da46 + 5e126b3 commit c63bb28
Showing 1 changed file with 51 additions and 77 deletions.
128 changes: 51 additions & 77 deletions gulagcleaner_rs/src/models/page_type.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
use std::{collections::HashSet, error::Error};

use lopdf::{Document, ObjectId};

use super::method::{get_images, get_xobjs};

#[derive(Default)]
#[derive(Debug)]

#[derive(Default, Debug)]
/// Represents the different page types recognized by the Gulag Cleaner application.
pub enum PageType {
BannerAds,
Expand All @@ -14,30 +13,32 @@ pub enum PageType {
#[default]
Idk,
}

pub const LOGO_DIMS: [(i64, i64); 9] = [(71, 390), (37, 203), (73, 390),(23,130),(24,130),(19,109),(20,109),(72,391),(24,129)];

const HORIZONTAL_BANNER_DIMS: [(i64, i64); 11] = [
/// Known image dimensions used to detect the logo on a page.
pub const LOGO_DIMS: [(i64, i64); 9] = [
    (71, 390),
    (37, 203),
    (73, 390),
    (23, 130),
    (24, 130),
    (19, 109),
    (20, 109),
    (72, 391),
    (24, 129),
];
/// Known image dimensions used to detect horizontal banner ads.
const HORIZONTAL_BANNER_DIMS: [(i64, i64); 12] = [
    (247, 1414),
    (213, 1219),
    (215, 1219),
    (249, 1414),
    (217, 1240),
    (147, 1757),
    (148, 1769),
    (221, 1240),
    (136, 780),
    (137, 780),
    (218, 1241),
    (218, 1246),
];
const VERTICAL_BANNER_DIMS: [(i64, i64); 12] = [
const VERTICAL_BANNER_DIMS: [(i64, i64); 13] = [
(1753, 170),
(1518, 248),
(1520, 147),
(1753, 177),
(1751, 171),
(1537, 147),
(1093, 217),
(1094, 217),
(1534, 150),
(970, 92),
(969,93),
Expand All @@ -53,68 +54,44 @@ const FULL_PAGE_DIMS: [(i64, i64); 10] = [
(2339, 1653),
(1785, 2526),
(1109, 782),
(1109,784),
(1759,1241)
(1109, 784),
(1759, 1241),
];


/// Check if dimension matches any dimension from a list within a tolerance of 10 units.
///
/// Returns `true` when at least one image dimension pair is within ±10 units
/// (on both axes simultaneously) of some reference pair in `dims`.
fn matches_with_tolerance(dims: &[(i64, i64)], images: &HashSet<(i64, i64)>) -> bool {
    const TOLERANCE: i64 = 10;
    // Iterate the image set and test each pair against every reference
    // dimension; both coordinates must fall inside the tolerance band.
    images.iter().any(|&(img_w, img_h)| {
        dims.iter()
            .any(|&(w, h)| (img_w - w).abs() <= TOLERANCE && (img_h - h).abs() <= TOLERANCE)
    })
}

impl PageType {
/// Get the type of a page based on its content.
///
/// This function takes a document and a page ID as input and returns the type of the page.
/// The page type is determined by analyzing the images present in the page.
/// It checks for the presence of specific image dimensions to identify different types of pages,
/// such as banner ads, full-page ads, watermarks, or unknown types.
///
/// # Arguments
///
/// * `doc` - A reference to the document containing the page.
/// * `page` - A reference to the ID of the page.
///
/// # Returns
///
/// A `Result` containing the `PageType` of the page if successful, or a `Box<dyn Error>` if an error occurs.
pub fn get_page_type(doc: &Document, page: &ObjectId) -> Result<PageType, Box<dyn Error>> {
let xobjs = get_xobjs(doc, page)?;
let images = get_images(doc, xobjs)?;
println!("{:?}", images);
// let has_logo = !LOGO_DIMS
// .iter()
// .collect::<HashSet<_>>()
// .intersection(&images.iter().collect::<HashSet<_>>())
// .collect::<Vec<_>>()
// .is_empty();

let has_horizontal_banner = !HORIZONTAL_BANNER_DIMS
.iter()
.collect::<HashSet<_>>()
.intersection(&images.iter().collect::<HashSet<_>>())
.collect::<Vec<_>>()
.is_empty();

let has_vertical_banner = !VERTICAL_BANNER_DIMS
.iter()
.collect::<HashSet<_>>()
.intersection(&images.iter().collect::<HashSet<_>>())
.collect::<Vec<_>>()
.is_empty();

let has_full_page = !FULL_PAGE_DIMS
.iter()
.collect::<HashSet<_>>()
.intersection(&images.iter().collect::<HashSet<_>>())
.collect::<Vec<_>>()
.is_empty();

let image_set: HashSet<(i64, i64)> = images.into_iter().collect();

let has_horizontal_banner = matches_with_tolerance(&HORIZONTAL_BANNER_DIMS, &image_set);
let has_vertical_banner = matches_with_tolerance(&VERTICAL_BANNER_DIMS, &image_set);
let has_full_page = matches_with_tolerance(&FULL_PAGE_DIMS, &image_set);

if has_horizontal_banner && has_vertical_banner {
Ok(PageType::BannerAds)
} else if has_full_page {
Ok(PageType::FullPageAds)
} else {
let annots = doc.get_page_annotations(*page)?;

let wuolah_annot = annots.iter().filter(|x| is_annots_wuolah(x,doc)).filter( |x| x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Integer(0) || x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Real(0.0));
// For each wuolah annot, check if substrings are present in any of the URI
println!("{:?}", wuolah_annot);

let wuolah_annot = annots
.iter()
.filter(|x| is_annots_wuolah(x, doc))
.filter(|x| {
x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Integer(0)
|| x.get(b"Rect").unwrap().as_array().unwrap()[0] == lopdf::Object::Real(0.0)
});
let mut bannercounter = 0;
let mut hasfooter = false;
for annot in wuolah_annot {
Expand All @@ -134,35 +111,32 @@ impl PageType {
}
}
}
println!("{}", bannercounter);
if bannercounter == 1 {
return Ok(PageType::Watermark)
return Ok(PageType::Watermark);
}
if bannercounter > 1 {
return Ok(PageType::BannerAds)
}
return Ok(PageType::BannerAds);
}
if hasfooter {
return Ok(PageType::Watermark)
return Ok(PageType::Watermark);
}
Ok(PageType::Idk)
}
}
}

fn is_annots_wuolah(annot: &&&lopdf::Dictionary, doc: &lopdf::Document) -> bool {
match annot.get(b"A") {
Ok(x) => {
match doc.dereference(x).unwrap().1.as_dict().unwrap().get(b"URI") {
Ok(y) => {
let url = doc.dereference(y).unwrap().1.as_string().unwrap();
if url.contains("track.wlh.es"){
!(url.contains("apuntes"))
} else {
false
}
},
Err(_) => false,
Ok(x) => match doc.dereference(x).unwrap().1.as_dict().unwrap().get(b"URI") {
Ok(y) => {
let url = doc.dereference(y).unwrap().1.as_string().unwrap();
if url.contains("track.wlh.es") {
!(url.contains("apuntes"))
} else {
false
}
}
Err(_) => false,
},
Err(_) => false,
}
Expand Down

0 comments on commit c63bb28

Please sign in to comment.