From 9d1374c34256b6e1d9ebbd565b7891510d753221 Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Mon, 26 Dec 2022 16:01:35 +0000 Subject: [PATCH] 0.4.5: highest priority for alt containing site_name --- Cargo.lock | 37 ++++++++++++++++++++++++++++++++++++- Cargo.toml | 3 ++- src/icons.rs | 41 ++++++++++++++++++++++++++++++++++------- 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2b22014..6974a7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1638,7 +1638,7 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "site_icons" -version = "0.4.4" +version = "0.4.5" dependencies = [ "byteorder", "clap", @@ -1658,6 +1658,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "tldextract", "tokio", "url", "vec1", @@ -1783,6 +1784,26 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" +[[package]] +name = "thiserror" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "time" version = "0.3.17" @@ -1825,6 +1846,20 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" +[[package]] +name = "tldextract" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec03259a0567ad58eed30812bc3e5eda8030f154abc70317ab57b14f00699ca4" +dependencies = [ + "idna 0.2.3", + "log", + "regex", + "serde_json", + "thiserror", + "url", +] + [[package]] name = "tokio" version = "1.22.0" diff --git a/Cargo.toml b/Cargo.toml index f4a2ed1..5dd64a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "site_icons" -version = "0.4.4" +version = "0.4.5" authors = ["Sam Denty "] edition = "2018" license = "GPL-3.0" @@ -34,6 +34,7 @@ mime = { package = "mime_4", version = "0.4.0-a.0" } serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" futures = "0.3.25" +tldextract = "0.6.0" [target.'cfg(target_arch = "wasm32")'.dependencies] reqwest = { package = "reqwest-wasm", version = "0.11.15", features = [ diff --git a/src/icons.rs b/src/icons.rs index f0ed689..d3812ba 100644 --- a/src/icons.rs +++ b/src/icons.rs @@ -10,8 +10,10 @@ use reqwest::{header::*, IntoUrl}; use scraper::{ElementRef, Html}; use serde::Deserialize; use std::convert::TryInto; +use std::iter; use std::task::Poll; use std::{collections::HashMap, error::Error, pin::Pin, task::Context}; +use tldextract::TldOption; use url::Url; pub struct Icons { @@ -227,14 +229,25 @@ impl Icons { weight += 1; } - let mentions_logo = |attr_name| { - ancestors.iter().any(|ancestor| { - ancestor - .attr(attr_name) - .map(|attr| regex!("logo([^s]|$)").is_match(&attr.to_lowercase())) - .unwrap_or(false) - }) + let mentions = |attr_name, is_match: Box bool>| { + ancestors + .iter() + .chain(iter::once(&elem_ref.value())) + .any(|ancestor| { + ancestor + .attr(attr_name) + .map(|attr| is_match(&attr.to_lowercase())) + .unwrap_or(false) + }) }; + + let mentions_logo = |attr_name| { + mentions( + attr_name, + Box::new(|attr| regex!("logo([^s]|$)").is_match(attr)), + ) + }; + if mentions_logo("class") || mentions_logo("id") { weight += 3; } @@ -245,6 +258,20 @@ impl Icons { weight += 1; } + if let Some(site_name) = url + .domain() + .and_then(|domain| TldOption::default().build().extract(domain).unwrap().domain) + { + // if the alt contains the site_name then highest priority + if site_name + .to_lowercase() + .split('-') + .any(|segment| mentions("alt", Box::new(move |attr| attr.contains(segment)))) + { + weight += 10; + } + } + Some((elem_ref, weight)) }) .collect();