From 34972db18bc4db2344ab89118a41fed918ee31c1 Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Tue, 3 Jan 2023 14:21:54 +0000 Subject: [PATCH] feat(0.6): performance --- Cargo.lock | 115 +++++- Cargo.toml | 8 +- src/bin/site-icons.rs | 15 +- src/html_parser/head.rs | 122 ++++++ src/html_parser/mod.rs | 5 + src/html_parser/site_logo.rs | 157 ++++++++ src/icon.rs | 61 --- src/{ => icon}/icon_info.rs | 43 +- src/{ => icon}/icon_size/gif.rs | 0 src/{ => icon}/icon_size/ico.rs | 0 src/{ => icon}/icon_size/icon_sizes.rs | 36 +- src/{ => icon}/icon_size/jpeg.rs | 0 src/{ => icon}/icon_size/mod.rs | 0 src/{ => icon}/icon_size/png.rs | 0 src/icon/icon_size/svg.rs | 68 ++++ src/icon/mod.rs | 123 ++++++ src/icon_size/svg.rs | 84 ---- src/icons.rs | 531 ++++++++----------------- src/lib.rs | 45 +-- src/manifest.rs | 54 +++ src/utils/background_poll.rs | 43 ++ src/{ => utils}/macros.rs | 12 +- src/utils/mod.rs | 16 + src/{utils.rs => utils/svg_encoder.rs} | 0 24 files changed, 974 insertions(+), 564 deletions(-) create mode 100644 src/html_parser/head.rs create mode 100644 src/html_parser/mod.rs create mode 100644 src/html_parser/site_logo.rs delete mode 100644 src/icon.rs rename src/{ => icon}/icon_info.rs (86%) rename src/{ => icon}/icon_size/gif.rs (100%) rename src/{ => icon}/icon_size/ico.rs (100%) rename src/{ => icon}/icon_size/icon_sizes.rs (79%) rename src/{ => icon}/icon_size/jpeg.rs (100%) rename src/{ => icon}/icon_size/mod.rs (100%) rename src/{ => icon}/icon_size/png.rs (100%) create mode 100644 src/icon/icon_size/svg.rs create mode 100644 src/icon/mod.rs delete mode 100644 src/icon_size/svg.rs create mode 100644 src/manifest.rs create mode 100644 src/utils/background_poll.rs rename src/{ => utils}/macros.rs (72%) create mode 100644 src/utils/mod.rs rename src/{utils.rs => utils/svg_encoder.rs} (100%) diff --git a/Cargo.lock b/Cargo.lock index 20ae384..ddb16f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -31,6 +31,23 @@ dependencies = [ "libc", ] +[[package]] +name = "async-trait" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d1d8ab452a3936018a687b20e6f7cf5363d713b732b8884001317b0e48aa3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async_once" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ce4f10ea3abcd6617873bae9f91d1c5332b4a778bd9ce34d0cd517474c1de82" + [[package]] name = "atty" version = "0.2.14" @@ -78,6 +95,43 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfb24e866b15a1af2a1b663f10c6b6b8f397a84aadb828f12e5b289ec23a3a3c" +[[package]] +name = "cached" +version = "0.41.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec6d20b3d24b6c74e2c5331d2d3d8d1976a9883c7da179aa851afa4c90d62e36" +dependencies = [ + "async-trait", + "async_once", + "cached_proc_macro", + "cached_proc_macro_types", + "futures", + "hashbrown", + "instant", + "lazy_static", + "once_cell", + "thiserror", + "tokio", +] + +[[package]] +name = "cached_proc_macro" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "751f7f4e7a091545e7f6c65bacc404eaee7e87bfb1f9ece234a1caa173dc16f2" +dependencies = [ + "cached_proc_macro_types", + "darling 0.13.4", + "quote", + "syn", +] + +[[package]] +name = "cached_proc_macro_types" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a4f925191b4367301851c6d99b09890311d74b0d43f274c0b34c86d308a3663" + [[package]] name = "cc" version = "1.0.77" @@ -272,14 +326,38 @@ dependencies = [ "syn", ] +[[package]] +name = "darling" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" +dependencies = [ + "darling_core 0.13.4", + "darling_macro 0.13.4", +] + [[package]] name = "darling" version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.14.2", + "darling_macro 0.14.2", +] + +[[package]] +name = "darling_core" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", ] [[package]] @@ -296,13 +374,24 @@ dependencies = [ "syn", ] +[[package]] +name = "darling_macro" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" +dependencies = [ + "darling_core 0.13.4", + "quote", + "syn", +] + [[package]] name = "darling_macro" version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" dependencies = [ - "darling_core", + "darling_core 0.14.2", "quote", "syn", ] @@ -384,6 +473,17 @@ dependencies = [ "instant", ] +[[package]] +name = "flo_stream" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a7246db09b6a924fb11fedc1e33c34e6d5d0ba3c95a87cd2994f9581cf5a470" +dependencies = [ + "futures", + "lazy_static", + "smallvec", +] + [[package]] name = "fnv" version = "1.0.7" @@ -762,6 +862,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", ] [[package]] @@ -1650,7 +1753,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa" dependencies = [ - "darling", + "darling 0.14.2", "proc-macro2", "quote", "syn", @@ -1683,12 +1786,14 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "site_icons" -version = "0.5.0" +version = "0.6.0" dependencies = [ "byteorder", + "cached", "clap", "data-url", "env_logger", + "flo_stream", "futures", "html5ever", "itertools", diff --git a/Cargo.toml b/Cargo.toml index f910e68..a0ff3b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "site_icons" -version = "0.5.0" +version = "0.6.0" authors = ["Sam Denty "] edition = "2018" license = "GPL-3.0" @@ -19,6 +19,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] vec1 = { version = "1.10.1", features = ["serde"] } +flo_stream = "0.7" itertools = "0.10.5" serde_with = "2.1.0" html5ever = "0.26.0" @@ -44,11 +45,16 @@ reqwest = { package = "reqwest-wasm", version = "0.11.16", features = [ "blocking", "stream", ] } +cached = { version = "0.41.0", default_features = false, features = [ + "proc_macro", + "wasm", +] } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] clap = { version = "3.2.23", features = ["derive"] } tokio = { version = "1.22.0", features = ["full"] } env_logger = "0.9.3" +cached = "0.41.0" reqwest = { version = "0.11.13", features = [ "json", "cookies", diff --git a/src/bin/site-icons.rs b/src/bin/site-icons.rs index a733971..2d373c5 100644 --- a/src/bin/site-icons.rs +++ b/src/bin/site-icons.rs @@ -1,12 +1,15 @@ use clap::Parser; use env_logger::Builder; use log::LevelFilter; -use site_icons::Icons; +use site_icons::SiteIcons; use std::error::Error; #[derive(Parser)] struct Opts { - urls: Vec, + url: String, + + #[clap(long)] + fast: bool, #[clap(long)] json: bool, #[clap(long)] @@ -16,7 +19,7 @@ struct Opts { #[tokio::main] async fn main() -> Result<(), Box> { - let mut icons = Icons::new(); + let mut icons = SiteIcons::new(); let opts: Opts = Opts::parse(); if opts.debug { @@ -25,11 +28,7 @@ async fn main() -> Result<(), Box> { builder.init(); } - for url in opts.urls { - icons.load_website(&url).await?; - } - - let entries = icons.entries().await; + let entries = icons.load_website(opts.url, opts.fast).await?; if opts.json { println!("{}", serde_json::to_string_pretty(&entries)?) diff --git a/src/html_parser/head.rs b/src/html_parser/head.rs new file mode 100644 index 0000000..ea69367 --- /dev/null +++ b/src/html_parser/head.rs @@ -0,0 +1,122 @@ +use crate::utils::poll_in_background; +use crate::Icon; +use crate::IconKind; +use crate::SiteIcons; +use futures::future::join_all; +use futures::FutureExt; +use futures::Stream; +use futures::StreamExt; +use lol_html::{element, errors::RewritingError, HtmlRewriter, Settings}; +use std::{ + cell::RefCell, + error::Error, + fmt::{self, Display}, +}; +use url::Url; + +#[derive(Debug)] +struct EndOfHead {} + +impl Display for EndOfHead { + fn fmt(&self, _: &mut fmt::Formatter<'_>) -> fmt::Result { + Ok(()) + } +} + +impl Error for EndOfHead {} + +pub async fn parse_head( + url: &Url, + mut body: impl Stream, String>> + Unpin, +) -> Result, Box> { + let mut icons = Vec::new(); + let new_icons = RefCell::new(Vec::new()); + + { + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![ + element!("head", |head| { + head.on_end_tag(|_| Err(Box::new(EndOfHead {})))?; + Ok(()) + }), + element!("link[rel='manifest']", |manifest| { + if let Some(href) = manifest + .get_attribute("href") + .and_then(|href| url.join(&href).ok()) + { + new_icons.borrow_mut().push( + async { SiteIcons::load_manifest(href).await.unwrap_or(Vec::new()) } + .boxed_local() + .shared(), + ) + } + + Ok(()) + }), + element!( + join_with!( + ",", + "link[rel='icon']", + "link[rel='shortcut icon']", + "link[rel='apple-touch-icon']", + "link[rel='apple-touch-icon-precomposed']" + ), + |link| { + let rel = link.get_attribute("rel").unwrap(); + + if let Some(href) = link + .get_attribute("href") + .and_then(|href| url.join(&href).ok()) + { + let kind = if rel.contains("apple-touch-icon") { + IconKind::AppIcon + } else { + IconKind::SiteFavicon + }; + + let sizes = link.get_attribute("sizes"); + + new_icons.borrow_mut().push( + async { + Icon::load(href, kind, sizes) + .await + .map(|icon| vec![icon]) + .unwrap_or(Vec::new()) + } + .boxed_local() + .shared(), + ) + }; + + Ok(()) + } + ), + ], + ..Settings::default() + }, + |_: &[u8]| {}, + ); + + while let Some(data) = poll_in_background(body.next(), join_all(icons.clone())).await { + let result = rewriter.write(&data?); + + icons.extend(new_icons.borrow_mut().drain(..)); + + match result { + Err(RewritingError::ContentHandlerError(result)) => { + match result.downcast::() { + Ok(_) => break, + Err(err) => return Err(err), + }; + } + + result => result?, + } + } + } + + let icons = join_all(icons).await.into_iter().flatten().collect(); + + Ok(icons) +} diff --git a/src/html_parser/mod.rs b/src/html_parser/mod.rs new file mode 100644 index 0000000..641c102 --- /dev/null +++ b/src/html_parser/mod.rs @@ -0,0 +1,5 @@ +mod head; +mod site_logo; + +pub use head::*; +pub use site_logo::*; diff --git a/src/html_parser/site_logo.rs b/src/html_parser/site_logo.rs new file mode 100644 index 0000000..297c8aa --- /dev/null +++ b/src/html_parser/site_logo.rs @@ -0,0 +1,157 @@ +use crate::{utils::encode_svg, Icon, IconKind}; +use futures::{Stream, StreamExt}; +use html5ever::{ + driver, + tendril::{Tendril, TendrilSink}, +}; +use scraper::{ElementRef, Html}; +use std::error::Error; +use std::iter; +use tldextract::TldOption; +use url::Url; + +pub async fn parse_site_logo( + url: &Url, + mut body: impl Stream, String>> + Unpin, + is_blacklisted: impl Fn(&Url) -> bool, +) -> Result> { + let mut parser = driver::parse_document(Html::new_document(), Default::default()); + while let Some(data) = body.next().await { + if let Ok(data) = Tendril::try_from_byte_slice(&data?) { + parser.process(data) + } + } + + let document = parser.finish(); + + let mut logos: Vec<_> = document + .select(selector!( + "a[href='/'] img, a[href='/'] svg", + "header img, header svg", + "img[src*=logo]", + "img[alt*=logo], svg[alt*=logo]", + "*[class*=logo] img, *[class*=logo] svg", + "*[id*=logo] img, *[id*=logo] svg", + "img[class*=logo], svg[class*=logo]", + "img[id*=logo], svg[id*=logo]", + )) + .enumerate() + .filter_map(|(i, elem_ref)| { + let elem = elem_ref.value(); + let ancestors = elem_ref + .ancestors() + .map(ElementRef::wrap) + .flatten() + .map(|elem_ref| elem_ref.value()) + .collect::>(); + + let skip_classnames = regex!("menu|search"); + let should_skip = ancestors.iter().any(|ancestor| { + ancestor + .attr("class") + .map(|attr| skip_classnames.is_match(&attr.to_lowercase())) + .or_else(|| { + ancestor + .attr("id") + .map(|attr| skip_classnames.is_match(&attr.to_lowercase())) + }) + .unwrap_or(false) + }); + + if should_skip { + return None; + } + + let mut weight = 0; + + // if in the header + if ancestors.iter().any(|element| element.name() == "header") { + weight += 2; + } + + if i == 0 { + weight += 1; + } + + let mentions = |attr_name, is_match: Box bool>| { + ancestors.iter().chain(iter::once(&elem)).any(|ancestor| { + ancestor + .attr(attr_name) + .map(|attr| is_match(&attr.to_lowercase())) + .unwrap_or(false) + }) + }; + + if mentions("href", Box::new(|attr| attr == "/")) { + weight += 5; + }; + + let mentions_logo = |attr_name| { + mentions( + attr_name, + Box::new(|attr| regex!("logo([^s]|$)").is_match(attr)), + ) + }; + + if mentions_logo("class") || mentions_logo("id") { + weight += 3; + } + if mentions_logo("alt") { + weight += 2; + } + if mentions_logo("src") { + weight += 1; + } + + if let Some(site_name) = url + .domain() + .and_then(|domain| TldOption::default().build().extract(domain).unwrap().domain) + { + // if the alt contains the site_name then highest priority + if site_name + .to_lowercase() + .split('-') + .any(|segment| mentions("alt", Box::new(move |attr| attr.contains(segment)))) + { + weight += 10; + } + } + + let href = if elem.name() == "svg" { + Some(Url::parse(&encode_svg(&elem_ref.html())).unwrap()) + } else { + elem.attr("src").and_then(|href| url.join(&href).ok()) + }; + + if let Some(href) = &href { + if is_blacklisted(href) { + return None; + } + } + + href.map(|href| (href, elem_ref, weight)) + }) + .collect(); + + logos.sort_by(|(_, _, a_weight), (_, _, b_weight)| b_weight.cmp(a_weight)); + + // prefer over svg + let mut prev_weight = None; + for (href, elem_ref, weight) in &logos { + if let Some(prev_weight) = prev_weight { + if weight != prev_weight { + break; + } + } + prev_weight = Some(weight); + + if elem_ref.value().name() == "img" { + return Icon::load(href.clone(), IconKind::SiteLogo, None).await; + } + } + + match logos.into_iter().next() { + Some((href, _, _)) => Icon::load(href.clone(), IconKind::SiteLogo, None).await, + None => Err("No site logo found".into()), + } +} diff --git a/src/icon.rs b/src/icon.rs deleted file mode 100644 index e666361..0000000 --- a/src/icon.rs +++ /dev/null @@ -1,61 +0,0 @@ -use super::IconInfo; -use serde::{Deserialize, Serialize}; -use serde_with::{DeserializeFromStr, SerializeDisplay}; -use std::{ - cmp::Ordering, - collections::HashMap, - fmt::{self, Display}, - str::FromStr, -}; -use url::Url; - -#[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, SerializeDisplay, DeserializeFromStr)] -pub enum IconKind { - AppIcon, - SiteLogo, - SiteFavicon, -} - -impl Display for IconKind { - fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - f.write_str(match self { - IconKind::SiteLogo => "site_logo", - IconKind::AppIcon => "app_icon", - IconKind::SiteFavicon => "site_favicon", - }) - } -} - -impl FromStr for IconKind { - type Err = String; - - fn from_str(kind: &str) -> Result { - match kind { - "site_logo" => Ok(IconKind::SiteLogo), - "app_icon" => Ok(IconKind::AppIcon), - "site_favicon" => Ok(IconKind::SiteFavicon), - _ => Err("unknown icon kind!".into()), - } - } -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] -pub struct Icon { - pub url: Url, - pub headers: HashMap, - pub kind: IconKind, - #[serde(flatten)] - pub info: IconInfo, -} - -impl Ord for Icon { - fn cmp(&self, other: &Self) -> Ordering { - self.info.cmp(&other.info) - } -} - -impl PartialOrd for Icon { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} diff --git a/src/icon_info.rs b/src/icon/icon_info.rs similarity index 86% rename from src/icon_info.rs rename to src/icon/icon_info.rs index 8feca6d..f24e532 100644 --- a/src/icon_info.rs +++ b/src/icon/icon_info.rs @@ -1,4 +1,5 @@ -use crate::{icon_size::*, CLIENT}; +use super::*; +use crate::CLIENT; use data_url::DataUrl; use futures::{io::Cursor, prelude::*, stream::TryStreamExt}; use mime::MediaType; @@ -6,6 +7,7 @@ use reqwest::{header::*, Url}; use serde::{Deserialize, Serialize}; use std::{ cmp::Ordering, + convert::TryFrom, error::Error, fmt::{self, Display}, io, @@ -72,7 +74,7 @@ impl IconInfo { headers: HeaderMap, sizes: Option, ) -> Result> { - let sizes = sizes.as_ref().and_then(|s| IconSizes::from_str(s).ok()); + let sizes = sizes.as_ref().and_then(|s| IconSizes::try_from(s).ok()); let (mime, mut body): (_, Box) = match url.scheme() { "data" => { @@ -92,6 +94,43 @@ impl IconInfo { } _ => { + match &url.path().split('.').last().unwrap_or("").to_lowercase()[..] { + "svg" => { + if let Some(sizes) = sizes { + return Ok(IconInfo::SVG { + size: Some(*sizes.largest()), + }); + } + } + "png" => { + if let Some(sizes) = sizes { + return Ok(IconInfo::PNG { + size: *sizes.largest(), + }); + } + } + "jpeg" | "jpg" => { + if let Some(sizes) = sizes { + return Ok(IconInfo::JPEG { + size: *sizes.largest(), + }); + } + } + "ico" => { + if let Some(sizes) = sizes { + return Ok(IconInfo::ICO { sizes }); + } + } + "gif" => { + if let Some(sizes) = sizes { + return Ok(IconInfo::GIF { + size: *sizes.largest(), + }); + } + } + _ => {} + }; + let res = CLIENT.get(url).headers(headers).send().await?; if !res.status().is_success() { return Err("failed to fetch".into()); diff --git a/src/icon_size/gif.rs b/src/icon/icon_size/gif.rs similarity index 100% rename from src/icon_size/gif.rs rename to src/icon/icon_size/gif.rs diff --git a/src/icon_size/ico.rs b/src/icon/icon_size/ico.rs similarity index 100% rename from src/icon_size/ico.rs rename to src/icon/icon_size/ico.rs diff --git a/src/icon_size/icon_sizes.rs b/src/icon/icon_size/icon_sizes.rs similarity index 79% rename from src/icon_size/icon_sizes.rs rename to src/icon/icon_size/icon_sizes.rs index ec3af56..b5fae53 100644 --- a/src/icon_size/icon_sizes.rs +++ b/src/icon/icon_size/icon_sizes.rs @@ -22,7 +22,22 @@ impl Display for IconSizes { } impl IconSizes { - pub fn from_str(sizes_str: &str) -> Result> { + pub fn add_size(&mut self, size: IconSize) { + match self.0.binary_search(&size) { + Ok(_) => {} + Err(pos) => self.0.insert(pos, size), + } + } + + pub fn largest(&self) -> &IconSize { + self.0.first() + } +} + +impl TryFrom<&str> for IconSizes { + type Error = Box; + + fn try_from(sizes_str: &str) -> Result { let size_strs = sizes_str.split(" "); let mut sizes = Vec::new(); @@ -34,16 +49,21 @@ impl IconSizes { Ok(sizes.try_into()?) } +} - pub fn add_size(&mut self, size: IconSize) { - match self.0.binary_search(&size) { - Ok(_) => {} - Err(pos) => self.0.insert(pos, size), - } +impl TryFrom<&String> for IconSizes { + type Error = Box; + + fn try_from(sizes_str: &String) -> Result { + IconSizes::try_from(sizes_str.as_str()) } +} - pub fn largest(&self) -> &IconSize { - self.0.first() +impl TryFrom for IconSizes { + type Error = Box; + + fn try_from(sizes_str: String) -> Result { + IconSizes::try_from(sizes_str.as_str()) } } diff --git a/src/icon_size/jpeg.rs b/src/icon/icon_size/jpeg.rs similarity index 100% rename from src/icon_size/jpeg.rs rename to src/icon/icon_size/jpeg.rs diff --git a/src/icon_size/mod.rs b/src/icon/icon_size/mod.rs similarity index 100% rename from src/icon_size/mod.rs rename to src/icon/icon_size/mod.rs diff --git a/src/icon_size/png.rs b/src/icon/icon_size/png.rs similarity index 100% rename from src/icon_size/png.rs rename to src/icon/icon_size/png.rs diff --git a/src/icon/icon_size/svg.rs b/src/icon/icon_size/svg.rs new file mode 100644 index 0000000..bfaf4bb --- /dev/null +++ b/src/icon/icon_size/svg.rs @@ -0,0 +1,68 @@ +use super::IconSize; +use futures::prelude::*; +use lol_html::{element, HtmlRewriter, Settings}; +use std::{cell::RefCell, error::Error}; + +fn parse_size(size: S) -> Option { + size + .to_string() + .parse::() + .ok() + .map(|size| size.round() as u32) +} + +pub async fn get_svg_size( + first_bytes: &[u8; 2], + reader: &mut R, +) -> Result, Box> { + let size = RefCell::new(None); + + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![ + // Rewrite insecure hyperlinks + element!("svg", |el| { + let viewbox = el.get_attribute("viewbox"); + + let width = el.get_attribute("width").and_then(parse_size); + let height = el.get_attribute("height").and_then(parse_size); + + *size.borrow_mut() = Some(if let (Some(width), Some(height)) = (width, height) { + Some(IconSize::new(width, height)) + } else if let Some(viewbox) = viewbox { + regex!(r"^\d+\s+\d+\s+(\d+\.?[\d]?)\s+(\d+\.?[\d]?)") + .captures(&viewbox) + .map(|captures| { + let width = parse_size(captures.get(1).unwrap().as_str()).unwrap(); + let height = parse_size(captures.get(2).unwrap().as_str()).unwrap(); + IconSize::new(width, height) + }) + } else { + None + }); + + Ok(()) + }), + ], + ..Settings::default() + }, + |_: &[u8]| {}, + ); + + rewriter.write(first_bytes)?; + + let mut buffer = [0; 100]; + + loop { + let n = reader.read(&mut buffer).await?; + if n == 0 { + return Err("invalid svg".into()); + } + + rewriter.write(&buffer[..n])?; + + if let Some(size) = *size.borrow() { + return Ok(size); + } + } +} diff --git a/src/icon/mod.rs b/src/icon/mod.rs new file mode 100644 index 0000000..109e59f --- /dev/null +++ b/src/icon/mod.rs @@ -0,0 +1,123 @@ +mod icon_info; +mod icon_size; + +pub use icon_info::*; +pub use icon_size::*; + +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use serde_with::{DeserializeFromStr, SerializeDisplay}; +use std::{ + cmp::Ordering, + collections::HashMap, + convert::TryInto, + error::Error, + fmt::{self, Display}, + hash::{Hash, Hasher}, + str::FromStr, +}; +use url::Url; + +#[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, SerializeDisplay, DeserializeFromStr)] +pub enum IconKind { + AppIcon, + SiteFavicon, + SiteLogo, +} + +impl Display for IconKind { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + f.write_str(match self { + IconKind::SiteLogo => "site_logo", + IconKind::AppIcon => "app_icon", + IconKind::SiteFavicon => "site_favicon", + }) + } +} + +impl FromStr for IconKind { + type Err = String; + + fn from_str(kind: &str) -> Result { + match kind { + "site_logo" => Ok(IconKind::SiteLogo), + "app_icon" => Ok(IconKind::AppIcon), + "site_favicon" => Ok(IconKind::SiteFavicon), + _ => Err("unknown icon kind!".into()), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct Icon { + pub url: Url, + pub headers: HashMap, + pub kind: IconKind, + #[serde(flatten)] + pub info: IconInfo, +} + +impl Hash for Icon { + fn hash(&self, state: &mut H) { + ( + &self.url, + self + .headers + .iter() + .sorted_by_key(|(key, _)| *key) + .collect::>(), + ) + .hash(state); + } +} + +impl Icon { + pub fn new(url: Url, kind: IconKind, info: IconInfo) -> Self { + Icon::new_with_headers(url, HashMap::new(), kind, info) + } + + pub fn new_with_headers( + url: Url, + headers: HashMap, + kind: IconKind, + info: IconInfo, + ) -> Self { + Self { + url, + headers, + kind, + info, + } + } + + pub async fn load( + url: Url, + kind: IconKind, + sizes: Option, + ) -> Result> { + Icon::load_with_headers(url, HashMap::new(), kind, sizes).await + } + + pub async fn load_with_headers( + url: Url, + headers: HashMap, + kind: IconKind, + sizes: Option, + ) -> Result> { + let info = IconInfo::load(url.clone(), (&headers).try_into().unwrap(), sizes).await?; + + Ok(Icon::new_with_headers(url, headers, kind, info)) + } +} + +impl Ord for Icon { + fn cmp(&self, other: &Self) -> Ordering { + self.info.cmp(&other.info) + } +} + +impl PartialOrd for Icon { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} diff --git a/src/icon_size/svg.rs b/src/icon_size/svg.rs deleted file mode 100644 index c340849..0000000 --- a/src/icon_size/svg.rs +++ /dev/null @@ -1,84 +0,0 @@ -use super::IconSize; -use futures::prelude::*; -use lol_html::{element, errors::RewritingError, HtmlRewriter, Settings}; -use std::{ - error::Error, - fmt::{self, Display}, -}; - -#[derive(Debug)] -struct SizeResult(Option); - -impl Display for SizeResult { - fn fmt(&self, _: &mut fmt::Formatter<'_>) -> fmt::Result { - Ok(()) - } -} - -impl Error for SizeResult {} - -fn parse_size(size: S) -> Option { - size - .to_string() - .parse::() - .ok() - .map(|size| size.round() as u32) -} - -pub async fn get_svg_size( - first_bytes: &[u8; 2], - reader: &mut R, -) -> Result, Box> { - let mut rewriter = HtmlRewriter::new( - Settings { - element_content_handlers: vec![ - // Rewrite insecure hyperlinks - element!("svg", |el| { - let viewbox = el.get_attribute("viewbox"); - - let width = el.get_attribute("width").and_then(parse_size); - let height = el.get_attribute("height").and_then(parse_size); - - Err(Box::new(SizeResult( - if let (Some(width), Some(height)) = (width, height) { - Some(IconSize::new(width, height)) - } else if let Some(viewbox) = viewbox { - regex!(r"^\d+\s+\d+\s+(\d+\.?[\d]?)\s+(\d+\.?[\d]?)") - .captures(&viewbox) - .map(|captures| { - let width = parse_size(captures.get(1).unwrap().as_str()).unwrap(); - let height = parse_size(captures.get(2).unwrap().as_str()).unwrap(); - IconSize::new(width, height) - }) - } else { - None - }, - ))) - }), - ], - ..Settings::default() - }, - |_: &[u8]| {}, - ); - - rewriter.write(first_bytes)?; - - let mut buffer = [0; 100]; - - loop { - let n = reader.read(&mut buffer).await?; - if n == 0 { - return Err("invalid svg".into()); - } - - match rewriter.write(&buffer[..n]) { - Err(RewritingError::ContentHandlerError(result)) => { - let result = result.downcast::().unwrap(); - - return Ok(result.0); - } - - result => result?, - } - } -} diff --git a/src/icons.rs b/src/icons.rs index b33bd7b..4d90089 100644 --- a/src/icons.rs +++ b/src/icons.rs @@ -1,123 +1,38 @@ -use crate::{utils::encode_svg, Icon, IconInfo, IconKind, CLIENT}; -use future::join_all; -use futures::StreamExt; -use futures::{prelude::*, task::noop_waker}; -use html5ever::{ - driver, - tendril::{Tendril, TendrilSink}, -}; +use crate::{html_parser, utils::push_url, Icon, IconKind, CLIENT}; +use flo_stream::{MessagePublisher, Publisher, StreamPublisher}; +use futures::future::{join_all, select_all}; +use futures::prelude::*; +use futures::{join, StreamExt}; +use itertools::Itertools; use reqwest::{header::*, IntoUrl}; -use scraper::{ElementRef, Html}; -use serde::Deserialize; use std::convert::TryInto; -use std::iter; -use std::task::Poll; -use std::{collections::HashMap, error::Error, pin::Pin, task::Context}; -use tldextract::TldOption; +use std::error::Error; use url::Url; +use vec1::Vec1; -pub struct Icons { +pub struct SiteIcons { blacklist: Option bool>>, - entries: Vec, - pending_entries: HashMap< - Url, - ( - IconKind, - HashMap, - Pin>>>>, - ), - >, } -fn add_icon_entry( - entries: &mut Vec, - url: Url, - headers: HashMap, - kind: IconKind, - info: Result>, -) { - match info { - Ok(info) => entries.push(Icon { - url, - headers, - kind, - info, - }), - Err(_) => warn_err!(info, "failed to parse icon"), - } +#[derive(Debug, Clone)] +enum LoadedKind { + DefaultManifest(Option>), + HeadTags(Option>), + DefaultFavicon(Option), + SiteLogo(Option), } -impl Icons { +impl SiteIcons { pub fn new() -> Self { - Icons { - blacklist: None, - entries: Vec::new(), - pending_entries: HashMap::new(), - } + SiteIcons { blacklist: None } } pub fn new_with_blacklist(blacklist: impl Fn(&Url) -> bool + 'static) -> Self { - Icons { + SiteIcons { blacklist: Some(Box::new(blacklist)), - entries: Vec::new(), - pending_entries: HashMap::new(), } } - /// Add an icon URL and start fetching it - pub fn add_icon(&mut self, url: Url, kind: IconKind, sizes: Option) { - self.add_icon_with_headers(url, HashMap::new(), kind, sizes) - } - - /// Add an icon URL and start fetching it, - /// along with the specified headers - pub fn add_icon_with_headers( - &mut self, - url: Url, - headers: HashMap, - kind: IconKind, - sizes: Option, - ) { - // check to see if it already exists - let mut entries = self.entries.iter_mut(); - if let Some(existing_kind) = self - .pending_entries - .get_mut(&url) - .map(|(kind, _, _)| kind) - .or_else(|| { - entries.find_map(|icon| { - if icon.url.eq(&url) { - Some(&mut icon.kind) - } else { - None - } - }) - }) - { - // if the kind is more important, replace it - if &kind > existing_kind { - *existing_kind = kind; - } - return; - } - - let mut info = Box::pin(IconInfo::load( - url.clone(), - (&headers).try_into().unwrap(), - sizes, - )); - - // Start fetching the icon - let noop_waker = noop_waker(); - let cx = &mut Context::from_waker(&noop_waker); - match info.poll_unpin(cx) { - Poll::Ready(info) => add_icon_entry(&mut self.entries, url, headers, kind, info), - Poll::Pending => { - self.pending_entries.insert(url, (kind, headers, info)); - } - }; - } - pub fn is_blacklisted(&self, url: &Url) -> bool { if let Some(is_blacklisted) = &self.blacklist { is_blacklisted(url) @@ -126,271 +41,163 @@ impl Icons { } } - pub async fn load_website(&mut self, url: U) -> Result<(), Box> { - let res = CLIENT - .get(url) - .header(ACCEPT, "text/html") - .send() - .await? - .error_for_status()?; + pub async fn load_website( + &mut self, + url: U, + best_matches_only: bool, + ) -> Result, Box> { + let url = url.into_url()?; - let url = res.url().clone(); + let manifest_urls = vec![ + push_url(&url, "manifest.json"), + push_url(&url, "manifest.webmanifest"), + url.join("/manifest.json")?, + url.join("/manifest.webmanifest")?, + ] + .into_iter() + .unique(); - if self.is_blacklisted(&url) { - return Ok(()); - } - - let mut body = res.bytes_stream(); - - let mut parser = driver::parse_document(Html::new_document(), Default::default()); - while let Some(data) = body.next().await { - if let Ok(data) = Tendril::try_from_byte_slice(&data?) { - parser.process(data) - } - } - let document = parser.finish(); - - { - let mut found_favicon = false; - - for elem_ref in document.select(selector!( - "link[rel='icon']", - "link[rel='shortcut icon']", - "link[rel='apple-touch-icon']", - "link[rel='apple-touch-icon-precomposed']" - )) { - let elem = elem_ref.value(); - if let Some(href) = elem.attr("href").and_then(|href| url.join(&href).ok()) { - let rel = elem.attr("rel").unwrap(); - self.add_icon( - href, - if rel.contains("apple-touch-icon") { - IconKind::AppIcon - } else { - IconKind::SiteFavicon - }, - elem.attr("sizes").map(|sizes| sizes.into()), - ); - - found_favicon = true; - }; - } - - // Check for default favicon.ico - if !found_favicon { - self.add_icon( - url.join("/favicon.ico").unwrap(), - IconKind::SiteFavicon, - None, - ); - } - } - - { - let mut logos: Vec<_> = document - .select(selector!( - "a[href='/'] img, a[href='/'] svg", - "header img, header svg", - "img[src*=logo]", - "img[alt*=logo], svg[alt*=logo]", - "*[class*=logo] img, *[class*=logo] svg", - "*[id*=logo] img, *[id*=logo] svg", - "img[class*=logo], svg[class*=logo]", - "img[id*=logo], svg[id*=logo]", - )) - .enumerate() - .filter_map(|(i, elem_ref)| { - let elem = elem_ref.value(); - let ancestors = elem_ref - .ancestors() - .map(ElementRef::wrap) - .flatten() - .map(|elem_ref| elem_ref.value()) - .collect::>(); - - let skip_classnames = regex!("menu|search"); - let should_skip = ancestors.iter().any(|ancestor| { - ancestor - .attr("class") - .map(|attr| skip_classnames.is_match(&attr.to_lowercase())) - .or_else(|| { - ancestor - .attr("id") - .map(|attr| skip_classnames.is_match(&attr.to_lowercase())) - }) - .unwrap_or(false) - }); - - if should_skip { - return None; - } - - let mut weight = 0; - - // if in the header - if ancestors.iter().any(|element| element.name() == "header") { - weight += 2; - } - - if i == 0 { - weight += 1; - } - - let mentions = |attr_name, is_match: Box bool>| { - ancestors.iter().chain(iter::once(&elem)).any(|ancestor| { - ancestor - .attr(attr_name) - .map(|attr| is_match(&attr.to_lowercase())) - .unwrap_or(false) - }) - }; - - if mentions("href", Box::new(|attr| attr == "/")) { - weight += 5; - }; - - let mentions_logo = |attr_name| { - mentions( - attr_name, - Box::new(|attr| regex!("logo([^s]|$)").is_match(attr)), - ) - }; - - if mentions_logo("class") || mentions_logo("id") { - weight += 3; - } - if mentions_logo("alt") { - weight += 2; - } - if mentions_logo("src") { - weight += 1; - } - - if let Some(site_name) = url - .domain() - .and_then(|domain| TldOption::default().build().extract(domain).unwrap().domain) - { - // if the alt contains the site_name then highest priority - if site_name - .to_lowercase() - .split('-') - .any(|segment| mentions("alt", Box::new(move |attr| attr.contains(segment)))) - { - weight += 10; - } - } - - let href = if elem.name() == "svg" { - Some(Url::parse(&encode_svg(&elem_ref.html())).unwrap()) - } else { - elem.attr("src").and_then(|href| url.join(&href).ok()) - }; - - if let Some(href) = &href { - if self.is_blacklisted(href) { - return None; - } - } - - href.map(|href| (href, elem_ref, weight)) - }) - .collect(); - - logos.sort_by(|(_, _, a_weight), (_, _, b_weight)| b_weight.cmp(a_weight)); - - // prefer over svg - let mut prev_weight = None; - for (href, elem_ref, weight) in &logos { - if let Some(prev_weight) = prev_weight { - if weight != prev_weight { - break; - } - } - prev_weight = Some(weight); - - if elem_ref.value().name() == "img" { - self.add_icon(href.clone(), IconKind::SiteLogo, None); - break; - } - } - - if let Some((href, _, _)) = logos.into_iter().next() { - self.add_icon(href, IconKind::SiteLogo, None); - } - } - - for elem_ref in document.select(selector!("link[rel='manifest']")) { - if let Some(href) = elem_ref - .value() - .attr("href") - .and_then(|href| url.join(&href).ok()) - { - warn_err!(self.load_manifest(href).await, "failed to fetch manifest"); - } - } - - Ok(()) - } - - pub async fn load_manifest(&mut self, manifest_url: Url) -> Result<(), Box> { - #[derive(Deserialize)] - struct ManifestIcon { - src: String, - sizes: Option, - } - - #[derive(Deserialize)] - struct Manifest { - icons: Option>, - } - - let manifest: Manifest = CLIENT - .get(manifest_url.as_str()) - .send() - .await? - .json() - .await?; - - if let Some(icons) = manifest.icons { - for icon in icons { - if let Ok(src) = manifest_url.join(&icon.src) { - let _ = self.add_icon(src, IconKind::AppIcon, icon.sizes); - } - } - } - - Ok(()) - } - - /// Fetch all the icons. Ordered from highest to lowest resolution - /// - /// ``` - /// async fn run() { - /// let mut icons = site_icons::Icons::new(); - /// icons.load_website("https://github.com").await.unwrap(); - /// - /// let entries = icons.entries().await; - /// for icon in entries { - /// println!("{:?}", icon) - /// } - /// } - /// ``` - pub async fn entries(mut self) -> Vec { - let (urls, infos): (Vec<_>, Vec<_>) = self - .pending_entries + let favicon_urls = vec![push_url(&url, "favicon.ico"), url.join("/favicon.ico")?] .into_iter() - .map(|(url, (kind, headers, info))| ((url, headers, kind), info)) - .unzip(); + .unique(); - let mut urls = urls.into_iter(); + let html_response = async { + let res = CLIENT + .get(url.clone()) + .header(ACCEPT, "text/html") + .send() + .await + .ok()? + .error_for_status() + .ok()?; - for info in join_all(infos).await { - let (url, headers, kind) = urls.next().unwrap(); - add_icon_entry(&mut self.entries, url, headers, kind, info); + let url = res.url().clone(); + + if self.is_blacklisted(&url) { + None + } else { + let body = res.bytes_stream().map(|res| { + res + .map(|bytes| bytes.to_vec()) + .map_err(|err| err.to_string()) + }); + + let mut publisher = Publisher::new(128); + let subscriber = publisher.subscribe(); + + Some(( + url, + async move { StreamPublisher::new(&mut publisher, body).await }.shared(), + subscriber, + )) + } + } + .shared(); + + let mut futures = vec![ + async { + let html_response = html_response.clone().await; + + LoadedKind::HeadTags(match html_response { + Some((url, _, body)) => html_parser::parse_head(&url, body) + .await + .ok() + .and_then(|icons| icons.try_into().ok()), + None => None, + }) + } + .boxed_local(), + async { + let html_response = html_response.clone().await; + + LoadedKind::SiteLogo(match html_response { + Some((url, complete, body)) => { + let (icons, _) = join!( + html_parser::parse_site_logo(&url, body, |url| self.is_blacklisted(url)), + complete + ); + + icons.ok() + } + None => None, + }) + } + .boxed_local(), + async { + let manifests = join_all(manifest_urls.map(|url| SiteIcons::load_manifest(url))).await; + + LoadedKind::DefaultManifest( + manifests + .into_iter() + .find_map(|manifest| manifest.ok().and_then(|icons| icons.try_into().ok())), + ) + } + .boxed_local(), + async { + let favicons = + join_all(favicon_urls.map(|url| Icon::load(url.clone(), IconKind::SiteFavicon, None))) + .await; + + LoadedKind::DefaultFavicon(favicons.into_iter().find_map(|favicon| favicon.ok())) + } + .boxed_local(), + ]; + + let mut icons: Vec = Vec::new(); + let mut found_best_match = false; + let mut previous_loads = Vec::new(); + + while !futures.is_empty() { + let (loaded, index, _) = select_all(&mut futures).await; + futures.remove(index); + + match loaded.clone() { + LoadedKind::DefaultManifest(manifest_icons) => { + if let Some(manifest_icons) = manifest_icons { + icons.extend(manifest_icons); + found_best_match = true; + } + } + LoadedKind::DefaultFavicon(favicon) => { + if let Some(favicon) = favicon { + icons.push(favicon); + + if previous_loads + .iter() + .any(|kind| matches!(kind, LoadedKind::HeadTags(_))) + { + found_best_match = true; + } + } + } + LoadedKind::HeadTags(head_icons) => { + if let Some(head_icons) = head_icons { + icons.extend(head_icons); + found_best_match = true; + } else if previous_loads + .iter() + .any(|kind| matches!(kind, LoadedKind::DefaultFavicon(Some(_)))) + { + found_best_match = true; + } + } + LoadedKind::SiteLogo(logo) => { + if let Some(logo) = logo { + icons.push(logo); + } + } + } + + previous_loads.push(loaded); + + icons.sort(); + icons = icons.into_iter().unique().collect(); + + if best_matches_only && found_best_match { + break; + } } - self.entries.sort(); - - self.entries + Ok(icons) } } diff --git a/src/lib.rs b/src/lib.rs index b6e4d56..3568d76 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +#![feature(async_closure)] //! # site_icons //! An efficient website icon scraper. //! @@ -26,15 +27,13 @@ extern crate serde_with; extern crate log; #[macro_use] -mod macros; -mod icon; -mod icon_info; -mod icon_size; -mod icons; mod utils; +mod html_parser; +mod icon; +mod icons; +mod manifest; pub use icon::*; -pub use icon_info::*; pub use icons::*; use once_cell::sync::Lazy; @@ -49,23 +48,23 @@ static CLIENT: Lazy = Lazy::new(|| { Client::builder().default_headers(headers).build().unwrap() }); -#[cfg(test)] -mod tests { - use super::*; - #[tokio::test] - async fn test_icons() { - let mut icons = Icons::new(); - // scrape the icons from a url - icons.load_website("https://github.com").await.unwrap(); +// #[cfg(test)] +// mod tests { +// use super::*; +// #[tokio::test] +// async fn test_icons() { +// let mut icons = SiteIcons::new(); +// // scrape the icons from a url +// icons.load_website("https://github.com").await.unwrap(); - // fetch all icons, ensuring they exist & determining size - let entries = icons.entries().await; +// // fetch all icons, ensuring they exist & determining size +// let entries = icons.entries().await; - // entries are sorted from highest to lowest resolution - for icon in &entries { - println!("{:?}", icon) - } +// // entries are sorted from highest to lowest resolution +// for icon in &entries { +// println!("{:?}", icon) +// } - assert_eq!(entries.len() > 0, true); - } -} +// assert_eq!(entries.len() > 0, true); +// } +// } diff --git a/src/manifest.rs b/src/manifest.rs new file mode 100644 index 0000000..58b4c61 --- /dev/null +++ b/src/manifest.rs @@ -0,0 +1,54 @@ +use crate::{Icon, IconKind, SiteIcons, CLIENT}; +use cached::proc_macro::cached; +use futures::future::join_all; +use reqwest::IntoUrl; +use serde::Deserialize; +use std::error::Error; +use url::Url; + +#[derive(Debug, Deserialize)] +struct ManifestIcon { + src: String, + sizes: Option, +} + +#[derive(Debug, Deserialize)] +struct Manifest { + icons: Vec, +} + +impl SiteIcons { + pub async fn load_manifest(url: U) -> Result, Box> { + let url = url.into_url()?; + + Ok(load_manifest_cached(url).await?) + } +} + +#[cached(sync_writes = true)] +async fn load_manifest_cached(url: Url) -> Result, String> { + let url = &url; + + let manifest: Manifest = CLIENT + .get(url.clone()) + .send() + .await + .map_err(|e| format!("{}: {:?}", url, e))? + .json() + .await + .map_err(|e| format!("{}: {:?}", url, e))?; + + Ok( + join_all(manifest.icons.into_iter().map(async move |icon| { + if let Ok(src) = url.join(&icon.src) { + Icon::load(src, IconKind::AppIcon, icon.sizes).await.ok() + } else { + None + } + })) + .await + .into_iter() + .filter_map(|icon| icon) + .collect(), + ) +} diff --git a/src/utils/background_poll.rs b/src/utils/background_poll.rs new file mode 100644 index 0000000..bcc4ec1 --- /dev/null +++ b/src/utils/background_poll.rs @@ -0,0 +1,43 @@ +use std::{ + pin::Pin, + task::{Context, Poll}, +}; + +use futures::Future; + +pub async fn poll_in_background(future: F, background_future: B) -> FO +where + F: Future + Unpin, + B: Future + Unpin, +{ + struct BackgroundPoller { + future: F, + background_future: B, + } + + impl Future for BackgroundPoller + where + F: Future + Unpin, + B: Future + Unpin, + { + type Output = FO; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.get_mut(); + + let result = Pin::new(&mut this.future).poll(cx); + + if result.is_pending() { + let _ = Pin::new(&mut this.background_future).poll(cx); + } + + result + } + } + + BackgroundPoller { + future, + background_future, + } + .await +} diff --git a/src/macros.rs b/src/utils/macros.rs similarity index 72% rename from src/macros.rs rename to src/utils/macros.rs index 2185249..68a4342 100644 --- a/src/macros.rs +++ b/src/utils/macros.rs @@ -1,11 +1,11 @@ macro_rules! selector { ($($selector:expr),+ $(,)?) => {{ static RE: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); - RE.get_or_init(|| scraper::Selector::parse(join!(",", $($selector),+)).unwrap()) + RE.get_or_init(|| scraper::Selector::parse(join_with!(",", $($selector),+)).unwrap()) }}; } -macro_rules! join { +macro_rules! join_with { ($pattern:literal,$first:expr$(, $($rest:expr),*)? $(,)?) => { concat!($first$(, $($pattern, $rest),*)?) }; @@ -18,14 +18,6 @@ macro_rules! regex { }}; } -macro_rules! warn_err { - ($result:expr, $($arg:tt)*) => {{ - if let Err(err) = $result { - warn!("{} {}", format!($($arg)*), err); - } - }}; -} - macro_rules! assert_slice_eq { ($cur:expr, $offset:expr, $slice:expr, $($arg:tt)+) => {{ if !super::slice_eq($cur, $offset, $slice)? { diff --git a/src/utils/mod.rs b/src/utils/mod.rs new file mode 100644 index 0000000..fa00d3f --- /dev/null +++ b/src/utils/mod.rs @@ -0,0 +1,16 @@ +#[macro_use] +mod macros; +mod background_poll; +mod svg_encoder; + +pub use background_poll::*; +pub use macros::*; +pub use svg_encoder::*; + +use url::Url; + +pub fn push_url(url: &Url, segment: &str) -> Url { + let mut url = url.clone(); + url.path_segments_mut().unwrap().push(segment); + url +} diff --git a/src/utils.rs b/src/utils/svg_encoder.rs similarity index 100% rename from src/utils.rs rename to src/utils/svg_encoder.rs