From 2c485208d34dd2e9c43415de8da337b136c52dc6 Mon Sep 17 00:00:00 2001 From: Sam Denty Date: Fri, 29 Jan 2021 16:26:04 +0000 Subject: [PATCH] site-icons binary + svg scraping --- Cargo.lock | 21 ++++ Cargo.toml | 13 +- README.md | 34 +++-- src/bin/{site_icons.rs => site-icons.rs} | 11 ++ src/icon_size/jpeg.rs | 2 +- src/icons.rs | 151 ++++++++++++++++------- src/lib.rs | 1 + src/macros.rs | 17 +++ src/utils.rs | 38 ++++++ 9 files changed, 228 insertions(+), 60 deletions(-) rename src/bin/{site_icons.rs => site-icons.rs} (67%) create mode 100644 src/utils.rs diff --git a/Cargo.lock b/Cargo.lock index 8b262fe..e18f6c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -312,6 +312,19 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "env_logger" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26ecb66b4bdca6c1409b40fb255eefc2bd4f6d135dab3c3124f80ffa2a9661e" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + [[package]] name = "error-chain" version = "0.12.4" @@ -588,6 +601,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "hyper" version = "0.14.2" @@ -1525,12 +1544,14 @@ dependencies = [ "byteorder", "clap", "data-url", + "env_logger", "futures", "html5ever", "itertools", "log", "mime_4", "once_cell", + "percent-encoding", "pin-utils", "regex", "reqwest-wasm", diff --git a/Cargo.toml b/Cargo.toml index c04a384..16d741b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,8 +4,12 @@ version = "0.1.0" authors = ["Sam Denty "] edition = "2018" license = "gpl-3.0" +homepage = "https://github.com/samdenty/site_icons" repository = "https://github.com/samdenty/site_icons" -description = "Website icon scraper with sizes, ordering, and WASM support" +documentation = "https://docs.rs/site_icons" +description = "Website icon scraper that fetches sizes (with WASM support)" +keywords = ["favicon", "logo", "website", "scraper", "icons", "cli"] +categories = ["command-line-utilities", "multimedia::images", "wasm"] [package.metadata.wasm-pack.profile.release] wasm-opt = ["-Oz", "--enable-mutable-globals"] @@ -13,13 +17,13 @@ wasm-opt = ["-Oz", "--enable-mutable-globals"] [lib] crate-type = ["cdylib", "rlib"] - [dependencies] clap = "3.0.0-beta.2" itertools = "0.10.0" serde_with = "1.6.1" pin-utils = "0.1.0" html5ever = "0.25.1" +percent-encoding = "2.1.0" url = { version = "2.2.0", features = ["serde"] } regex = "1" log = "0.4.14" @@ -28,12 +32,13 @@ scraper = "0.12.0" tokio-futures-byteorder = { version = "0.2.0", features = ["futures"] } byteorder = "1.4.2" data-url = "0.1.0" -mime_4 = "0.4.0-a.0" +mime = { package = "mime_4", version = "0.4.0-a.0" } serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" -reqwest-wasm = { features = ["json", "cookies", "blocking", "stream"] } +reqwest = { package = "reqwest-wasm", version = "0.11.0", features = ["json", "cookies", "blocking", "stream"] } futures = "0.3.12" wee_alloc = { version = "0.4.2", optional = true } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] tokio = { version = "1.1.0", features = ["full"] } +env_logger = "0.8.2" diff --git a/README.md b/README.md index 1fd3aeb..b9e564c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,30 @@ # site_icons -An efficient website icon scraper for rust +[![Crates.io](https://img.shields.io/crates/v/site_icons.svg)](https://crates.io/crates/site_icons) +[![Documentation](https://docs.rs/site_icons/badge.svg)](https://docs.rs/site_icons/) +![GitHub Sponsors](https://img.shields.io/github/sponsors/samdenty?style=social) + +An efficient website icon scraper for rust or command line usage. + +## Features + +- Ensures all URLs point to valid images +- Determines icon size by partially fetching images +- Supports WASM (and cloudflare workers) + +### Command line usage + +```bash +cargo install site_icons + +site-icons https://google.com +# https://github.githubassets.com/favicons/favicon.svg site_favicon svg +# https://github.githubassets.com/app-icon-512.png app_icon png 512x512 +# https://github.githubassets.com/app-icon-192.png app_icon png 192x192 +# https://github.githubassets.com/apple-touch-icon-180x180.png app_icon png 180x180 +``` + +### API usage ```rust use site_icons::Icons; @@ -11,17 +35,13 @@ icons.load_website("https://github.com").await?; // fetch all icons, ensuring they exist & determining size let entries = icons.entries().await; + +// entries are sorted from highest to lowest resolution for icon in entries { println("{:?}", icon) } ``` -## Features - -- Validates that all URLs exist and are actually images -- Determines the size of the icon by partially fetching it -- Supports WASM (and cloudflare workers) - ### Sources - HTML favicon tag (or looking for default `/favicon.ico`) diff --git a/src/bin/site_icons.rs b/src/bin/site-icons.rs similarity index 67% rename from src/bin/site_icons.rs rename to src/bin/site-icons.rs index a539528..771c471 100644 --- a/src/bin/site_icons.rs +++ b/src/bin/site-icons.rs @@ -1,4 +1,6 @@ use clap::Clap; +use env_logger::Builder; +use log::LevelFilter; use site_icons::Icons; use std::error::Error; @@ -7,6 +9,9 @@ struct Opts { urls: Vec, #[clap(long)] json: bool, + #[clap(long)] + /// Print out errors that occurred for skipped items + debug: bool, } #[tokio::main] @@ -14,6 +19,12 @@ async fn main() -> Result<(), Box> { let mut icons = Icons::new(); let opts: Opts = Opts::parse(); + if opts.debug { + let mut builder = Builder::new(); + builder.filter_module("site_icons", LevelFilter::Info); + builder.init(); + } + for url in opts.urls { icons.load_website(&url).await?; } diff --git a/src/icon_size/jpeg.rs b/src/icon_size/jpeg.rs index 8c6404a..bbfb246 100644 --- a/src/icon_size/jpeg.rs +++ b/src/icon_size/jpeg.rs @@ -3,7 +3,7 @@ use crate::assert_slice_eq; use byteorder::BigEndian; use futures::prelude::*; use std::{error::Error, io::Cursor}; -use tokio_byteorder::AsyncReadBytesExt; +use tokio_futures_byteorder::AsyncReadBytesExt; pub async fn get_jpeg_size( reader: &mut R, diff --git a/src/icons.rs b/src/icons.rs index cc90be6..8104718 100644 --- a/src/icons.rs +++ b/src/icons.rs @@ -1,4 +1,4 @@ -use crate::{selector, Icon, IconInfo, IconKind, CLIENT}; +use crate::{selector, utils::encode_svg, warn_err, Icon, IconInfo, IconKind, CLIENT}; use future::join_all; use futures::StreamExt; use futures::{prelude::*, task::noop_waker}; @@ -7,7 +7,7 @@ use html5ever::{ tendril::{Tendril, TendrilSink}, }; use reqwest::{header::*, IntoUrl}; -use scraper::Html; +use scraper::{ElementRef, Html}; use serde::Deserialize; use std::task::Poll; use std::{collections::HashMap, error::Error, pin::Pin, task::Context}; @@ -32,9 +32,7 @@ fn add_icon_entry( ) { match info { Ok(info) => entries.push(Icon { url, kind, info }), - Err(e) => { - warn!("failed to parse icon: {}", e); - } + Err(_) => warn_err!(info, "failed to parse icon"), } } @@ -47,12 +45,7 @@ impl Icons { } /// Add an icon URL and start fetching it - pub fn add_icon( - &mut self, - url: Url, - kind: IconKind, - sizes: Option, - ) -> Result<(), Box> { + pub fn add_icon(&mut self, url: Url, kind: IconKind, sizes: Option) { // check to see if it already exists let mut entries = self.entries.iter_mut(); if let Some(existing_kind) = self @@ -65,7 +58,7 @@ impl Icons { if &kind > existing_kind { *existing_kind = kind; } - return Ok(()); + return; } let mut info = Box::pin(IconInfo::get(url.clone(), sizes)); @@ -79,8 +72,6 @@ impl Icons { self.pending_entries.insert(url, (kind, info)); } }; - - Ok(()) } pub async fn load_website(&mut self, url: U) -> Result<(), Box> { @@ -90,65 +81,131 @@ impl Icons { let mut parser = driver::parse_document(Html::new_document(), Default::default()); while let Some(data) = body.next().await { - let tendril = Tendril::try_from_byte_slice(&data?).map_err(|_| "failed to parse html")?; - parser.process(tendril); + if let Ok(data) = Tendril::try_from_byte_slice(&data?) { + parser.process(data) + } } let document = parser.finish(); { let mut found_favicon = false; - for element_ref in document.select(selector!( + for elem_ref in document.select(selector!( "link[rel='icon']", "link[rel='shortcut icon']", "link[rel='apple-touch-icon']", "link[rel='apple-touch-icon-precomposed']" )) { - let elem = element_ref.value(); + let elem = elem_ref.value(); if let Some(href) = elem.attr("href").and_then(|href| url.join(&href).ok()) { - if self - .add_icon( - href, - IconKind::SiteFavicon, - elem.attr("sizes").map(|sizes| sizes.into()), - ) - .is_ok() - { - found_favicon = true; - }; + self.add_icon( + href, + IconKind::SiteFavicon, + elem.attr("sizes").map(|sizes| sizes.into()), + ); + + found_favicon = true; }; } // Check for default favicon.ico if !found_favicon { - self.add_icon(url.join("/favicon.ico")?, IconKind::SiteFavicon, None)?; + self.add_icon( + url.join("/favicon.ico").unwrap(), + IconKind::SiteFavicon, + None, + ); } } - for element_ref in document.select(selector!( - "header img", - "img[src*=logo]", - "img[alt*=logo]", - "img[class*=logo]" - )) { - if let Some(href) = element_ref - .value() - .attr("src") - .and_then(|href| url.join(&href).ok()) - { - if self.add_icon(href, IconKind::SiteLogo, None).is_ok() { + { + let mut logos: Vec<_> = document + .select(selector!( + "header img, header svg", + "img[src*=logo]", + "img[alt*=logo], svg[alt*=logo]", + "img[class*=logo], svg[class*=logo]", + )) + .map(|elem_ref| { + let elem = elem_ref.value(); + let mut weight = 0; + + // if in the header + if elem_ref + .ancestors() + .map(ElementRef::wrap) + .flatten() + .any(|element| element.value().name() == "header") + { + weight += 2; + } + + let mentions_logo = |attr_name| { + elem + .attr(attr_name) + .map(|attr| attr.to_lowercase().contains("logo")) + .unwrap_or(false) + }; + if mentions_logo("class") || mentions_logo("id") { + weight += 3; + } + if mentions_logo("alt") { + weight += 2; + } + if mentions_logo("src") { + weight += 1; + } + + (elem_ref, weight) + }) + .collect(); + + logos.sort_by(|(_, a_weight), (_, b_weight)| b_weight.cmp(a_weight)); + + // prefer over svg + let mut prev_weight = None; + for (i, (logo, weight)) in logos.iter().enumerate() { + if let Some(prev_weight) = prev_weight { + if weight != prev_weight { + break; + } + } + prev_weight = Some(weight); + + if logo.value().name() == "img" { + let (logo, weight) = logos.remove(i); + logos.insert(0, (logo, weight + 1)); + break; + } + } + + for (elem_ref, _) in logos { + let elem = elem_ref.value(); + + if elem.name() == "svg" { + let data_uri = Url::parse(&encode_svg(&elem_ref.html())).unwrap(); + self.add_icon(data_uri, IconKind::SiteLogo, None); + break; + } + + if let Some(href) = elem_ref + .value() + .attr("src") + .and_then(|href| url.join(&href).ok()) + { + self.add_icon(href, IconKind::SiteLogo, None); break; }; - }; + } } - for element_ref in document.select(selector!("link[rel='manifest']")) { - if let Some(href) = element_ref + for elem_ref in document.select(selector!("link[rel='manifest']")) { + if let Some(href) = elem_ref .value() .attr("href") .and_then(|href| url.join(&href).ok()) { - self.load_manifest(href).await?; + warn_err!(self.load_manifest(href).await, "failed to fetch manifest"); } } @@ -185,9 +242,7 @@ impl Icons { Ok(()) } - /// Fetch all the icons and return a list of them. - /// - /// List is ordered from highest resolution to lowest resolution + /// Fetch all the icons. Ordered from highest to lowest resolution /// /// ``` /// # async fn run() { diff --git a/src/lib.rs b/src/lib.rs index d243d5b..240ce72 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ mod icon_info; mod icon_size; mod icons; mod macros; +mod utils; pub use icon::*; pub use icon_info::*; diff --git a/src/macros.rs b/src/macros.rs index 721e65c..696f622 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -12,3 +12,20 @@ macro_rules! join { concat!($first$(, $($pattern, $rest),*)?) }; } + +#[macro_export] +macro_rules! regex { + ($re:literal $(,)?) => {{ + static RE: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); + RE.get_or_init(|| regex::Regex::new($re).unwrap()) + }}; +} + +#[macro_export] +macro_rules! warn_err { + ($result:expr, $($arg:tt)*) => {{ + if let Err(err) = $result { + warn!("{} {}", format!($($arg)*), err); + } + }}; +} diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..4f551b0 --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,38 @@ +use crate::regex; +use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; + +const DATA_URI: &AsciiSet = &CONTROLS + .add(b'\r') + .add(b'\n') + .add(b'%') + .add(b'#') + .add(b'(') + .add(b')') + .add(b'<') + .add(b'>') + .add(b'?') + .add(b'[') + .add(b'\\') + .add(b']') + .add(b'^') + .add(b'`') + .add(b'{') + .add(b'|') + .add(b'}'); + +pub fn encode_svg(svg: &str) -> String { + // add namespace + let encoded = if !svg.contains("http://www.w3.org/2000/svg") { + regex!("\s{1,}<"); + let encoded = regex!(r"\s{2,}").replace_all(&encoded, " "); + + let encoded = utf8_percent_encode(&encoded, DATA_URI); + + format!("data:image/svg+xml,{}", encoded) +}