site-icons binary + svg scraping

This commit is contained in:
Sam Denty 2021-01-29 16:26:04 +00:00
parent 2cd92eb65e
commit 2c485208d3
No known key found for this signature in database
GPG key ID: F3E9308D4A43BC0E
9 changed files with 228 additions and 60 deletions

21
Cargo.lock generated
View file

@ -312,6 +312,19 @@ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
] ]
[[package]]
name = "env_logger"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26ecb66b4bdca6c1409b40fb255eefc2bd4f6d135dab3c3124f80ffa2a9661e"
dependencies = [
"atty",
"humantime",
"log",
"regex",
"termcolor",
]
[[package]] [[package]]
name = "error-chain" name = "error-chain"
version = "0.12.4" version = "0.12.4"
@ -588,6 +601,12 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]] [[package]]
name = "hyper" name = "hyper"
version = "0.14.2" version = "0.14.2"
@ -1525,12 +1544,14 @@ dependencies = [
"byteorder", "byteorder",
"clap", "clap",
"data-url", "data-url",
"env_logger",
"futures", "futures",
"html5ever", "html5ever",
"itertools", "itertools",
"log", "log",
"mime_4", "mime_4",
"once_cell", "once_cell",
"percent-encoding",
"pin-utils", "pin-utils",
"regex", "regex",
"reqwest-wasm", "reqwest-wasm",

View file

@ -4,8 +4,12 @@ version = "0.1.0"
authors = ["Sam Denty <sam@samdenty.com>"] authors = ["Sam Denty <sam@samdenty.com>"]
edition = "2018" edition = "2018"
license = "gpl-3.0" license = "gpl-3.0"
homepage = "https://github.com/samdenty/site_icons"
repository = "https://github.com/samdenty/site_icons" repository = "https://github.com/samdenty/site_icons"
description = "Website icon scraper with sizes, ordering, and WASM support" documentation = "https://docs.rs/site_icons"
description = "Website icon scraper that fetches sizes (with WASM support)"
keywords = ["favicon", "logo", "website", "scraper", "icons", "cli"]
categories = ["command-line-utilities", "multimedia::images", "wasm"]
[package.metadata.wasm-pack.profile.release] [package.metadata.wasm-pack.profile.release]
wasm-opt = ["-Oz", "--enable-mutable-globals"] wasm-opt = ["-Oz", "--enable-mutable-globals"]
@ -13,13 +17,13 @@ wasm-opt = ["-Oz", "--enable-mutable-globals"]
[lib] [lib]
crate-type = ["cdylib", "rlib"] crate-type = ["cdylib", "rlib"]
[dependencies] [dependencies]
clap = "3.0.0-beta.2" clap = "3.0.0-beta.2"
itertools = "0.10.0" itertools = "0.10.0"
serde_with = "1.6.1" serde_with = "1.6.1"
pin-utils = "0.1.0" pin-utils = "0.1.0"
html5ever = "0.25.1" html5ever = "0.25.1"
percent-encoding = "2.1.0"
url = { version = "2.2.0", features = ["serde"] } url = { version = "2.2.0", features = ["serde"] }
regex = "1" regex = "1"
log = "0.4.14" log = "0.4.14"
@ -28,12 +32,13 @@ scraper = "0.12.0"
tokio-futures-byteorder = { version = "0.2.0", features = ["futures"] } tokio-futures-byteorder = { version = "0.2.0", features = ["futures"] }
byteorder = "1.4.2" byteorder = "1.4.2"
data-url = "0.1.0" data-url = "0.1.0"
mime_4 = "0.4.0-a.0" mime = { package = "mime_4", version = "0.4.0-a.0" }
serde = { version = "1.0", features = ["derive", "rc"] } serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1.0" serde_json = "1.0"
reqwest-wasm = { features = ["json", "cookies", "blocking", "stream"] } reqwest = { package = "reqwest-wasm", version = "0.11.0", features = ["json", "cookies", "blocking", "stream"] }
futures = "0.3.12" futures = "0.3.12"
wee_alloc = { version = "0.4.2", optional = true } wee_alloc = { version = "0.4.2", optional = true }
[target.'cfg(not(target_arch = "wasm32"))'.dependencies] [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
tokio = { version = "1.1.0", features = ["full"] } tokio = { version = "1.1.0", features = ["full"] }
env_logger = "0.8.2"

View file

@ -1,6 +1,30 @@
# site_icons # site_icons
An efficient website icon scraper for rust [![Crates.io](https://img.shields.io/crates/v/site_icons.svg)](https://crates.io/crates/site_icons)
[![Documentation](https://docs.rs/site_icons/badge.svg)](https://docs.rs/site_icons/)
![GitHub Sponsors](https://img.shields.io/github/sponsors/samdenty?style=social)
An efficient website icon scraper for rust or command line usage.
## Features
- Ensures all URLs point to valid images
- Determines icon size by partially fetching images
- Supports WASM (and cloudflare workers)
### Command line usage
```bash
cargo install site_icons
site-icons https://github.com
# https://github.githubassets.com/favicons/favicon.svg site_favicon svg
# https://github.githubassets.com/app-icon-512.png app_icon png 512x512
# https://github.githubassets.com/app-icon-192.png app_icon png 192x192
# https://github.githubassets.com/apple-touch-icon-180x180.png app_icon png 180x180
```
### API usage
```rust ```rust
use site_icons::Icons; use site_icons::Icons;
@ -11,17 +35,13 @@ icons.load_website("https://github.com").await?;
// fetch all icons, ensuring they exist & determining size // fetch all icons, ensuring they exist & determining size
let entries = icons.entries().await; let entries = icons.entries().await;
// entries are sorted from highest to lowest resolution
for icon in entries { for icon in entries {
println!("{:?}", icon) println!("{:?}", icon)
} }
``` ```
## Features
- Validates that all URLs exist and are actually images
- Determines the size of the icon by partially fetching it
- Supports WASM (and cloudflare workers)
### Sources ### Sources
- HTML favicon tag (or looking for default `/favicon.ico`) - HTML favicon tag (or looking for default `/favicon.ico`)

View file

@ -1,4 +1,6 @@
use clap::Clap; use clap::Clap;
use env_logger::Builder;
use log::LevelFilter;
use site_icons::Icons; use site_icons::Icons;
use std::error::Error; use std::error::Error;
@ -7,6 +9,9 @@ struct Opts {
urls: Vec<String>, urls: Vec<String>,
#[clap(long)] #[clap(long)]
json: bool, json: bool,
#[clap(long)]
/// Print out errors that occurred for skipped items
debug: bool,
} }
#[tokio::main] #[tokio::main]
@ -14,6 +19,12 @@ async fn main() -> Result<(), Box<dyn Error>> {
let mut icons = Icons::new(); let mut icons = Icons::new();
let opts: Opts = Opts::parse(); let opts: Opts = Opts::parse();
if opts.debug {
let mut builder = Builder::new();
builder.filter_module("site_icons", LevelFilter::Info);
builder.init();
}
for url in opts.urls { for url in opts.urls {
icons.load_website(&url).await?; icons.load_website(&url).await?;
} }

View file

@ -3,7 +3,7 @@ use crate::assert_slice_eq;
use byteorder::BigEndian; use byteorder::BigEndian;
use futures::prelude::*; use futures::prelude::*;
use std::{error::Error, io::Cursor}; use std::{error::Error, io::Cursor};
use tokio_byteorder::AsyncReadBytesExt; use tokio_futures_byteorder::AsyncReadBytesExt;
pub async fn get_jpeg_size<R: AsyncRead + Unpin>( pub async fn get_jpeg_size<R: AsyncRead + Unpin>(
reader: &mut R, reader: &mut R,

View file

@ -1,4 +1,4 @@
use crate::{selector, Icon, IconInfo, IconKind, CLIENT}; use crate::{selector, utils::encode_svg, warn_err, Icon, IconInfo, IconKind, CLIENT};
use future::join_all; use future::join_all;
use futures::StreamExt; use futures::StreamExt;
use futures::{prelude::*, task::noop_waker}; use futures::{prelude::*, task::noop_waker};
@ -7,7 +7,7 @@ use html5ever::{
tendril::{Tendril, TendrilSink}, tendril::{Tendril, TendrilSink},
}; };
use reqwest::{header::*, IntoUrl}; use reqwest::{header::*, IntoUrl};
use scraper::Html; use scraper::{ElementRef, Html};
use serde::Deserialize; use serde::Deserialize;
use std::task::Poll; use std::task::Poll;
use std::{collections::HashMap, error::Error, pin::Pin, task::Context}; use std::{collections::HashMap, error::Error, pin::Pin, task::Context};
@ -32,9 +32,7 @@ fn add_icon_entry(
) { ) {
match info { match info {
Ok(info) => entries.push(Icon { url, kind, info }), Ok(info) => entries.push(Icon { url, kind, info }),
Err(e) => { Err(_) => warn_err!(info, "failed to parse icon"),
warn!("failed to parse icon: {}", e);
}
} }
} }
@ -47,12 +45,7 @@ impl Icons {
} }
/// Add an icon URL and start fetching it /// Add an icon URL and start fetching it
pub fn add_icon( pub fn add_icon(&mut self, url: Url, kind: IconKind, sizes: Option<String>) {
&mut self,
url: Url,
kind: IconKind,
sizes: Option<String>,
) -> Result<(), Box<dyn Error>> {
// check to see if it already exists // check to see if it already exists
let mut entries = self.entries.iter_mut(); let mut entries = self.entries.iter_mut();
if let Some(existing_kind) = self if let Some(existing_kind) = self
@ -65,7 +58,7 @@ impl Icons {
if &kind > existing_kind { if &kind > existing_kind {
*existing_kind = kind; *existing_kind = kind;
} }
return Ok(()); return;
} }
let mut info = Box::pin(IconInfo::get(url.clone(), sizes)); let mut info = Box::pin(IconInfo::get(url.clone(), sizes));
@ -79,8 +72,6 @@ impl Icons {
self.pending_entries.insert(url, (kind, info)); self.pending_entries.insert(url, (kind, info));
} }
}; };
Ok(())
} }
pub async fn load_website<U: IntoUrl>(&mut self, url: U) -> Result<(), Box<dyn Error>> { pub async fn load_website<U: IntoUrl>(&mut self, url: U) -> Result<(), Box<dyn Error>> {
@ -90,65 +81,131 @@ impl Icons {
let mut parser = driver::parse_document(Html::new_document(), Default::default()); let mut parser = driver::parse_document(Html::new_document(), Default::default());
while let Some(data) = body.next().await { while let Some(data) = body.next().await {
let tendril = Tendril::try_from_byte_slice(&data?).map_err(|_| "failed to parse html")?; if let Ok(data) = Tendril::try_from_byte_slice(&data?) {
parser.process(tendril); parser.process(data)
}
} }
let document = parser.finish(); let document = parser.finish();
{ {
let mut found_favicon = false; let mut found_favicon = false;
for element_ref in document.select(selector!( for elem_ref in document.select(selector!(
"link[rel='icon']", "link[rel='icon']",
"link[rel='shortcut icon']", "link[rel='shortcut icon']",
"link[rel='apple-touch-icon']", "link[rel='apple-touch-icon']",
"link[rel='apple-touch-icon-precomposed']" "link[rel='apple-touch-icon-precomposed']"
)) { )) {
let elem = element_ref.value(); let elem = elem_ref.value();
if let Some(href) = elem.attr("href").and_then(|href| url.join(&href).ok()) { if let Some(href) = elem.attr("href").and_then(|href| url.join(&href).ok()) {
if self self.add_icon(
.add_icon(
href, href,
IconKind::SiteFavicon, IconKind::SiteFavicon,
elem.attr("sizes").map(|sizes| sizes.into()), elem.attr("sizes").map(|sizes| sizes.into()),
) );
.is_ok()
{
found_favicon = true; found_favicon = true;
}; };
};
} }
// Check for default favicon.ico // Check for default favicon.ico
if !found_favicon { if !found_favicon {
self.add_icon(url.join("/favicon.ico")?, IconKind::SiteFavicon, None)?; self.add_icon(
url.join("/favicon.ico").unwrap(),
IconKind::SiteFavicon,
None,
);
} }
} }
for element_ref in document.select(selector!( {
"header img", let mut logos: Vec<_> = document
.select(selector!(
"header img, header svg",
"img[src*=logo]", "img[src*=logo]",
"img[alt*=logo]", "img[alt*=logo], svg[alt*=logo]",
"img[class*=logo]" "img[class*=logo], svg[class*=logo]",
)) { ))
if let Some(href) = element_ref .map(|elem_ref| {
let elem = elem_ref.value();
let mut weight = 0;
// if in the header
if elem_ref
.ancestors()
.map(ElementRef::wrap)
.flatten()
.any(|element| element.value().name() == "header")
{
weight += 2;
}
let mentions_logo = |attr_name| {
elem
.attr(attr_name)
.map(|attr| attr.to_lowercase().contains("logo"))
.unwrap_or(false)
};
if mentions_logo("class") || mentions_logo("id") {
weight += 3;
}
if mentions_logo("alt") {
weight += 2;
}
if mentions_logo("src") {
weight += 1;
}
(elem_ref, weight)
})
.collect();
logos.sort_by(|(_, a_weight), (_, b_weight)| b_weight.cmp(a_weight));
// prefer <img> over svg
let mut prev_weight = None;
for (i, (logo, weight)) in logos.iter().enumerate() {
if let Some(prev_weight) = prev_weight {
if weight != prev_weight {
break;
}
}
prev_weight = Some(weight);
if logo.value().name() == "img" {
let (logo, weight) = logos.remove(i);
logos.insert(0, (logo, weight + 1));
break;
}
}
for (elem_ref, _) in logos {
let elem = elem_ref.value();
if elem.name() == "svg" {
let data_uri = Url::parse(&encode_svg(&elem_ref.html())).unwrap();
self.add_icon(data_uri, IconKind::SiteLogo, None);
break;
}
if let Some(href) = elem_ref
.value() .value()
.attr("src") .attr("src")
.and_then(|href| url.join(&href).ok()) .and_then(|href| url.join(&href).ok())
{ {
if self.add_icon(href, IconKind::SiteLogo, None).is_ok() { self.add_icon(href, IconKind::SiteLogo, None);
break; break;
}; };
}; }
} }
for element_ref in document.select(selector!("link[rel='manifest']")) { for elem_ref in document.select(selector!("link[rel='manifest']")) {
if let Some(href) = element_ref if let Some(href) = elem_ref
.value() .value()
.attr("href") .attr("href")
.and_then(|href| url.join(&href).ok()) .and_then(|href| url.join(&href).ok())
{ {
self.load_manifest(href).await?; warn_err!(self.load_manifest(href).await, "failed to fetch manifest");
} }
} }
@ -185,9 +242,7 @@ impl Icons {
Ok(()) Ok(())
} }
/// Fetch all the icons and return a list of them. /// Fetch all the icons. Ordered from highest to lowest resolution
///
/// List is ordered from highest resolution to lowest resolution
/// ///
/// ``` /// ```
/// # async fn run() { /// # async fn run() {

View file

@ -9,6 +9,7 @@ mod icon_info;
mod icon_size; mod icon_size;
mod icons; mod icons;
mod macros; mod macros;
mod utils;
pub use icon::*; pub use icon::*;
pub use icon_info::*; pub use icon_info::*;

View file

@ -12,3 +12,20 @@ macro_rules! join {
concat!($first$(, $($pattern, $rest),*)?) concat!($first$(, $($pattern, $rest),*)?)
}; };
} }
/// Expands to a `&'static regex::Regex` for the given pattern literal.
///
/// Each expansion site owns its own `static` cell, so the pattern is compiled
/// by whichever call reaches that site first and reused afterwards.
///
/// # Panics
/// Panics at first use if the literal is not a valid regular expression.
#[macro_export]
macro_rules! regex {
  ($pattern:literal $(,)?) => {{
    static COMPILED: once_cell::sync::OnceCell<regex::Regex> = once_cell::sync::OnceCell::new();
    // Only invoked when the cell is still empty.
    let compile = || regex::Regex::new($pattern).unwrap();
    COMPILED.get_or_init(compile)
  }};
}
/// Emits a warning when `$result` is an `Err`; does nothing for `Ok`.
///
/// The remaining arguments are a format string plus its arguments, as accepted
/// by `format!`. The logged line is that message, a single space, then the
/// error's `Display` output.
///
/// The expansion calls `warn!` unqualified, so a `warn!` macro must be in scope
/// wherever this macro is invoked.
#[macro_export]
macro_rules! warn_err {
  ($result:expr, $($arg:tt)*) => {{
    if let Err(err) = $result {
      // `format_args!` passes the message through without first building an
      // intermediate `String`; the logged text is unchanged.
      warn!("{} {}", format_args!($($arg)*), err);
    }
  }};
}

38
src/utils.rs Normal file
View file

@ -0,0 +1,38 @@
use crate::regex;
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
/// Bytes that `encode_svg` percent-encodes when embedding SVG markup in a
/// `data:` URI: everything in the `CONTROLS` base set plus the characters
/// added below.
const DATA_URI: &AsciiSet = &CONTROLS
  // line breaks
  .add(b'\n')
  .add(b'\r')
  // characters with a meaning of their own inside a URL
  .add(b'#')
  .add(b'%')
  .add(b'?')
  // bracket pairs
  .add(b'(')
  .add(b')')
  .add(b'<')
  .add(b'>')
  .add(b'[')
  .add(b']')
  .add(b'{')
  .add(b'}')
  // remaining punctuation
  .add(b'\\')
  .add(b'^')
  .add(b'`')
  .add(b'|');
/// Encodes SVG markup as a `data:image/svg+xml,…` URI string.
///
/// Steps, in order:
/// 1. If the markup does not mention the SVG namespace URL anywhere, an
///    `xmlns` attribute is injected into the first `<svg` occurrence.
/// 2. Whitespace between adjacent tags is dropped and any remaining run of
///    two or more whitespace characters is collapsed to a single space.
/// 3. Every byte in `DATA_URI` is percent-encoded.
///
/// Returns the encoded markup prefixed with `data:image/svg+xml,`.
pub fn encode_svg(svg: &str) -> String {
  // add namespace
  let encoded = if !svg.contains("http://www.w3.org/2000/svg") {
    regex!("<svg").replace(svg, "<svg xmlns='http://www.w3.org/2000/svg'")
  } else {
    svg.into()
  };

  // remove whitespace between tags
  //
  // The pattern must end at `<`. A trailing `/g` (left over from a JavaScript
  // `/…/g` regex literal) would be matched literally, so the replacement
  // would delete the `/g` of a closing `</g>` tag and corrupt the markup.
  let encoded = regex!(r">\s+<").replace_all(&encoded, "><");
  let encoded = regex!(r"\s{2,}").replace_all(&encoded, " ");

  let encoded = utf8_percent_encode(&encoded, DATA_URI);

  format!("data:image/svg+xml,{}", encoded)
}