site-icons binary + svg scraping

This commit is contained in:
Sam Denty 2021-01-29 16:26:04 +00:00
parent 2cd92eb65e
commit 2c485208d3
No known key found for this signature in database
GPG key ID: F3E9308D4A43BC0E
9 changed files with 228 additions and 60 deletions

21
Cargo.lock generated
View file

@ -312,6 +312,19 @@ dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "env_logger"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26ecb66b4bdca6c1409b40fb255eefc2bd4f6d135dab3c3124f80ffa2a9661e"
dependencies = [
"atty",
"humantime",
"log",
"regex",
"termcolor",
]
[[package]]
name = "error-chain"
version = "0.12.4"
@ -588,6 +601,12 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "hyper"
version = "0.14.2"
@ -1525,12 +1544,14 @@ dependencies = [
"byteorder",
"clap",
"data-url",
"env_logger",
"futures",
"html5ever",
"itertools",
"log",
"mime_4",
"once_cell",
"percent-encoding",
"pin-utils",
"regex",
"reqwest-wasm",

View file

@ -4,8 +4,12 @@ version = "0.1.0"
authors = ["Sam Denty <sam@samdenty.com>"]
edition = "2018"
license = "gpl-3.0"
homepage = "https://github.com/samdenty/site_icons"
repository = "https://github.com/samdenty/site_icons"
description = "Website icon scraper with sizes, ordering, and WASM support"
documentation = "https://docs.rs/site_icons"
description = "Website icon scraper that fetches sizes (with WASM support)"
keywords = ["favicon", "logo", "website", "scraper", "icons", "cli"]
categories = ["command-line-utilities", "multimedia::images", "wasm"]
[package.metadata.wasm-pack.profile.release]
wasm-opt = ["-Oz", "--enable-mutable-globals"]
@ -13,13 +17,13 @@ wasm-opt = ["-Oz", "--enable-mutable-globals"]
[lib]
crate-type = ["cdylib", "rlib"]
[dependencies]
clap = "3.0.0-beta.2"
itertools = "0.10.0"
serde_with = "1.6.1"
pin-utils = "0.1.0"
html5ever = "0.25.1"
percent-encoding = "2.1.0"
url = { version = "2.2.0", features = ["serde"] }
regex = "1"
log = "0.4.14"
@ -28,12 +32,13 @@ scraper = "0.12.0"
tokio-futures-byteorder = { version = "0.2.0", features = ["futures"] }
byteorder = "1.4.2"
data-url = "0.1.0"
mime_4 = "0.4.0-a.0"
mime = { package = "mime_4", version = "0.4.0-a.0" }
serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1.0"
reqwest-wasm = { features = ["json", "cookies", "blocking", "stream"] }
reqwest = { package = "reqwest-wasm", version = "0.11.0", features = ["json", "cookies", "blocking", "stream"] }
futures = "0.3.12"
wee_alloc = { version = "0.4.2", optional = true }
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
tokio = { version = "1.1.0", features = ["full"] }
env_logger = "0.8.2"

View file

@ -1,6 +1,30 @@
# site_icons
An efficient website icon scraper for rust
[![Crates.io](https://img.shields.io/crates/v/site_icons.svg)](https://crates.io/crates/site_icons)
[![Documentation](https://docs.rs/site_icons/badge.svg)](https://docs.rs/site_icons/)
![GitHub Sponsors](https://img.shields.io/github/sponsors/samdenty?style=social)
An efficient website icon scraper for rust or command line usage.
## Features
- Ensures all URLs point to valid images
- Determines icon size by partially fetching images
- Supports WASM (and cloudflare workers)
### Command line usage
```bash
cargo install site_icons
site-icons https://github.com
# https://github.githubassets.com/favicons/favicon.svg site_favicon svg
# https://github.githubassets.com/app-icon-512.png app_icon png 512x512
# https://github.githubassets.com/app-icon-192.png app_icon png 192x192
# https://github.githubassets.com/apple-touch-icon-180x180.png app_icon png 180x180
```
### API usage
```rust
use site_icons::Icons;
@ -11,17 +35,13 @@ icons.load_website("https://github.com").await?;
// fetch all icons, ensuring they exist & determining size
let entries = icons.entries().await;
// entries are sorted from highest to lowest resolution
for icon in entries {
println!("{:?}", icon);
}
```
## Features
- Validates that all URLs exist and are actually images
- Determines the size of the icon by partially fetching it
- Supports WASM (and cloudflare workers)
### Sources
- HTML favicon tag (or looking for default `/favicon.ico`)

View file

@ -1,4 +1,6 @@
use clap::Clap;
use env_logger::Builder;
use log::LevelFilter;
use site_icons::Icons;
use std::error::Error;
@ -7,6 +9,9 @@ struct Opts {
urls: Vec<String>,
#[clap(long)]
json: bool,
#[clap(long)]
/// Print out errors that occurred for skipped items
debug: bool,
}
#[tokio::main]
@ -14,6 +19,12 @@ async fn main() -> Result<(), Box<dyn Error>> {
let mut icons = Icons::new();
let opts: Opts = Opts::parse();
if opts.debug {
let mut builder = Builder::new();
builder.filter_module("site_icons", LevelFilter::Info);
builder.init();
}
for url in opts.urls {
icons.load_website(&url).await?;
}

View file

@ -3,7 +3,7 @@ use crate::assert_slice_eq;
use byteorder::BigEndian;
use futures::prelude::*;
use std::{error::Error, io::Cursor};
use tokio_byteorder::AsyncReadBytesExt;
use tokio_futures_byteorder::AsyncReadBytesExt;
pub async fn get_jpeg_size<R: AsyncRead + Unpin>(
reader: &mut R,

View file

@ -1,4 +1,4 @@
use crate::{selector, Icon, IconInfo, IconKind, CLIENT};
use crate::{selector, utils::encode_svg, warn_err, Icon, IconInfo, IconKind, CLIENT};
use future::join_all;
use futures::StreamExt;
use futures::{prelude::*, task::noop_waker};
@ -7,7 +7,7 @@ use html5ever::{
tendril::{Tendril, TendrilSink},
};
use reqwest::{header::*, IntoUrl};
use scraper::Html;
use scraper::{ElementRef, Html};
use serde::Deserialize;
use std::task::Poll;
use std::{collections::HashMap, error::Error, pin::Pin, task::Context};
@ -32,9 +32,7 @@ fn add_icon_entry(
) {
match info {
Ok(info) => entries.push(Icon { url, kind, info }),
Err(e) => {
warn!("failed to parse icon: {}", e);
}
Err(_) => warn_err!(info, "failed to parse icon"),
}
}
@ -47,12 +45,7 @@ impl Icons {
}
/// Add an icon URL and start fetching it
pub fn add_icon(
&mut self,
url: Url,
kind: IconKind,
sizes: Option<String>,
) -> Result<(), Box<dyn Error>> {
pub fn add_icon(&mut self, url: Url, kind: IconKind, sizes: Option<String>) {
// check to see if it already exists
let mut entries = self.entries.iter_mut();
if let Some(existing_kind) = self
@ -65,7 +58,7 @@ impl Icons {
if &kind > existing_kind {
*existing_kind = kind;
}
return Ok(());
return;
}
let mut info = Box::pin(IconInfo::get(url.clone(), sizes));
@ -79,8 +72,6 @@ impl Icons {
self.pending_entries.insert(url, (kind, info));
}
};
Ok(())
}
pub async fn load_website<U: IntoUrl>(&mut self, url: U) -> Result<(), Box<dyn Error>> {
@ -90,65 +81,131 @@ impl Icons {
let mut parser = driver::parse_document(Html::new_document(), Default::default());
while let Some(data) = body.next().await {
let tendril = Tendril::try_from_byte_slice(&data?).map_err(|_| "failed to parse html")?;
parser.process(tendril);
if let Ok(data) = Tendril::try_from_byte_slice(&data?) {
parser.process(data)
}
}
let document = parser.finish();
{
let mut found_favicon = false;
for element_ref in document.select(selector!(
for elem_ref in document.select(selector!(
"link[rel='icon']",
"link[rel='shortcut icon']",
"link[rel='apple-touch-icon']",
"link[rel='apple-touch-icon-precomposed']"
)) {
let elem = element_ref.value();
let elem = elem_ref.value();
if let Some(href) = elem.attr("href").and_then(|href| url.join(&href).ok()) {
if self
.add_icon(
self.add_icon(
href,
IconKind::SiteFavicon,
elem.attr("sizes").map(|sizes| sizes.into()),
)
.is_ok()
{
);
found_favicon = true;
};
};
}
// Check for default favicon.ico
if !found_favicon {
self.add_icon(url.join("/favicon.ico")?, IconKind::SiteFavicon, None)?;
self.add_icon(
url.join("/favicon.ico").unwrap(),
IconKind::SiteFavicon,
None,
);
}
}
for element_ref in document.select(selector!(
"header img",
{
let mut logos: Vec<_> = document
.select(selector!(
"header img, header svg",
"img[src*=logo]",
"img[alt*=logo]",
"img[class*=logo]"
)) {
if let Some(href) = element_ref
"img[alt*=logo], svg[alt*=logo]",
"img[class*=logo], svg[class*=logo]",
))
.map(|elem_ref| {
let elem = elem_ref.value();
let mut weight = 0;
// if in the header
if elem_ref
.ancestors()
.map(ElementRef::wrap)
.flatten()
.any(|element| element.value().name() == "header")
{
weight += 2;
}
let mentions_logo = |attr_name| {
elem
.attr(attr_name)
.map(|attr| attr.to_lowercase().contains("logo"))
.unwrap_or(false)
};
if mentions_logo("class") || mentions_logo("id") {
weight += 3;
}
if mentions_logo("alt") {
weight += 2;
}
if mentions_logo("src") {
weight += 1;
}
(elem_ref, weight)
})
.collect();
logos.sort_by(|(_, a_weight), (_, b_weight)| b_weight.cmp(a_weight));
// prefer <img> over svg
let mut prev_weight = None;
for (i, (logo, weight)) in logos.iter().enumerate() {
if let Some(prev_weight) = prev_weight {
if weight != prev_weight {
break;
}
}
prev_weight = Some(weight);
if logo.value().name() == "img" {
let (logo, weight) = logos.remove(i);
logos.insert(0, (logo, weight + 1));
break;
}
}
for (elem_ref, _) in logos {
let elem = elem_ref.value();
if elem.name() == "svg" {
let data_uri = Url::parse(&encode_svg(&elem_ref.html())).unwrap();
self.add_icon(data_uri, IconKind::SiteLogo, None);
break;
}
if let Some(href) = elem_ref
.value()
.attr("src")
.and_then(|href| url.join(&href).ok())
{
if self.add_icon(href, IconKind::SiteLogo, None).is_ok() {
self.add_icon(href, IconKind::SiteLogo, None);
break;
};
};
}
}
for element_ref in document.select(selector!("link[rel='manifest']")) {
if let Some(href) = element_ref
for elem_ref in document.select(selector!("link[rel='manifest']")) {
if let Some(href) = elem_ref
.value()
.attr("href")
.and_then(|href| url.join(&href).ok())
{
self.load_manifest(href).await?;
warn_err!(self.load_manifest(href).await, "failed to fetch manifest");
}
}
@ -185,9 +242,7 @@ impl Icons {
Ok(())
}
/// Fetch all the icons and return a list of them.
///
/// List is ordered from highest resolution to lowest resolution
/// Fetch all the icons. Ordered from highest to lowest resolution
///
/// ```
/// # async fn run() {

View file

@ -9,6 +9,7 @@ mod icon_info;
mod icon_size;
mod icons;
mod macros;
mod utils;
pub use icon::*;
pub use icon_info::*;

View file

@ -12,3 +12,20 @@ macro_rules! join {
concat!($first$(, $($pattern, $rest),*)?)
};
}
/// Expands to a `&'static regex::Regex` built from the given pattern literal.
///
/// Each expansion site owns its own static cell, so the pattern is compiled
/// the first time that site runs and reused afterwards. Compilation failure
/// panics via `unwrap`.
#[macro_export]
macro_rules! regex {
    ($pattern:literal $(,)?) => {{
        static COMPILED: once_cell::sync::OnceCell<regex::Regex> =
            once_cell::sync::OnceCell::new();

        // Initializer kept as a plain function so the cell is filled lazily.
        fn compile() -> regex::Regex {
            regex::Regex::new($pattern).unwrap()
        }

        COMPILED.get_or_init(compile)
    }};
}
/// Logs a warning when `$result` is an `Err`, and does nothing for `Ok`.
///
/// The remaining arguments are a `format!`-style message; the logged line is
/// that message, a single space, then the error's `Display` output. The
/// `warn!` macro must be in scope where this macro is invoked.
#[macro_export]
macro_rules! warn_err {
    ($result:expr, $($arg:tt)*) => {{
        match $result {
            Err(err) => {
                warn!("{} {}", format!($($arg)*), err);
            }
            Ok(_) => {}
        }
    }};
}

38
src/utils.rs Normal file
View file

@ -0,0 +1,38 @@
use crate::regex;
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
/// Bytes that `encode_svg` percent-encodes when building a data URI:
/// everything in `CONTROLS` plus the punctuation listed below
/// (kept in ascending ASCII order).
const DATA_URI: &AsciiSet = &CONTROLS
    // line breaks
    .add(b'\n')
    .add(b'\r')
    // URL-significant characters
    .add(b'#')
    .add(b'%')
    .add(b'(')
    .add(b')')
    .add(b'<')
    .add(b'>')
    .add(b'?')
    // brackets and other punctuation
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'`')
    .add(b'{')
    .add(b'|')
    .add(b'}');
/// Encodes SVG markup as a `data:image/svg+xml,` URI string.
///
/// Steps, in order:
/// 1. If the markup does not mention the SVG namespace URL, an `xmlns`
///    attribute is injected into the first `<svg` occurrence.
/// 2. Whitespace between adjacent tags (`>   <`) is removed and any other
///    run of two or more whitespace characters collapses to a single space.
/// 3. The result is percent-encoded with the `DATA_URI` set and prefixed
///    with `data:image/svg+xml,`.
pub fn encode_svg(svg: &str) -> String {
    // add namespace (plain string replacement; the needle is a literal)
    let encoded: std::borrow::Cow<'_, str> = if svg.contains("http://www.w3.org/2000/svg") {
        svg.into()
    } else {
        svg.replacen("<svg", "<svg xmlns='http://www.w3.org/2000/svg'", 1)
            .into()
    };

    // remove whitespace between tags
    // NOTE: the previous pattern `>\s{1,}</g` was a JavaScript `/…/g` literal
    // copied verbatim; in Rust the `/g` is literal text, so it only matched
    // before `</g…` and the replacement destroyed the closing `</g>` tag.
    let encoded = regex!(r">\s+<").replace_all(&encoded, "><");
    let encoded = regex!(r"\s{2,}").replace_all(&encoded, " ");

    let encoded = utf8_percent_encode(&encoded, DATA_URI);

    format!("data:image/svg+xml,{}", encoded)
}