site-icons binary + svg scraping
This commit is contained in:
parent
2cd92eb65e
commit
2c485208d3
9 changed files with 228 additions and 60 deletions
21
Cargo.lock
generated
21
Cargo.lock
generated
|
@ -312,6 +312,19 @@ dependencies = [
|
|||
"cfg-if 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f26ecb66b4bdca6c1409b40fb255eefc2bd4f6d135dab3c3124f80ffa2a9661e"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"humantime",
|
||||
"log",
|
||||
"regex",
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "error-chain"
|
||||
version = "0.12.4"
|
||||
|
@ -588,6 +601,12 @@ version = "0.3.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "0.14.2"
|
||||
|
@ -1525,12 +1544,14 @@ dependencies = [
|
|||
"byteorder",
|
||||
"clap",
|
||||
"data-url",
|
||||
"env_logger",
|
||||
"futures",
|
||||
"html5ever",
|
||||
"itertools",
|
||||
"log",
|
||||
"mime_4",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-utils",
|
||||
"regex",
|
||||
"reqwest-wasm",
|
||||
|
|
13
Cargo.toml
13
Cargo.toml
|
@ -4,8 +4,12 @@ version = "0.1.0"
|
|||
authors = ["Sam Denty <sam@samdenty.com>"]
|
||||
edition = "2018"
|
||||
license = "gpl-3.0"
|
||||
homepage = "https://github.com/samdenty/site_icons"
|
||||
repository = "https://github.com/samdenty/site_icons"
|
||||
description = "Website icon scraper with sizes, ordering, and WASM support"
|
||||
documentation = "https://docs.rs/site_icons"
|
||||
description = "Website icon scraper that fetches sizes (with WASM support)"
|
||||
keywords = ["favicon", "logo", "website", "scraper", "icons", "cli"]
|
||||
categories = ["command-line-utilities", "multimedia::images", "wasm"]
|
||||
|
||||
[package.metadata.wasm-pack.profile.release]
|
||||
wasm-opt = ["-Oz", "--enable-mutable-globals"]
|
||||
|
@ -13,13 +17,13 @@ wasm-opt = ["-Oz", "--enable-mutable-globals"]
|
|||
[lib]
|
||||
crate-type = ["cdylib", "rlib"]
|
||||
|
||||
|
||||
[dependencies]
|
||||
clap = "3.0.0-beta.2"
|
||||
itertools = "0.10.0"
|
||||
serde_with = "1.6.1"
|
||||
pin-utils = "0.1.0"
|
||||
html5ever = "0.25.1"
|
||||
percent-encoding = "2.1.0"
|
||||
url = { version = "2.2.0", features = ["serde"] }
|
||||
regex = "1"
|
||||
log = "0.4.14"
|
||||
|
@ -28,12 +32,13 @@ scraper = "0.12.0"
|
|||
tokio-futures-byteorder = { version = "0.2.0", features = ["futures"] }
|
||||
byteorder = "1.4.2"
|
||||
data-url = "0.1.0"
|
||||
mime_4 = "0.4.0-a.0"
|
||||
mime = { package = "mime_4", version = "0.4.0-a.0" }
|
||||
serde = { version = "1.0", features = ["derive", "rc"] }
|
||||
serde_json = "1.0"
|
||||
reqwest-wasm = { features = ["json", "cookies", "blocking", "stream"] }
|
||||
reqwest = { package = "reqwest-wasm", version = "0.11.0", features = ["json", "cookies", "blocking", "stream"] }
|
||||
futures = "0.3.12"
|
||||
wee_alloc = { version = "0.4.2", optional = true }
|
||||
|
||||
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||
tokio = { version = "1.1.0", features = ["full"] }
|
||||
env_logger = "0.8.2"
|
||||
|
|
34
README.md
34
README.md
|
@ -1,6 +1,30 @@
|
|||
# site_icons
|
||||
|
||||
An efficient website icon scraper for rust
|
||||
[![Crates.io](https://img.shields.io/crates/v/site_icons.svg)](https://crates.io/crates/site_icons)
|
||||
[![Documentation](https://docs.rs/site_icons/badge.svg)](https://docs.rs/site_icons/)
|
||||
![GitHub Sponsors](https://img.shields.io/github/sponsors/samdenty?style=social)
|
||||
|
||||
An efficient website icon scraper for rust or command line usage.
|
||||
|
||||
## Features
|
||||
|
||||
- Ensures all URLs point to valid images
|
||||
- Determines icon size by partially fetching images
|
||||
- Supports WASM (and cloudflare workers)
|
||||
|
||||
### Command line usage
|
||||
|
||||
```bash
|
||||
cargo install site_icons
|
||||
|
||||
site-icons https://github.com
|
||||
# https://github.githubassets.com/favicons/favicon.svg site_favicon svg
|
||||
# https://github.githubassets.com/app-icon-512.png app_icon png 512x512
|
||||
# https://github.githubassets.com/app-icon-192.png app_icon png 192x192
|
||||
# https://github.githubassets.com/apple-touch-icon-180x180.png app_icon png 180x180
|
||||
```
|
||||
|
||||
### API usage
|
||||
|
||||
```rust
|
||||
use site_icons::Icons;
|
||||
|
@ -11,17 +35,13 @@ icons.load_website("https://github.com").await?;
|
|||
|
||||
// fetch all icons, ensuring they exist & determining size
|
||||
let entries = icons.entries().await;
|
||||
|
||||
// entries are sorted from highest to lowest resolution
|
||||
for icon in entries {
|
||||
println!("{:?}", icon);
|
||||
}
|
||||
```
|
||||
|
||||
## Features
|
||||
|
||||
- Validates that all URLs exist and are actually images
|
||||
- Determines the size of the icon by partially fetching it
|
||||
- Supports WASM (and cloudflare workers)
|
||||
|
||||
### Sources
|
||||
|
||||
- HTML favicon tag (or looking for default `/favicon.ico`)
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
use clap::Clap;
|
||||
use env_logger::Builder;
|
||||
use log::LevelFilter;
|
||||
use site_icons::Icons;
|
||||
use std::error::Error;
|
||||
|
||||
|
@ -7,6 +9,9 @@ struct Opts {
|
|||
urls: Vec<String>,
|
||||
#[clap(long)]
|
||||
json: bool,
|
||||
#[clap(long)]
|
||||
/// Print out errors that occurred for skipped items
|
||||
debug: bool,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
|
@ -14,6 +19,12 @@ async fn main() -> Result<(), Box<dyn Error>> {
|
|||
let mut icons = Icons::new();
|
||||
let opts: Opts = Opts::parse();
|
||||
|
||||
if opts.debug {
|
||||
let mut builder = Builder::new();
|
||||
builder.filter_module("site_icons", LevelFilter::Info);
|
||||
builder.init();
|
||||
}
|
||||
|
||||
for url in opts.urls {
|
||||
icons.load_website(&url).await?;
|
||||
}
|
|
@ -3,7 +3,7 @@ use crate::assert_slice_eq;
|
|||
use byteorder::BigEndian;
|
||||
use futures::prelude::*;
|
||||
use std::{error::Error, io::Cursor};
|
||||
use tokio_byteorder::AsyncReadBytesExt;
|
||||
use tokio_futures_byteorder::AsyncReadBytesExt;
|
||||
|
||||
pub async fn get_jpeg_size<R: AsyncRead + Unpin>(
|
||||
reader: &mut R,
|
||||
|
|
151
src/icons.rs
151
src/icons.rs
|
@ -1,4 +1,4 @@
|
|||
use crate::{selector, Icon, IconInfo, IconKind, CLIENT};
|
||||
use crate::{selector, utils::encode_svg, warn_err, Icon, IconInfo, IconKind, CLIENT};
|
||||
use future::join_all;
|
||||
use futures::StreamExt;
|
||||
use futures::{prelude::*, task::noop_waker};
|
||||
|
@ -7,7 +7,7 @@ use html5ever::{
|
|||
tendril::{Tendril, TendrilSink},
|
||||
};
|
||||
use reqwest::{header::*, IntoUrl};
|
||||
use scraper::Html;
|
||||
use scraper::{ElementRef, Html};
|
||||
use serde::Deserialize;
|
||||
use std::task::Poll;
|
||||
use std::{collections::HashMap, error::Error, pin::Pin, task::Context};
|
||||
|
@ -32,9 +32,7 @@ fn add_icon_entry(
|
|||
) {
|
||||
match info {
|
||||
Ok(info) => entries.push(Icon { url, kind, info }),
|
||||
Err(e) => {
|
||||
warn!("failed to parse icon: {}", e);
|
||||
}
|
||||
Err(_) => warn_err!(info, "failed to parse icon"),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -47,12 +45,7 @@ impl Icons {
|
|||
}
|
||||
|
||||
/// Add an icon URL and start fetching it
|
||||
pub fn add_icon(
|
||||
&mut self,
|
||||
url: Url,
|
||||
kind: IconKind,
|
||||
sizes: Option<String>,
|
||||
) -> Result<(), Box<dyn Error>> {
|
||||
pub fn add_icon(&mut self, url: Url, kind: IconKind, sizes: Option<String>) {
|
||||
// check to see if it already exists
|
||||
let mut entries = self.entries.iter_mut();
|
||||
if let Some(existing_kind) = self
|
||||
|
@ -65,7 +58,7 @@ impl Icons {
|
|||
if &kind > existing_kind {
|
||||
*existing_kind = kind;
|
||||
}
|
||||
return Ok(());
|
||||
return;
|
||||
}
|
||||
|
||||
let mut info = Box::pin(IconInfo::get(url.clone(), sizes));
|
||||
|
@ -79,8 +72,6 @@ impl Icons {
|
|||
self.pending_entries.insert(url, (kind, info));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn load_website<U: IntoUrl>(&mut self, url: U) -> Result<(), Box<dyn Error>> {
|
||||
|
@ -90,65 +81,131 @@ impl Icons {
|
|||
|
||||
let mut parser = driver::parse_document(Html::new_document(), Default::default());
|
||||
while let Some(data) = body.next().await {
|
||||
let tendril = Tendril::try_from_byte_slice(&data?).map_err(|_| "failed to parse html")?;
|
||||
parser.process(tendril);
|
||||
if let Ok(data) = Tendril::try_from_byte_slice(&data?) {
|
||||
parser.process(data)
|
||||
}
|
||||
}
|
||||
let document = parser.finish();
|
||||
|
||||
{
|
||||
let mut found_favicon = false;
|
||||
|
||||
for element_ref in document.select(selector!(
|
||||
for elem_ref in document.select(selector!(
|
||||
"link[rel='icon']",
|
||||
"link[rel='shortcut icon']",
|
||||
"link[rel='apple-touch-icon']",
|
||||
"link[rel='apple-touch-icon-precomposed']"
|
||||
)) {
|
||||
let elem = element_ref.value();
|
||||
let elem = elem_ref.value();
|
||||
if let Some(href) = elem.attr("href").and_then(|href| url.join(&href).ok()) {
|
||||
if self
|
||||
.add_icon(
|
||||
href,
|
||||
IconKind::SiteFavicon,
|
||||
elem.attr("sizes").map(|sizes| sizes.into()),
|
||||
)
|
||||
.is_ok()
|
||||
{
|
||||
found_favicon = true;
|
||||
};
|
||||
self.add_icon(
|
||||
href,
|
||||
IconKind::SiteFavicon,
|
||||
elem.attr("sizes").map(|sizes| sizes.into()),
|
||||
);
|
||||
|
||||
found_favicon = true;
|
||||
};
|
||||
}
|
||||
|
||||
// Check for default favicon.ico
|
||||
if !found_favicon {
|
||||
self.add_icon(url.join("/favicon.ico")?, IconKind::SiteFavicon, None)?;
|
||||
self.add_icon(
|
||||
url.join("/favicon.ico").unwrap(),
|
||||
IconKind::SiteFavicon,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
for element_ref in document.select(selector!(
|
||||
"header img",
|
||||
"img[src*=logo]",
|
||||
"img[alt*=logo]",
|
||||
"img[class*=logo]"
|
||||
)) {
|
||||
if let Some(href) = element_ref
|
||||
.value()
|
||||
.attr("src")
|
||||
.and_then(|href| url.join(&href).ok())
|
||||
{
|
||||
if self.add_icon(href, IconKind::SiteLogo, None).is_ok() {
|
||||
{
|
||||
let mut logos: Vec<_> = document
|
||||
.select(selector!(
|
||||
"header img, header svg",
|
||||
"img[src*=logo]",
|
||||
"img[alt*=logo], svg[alt*=logo]",
|
||||
"img[class*=logo], svg[class*=logo]",
|
||||
))
|
||||
.map(|elem_ref| {
|
||||
let elem = elem_ref.value();
|
||||
let mut weight = 0;
|
||||
|
||||
// if in the header
|
||||
if elem_ref
|
||||
.ancestors()
|
||||
.map(ElementRef::wrap)
|
||||
.flatten()
|
||||
.any(|element| element.value().name() == "header")
|
||||
{
|
||||
weight += 2;
|
||||
}
|
||||
|
||||
let mentions_logo = |attr_name| {
|
||||
elem
|
||||
.attr(attr_name)
|
||||
.map(|attr| attr.to_lowercase().contains("logo"))
|
||||
.unwrap_or(false)
|
||||
};
|
||||
if mentions_logo("class") || mentions_logo("id") {
|
||||
weight += 3;
|
||||
}
|
||||
if mentions_logo("alt") {
|
||||
weight += 2;
|
||||
}
|
||||
if mentions_logo("src") {
|
||||
weight += 1;
|
||||
}
|
||||
|
||||
(elem_ref, weight)
|
||||
})
|
||||
.collect();
|
||||
|
||||
logos.sort_by(|(_, a_weight), (_, b_weight)| b_weight.cmp(a_weight));
|
||||
|
||||
// prefer <img> over svg
|
||||
let mut prev_weight = None;
|
||||
for (i, (logo, weight)) in logos.iter().enumerate() {
|
||||
if let Some(prev_weight) = prev_weight {
|
||||
if weight != prev_weight {
|
||||
break;
|
||||
}
|
||||
}
|
||||
prev_weight = Some(weight);
|
||||
|
||||
if logo.value().name() == "img" {
|
||||
let (logo, weight) = logos.remove(i);
|
||||
logos.insert(0, (logo, weight + 1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (elem_ref, _) in logos {
|
||||
let elem = elem_ref.value();
|
||||
|
||||
if elem.name() == "svg" {
|
||||
let data_uri = Url::parse(&encode_svg(&elem_ref.html())).unwrap();
|
||||
self.add_icon(data_uri, IconKind::SiteLogo, None);
|
||||
break;
|
||||
}
|
||||
|
||||
if let Some(href) = elem_ref
|
||||
.value()
|
||||
.attr("src")
|
||||
.and_then(|href| url.join(&href).ok())
|
||||
{
|
||||
self.add_icon(href, IconKind::SiteLogo, None);
|
||||
break;
|
||||
};
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
for element_ref in document.select(selector!("link[rel='manifest']")) {
|
||||
if let Some(href) = element_ref
|
||||
for elem_ref in document.select(selector!("link[rel='manifest']")) {
|
||||
if let Some(href) = elem_ref
|
||||
.value()
|
||||
.attr("href")
|
||||
.and_then(|href| url.join(&href).ok())
|
||||
{
|
||||
self.load_manifest(href).await?;
|
||||
warn_err!(self.load_manifest(href).await, "failed to fetch manifest");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -185,9 +242,7 @@ impl Icons {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetch all the icons and return a list of them.
|
||||
///
|
||||
/// List is ordered from highest resolution to lowest resolution
|
||||
/// Fetch all the icons. Ordered from highest to lowest resolution
|
||||
///
|
||||
/// ```
|
||||
/// # async fn run() {
|
||||
|
|
|
@ -9,6 +9,7 @@ mod icon_info;
|
|||
mod icon_size;
|
||||
mod icons;
|
||||
mod macros;
|
||||
mod utils;
|
||||
|
||||
pub use icon::*;
|
||||
pub use icon_info::*;
|
||||
|
|
|
@ -12,3 +12,20 @@ macro_rules! join {
|
|||
concat!($first$(, $($pattern, $rest),*)?)
|
||||
};
|
||||
}
|
||||
|
||||
/// Evaluates to a reference to a `regex::Regex` built from the literal
/// pattern `$re` (a trailing comma is accepted).
///
/// Each expansion owns its own `static` cell, so the pattern is compiled the
/// first time that call site is reached and the same compiled value is handed
/// back on later calls.
///
/// Panics on first use if the pattern is not a valid regular expression.
#[macro_export]
macro_rules! regex {
    ($re:literal $(,)?) => {{
        static CACHE: once_cell::sync::OnceCell<regex::Regex> = once_cell::sync::OnceCell::new();
        // only invoked when the cell is still empty
        let compile = || regex::Regex::new($re).unwrap();
        CACHE.get_or_init(compile)
    }};
}
|
||||
|
||||
/// Emits a warning if `$result` is an `Err`; does nothing for `Ok`.
///
/// The tokens after the result are a `format!`-style message. The logged
/// line is that message, a single space, and then the error rendered with
/// `{}`. The result expression is evaluated exactly once.
///
/// The expansion invokes `warn!` unqualified, so a `warn!` macro must be in
/// scope wherever `warn_err!` is used.
#[macro_export]
macro_rules! warn_err {
    ($result:expr, $($arg:tt)*) => {{
        if let Err(err) = $result {
            // `format_args!` produces the same text as `format!` without
            // allocating an intermediate `String` for the message prefix
            warn!("{} {}", format_args!($($arg)*), err);
        }
    }};
}
|
||||
|
|
38
src/utils.rs
Normal file
38
src/utils.rs
Normal file
|
@ -0,0 +1,38 @@
|
|||
use crate::regex;
|
||||
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
|
||||
|
||||
const DATA_URI: &AsciiSet = &CONTROLS
|
||||
.add(b'\r')
|
||||
.add(b'\n')
|
||||
.add(b'%')
|
||||
.add(b'#')
|
||||
.add(b'(')
|
||||
.add(b')')
|
||||
.add(b'<')
|
||||
.add(b'>')
|
||||
.add(b'?')
|
||||
.add(b'[')
|
||||
.add(b'\\')
|
||||
.add(b']')
|
||||
.add(b'^')
|
||||
.add(b'`')
|
||||
.add(b'{')
|
||||
.add(b'|')
|
||||
.add(b'}');
|
||||
|
||||
pub fn encode_svg(svg: &str) -> String {
|
||||
// add namespace
|
||||
let encoded = if !svg.contains("http://www.w3.org/2000/svg") {
|
||||
regex!("<svg").replace(svg, "<svg xmlns='http://www.w3.org/2000/svg'")
|
||||
} else {
|
||||
svg.into()
|
||||
};
|
||||
|
||||
// remove whitespace
|
||||
let encoded = regex!(r">\s{1,}</g").replace_all(&encoded, "><");
|
||||
let encoded = regex!(r"\s{2,}").replace_all(&encoded, " ");
|
||||
|
||||
let encoded = utf8_percent_encode(&encoded, DATA_URI);
|
||||
|
||||
format!("data:image/svg+xml,{}", encoded)
|
||||
}
|
Reference in a new issue