feat(0.6): performance
This commit is contained in:
parent
99ed10ff27
commit
34972db18b
24 changed files with 974 additions and 564 deletions
115
Cargo.lock
generated
115
Cargo.lock
generated
|
@ -31,6 +31,23 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "677d1d8ab452a3936018a687b20e6f7cf5363d713b732b8884001317b0e48aa3"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async_once"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2ce4f10ea3abcd6617873bae9f91d1c5332b4a778bd9ce34d0cd517474c1de82"
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
|
@ -78,6 +95,43 @@ version = "1.3.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dfb24e866b15a1af2a1b663f10c6b6b8f397a84aadb828f12e5b289ec23a3a3c"
|
||||
|
||||
[[package]]
|
||||
name = "cached"
|
||||
version = "0.41.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec6d20b3d24b6c74e2c5331d2d3d8d1976a9883c7da179aa851afa4c90d62e36"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"async_once",
|
||||
"cached_proc_macro",
|
||||
"cached_proc_macro_types",
|
||||
"futures",
|
||||
"hashbrown",
|
||||
"instant",
|
||||
"lazy_static",
|
||||
"once_cell",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cached_proc_macro"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "751f7f4e7a091545e7f6c65bacc404eaee7e87bfb1f9ece234a1caa173dc16f2"
|
||||
dependencies = [
|
||||
"cached_proc_macro_types",
|
||||
"darling 0.13.4",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cached_proc_macro_types"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a4f925191b4367301851c6d99b09890311d74b0d43f274c0b34c86d308a3663"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.77"
|
||||
|
@ -272,14 +326,38 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c"
|
||||
dependencies = [
|
||||
"darling_core 0.13.4",
|
||||
"darling_macro 0.13.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.14.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"darling_macro",
|
||||
"darling_core 0.14.2",
|
||||
"darling_macro 0.14.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_core"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -296,13 +374,24 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_macro"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835"
|
||||
dependencies = [
|
||||
"darling_core 0.13.4",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_macro"
|
||||
version = "0.14.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"darling_core 0.14.2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
@ -384,6 +473,17 @@ dependencies = [
|
|||
"instant",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flo_stream"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a7246db09b6a924fb11fedc1e33c34e6d5d0ba3c95a87cd2994f9581cf5a470"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"lazy_static",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
|
@ -762,6 +862,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1650,7 +1753,7 @@ version = "2.1.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"darling 0.14.2",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
|
@ -1683,12 +1786,14 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
|||
|
||||
[[package]]
|
||||
name = "site_icons"
|
||||
version = "0.5.0"
|
||||
version = "0.6.0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"cached",
|
||||
"clap",
|
||||
"data-url",
|
||||
"env_logger",
|
||||
"flo_stream",
|
||||
"futures",
|
||||
"html5ever",
|
||||
"itertools",
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "site_icons"
|
||||
version = "0.5.0"
|
||||
version = "0.6.0"
|
||||
authors = ["Sam Denty <sam@samdenty.com>"]
|
||||
edition = "2018"
|
||||
license = "GPL-3.0"
|
||||
|
@ -19,6 +19,7 @@ crate-type = ["cdylib", "rlib"]
|
|||
|
||||
[dependencies]
|
||||
vec1 = { version = "1.10.1", features = ["serde"] }
|
||||
flo_stream = "0.7"
|
||||
itertools = "0.10.5"
|
||||
serde_with = "2.1.0"
|
||||
html5ever = "0.26.0"
|
||||
|
@ -44,11 +45,16 @@ reqwest = { package = "reqwest-wasm", version = "0.11.16", features = [
|
|||
"blocking",
|
||||
"stream",
|
||||
] }
|
||||
cached = { version = "0.41.0", default_features = false, features = [
|
||||
"proc_macro",
|
||||
"wasm",
|
||||
] }
|
||||
|
||||
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||
clap = { version = "3.2.23", features = ["derive"] }
|
||||
tokio = { version = "1.22.0", features = ["full"] }
|
||||
env_logger = "0.9.3"
|
||||
cached = "0.41.0"
|
||||
reqwest = { version = "0.11.13", features = [
|
||||
"json",
|
||||
"cookies",
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
use clap::Parser;
|
||||
use env_logger::Builder;
|
||||
use log::LevelFilter;
|
||||
use site_icons::Icons;
|
||||
use site_icons::SiteIcons;
|
||||
use std::error::Error;
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Opts {
|
||||
urls: Vec<String>,
|
||||
url: String,
|
||||
|
||||
#[clap(long)]
|
||||
fast: bool,
|
||||
#[clap(long)]
|
||||
json: bool,
|
||||
#[clap(long)]
|
||||
|
@ -16,7 +19,7 @@ struct Opts {
|
|||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn Error>> {
|
||||
let mut icons = Icons::new();
|
||||
let mut icons = SiteIcons::new();
|
||||
let opts: Opts = Opts::parse();
|
||||
|
||||
if opts.debug {
|
||||
|
@ -25,11 +28,7 @@ async fn main() -> Result<(), Box<dyn Error>> {
|
|||
builder.init();
|
||||
}
|
||||
|
||||
for url in opts.urls {
|
||||
icons.load_website(&url).await?;
|
||||
}
|
||||
|
||||
let entries = icons.entries().await;
|
||||
let entries = icons.load_website(opts.url, opts.fast).await?;
|
||||
|
||||
if opts.json {
|
||||
println!("{}", serde_json::to_string_pretty(&entries)?)
|
||||
|
|
122
src/html_parser/head.rs
Normal file
122
src/html_parser/head.rs
Normal file
|
@ -0,0 +1,122 @@
|
|||
use crate::utils::poll_in_background;
|
||||
use crate::Icon;
|
||||
use crate::IconKind;
|
||||
use crate::SiteIcons;
|
||||
use futures::future::join_all;
|
||||
use futures::FutureExt;
|
||||
use futures::Stream;
|
||||
use futures::StreamExt;
|
||||
use lol_html::{element, errors::RewritingError, HtmlRewriter, Settings};
|
||||
use std::{
|
||||
cell::RefCell,
|
||||
error::Error,
|
||||
fmt::{self, Display},
|
||||
};
|
||||
use url::Url;
|
||||
|
||||
/// Sentinel error returned from the `<head>` end-tag handler so that the HTML
/// rewriter stops as soon as `</head>` has been reached.
#[derive(Debug)]
struct EndOfHead {}

impl Display for EndOfHead {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Give the sentinel a readable message so it never renders as an
        // empty string if it is logged or propagated.
        f.write_str("reached the end of <head>")
    }
}

impl Error for EndOfHead {}
|
||||
|
||||
pub async fn parse_head(
|
||||
url: &Url,
|
||||
mut body: impl Stream<Item = Result<Vec<u8>, String>> + Unpin,
|
||||
) -> Result<Vec<Icon>, Box<dyn Error>> {
|
||||
let mut icons = Vec::new();
|
||||
let new_icons = RefCell::new(Vec::new());
|
||||
|
||||
{
|
||||
let mut rewriter = HtmlRewriter::new(
|
||||
Settings {
|
||||
element_content_handlers: vec![
|
||||
element!("head", |head| {
|
||||
head.on_end_tag(|_| Err(Box::new(EndOfHead {})))?;
|
||||
Ok(())
|
||||
}),
|
||||
element!("link[rel='manifest']", |manifest| {
|
||||
if let Some(href) = manifest
|
||||
.get_attribute("href")
|
||||
.and_then(|href| url.join(&href).ok())
|
||||
{
|
||||
new_icons.borrow_mut().push(
|
||||
async { SiteIcons::load_manifest(href).await.unwrap_or(Vec::new()) }
|
||||
.boxed_local()
|
||||
.shared(),
|
||||
)
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}),
|
||||
element!(
|
||||
join_with!(
|
||||
",",
|
||||
"link[rel='icon']",
|
||||
"link[rel='shortcut icon']",
|
||||
"link[rel='apple-touch-icon']",
|
||||
"link[rel='apple-touch-icon-precomposed']"
|
||||
),
|
||||
|link| {
|
||||
let rel = link.get_attribute("rel").unwrap();
|
||||
|
||||
if let Some(href) = link
|
||||
.get_attribute("href")
|
||||
.and_then(|href| url.join(&href).ok())
|
||||
{
|
||||
let kind = if rel.contains("apple-touch-icon") {
|
||||
IconKind::AppIcon
|
||||
} else {
|
||||
IconKind::SiteFavicon
|
||||
};
|
||||
|
||||
let sizes = link.get_attribute("sizes");
|
||||
|
||||
new_icons.borrow_mut().push(
|
||||
async {
|
||||
Icon::load(href, kind, sizes)
|
||||
.await
|
||||
.map(|icon| vec![icon])
|
||||
.unwrap_or(Vec::new())
|
||||
}
|
||||
.boxed_local()
|
||||
.shared(),
|
||||
)
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
),
|
||||
],
|
||||
..Settings::default()
|
||||
},
|
||||
|_: &[u8]| {},
|
||||
);
|
||||
|
||||
while let Some(data) = poll_in_background(body.next(), join_all(icons.clone())).await {
|
||||
let result = rewriter.write(&data?);
|
||||
|
||||
icons.extend(new_icons.borrow_mut().drain(..));
|
||||
|
||||
match result {
|
||||
Err(RewritingError::ContentHandlerError(result)) => {
|
||||
match result.downcast::<EndOfHead>() {
|
||||
Ok(_) => break,
|
||||
Err(err) => return Err(err),
|
||||
};
|
||||
}
|
||||
|
||||
result => result?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let icons = join_all(icons).await.into_iter().flatten().collect();
|
||||
|
||||
Ok(icons)
|
||||
}
|
5
src/html_parser/mod.rs
Normal file
5
src/html_parser/mod.rs
Normal file
|
@ -0,0 +1,5 @@
|
|||
mod head;
|
||||
mod site_logo;
|
||||
|
||||
pub use head::*;
|
||||
pub use site_logo::*;
|
157
src/html_parser/site_logo.rs
Normal file
157
src/html_parser/site_logo.rs
Normal file
|
@ -0,0 +1,157 @@
|
|||
use crate::{utils::encode_svg, Icon, IconKind};
|
||||
use futures::{Stream, StreamExt};
|
||||
use html5ever::{
|
||||
driver,
|
||||
tendril::{Tendril, TendrilSink},
|
||||
};
|
||||
use scraper::{ElementRef, Html};
|
||||
use std::error::Error;
|
||||
use std::iter;
|
||||
use tldextract::TldOption;
|
||||
use url::Url;
|
||||
|
||||
pub async fn parse_site_logo(
|
||||
url: &Url,
|
||||
mut body: impl Stream<Item = Result<Vec<u8>, String>> + Unpin,
|
||||
is_blacklisted: impl Fn(&Url) -> bool,
|
||||
) -> Result<Icon, Box<dyn Error>> {
|
||||
let mut parser = driver::parse_document(Html::new_document(), Default::default());
|
||||
while let Some(data) = body.next().await {
|
||||
if let Ok(data) = Tendril::try_from_byte_slice(&data?) {
|
||||
parser.process(data)
|
||||
}
|
||||
}
|
||||
|
||||
let document = parser.finish();
|
||||
|
||||
let mut logos: Vec<_> = document
|
||||
.select(selector!(
|
||||
"a[href='/'] img, a[href='/'] svg",
|
||||
"header img, header svg",
|
||||
"img[src*=logo]",
|
||||
"img[alt*=logo], svg[alt*=logo]",
|
||||
"*[class*=logo] img, *[class*=logo] svg",
|
||||
"*[id*=logo] img, *[id*=logo] svg",
|
||||
"img[class*=logo], svg[class*=logo]",
|
||||
"img[id*=logo], svg[id*=logo]",
|
||||
))
|
||||
.enumerate()
|
||||
.filter_map(|(i, elem_ref)| {
|
||||
let elem = elem_ref.value();
|
||||
let ancestors = elem_ref
|
||||
.ancestors()
|
||||
.map(ElementRef::wrap)
|
||||
.flatten()
|
||||
.map(|elem_ref| elem_ref.value())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let skip_classnames = regex!("menu|search");
|
||||
let should_skip = ancestors.iter().any(|ancestor| {
|
||||
ancestor
|
||||
.attr("class")
|
||||
.map(|attr| skip_classnames.is_match(&attr.to_lowercase()))
|
||||
.or_else(|| {
|
||||
ancestor
|
||||
.attr("id")
|
||||
.map(|attr| skip_classnames.is_match(&attr.to_lowercase()))
|
||||
})
|
||||
.unwrap_or(false)
|
||||
});
|
||||
|
||||
if should_skip {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut weight = 0;
|
||||
|
||||
// if in the header
|
||||
if ancestors.iter().any(|element| element.name() == "header") {
|
||||
weight += 2;
|
||||
}
|
||||
|
||||
if i == 0 {
|
||||
weight += 1;
|
||||
}
|
||||
|
||||
let mentions = |attr_name, is_match: Box<dyn Fn(&str) -> bool>| {
|
||||
ancestors.iter().chain(iter::once(&elem)).any(|ancestor| {
|
||||
ancestor
|
||||
.attr(attr_name)
|
||||
.map(|attr| is_match(&attr.to_lowercase()))
|
||||
.unwrap_or(false)
|
||||
})
|
||||
};
|
||||
|
||||
if mentions("href", Box::new(|attr| attr == "/")) {
|
||||
weight += 5;
|
||||
};
|
||||
|
||||
let mentions_logo = |attr_name| {
|
||||
mentions(
|
||||
attr_name,
|
||||
Box::new(|attr| regex!("logo([^s]|$)").is_match(attr)),
|
||||
)
|
||||
};
|
||||
|
||||
if mentions_logo("class") || mentions_logo("id") {
|
||||
weight += 3;
|
||||
}
|
||||
if mentions_logo("alt") {
|
||||
weight += 2;
|
||||
}
|
||||
if mentions_logo("src") {
|
||||
weight += 1;
|
||||
}
|
||||
|
||||
if let Some(site_name) = url
|
||||
.domain()
|
||||
.and_then(|domain| TldOption::default().build().extract(domain).unwrap().domain)
|
||||
{
|
||||
// if the alt contains the site_name then highest priority
|
||||
if site_name
|
||||
.to_lowercase()
|
||||
.split('-')
|
||||
.any(|segment| mentions("alt", Box::new(move |attr| attr.contains(segment))))
|
||||
{
|
||||
weight += 10;
|
||||
}
|
||||
}
|
||||
|
||||
let href = if elem.name() == "svg" {
|
||||
Some(Url::parse(&encode_svg(&elem_ref.html())).unwrap())
|
||||
} else {
|
||||
elem.attr("src").and_then(|href| url.join(&href).ok())
|
||||
};
|
||||
|
||||
if let Some(href) = &href {
|
||||
if is_blacklisted(href) {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
href.map(|href| (href, elem_ref, weight))
|
||||
})
|
||||
.collect();
|
||||
|
||||
logos.sort_by(|(_, _, a_weight), (_, _, b_weight)| b_weight.cmp(a_weight));
|
||||
|
||||
// prefer <img> over svg
|
||||
let mut prev_weight = None;
|
||||
for (href, elem_ref, weight) in &logos {
|
||||
if let Some(prev_weight) = prev_weight {
|
||||
if weight != prev_weight {
|
||||
break;
|
||||
}
|
||||
}
|
||||
prev_weight = Some(weight);
|
||||
|
||||
if elem_ref.value().name() == "img" {
|
||||
return Icon::load(href.clone(), IconKind::SiteLogo, None).await;
|
||||
}
|
||||
}
|
||||
|
||||
match logos.into_iter().next() {
|
||||
Some((href, _, _)) => Icon::load(href.clone(), IconKind::SiteLogo, None).await,
|
||||
None => Err("No site logo found".into()),
|
||||
}
|
||||
}
|
61
src/icon.rs
61
src/icon.rs
|
@ -1,61 +0,0 @@
|
|||
use super::IconInfo;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{DeserializeFromStr, SerializeDisplay};
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
collections::HashMap,
|
||||
fmt::{self, Display},
|
||||
str::FromStr,
|
||||
};
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, SerializeDisplay, DeserializeFromStr)]
|
||||
pub enum IconKind {
|
||||
AppIcon,
|
||||
SiteLogo,
|
||||
SiteFavicon,
|
||||
}
|
||||
|
||||
impl Display for IconKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
f.write_str(match self {
|
||||
IconKind::SiteLogo => "site_logo",
|
||||
IconKind::AppIcon => "app_icon",
|
||||
IconKind::SiteFavicon => "site_favicon",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for IconKind {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(kind: &str) -> Result<Self, Self::Err> {
|
||||
match kind {
|
||||
"site_logo" => Ok(IconKind::SiteLogo),
|
||||
"app_icon" => Ok(IconKind::AppIcon),
|
||||
"site_favicon" => Ok(IconKind::SiteFavicon),
|
||||
_ => Err("unknown icon kind!".into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct Icon {
|
||||
pub url: Url,
|
||||
pub headers: HashMap<String, String>,
|
||||
pub kind: IconKind,
|
||||
#[serde(flatten)]
|
||||
pub info: IconInfo,
|
||||
}
|
||||
|
||||
impl Ord for Icon {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.info.cmp(&other.info)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Icon {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
|
@ -1,4 +1,5 @@
|
|||
use crate::{icon_size::*, CLIENT};
|
||||
use super::*;
|
||||
use crate::CLIENT;
|
||||
use data_url::DataUrl;
|
||||
use futures::{io::Cursor, prelude::*, stream::TryStreamExt};
|
||||
use mime::MediaType;
|
||||
|
@ -6,6 +7,7 @@ use reqwest::{header::*, Url};
|
|||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
convert::TryFrom,
|
||||
error::Error,
|
||||
fmt::{self, Display},
|
||||
io,
|
||||
|
@ -72,7 +74,7 @@ impl IconInfo {
|
|||
headers: HeaderMap,
|
||||
sizes: Option<String>,
|
||||
) -> Result<IconInfo, Box<dyn Error>> {
|
||||
let sizes = sizes.as_ref().and_then(|s| IconSizes::from_str(s).ok());
|
||||
let sizes = sizes.as_ref().and_then(|s| IconSizes::try_from(s).ok());
|
||||
|
||||
let (mime, mut body): (_, Box<dyn AsyncRead + Unpin>) = match url.scheme() {
|
||||
"data" => {
|
||||
|
@ -92,6 +94,43 @@ impl IconInfo {
|
|||
}
|
||||
|
||||
_ => {
|
||||
match &url.path().split('.').last().unwrap_or("").to_lowercase()[..] {
|
||||
"svg" => {
|
||||
if let Some(sizes) = sizes {
|
||||
return Ok(IconInfo::SVG {
|
||||
size: Some(*sizes.largest()),
|
||||
});
|
||||
}
|
||||
}
|
||||
"png" => {
|
||||
if let Some(sizes) = sizes {
|
||||
return Ok(IconInfo::PNG {
|
||||
size: *sizes.largest(),
|
||||
});
|
||||
}
|
||||
}
|
||||
"jpeg" | "jpg" => {
|
||||
if let Some(sizes) = sizes {
|
||||
return Ok(IconInfo::JPEG {
|
||||
size: *sizes.largest(),
|
||||
});
|
||||
}
|
||||
}
|
||||
"ico" => {
|
||||
if let Some(sizes) = sizes {
|
||||
return Ok(IconInfo::ICO { sizes });
|
||||
}
|
||||
}
|
||||
"gif" => {
|
||||
if let Some(sizes) = sizes {
|
||||
return Ok(IconInfo::GIF {
|
||||
size: *sizes.largest(),
|
||||
});
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
|
||||
let res = CLIENT.get(url).headers(headers).send().await?;
|
||||
if !res.status().is_success() {
|
||||
return Err("failed to fetch".into());
|
|
@ -22,7 +22,22 @@ impl Display for IconSizes {
|
|||
}
|
||||
|
||||
impl IconSizes {
|
||||
pub fn from_str(sizes_str: &str) -> Result<IconSizes, Box<dyn Error>> {
|
||||
pub fn add_size(&mut self, size: IconSize) {
|
||||
match self.0.binary_search(&size) {
|
||||
Ok(_) => {}
|
||||
Err(pos) => self.0.insert(pos, size),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn largest(&self) -> &IconSize {
|
||||
self.0.first()
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&str> for IconSizes {
|
||||
type Error = Box<dyn Error>;
|
||||
|
||||
fn try_from(sizes_str: &str) -> Result<Self, Self::Error> {
|
||||
let size_strs = sizes_str.split(" ");
|
||||
|
||||
let mut sizes = Vec::new();
|
||||
|
@ -34,16 +49,21 @@ impl IconSizes {
|
|||
|
||||
Ok(sizes.try_into()?)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_size(&mut self, size: IconSize) {
|
||||
match self.0.binary_search(&size) {
|
||||
Ok(_) => {}
|
||||
Err(pos) => self.0.insert(pos, size),
|
||||
}
|
||||
}
|
||||
impl TryFrom<&String> for IconSizes {
|
||||
type Error = Box<dyn Error>;
|
||||
|
||||
pub fn largest(&self) -> &IconSize {
|
||||
self.0.first()
|
||||
fn try_from(sizes_str: &String) -> Result<Self, Self::Error> {
|
||||
IconSizes::try_from(sizes_str.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<String> for IconSizes {
|
||||
type Error = Box<dyn Error>;
|
||||
|
||||
fn try_from(sizes_str: String) -> Result<Self, Self::Error> {
|
||||
IconSizes::try_from(sizes_str.as_str())
|
||||
}
|
||||
}
|
||||
|
68
src/icon/icon_size/svg.rs
Normal file
68
src/icon/icon_size/svg.rs
Normal file
|
@ -0,0 +1,68 @@
|
|||
use super::IconSize;
|
||||
use futures::prelude::*;
|
||||
use lol_html::{element, HtmlRewriter, Settings};
|
||||
use std::{cell::RefCell, error::Error};
|
||||
|
||||
/// Parses an SVG length such as `"24"`, `"24.5"` or `"24px"` into a whole
/// number of pixels, rounding to the nearest integer.
///
/// Surrounding whitespace and a trailing `px` unit are ignored. Returns
/// `None` when the value is not a plain number (e.g. `"100%"`, `"2em"`), or
/// when it is negative or not finite, since casting such values would produce
/// a meaningless size (`0` or `u32::MAX`).
fn parse_size<S: ToString>(size: S) -> Option<u32> {
    let text = size.to_string();
    let number = text.trim();
    let number = number.strip_suffix("px").unwrap_or(number).trim_end();

    number
        .parse::<f64>()
        .ok()
        // `"NaN"`, `"inf"` and negative numbers all parse successfully as f64.
        .filter(|value| value.is_finite() && *value >= 0.0)
        .map(|value| value.round() as u32)
}
|
||||
|
||||
pub async fn get_svg_size<R: AsyncRead + Unpin>(
|
||||
first_bytes: &[u8; 2],
|
||||
reader: &mut R,
|
||||
) -> Result<Option<IconSize>, Box<dyn Error>> {
|
||||
let size = RefCell::new(None);
|
||||
|
||||
let mut rewriter = HtmlRewriter::new(
|
||||
Settings {
|
||||
element_content_handlers: vec![
|
||||
// Rewrite insecure hyperlinks
|
||||
element!("svg", |el| {
|
||||
let viewbox = el.get_attribute("viewbox");
|
||||
|
||||
let width = el.get_attribute("width").and_then(parse_size);
|
||||
let height = el.get_attribute("height").and_then(parse_size);
|
||||
|
||||
*size.borrow_mut() = Some(if let (Some(width), Some(height)) = (width, height) {
|
||||
Some(IconSize::new(width, height))
|
||||
} else if let Some(viewbox) = viewbox {
|
||||
regex!(r"^\d+\s+\d+\s+(\d+\.?[\d]?)\s+(\d+\.?[\d]?)")
|
||||
.captures(&viewbox)
|
||||
.map(|captures| {
|
||||
let width = parse_size(captures.get(1).unwrap().as_str()).unwrap();
|
||||
let height = parse_size(captures.get(2).unwrap().as_str()).unwrap();
|
||||
IconSize::new(width, height)
|
||||
})
|
||||
} else {
|
||||
None
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}),
|
||||
],
|
||||
..Settings::default()
|
||||
},
|
||||
|_: &[u8]| {},
|
||||
);
|
||||
|
||||
rewriter.write(first_bytes)?;
|
||||
|
||||
let mut buffer = [0; 100];
|
||||
|
||||
loop {
|
||||
let n = reader.read(&mut buffer).await?;
|
||||
if n == 0 {
|
||||
return Err("invalid svg".into());
|
||||
}
|
||||
|
||||
rewriter.write(&buffer[..n])?;
|
||||
|
||||
if let Some(size) = *size.borrow() {
|
||||
return Ok(size);
|
||||
}
|
||||
}
|
||||
}
|
123
src/icon/mod.rs
Normal file
123
src/icon/mod.rs
Normal file
|
@ -0,0 +1,123 @@
|
|||
mod icon_info;
|
||||
mod icon_size;
|
||||
|
||||
pub use icon_info::*;
|
||||
pub use icon_size::*;
|
||||
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{DeserializeFromStr, SerializeDisplay};
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
collections::HashMap,
|
||||
convert::TryInto,
|
||||
error::Error,
|
||||
fmt::{self, Display},
|
||||
hash::{Hash, Hasher},
|
||||
str::FromStr,
|
||||
};
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, SerializeDisplay, DeserializeFromStr)]
|
||||
pub enum IconKind {
|
||||
AppIcon,
|
||||
SiteFavicon,
|
||||
SiteLogo,
|
||||
}
|
||||
|
||||
impl Display for IconKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
f.write_str(match self {
|
||||
IconKind::SiteLogo => "site_logo",
|
||||
IconKind::AppIcon => "app_icon",
|
||||
IconKind::SiteFavicon => "site_favicon",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for IconKind {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(kind: &str) -> Result<Self, Self::Err> {
|
||||
match kind {
|
||||
"site_logo" => Ok(IconKind::SiteLogo),
|
||||
"app_icon" => Ok(IconKind::AppIcon),
|
||||
"site_favicon" => Ok(IconKind::SiteFavicon),
|
||||
_ => Err("unknown icon kind!".into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct Icon {
|
||||
pub url: Url,
|
||||
pub headers: HashMap<String, String>,
|
||||
pub kind: IconKind,
|
||||
#[serde(flatten)]
|
||||
pub info: IconInfo,
|
||||
}
|
||||
|
||||
impl Hash for Icon {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
(
|
||||
&self.url,
|
||||
self
|
||||
.headers
|
||||
.iter()
|
||||
.sorted_by_key(|(key, _)| *key)
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
impl Icon {
|
||||
pub fn new(url: Url, kind: IconKind, info: IconInfo) -> Self {
|
||||
Icon::new_with_headers(url, HashMap::new(), kind, info)
|
||||
}
|
||||
|
||||
pub fn new_with_headers(
|
||||
url: Url,
|
||||
headers: HashMap<String, String>,
|
||||
kind: IconKind,
|
||||
info: IconInfo,
|
||||
) -> Self {
|
||||
Self {
|
||||
url,
|
||||
headers,
|
||||
kind,
|
||||
info,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn load(
|
||||
url: Url,
|
||||
kind: IconKind,
|
||||
sizes: Option<String>,
|
||||
) -> Result<Self, Box<dyn Error>> {
|
||||
Icon::load_with_headers(url, HashMap::new(), kind, sizes).await
|
||||
}
|
||||
|
||||
pub async fn load_with_headers(
|
||||
url: Url,
|
||||
headers: HashMap<String, String>,
|
||||
kind: IconKind,
|
||||
sizes: Option<String>,
|
||||
) -> Result<Self, Box<dyn Error>> {
|
||||
let info = IconInfo::load(url.clone(), (&headers).try_into().unwrap(), sizes).await?;
|
||||
|
||||
Ok(Icon::new_with_headers(url, headers, kind, info))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Icon {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.info.cmp(&other.info)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Icon {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
|
@ -1,84 +0,0 @@
|
|||
use super::IconSize;
|
||||
use futures::prelude::*;
|
||||
use lol_html::{element, errors::RewritingError, HtmlRewriter, Settings};
|
||||
use std::{
|
||||
error::Error,
|
||||
fmt::{self, Display},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
struct SizeResult(Option<IconSize>);
|
||||
|
||||
impl Display for SizeResult {
|
||||
fn fmt(&self, _: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SizeResult {}
|
||||
|
||||
fn parse_size<S: ToString>(size: S) -> Option<u32> {
|
||||
size
|
||||
.to_string()
|
||||
.parse::<f64>()
|
||||
.ok()
|
||||
.map(|size| size.round() as u32)
|
||||
}
|
||||
|
||||
pub async fn get_svg_size<R: AsyncRead + Unpin>(
|
||||
first_bytes: &[u8; 2],
|
||||
reader: &mut R,
|
||||
) -> Result<Option<IconSize>, Box<dyn Error>> {
|
||||
let mut rewriter = HtmlRewriter::new(
|
||||
Settings {
|
||||
element_content_handlers: vec![
|
||||
// Rewrite insecure hyperlinks
|
||||
element!("svg", |el| {
|
||||
let viewbox = el.get_attribute("viewbox");
|
||||
|
||||
let width = el.get_attribute("width").and_then(parse_size);
|
||||
let height = el.get_attribute("height").and_then(parse_size);
|
||||
|
||||
Err(Box::new(SizeResult(
|
||||
if let (Some(width), Some(height)) = (width, height) {
|
||||
Some(IconSize::new(width, height))
|
||||
} else if let Some(viewbox) = viewbox {
|
||||
regex!(r"^\d+\s+\d+\s+(\d+\.?[\d]?)\s+(\d+\.?[\d]?)")
|
||||
.captures(&viewbox)
|
||||
.map(|captures| {
|
||||
let width = parse_size(captures.get(1).unwrap().as_str()).unwrap();
|
||||
let height = parse_size(captures.get(2).unwrap().as_str()).unwrap();
|
||||
IconSize::new(width, height)
|
||||
})
|
||||
} else {
|
||||
None
|
||||
},
|
||||
)))
|
||||
}),
|
||||
],
|
||||
..Settings::default()
|
||||
},
|
||||
|_: &[u8]| {},
|
||||
);
|
||||
|
||||
rewriter.write(first_bytes)?;
|
||||
|
||||
let mut buffer = [0; 100];
|
||||
|
||||
loop {
|
||||
let n = reader.read(&mut buffer).await?;
|
||||
if n == 0 {
|
||||
return Err("invalid svg".into());
|
||||
}
|
||||
|
||||
match rewriter.write(&buffer[..n]) {
|
||||
Err(RewritingError::ContentHandlerError(result)) => {
|
||||
let result = result.downcast::<SizeResult>().unwrap();
|
||||
|
||||
return Ok(result.0);
|
||||
}
|
||||
|
||||
result => result?,
|
||||
}
|
||||
}
|
||||
}
|
491
src/icons.rs
491
src/icons.rs
|
@ -1,123 +1,38 @@
|
|||
use crate::{utils::encode_svg, Icon, IconInfo, IconKind, CLIENT};
|
||||
use future::join_all;
|
||||
use futures::StreamExt;
|
||||
use futures::{prelude::*, task::noop_waker};
|
||||
use html5ever::{
|
||||
driver,
|
||||
tendril::{Tendril, TendrilSink},
|
||||
};
|
||||
use crate::{html_parser, utils::push_url, Icon, IconKind, CLIENT};
|
||||
use flo_stream::{MessagePublisher, Publisher, StreamPublisher};
|
||||
use futures::future::{join_all, select_all};
|
||||
use futures::prelude::*;
|
||||
use futures::{join, StreamExt};
|
||||
use itertools::Itertools;
|
||||
use reqwest::{header::*, IntoUrl};
|
||||
use scraper::{ElementRef, Html};
|
||||
use serde::Deserialize;
|
||||
use std::convert::TryInto;
|
||||
use std::iter;
|
||||
use std::task::Poll;
|
||||
use std::{collections::HashMap, error::Error, pin::Pin, task::Context};
|
||||
use tldextract::TldOption;
|
||||
use std::error::Error;
|
||||
use url::Url;
|
||||
use vec1::Vec1;
|
||||
|
||||
pub struct Icons {
|
||||
pub struct SiteIcons {
|
||||
blacklist: Option<Box<dyn Fn(&Url) -> bool>>,
|
||||
entries: Vec<Icon>,
|
||||
pending_entries: HashMap<
|
||||
Url,
|
||||
(
|
||||
IconKind,
|
||||
HashMap<String, String>,
|
||||
Pin<Box<dyn Future<Output = Result<IconInfo, Box<dyn Error>>>>>,
|
||||
),
|
||||
>,
|
||||
}
|
||||
|
||||
fn add_icon_entry(
|
||||
entries: &mut Vec<Icon>,
|
||||
url: Url,
|
||||
headers: HashMap<String, String>,
|
||||
kind: IconKind,
|
||||
info: Result<IconInfo, Box<dyn Error>>,
|
||||
) {
|
||||
match info {
|
||||
Ok(info) => entries.push(Icon {
|
||||
url,
|
||||
headers,
|
||||
kind,
|
||||
info,
|
||||
}),
|
||||
Err(_) => warn_err!(info, "failed to parse icon"),
|
||||
}
|
||||
#[derive(Debug, Clone)]
|
||||
enum LoadedKind {
|
||||
DefaultManifest(Option<Vec1<Icon>>),
|
||||
HeadTags(Option<Vec1<Icon>>),
|
||||
DefaultFavicon(Option<Icon>),
|
||||
SiteLogo(Option<Icon>),
|
||||
}
|
||||
|
||||
impl Icons {
|
||||
impl SiteIcons {
|
||||
pub fn new() -> Self {
|
||||
Icons {
|
||||
blacklist: None,
|
||||
entries: Vec::new(),
|
||||
pending_entries: HashMap::new(),
|
||||
}
|
||||
SiteIcons { blacklist: None }
|
||||
}
|
||||
|
||||
pub fn new_with_blacklist(blacklist: impl Fn(&Url) -> bool + 'static) -> Self {
|
||||
Icons {
|
||||
SiteIcons {
|
||||
blacklist: Some(Box::new(blacklist)),
|
||||
entries: Vec::new(),
|
||||
pending_entries: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add an icon URL and start fetching it
|
||||
pub fn add_icon(&mut self, url: Url, kind: IconKind, sizes: Option<String>) {
|
||||
self.add_icon_with_headers(url, HashMap::new(), kind, sizes)
|
||||
}
|
||||
|
||||
/// Add an icon URL and start fetching it,
|
||||
/// along with the specified headers
|
||||
pub fn add_icon_with_headers(
|
||||
&mut self,
|
||||
url: Url,
|
||||
headers: HashMap<String, String>,
|
||||
kind: IconKind,
|
||||
sizes: Option<String>,
|
||||
) {
|
||||
// check to see if it already exists
|
||||
let mut entries = self.entries.iter_mut();
|
||||
if let Some(existing_kind) = self
|
||||
.pending_entries
|
||||
.get_mut(&url)
|
||||
.map(|(kind, _, _)| kind)
|
||||
.or_else(|| {
|
||||
entries.find_map(|icon| {
|
||||
if icon.url.eq(&url) {
|
||||
Some(&mut icon.kind)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
})
|
||||
{
|
||||
// if the kind is more important, replace it
|
||||
if &kind > existing_kind {
|
||||
*existing_kind = kind;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
let mut info = Box::pin(IconInfo::load(
|
||||
url.clone(),
|
||||
(&headers).try_into().unwrap(),
|
||||
sizes,
|
||||
));
|
||||
|
||||
// Start fetching the icon
|
||||
let noop_waker = noop_waker();
|
||||
let cx = &mut Context::from_waker(&noop_waker);
|
||||
match info.poll_unpin(cx) {
|
||||
Poll::Ready(info) => add_icon_entry(&mut self.entries, url, headers, kind, info),
|
||||
Poll::Pending => {
|
||||
self.pending_entries.insert(url, (kind, headers, info));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
pub fn is_blacklisted(&self, url: &Url) -> bool {
|
||||
if let Some(is_blacklisted) = &self.blacklist {
|
||||
is_blacklisted(url)
|
||||
|
@ -126,271 +41,163 @@ impl Icons {
|
|||
}
|
||||
}
|
||||
|
||||
pub async fn load_website<U: IntoUrl>(&mut self, url: U) -> Result<(), Box<dyn Error>> {
|
||||
pub async fn load_website<U: IntoUrl>(
|
||||
&mut self,
|
||||
url: U,
|
||||
best_matches_only: bool,
|
||||
) -> Result<Vec<Icon>, Box<dyn Error>> {
|
||||
let url = url.into_url()?;
|
||||
|
||||
let manifest_urls = vec![
|
||||
push_url(&url, "manifest.json"),
|
||||
push_url(&url, "manifest.webmanifest"),
|
||||
url.join("/manifest.json")?,
|
||||
url.join("/manifest.webmanifest")?,
|
||||
]
|
||||
.into_iter()
|
||||
.unique();
|
||||
|
||||
let favicon_urls = vec![push_url(&url, "favicon.ico"), url.join("/favicon.ico")?]
|
||||
.into_iter()
|
||||
.unique();
|
||||
|
||||
let html_response = async {
|
||||
let res = CLIENT
|
||||
.get(url)
|
||||
.get(url.clone())
|
||||
.header(ACCEPT, "text/html")
|
||||
.send()
|
||||
.await?
|
||||
.error_for_status()?;
|
||||
.await
|
||||
.ok()?
|
||||
.error_for_status()
|
||||
.ok()?;
|
||||
|
||||
let url = res.url().clone();
|
||||
|
||||
if self.is_blacklisted(&url) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut body = res.bytes_stream();
|
||||
|
||||
let mut parser = driver::parse_document(Html::new_document(), Default::default());
|
||||
while let Some(data) = body.next().await {
|
||||
if let Ok(data) = Tendril::try_from_byte_slice(&data?) {
|
||||
parser.process(data)
|
||||
}
|
||||
}
|
||||
let document = parser.finish();
|
||||
|
||||
{
|
||||
let mut found_favicon = false;
|
||||
|
||||
for elem_ref in document.select(selector!(
|
||||
"link[rel='icon']",
|
||||
"link[rel='shortcut icon']",
|
||||
"link[rel='apple-touch-icon']",
|
||||
"link[rel='apple-touch-icon-precomposed']"
|
||||
)) {
|
||||
let elem = elem_ref.value();
|
||||
if let Some(href) = elem.attr("href").and_then(|href| url.join(&href).ok()) {
|
||||
let rel = elem.attr("rel").unwrap();
|
||||
self.add_icon(
|
||||
href,
|
||||
if rel.contains("apple-touch-icon") {
|
||||
IconKind::AppIcon
|
||||
None
|
||||
} else {
|
||||
IconKind::SiteFavicon
|
||||
},
|
||||
elem.attr("sizes").map(|sizes| sizes.into()),
|
||||
);
|
||||
|
||||
found_favicon = true;
|
||||
};
|
||||
}
|
||||
|
||||
// Check for default favicon.ico
|
||||
if !found_favicon {
|
||||
self.add_icon(
|
||||
url.join("/favicon.ico").unwrap(),
|
||||
IconKind::SiteFavicon,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let mut logos: Vec<_> = document
|
||||
.select(selector!(
|
||||
"a[href='/'] img, a[href='/'] svg",
|
||||
"header img, header svg",
|
||||
"img[src*=logo]",
|
||||
"img[alt*=logo], svg[alt*=logo]",
|
||||
"*[class*=logo] img, *[class*=logo] svg",
|
||||
"*[id*=logo] img, *[id*=logo] svg",
|
||||
"img[class*=logo], svg[class*=logo]",
|
||||
"img[id*=logo], svg[id*=logo]",
|
||||
))
|
||||
.enumerate()
|
||||
.filter_map(|(i, elem_ref)| {
|
||||
let elem = elem_ref.value();
|
||||
let ancestors = elem_ref
|
||||
.ancestors()
|
||||
.map(ElementRef::wrap)
|
||||
.flatten()
|
||||
.map(|elem_ref| elem_ref.value())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let skip_classnames = regex!("menu|search");
|
||||
let should_skip = ancestors.iter().any(|ancestor| {
|
||||
ancestor
|
||||
.attr("class")
|
||||
.map(|attr| skip_classnames.is_match(&attr.to_lowercase()))
|
||||
.or_else(|| {
|
||||
ancestor
|
||||
.attr("id")
|
||||
.map(|attr| skip_classnames.is_match(&attr.to_lowercase()))
|
||||
})
|
||||
.unwrap_or(false)
|
||||
let body = res.bytes_stream().map(|res| {
|
||||
res
|
||||
.map(|bytes| bytes.to_vec())
|
||||
.map_err(|err| err.to_string())
|
||||
});
|
||||
|
||||
if should_skip {
|
||||
return None;
|
||||
let mut publisher = Publisher::new(128);
|
||||
let subscriber = publisher.subscribe();
|
||||
|
||||
Some((
|
||||
url,
|
||||
async move { StreamPublisher::new(&mut publisher, body).await }.shared(),
|
||||
subscriber,
|
||||
))
|
||||
}
|
||||
|
||||
let mut weight = 0;
|
||||
|
||||
// if in the header
|
||||
if ancestors.iter().any(|element| element.name() == "header") {
|
||||
weight += 2;
|
||||
}
|
||||
.shared();
|
||||
|
||||
if i == 0 {
|
||||
weight += 1;
|
||||
}
|
||||
let mut futures = vec![
|
||||
async {
|
||||
let html_response = html_response.clone().await;
|
||||
|
||||
let mentions = |attr_name, is_match: Box<dyn Fn(&str) -> bool>| {
|
||||
ancestors.iter().chain(iter::once(&elem)).any(|ancestor| {
|
||||
ancestor
|
||||
.attr(attr_name)
|
||||
.map(|attr| is_match(&attr.to_lowercase()))
|
||||
.unwrap_or(false)
|
||||
LoadedKind::HeadTags(match html_response {
|
||||
Some((url, _, body)) => html_parser::parse_head(&url, body)
|
||||
.await
|
||||
.ok()
|
||||
.and_then(|icons| icons.try_into().ok()),
|
||||
None => None,
|
||||
})
|
||||
};
|
||||
|
||||
if mentions("href", Box::new(|attr| attr == "/")) {
|
||||
weight += 5;
|
||||
};
|
||||
|
||||
let mentions_logo = |attr_name| {
|
||||
mentions(
|
||||
attr_name,
|
||||
Box::new(|attr| regex!("logo([^s]|$)").is_match(attr)),
|
||||
)
|
||||
};
|
||||
|
||||
if mentions_logo("class") || mentions_logo("id") {
|
||||
weight += 3;
|
||||
}
|
||||
if mentions_logo("alt") {
|
||||
weight += 2;
|
||||
}
|
||||
if mentions_logo("src") {
|
||||
weight += 1;
|
||||
}
|
||||
.boxed_local(),
|
||||
async {
|
||||
let html_response = html_response.clone().await;
|
||||
|
||||
if let Some(site_name) = url
|
||||
.domain()
|
||||
.and_then(|domain| TldOption::default().build().extract(domain).unwrap().domain)
|
||||
{
|
||||
// if the alt contains the site_name then highest priority
|
||||
if site_name
|
||||
.to_lowercase()
|
||||
.split('-')
|
||||
.any(|segment| mentions("alt", Box::new(move |attr| attr.contains(segment))))
|
||||
{
|
||||
weight += 10;
|
||||
}
|
||||
}
|
||||
LoadedKind::SiteLogo(match html_response {
|
||||
Some((url, complete, body)) => {
|
||||
let (icons, _) = join!(
|
||||
html_parser::parse_site_logo(&url, body, |url| self.is_blacklisted(url)),
|
||||
complete
|
||||
);
|
||||
|
||||
let href = if elem.name() == "svg" {
|
||||
Some(Url::parse(&encode_svg(&elem_ref.html())).unwrap())
|
||||
} else {
|
||||
elem.attr("src").and_then(|href| url.join(&href).ok())
|
||||
};
|
||||
|
||||
if let Some(href) = &href {
|
||||
if self.is_blacklisted(href) {
|
||||
return None;
|
||||
icons.ok()
|
||||
}
|
||||
}
|
||||
|
||||
href.map(|href| (href, elem_ref, weight))
|
||||
None => None,
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
.boxed_local(),
|
||||
async {
|
||||
let manifests = join_all(manifest_urls.map(|url| SiteIcons::load_manifest(url))).await;
|
||||
|
||||
logos.sort_by(|(_, _, a_weight), (_, _, b_weight)| b_weight.cmp(a_weight));
|
||||
|
||||
// prefer <img> over svg
|
||||
let mut prev_weight = None;
|
||||
for (href, elem_ref, weight) in &logos {
|
||||
if let Some(prev_weight) = prev_weight {
|
||||
if weight != prev_weight {
|
||||
break;
|
||||
}
|
||||
}
|
||||
prev_weight = Some(weight);
|
||||
|
||||
if elem_ref.value().name() == "img" {
|
||||
self.add_icon(href.clone(), IconKind::SiteLogo, None);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((href, _, _)) = logos.into_iter().next() {
|
||||
self.add_icon(href, IconKind::SiteLogo, None);
|
||||
}
|
||||
}
|
||||
|
||||
for elem_ref in document.select(selector!("link[rel='manifest']")) {
|
||||
if let Some(href) = elem_ref
|
||||
.value()
|
||||
.attr("href")
|
||||
.and_then(|href| url.join(&href).ok())
|
||||
{
|
||||
warn_err!(self.load_manifest(href).await, "failed to fetch manifest");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn load_manifest(&mut self, manifest_url: Url) -> Result<(), Box<dyn Error>> {
|
||||
#[derive(Deserialize)]
|
||||
struct ManifestIcon {
|
||||
src: String,
|
||||
sizes: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Manifest {
|
||||
icons: Option<Vec<ManifestIcon>>,
|
||||
}
|
||||
|
||||
let manifest: Manifest = CLIENT
|
||||
.get(manifest_url.as_str())
|
||||
.send()
|
||||
.await?
|
||||
.json()
|
||||
.await?;
|
||||
|
||||
if let Some(icons) = manifest.icons {
|
||||
for icon in icons {
|
||||
if let Ok(src) = manifest_url.join(&icon.src) {
|
||||
let _ = self.add_icon(src, IconKind::AppIcon, icon.sizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetch all the icons. Ordered from highest to lowest resolution
|
||||
///
|
||||
/// ```
|
||||
/// async fn run() {
|
||||
/// let mut icons = site_icons::Icons::new();
|
||||
/// icons.load_website("https://github.com").await.unwrap();
|
||||
///
|
||||
/// let entries = icons.entries().await;
|
||||
/// for icon in entries {
|
||||
/// println!("{:?}", icon)
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
pub async fn entries(mut self) -> Vec<Icon> {
|
||||
let (urls, infos): (Vec<_>, Vec<_>) = self
|
||||
.pending_entries
|
||||
LoadedKind::DefaultManifest(
|
||||
manifests
|
||||
.into_iter()
|
||||
.map(|(url, (kind, headers, info))| ((url, headers, kind), info))
|
||||
.unzip();
|
||||
.find_map(|manifest| manifest.ok().and_then(|icons| icons.try_into().ok())),
|
||||
)
|
||||
}
|
||||
.boxed_local(),
|
||||
async {
|
||||
let favicons =
|
||||
join_all(favicon_urls.map(|url| Icon::load(url.clone(), IconKind::SiteFavicon, None)))
|
||||
.await;
|
||||
|
||||
let mut urls = urls.into_iter();
|
||||
LoadedKind::DefaultFavicon(favicons.into_iter().find_map(|favicon| favicon.ok()))
|
||||
}
|
||||
.boxed_local(),
|
||||
];
|
||||
|
||||
for info in join_all(infos).await {
|
||||
let (url, headers, kind) = urls.next().unwrap();
|
||||
add_icon_entry(&mut self.entries, url, headers, kind, info);
|
||||
let mut icons: Vec<Icon> = Vec::new();
|
||||
let mut found_best_match = false;
|
||||
let mut previous_loads = Vec::new();
|
||||
|
||||
while !futures.is_empty() {
|
||||
let (loaded, index, _) = select_all(&mut futures).await;
|
||||
futures.remove(index);
|
||||
|
||||
match loaded.clone() {
|
||||
LoadedKind::DefaultManifest(manifest_icons) => {
|
||||
if let Some(manifest_icons) = manifest_icons {
|
||||
icons.extend(manifest_icons);
|
||||
found_best_match = true;
|
||||
}
|
||||
}
|
||||
LoadedKind::DefaultFavicon(favicon) => {
|
||||
if let Some(favicon) = favicon {
|
||||
icons.push(favicon);
|
||||
|
||||
if previous_loads
|
||||
.iter()
|
||||
.any(|kind| matches!(kind, LoadedKind::HeadTags(_)))
|
||||
{
|
||||
found_best_match = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
LoadedKind::HeadTags(head_icons) => {
|
||||
if let Some(head_icons) = head_icons {
|
||||
icons.extend(head_icons);
|
||||
found_best_match = true;
|
||||
} else if previous_loads
|
||||
.iter()
|
||||
.any(|kind| matches!(kind, LoadedKind::DefaultFavicon(Some(_))))
|
||||
{
|
||||
found_best_match = true;
|
||||
}
|
||||
}
|
||||
LoadedKind::SiteLogo(logo) => {
|
||||
if let Some(logo) = logo {
|
||||
icons.push(logo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.entries.sort();
|
||||
previous_loads.push(loaded);
|
||||
|
||||
self.entries
|
||||
icons.sort();
|
||||
icons = icons.into_iter().unique().collect();
|
||||
|
||||
if best_matches_only && found_best_match {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(icons)
|
||||
}
|
||||
}
|
||||
|
|
45
src/lib.rs
45
src/lib.rs
|
@ -1,3 +1,4 @@
|
|||
#![feature(async_closure)]
|
||||
//! # site_icons
|
||||
//! An efficient website icon scraper.
|
||||
//!
|
||||
|
@ -26,15 +27,13 @@ extern crate serde_with;
|
|||
extern crate log;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
mod icon;
|
||||
mod icon_info;
|
||||
mod icon_size;
|
||||
mod icons;
|
||||
mod utils;
|
||||
mod html_parser;
|
||||
mod icon;
|
||||
mod icons;
|
||||
mod manifest;
|
||||
|
||||
pub use icon::*;
|
||||
pub use icon_info::*;
|
||||
pub use icons::*;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
|
@ -49,23 +48,23 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
|
|||
Client::builder().default_headers(headers).build().unwrap()
|
||||
});
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
#[tokio::test]
|
||||
async fn test_icons() {
|
||||
let mut icons = Icons::new();
|
||||
// scrape the icons from a url
|
||||
icons.load_website("https://github.com").await.unwrap();
|
||||
// #[cfg(test)]
|
||||
// mod tests {
|
||||
// use super::*;
|
||||
// #[tokio::test]
|
||||
// async fn test_icons() {
|
||||
// let mut icons = SiteIcons::new();
|
||||
// // scrape the icons from a url
|
||||
// icons.load_website("https://github.com").await.unwrap();
|
||||
|
||||
// fetch all icons, ensuring they exist & determining size
|
||||
let entries = icons.entries().await;
|
||||
// // fetch all icons, ensuring they exist & determining size
|
||||
// let entries = icons.entries().await;
|
||||
|
||||
// entries are sorted from highest to lowest resolution
|
||||
for icon in &entries {
|
||||
println!("{:?}", icon)
|
||||
}
|
||||
// // entries are sorted from highest to lowest resolution
|
||||
// for icon in &entries {
|
||||
// println!("{:?}", icon)
|
||||
// }
|
||||
|
||||
assert_eq!(entries.len() > 0, true);
|
||||
}
|
||||
}
|
||||
// assert_eq!(entries.len() > 0, true);
|
||||
// }
|
||||
// }
|
||||
|
|
54
src/manifest.rs
Normal file
54
src/manifest.rs
Normal file
|
@ -0,0 +1,54 @@
|
|||
use crate::{Icon, IconKind, SiteIcons, CLIENT};
|
||||
use cached::proc_macro::cached;
|
||||
use futures::future::join_all;
|
||||
use reqwest::IntoUrl;
|
||||
use serde::Deserialize;
|
||||
use std::error::Error;
|
||||
use url::Url;
|
||||
|
||||
/// One entry of the `icons` array of a deserialized manifest.
#[derive(Debug, Deserialize)]
|
||||
struct ManifestIcon {
|
||||
/// Icon location; joined against the manifest URL before loading.
src: String,
|
||||
/// Optional `sizes` value, passed through unchanged to `Icon::load`.
sizes: Option<String>,
|
||||
}
|
||||
|
||||
/// The subset of a manifest document that this crate reads.
#[derive(Debug, Deserialize)]
|
||||
struct Manifest {
|
||||
/// Icons declared by the manifest. The field is not optional, so a
/// manifest without an `icons` key fails to deserialize.
icons: Vec<ManifestIcon>,
|
||||
}
|
||||
|
||||
impl SiteIcons {
|
||||
pub async fn load_manifest<U: IntoUrl>(url: U) -> Result<Vec<Icon>, Box<dyn Error>> {
|
||||
let url = url.into_url()?;
|
||||
|
||||
Ok(load_manifest_cached(url).await?)
|
||||
}
|
||||
}
|
||||
|
||||
#[cached(sync_writes = true)]
|
||||
async fn load_manifest_cached(url: Url) -> Result<Vec<Icon>, String> {
|
||||
let url = &url;
|
||||
|
||||
let manifest: Manifest = CLIENT
|
||||
.get(url.clone())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("{}: {:?}", url, e))?
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| format!("{}: {:?}", url, e))?;
|
||||
|
||||
Ok(
|
||||
join_all(manifest.icons.into_iter().map(async move |icon| {
|
||||
if let Ok(src) = url.join(&icon.src) {
|
||||
Icon::load(src, IconKind::AppIcon, icon.sizes).await.ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}))
|
||||
.await
|
||||
.into_iter()
|
||||
.filter_map(|icon| icon)
|
||||
.collect(),
|
||||
)
|
||||
}
|
43
src/utils/background_poll.rs
Normal file
43
src/utils/background_poll.rs
Normal file
|
@ -0,0 +1,43 @@
|
|||
use std::{
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
};
|
||||
|
||||
use futures::Future;
|
||||
|
||||
/// Drive `future` to completion while opportunistically making progress on
/// `background_future`.
///
/// On every poll, `future` is polled first. Only when it is still pending is
/// `background_future` polled as well, using the same task context. The
/// output of `background_future` is discarded.
///
/// Once `background_future` has completed it is dropped and never polled
/// again: the `Future::poll` contract allows a future to panic when polled
/// after returning `Poll::Ready`, and async blocks do exactly that.
///
/// Returns the output of `future`. `background_future` may or may not have
/// finished by then.
pub async fn poll_in_background<F, B, FO, BO>(future: F, background_future: B) -> FO
where
    F: Future<Output = FO> + Unpin,
    B: Future<Output = BO> + Unpin,
{
    struct BackgroundPoller<F, B> {
        future: F,
        // `None` once the background future has run to completion.
        background_future: Option<B>,
    }

    impl<F, B> Future for BackgroundPoller<F, B>
    where
        F: Future + Unpin,
        B: Future + Unpin,
    {
        type Output = F::Output;

        fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
            let this = self.get_mut();

            let result = Pin::new(&mut this.future).poll(cx);

            if result.is_pending() {
                let background_done = match this.background_future.as_mut() {
                    Some(background) => Pin::new(background).poll(cx).is_ready(),
                    None => false,
                };

                // Never poll a completed future again.
                if background_done {
                    this.background_future = None;
                }
            }

            result
        }
    }

    BackgroundPoller {
        future,
        background_future: Some(background_future),
    }
    .await
}
|
|
@ -1,11 +1,11 @@
|
|||
macro_rules! selector {
|
||||
($($selector:expr),+ $(,)?) => {{
|
||||
static RE: once_cell::sync::OnceCell<scraper::Selector> = once_cell::sync::OnceCell::new();
|
||||
RE.get_or_init(|| scraper::Selector::parse(join!(",", $($selector),+)).unwrap())
|
||||
RE.get_or_init(|| scraper::Selector::parse(join_with!(",", $($selector),+)).unwrap())
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! join {
|
||||
macro_rules! join_with {
|
||||
($pattern:literal,$first:expr$(, $($rest:expr),*)? $(,)?) => {
|
||||
concat!($first$(, $($pattern, $rest),*)?)
|
||||
};
|
||||
|
@ -18,14 +18,6 @@ macro_rules! regex {
|
|||
}};
|
||||
}
|
||||
|
||||
macro_rules! warn_err {
|
||||
($result:expr, $($arg:tt)*) => {{
|
||||
if let Err(err) = $result {
|
||||
warn!("{} {}", format!($($arg)*), err);
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! assert_slice_eq {
|
||||
($cur:expr, $offset:expr, $slice:expr, $($arg:tt)+) => {{
|
||||
if !super::slice_eq($cur, $offset, $slice)? {
|
16
src/utils/mod.rs
Normal file
16
src/utils/mod.rs
Normal file
|
@ -0,0 +1,16 @@
|
|||
#[macro_use]
|
||||
mod macros;
|
||||
mod background_poll;
|
||||
mod svg_encoder;
|
||||
|
||||
pub use background_poll::*;
|
||||
pub use macros::*;
|
||||
pub use svg_encoder::*;
|
||||
|
||||
use url::Url;
|
||||
|
||||
/// Return a copy of `url` with `segment` appended as its last path segment.
///
/// A trailing empty segment (i.e. a trailing slash) is removed before
/// pushing, so `https://host/dir/` + `favicon.ico` becomes
/// `https://host/dir/favicon.ico` instead of `https://host/dir//favicon.ico`.
/// A bare root path (`/`) is unaffected.
///
/// # Panics
/// Panics if `url` is a cannot-be-a-base URL (for example `data:` or
/// `mailto:`), because such URLs have no path segments to extend.
pub fn push_url(url: &Url, segment: &str) -> Url {
    let mut url = url.clone();

    url.path_segments_mut()
        .expect("push_url requires a URL that can be a base")
        .pop_if_empty()
        .push(segment);

    url
}
|
Reference in a new issue