This commit is contained in:
CenTdemeern1 2025-01-31 23:52:51 +01:00
commit ed0db998d9
9 changed files with 2363 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

2026
Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

16
Cargo.toml Normal file
View file

@ -0,0 +1,16 @@
[package]
name = "favicon-scraper"
version = "0.1.0"
edition = "2021"
[dependencies]
futures = { version = "0.3.31", default-features = false, features = ["std"] }
imagesize = "0.13.0"
reqwest = { version = "0.12.12", features = ["json"] }
scraper = "0.22.0"
serde = { version = "1.0.217", features = ["derive"] }
url = "2.5.4"
[dev-dependencies]
tokio = { version = "1.43.0", features = ["rt", "macros"] }
tokio-test = "0.4.4"

5
README.md Normal file
View file

@ -0,0 +1,5 @@
# favicon-scraper
Scrapes favicons from websites.
It does not aim for 100% optimal performance; it just needs to work.

54
src/error.rs Normal file
View file

@ -0,0 +1,54 @@
use std::error::Error as StdError;
use std::fmt::{Display, Formatter};
/// Generates the crate's `Error` enum together with its trait implementations.
///
/// The invocation takes two comma-separated lists separated by a `;`:
/// first the wrapping variants, written as `Variant(InnerType)`, then the
/// payload-free variants, written as bare identifiers.
///
/// The expansion contains:
/// - `pub enum Error` (deriving `Debug`) with every listed variant,
/// - one `From<InnerType> for Error` impl per wrapping variant,
/// - a `Display` impl that prints the variant's name for payload-free
///   variants and forwards to the inner value's `Display` otherwise,
/// - an empty `std::error::Error` impl.
macro_rules! impl_error {
    ($($wrapper:ident ($inner:ty)),*; $($unit:ident),*) => {
        #[derive(Debug)]
        pub enum Error {
            $($unit,)*
            $($wrapper($inner),)*
        }

        $(
            impl From<$inner> for Error {
                fn from(inner: $inner) -> Self {
                    Self::$wrapper(inner)
                }
            }
        )*

        impl Display for Error {
            fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
                match self {
                    // Payload-free variants are displayed as their own name
                    $(Self::$unit => f.write_str(stringify!($unit)),)*
                    // Wrapping variants defer to whatever they wrap
                    $(Self::$wrapper(inner) => Display::fmt(inner, f),)*
                }
            }
        }

        impl StdError for Error {}
    };
}
// Instantiates the crate-wide `Error` type.
// Variants before the `;` wrap another error type (and get a `From` impl for it);
// variants after the `;` carry no payload.
impl_error!(
Reqwest(reqwest::Error);
UnsupportedURLScheme,
UnsupportedImageFormat
);
#[cfg(test)]
mod tests {
    use super::*;

    /// The thread-safe trait object that `Error` has to be convertible into.
    type ThreadSafeError = Box<dyn StdError + Send + Sync>;

    /// Pretends to produce a boxed `Error`, but panics instead of building one.
    fn boxed_error() -> Box<Error> {
        panic!("Success")
    }

    /// Compile-time check that `Error` is `Send + Sync`.
    ///
    /// Only the type check matters here: at runtime the test always panics
    /// (hence `should_panic`), so no real `Error` value is ever needed.
    /// `Error` must be `Send + Sync` because `site_icons` not being `Send` was a pain.
    #[test]
    #[should_panic]
    fn is_send_and_sync() {
        // The compiler rejects this coercion unless `Error: Send + Sync`
        let _: ThreadSafeError = boxed_error();
    }
}

62
src/html.rs Normal file
View file

@ -0,0 +1,62 @@
use crate::{
icon::{Icon, IconKind},
Error,
};
use futures::future::join_all;
use reqwest::{Client, IntoUrl};
use scraper::{Html as SHTML, Selector};
use url::Url;
/// CSS selector for `<link>` elements whose `rel` attribute contains the word
/// `icon`, `apple-touch-icon` or `apple-touch-icon-precomposed`.
const ICON_SELECTOR: &str =
"link[rel~='icon'], link[rel~='apple-touch-icon'], link[rel~='apple-touch-icon-precomposed']";
/// CSS selector for `<link>` elements whose `rel` attribute contains the word `manifest`.
const MANIFEST_SELECTOR: &str = "link[rel~='manifest']";
/// What was found by scanning a single HTML page.
pub struct HTML {
/// Icons linked from the page that could be fetched and whose size could be determined.
pub icons: Vec<Icon>,
/// URL of the first linked manifest, if the page links one.
pub manifest: Option<Url>,
}
impl HTML {
    /// Yields the `href` of every element matched by `selector`, resolved against `url`.
    ///
    /// Elements without an `href` attribute, and `href` values that can't be
    /// joined onto `url`, are skipped.
    fn get_urls_from_html<'s, 'h, 'u>(
        selector: &'s Selector,
        html: &'h SHTML,
        url: &'u Url,
    ) -> impl Iterator<Item = Url> + use<'s, 'h, 'u> {
        html.select(selector)
            .filter_map(move |element| url.join(element.attr("href")?).ok())
    }

    /// Fetches `url`, parses the response as HTML and collects the icons and manifest it links.
    ///
    /// Every linked icon is requested through `client` to determine its size;
    /// icons for which that fails are left out of the result.
    /// Only the first linked manifest is reported, and it is not fetched here.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Reqwest`] if the page can't be requested or its body can't be read.
    pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
        let response = client.get(url).send().await?;
        let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
        let text = response.text().await?;

        // Everything touching the parsed document lives in this block so that it is
        // dropped before the `.await` below. scraper's `Html` is, as far as I know,
        // only `Send` with its `atomic` feature (which this crate doesn't enable), so
        // holding it across an await point would make this future `!Send`.
        let (manifest, pending) = {
            let html = SHTML::parse_document(&text);
            let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
            let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
            let manifest = Self::get_urls_from_html(&manifest_selector, &html, &url).next();
            let pending: Vec<_> = Self::get_urls_from_html(&icon_selector, &html, &url)
                .map(|icon_url| Icon::from_url(client, icon_url, IconKind::LinkedInHTML))
                .collect();
            (manifest, pending)
        };

        // Probe all icons concurrently, keeping only the ones that worked out
        let icons = join_all(pending).await.into_iter().flatten().collect();
        Ok(Self { icons, manifest })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `scan_html` unwraps the parsed selectors, so both constants have to be valid.
    #[test]
    fn selectors_must_parse() {
        let cases = [
            (ICON_SELECTOR, "Icon selector didn't parse"),
            (MANIFEST_SELECTOR, "Manifest selector didn't parse"),
        ];
        for (selector, failure) in cases {
            Selector::parse(selector).expect(failure);
        }
    }
}

71
src/icon.rs Normal file
View file

@ -0,0 +1,71 @@
use imagesize::ImageError;
pub use imagesize::ImageSize;
use reqwest::{Client, IntoUrl, Response};
use url::Url;
use crate::Error;
/// How an [`Icon`] was discovered.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IconKind {
/// Found by probing a fixed, conventional path such as `/favicon.ico`.
HardcodedURL,
/// Found through a `<link>` element in the HTML page.
LinkedInHTML,
/// Found in the `icons` list of a manifest.
LinkedInManifest,
}
/// A single icon whose size could be determined.
#[derive(Debug, Clone)]
pub struct Icon {
/// How this icon was discovered.
pub kind: IconKind,
/// The URL the icon was served from, taken from the response (so after any redirects).
pub url: Url,
/// The icon's dimensions as reported by `imagesize`.
pub size: ImageSize,
}
impl Icon {
/// This is a separate function because you can't break with a value
/// from `while let` loops (which is understandable)
async fn find_size(mut response: Response) -> Result<ImageSize, Error> {
let mut buffer = vec![];
while let Some(chunk) = response.chunk().await? {
buffer.extend_from_slice(&chunk);
match imagesize::blob_size(&buffer) {
Ok(size) => return Ok(size),
Err(ImageError::IoError(_)) => continue,
Err(_) => return Err(Error::UnsupportedImageFormat),
}
}
Err(Error::UnsupportedImageFormat)
}
/// Create an `Icon` from a URL by fetching it partially using the given client to get its size.
/// This used to be public but I don't want random users to deal with `kind`
pub(crate) async fn from_url(
client: &Client,
url: impl IntoUrl,
kind: IconKind,
) -> Result<Self, Error> {
let response = client.get(url).send().await?;
let url = response.url().to_owned();
let size = Icon::find_size(response).await?;
Ok(Icon { kind, url, size })
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fetches a well-known favicon over the network and prints its size.
    #[tokio::test]
    async fn test_google() {
        let client = Client::new();
        let favicon = "https://google.com/favicon.ico";
        let ImageSize { width, height } = Icon::from_url(&client, favicon, IconKind::HardcodedURL)
            .await
            .unwrap()
            .size;
        println!("The size of Google's favicon is {width}x{height} pixels.");
    }
}

95
src/lib.rs Normal file
View file

@ -0,0 +1,95 @@
//! # favicon-scraper
//!
//! A simple crate to scrape favicons asynchronously that's intended to *just work*
//!
//! To get started, have a look at [`scrape`]!
pub mod error;
pub mod html;
pub mod icon;
pub mod manifest;
pub use error::Error;
use futures::future::{join, join_all};
use html::HTML;
use icon::{Icon, IconKind};
use manifest::scan_manifest;
use reqwest::{Client, IntoUrl};
use url::Url;
/// Perform scraping.
///
/// The URL scheme **must** be either `http` or `https`.
///
/// This will load the given URL, parse the returned HTML, and if found, also load and parse any linked manifests.
/// Independently of the HTML, `/favicon.ico`, `/favicon.svg` and `/favicon.png` are tried as well.
///
/// Any found icons will be partially loaded to get their size.
/// ICO files will be interpreted as their largest size as per [`imagesize`'s README](https://github.com/Roughsketch/imagesize/blob/017b33da886a27484614e9527d14fc5f3f0d5079/README.md?plain=1#L41).
///
/// # Errors
///
/// Returns [`Error::Reqwest`] if `url` can't be converted into a URL and
/// [`Error::UnsupportedURLScheme`] if its scheme is neither `http` nor `https`.
///
/// Failures while loading the page, a manifest or an individual icon are *not*
/// reported; whatever couldn't be loaded is simply missing from the result.
///
/// # Examples
///
/// ```
/// # tokio_test::block_on(async {
/// use favicon_scraper::{scrape, Error};
///
/// let icons = scrape("https://kitsunes.dev").await.unwrap();
///
/// // Only HTTP(S) is supported
/// assert!(matches!(
///     scrape("ftp://example.com").await,
///     Err(Error::UnsupportedURLScheme)
/// ));
/// # })
/// ```
pub async fn scrape(url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
    let url = url.into_url()?;
    if !matches!(url.scheme(), "http" | "https") {
        return Err(Error::UnsupportedURLScheme);
    }

    let client = Client::new();

    // Probe the conventional locations and load the page itself, all at the same time
    let hardcoded_urls = join_all([
        try_hardcoded_path(&client, &url, "/favicon.ico"),
        try_hardcoded_path(&client, &url, "/favicon.svg"),
        try_hardcoded_path(&client, &url, "/favicon.png"),
    ]);
    let html = HTML::scan_html(&client, url.clone());
    let (hardcoded_urls, html) = join(hardcoded_urls, html).await;

    let mut icons: Vec<Icon> = hardcoded_urls.into_iter().flatten().collect();

    // A page that failed to load is not fatal: the hardcoded paths may still have worked
    if let Ok(html) = html {
        icons.extend(html.icons);

        if let Some(manifest) = html.manifest {
            // Same for the manifest, it only ever adds icons
            if let Ok(manifest_icons) = scan_manifest(&client, manifest).await {
                icons.extend(manifest_icons);
            }
        }
    }

    Ok(icons)
}
/// Tries to load an icon from `path`, resolved against `url`.
///
/// Returns `None` if `path` can't be joined onto `url`, or if the icon
/// can't be fetched or its size can't be determined.
async fn try_hardcoded_path(client: &Client, url: &Url, path: &'static str) -> Option<Icon> {
    // Joining shouldn't fail for the http(s) URLs `scrape` lets through,
    // but there's no reason to panic in a function that can just say `None`
    let url = url.join(path).ok()?;
    Icon::from_url(client, url, IconKind::HardcodedURL)
        .await
        .ok()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Scrapes a real site over the network and prints what was found.
    /// This particular site is used because site_icons failed on it for some reason.
    #[tokio::test]
    async fn test_catwithaclarinet() {
        let found = scrape("https://ck.catwithaclari.net").await.unwrap();
        println!("Found {} icons:\n", found.len());
        for Icon { kind, url, size } in found {
            println!("URL: {}", url);
            println!("Size: {}x{} pixels", size.width, size.height);
            println!("Kind of icon: {:?}\n", kind);
        }
    }
}

33
src/manifest.rs Normal file
View file

@ -0,0 +1,33 @@
use futures::future::join_all;
use reqwest::{Client, IntoUrl};
use serde::Deserialize;
use crate::{
icon::{Icon, IconKind},
Error,
};
/// The part of a manifest's JSON that this crate reads: its list of icons.
#[derive(Deserialize)]
struct Manifest {
/// The entries of the manifest's `icons` member.
icons: Vec<ManifestIcon>,
}
/// One entry of a manifest's `icons` list.
#[derive(Deserialize)]
struct ManifestIcon {
/// Where the icon can be found, exactly as written in the manifest.
src: String,
// Not gonna trust or parse the sizes
}
/// Fetches the manifest at `url` and loads every icon it lists.
///
/// Each icon's `src` is resolved against the manifest's own URL (the final one,
/// after redirects), so relative values like `/icon.png` work just as well as
/// absolute URLs. Icons whose `src` can't be resolved, or that fail to load,
/// are left out of the result.
///
/// # Errors
///
/// Returns [`Error::Reqwest`] if the manifest can't be requested or isn't the expected JSON.
pub async fn scan_manifest(client: &Client, url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
    let response = client.get(url).send().await?;
    // Grab this before `json()` consumes the response
    let manifest_url = response.url().to_owned();
    let manifest: Manifest = response.json().await?;

    let pending = manifest
        .icons
        .into_iter()
        // `src` is usually relative, and a bare relative string is not a valid URL by itself
        .filter_map(|icon| manifest_url.join(&icon.src).ok())
        .map(|icon_url| Icon::from_url(client, icon_url, IconKind::LinkedInManifest));

    Ok(join_all(pending).await.into_iter().flatten().collect())
}