It works
This commit is contained in:
commit
ed0db998d9
9 changed files with 2363 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
/target
|
2026
Cargo.lock
generated
Normal file
2026
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
16
Cargo.toml
Normal file
16
Cargo.toml
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
[package]
|
||||||
|
name = "favicon-scraper"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
futures = { version = "0.3.31", default-features = false, features = ["std"] }
|
||||||
|
imagesize = "0.13.0"
|
||||||
|
reqwest = { version = "0.12.12", features = ["json"] }
|
||||||
|
scraper = "0.22.0"
|
||||||
|
serde = { version = "1.0.217", features = ["derive"] }
|
||||||
|
url = "2.5.4"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
tokio = { version = "1.43.0", features = ["rt", "macros"] }
|
||||||
|
tokio-test = "0.4.4"
|
5
README.md
Normal file
5
README.md
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
# favicon-scraper
|
||||||
|
|
||||||
|
Scrapes favicons from websites.
|
||||||
|
|
||||||
|
It does not aim for 100% optimal performance; it just needs to work.
|
54
src/error.rs
Normal file
54
src/error.rs
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
use std::error::Error as StdError;
|
||||||
|
use std::fmt::{Display, Formatter};
|
||||||
|
|
||||||
|
/// Generates the `Error` enum together with its trait implementations.
///
/// Syntax: `impl_error!(Wrapper(InnerType), ...; UnitVariant, ...)`.
///
/// * Every `Wrapper(InnerType)` becomes a tuple variant holding an `InnerType`,
///   gets a `From<InnerType>` impl (so `?` converts automatically) and is
///   displayed by forwarding to the inner value's `Display`.
/// * Every `UnitVariant` becomes a field-less variant that is displayed as its
///   own name.
///
/// The generated enum also derives `Debug` and implements `std::error::Error`.
macro_rules! impl_error {
    ($($variant: ident ($inner: ty)),*; $($unit: ident),*) => {
        #[derive(Debug)]
        pub enum Error {
            $($unit,)*
            $($variant($inner),)*
        }

        $(
            impl From<$inner> for Error {
                fn from(value: $inner) -> Self {
                    Self::$variant(value)
                }
            }
        )*

        impl Display for Error {
            fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
                match self {
                    // Unit variants print their own identifier.
                    $(Self::$unit => f.write_str(stringify!($unit)),)*
                    // Wrapping variants defer to the wrapped error's message.
                    $(Self::$variant(wrapped) => Display::fmt(wrapped, f),)*
                }
            }
        }

        impl StdError for Error {}
    };
}
|
||||||
|
|
||||||
|
impl_error!(
|
||||||
|
Reqwest(reqwest::Error);
|
||||||
|
UnsupportedURLScheme,
|
||||||
|
UnsupportedImageFormat
|
||||||
|
);
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    type SendBox = Box<dyn StdError + Send + Sync>;

    /// Compile-time check that `Error` is `Send + Sync`.
    ///
    /// The interesting part is the coercion into `SendBox`: it only type-checks
    /// when `Error` is `Send + Sync`. No real `Error` value is ever built; the
    /// producer below panics instead, which is why the test is `should_panic`.
    ///
    /// `Error` must be `Send + Sync` because `site_icons` not being `Send` was
    /// too painful to work with.
    #[test]
    #[should_panic]
    fn is_send_and_sync() {
        fn never_returns() -> Box<Error> {
            panic!("Success")
        }

        // If this fails to compile, Error isn't Send + Sync
        let _: SendBox = never_returns();
    }
}
|
62
src/html.rs
Normal file
62
src/html.rs
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
use crate::{
|
||||||
|
icon::{Icon, IconKind},
|
||||||
|
Error,
|
||||||
|
};
|
||||||
|
use futures::future::join_all;
|
||||||
|
use reqwest::{Client, IntoUrl};
|
||||||
|
use scraper::{Html as SHTML, Selector};
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
/// CSS selector matching `<link>` elements whose `rel` contains `icon`,
/// `apple-touch-icon` or `apple-touch-icon-precomposed`.
const ICON_SELECTOR: &str =
    "link[rel~='icon'], link[rel~='apple-touch-icon'], link[rel~='apple-touch-icon-precomposed']";

/// CSS selector matching `<link>` elements whose `rel` contains `manifest`.
const MANIFEST_SELECTOR: &str = "link[rel~='manifest']";
|
||||||
|
|
||||||
|
pub struct HTML {
|
||||||
|
pub icons: Vec<Icon>,
|
||||||
|
pub manifest: Option<Url>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HTML {
|
||||||
|
fn get_urls_from_html<'s, 'h, 'u>(
|
||||||
|
selector: &'s Selector,
|
||||||
|
html: &'h SHTML,
|
||||||
|
url: &'u Url,
|
||||||
|
) -> impl Iterator<Item = Url> + use<'s, 'h, 'u> {
|
||||||
|
html.select(selector)
|
||||||
|
.filter_map(|e| e.attr("href"))
|
||||||
|
.filter_map(|u| url.join(u).ok())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
|
||||||
|
let response = client.get(url).send().await?;
|
||||||
|
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
|
||||||
|
let text = response.text().await?;
|
||||||
|
let html = SHTML::parse_document(&text);
|
||||||
|
|
||||||
|
let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
|
||||||
|
let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
|
||||||
|
|
||||||
|
let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next();
|
||||||
|
|
||||||
|
let icons = HTML::get_urls_from_html(&icon_selector, &html, &url)
|
||||||
|
.map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML));
|
||||||
|
let icons: Vec<Icon> = join_all(icons)
|
||||||
|
.await
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|i| i.ok())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Ok(HTML { icons, manifest })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// `scan_html` unwraps the parsed selectors, so both constants must be
    /// valid CSS selectors.
    #[test]
    fn selectors_must_parse() {
        let cases = [
            (ICON_SELECTOR, "Icon selector didn't parse"),
            (MANIFEST_SELECTOR, "Manifest selector didn't parse"),
        ];

        for (selector, failure) in cases {
            Selector::parse(selector).expect(failure);
        }
    }
}
|
71
src/icon.rs
Normal file
71
src/icon.rs
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
use imagesize::ImageError;
|
||||||
|
pub use imagesize::ImageSize;
|
||||||
|
use reqwest::{Client, IntoUrl, Response};
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
use crate::Error;
|
||||||
|
|
||||||
|
/// Where an [`Icon`] was discovered.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IconKind {
    /// Found by probing a hardcoded path on the site.
    HardcodedURL,
    /// Found through a `<link>` element in the page's HTML.
    LinkedInHTML,
    /// Found in the icon list of a linked manifest.
    LinkedInManifest,
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct Icon {
|
||||||
|
pub kind: IconKind,
|
||||||
|
pub url: Url,
|
||||||
|
pub size: ImageSize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Icon {
|
||||||
|
/// This is a separate function because you can't break with a value
|
||||||
|
/// from `while let` loops (which is understandable)
|
||||||
|
async fn find_size(mut response: Response) -> Result<ImageSize, Error> {
|
||||||
|
let mut buffer = vec![];
|
||||||
|
while let Some(chunk) = response.chunk().await? {
|
||||||
|
buffer.extend_from_slice(&chunk);
|
||||||
|
match imagesize::blob_size(&buffer) {
|
||||||
|
Ok(size) => return Ok(size),
|
||||||
|
Err(ImageError::IoError(_)) => continue,
|
||||||
|
Err(_) => return Err(Error::UnsupportedImageFormat),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(Error::UnsupportedImageFormat)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create an `Icon` from a URL by fetching it partially using the given client to get its size.
|
||||||
|
/// This used to be public but I don't want random users to deal with `kind`
|
||||||
|
pub(crate) async fn from_url(
|
||||||
|
client: &Client,
|
||||||
|
url: impl IntoUrl,
|
||||||
|
kind: IconKind,
|
||||||
|
) -> Result<Self, Error> {
|
||||||
|
let response = client.get(url).send().await?;
|
||||||
|
let url = response.url().to_owned();
|
||||||
|
let size = Icon::find_size(response).await?;
|
||||||
|
Ok(Icon { kind, url, size })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Fetches Google's favicon over the network and prints its dimensions.
    #[tokio::test]
    async fn test_google() {
        let client = reqwest::Client::new();

        let fetched = Icon::from_url(
            &client,
            "https://google.com/favicon.ico",
            IconKind::HardcodedURL,
        )
        .await;
        let icon = fetched.unwrap();

        let (width, height) = (icon.size.width, icon.size.height);
        println!("The size of Google's favicon is {width}x{height} pixels.");
    }
}
|
95
src/lib.rs
Normal file
95
src/lib.rs
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
//! # favicon-scraper
|
||||||
|
//!
|
||||||
|
//! A simple crate to scrape favicons asynchronously that's intended to *just work*
|
||||||
|
//!
|
||||||
|
//! To get started, have a look at [`scrape`]!
|
||||||
|
|
||||||
|
pub mod error;
|
||||||
|
pub mod html;
|
||||||
|
pub mod icon;
|
||||||
|
pub mod manifest;
|
||||||
|
|
||||||
|
pub use error::Error;
|
||||||
|
use futures::future::{join, join_all};
|
||||||
|
use html::HTML;
|
||||||
|
use icon::{Icon, IconKind};
|
||||||
|
use manifest::scan_manifest;
|
||||||
|
use reqwest::{Client, IntoUrl};
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
/// Perform scraping.
|
||||||
|
///
|
||||||
|
/// The URL scheme **must** be either `http` or `https`.
|
||||||
|
///
|
||||||
|
/// This will load the given URL, parse the returned HTML, and if found, also load and parse any linked manifests.
|
||||||
|
///
|
||||||
|
/// Any found icons will be partially loaded to get their size.
|
||||||
|
/// ICO files will be interpreted as their largest size as per [`imagesize`'s README](https://github.com/Roughsketch/imagesize/blob/017b33da886a27484614e9527d14fc5f3f0d5079/README.md?plain=1#L41).
|
||||||
|
/// ```
|
||||||
|
/// # tokio_test::block_on(async {
|
||||||
|
/// use favicon_scraper::{scrape, Error};
|
||||||
|
///
|
||||||
|
/// let icons = scrape("https://kitsunes.dev", true).await.unwrap();
|
||||||
|
///
|
||||||
|
/// // Only HTTP(S) is supported
|
||||||
|
/// assert!(matches!(
|
||||||
|
/// scrape("ftp://example.com", true).await,
|
||||||
|
/// Err(Error::UnsupportedURLScheme)
|
||||||
|
/// ));
|
||||||
|
/// # })
|
||||||
|
/// ```
|
||||||
|
pub async fn scrape(url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
|
||||||
|
let url = url.into_url()?;
|
||||||
|
if !matches!(url.scheme(), "http" | "https") {
|
||||||
|
return Err(Error::UnsupportedURLScheme);
|
||||||
|
}
|
||||||
|
let client = Client::new();
|
||||||
|
|
||||||
|
let hardcoded_urls = join_all(vec![
|
||||||
|
try_hardcoded_path(&client, &url, "/favicon.ico"),
|
||||||
|
try_hardcoded_path(&client, &url, "/favicon.svg"),
|
||||||
|
try_hardcoded_path(&client, &url, "/favicon.png"),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let html = HTML::scan_html(&client, url.clone());
|
||||||
|
|
||||||
|
let (hardcoded_urls, html) = join(hardcoded_urls, html).await;
|
||||||
|
|
||||||
|
let mut icons: Vec<Icon> = hardcoded_urls.into_iter().flatten().collect();
|
||||||
|
|
||||||
|
if let Ok(mut html) = html {
|
||||||
|
icons.append(&mut html.icons);
|
||||||
|
|
||||||
|
if let Some(manifest) = html.manifest {
|
||||||
|
if let Ok(mut manifest_icons) = scan_manifest(&client, manifest).await {
|
||||||
|
icons.append(&mut manifest_icons);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(icons)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn try_hardcoded_path(client: &Client, url: &Url, path: &'static str) -> Option<Icon> {
|
||||||
|
let url = url.join(path).unwrap();
|
||||||
|
Icon::from_url(client, url, IconKind::HardcodedURL)
|
||||||
|
.await
|
||||||
|
.ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Scrapes a real site over the network and prints every icon found.
    ///
    /// Using this as a test because site_icons failed on it for some reason
    #[tokio::test]
    async fn test_catwithaclarinet() {
        let found = scrape("https://ck.catwithaclari.net").await.unwrap();
        println!("Found {} icons:\n", found.len());

        for Icon { kind, url, size } in found {
            println!("URL: {}", url);
            println!("Size: {}x{} pixels", size.width, size.height);
            println!("Kind of icon: {:?}\n", kind);
        }
    }
}
|
33
src/manifest.rs
Normal file
33
src/manifest.rs
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
use futures::future::join_all;
|
||||||
|
use reqwest::{Client, IntoUrl};
|
||||||
|
use serde::Deserialize;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
icon::{Icon, IconKind},
|
||||||
|
Error,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct Manifest {
|
||||||
|
icons: Vec<ManifestIcon>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct ManifestIcon {
|
||||||
|
src: String,
|
||||||
|
// Not gonna trust or parse the sizes
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn scan_manifest(client: &Client, url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
|
||||||
|
let manifest: Manifest = client.get(url).send().await?.json().await?;
|
||||||
|
Ok(join_all(
|
||||||
|
manifest
|
||||||
|
.icons
|
||||||
|
.into_iter()
|
||||||
|
.map(|i| Icon::from_url(client, i.src, IconKind::LinkedInManifest)),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|i| i.ok())
|
||||||
|
.collect())
|
||||||
|
}
|
Loading…
Add table
Reference in a new issue