It works
This commit is contained in:
commit
ed0db998d9
9 changed files with 2363 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
/target
|
2026
Cargo.lock
generated
Normal file
2026
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
16
Cargo.toml
Normal file
16
Cargo.toml
Normal file
|
@ -0,0 +1,16 @@
|
|||
[package]
|
||||
name = "favicon-scraper"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
futures = { version = "0.3.31", default-features = false, features = ["std"] }
|
||||
imagesize = "0.13.0"
|
||||
reqwest = { version = "0.12.12", features = ["json"] }
|
||||
scraper = "0.22.0"
|
||||
serde = { version = "1.0.217", features = ["derive"] }
|
||||
url = "2.5.4"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = { version = "1.43.0", features = ["rt", "macros"] }
|
||||
tokio-test = "0.4.4"
|
5
README.md
Normal file
5
README.md
Normal file
|
@ -0,0 +1,5 @@
|
|||
# favicon-scraper
|
||||
|
||||
Scrapes favicons from websites.
|
||||
|
||||
It does not aim for 100% optimal performance; it just needs to work.
|
54
src/error.rs
Normal file
54
src/error.rs
Normal file
|
@ -0,0 +1,54 @@
|
|||
use std::error::Error as StdError;
|
||||
use std::fmt::{Display, Formatter};
|
||||
|
||||
/// Generates the crate-wide `Error` enum along with its trait implementations.
///
/// Invocation syntax: `impl_error!(Wrapper(SourceType), ...; UnitVariant, ...)`.
///
/// * Every `Wrapper(SourceType)` pair becomes a tuple variant, gets a
///   `From<SourceType>` impl (so `?` converts automatically), and displays
///   exactly like the wrapped value.
/// * Every `UnitVariant` becomes a payload-free variant whose `Display` output
///   is its own name.
///
/// The expansion expects `StdError`, `Display` and `Formatter` to be in scope
/// at the call site.
macro_rules! impl_error {
    ($($wrapper:ident ($inner:ty)),*; $($unit:ident),*) => {
        #[derive(Debug)]
        pub enum Error {
            $($unit,)*
            $($wrapper($inner),)*
        }

        $(
            impl From<$inner> for Error {
                fn from(source: $inner) -> Self {
                    Self::$wrapper(source)
                }
            }
        )*

        impl Display for Error {
            fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
                match self {
                    // Unit variants print their own identifier.
                    $(Self::$unit => formatter.write_str(stringify!($unit)),)*
                    // Wrapping variants defer to the wrapped error.
                    $(Self::$wrapper(inner) => Display::fmt(inner, formatter),)*
                }
            }
        }

        impl StdError for Error {}
    };
}
|
||||
|
||||
// Instantiate the crate's `Error` type: wrapping variants go before the `;`,
// payload-free variants after it.
impl_error! {
    Reqwest(reqwest::Error);
    UnsupportedURLScheme,
    UnsupportedImageFormat
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Only instantiable for types that can be boxed as
    /// `Box<dyn StdError + Send + Sync>`.
    fn assert_send_sync_error<T: StdError + Send + Sync + 'static>() {}

    /// This test just needs to compile.
    ///
    /// Compiling it fails if `Error` isn't `Send + Sync`.
    /// `Error` must be `Send + Sync` because `site_icons` not being `Send`
    /// was the motivation for writing this crate.
    ///
    /// The check is a plain trait-bound assertion, so no `Error` value has to
    /// be constructed and the test does not need to panic to pass.
    #[test]
    fn is_send_and_sync() {
        assert_send_sync_error::<Error>();
    }
}
|
62
src/html.rs
Normal file
62
src/html.rs
Normal file
|
@ -0,0 +1,62 @@
|
|||
use crate::{
|
||||
icon::{Icon, IconKind},
|
||||
Error,
|
||||
};
|
||||
use futures::future::join_all;
|
||||
use reqwest::{Client, IntoUrl};
|
||||
use scraper::{Html as SHTML, Selector};
|
||||
use url::Url;
|
||||
|
||||
/// CSS selector for `<link>` elements whose `rel` list contains `manifest`.
const MANIFEST_SELECTOR: &str = "link[rel~='manifest']";

/// CSS selector for `<link>` elements whose `rel` list contains `icon`,
/// `apple-touch-icon` or `apple-touch-icon-precomposed`.
const ICON_SELECTOR: &str =
    "link[rel~='icon'], link[rel~='apple-touch-icon'], link[rel~='apple-touch-icon-precomposed']";
|
||||
|
||||
pub struct HTML {
|
||||
pub icons: Vec<Icon>,
|
||||
pub manifest: Option<Url>,
|
||||
}
|
||||
|
||||
impl HTML {
|
||||
fn get_urls_from_html<'s, 'h, 'u>(
|
||||
selector: &'s Selector,
|
||||
html: &'h SHTML,
|
||||
url: &'u Url,
|
||||
) -> impl Iterator<Item = Url> + use<'s, 'h, 'u> {
|
||||
html.select(selector)
|
||||
.filter_map(|e| e.attr("href"))
|
||||
.filter_map(|u| url.join(u).ok())
|
||||
}
|
||||
|
||||
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
|
||||
let response = client.get(url).send().await?;
|
||||
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
|
||||
let text = response.text().await?;
|
||||
let html = SHTML::parse_document(&text);
|
||||
|
||||
let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
|
||||
let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
|
||||
|
||||
let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next();
|
||||
|
||||
let icons = HTML::get_urls_from_html(&icon_selector, &html, &url)
|
||||
.map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML));
|
||||
let icons: Vec<Icon> = join_all(icons)
|
||||
.await
|
||||
.into_iter()
|
||||
.filter_map(|i| i.ok())
|
||||
.collect();
|
||||
|
||||
Ok(HTML { icons, manifest })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `scan_html` unwraps the parsed selectors, so both constants have to be
    /// valid CSS selectors.
    #[test]
    fn selectors_must_parse() {
        let cases = [
            (ICON_SELECTOR, "Icon selector didn't parse"),
            (MANIFEST_SELECTOR, "Manifest selector didn't parse"),
        ];
        for (source, failure) in cases {
            Selector::parse(source).expect(failure);
        }
    }
}
|
71
src/icon.rs
Normal file
71
src/icon.rs
Normal file
|
@ -0,0 +1,71 @@
|
|||
use imagesize::ImageError;
|
||||
pub use imagesize::ImageSize;
|
||||
use reqwest::{Client, IntoUrl, Response};
|
||||
use url::Url;
|
||||
|
||||
use crate::Error;
|
||||
|
||||
/// Where an [`Icon`] was discovered.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IconKind {
    /// Found by probing a well-known path on the site.
    HardcodedURL,
    /// Referenced by a `<link>` element in the HTML document.
    LinkedInHTML,
    /// Listed in the manifest linked from the HTML document.
    LinkedInManifest,
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Icon {
|
||||
pub kind: IconKind,
|
||||
pub url: Url,
|
||||
pub size: ImageSize,
|
||||
}
|
||||
|
||||
impl Icon {
|
||||
/// This is a separate function because you can't break with a value
|
||||
/// from `while let` loops (which is understandable)
|
||||
async fn find_size(mut response: Response) -> Result<ImageSize, Error> {
|
||||
let mut buffer = vec![];
|
||||
while let Some(chunk) = response.chunk().await? {
|
||||
buffer.extend_from_slice(&chunk);
|
||||
match imagesize::blob_size(&buffer) {
|
||||
Ok(size) => return Ok(size),
|
||||
Err(ImageError::IoError(_)) => continue,
|
||||
Err(_) => return Err(Error::UnsupportedImageFormat),
|
||||
}
|
||||
}
|
||||
Err(Error::UnsupportedImageFormat)
|
||||
}
|
||||
|
||||
/// Create an `Icon` from a URL by fetching it partially using the given client to get its size.
|
||||
/// This used to be public but I don't want random users to deal with `kind`
|
||||
pub(crate) async fn from_url(
|
||||
client: &Client,
|
||||
url: impl IntoUrl,
|
||||
kind: IconKind,
|
||||
) -> Result<Self, Error> {
|
||||
let response = client.get(url).send().await?;
|
||||
let url = response.url().to_owned();
|
||||
let size = Icon::find_size(response).await?;
|
||||
Ok(Icon { kind, url, size })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Network smoke test: fetches Google's favicon and prints its size.
    #[tokio::test]
    async fn test_google() {
        let http = reqwest::Client::new();
        let fetched = Icon::from_url(
            &http,
            "https://google.com/favicon.ico",
            IconKind::HardcodedURL,
        )
        .await;

        let ImageSize { width, height } = fetched.unwrap().size;
        println!("The size of Google's favicon is {width}x{height} pixels.");
    }
}
|
95
src/lib.rs
Normal file
95
src/lib.rs
Normal file
|
@ -0,0 +1,95 @@
|
|||
//! # favicon-scraper
|
||||
//!
|
||||
//! A simple crate to scrape favicons asynchronously that's intended to *just work*
|
||||
//!
|
||||
//! To get started, have a look at [`scrape`]!
|
||||
|
||||
pub mod error;
|
||||
pub mod html;
|
||||
pub mod icon;
|
||||
pub mod manifest;
|
||||
|
||||
pub use error::Error;
|
||||
use futures::future::{join, join_all};
|
||||
use html::HTML;
|
||||
use icon::{Icon, IconKind};
|
||||
use manifest::scan_manifest;
|
||||
use reqwest::{Client, IntoUrl};
|
||||
use url::Url;
|
||||
|
||||
/// Perform scraping.
|
||||
///
|
||||
/// The URL scheme **must** be either `http` or `https`.
|
||||
///
|
||||
/// This will load the given URL, parse the returned HTML, and if found, also load and parse any linked manifests.
|
||||
///
|
||||
/// Any found icons will be partially loaded to get their size.
|
||||
/// ICO files will be interpreted as their largest size as per [`imagesize`'s README](https://github.com/Roughsketch/imagesize/blob/017b33da886a27484614e9527d14fc5f3f0d5079/README.md?plain=1#L41).
|
||||
/// ```
|
||||
/// # tokio_test::block_on(async {
|
||||
/// use favicon_scraper::{scrape, Error};
|
||||
///
|
||||
/// let icons = scrape("https://kitsunes.dev", true).await.unwrap();
|
||||
///
|
||||
/// // Only HTTP(S) is supported
|
||||
/// assert!(matches!(
|
||||
/// scrape("ftp://example.com", true).await,
|
||||
/// Err(Error::UnsupportedURLScheme)
|
||||
/// ));
|
||||
/// # })
|
||||
/// ```
|
||||
pub async fn scrape(url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
|
||||
let url = url.into_url()?;
|
||||
if !matches!(url.scheme(), "http" | "https") {
|
||||
return Err(Error::UnsupportedURLScheme);
|
||||
}
|
||||
let client = Client::new();
|
||||
|
||||
let hardcoded_urls = join_all(vec![
|
||||
try_hardcoded_path(&client, &url, "/favicon.ico"),
|
||||
try_hardcoded_path(&client, &url, "/favicon.svg"),
|
||||
try_hardcoded_path(&client, &url, "/favicon.png"),
|
||||
]);
|
||||
|
||||
let html = HTML::scan_html(&client, url.clone());
|
||||
|
||||
let (hardcoded_urls, html) = join(hardcoded_urls, html).await;
|
||||
|
||||
let mut icons: Vec<Icon> = hardcoded_urls.into_iter().flatten().collect();
|
||||
|
||||
if let Ok(mut html) = html {
|
||||
icons.append(&mut html.icons);
|
||||
|
||||
if let Some(manifest) = html.manifest {
|
||||
if let Ok(mut manifest_icons) = scan_manifest(&client, manifest).await {
|
||||
icons.append(&mut manifest_icons);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(icons)
|
||||
}
|
||||
|
||||
async fn try_hardcoded_path(client: &Client, url: &Url, path: &'static str) -> Option<Icon> {
|
||||
let url = url.join(path).unwrap();
|
||||
Icon::from_url(client, url, IconKind::HardcodedURL)
|
||||
.await
|
||||
.ok()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Network smoke test against a site that `site_icons` failed on.
    #[tokio::test]
    async fn test_catwithaclarinet() {
        let found = scrape("https://ck.catwithaclari.net").await.unwrap();

        println!("Found {} icons:\n", found.len());
        for Icon { kind, url, size } in found {
            println!("URL: {}", url);
            println!("Size: {}x{} pixels", size.width, size.height);
            println!("Kind of icon: {:?}\n", kind);
        }
    }
}
|
33
src/manifest.rs
Normal file
33
src/manifest.rs
Normal file
|
@ -0,0 +1,33 @@
|
|||
use futures::future::join_all;
|
||||
use reqwest::{Client, IntoUrl};
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::{
|
||||
icon::{Icon, IconKind},
|
||||
Error,
|
||||
};
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Manifest {
|
||||
icons: Vec<ManifestIcon>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ManifestIcon {
|
||||
src: String,
|
||||
// Not gonna trust or parse the sizes
|
||||
}
|
||||
|
||||
pub async fn scan_manifest(client: &Client, url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
|
||||
let manifest: Manifest = client.get(url).send().await?.json().await?;
|
||||
Ok(join_all(
|
||||
manifest
|
||||
.icons
|
||||
.into_iter()
|
||||
.map(|i| Icon::from_url(client, i.src, IconKind::LinkedInManifest)),
|
||||
)
|
||||
.await
|
||||
.into_iter()
|
||||
.filter_map(|i| i.ok())
|
||||
.collect())
|
||||
}
|
Loading…
Add table
Reference in a new issue