Compare commits

..

13 commits
v0.1.1 ... main

Author SHA1 Message Date
81dbf946e9 Add CI
All checks were successful
Build & Test / build-run (push) Successful in 30s
2025-02-04 00:17:26 +01:00
61619ac660 Bump version 2025-02-02 23:20:20 +01:00
314bc347de Add an example to the readme and test it 2025-02-02 23:19:30 +01:00
f1a4e2e819 Pub use icon stuff for convenience 2025-02-02 23:19:15 +01:00
108eb427b8 Fix relative URLs in manifests not being handled 2025-02-02 23:06:24 +01:00
ac481ec06a Bump version 2025-02-02 15:22:25 +01:00
5b111fd292 Scraping is now thread-safe 2025-02-02 15:22:11 +01:00
68180dc5e5 Bump version 2025-02-01 01:09:12 +01:00
c912c65255 A few more derives 2025-02-01 01:02:19 +01:00
541d70711c Add more docs 2025-02-01 00:53:22 +01:00
a1619897e3 Make IconKind non_exhaustive 2025-02-01 00:52:03 +01:00
f20e410422 Error stuff 2025-02-01 00:36:41 +01:00
f8fd42f7c4 Add some extra bits of documentation 2025-02-01 00:29:15 +01:00
9 changed files with 91 additions and 14 deletions

View file

@@ -0,0 +1,20 @@
name: Build & Test
on: [push]
jobs:
build-run:
runs-on: docker
container:
image: rust
steps:
- name: Update package repos
run: apt update
- name: Install Node using apt
run: apt install nodejs -y
- name: Checkout repo
uses: actions/checkout@v4
- name: Build using Cargo
run: cargo build --verbose
- name: Run unit tests
run: cargo test --verbose

2
Cargo.lock generated
View file

@@ -226,7 +226,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "favicon-scraper"
version = "0.1.1"
version = "0.3.1"
dependencies = [
"futures",
"imagesize",

View file

@@ -1,12 +1,13 @@
[package]
name = "favicon-scraper"
version = "0.1.1"
version = "0.3.1"
edition = "2021"
license = "MIT"
description = "A favicon scraper that just works"
homepage = "https://kitsunes.dev/Nekomata/favicon-scraper"
repository = "https://kitsunes.dev/Nekomata/favicon-scraper"
readme = "README.md"
exclude = ["/.forgejo"]
[dependencies]
futures = { version = "0.3.31", default-features = false, features = ["std"] }

View file

@@ -1,5 +1,15 @@
# favicon-scraper
Scrapes favicons from websites.
Scrapes favicons from websites. Does not particularly care for 100% optimal
performance, it just needs to work.
Does not particularly care for 100% optimal performance, it just needs to work
To get started, try the `scrape` function:
```rust
use favicon_scraper::{scrape, Icon};
let icons: Vec<Icon> = scrape("https://google.com").await.unwrap();
// Should find something like "https://www.google.com/favicon.ico"
println!("Google's icon can be found at {}", icons[0].url);
```

View file

@@ -3,6 +3,11 @@ use std::fmt::{Display, Formatter};
macro_rules! impl_error {
($($name: ident ($typ: ty)),*; $($simple_name: ident),*) => {
/// favicon-scraper's automatically generated Error type.
///
/// If more fatal errors are introduced in the future, this type may expand.
/// Hence why it's marked as `non_exhaustive`.
#[non_exhaustive]
#[derive(Debug)]
pub enum Error {
$($simple_name,)*

View file

@@ -11,6 +11,10 @@ const ICON_SELECTOR: &str =
"link[rel~='icon'], link[rel~='apple-touch-icon'], link[rel~='apple-touch-icon-precomposed']";
const MANIFEST_SELECTOR: &str = "link[rel~='manifest']";
/// Represents useful data scraped from HTML.
///
/// To obtain, use [`HTML::scan_html`].
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct HTML {
pub icons: Vec<Icon>,
pub manifest: Option<Url>,
@@ -27,18 +31,29 @@ impl HTML {
.filter_map(|u| url.join(u).ok())
}
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
let response = client.get(url).send().await?;
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
let text = response.text().await?;
fn parse_html(text: String, url: Url) -> (Option<Url>, Vec<Url>) {
let html = SHTML::parse_document(&text);
let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next();
(
manifest,
HTML::get_urls_from_html(&icon_selector, &html, &url).collect(),
)
}
let icons = HTML::get_urls_from_html(&icon_selector, &html, &url)
/// Scans an HTML file for icons and a Web App Manifest.
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
let response = client.get(url).send().await?;
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
let text = response.text().await?;
let (manifest, icons) = HTML::parse_html(text, url);
let icons = icons
.into_iter()
.map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML));
let icons: Vec<Icon> = join_all(icons)
.await

View file

@@ -5,17 +5,30 @@ use url::Url;
use crate::Error;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// The source of a scraped icon.
///
/// More sources may be added in the future.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum IconKind {
/// Discovered through checking a hardcoded URL path, like `/favicon.ico`
HardcodedURL,
/// Discovered through parsing the HTML for `<link rel="icon">`s (or similar)
LinkedInHTML,
/// Discovered through parsing the Web App Manifest linked in the HTML in `<link rel="manifest">`
LinkedInManifest,
}
#[derive(Debug, Clone)]
/// A scraped icon.
///
/// To obtain, use [`crate::scrape`], [`crate::html::HTML::scan_html`], or [`crate::manifest::scan_manifest`].
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct Icon {
/// Describes how the icon was discovered
pub kind: IconKind,
/// The source URL of the scraped icon, with redirects resolved
pub url: Url,
/// The size of the scraped icon, in pixels
pub size: ImageSize,
}

View file

@@ -12,7 +12,7 @@ pub mod manifest;
pub use error::Error;
use futures::future::{join, join_all};
use html::HTML;
use icon::{Icon, IconKind};
pub use icon::{Icon, IconKind};
use manifest::scan_manifest;
use reqwest::{Client, IntoUrl};
use url::Url;
@@ -92,4 +92,14 @@ mod tests {
println!("Kind of icon: {:?}\n", icon.kind);
}
}
#[tokio::test]
async fn test_readme_example() {
use crate::{scrape, Icon};
let icons: Vec<Icon> = scrape("https://google.com").await.unwrap();
// Should find something like "https://www.google.com/favicon.ico"
println!("Google's icon can be found at {}", icons[0].url);
}
}

View file

@@ -18,13 +18,16 @@ struct ManifestIcon {
// Not gonna trust or parse the sizes
}
/// Scans a Web App Manifest for icons.
pub async fn scan_manifest(client: &Client, url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
let manifest: Manifest = client.get(url).send().await?.json().await?;
let url = url.into_url()?;
let manifest: Manifest = client.get(url.clone()).send().await?.json().await?;
Ok(join_all(
manifest
.icons
.into_iter()
.map(|i| Icon::from_url(client, i.src, IconKind::LinkedInManifest)),
.filter_map(|i| url.join(&i.src).ok())
.map(|u| Icon::from_url(client, u, IconKind::LinkedInManifest)),
)
.await
.into_iter()