Compare commits

..

No commits in common. "main" and "v0.1.0" have entirely different histories.
main ... v0.1.0

9 changed files with 16 additions and 93 deletions

View file

@ -1,20 +0,0 @@
name: Build & Test
on: [push]
jobs:
build-run:
runs-on: docker
container:
image: rust
steps:
- name: Update package repos
run: apt update
- name: Install Node using apt
run: apt install nodejs -y
- name: Checkout repo
uses: actions/checkout@v4
- name: Build using Cargo
run: cargo build --verbose
- name: Run unit tests
run: cargo test --verbose

2
Cargo.lock generated
View file

@ -226,7 +226,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "favicon-scraper"
version = "0.3.1"
version = "0.1.0"
dependencies = [
"futures",
"imagesize",

View file

@ -1,13 +1,12 @@
[package]
name = "favicon-scraper"
version = "0.3.1"
version = "0.1.0"
edition = "2021"
license = "MIT"
description = "A favicon scraper that just works"
homepage = "https://kitsunes.dev/Nekomata/favicon-scraper"
repository = "https://kitsunes.dev/Nekomata/favicon-scraper"
readme = "README.md"
exclude = ["/.forgejo"]
[dependencies]
futures = { version = "0.3.31", default-features = false, features = ["std"] }

View file

@ -1,15 +1,5 @@
# favicon-scraper
Scrapes favicons from websites. Does not particularly care for 100% optimal
performance; it just needs to work.
Scrapes favicons from websites.
To get started, try the `scrape` function:
```rust
use favicon_scraper::{scrape, Icon};
let icons: Vec<Icon> = scrape("https://google.com").await.unwrap();
// Should find something like "https://www.google.com/favicon.ico"
println!("Google's icon can be found at {}", icons[0].url);
```
Does not particularly care for 100% optimal performance; it just needs to work.

View file

@ -3,11 +3,6 @@ use std::fmt::{Display, Formatter};
macro_rules! impl_error {
($($name: ident ($typ: ty)),*; $($simple_name: ident),*) => {
/// favicon-scraper's automatically generated Error type.
///
/// If more fatal errors are introduced in the future, this type may expand,
/// which is why it's marked as `non_exhaustive`.
#[non_exhaustive]
#[derive(Debug)]
pub enum Error {
$($simple_name,)*

View file

@ -11,10 +11,6 @@ const ICON_SELECTOR: &str =
"link[rel~='icon'], link[rel~='apple-touch-icon'], link[rel~='apple-touch-icon-precomposed']";
const MANIFEST_SELECTOR: &str = "link[rel~='manifest']";
/// Represents useful data scraped from HTML.
///
/// To obtain, use [`HTML::scan_html`].
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct HTML {
pub icons: Vec<Icon>,
pub manifest: Option<Url>,
@ -31,29 +27,18 @@ impl HTML {
.filter_map(|u| url.join(u).ok())
}
fn parse_html(text: String, url: Url) -> (Option<Url>, Vec<Url>) {
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
let response = client.get(url).send().await?;
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
let text = response.text().await?;
let html = SHTML::parse_document(&text);
let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next();
(
manifest,
HTML::get_urls_from_html(&icon_selector, &html, &url).collect(),
)
}
/// Scans an HTML file for icons and a Web App Manifest.
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
let response = client.get(url).send().await?;
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
let text = response.text().await?;
let (manifest, icons) = HTML::parse_html(text, url);
let icons = icons
.into_iter()
let icons = HTML::get_urls_from_html(&icon_selector, &html, &url)
.map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML));
let icons: Vec<Icon> = join_all(icons)
.await

View file

@ -5,30 +5,17 @@ use url::Url;
use crate::Error;
/// The source of a scraped icon.
///
/// More sources may be added in the future.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IconKind {
/// Discovered through checking a hardcoded URL path, like `/favicon.ico`
HardcodedURL,
/// Discovered through parsing the HTML for `<link rel="icon">`s (or similar)
LinkedInHTML,
/// Discovered through parsing the Web App Manifest linked in the HTML in `<link rel="manifest">`
LinkedInManifest,
}
/// A scraped icon.
///
/// To obtain, use [`crate::scrape`], [`crate::html::HTML::scan_html`], or [`crate::manifest::scan_manifest`].
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
#[derive(Debug, Clone)]
pub struct Icon {
/// Describes how the icon was discovered
pub kind: IconKind,
/// The source URL of the scraped icon, with redirects resolved
pub url: Url,
/// The size of the scraped icon, in pixels
pub size: ImageSize,
}

View file

@ -12,7 +12,7 @@ pub mod manifest;
pub use error::Error;
use futures::future::{join, join_all};
use html::HTML;
pub use icon::{Icon, IconKind};
use icon::{Icon, IconKind};
use manifest::scan_manifest;
use reqwest::{Client, IntoUrl};
use url::Url;
@ -29,11 +29,11 @@ use url::Url;
/// # tokio_test::block_on(async {
/// use favicon_scraper::{scrape, Error};
///
/// let icons = scrape("https://kitsunes.dev").await.unwrap();
/// let icons = scrape("https://kitsunes.dev", true).await.unwrap();
///
/// // Only HTTP(S) is supported
/// assert!(matches!(
/// scrape("ftp://example.com").await,
/// scrape("ftp://example.com", true).await,
/// Err(Error::UnsupportedURLScheme)
/// ));
/// # })
@ -92,14 +92,4 @@ mod tests {
println!("Kind of icon: {:?}\n", icon.kind);
}
}
#[tokio::test]
async fn test_readme_example() {
use crate::{scrape, Icon};
let icons: Vec<Icon> = scrape("https://google.com").await.unwrap();
// Should find something like "https://www.google.com/favicon.ico"
println!("Google's icon can be found at {}", icons[0].url);
}
}

View file

@ -18,16 +18,13 @@ struct ManifestIcon {
// Not gonna trust or parse the sizes
}
/// Scans a Web App Manifest for icons.
pub async fn scan_manifest(client: &Client, url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
let url = url.into_url()?;
let manifest: Manifest = client.get(url.clone()).send().await?.json().await?;
let manifest: Manifest = client.get(url).send().await?.json().await?;
Ok(join_all(
manifest
.icons
.into_iter()
.filter_map(|i| url.join(&i.src).ok())
.map(|u| Icon::from_url(client, u, IconKind::LinkedInManifest)),
.map(|i| Icon::from_url(client, i.src, IconKind::LinkedInManifest)),
)
.await
.into_iter()