Compare commits
No commits in common. "main" and "v0.1.1" have entirely different histories.
9 changed files with 14 additions and 91 deletions
|
@ -1,20 +0,0 @@
|
||||||
name: Build & Test
|
|
||||||
|
|
||||||
on: [push]
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
build-run:
|
|
||||||
runs-on: docker
|
|
||||||
container:
|
|
||||||
image: rust
|
|
||||||
steps:
|
|
||||||
- name: Update package repos
|
|
||||||
run: apt update
|
|
||||||
- name: Install Node using apt
|
|
||||||
run: apt install nodejs -y
|
|
||||||
- name: Checkout repo
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
- name: Build using Cargo
|
|
||||||
run: cargo build --verbose
|
|
||||||
- name: Run unit tests
|
|
||||||
run: cargo test --verbose
|
|
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@ -226,7 +226,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "favicon-scraper"
|
name = "favicon-scraper"
|
||||||
version = "0.3.1"
|
version = "0.1.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"imagesize",
|
"imagesize",
|
||||||
|
|
|
@ -1,13 +1,12 @@
|
||||||
[package]
|
[package]
|
||||||
name = "favicon-scraper"
|
name = "favicon-scraper"
|
||||||
version = "0.3.1"
|
version = "0.1.1"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
description = "A favicon scraper that just works"
|
description = "A favicon scraper that just works"
|
||||||
homepage = "https://kitsunes.dev/Nekomata/favicon-scraper"
|
homepage = "https://kitsunes.dev/Nekomata/favicon-scraper"
|
||||||
repository = "https://kitsunes.dev/Nekomata/favicon-scraper"
|
repository = "https://kitsunes.dev/Nekomata/favicon-scraper"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
exclude = ["/.forgejo"]
|
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
futures = { version = "0.3.31", default-features = false, features = ["std"] }
|
futures = { version = "0.3.31", default-features = false, features = ["std"] }
|
||||||
|
|
14
README.md
14
README.md
|
@ -1,15 +1,5 @@
|
||||||
# favicon-scraper
|
# favicon-scraper
|
||||||
|
|
||||||
Scrapes favicons from websites. Does not particularly care for 100% optimal
|
Scrapes favicons from websites.
|
||||||
performance, it just needs to work.
|
|
||||||
|
|
||||||
To get started, try the `scrape` function:
|
Does not particularly care for 100% optimal performance, it just needs to work
|
||||||
|
|
||||||
```rust
|
|
||||||
use favicon_scraper::{scrape, Icon};
|
|
||||||
|
|
||||||
let icons: Vec<Icon> = scrape("https://google.com").await.unwrap();
|
|
||||||
|
|
||||||
// Should find something like "https://www.google.com/favicon.ico"
|
|
||||||
println!("Google's icon can be found at {}", icons[0].url);
|
|
||||||
```
|
|
||||||
|
|
|
@ -3,11 +3,6 @@ use std::fmt::{Display, Formatter};
|
||||||
|
|
||||||
macro_rules! impl_error {
|
macro_rules! impl_error {
|
||||||
($($name: ident ($typ: ty)),*; $($simple_name: ident),*) => {
|
($($name: ident ($typ: ty)),*; $($simple_name: ident),*) => {
|
||||||
/// favicon-scraper's automatically generated Error type.
|
|
||||||
///
|
|
||||||
/// If more fatal errors are introduced in the future, this type may expand.
|
|
||||||
/// Hence why it's marked as `non_exhaustive`.
|
|
||||||
#[non_exhaustive]
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum Error {
|
pub enum Error {
|
||||||
$($simple_name,)*
|
$($simple_name,)*
|
||||||
|
|
25
src/html.rs
25
src/html.rs
|
@ -11,10 +11,6 @@ const ICON_SELECTOR: &str =
|
||||||
"link[rel~='icon'], link[rel~='apple-touch-icon'], link[rel~='apple-touch-icon-precomposed']";
|
"link[rel~='icon'], link[rel~='apple-touch-icon'], link[rel~='apple-touch-icon-precomposed']";
|
||||||
const MANIFEST_SELECTOR: &str = "link[rel~='manifest']";
|
const MANIFEST_SELECTOR: &str = "link[rel~='manifest']";
|
||||||
|
|
||||||
/// Represents useful data scraped from HTML.
|
|
||||||
///
|
|
||||||
/// To obtain, use [`HTML::scan_html`].
|
|
||||||
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
|
|
||||||
pub struct HTML {
|
pub struct HTML {
|
||||||
pub icons: Vec<Icon>,
|
pub icons: Vec<Icon>,
|
||||||
pub manifest: Option<Url>,
|
pub manifest: Option<Url>,
|
||||||
|
@ -31,29 +27,18 @@ impl HTML {
|
||||||
.filter_map(|u| url.join(u).ok())
|
.filter_map(|u| url.join(u).ok())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_html(text: String, url: Url) -> (Option<Url>, Vec<Url>) {
|
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
|
||||||
|
let response = client.get(url).send().await?;
|
||||||
|
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
|
||||||
|
let text = response.text().await?;
|
||||||
let html = SHTML::parse_document(&text);
|
let html = SHTML::parse_document(&text);
|
||||||
|
|
||||||
let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
|
let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
|
||||||
let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
|
let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
|
||||||
|
|
||||||
let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next();
|
let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next();
|
||||||
(
|
|
||||||
manifest,
|
|
||||||
HTML::get_urls_from_html(&icon_selector, &html, &url).collect(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Scans an HTML file for icons and a Web App Manifest.
|
let icons = HTML::get_urls_from_html(&icon_selector, &html, &url)
|
||||||
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
|
|
||||||
let response = client.get(url).send().await?;
|
|
||||||
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
|
|
||||||
let text = response.text().await?;
|
|
||||||
|
|
||||||
let (manifest, icons) = HTML::parse_html(text, url);
|
|
||||||
|
|
||||||
let icons = icons
|
|
||||||
.into_iter()
|
|
||||||
.map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML));
|
.map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML));
|
||||||
let icons: Vec<Icon> = join_all(icons)
|
let icons: Vec<Icon> = join_all(icons)
|
||||||
.await
|
.await
|
||||||
|
|
17
src/icon.rs
17
src/icon.rs
|
@ -5,30 +5,17 @@ use url::Url;
|
||||||
|
|
||||||
use crate::Error;
|
use crate::Error;
|
||||||
|
|
||||||
/// The source of a scraped icon.
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
///
|
|
||||||
/// More sources may be added in the future.
|
|
||||||
#[non_exhaustive]
|
|
||||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
|
|
||||||
pub enum IconKind {
|
pub enum IconKind {
|
||||||
/// Discovered through checking a hardcoded URL path, like `/favicon.ico`
|
|
||||||
HardcodedURL,
|
HardcodedURL,
|
||||||
/// Discovered through parsing the HTML for `<link rel="icon">`s (or similar)
|
|
||||||
LinkedInHTML,
|
LinkedInHTML,
|
||||||
/// Discovered through parsing the Web App Manifest linked in the HTML in `<link rel="manifest">`
|
|
||||||
LinkedInManifest,
|
LinkedInManifest,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A scraped icon.
|
#[derive(Debug, Clone)]
|
||||||
///
|
|
||||||
/// To obtain, use [`crate::scrape`], [`crate::html::HTML::scan_html`], or [`crate::manifest::scan_manifest`].
|
|
||||||
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
|
|
||||||
pub struct Icon {
|
pub struct Icon {
|
||||||
/// Describes how the icon was discovered
|
|
||||||
pub kind: IconKind,
|
pub kind: IconKind,
|
||||||
/// The source URL of the scraped icon, with redirects resolved
|
|
||||||
pub url: Url,
|
pub url: Url,
|
||||||
/// The size of the scraped icon, in pixels
|
|
||||||
pub size: ImageSize,
|
pub size: ImageSize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
12
src/lib.rs
12
src/lib.rs
|
@ -12,7 +12,7 @@ pub mod manifest;
|
||||||
pub use error::Error;
|
pub use error::Error;
|
||||||
use futures::future::{join, join_all};
|
use futures::future::{join, join_all};
|
||||||
use html::HTML;
|
use html::HTML;
|
||||||
pub use icon::{Icon, IconKind};
|
use icon::{Icon, IconKind};
|
||||||
use manifest::scan_manifest;
|
use manifest::scan_manifest;
|
||||||
use reqwest::{Client, IntoUrl};
|
use reqwest::{Client, IntoUrl};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
@ -92,14 +92,4 @@ mod tests {
|
||||||
println!("Kind of icon: {:?}\n", icon.kind);
|
println!("Kind of icon: {:?}\n", icon.kind);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn test_readme_example() {
|
|
||||||
use crate::{scrape, Icon};
|
|
||||||
|
|
||||||
let icons: Vec<Icon> = scrape("https://google.com").await.unwrap();
|
|
||||||
|
|
||||||
// Should find something like "https://www.google.com/favicon.ico"
|
|
||||||
println!("Google's icon can be found at {}", icons[0].url);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,16 +18,13 @@ struct ManifestIcon {
|
||||||
// Not gonna trust or parse the sizes
|
// Not gonna trust or parse the sizes
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Scans a Web App Manifest for icons.
|
|
||||||
pub async fn scan_manifest(client: &Client, url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
|
pub async fn scan_manifest(client: &Client, url: impl IntoUrl) -> Result<Vec<Icon>, Error> {
|
||||||
let url = url.into_url()?;
|
let manifest: Manifest = client.get(url).send().await?.json().await?;
|
||||||
let manifest: Manifest = client.get(url.clone()).send().await?.json().await?;
|
|
||||||
Ok(join_all(
|
Ok(join_all(
|
||||||
manifest
|
manifest
|
||||||
.icons
|
.icons
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter_map(|i| url.join(&i.src).ok())
|
.map(|i| Icon::from_url(client, i.src, IconKind::LinkedInManifest)),
|
||||||
.map(|u| Icon::from_url(client, u, IconKind::LinkedInManifest)),
|
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.into_iter()
|
.into_iter()
|
||||||
|
|
Loading…
Add table
Reference in a new issue