Compare commits

...

2 commits

Author SHA1 Message Date
ac481ec06a Bump version 2025-02-02 15:22:25 +01:00
5b111fd292 Scraping is now thread-safe 2025-02-02 15:22:11 +01:00
3 changed files with 18 additions and 8 deletions

2
Cargo.lock generated
View file

@@ -226,7 +226,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]] [[package]]
name = "favicon-scraper" name = "favicon-scraper"
version = "0.2.0" version = "0.3.0"
dependencies = [ dependencies = [
"futures", "futures",
"imagesize", "imagesize",

View file

@@ -1,6 +1,6 @@
[package] [package]
name = "favicon-scraper" name = "favicon-scraper"
version = "0.2.0" version = "0.3.0"
edition = "2021" edition = "2021"
license = "MIT" license = "MIT"
description = "A favicon scraper that just works" description = "A favicon scraper that just works"

View file

@@ -31,19 +31,29 @@ impl HTML {
.filter_map(|u| url.join(u).ok()) .filter_map(|u| url.join(u).ok())
} }
/// Scans an HTML file for icons and a Web App Manifest. fn parse_html(text: String, url: Url) -> (Option<Url>, Vec<Url>) {
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
let response = client.get(url).send().await?;
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
let text = response.text().await?;
let html = SHTML::parse_document(&text); let html = SHTML::parse_document(&text);
let icon_selector = Selector::parse(ICON_SELECTOR).unwrap(); let icon_selector = Selector::parse(ICON_SELECTOR).unwrap();
let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap(); let manifest_selector = Selector::parse(MANIFEST_SELECTOR).unwrap();
let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next(); let manifest = HTML::get_urls_from_html(&manifest_selector, &html, &url).next();
(
manifest,
HTML::get_urls_from_html(&icon_selector, &html, &url).collect(),
)
}
let icons = HTML::get_urls_from_html(&icon_selector, &html, &url) /// Scans an HTML file for icons and a Web App Manifest.
pub async fn scan_html(client: &Client, url: impl IntoUrl) -> Result<Self, Error> {
let response = client.get(url).send().await?;
let url = response.url().to_owned(); // Specifically use the destination URL after redirects and such
let text = response.text().await?;
let (manifest, icons) = HTML::parse_html(text, url);
let icons = icons
.into_iter()
.map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML)); .map(|u| Icon::from_url(client, u, IconKind::LinkedInHTML));
let icons: Vec<Icon> = join_all(icons) let icons: Vec<Icon> = join_all(icons)
.await .await