initial commit
This commit is contained in:
commit
6368454e36
17 changed files with 3069 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
/target
|
3
.rustfmt.toml
Normal file
3
.rustfmt.toml
Normal file
|
@ -0,0 +1,3 @@
|
|||
indent_style = "Block"
|
||||
reorder_imports = true
|
||||
tab_spaces = 2
|
2157
Cargo.lock
generated
Normal file
2157
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
36
Cargo.toml
Normal file
36
Cargo.toml
Normal file
|
@ -0,0 +1,36 @@
|
|||
[package]
|
||||
name = "site_icons"
|
||||
version = "0.1.0"
|
||||
authors = ["Sam Denty <sam@samdenty.com>"]
|
||||
edition = "2018"
|
||||
|
||||
[package.metadata.wasm-pack.profile.release]
|
||||
wasm-opt = ["-Oz", "--enable-mutable-globals"]
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib", "rlib"]
|
||||
|
||||
|
||||
[dependencies]
|
||||
clap = "3.0.0-beta.2"
|
||||
itertools = "0.10.0"
|
||||
serde_with = "1.6.1"
|
||||
pin-utils = "0.1.0"
|
||||
html5ever = "0.25.1"
|
||||
url = { version = "2.2.0", features = ["serde"] }
|
||||
regex = "1"
|
||||
log = "0.4.14"
|
||||
once_cell = "1.5.2"
|
||||
scraper = "0.12.0"
|
||||
tokio-byteorder = { git = "https://github.com/samdenty/tokio-byteorder", features = ["futures"] }
|
||||
byteorder = "1.4.2"
|
||||
data-url = "0.1.0"
|
||||
mime = { git = "https://github.com/hyperium/mime" }
|
||||
serde = { version = "1.0", features = ["derive", "rc"] }
|
||||
serde_json = "1.0"
|
||||
reqwest = { git = "https://github.com/samdenty/reqwest", features = ["json", "cookies", "blocking", "stream"] }
|
||||
futures = "0.3.8"
|
||||
wee_alloc = { version = "0.4.2", optional = true }
|
||||
|
||||
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||
tokio = { version = "1.1.0", features = ["full"] }
|
4
Makefile.toml
Normal file
4
Makefile.toml
Normal file
|
@ -0,0 +1,4 @@
|
|||
[tasks.run]
|
||||
env = { "RUST_LOG" = "site_icons" }
|
||||
command = "cargo"
|
||||
args = ["run", "--", "${@}"]
|
37
README.md
Normal file
37
README.md
Normal file
|
@ -0,0 +1,37 @@
|
|||
# site_icons
|
||||
|
||||
An efficient website icon scraper for Rust.
|
||||
|
||||
```rs
|
||||
use site_icons::Icons;
|
||||
|
||||
let mut icons = Icons::new();
|
||||
// scrape the icons from a url
|
||||
icons.load_website("https://github.com").await?;
|
||||
|
||||
// fetch all icons, ensuring they exist & determining size
|
||||
let entries = icons.entries().await;
|
||||
for icon in entries {
|
||||
  println!("{:?}", icon);
|
||||
}
|
||||
```
|
||||
|
||||
## Features
|
||||
|
||||
- Validates that all URLs exist and are actually images
|
||||
- Determines the size of the icon by partially fetching it
|
||||
- Supports WASM (and cloudflare workers)
|
||||
|
||||
### Sources
|
||||
|
||||
- HTML favicon tag (or looking for default `/favicon.ico`)
|
||||
- [Web app manifest](https://developer.mozilla.org/en-US/docs/Web/Manifest) [`icons`](https://developer.mozilla.org/en-US/docs/Web/Manifest/icons) field
|
||||
- `<img>` tags on the page, directly inside the header OR with a `src|alt|class` containing the text "logo"
|
||||
|
||||
## Running locally
|
||||
|
||||
Install [cargo make](https://github.com/sagiegurari/cargo-make) and then:
|
||||
|
||||
```bash
|
||||
cargo make run https://github.com
|
||||
```
|
32
src/bin/site_icons.rs
Normal file
32
src/bin/site_icons.rs
Normal file
|
@ -0,0 +1,32 @@
|
|||
use clap::Clap;
|
||||
use site_icons::Icons;
|
||||
use std::error::Error;
|
||||
|
||||
#[derive(Clap)]
|
||||
struct Opts {
|
||||
urls: Vec<String>,
|
||||
#[clap(long)]
|
||||
json: bool,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn Error>> {
|
||||
let mut icons = Icons::new();
|
||||
let opts: Opts = Opts::parse();
|
||||
|
||||
for url in opts.urls {
|
||||
icons.load_website(&url).await?;
|
||||
}
|
||||
|
||||
let entries = icons.entries().await;
|
||||
|
||||
if opts.json {
|
||||
println!("{}", serde_json::to_string_pretty(&entries)?)
|
||||
} else {
|
||||
for icon in entries {
|
||||
println!("{} {} {}", icon.url, icon.kind, icon.info);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
45
src/icon.rs
Normal file
45
src/icon.rs
Normal file
|
@ -0,0 +1,45 @@
|
|||
use super::IconInfo;
|
||||
use serde::Serialize;
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
fmt::{self, Display},
|
||||
};
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Serialize, Clone, PartialOrd, PartialEq, Ord, Eq)]
|
||||
pub enum IconKind {
|
||||
SiteLogo,
|
||||
SiteFavicon,
|
||||
AppIcon,
|
||||
}
|
||||
|
||||
impl Display for IconKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
f.write_str(match self {
|
||||
IconKind::SiteLogo => "site_logo",
|
||||
IconKind::AppIcon => "app_icon",
|
||||
IconKind::SiteFavicon => "site_favicon",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, PartialEq, Eq)]
|
||||
pub struct Icon {
|
||||
pub url: Url,
|
||||
#[serde(with = "serde_with::rust::display_fromstr")]
|
||||
pub kind: IconKind,
|
||||
#[serde(flatten)]
|
||||
pub info: IconInfo,
|
||||
}
|
||||
|
||||
impl Ord for Icon {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.info.cmp(&other.info)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Icon {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
162
src/icon_info.rs
Normal file
162
src/icon_info.rs
Normal file
|
@ -0,0 +1,162 @@
|
|||
use crate::{icon_size::*, CLIENT};
|
||||
use data_url::DataUrl;
|
||||
use futures::{io::Cursor, prelude::*, stream::TryStreamExt};
|
||||
use mime::MediaType;
|
||||
use reqwest::{header::*, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
error::Error,
|
||||
fmt::{self, Display},
|
||||
io::{self},
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum IconType {
|
||||
PNG,
|
||||
JPEG,
|
||||
ICO,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[serde(tag = "type")]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum IconInfo {
|
||||
PNG { size: IconSize },
|
||||
JPEG { size: IconSize },
|
||||
ICO { sizes: IconSizes },
|
||||
SVG,
|
||||
}
|
||||
|
||||
impl Display for IconInfo {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match self {
|
||||
IconInfo::PNG { size } => write!(f, "png {}", size),
|
||||
IconInfo::JPEG { size } => write!(f, "jpeg {}", size),
|
||||
IconInfo::ICO { sizes } => write!(f, "ico {}", sizes),
|
||||
IconInfo::SVG => write!(f, "svg"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for IconInfo {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let this_size = self.size();
|
||||
let other_size = other.size();
|
||||
|
||||
if this_size.is_none() && other_size.is_none() {
|
||||
Ordering::Equal
|
||||
} else if let (Some(this_size), Some(other_size)) = (this_size, other_size) {
|
||||
this_size.cmp(other_size)
|
||||
} else if this_size.is_none() {
|
||||
Ordering::Less
|
||||
} else {
|
||||
Ordering::Greater
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for IconInfo {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl IconInfo {
|
||||
pub async fn get(url: Url, sizes: Option<String>) -> Result<IconInfo, Box<dyn Error>> {
|
||||
let sizes = sizes.as_ref().and_then(|s| IconSizes::from_str(s).ok());
|
||||
|
||||
let (mime, mut body): (_, Box<dyn AsyncRead + Unpin>) = match url.scheme() {
|
||||
"data" => {
|
||||
let url = url.to_string();
|
||||
let url = DataUrl::process(&url).map_err(|_| "failed to parse data uri")?;
|
||||
|
||||
let mime = url.mime_type().to_string().parse::<MediaType>()?;
|
||||
|
||||
let body = Cursor::new(
|
||||
url
|
||||
.decode_to_vec()
|
||||
.map_err(|_| "failed to decode data uri body")?
|
||||
.0,
|
||||
);
|
||||
|
||||
(mime, Box::new(body))
|
||||
}
|
||||
|
||||
_ => {
|
||||
let res = CLIENT.get(url).send().await?;
|
||||
if !res.status().is_success() {
|
||||
return Err("failed to fetch".into());
|
||||
};
|
||||
|
||||
let mime = res
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.ok_or("no content type")?
|
||||
.to_str()?
|
||||
.parse::<MediaType>()?;
|
||||
|
||||
let body = res
|
||||
.bytes_stream()
|
||||
.map(|result| {
|
||||
result.map_err(|error| io::Error::new(io::ErrorKind::Other, error.to_string()))
|
||||
})
|
||||
.into_async_read();
|
||||
|
||||
(mime, Box::new(body))
|
||||
}
|
||||
};
|
||||
|
||||
let kind = match (mime.type_(), mime.subtype()) {
|
||||
(mime::IMAGE, mime::PNG) => {
|
||||
if let Some(size) = sizes.map(|s| s.into_largest()) {
|
||||
return Ok(IconInfo::PNG { size });
|
||||
}
|
||||
IconType::PNG
|
||||
}
|
||||
|
||||
(mime::IMAGE, mime::JPEG) => {
|
||||
if let Some(size) = sizes.map(|s| s.into_largest()) {
|
||||
return Ok(IconInfo::JPEG { size });
|
||||
}
|
||||
IconType::JPEG
|
||||
}
|
||||
|
||||
(mime::IMAGE, "x-icon") | (mime::IMAGE, "vnd.microsoft.icon") => {
|
||||
if let Some(sizes) = sizes {
|
||||
return Ok(IconInfo::ICO { sizes });
|
||||
}
|
||||
|
||||
IconType::ICO
|
||||
}
|
||||
|
||||
(mime::IMAGE, mime::SVG) => return Ok(IconInfo::SVG),
|
||||
|
||||
_ => return Err(format!("unsupported mime type {}", mime).into()),
|
||||
};
|
||||
|
||||
Ok(match kind {
|
||||
IconType::PNG => {
|
||||
let size = get_png_sizes(&mut body).await?;
|
||||
IconInfo::PNG { size }
|
||||
}
|
||||
IconType::ICO => {
|
||||
let sizes = get_ico_sizes(&mut body).await?;
|
||||
IconInfo::ICO { sizes }
|
||||
}
|
||||
IconType::JPEG => {
|
||||
let size = get_jpeg_size(&mut body).await?;
|
||||
IconInfo::JPEG { size }
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn size(&self) -> Option<&IconSize> {
|
||||
match self {
|
||||
IconInfo::ICO { sizes } => Some(sizes.largest()),
|
||||
IconInfo::PNG { size } | IconInfo::JPEG { size } => Some(size),
|
||||
IconInfo::SVG => None,
|
||||
}
|
||||
}
|
||||
}
|
62
src/icon_size/ico.rs
Normal file
62
src/icon_size/ico.rs
Normal file
|
@ -0,0 +1,62 @@
|
|||
use super::{png::get_png_sizes, IconSizes};
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
use futures::prelude::*;
|
||||
use std::{
|
||||
error::Error,
|
||||
io::{Cursor, Seek, SeekFrom},
|
||||
};
|
||||
|
||||
const ICO_TYPE: u16 = 1;
|
||||
const INDEX_SIZE: u16 = 16;
|
||||
|
||||
pub async fn get_ico_sizes<R: AsyncRead + Unpin>(
|
||||
reader: &mut R,
|
||||
) -> Result<IconSizes, Box<dyn Error>> {
|
||||
let mut offset = 0;
|
||||
let mut header = [0; 6];
|
||||
reader.read_exact(&mut header).await?;
|
||||
offset += header.len();
|
||||
let mut header = Cursor::new(header);
|
||||
|
||||
let header_type = header.read_u16::<LittleEndian>()?;
|
||||
let icon_type = header.read_u16::<LittleEndian>()?;
|
||||
|
||||
if header_type != 0 || icon_type != ICO_TYPE {
|
||||
return Err("bad header".into());
|
||||
}
|
||||
|
||||
let icon_count = header.read_u16::<LittleEndian>()?;
|
||||
|
||||
let mut data = vec![0; (icon_count * INDEX_SIZE) as usize];
|
||||
reader.read_exact(&mut data).await?;
|
||||
offset += data.len();
|
||||
let mut data = Cursor::new(data);
|
||||
|
||||
let mut sizes = IconSizes::new();
|
||||
for i in 0..icon_count {
|
||||
data.seek(SeekFrom::Start((INDEX_SIZE * i) as _))?;
|
||||
|
||||
let width = data.read_u8()?;
|
||||
let height = data.read_u8()?;
|
||||
|
||||
if width == 0 && height == 0 {
|
||||
data.seek(SeekFrom::Current(10))?;
|
||||
let image_offset = data.read_u32::<LittleEndian>()?;
|
||||
|
||||
let mut data = vec![0; image_offset as usize - offset];
|
||||
reader.read_exact(&mut data).await?;
|
||||
offset += data.len();
|
||||
|
||||
let size = get_png_sizes(reader).await;
|
||||
if let Ok(size) = size {
|
||||
sizes.push(size);
|
||||
}
|
||||
} else {
|
||||
sizes.add_size(width as _, height as _)
|
||||
}
|
||||
}
|
||||
|
||||
sizes.sort();
|
||||
|
||||
Ok(sizes)
|
||||
}
|
81
src/icon_size/icon_sizes.rs
Normal file
81
src/icon_size/icon_sizes.rs
Normal file
|
@ -0,0 +1,81 @@
|
|||
use super::IconSize;
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
error::Error,
|
||||
fmt::{self, Display},
|
||||
ops::{Deref, DerefMut},
|
||||
};
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)]
|
||||
pub struct IconSizes(Vec<IconSize>);
|
||||
|
||||
impl Display for IconSizes {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.write_str(&self.0.iter().join(" "))
|
||||
}
|
||||
}
|
||||
|
||||
impl IconSizes {
|
||||
pub fn new() -> Self {
|
||||
IconSizes(Vec::new())
|
||||
}
|
||||
|
||||
pub fn from_str(sizes_str: &str) -> Result<IconSizes, Box<dyn Error>> {
|
||||
let size_strs = sizes_str.split(" ");
|
||||
|
||||
let mut sizes = IconSizes::new();
|
||||
for size in size_strs {
|
||||
if let Ok(size) = serde_json::from_value(Value::String(size.to_string())) {
|
||||
sizes.push(size);
|
||||
}
|
||||
}
|
||||
|
||||
if sizes.is_empty() {
|
||||
return Err("must contain a size".into());
|
||||
}
|
||||
|
||||
sizes.sort();
|
||||
|
||||
Ok(sizes)
|
||||
}
|
||||
|
||||
pub fn add_size(&mut self, width: u32, height: u32) {
|
||||
self.push(IconSize::new(width, height))
|
||||
}
|
||||
|
||||
pub fn largest(&self) -> &IconSize {
|
||||
&self.0[0]
|
||||
}
|
||||
|
||||
pub fn into_largest(self) -> IconSize {
|
||||
self.0.into_iter().next().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for IconSizes {
|
||||
type Target = Vec<IconSize>;
|
||||
fn deref(&self) -> &Vec<IconSize> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for IconSizes {
|
||||
fn deref_mut(&mut self) -> &mut Vec<IconSize> {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for IconSizes {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.largest().cmp(&other.largest())
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for IconSizes {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
64
src/icon_size/jpeg.rs
Normal file
64
src/icon_size/jpeg.rs
Normal file
|
@ -0,0 +1,64 @@
|
|||
use super::IconSize;
|
||||
use crate::assert_slice_eq;
|
||||
use byteorder::BigEndian;
|
||||
use futures::prelude::*;
|
||||
use std::{error::Error, io::Cursor};
|
||||
use tokio_byteorder::AsyncReadBytesExt;
|
||||
|
||||
pub async fn get_jpeg_size<R: AsyncRead + Unpin>(
|
||||
reader: &mut R,
|
||||
) -> Result<IconSize, Box<dyn Error>> {
|
||||
let mut data = [0; 2];
|
||||
reader.read_exact(&mut data).await?;
|
||||
let data = &mut Cursor::new(data);
|
||||
|
||||
// first marker of the file MUST be 0xFFD8
|
||||
assert_slice_eq!(data, 0, &[0xFF, 0xD8], "bad header");
|
||||
|
||||
let mut marker = [0; 2];
|
||||
let mut depth = 0i32;
|
||||
|
||||
loop {
|
||||
// Read current marker (FF XX)
|
||||
reader.read_exact(&mut marker).await?;
|
||||
|
||||
if marker[0] != 0xFF {
|
||||
// Did not read a marker. Assume image is corrupt.
|
||||
return Err("invalid jpeg".into());
|
||||
}
|
||||
|
||||
let page = marker[1];
|
||||
|
||||
// Check for valid SOFn markers. C4, C8, and CC aren't dimension markers.
|
||||
if (page >= 0xC0 && page <= 0xC3)
|
||||
|| (page >= 0xC5 && page <= 0xC7)
|
||||
|| (page >= 0xC9 && page <= 0xCB)
|
||||
|| (page >= 0xCD && page <= 0xCF)
|
||||
{
|
||||
// Only get outside image size
|
||||
if depth == 0 {
|
||||
// Correct marker, go forward 3 bytes so we're at height offset
|
||||
reader.read_exact(&mut [0; 3]).await?;
|
||||
break;
|
||||
}
|
||||
} else if page == 0xD8 {
|
||||
depth += 1;
|
||||
} else if page == 0xD9 {
|
||||
depth -= 1;
|
||||
if depth < 0 {
|
||||
return Err("invalid jpeg".into());
|
||||
}
|
||||
}
|
||||
|
||||
// Read the marker length and skip over it entirely
|
||||
let page_size = reader.read_u16::<BigEndian>().await? as i64;
|
||||
reader
|
||||
.read_exact(&mut vec![0; (page_size - 2) as usize])
|
||||
.await?;
|
||||
}
|
||||
|
||||
let height = reader.read_u16::<BigEndian>().await?;
|
||||
let width = reader.read_u16::<BigEndian>().await?;
|
||||
|
||||
Ok(IconSize::new(width as _, height as _))
|
||||
}
|
103
src/icon_size/mod.rs
Normal file
103
src/icon_size/mod.rs
Normal file
|
@ -0,0 +1,103 @@
|
|||
mod ico;
|
||||
mod icon_sizes;
|
||||
mod jpeg;
|
||||
mod png;
|
||||
|
||||
pub use ico::*;
|
||||
pub use icon_sizes::*;
|
||||
pub use jpeg::*;
|
||||
pub use png::*;
|
||||
|
||||
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
error::Error,
|
||||
fmt::{self, Display},
|
||||
io::{Read, Seek, SeekFrom},
|
||||
};
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct IconSize {
|
||||
width: u32,
|
||||
height: u32,
|
||||
}
|
||||
|
||||
impl Display for IconSize {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{}x{}", self.width, self.height)
|
||||
}
|
||||
}
|
||||
|
||||
impl IconSize {
|
||||
pub fn new(width: u32, height: u32) -> Self {
|
||||
Self { width, height }
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for IconSize {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let self_res = self.width * self.height;
|
||||
let other_res = other.width * other.height;
|
||||
other_res.cmp(&self_res)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for IconSize {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for IconSize {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for IconSize {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
let value: String = Deserialize::deserialize(deserializer)?;
|
||||
|
||||
let mut split = value.split("x");
|
||||
let width = split
|
||||
.next()
|
||||
.ok_or(de::Error::custom("expected width"))?
|
||||
.parse()
|
||||
.map_err(de::Error::custom)?;
|
||||
|
||||
let height = split
|
||||
.next()
|
||||
.ok_or(de::Error::custom("expected height"))?
|
||||
.parse()
|
||||
.map_err(de::Error::custom)?;
|
||||
|
||||
Ok(IconSize::new(width, height))
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks whether the bytes found at `offset` in `cur` equal `slice`.
///
/// Seeks `cur` to `offset` (from the start) and reads `slice.len()` bytes, so
/// afterwards the position is `offset + slice.len()`.
///
/// # Errors
///
/// Fails when seeking fails or fewer than `slice.len()` bytes are available.
fn slice_eq<T: Read + Seek + Unpin>(
  cur: &mut T,
  offset: u64,
  slice: &[u8],
) -> Result<bool, Box<dyn Error>> {
  let mut window = vec![0u8; slice.len()];

  cur.seek(SeekFrom::Start(offset))?;
  cur.read_exact(&mut window)?;

  Ok(window.as_slice() == slice)
}
|
||||
|
||||
/// Returns early with a formatted error unless the bytes at `$offset` in
/// `$cur` equal `$slice`.
///
/// Expands to a `?` and a `return Err(..)`, so it can only be used inside a
/// function returning a `Result` whose error type accepts both the helper's
/// boxed error and a `String`. The helper is referenced as
/// `super::slice_eq`, so the call site must be a child module of the module
/// that defines `slice_eq`.
#[macro_export]
macro_rules! assert_slice_eq {
  ($reader:expr, $at:expr, $expected:expr, $($message:tt)+) => {{
    if !super::slice_eq($reader, $at, $expected)? {
      return Err(format!($($message)+).into());
    }
  }};
}
|
21
src/icon_size/png.rs
Normal file
21
src/icon_size/png.rs
Normal file
|
@ -0,0 +1,21 @@
|
|||
use super::IconSize;
|
||||
use crate::assert_slice_eq;
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use futures::prelude::*;
|
||||
use std::{error::Error, io::Cursor};
|
||||
|
||||
pub async fn get_png_sizes<R: AsyncRead + Unpin>(
|
||||
reader: &mut R,
|
||||
) -> Result<IconSize, Box<dyn Error>> {
|
||||
let mut header = [0; 24];
|
||||
reader.read_exact(&mut header).await?;
|
||||
let header = &mut Cursor::new(header);
|
||||
|
||||
assert_slice_eq!(header, 0, b"\x89PNG\r\n\x1a\n", "bad header");
|
||||
assert_slice_eq!(header, 12, b"IHDR", "bad header");
|
||||
|
||||
let width = header.read_u32::<BigEndian>()?;
|
||||
let height = header.read_u32::<BigEndian>()?;
|
||||
|
||||
Ok(IconSize::new(width, height))
|
||||
}
|
220
src/icons.rs
Normal file
220
src/icons.rs
Normal file
|
@ -0,0 +1,220 @@
|
|||
use crate::{selector, Icon, IconInfo, IconKind, CLIENT};
|
||||
use future::join_all;
|
||||
use futures::StreamExt;
|
||||
use futures::{prelude::*, task::noop_waker};
|
||||
use html5ever::{
|
||||
driver,
|
||||
tendril::{Tendril, TendrilSink},
|
||||
};
|
||||
use reqwest::{header::*, IntoUrl};
|
||||
use scraper::Html;
|
||||
use serde::Deserialize;
|
||||
use std::task::Poll;
|
||||
use std::{collections::HashMap, error::Error, pin::Pin, task::Context};
|
||||
use url::Url;
|
||||
|
||||
pub struct Icons {
|
||||
entries: Vec<Icon>,
|
||||
pending_entries: HashMap<
|
||||
Url,
|
||||
(
|
||||
IconKind,
|
||||
Pin<Box<dyn Future<Output = Result<IconInfo, Box<dyn Error>>>>>,
|
||||
),
|
||||
>,
|
||||
}
|
||||
|
||||
fn add_icon_entry(
|
||||
entries: &mut Vec<Icon>,
|
||||
url: Url,
|
||||
kind: IconKind,
|
||||
info: Result<IconInfo, Box<dyn Error>>,
|
||||
) {
|
||||
match info {
|
||||
Ok(info) => entries.push(Icon { url, kind, info }),
|
||||
Err(e) => {
|
||||
warn!("failed to parse icon: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Icons {
|
||||
pub fn new() -> Self {
|
||||
Icons {
|
||||
entries: Vec::new(),
|
||||
pending_entries: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add an icon URL and start fetching it
|
||||
pub fn add_icon(
|
||||
&mut self,
|
||||
url: Url,
|
||||
kind: IconKind,
|
||||
sizes: Option<String>,
|
||||
) -> Result<(), Box<dyn Error>> {
|
||||
// check to see if it already exists
|
||||
let mut entries = self.entries.iter_mut();
|
||||
if let Some(existing_kind) = self
|
||||
.pending_entries
|
||||
.get_mut(&url)
|
||||
.map(|(kind, _)| kind)
|
||||
.or_else(|| entries.find_map(|icon| (icon.url == url).then_some(&mut icon.kind)))
|
||||
{
|
||||
// if the kind is more important, replace it
|
||||
if &kind > existing_kind {
|
||||
*existing_kind = kind;
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut info = Box::pin(IconInfo::get(url.clone(), sizes));
|
||||
|
||||
// Start fetching the icon
|
||||
let noop_waker = noop_waker();
|
||||
let cx = &mut Context::from_waker(&noop_waker);
|
||||
match info.poll_unpin(cx) {
|
||||
Poll::Ready(info) => add_icon_entry(&mut self.entries, url, kind, info),
|
||||
Poll::Pending => {
|
||||
self.pending_entries.insert(url, (kind, info));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn load_website<U: IntoUrl>(&mut self, url: U) -> Result<(), Box<dyn Error>> {
|
||||
let res = CLIENT.get(url).header(ACCEPT, "text/html").send().await?;
|
||||
let url = res.url().clone();
|
||||
let mut body = res.bytes_stream();
|
||||
|
||||
let mut parser = driver::parse_document(Html::new_document(), Default::default());
|
||||
while let Some(data) = body.next().await {
|
||||
let tendril = Tendril::try_from_byte_slice(&data?).map_err(|_| "failed to parse html")?;
|
||||
parser.process(tendril);
|
||||
}
|
||||
let document = parser.finish();
|
||||
|
||||
{
|
||||
let mut found_favicon = false;
|
||||
|
||||
for element_ref in document.select(selector!(
|
||||
"link[rel='icon']",
|
||||
"link[rel='shortcut icon']",
|
||||
"link[rel='apple-touch-icon']",
|
||||
"link[rel='apple-touch-icon-precomposed']"
|
||||
)) {
|
||||
let elem = element_ref.value();
|
||||
if let Some(href) = elem.attr("href").and_then(|href| url.join(&href).ok()) {
|
||||
if self
|
||||
.add_icon(
|
||||
href,
|
||||
IconKind::SiteFavicon,
|
||||
elem.attr("sizes").map(|sizes| sizes.into()),
|
||||
)
|
||||
.is_ok()
|
||||
{
|
||||
found_favicon = true;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
// Check for default favicon.ico
|
||||
if !found_favicon {
|
||||
self.add_icon(url.join("/favicon.ico")?, IconKind::SiteFavicon, None)?;
|
||||
}
|
||||
}
|
||||
|
||||
for element_ref in document.select(selector!(
|
||||
"header img",
|
||||
"img[src*=logo]",
|
||||
"img[alt*=logo]",
|
||||
"img[class*=logo]"
|
||||
)) {
|
||||
if let Some(href) = element_ref
|
||||
.value()
|
||||
.attr("src")
|
||||
.and_then(|href| url.join(&href).ok())
|
||||
{
|
||||
if self.add_icon(href, IconKind::SiteLogo, None).is_ok() {
|
||||
break;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
for element_ref in document.select(selector!("link[rel='manifest']")) {
|
||||
if let Some(href) = element_ref
|
||||
.value()
|
||||
.attr("href")
|
||||
.and_then(|href| url.join(&href).ok())
|
||||
{
|
||||
self.load_manifest(href).await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn load_manifest(&mut self, manifest_url: Url) -> Result<(), Box<dyn Error>> {
|
||||
#[derive(Deserialize)]
|
||||
struct ManifestIcon {
|
||||
src: String,
|
||||
sizes: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Manifest {
|
||||
icons: Option<Vec<ManifestIcon>>,
|
||||
}
|
||||
|
||||
let manifest: Manifest = CLIENT
|
||||
.get(manifest_url.as_str())
|
||||
.send()
|
||||
.await?
|
||||
.json()
|
||||
.await?;
|
||||
|
||||
if let Some(icons) = manifest.icons {
|
||||
for icon in icons {
|
||||
if let Ok(src) = manifest_url.join(&icon.src) {
|
||||
let _ = self.add_icon(src, IconKind::AppIcon, icon.sizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetch all the icons and return a list of them.
|
||||
///
|
||||
/// List is ordered from highest resolution to lowest resolution
|
||||
///
|
||||
/// ```
|
||||
/// # async fn run() {
|
||||
/// let icons = Icons::new();
|
||||
/// icons.load_website("https://github.com").await?;
|
||||
///
|
||||
/// let entries = icons.entries().await;
|
||||
/// for icon in entries {
|
||||
/// println("{:?}", icon)
|
||||
/// }
|
||||
/// ```
|
||||
pub async fn entries(mut self) -> Vec<Icon> {
|
||||
let (urls, infos): (Vec<_>, Vec<_>) = self
|
||||
.pending_entries
|
||||
.into_iter()
|
||||
.map(|(url, (kind, info))| ((url, kind), info))
|
||||
.unzip();
|
||||
|
||||
let mut urls = urls.into_iter();
|
||||
|
||||
for info in join_all(infos).await {
|
||||
let (url, kind) = urls.next().unwrap();
|
||||
add_icon_entry(&mut self.entries, url, kind, info);
|
||||
}
|
||||
|
||||
self.entries.sort();
|
||||
|
||||
self.entries
|
||||
}
|
||||
}
|
27
src/lib.rs
Normal file
27
src/lib.rs
Normal file
|
@ -0,0 +1,27 @@
|
|||
#![feature(async_closure, map_into_keys_values, bool_to_option)]
|
||||
#[macro_use]
|
||||
extern crate serde_with;
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
mod icon;
|
||||
mod icon_info;
|
||||
mod icon_size;
|
||||
mod icons;
|
||||
mod macros;
|
||||
|
||||
pub use icon::*;
|
||||
pub use icon_info::*;
|
||||
pub use icons::*;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use reqwest::{
|
||||
header::{HeaderMap, HeaderValue, USER_AGENT},
|
||||
Client,
|
||||
};
|
||||
|
||||
static CLIENT: Lazy<Client> = Lazy::new(|| {
|
||||
let mut headers = HeaderMap::new();
|
||||
headers.insert(USER_AGENT, HeaderValue::from_str("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36").unwrap());
|
||||
Client::builder().default_headers(headers).build().unwrap()
|
||||
});
|
14
src/macros.rs
Normal file
14
src/macros.rs
Normal file
|
@ -0,0 +1,14 @@
|
|||
/// Builds a `scraper::Selector` from one or more CSS selector literals.
///
/// The literals are joined with `,` at compile time. The selector is parsed
/// on first use and cached in a static that is private to each call site.
///
/// Panics on first use when the combined selector does not parse.
#[macro_export]
macro_rules! selector {
  ($($css:expr),+ $(,)?) => {{
    static SELECTOR: once_cell::sync::OnceCell<scraper::Selector> = once_cell::sync::OnceCell::new();
    SELECTOR.get_or_init(|| scraper::Selector::parse(crate::join!(",", $($css),+)).unwrap())
  }};
}
|
||||
|
||||
/// Joins literals with a separator at compile time.
///
/// `join!(",", "a", "b")` expands to `concat!("a", ",", "b")`, i.e. the
/// `&'static str` `"a,b"`. The first argument is the separator literal.
#[macro_export]
macro_rules! join {
  ($separator:literal,$head:expr$(, $($tail:expr),*)? $(,)?) => {
    concat!($head$(, $($separator, $tail),*)?)
  };
}
|
Reference in a new issue