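//! Normalization of extracted article HTML: strips unwanted nodes and
//! attributes, collects image assets, rewrites links between entries, and
//! serializes the result as an XHTML fragment.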
use std::collections::BTreeMap;
|
|
use std::path::Path;
|
|
|
|
use kuchiki::traits::*;
|
|
use regex::Regex;
|
|
use sha1::{Digest, Sha1};
|
|
use url::Url;
|
|
|
|
use crate::error::{EbookmError, Result};
|
|
use crate::graph::{EntryLinkMetadata, LinkPolicy, matches_target};
|
|
use crate::manifest::{DefaultsConfig, EntryDefinition};
|
|
use crate::source::{SourceOrigin, resolve_relative_url};
|
|
|
|
/// A binary resource that accompanies a normalized document.
///
/// The fetch helpers in this file build assets for the images referenced by
/// a document body.
#[derive(Debug, Clone)]
pub struct Asset {
    /// Identifier of the asset. The helpers in this file use the lowercase
    /// hex SHA-1 digest of the source URL or path.
    pub id: String,
    /// Location of the asset in the output, of the form `assets/<id>.<ext>`
    /// when produced by the helpers in this file.
    pub href: String,
    /// Media (MIME) type of the asset.
    pub media_type: String,
    /// Raw contents of the asset.
    pub bytes: Vec<u8>,
}
|
|
|
|
#[derive(Debug, Clone)]
|
|
pub struct NormalizedDocument {
|
|
pub title: String,
|
|
pub author: Option<String>,
|
|
pub published: Option<chrono::NaiveDate>,
|
|
pub canonical_url: Option<Url>,
|
|
pub body_xhtml: String,
|
|
pub assets: Vec<Asset>,
|
|
}
|
|
|
|
pub fn normalize_document(
|
|
entry_id: &str,
|
|
entry: &EntryDefinition,
|
|
defaults: &DefaultsConfig,
|
|
origin: &SourceOrigin,
|
|
extracted: crate::extract::ExtractedArticle,
|
|
policy: &LinkPolicy,
|
|
entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
|
|
) -> Result<NormalizedDocument> {
|
|
let mut document = kuchiki::parse_html().one(format!("<div>{}</div>", extracted.body_html));
|
|
|
|
remove_nodes(&mut document, "script,style,noscript,button,svg,source");
|
|
if defaults.normalize_substack_embeds {
|
|
remove_nodes(&mut document, "iframe");
|
|
}
|
|
skip_first_paragraphs(
|
|
&mut document,
|
|
entry
|
|
.processing
|
|
.skip_first_paragraphs
|
|
.unwrap_or(defaults.processing.skip_first_paragraphs),
|
|
);
|
|
scrub_attributes(&mut document);
|
|
|
|
let mut assets = Vec::new();
|
|
if defaults.fetch_images {
|
|
collect_images(origin, &mut document, &mut assets)?;
|
|
}
|
|
|
|
rewrite_links(entry_id, &mut document, origin, policy, entry_metadata);
|
|
let body_xhtml = serialize_document(&document)?;
|
|
|
|
Ok(NormalizedDocument {
|
|
title: entry.title.clone().unwrap_or(extracted.title),
|
|
author: entry
|
|
.metadata
|
|
.author
|
|
.clone()
|
|
.or(extracted.author)
|
|
.or(defaults.metadata.author.clone()),
|
|
published: entry
|
|
.metadata
|
|
.published
|
|
.or(extracted.published)
|
|
.or(defaults.metadata.published),
|
|
canonical_url: extracted.canonical_url,
|
|
body_xhtml,
|
|
assets,
|
|
})
|
|
}
|
|
|
|
fn remove_nodes(document: &mut kuchiki::NodeRef, selector: &str) {
|
|
if let Ok(nodes) = document.select(selector) {
|
|
let selected: Vec<_> = nodes.collect();
|
|
for node in selected {
|
|
node.as_node().detach();
|
|
}
|
|
}
|
|
}
|
|
|
|
fn collect_images(
|
|
origin: &SourceOrigin,
|
|
document: &mut kuchiki::NodeRef,
|
|
assets: &mut Vec<Asset>,
|
|
) -> Result<()> {
|
|
let selected = document
|
|
.select("img")
|
|
.map(|items| items.collect::<Vec<_>>())
|
|
.unwrap_or_default();
|
|
|
|
for node in selected {
|
|
let mut attrs = node.attributes.borrow_mut();
|
|
let src = attrs
|
|
.get("src")
|
|
.or_else(|| attrs.get("data-src"))
|
|
.map(|value| value.to_string());
|
|
let Some(src) = src else {
|
|
continue;
|
|
};
|
|
|
|
if let Ok(asset) = fetch_asset(origin, &src) {
|
|
attrs.insert("src", format!("../{}", asset.href));
|
|
assets.push(asset);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn fetch_asset(origin: &SourceOrigin, src: &str) -> Result<Asset> {
|
|
match origin {
|
|
SourceOrigin::LocalFile(base_path) => fetch_local_asset(base_path, src),
|
|
SourceOrigin::Remote(base_url) => {
|
|
let resolved = base_url.join(src).map_err(|source| EbookmError::UrlParse {
|
|
value: src.to_string(),
|
|
source,
|
|
})?;
|
|
fetch_remote_asset(&resolved)
|
|
}
|
|
}
|
|
}
|
|
|
|
fn fetch_local_asset(base_path: &Path, src: &str) -> Result<Asset> {
|
|
if let Ok(url) = Url::parse(src) {
|
|
match url.scheme() {
|
|
"http" | "https" => return fetch_remote_asset(&url),
|
|
"file" => {
|
|
let path = url
|
|
.to_file_path()
|
|
.map_err(|_| EbookmError::InvalidSourcePath {
|
|
path: src.to_string(),
|
|
})?;
|
|
return build_asset_from_path(&path);
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
let path = if Path::new(src).is_absolute() {
|
|
Path::new(src).to_path_buf()
|
|
} else {
|
|
base_path
|
|
.parent()
|
|
.unwrap_or_else(|| Path::new("."))
|
|
.join(src)
|
|
};
|
|
build_asset_from_path(&path)
|
|
}
|
|
|
|
fn fetch_remote_asset(url: &Url) -> Result<Asset> {
|
|
let bytes = reqwest::blocking::get(url.clone())
|
|
.and_then(|response| response.error_for_status())
|
|
.map_err(|source| EbookmError::Request {
|
|
url: url.to_string(),
|
|
source,
|
|
})?
|
|
.bytes()
|
|
.map_err(|source| EbookmError::Request {
|
|
url: url.to_string(),
|
|
source,
|
|
})?
|
|
.to_vec();
|
|
|
|
let extension = infer_extension_from_str(url.path());
|
|
let media_type = infer_media_type(&extension);
|
|
let digest = Sha1::digest(url.as_str().as_bytes());
|
|
let id = format!("{:x}", digest);
|
|
Ok(Asset {
|
|
id: id.clone(),
|
|
href: format!("assets/{}.{}", id, extension),
|
|
media_type,
|
|
bytes,
|
|
})
|
|
}
|
|
|
|
fn build_asset_from_path(path: &Path) -> Result<Asset> {
|
|
let bytes = std::fs::read(path).map_err(|source| EbookmError::Io {
|
|
path: path.display().to_string(),
|
|
source,
|
|
})?;
|
|
let extension = infer_extension_from_path(path);
|
|
let media_type = infer_media_type(&extension);
|
|
let digest = Sha1::digest(path.display().to_string().as_bytes());
|
|
let id = format!("{:x}", digest);
|
|
Ok(Asset {
|
|
id: id.clone(),
|
|
href: format!("assets/{}.{}", id, extension),
|
|
media_type,
|
|
bytes,
|
|
})
|
|
}
|
|
|
|
fn rewrite_links(
|
|
entry_id: &str,
|
|
document: &mut kuchiki::NodeRef,
|
|
origin: &SourceOrigin,
|
|
policy: &LinkPolicy,
|
|
entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
|
|
) {
|
|
let selected = document
|
|
.select("a[href]")
|
|
.map(|items| items.collect::<Vec<_>>())
|
|
.unwrap_or_default();
|
|
|
|
for node in selected {
|
|
let mut attrs = node.attributes.borrow_mut();
|
|
let href = attrs.get("href").map(|value| value.to_string());
|
|
let Some(href) = href else {
|
|
continue;
|
|
};
|
|
|
|
let Some(resolved) = resolve_relative_url(origin, &href) else {
|
|
continue;
|
|
};
|
|
|
|
if let Some((target_id, _)) = entry_metadata.iter().find(|(target_id, metadata)| {
|
|
*target_id != entry_id && matches_target(&resolved, policy, target_id, metadata)
|
|
}) {
|
|
attrs.insert("href", format!("../text/{}.xhtml", target_id));
|
|
}
|
|
}
|
|
}
|
|
|
|
fn serialize_document(document: &kuchiki::NodeRef) -> Result<String> {
|
|
let wrapper = document
|
|
.select_first("div")
|
|
.map_err(|_| EbookmError::Epub {
|
|
message: "failed to serialize normalized document".to_string(),
|
|
})?;
|
|
|
|
let mut bytes = Vec::new();
|
|
for child in wrapper.as_node().children() {
|
|
child
|
|
.serialize(&mut bytes)
|
|
.map_err(|error| EbookmError::Epub {
|
|
message: error.to_string(),
|
|
})?;
|
|
}
|
|
|
|
let html = String::from_utf8(bytes).map_err(|error| EbookmError::Epub {
|
|
message: error.to_string(),
|
|
})?;
|
|
Ok(to_xhtml_fragment(&html))
|
|
}
|
|
|
|
fn scrub_attributes(document: &mut kuchiki::NodeRef) {
|
|
if let Ok(nodes) = document.select("*") {
|
|
let selected: Vec<_> = nodes.collect();
|
|
for node in selected {
|
|
let mut attrs = node.attributes.borrow_mut();
|
|
let names: Vec<_> = attrs.map.keys().cloned().collect();
|
|
for name in names {
|
|
let local = name.local.to_string();
|
|
let keep = match node.name.local.as_ref() {
|
|
"a" => matches!(local.as_str(), "href" | "title"),
|
|
"img" => matches!(local.as_str(), "src" | "alt"),
|
|
_ => false,
|
|
};
|
|
if !keep {
|
|
attrs.map.remove(&name);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
fn skip_first_paragraphs(document: &mut kuchiki::NodeRef, count: u32) {
|
|
if count == 0 {
|
|
return;
|
|
}
|
|
let selected = document
|
|
.select("p")
|
|
.map(|items| items.take(count as usize).collect::<Vec<_>>())
|
|
.unwrap_or_default();
|
|
for node in selected {
|
|
node.as_node().detach();
|
|
}
|
|
}
|
|
|
|
/// Returns the extension of `path`, or `"bin"` when the path has no
/// extension, an empty one, or one that is not valid UTF-8.
fn infer_extension_from_path(path: &Path) -> String {
    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) if !extension.is_empty() => extension.to_owned(),
        _ => String::from("bin"),
    }
}
|
|
|
|
/// Returns the extension of the path-like string `path`, or `"bin"` when it
/// has no extension or an empty one.
fn infer_extension_from_str(path: &str) -> String {
    let extension = Path::new(path)
        .extension()
        .and_then(|extension| extension.to_str())
        .unwrap_or_default();

    if extension.is_empty() {
        "bin".to_owned()
    } else {
        extension.to_owned()
    }
}
|
|
|
|
/// Maps a file extension (without the leading dot) to a media type.
///
/// Recognizes `jpg`/`jpeg`, `png`, `gif`, `svg` and `webp`; every other
/// extension yields `application/octet-stream`.
///
/// The comparison ignores ASCII case, so `JPG` and `Png` are recognized just
/// like their lowercase forms.
fn infer_media_type(extension: &str) -> String {
    let media_type = match extension.to_ascii_lowercase().as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "png" => "image/png",
        "gif" => "image/gif",
        "svg" => "image/svg+xml",
        "webp" => "image/webp",
        _ => "application/octet-stream",
    };
    media_type.to_string()
}
|
|
|
|
fn to_xhtml_fragment(html: &str) -> String {
|
|
let img_re = Regex::new(r#"<img([^>]*)>"#).expect("valid img regex");
|
|
let hr_re = Regex::new(r#"<hr([^>]*)>"#).expect("valid hr regex");
|
|
let br_re = Regex::new(r#"<br([^>]*)>"#).expect("valid br regex");
|
|
|
|
let html = img_re.replace_all(html, "<img$1 />").into_owned();
|
|
let html = hr_re.replace_all(&html, "<hr$1 />").into_owned();
|
|
br_re.replace_all(&html, "<br$1 />").into_owned()
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::to_xhtml_fragment;
    use quick_xml::events::Event;
    use quick_xml::Reader;

    /// Void HTML tags must come out self-closed, and the fragment as a whole
    /// must parse as XML once wrapped in a root element.
    #[test]
    fn converts_void_html_tags_to_xhtml_self_closing_tags() {
        let fragment =
            to_xhtml_fragment(r#"<p>Intro</p><picture><img alt="" src="a.jpg"></picture><hr><br>"#);

        for expected in [r#"<img alt="" src="a.jpg" />"#, "<hr />", "<br />"] {
            assert!(fragment.contains(expected));
        }

        let document = format!(
            r#"<?xml version="1.0" encoding="UTF-8"?><root>{}</root>"#,
            fragment
        );
        let mut reader = Reader::from_str(&document);
        loop {
            let event = reader
                .read_event()
                .unwrap_or_else(|error| panic!("invalid XML generated: {error}"));
            if matches!(event, Event::Eof) {
                break;
            }
        }
    }
}
|