// Normalizes extracted article HTML into an XHTML body plus the image assets
// it references.
//
// NOTE(review): this copy of the file appears to have lost text that was
// written between angle brackets: several generic argument lists are missing
// (`Vec`, `Option`, `BTreeMap`, `Result`, `collect::>()`), and the string
// literals that presumably carried HTML markup or tag patterns (the wrapper in
// `normalize_document`, the regexes and replacements in `to_xhtml_fragment`,
// the test fixture) look truncated. Confirm against version control before
// relying on the exact literals shown here.
use std::collections::BTreeMap;
use std::path::Path;
use kuchiki::traits::*;
use regex::Regex;
use sha1::{Digest, Sha1};
use url::Url;
use crate::error::{EbookmError, Result};
use crate::graph::{EntryLinkMetadata, LinkPolicy, matches_target};
use crate::manifest::{DefaultsConfig, EntryDefinition};
use crate::source::{SourceOrigin, resolve_relative_url};

/// A binary resource (in this file: an image) fetched while normalizing a
/// document.
#[derive(Debug, Clone)]
pub struct Asset {
    /// Lower-case hex SHA-1 of the URL string or displayed path the asset was
    /// loaded from.
    pub id: String,
    /// Location of the asset, built as `assets/{id}.{extension}`.
    pub href: String,
    /// Media type derived from the file extension by `infer_media_type`.
    pub media_type: String,
    /// Raw content of the asset.
    pub bytes: Vec,
}

/// Result of [`normalize_document`]: resolved metadata, the serialized body and
/// the assets collected from it.
#[derive(Debug, Clone)]
pub struct NormalizedDocument {
    /// Title from the entry definition, falling back to the extracted title.
    pub title: String,
    /// Author resolved from entry metadata, then the extracted article, then
    /// the defaults.
    pub author: Option,
    /// Publication value resolved in the same order as `author`.
    pub published: Option,
    /// Canonical URL taken unchanged from the extracted article.
    pub canonical_url: Option,
    /// Serialized body produced by `serialize_document`.
    pub body_xhtml: String,
    /// Assets gathered by `collect_images` (empty when `fetch_images` is off).
    pub assets: Vec,
}

/// Cleans up the extracted article body and packages it as a
/// [`NormalizedDocument`].
///
/// The steps run in this order: parse the body, drop unwanted elements, skip
/// the configured number of leading paragraphs, strip attributes, optionally
/// fetch images, rewrite links that point at other entries, then serialize.
///
/// # Errors
///
/// Propagates errors from `collect_images` and `serialize_document`.
pub fn normalize_document(
    entry_id: &str,
    entry: &EntryDefinition,
    defaults: &DefaultsConfig,
    origin: &SourceOrigin,
    extracted: crate::extract::ExtractedArticle,
    policy: &LinkPolicy,
    entry_metadata: &BTreeMap,
) -> Result {
    // The body is wrapped by this format string before parsing.
    // NOTE(review): `serialize_document` later looks up the first `div`, so this
    // literal presumably contained a wrapping `div` element that is not visible
    // in this copy — confirm.
    let mut document = kuchiki::parse_html().one(format!("
{}
", extracted.body_html));
    remove_nodes(&mut document, "script,style,noscript,button,svg,source");
    if defaults.normalize_substack_embeds {
        remove_nodes(&mut document, "iframe");
    }
    // A per-entry setting overrides the default paragraph count.
    skip_first_paragraphs(
        &mut document,
        entry
            .processing
            .skip_first_paragraphs
            .unwrap_or(defaults.processing.skip_first_paragraphs),
    );
    // NOTE(review): this runs before `collect_images` and keeps only `src` and
    // `alt` on `img`, so the `data-src` fallback in `collect_images` can no
    // longer find anything by the time it runs — confirm whether that is
    // intended.
    scrub_attributes(&mut document);
    let mut assets = Vec::new();
    if defaults.fetch_images {
        collect_images(origin, &mut document, &mut assets)?;
    }
    rewrite_links(entry_id, &mut document, origin, policy, entry_metadata);
    let body_xhtml = serialize_document(&document)?;
    Ok(NormalizedDocument {
        title: entry.title.clone().unwrap_or(extracted.title),
        // Precedence: entry metadata, then extracted article, then defaults.
        author: entry
            .metadata
            .author
            .clone()
            .or(extracted.author)
            .or(defaults.metadata.author.clone()),
        published: entry
            .metadata
            .published
            .or(extracted.published)
            .or(defaults.metadata.published),
        canonical_url: extracted.canonical_url,
        body_xhtml,
        assets,
    })
}

/// Detaches every node matching the CSS `selector` from `document`.
///
/// If `select` returns an error for the selector, nothing is removed and the
/// error is ignored.
fn remove_nodes(document: &mut kuchiki::NodeRef, selector: &str) {
    if let Ok(nodes) = document.select(selector) {
        // Matches are collected first, then detached; presumably so the tree is
        // not modified while the selection iterator is still being driven.
        let selected: Vec<_> = nodes.collect();
        for node in selected {
            node.as_node().detach();
        }
    }
}

/// Fetches the image behind every `img` element and points the element at the
/// fetched copy.
///
/// For each `img`, the source is read from `src`, or from `data-src` when `src`
/// is absent; elements with neither are skipped. When the fetch succeeds the
/// `src` attribute is set to `../{asset.href}` and the asset is appended to
/// `assets`. A failed fetch leaves the element untouched and is not reported.
///
/// As written, this function always returns `Ok(())`.
fn collect_images(
    origin: &SourceOrigin,
    document: &mut kuchiki::NodeRef,
    assets: &mut Vec,
) -> Result<()> {
    let selected = document
        .select("img")
        .map(|items| items.collect::>())
        .unwrap_or_default();
    for node in selected {
        let mut attrs = node.attributes.borrow_mut();
        let src = attrs
            .get("src")
            .or_else(|| attrs.get("data-src"))
            .map(|value| value.to_string());
        let Some(src) = src else {
            continue;
        };
        // Best effort: fetch errors are dropped here rather than propagated.
        // NOTE(review): the same image referenced twice is fetched and pushed
        // twice (same `id`) — check whether callers de-duplicate.
        if let Ok(asset) = fetch_asset(origin, &src) {
            attrs.insert("src", format!("../{}", asset.href));
            assets.push(asset);
        }
    }
    Ok(())
}

/// Loads the asset referenced by `src`, interpreted relative to `origin`.
///
/// A local-file origin delegates to `fetch_local_asset`. A remote origin joins
/// `src` onto the base URL and downloads the result.
///
/// # Errors
///
/// Returns `EbookmError::UrlParse` when `src` cannot be joined onto the base
/// URL, or whatever error the local/remote fetch produces.
fn fetch_asset(origin: &SourceOrigin, src: &str) -> Result {
    match origin {
        SourceOrigin::LocalFile(base_path) => fetch_local_asset(base_path, src),
        SourceOrigin::Remote(base_url) => {
            let resolved = base_url.join(src).map_err(|source| EbookmError::UrlParse {
                value: src.to_string(),
                source,
            })?;
            fetch_remote_asset(&resolved)
        }
    }
}

/// Loads an asset referenced from a document that lives on the local disk.
///
/// When `src` parses as a URL, `http`/`https` URLs are downloaded and `file`
/// URLs are converted to a filesystem path; any other scheme falls through to
/// the path handling below. Otherwise an absolute `src` is used as-is and a
/// relative one is joined onto the parent directory of `base_path` (or `.` when
/// `base_path` has no parent).
///
/// # Errors
///
/// Returns `EbookmError::InvalidSourcePath` when a `file` URL cannot be turned
/// into a path, plus any error from the remote fetch or the file read.
fn fetch_local_asset(base_path: &Path, src: &str) -> Result {
    if let Ok(url) = Url::parse(src) {
        match url.scheme() {
            "http" | "https" => return fetch_remote_asset(&url),
            "file" => {
                let path = url
                    .to_file_path()
                    .map_err(|_| EbookmError::InvalidSourcePath {
                        path: src.to_string(),
                    })?;
                return build_asset_from_path(&path);
            }
            // Any other scheme: treat `src` as a plain path below.
            _ => {}
        }
    }
    let path = if Path::new(src).is_absolute() {
        Path::new(src).to_path_buf()
    } else {
        base_path
            .parent()
            .unwrap_or_else(|| Path::new("."))
            .join(src)
    };
    build_asset_from_path(&path)
}

/// Downloads `url` with a blocking GET request and wraps the body in an
/// [`Asset`].
///
/// The extension is taken from the URL path, the media type from that
/// extension, and the id is the hex SHA-1 of the URL string.
///
/// # Errors
///
/// Returns `EbookmError::Request` when the request fails, the response carries
/// an error status, or the body cannot be read.
fn fetch_remote_asset(url: &Url) -> Result {
    let bytes = reqwest::blocking::get(url.clone())
        // Turn error status codes into an `Err` instead of reading their body.
        .and_then(|response| response.error_for_status())
        .map_err(|source| EbookmError::Request {
            url: url.to_string(),
            source,
        })?
        .bytes()
        .map_err(|source| EbookmError::Request {
            url: url.to_string(),
            source,
        })?
        .to_vec();
    let extension = infer_extension_from_str(url.path());
    let media_type = infer_media_type(&extension);
    let digest = Sha1::digest(url.as_str().as_bytes());
    let id = format!("{:x}", digest);
    Ok(Asset {
        id: id.clone(),
        href: format!("assets/{}.{}", id, extension),
        media_type,
        bytes,
    })
}

/// Reads the file at `path` and wraps its content in an [`Asset`].
///
/// The extension is taken from the path, the media type from that extension,
/// and the id is the hex SHA-1 of the path as rendered by `Path::display`.
///
/// # Errors
///
/// Returns `EbookmError::Io` when the file cannot be read.
fn build_asset_from_path(path: &Path) -> Result {
    let bytes = std::fs::read(path).map_err(|source| EbookmError::Io {
        path: path.display().to_string(),
        source,
    })?;
    let extension = infer_extension_from_path(path);
    let media_type = infer_media_type(&extension);
    let digest = Sha1::digest(path.display().to_string().as_bytes());
    let id = format!("{:x}", digest);
    Ok(Asset {
        id: id.clone(),
        href: format!("assets/{}.{}", id, extension),
        media_type,
        bytes,
    })
}

/// Rewrites links that point at other known entries so they reference the
/// entry's own page.
///
/// Every `a[href]` is resolved with `resolve_relative_url`; links that do not
/// resolve are left alone. The first entry in `entry_metadata` (iterated in key
/// order) whose id differs from `entry_id` and for which `matches_target`
/// returns true wins, and the link's `href` becomes `../text/{target_id}.xhtml`.
fn rewrite_links(
    entry_id: &str,
    document: &mut kuchiki::NodeRef,
    origin: &SourceOrigin,
    policy: &LinkPolicy,
    entry_metadata: &BTreeMap,
) {
    let selected = document
        .select("a[href]")
        .map(|items| items.collect::>())
        .unwrap_or_default();
    for node in selected {
        let mut attrs = node.attributes.borrow_mut();
        let href = attrs.get("href").map(|value| value.to_string());
        let Some(href) = href else {
            continue;
        };
        let Some(resolved) = resolve_relative_url(origin, &href) else {
            continue;
        };
        // The current entry is excluded so a document never links to itself
        // through this rewrite.
        if let Some((target_id, _)) = entry_metadata.iter().find(|(target_id, metadata)| {
            *target_id != entry_id && matches_target(&resolved, policy, target_id, metadata)
        }) {
            attrs.insert("href", format!("../text/{}.xhtml", target_id));
        }
    }
}

/// Serializes the children of the first `div` in `document` and passes the
/// result through `to_xhtml_fragment`.
///
/// # Errors
///
/// Returns `EbookmError::Epub` when no `div` is found, when a child fails to
/// serialize, or when the serialized bytes are not valid UTF-8.
fn serialize_document(document: &kuchiki::NodeRef) -> Result {
    // Only the children are emitted, so the selected element itself does not
    // appear in the output.
    let wrapper = document
        .select_first("div")
        .map_err(|_| EbookmError::Epub {
            message: "failed to serialize normalized document".to_string(),
        })?;
    let mut bytes = Vec::new();
    for child in wrapper.as_node().children() {
        child
            .serialize(&mut bytes)
            .map_err(|error| EbookmError::Epub {
                message: error.to_string(),
            })?;
    }
    let html = String::from_utf8(bytes).map_err(|error| EbookmError::Epub {
        message: error.to_string(),
    })?;
    Ok(to_xhtml_fragment(&html))
}

/// Removes every attribute from every element, except `href`/`title` on `a`
/// and `src`/`alt` on `img`.
fn scrub_attributes(document: &mut kuchiki::NodeRef) {
    if let Ok(nodes) = document.select("*") {
        let selected: Vec<_> = nodes.collect();
        for node in selected {
            let mut attrs = node.attributes.borrow_mut();
            // Names are copied out first so the map can be modified in the loop.
            let names: Vec<_> = attrs.map.keys().cloned().collect();
            for name in names {
                let local = name.local.to_string();
                let keep = match node.name.local.as_ref() {
                    "a" => matches!(local.as_str(), "href" | "title"),
                    "img" => matches!(local.as_str(), "src" | "alt"),
                    _ => false,
                };
                if !keep {
                    attrs.map.remove(&name);
                }
            }
        }
    }
}

/// Detaches the first `count` elements matched by the `p` selector; a `count`
/// of zero does nothing.
fn skip_first_paragraphs(document: &mut kuchiki::NodeRef, count: u32) {
    if count == 0 {
        return;
    }
    let selected = document
        .select("p")
        .map(|items| items.take(count as usize).collect::>())
        .unwrap_or_default();
    for node in selected {
        node.as_node().detach();
    }
}

/// Returns the extension of `path`, or `"bin"` when it is missing, empty or
/// not valid UTF-8.
fn infer_extension_from_path(path: &Path) -> String {
    path.extension()
        .and_then(|value| value.to_str())
        .filter(|value| !value.is_empty())
        .unwrap_or("bin")
        .to_string()
}

/// Same as `infer_extension_from_path`, for a path given as a string (used
/// here with the path component of a URL).
fn infer_extension_from_str(path: &str) -> String {
    Path::new(path)
        .extension()
        .and_then(|value| value.to_str())
        .filter(|value| !value.is_empty())
        .unwrap_or("bin")
        .to_string()
}

/// Maps a file extension to a media type, defaulting to
/// `application/octet-stream` for anything unrecognized.
///
/// The comparison is an exact string match, so an upper-case extension such as
/// `JPG` falls through to the default.
fn infer_media_type(extension: &str) -> String {
    match extension {
        "jpg" | "jpeg" => "image/jpeg",
        "png" => "image/png",
        "gif" => "image/gif",
        "svg" => "image/svg+xml",
        "webp" => "image/webp",
        _ => "application/octet-stream",
    }
    .to_string()
}

/// Applies three regex replacements (named for `img`, `hr` and `br`) to `html`
/// in sequence and returns the result.
///
/// NOTE(review): judging by the test name below, this is meant to turn void
/// HTML tags into self-closing XHTML tags, but the patterns and replacement
/// strings shown here look truncated — confirm the real literals.
///
/// # Panics
///
/// Panics if one of the hard-coded patterns fails to compile.
fn to_xhtml_fragment(html: &str) -> String {
    // NOTE(review): the three regexes are compiled on every call; consider
    // compiling them once if this shows up in profiles.
    let img_re = Regex::new(r#"]*)>"#).expect("valid img regex");
    let hr_re = Regex::new(r#"]*)>"#).expect("valid hr regex");
    let br_re = Regex::new(r#"]*)>"#).expect("valid br regex");
    let html = img_re.replace_all(html, "").into_owned();
    let html = hr_re.replace_all(&html, "").into_owned();
    br_re.replace_all(&html, "").into_owned()
}

#[cfg(test)]
mod tests {
    use super::to_xhtml_fragment;
    use quick_xml::events::Event;
    use quick_xml::Reader;

    /// Checks that `to_xhtml_fragment` output contains the expected snippets
    /// and can be read by an XML parser without error.
    #[test]
    fn converts_void_html_tags_to_xhtml_self_closing_tags() {
        // NOTE(review): the fixture and the expected snippets below appear to
        // have lost their markup in this copy — confirm the real literals.
        let input = r#"

Intro



"#;
        let xhtml = to_xhtml_fragment(input);
        assert!(xhtml.contains(r#""#));
        assert!(xhtml.contains("
"));
        assert!(xhtml.contains("
"));
        let wrapped = format!(
            r#"{}"#,
            xhtml
        );
        // Drain the reader to the end; any parse error fails the test.
        let mut reader = Reader::from_str(&wrapped);
        loop {
            match reader.read_event() {
                Ok(Event::Eof) => break,
                Ok(_) => {}
                Err(error) => panic!("invalid XML generated: {error}"),
            }
        }
    }
}