initial commit

2026-05-25 17:05:15 +02:00
commit 6ebe505a07
25 changed files with 5929 additions and 0 deletions
@@ -0,0 +1,357 @@
+use std::collections::BTreeMap;
+use std::path::Path;
+
+use kuchiki::traits::*;
+use regex::Regex;
+use sha1::{Digest, Sha1};
+use url::Url;
+
+use crate::error::{EbookmError, Result};
+use crate::graph::{EntryLinkMetadata, LinkPolicy, matches_target};
+use crate::manifest::{DefaultsConfig, EntryDefinition};
+use crate::source::{SourceOrigin, resolve_relative_url};
+
+/// A binary resource referenced by a normalized document (images, as collected in this file).
+#[derive(Debug, Clone)]
+pub struct Asset {
+    /// Lowercase hex SHA-1 digest of the URL or filesystem path the asset was loaded from.
+    pub id: String,
+    /// Asset location of the form `assets/{id}.{extension}`; `<img src>` is rewritten to
+    /// `../{href}`.
+    pub href: String,
+    /// Media type derived from the file extension by `infer_media_type`.
+    pub media_type: String,
+    /// Raw contents as downloaded or read from disk.
+    pub bytes: Vec<u8>,
+}
+
+/// Result of `normalize_document`: resolved metadata plus the cleaned body and its assets.
+#[derive(Debug, Clone)]
+pub struct NormalizedDocument {
+    /// Title from the entry definition when set, otherwise the extracted title.
+    pub title: String,
+    /// Author taken from the entry, then the extracted article, then the defaults.
+    pub author: Option<String>,
+    /// Publication date taken from the entry, then the extracted article, then the defaults.
+    pub published: Option<chrono::NaiveDate>,
+    /// Canonical URL as reported by the extraction step, passed through unchanged.
+    pub canonical_url: Option<Url>,
+    /// Serialized body fragment after node removal, attribute scrubbing and link rewriting.
+    pub body_xhtml: String,
+    /// Images fetched for this document; empty when image fetching is disabled.
+    pub assets: Vec<Asset>,
+}
+
+/// Cleans an extracted article and resolves its metadata into a [`NormalizedDocument`].
+///
+/// Steps, in order: strip unwanted elements (plus `iframe` when
+/// `defaults.normalize_substack_embeds` is set), drop the configured number of leading
+/// paragraphs, scrub attributes, optionally fetch images, rewrite links that point at other
+/// entries, and serialize the body.
+///
+/// # Errors
+///
+/// Propagates errors from `collect_images` and `serialize_document`.
+pub fn normalize_document(
+    entry_id: &str,
+    entry: &EntryDefinition,
+    defaults: &DefaultsConfig,
+    origin: &SourceOrigin,
+    extracted: crate::extract::ExtractedArticle,
+    policy: &LinkPolicy,
+    entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
+) -> Result<NormalizedDocument> {
+    // The wrapping <div> gives `serialize_document` a single container to unwrap later.
+    let wrapped_body = format!("<div>{}</div>", extracted.body_html);
+    let mut document = kuchiki::parse_html().one(wrapped_body);
+
+    remove_nodes(&mut document, "script,style,noscript,button,svg,source");
+    if defaults.normalize_substack_embeds {
+        remove_nodes(&mut document, "iframe");
+    }
+
+    // A per-entry setting wins over the global default.
+    let paragraphs_to_skip = entry
+        .processing
+        .skip_first_paragraphs
+        .unwrap_or(defaults.processing.skip_first_paragraphs);
+    skip_first_paragraphs(&mut document, paragraphs_to_skip);
+    scrub_attributes(&mut document);
+
+    let mut assets = Vec::new();
+    if defaults.fetch_images {
+        collect_images(origin, &mut document, &mut assets)?;
+    }
+
+    rewrite_links(entry_id, &mut document, origin, policy, entry_metadata);
+    let body_xhtml = serialize_document(&document)?;
+
+    // Metadata precedence: entry definition, then extracted value, then defaults.
+    let title = entry.title.clone().unwrap_or(extracted.title);
+    let author = entry
+        .metadata
+        .author
+        .clone()
+        .or(extracted.author)
+        .or(defaults.metadata.author.clone());
+    let published = entry
+        .metadata
+        .published
+        .or(extracted.published)
+        .or(defaults.metadata.published);
+
+    Ok(NormalizedDocument {
+        title,
+        author,
+        published,
+        canonical_url: extracted.canonical_url,
+        body_xhtml,
+        assets,
+    })
+}
+
+/// Detaches every node matching the CSS `selector` from `document`.
+///
+/// A selector that fails to compile is ignored and leaves the document untouched.
+fn remove_nodes(document: &mut kuchiki::NodeRef, selector: &str) {
+    let Ok(matches) = document.select(selector) else {
+        return;
+    };
+
+    // Snapshot the matches first: detaching while the selection is still being walked
+    // would mutate the tree under the iterator.
+    let doomed = matches.collect::<Vec<_>>();
+    doomed.iter().for_each(|element| element.as_node().detach());
+}
+
+/// Fetches every `<img>` in `document`, points its `src` at the bundled copy and appends the
+/// fetched asset to `assets`.
+///
+/// The source is read from `src`, falling back to `data-src`. Fetching is best effort: an
+/// image that cannot be fetched keeps its original attributes and is skipped.
+///
+/// Each distinct source string is fetched at most once per call, and an asset whose id is
+/// already present in `assets` is not pushed again, so a document that repeats an image does
+/// not yield duplicate assets with the same id/href.
+///
+/// # Errors
+///
+/// Currently always returns `Ok(())`; the `Result` is kept for the caller's `?`.
+fn collect_images(
+    origin: &SourceOrigin,
+    document: &mut kuchiki::NodeRef,
+    assets: &mut Vec<Asset>,
+) -> Result<()> {
+    let selected = document
+        .select("img")
+        .map(|items| items.collect::<Vec<_>>())
+        .unwrap_or_default();
+
+    // Source string -> rewritten `src` (or `None` when the fetch failed) for this call.
+    let mut rewritten: BTreeMap<String, Option<String>> = BTreeMap::new();
+
+    for node in selected {
+        // Keep the attribute borrow short so it is not held across the (slow) fetch.
+        let src = {
+            let attrs = node.attributes.borrow();
+            attrs
+                .get("src")
+                .or_else(|| attrs.get("data-src"))
+                .map(str::to_owned)
+        };
+        let Some(src) = src else {
+            continue;
+        };
+
+        let href = match rewritten.get(&src).cloned() {
+            Some(cached) => cached,
+            None => {
+                let fetched = fetch_asset(origin, &src).ok().map(|asset| {
+                    let href = format!("../{}", asset.href);
+                    // Different source strings can resolve to the same asset id.
+                    if !assets.iter().any(|existing| existing.id == asset.id) {
+                        assets.push(asset);
+                    }
+                    href
+                });
+                rewritten.insert(src, fetched.clone());
+                fetched
+            }
+        };
+
+        if let Some(href) = href {
+            node.attributes.borrow_mut().insert("src", href);
+        }
+    }
+
+    Ok(())
+}
+
+/// Loads the asset referenced by `src`, interpreting it relative to `origin`.
+///
+/// Local origins are delegated to `fetch_local_asset`; remote origins join `src` onto the
+/// base URL and download the result.
+///
+/// # Errors
+///
+/// Returns `EbookmError::UrlParse` when `src` cannot be joined onto a remote base URL, and
+/// propagates any error from the underlying fetch.
+fn fetch_asset(origin: &SourceOrigin, src: &str) -> Result<Asset> {
+    let base_url = match origin {
+        SourceOrigin::LocalFile(base_path) => return fetch_local_asset(base_path, src),
+        SourceOrigin::Remote(base_url) => base_url,
+    };
+
+    let absolute = base_url.join(src).map_err(|source| EbookmError::UrlParse {
+        value: src.to_string(),
+        source,
+    })?;
+    fetch_remote_asset(&absolute)
+}
+
+/// Loads an asset referenced from a document that lives on the local filesystem.
+///
+/// `src` may be an `http(s)` URL (downloaded), a `file` URL (converted to a path), an
+/// absolute path, or a path relative to the directory containing `base_path`.
+///
+/// # Errors
+///
+/// Returns `EbookmError::InvalidSourcePath` when a `file` URL cannot be turned into a path,
+/// and propagates errors from downloading or reading the asset.
+fn fetch_local_asset(base_path: &Path, src: &str) -> Result<Asset> {
+    if let Ok(parsed) = Url::parse(src) {
+        let scheme = parsed.scheme();
+        if scheme == "http" || scheme == "https" {
+            return fetch_remote_asset(&parsed);
+        }
+        if scheme == "file" {
+            let local = parsed
+                .to_file_path()
+                .map_err(|_| EbookmError::InvalidSourcePath {
+                    path: src.to_string(),
+                })?;
+            return build_asset_from_path(&local);
+        }
+        // Any other scheme falls through and is treated as a plain path below.
+    }
+
+    let candidate = Path::new(src);
+    let resolved = if candidate.is_absolute() {
+        candidate.to_path_buf()
+    } else {
+        // Relative references are anchored at the directory of the source document.
+        let directory = base_path.parent().unwrap_or_else(|| Path::new("."));
+        directory.join(candidate)
+    };
+    build_asset_from_path(&resolved)
+}
+
+/// Downloads `url` with a blocking request and wraps the body in an [`Asset`].
+///
+/// The asset id is the hex SHA-1 of the URL string; extension and media type are inferred
+/// from the URL path.
+///
+/// # Errors
+///
+/// Returns `EbookmError::Request` when the request fails, the server answers with an error
+/// status, or the body cannot be read.
+fn fetch_remote_asset(url: &Url) -> Result<Asset> {
+    let request_error = |source: reqwest::Error| EbookmError::Request {
+        url: url.to_string(),
+        source,
+    };
+
+    let response = reqwest::blocking::get(url.clone())
+        .and_then(|response| response.error_for_status())
+        .map_err(request_error)?;
+    let bytes = response.bytes().map_err(request_error)?.to_vec();
+
+    let extension = infer_extension_from_str(url.path());
+    let media_type = infer_media_type(&extension);
+    let id = format!("{:x}", Sha1::digest(url.as_str().as_bytes()));
+    let href = format!("assets/{}.{}", id, extension);
+
+    Ok(Asset {
+        id,
+        href,
+        media_type,
+        bytes,
+    })
+}
+
+/// Reads the file at `path` and wraps its contents in an [`Asset`].
+///
+/// The asset id is the hex SHA-1 of the displayed path; extension and media type are inferred
+/// from the path.
+///
+/// # Errors
+///
+/// Returns `EbookmError::Io` when the file cannot be read.
+fn build_asset_from_path(path: &Path) -> Result<Asset> {
+    let bytes = match std::fs::read(path) {
+        Ok(contents) => contents,
+        Err(source) => Err(EbookmError::Io {
+            path: path.display().to_string(),
+            source,
+        })?,
+    };
+
+    let extension = infer_extension_from_path(path);
+    let media_type = infer_media_type(&extension);
+    let displayed = path.display().to_string();
+    let id = format!("{:x}", Sha1::digest(displayed.as_bytes()));
+    let href = format!("assets/{}.{}", id, extension);
+
+    Ok(Asset {
+        id,
+        href,
+        media_type,
+        bytes,
+    })
+}
+
+/// Points links that target another entry at that entry's file, `../text/{id}.xhtml`.
+///
+/// Each `a[href]` is resolved against `origin`; the first entry in `entry_metadata` other
+/// than `entry_id` that `matches_target` accepts wins. Links that cannot be resolved or that
+/// match no entry are left unchanged.
+fn rewrite_links(
+    entry_id: &str,
+    document: &mut kuchiki::NodeRef,
+    origin: &SourceOrigin,
+    policy: &LinkPolicy,
+    entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
+) {
+    let anchors = document
+        .select("a[href]")
+        .map(|found| found.collect::<Vec<_>>())
+        .unwrap_or_default();
+
+    for anchor in anchors {
+        let mut attributes = anchor.attributes.borrow_mut();
+        let Some(href) = attributes.get("href").map(|value| value.to_string()) else {
+            continue;
+        };
+        let Some(resolved) = resolve_relative_url(origin, &href) else {
+            continue;
+        };
+
+        // Never link an entry to itself; otherwise take the first matching entry.
+        let target = entry_metadata
+            .iter()
+            .filter(|(candidate, _)| candidate.as_str() != entry_id)
+            .find(|(candidate, metadata)| matches_target(&resolved, policy, candidate, metadata))
+            .map(|(candidate, _)| candidate);
+
+        if let Some(target_id) = target {
+            attributes.insert("href", format!("../text/{}.xhtml", target_id));
+        }
+    }
+}
+
+/// Serializes the children of the document's first `<div>` and converts the markup with
+/// `to_xhtml_fragment`.
+///
+/// # Errors
+///
+/// Returns `EbookmError::Epub` when no `<div>` is found, when a child fails to serialize, or
+/// when the serialized bytes are not valid UTF-8.
+fn serialize_document(document: &kuchiki::NodeRef) -> Result<String> {
+    let container = document
+        .select_first("div")
+        .map_err(|_| EbookmError::Epub {
+            message: "failed to serialize normalized document".to_string(),
+        })?;
+
+    // Only the children are written, so the container element itself is not emitted.
+    let mut buffer = Vec::new();
+    container
+        .as_node()
+        .children()
+        .try_for_each(|child| child.serialize(&mut buffer))
+        .map_err(|error| EbookmError::Epub {
+            message: error.to_string(),
+        })?;
+
+    let markup = String::from_utf8(buffer).map_err(|error| EbookmError::Epub {
+        message: error.to_string(),
+    })?;
+    Ok(to_xhtml_fragment(&markup))
+}
+
+/// Strips every attribute except `href`/`title` on `<a>` and `src`/`alt` on `<img>`.
+///
+/// Before scrubbing, an `<img>` that has no usable `src` but carries a `data-src` (the usual
+/// lazy-loading pattern) gets that value promoted to `src`. Without this the lazy source
+/// would be removed here and could never be picked up by later image handling.
+///
+/// A selector failure leaves the document untouched.
+fn scrub_attributes(document: &mut kuchiki::NodeRef) {
+    let Ok(nodes) = document.select("*") else {
+        return;
+    };
+
+    for node in nodes.collect::<Vec<_>>() {
+        let tag: &str = node.name.local.as_ref();
+        let mut attrs = node.attributes.borrow_mut();
+
+        if tag == "img" {
+            let has_src = attrs.get("src").is_some_and(|value| !value.trim().is_empty());
+            if !has_src {
+                if let Some(lazy_src) = attrs.get("data-src").map(str::to_owned) {
+                    attrs.insert("src", lazy_src);
+                }
+            }
+        }
+
+        attrs.map.retain(|name, _| {
+            let local: &str = name.local.as_ref();
+            match tag {
+                "a" => matches!(local, "href" | "title"),
+                "img" => matches!(local, "src" | "alt"),
+                _ => false,
+            }
+        });
+    }
+}
+
+/// Detaches the first `count` `<p>` elements matched by the `p` selector.
+///
+/// Does nothing when `count` is zero or the selector cannot be evaluated.
+fn skip_first_paragraphs(document: &mut kuchiki::NodeRef, count: u32) {
+    if count == 0 {
+        return;
+    }
+    let Ok(paragraphs) = document.select("p") else {
+        return;
+    };
+
+    // Collect before detaching so the tree is not mutated while it is being traversed.
+    let leading: Vec<_> = paragraphs.take(count as usize).collect();
+    leading
+        .iter()
+        .for_each(|paragraph| paragraph.as_node().detach());
+}
+
+/// Returns the extension of `path`, or `"bin"` when it is missing, empty or not valid UTF-8.
+fn infer_extension_from_path(path: &Path) -> String {
+    match path.extension().and_then(|raw| raw.to_str()) {
+        Some(extension) if !extension.is_empty() => extension.to_string(),
+        _ => "bin".to_string(),
+    }
+}
+
+/// Returns a file extension for a URL path, or `"bin"` when none can be determined.
+///
+/// Only the leading run of ASCII letters and digits is kept. URL paths can carry suffixes
+/// such as `photo.jpg:large`; taking the raw extension would put `:` into the asset file
+/// name and defeat media-type detection.
+fn infer_extension_from_str(path: &str) -> String {
+    let extension: String = Path::new(path)
+        .extension()
+        .and_then(|value| value.to_str())
+        .map(|value| {
+            value
+                .chars()
+                .take_while(char::is_ascii_alphanumeric)
+                .collect()
+        })
+        .unwrap_or_default();
+
+    if extension.is_empty() {
+        "bin".to_string()
+    } else {
+        extension
+    }
+}
+
+/// Maps a file extension to an image media type, ignoring ASCII case.
+///
+/// Unknown extensions map to `application/octet-stream`. Case is ignored because
+/// extensions are taken verbatim from paths and URLs, where `PNG` or `JPG` are common.
+fn infer_media_type(extension: &str) -> String {
+    let media_type = match extension.to_ascii_lowercase().as_str() {
+        "jpg" | "jpeg" => "image/jpeg",
+        "png" => "image/png",
+        "gif" => "image/gif",
+        "svg" => "image/svg+xml",
+        "webp" => "image/webp",
+        _ => "application/octet-stream",
+    };
+    media_type.to_string()
+}
+
+/// Converts HTML-serialized markup into a fragment that parses as XML.
+///
+/// * Every HTML void element (`img`, `br`, `hr`, `wbr`, ...) is written self-closed as
+///   `<tag ... />`. Tags that are already self-closed are normalized rather than doubled,
+///   and a `>` inside a quoted attribute value does not end the tag early.
+/// * `&nbsp;` is replaced with the numeric reference `&#160;`, because XML predefines only
+///   `amp`, `lt`, `gt`, `quot` and `apos`.
+///   NOTE(review): this assumes the serializer escapes a literal `&` as `&amp;`, so any
+///   `&nbsp;` in the input is an entity the serializer produced - confirm against the
+///   html5ever serializer.
+fn to_xhtml_fragment(html: &str) -> String {
+    // Compiled once per process instead of on every call.
+    static VOID_TAG: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
+        Regex::new(
+            r#"<(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)\b((?:"[^"]*"|'[^']*'|[^'">])*?)\s*/?>"#,
+        )
+        .expect("valid void element regex")
+    });
+
+    let closed = VOID_TAG.replace_all(html, "<${1}${2} />");
+    closed.replace("&nbsp;", "&#160;")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::to_xhtml_fragment;
+    use quick_xml::events::Event;
+    use quick_xml::Reader;
+
+    /// Void tags must come out self-closed and the result must parse as XML.
+    #[test]
+    fn converts_void_html_tags_to_xhtml_self_closing_tags() {
+        let input = r#"<p>Intro</p><picture><img alt="" src="a.jpg"></picture><hr><br>"#;
+        let xhtml = to_xhtml_fragment(input);
+
+        for expected in [r#"<img alt="" src="a.jpg" />"#, "<hr />", "<br />"] {
+            assert!(xhtml.contains(expected));
+        }
+
+        // Wrap the fragment in a root element and read it to the end with an XML parser.
+        let wrapped = format!(
+            r#"<?xml version="1.0" encoding="UTF-8"?><root>{}</root>"#,
+            xhtml
+        );
+        let mut reader = Reader::from_str(&wrapped);
+        loop {
+            let event = reader
+                .read_event()
+                .unwrap_or_else(|error| panic!("invalid XML generated: {error}"));
+            if matches!(event, Event::Eof) {
+                break;
+            }
+        }
+    }
+}