ebookm/ebookm-core/src/normalize.rs

use std::collections::BTreeMap;
use std::path::Path;

use kuchiki::traits::*;
use regex::Regex;
use sha1::{Digest, Sha1};
use url::Url;

use crate::error::{EbookmError, Result};
use crate::graph::{EntryLinkMetadata, LinkPolicy, matches_target};
use crate::manifest::{DefaultsConfig, EntryDefinition};
use crate::source::{SourceOrigin, resolve_relative_url};

/// A binary resource (currently an image) referenced by a normalized document.
#[derive(Debug, Clone)]
pub struct Asset {
    /// Lowercase hex SHA-1 digest of the asset's source location (the resolved
    /// URL for remote assets, the displayed path for local files).
    pub id: String,
    /// Relative location of the asset, of the form `assets/<id>.<extension>`.
    pub href: String,
    /// Media type inferred from the file extension.
    pub media_type: String,
    /// Raw content of the asset as read from disk or downloaded.
    pub bytes: Vec<u8>,
}

/// Result of normalizing one extracted article.
#[derive(Debug, Clone)]
pub struct NormalizedDocument {
    /// Title from the entry definition when set, otherwise the extracted title.
    pub title: String,
    /// Author resolved in order: entry metadata, extracted article, defaults.
    pub author: Option<String>,
    /// Publication date resolved in order: entry metadata, extracted article, defaults.
    pub published: Option<chrono::NaiveDate>,
    /// Canonical URL reported by the extraction step, if any.
    pub canonical_url: Option<Url>,
    /// Cleaned article body serialized as an XHTML fragment.
    pub body_xhtml: String,
    /// Images collected from the body (empty when image fetching is disabled).
    pub assets: Vec<Asset>,
}

/// Normalizes an extracted article into an XHTML fragment plus its assets.
///
/// Steps, in order:
/// 1. parse the extracted HTML inside a wrapper `<div>`;
/// 2. drop non-content elements (and `iframe`s when
///    `defaults.normalize_substack_embeds` is set);
/// 3. drop the configured number of leading paragraphs;
/// 4. fetch images and point `src` at the stored asset (when
///    `defaults.fetch_images` is set);
/// 5. strip every attribute that is not on the allow-list;
/// 6. rewrite links that target other entries to their in-book location;
/// 7. serialize the wrapper's children as XHTML.
///
/// Metadata is resolved with the precedence entry → extracted → defaults.
///
/// # Errors
///
/// Returns an error when image collection or serialization fails.
pub fn normalize_document(
    entry_id: &str,
    entry: &EntryDefinition,
    defaults: &DefaultsConfig,
    origin: &SourceOrigin,
    extracted: crate::extract::ExtractedArticle,
    policy: &LinkPolicy,
    entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
) -> Result<NormalizedDocument> {
    let mut document = kuchiki::parse_html().one(format!("<div>{}</div>", extracted.body_html));

    remove_nodes(&mut document, "script,style,noscript,button,svg,source");
    if defaults.normalize_substack_embeds {
        remove_nodes(&mut document, "iframe");
    }
    skip_first_paragraphs(
        &mut document,
        entry
            .processing
            .skip_first_paragraphs
            .unwrap_or(defaults.processing.skip_first_paragraphs),
    );

    let mut assets = Vec::new();
    if defaults.fetch_images {
        collect_images(origin, &mut document, &mut assets)?;
    }

    // Scrub only after images were collected: scrubbing removes `data-src`,
    // which image collection uses as a fallback for images without `src`.
    // Scrubbing earlier made that fallback unreachable.
    scrub_attributes(&mut document);

    rewrite_links(entry_id, &mut document, origin, policy, entry_metadata);
    let body_xhtml = serialize_document(&document)?;

    Ok(NormalizedDocument {
        title: entry.title.clone().unwrap_or(extracted.title),
        author: entry
            .metadata
            .author
            .clone()
            .or(extracted.author)
            .or(defaults.metadata.author.clone()),
        published: entry
            .metadata
            .published
            .or(extracted.published)
            .or(defaults.metadata.published),
        canonical_url: extracted.canonical_url,
        body_xhtml,
        assets,
    })
}

/// Detaches every element matching `selector` from `document`.
///
/// A selector that fails to compile is silently ignored.
fn remove_nodes(document: &mut kuchiki::NodeRef, selector: &str) {
    let Ok(matches) = document.select(selector) else {
        return;
    };

    // Materialize the matches first so detaching does not disturb the traversal.
    for element in matches.collect::<Vec<_>>() {
        element.as_node().detach();
    }
}

fn collect_images(
    origin: &SourceOrigin,
    document: &mut kuchiki::NodeRef,
    assets: &mut Vec<Asset>,
) -> Result<()> {
    let selected = document
        .select("img")
        .map(|items| items.collect::<Vec<_>>())
        .unwrap_or_default();

    for node in selected {
        let mut attrs = node.attributes.borrow_mut();
        let src = attrs
            .get("src")
            .or_else(|| attrs.get("data-src"))
            .map(|value| value.to_string());
        let Some(src) = src else {
            continue;
        };

        if let Ok(asset) = fetch_asset(origin, &src) {
            attrs.insert("src", format!("../{}", asset.href));
            assets.push(asset);
        }
    }

    Ok(())
}

/// Loads the asset referenced by `src`, interpreting it relative to `origin`.
///
/// Local origins are delegated to `fetch_local_asset`. For remote origins
/// `src` is joined onto the base URL and then downloaded.
///
/// # Errors
///
/// Returns `EbookmError::UrlParse` when `src` cannot be joined onto the remote
/// base URL, or whatever error the underlying fetch produces.
fn fetch_asset(origin: &SourceOrigin, src: &str) -> Result<Asset> {
    let base_url = match origin {
        SourceOrigin::LocalFile(base_path) => return fetch_local_asset(base_path, src),
        SourceOrigin::Remote(base_url) => base_url,
    };

    let resolved = base_url.join(src).map_err(|source| EbookmError::UrlParse {
        value: src.to_string(),
        source,
    })?;
    fetch_remote_asset(&resolved)
}

/// Loads an asset referenced from a document that lives on the local disk.
///
/// `src` is handled as follows:
/// - an `http`/`https` URL is downloaded;
/// - a `file` URL is converted to a path and read;
/// - anything else is treated as a filesystem path, resolved against the
///   directory containing `base_path` unless it is already absolute.
///
/// # Errors
///
/// Returns `EbookmError::InvalidSourcePath` when a `file` URL cannot be turned
/// into a path, or the error of the underlying read/download.
fn fetch_local_asset(base_path: &Path, src: &str) -> Result<Asset> {
    if let Ok(parsed) = Url::parse(src) {
        let scheme = parsed.scheme();
        if scheme == "http" || scheme == "https" {
            return fetch_remote_asset(&parsed);
        }
        if scheme == "file" {
            let file_path = parsed
                .to_file_path()
                .map_err(|_| EbookmError::InvalidSourcePath {
                    path: src.to_string(),
                })?;
            return build_asset_from_path(&file_path);
        }
        // Any other scheme falls through and is treated as a plain path.
    }

    let candidate = Path::new(src);
    let resolved = if candidate.is_absolute() {
        candidate.to_path_buf()
    } else {
        let base_dir = base_path.parent().unwrap_or_else(|| Path::new("."));
        base_dir.join(candidate)
    };
    build_asset_from_path(&resolved)
}

fn fetch_remote_asset(url: &Url) -> Result<Asset> {
    let bytes = reqwest::blocking::get(url.clone())
        .and_then(|response| response.error_for_status())
        .map_err(|source| EbookmError::Request {
            url: url.to_string(),
            source,
        })?
        .bytes()
        .map_err(|source| EbookmError::Request {
            url: url.to_string(),
            source,
        })?
        .to_vec();

    let extension = infer_extension_from_str(url.path());
    let media_type = infer_media_type(&extension);
    let digest = Sha1::digest(url.as_str().as_bytes());
    let id = format!("{:x}", digest);
    Ok(Asset {
        id: id.clone(),
        href: format!("assets/{}.{}", id, extension),
        media_type,
        bytes,
    })
}

/// Reads the file at `path` and wraps its content as an [`Asset`].
///
/// The asset id is the hex SHA-1 of the displayed path. The extension, and
/// from it the media type, is inferred from the path.
///
/// # Errors
///
/// Returns `EbookmError::Io` when the file cannot be read.
fn build_asset_from_path(path: &Path) -> Result<Asset> {
    let shown_path = path.display().to_string();

    let bytes = match std::fs::read(path) {
        Ok(content) => content,
        Err(source) => {
            return Err(EbookmError::Io {
                path: shown_path,
                source,
            });
        }
    };

    let extension = infer_extension_from_path(path);
    let id = format!("{:x}", Sha1::digest(shown_path.as_bytes()));
    let href = format!("assets/{}.{}", id, extension);

    Ok(Asset {
        media_type: infer_media_type(&extension),
        id,
        href,
        bytes,
    })
}

/// Points links that target another known entry at that entry's in-book file.
///
/// Each `a[href]` is resolved against `origin`. If the resolved URL matches an
/// entry other than `entry_id`, the `href` becomes `../text/<target>.xhtml`.
/// Links that cannot be resolved or match no entry are left unchanged.
fn rewrite_links(
    entry_id: &str,
    document: &mut kuchiki::NodeRef,
    origin: &SourceOrigin,
    policy: &LinkPolicy,
    entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
) {
    let Ok(anchors) = document.select("a[href]") else {
        return;
    };

    for anchor in anchors.collect::<Vec<_>>() {
        let mut attrs = anchor.attributes.borrow_mut();

        let Some(href) = attrs.get("href").map(str::to_owned) else {
            continue;
        };
        let Some(resolved) = resolve_relative_url(origin, &href) else {
            continue;
        };

        // First entry (in map order) that is not the current one and matches.
        let target = entry_metadata
            .iter()
            .find(|(target_id, metadata)| {
                *target_id != entry_id && matches_target(&resolved, policy, target_id, metadata)
            })
            .map(|(target_id, _)| target_id);

        if let Some(target_id) = target {
            attrs.insert("href", format!("../text/{}.xhtml", target_id));
        }
    }
}

fn serialize_document(document: &kuchiki::NodeRef) -> Result<String> {
    let wrapper = document
        .select_first("div")
        .map_err(|_| EbookmError::Epub {
            message: "failed to serialize normalized document".to_string(),
        })?;

    let mut bytes = Vec::new();
    for child in wrapper.as_node().children() {
        child
            .serialize(&mut bytes)
            .map_err(|error| EbookmError::Epub {
                message: error.to_string(),
            })?;
    }

    let html = String::from_utf8(bytes).map_err(|error| EbookmError::Epub {
        message: error.to_string(),
    })?;
    Ok(to_xhtml_fragment(&html))
}

/// Strips every attribute that is not on the per-element allow-list.
///
/// Kept attributes: `href` and `title` on `<a>`, `src` and `alt` on `<img>`.
/// All other elements lose all of their attributes.
fn scrub_attributes(document: &mut kuchiki::NodeRef) {
    let Ok(elements) = document.select("*") else {
        return;
    };

    for element in elements.collect::<Vec<_>>() {
        let allowed: &[&str] = match element.name.local.as_ref() {
            "a" => &["href", "title"],
            "img" => &["src", "alt"],
            _ => &[],
        };

        element
            .attributes
            .borrow_mut()
            .map
            .retain(|name, _| allowed.contains(&name.local.to_string().as_str()));
    }
}

/// Detaches the first `count` `<p>` elements (in document order) from `document`.
///
/// Does nothing when `count` is zero.
fn skip_first_paragraphs(document: &mut kuchiki::NodeRef, count: u32) {
    if count == 0 {
        return;
    }

    if let Ok(paragraphs) = document.select("p") {
        // Collect before detaching so removal does not affect the selection.
        let leading: Vec<_> = paragraphs.take(count as usize).collect();
        leading
            .iter()
            .for_each(|paragraph| paragraph.as_node().detach());
    }
}

/// Returns the extension of `path`, or `"bin"` when it has none, it is empty,
/// or it is not valid UTF-8. The extension's case is preserved.
fn infer_extension_from_path(path: &Path) -> String {
    match path.extension().and_then(|value| value.to_str()) {
        Some(extension) if !extension.is_empty() => extension.to_string(),
        _ => "bin".to_string(),
    }
}

/// Returns the extension of the path-like string `path`, or `"bin"` when it
/// has no (or an empty) extension. The extension's case is preserved.
fn infer_extension_from_str(path: &str) -> String {
    let extension = Path::new(path)
        .extension()
        .and_then(|value| value.to_str())
        .unwrap_or_default();

    if extension.is_empty() {
        String::from("bin")
    } else {
        extension.to_owned()
    }
}

/// Maps a file extension (without the leading dot) to its media type.
///
/// Matching is ASCII case-insensitive, so `JPG` and `jpg` both map to
/// `image/jpeg`. Unknown extensions map to `application/octet-stream`.
fn infer_media_type(extension: &str) -> String {
    // Extensions arrive exactly as written in the source path or URL, which is
    // frequently upper-case (e.g. camera output such as `IMG_0001.JPG`).
    match extension.to_ascii_lowercase().as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "png" => "image/png",
        "gif" => "image/gif",
        "svg" => "image/svg+xml",
        "webp" => "image/webp",
        _ => "application/octet-stream",
    }
    .to_string()
}

fn to_xhtml_fragment(html: &str) -> String {
    let img_re = Regex::new(r#"<img([^>]*)>"#).expect("valid img regex");
    let hr_re = Regex::new(r#"<hr([^>]*)>"#).expect("valid hr regex");
    let br_re = Regex::new(r#"<br([^>]*)>"#).expect("valid br regex");

    let html = img_re.replace_all(html, "<img$1 />").into_owned();
    let html = hr_re.replace_all(&html, "<hr$1 />").into_owned();
    br_re.replace_all(&html, "<br$1 />").into_owned()
}

#[cfg(test)]
mod tests {
    use super::to_xhtml_fragment;
    use quick_xml::events::Event;
    use quick_xml::Reader;

    /// Void tags must come out self-closed, and the result must parse as XML.
    #[test]
    fn converts_void_html_tags_to_xhtml_self_closing_tags() {
        let xhtml = to_xhtml_fragment(
            r#"<p>Intro</p><picture><img alt="" src="a.jpg"></picture><hr><br>"#,
        );

        for expected in [r#"<img alt="" src="a.jpg" />"#, "<hr />", "<br />"] {
            assert!(xhtml.contains(expected));
        }

        // Wrap the fragment in a root element and walk every event; any
        // parser error means the fragment is not well-formed XML.
        let wrapped = format!(
            r#"<?xml version="1.0" encoding="UTF-8"?><root>{}</root>"#,
            xhtml
        );
        let mut reader = Reader::from_str(&wrapped);
        loop {
            let event = reader
                .read_event()
                .unwrap_or_else(|error| panic!("invalid XML generated: {error}"));
            if matches!(event, Event::Eof) {
                break;
            }
        }
    }
}