initial commit

This commit is contained in:
2026-05-25 17:05:15 +02:00
commit 6ebe505a07
25 changed files with 5929 additions and 0 deletions
+357
View File
@@ -0,0 +1,357 @@
use std::collections::BTreeMap;
use std::path::Path;
use kuchiki::traits::*;
use regex::Regex;
use sha1::{Digest, Sha1};
use url::Url;
use crate::error::{EbookmError, Result};
use crate::graph::{EntryLinkMetadata, LinkPolicy, matches_target};
use crate::manifest::{DefaultsConfig, EntryDefinition};
use crate::source::{SourceOrigin, resolve_relative_url};
/// A binary resource referenced by a normalized document.
///
/// In this file assets are only created for `<img>` elements, either from a
/// local file or from a remote URL.
#[derive(Debug, Clone)]
pub struct Asset {
/// Lowercase hex SHA-1 digest of the source URL or of the displayed file path.
pub id: String,
/// Location of the asset, built as `assets/{id}.{extension}`.
pub href: String,
/// Media type inferred from the file extension.
pub media_type: String,
/// Raw content of the asset.
pub bytes: Vec<u8>,
}
/// Result of [`normalize_document`]: cleaned body markup plus resolved metadata.
#[derive(Debug, Clone)]
pub struct NormalizedDocument {
/// Title from the entry definition, falling back to the extracted title.
pub title: String,
/// Author from the entry, then the extracted article, then the defaults.
pub author: Option<String>,
/// Publication date from the entry, then the extracted article, then the defaults.
pub published: Option<chrono::NaiveDate>,
/// Canonical URL as reported by the extraction step.
pub canonical_url: Option<Url>,
/// Serialized body with void `img`/`hr`/`br` tags rewritten as self-closing.
pub body_xhtml: String,
/// Image assets collected while normalizing (empty when image fetching is disabled).
pub assets: Vec<Asset>,
}
/// Turns an extracted article into a [`NormalizedDocument`].
///
/// The body is parsed as HTML inside a wrapper `<div>` and then processed in
/// this order:
///
/// 1. unwanted elements are removed (`script`, `style`, `noscript`, `button`,
///    `svg`, `source`, plus `iframe` when `normalize_substack_embeds` is set);
/// 2. the configured number of leading paragraphs is dropped;
/// 3. images are fetched and their `src` rewritten (only when `fetch_images`
///    is set);
/// 4. attributes are scrubbed down to the allow-list;
/// 5. links pointing at other known entries are rewritten;
/// 6. the result is serialized.
///
/// Images are collected *before* attributes are scrubbed: scrubbing removes
/// every `<img>` attribute except `src` and `alt`, which would otherwise
/// discard the `data-src` value that image collection falls back to.
///
/// # Parameters
/// * `entry_id` - identifier of the entry being normalized; links are never
///   rewritten to point at this entry itself.
/// * `entry` - per-entry overrides (title, metadata, processing options).
/// * `defaults` - fallback configuration used when the entry has no override.
/// * `origin` - where the article came from; used to resolve relative URLs.
/// * `extracted` - the extracted article (body HTML and metadata).
/// * `policy`, `entry_metadata` - inputs for matching links against entries.
///
/// # Errors
/// Propagates errors from image collection and from serialization.
pub fn normalize_document(
    entry_id: &str,
    entry: &EntryDefinition,
    defaults: &DefaultsConfig,
    origin: &SourceOrigin,
    extracted: crate::extract::ExtractedArticle,
    policy: &LinkPolicy,
    entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
) -> Result<NormalizedDocument> {
    let mut document = kuchiki::parse_html().one(format!("<div>{}</div>", extracted.body_html));

    remove_nodes(&mut document, "script,style,noscript,button,svg,source");
    if defaults.normalize_substack_embeds {
        remove_nodes(&mut document, "iframe");
    }

    let paragraphs_to_skip = entry
        .processing
        .skip_first_paragraphs
        .unwrap_or(defaults.processing.skip_first_paragraphs);
    skip_first_paragraphs(&mut document, paragraphs_to_skip);

    // Must run before `scrub_attributes`, which strips `data-src` from images.
    let mut assets = Vec::new();
    if defaults.fetch_images {
        collect_images(origin, &mut document, &mut assets)?;
    }
    scrub_attributes(&mut document);

    rewrite_links(entry_id, &mut document, origin, policy, entry_metadata);
    let body_xhtml = serialize_document(&document)?;

    Ok(NormalizedDocument {
        title: entry.title.clone().unwrap_or(extracted.title),
        author: entry
            .metadata
            .author
            .clone()
            .or(extracted.author)
            // Only clone the default when no earlier source supplied an author.
            .or_else(|| defaults.metadata.author.clone()),
        published: entry
            .metadata
            .published
            .or(extracted.published)
            .or(defaults.metadata.published),
        canonical_url: extracted.canonical_url,
        body_xhtml,
        assets,
    })
}
/// Detaches every element matching the CSS `selector` from `document`.
///
/// If the selector cannot be evaluated the document is left unchanged.
fn remove_nodes(document: &mut kuchiki::NodeRef, selector: &str) {
    let Ok(matches) = document.select(selector) else {
        return;
    };
    // Gather all matches before detaching any of them.
    for matched in matches.collect::<Vec<_>>() {
        matched.as_node().detach();
    }
}
fn collect_images(
origin: &SourceOrigin,
document: &mut kuchiki::NodeRef,
assets: &mut Vec<Asset>,
) -> Result<()> {
let selected = document
.select("img")
.map(|items| items.collect::<Vec<_>>())
.unwrap_or_default();
for node in selected {
let mut attrs = node.attributes.borrow_mut();
let src = attrs
.get("src")
.or_else(|| attrs.get("data-src"))
.map(|value| value.to_string());
let Some(src) = src else {
continue;
};
if let Ok(asset) = fetch_asset(origin, &src) {
attrs.insert("src", format!("../{}", asset.href));
assets.push(asset);
}
}
Ok(())
}
/// Loads the asset referenced by `src`, resolving it against `origin`.
///
/// Local origins delegate to [`fetch_local_asset`]; remote origins join `src`
/// onto the base URL and download the result.
///
/// # Errors
/// Returns `EbookmError::UrlParse` when `src` cannot be joined onto the base
/// URL, or whatever error the underlying fetch reports.
fn fetch_asset(origin: &SourceOrigin, src: &str) -> Result<Asset> {
    let base_url = match origin {
        SourceOrigin::LocalFile(base_path) => return fetch_local_asset(base_path, src),
        SourceOrigin::Remote(base_url) => base_url,
    };
    let resolved = base_url.join(src).map_err(|source| EbookmError::UrlParse {
        value: src.to_string(),
        source,
    })?;
    fetch_remote_asset(&resolved)
}
/// Loads an asset referenced from a document that lives on the local disk.
///
/// `src` is interpreted as follows:
/// * an `http`/`https` URL is downloaded;
/// * a `file` URL is converted to a path and read;
/// * anything else is treated as a filesystem path, taken as-is when absolute
///   and otherwise resolved against the directory containing `base_path`
///   (or `.` when `base_path` has no parent).
///
/// # Errors
/// Returns `EbookmError::InvalidSourcePath` when a `file` URL cannot be turned
/// into a path, or whatever error the underlying read/download reports.
fn fetch_local_asset(base_path: &Path, src: &str) -> Result<Asset> {
    if let Ok(url) = Url::parse(src) {
        let scheme = url.scheme();
        if scheme == "http" || scheme == "https" {
            return fetch_remote_asset(&url);
        }
        if scheme == "file" {
            let file_path = url
                .to_file_path()
                .map_err(|_| EbookmError::InvalidSourcePath {
                    path: src.to_string(),
                })?;
            return build_asset_from_path(&file_path);
        }
        // Any other scheme falls through and is handled as a plain path.
    }

    let candidate = Path::new(src);
    let resolved = if candidate.is_absolute() {
        candidate.to_path_buf()
    } else {
        let base_dir = base_path.parent().unwrap_or_else(|| Path::new("."));
        base_dir.join(src)
    };
    build_asset_from_path(&resolved)
}
fn fetch_remote_asset(url: &Url) -> Result<Asset> {
let bytes = reqwest::blocking::get(url.clone())
.and_then(|response| response.error_for_status())
.map_err(|source| EbookmError::Request {
url: url.to_string(),
source,
})?
.bytes()
.map_err(|source| EbookmError::Request {
url: url.to_string(),
source,
})?
.to_vec();
let extension = infer_extension_from_str(url.path());
let media_type = infer_media_type(&extension);
let digest = Sha1::digest(url.as_str().as_bytes());
let id = format!("{:x}", digest);
Ok(Asset {
id: id.clone(),
href: format!("assets/{}.{}", id, extension),
media_type,
bytes,
})
}
/// Reads the file at `path` and wraps its content in an [`Asset`].
///
/// The asset id is the lowercase hex SHA-1 digest of the displayed path; the
/// extension and media type are inferred from the path.
///
/// # Errors
/// Returns `EbookmError::Io` when the file cannot be read.
fn build_asset_from_path(path: &Path) -> Result<Asset> {
    let location = path.display().to_string();

    let bytes = std::fs::read(path).map_err(|source| EbookmError::Io {
        path: location.clone(),
        source,
    })?;

    let extension = infer_extension_from_path(path);
    let id = format!("{:x}", Sha1::digest(location.as_bytes()));
    let href = format!("assets/{id}.{extension}");

    Ok(Asset {
        id,
        href,
        media_type: infer_media_type(&extension),
        bytes,
    })
}
/// Rewrites links that point at other known entries to their in-book location.
///
/// Each `a[href]` is resolved against `origin`. When the resolved target
/// matches an entry in `entry_metadata` other than `entry_id`, the `href`
/// becomes `../text/{target_id}.xhtml`. Links that cannot be resolved or that
/// match no entry are left untouched.
fn rewrite_links(
    entry_id: &str,
    document: &mut kuchiki::NodeRef,
    origin: &SourceOrigin,
    policy: &LinkPolicy,
    entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
) {
    let Ok(anchors) = document.select("a[href]") else {
        return;
    };

    for anchor in anchors.collect::<Vec<_>>() {
        let mut attrs = anchor.attributes.borrow_mut();
        let Some(href) = attrs.get("href").map(str::to_owned) else {
            continue;
        };
        let Some(resolved) = resolve_relative_url(origin, &href) else {
            continue;
        };

        // First entry (in map order) that matches, never the current entry.
        let internal_target = entry_metadata.iter().find(|(target_id, metadata)| {
            *target_id != entry_id && matches_target(&resolved, policy, target_id, metadata)
        });
        if let Some((target_id, _)) = internal_target {
            attrs.insert("href", format!("../text/{}.xhtml", target_id));
        }
    }
}
/// Serializes the children of the first `<div>` in `document` and converts the
/// markup with [`to_xhtml_fragment`].
///
/// # Errors
/// Returns `EbookmError::Epub` when no `<div>` is found, when a child fails to
/// serialize, or when the serialized bytes are not valid UTF-8.
fn serialize_document(document: &kuchiki::NodeRef) -> Result<String> {
    let epub_error = |message: String| EbookmError::Epub { message };

    let wrapper = document
        .select_first("div")
        .map_err(|_| epub_error("failed to serialize normalized document".to_string()))?;

    let mut buffer: Vec<u8> = Vec::new();
    wrapper
        .as_node()
        .children()
        .try_for_each(|child| child.serialize(&mut buffer))
        .map_err(|error| epub_error(error.to_string()))?;

    let html = String::from_utf8(buffer).map_err(|error| epub_error(error.to_string()))?;
    Ok(to_xhtml_fragment(&html))
}
/// Strips every attribute that is not on the allow-list.
///
/// `<a>` keeps `href` and `title`, `<img>` keeps `src` and `alt`, and every
/// other element loses all of its attributes.
fn scrub_attributes(document: &mut kuchiki::NodeRef) {
    let Ok(elements) = document.select("*") else {
        return;
    };

    for element in elements.collect::<Vec<_>>() {
        let allowed: &[&str] = match element.name.local.as_ref() {
            "a" => &["href", "title"],
            "img" => &["src", "alt"],
            _ => &[],
        };

        let mut attrs = element.attributes.borrow_mut();
        // Snapshot the names first so the map is not modified while iterating.
        let names: Vec<_> = attrs.map.keys().cloned().collect();
        for name in names {
            let local = name.local.to_string();
            if allowed.contains(&local.as_str()) {
                continue;
            }
            attrs.map.remove(&name);
        }
    }
}
/// Detaches the first `count` `<p>` elements (in selection order) from `document`.
///
/// Does nothing when `count` is zero.
fn skip_first_paragraphs(document: &mut kuchiki::NodeRef, count: u32) {
    let limit = count as usize;
    if limit == 0 {
        return;
    }
    let Ok(paragraphs) = document.select("p") else {
        return;
    };
    // Gather the leading paragraphs before detaching any of them.
    let leading: Vec<_> = paragraphs.take(limit).collect();
    for paragraph in &leading {
        paragraph.as_node().detach();
    }
}
/// Returns the extension of `path`, or `"bin"` when the path has no extension,
/// an empty one, or one that is not valid UTF-8.
fn infer_extension_from_path(path: &Path) -> String {
    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) if !ext.is_empty() => ext.to_string(),
        _ => "bin".to_string(),
    }
}
/// Returns a lowercase file extension for a URL path, or `"bin"` when none can
/// be trusted.
///
/// The extension is the text after the last `.` of the final path segment. It
/// is accepted only when it is non-empty and consists solely of ASCII letters
/// and digits. URL paths that embed another percent-encoded URL can otherwise
/// produce values such as `com%2Fphoto`, which would end up in the generated
/// asset file name. Accepted extensions are lowercased so that `JPG` and `jpg`
/// are treated alike.
fn infer_extension_from_str(path: &str) -> String {
    Path::new(path)
        .extension()
        .and_then(|value| value.to_str())
        .filter(|value| {
            !value.is_empty() && value.bytes().all(|byte| byte.is_ascii_alphanumeric())
        })
        .map(str::to_ascii_lowercase)
        .unwrap_or_else(|| "bin".to_string())
}
/// Maps a file extension to its media type.
///
/// The comparison ignores ASCII case, so `JPG` and `jpg` both yield
/// `image/jpeg`. Unknown extensions map to `application/octet-stream`.
fn infer_media_type(extension: &str) -> String {
    let media_type = match extension.to_ascii_lowercase().as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "png" => "image/png",
        "gif" => "image/gif",
        "svg" => "image/svg+xml",
        "webp" => "image/webp",
        _ => "application/octet-stream",
    };
    media_type.to_string()
}
fn to_xhtml_fragment(html: &str) -> String {
let img_re = Regex::new(r#"<img([^>]*)>"#).expect("valid img regex");
let hr_re = Regex::new(r#"<hr([^>]*)>"#).expect("valid hr regex");
let br_re = Regex::new(r#"<br([^>]*)>"#).expect("valid br regex");
let html = img_re.replace_all(html, "<img$1 />").into_owned();
let html = hr_re.replace_all(&html, "<hr$1 />").into_owned();
br_re.replace_all(&html, "<br$1 />").into_owned()
}
#[cfg(test)]
mod tests {
    use super::to_xhtml_fragment;
    use quick_xml::events::Event;
    use quick_xml::Reader;

    /// Void tags must come out self-closed, and the converted fragment must be
    /// readable as XML once wrapped in a root element.
    #[test]
    fn converts_void_html_tags_to_xhtml_self_closing_tags() {
        let fragment = to_xhtml_fragment(
            r#"<p>Intro</p><picture><img alt="" src="a.jpg"></picture><hr><br>"#,
        );

        for expected in [r#"<img alt="" src="a.jpg" />"#, "<hr />", "<br />"] {
            assert!(fragment.contains(expected));
        }

        let document = format!(
            r#"<?xml version="1.0" encoding="UTF-8"?><root>{}</root>"#,
            fragment
        );
        let mut reader = Reader::from_str(&document);
        // Drain every event; any reader error means the output is not XML.
        loop {
            match reader.read_event() {
                Err(error) => panic!("invalid XML generated: {error}"),
                Ok(Event::Eof) => break,
                Ok(_) => continue,
            }
        }
    }
}