initial commit
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
[package]
|
||||
name = "ebookm-core"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
indexmap = { version = "2.7", features = ["serde"] }
|
||||
kuchiki = "0.8"
|
||||
miette = { version = "7.2", features = ["fancy"] }
|
||||
quick-xml = "0.38"
|
||||
regex = "1.11"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
|
||||
scraper = "0.24"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
serde_yaml = "0.9"
|
||||
sha1 = "0.10"
|
||||
thiserror = "2.0"
|
||||
url = { version = "2.5", features = ["serde"] }
|
||||
uuid = { version = "1.18", features = ["v4"] }
|
||||
zip = "4.6"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.15"
|
||||
@@ -0,0 +1,298 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
use quick_xml::escape::escape;
|
||||
use zip::CompressionMethod;
|
||||
use zip::write::{SimpleFileOptions, ZipWriter};
|
||||
|
||||
use crate::error::{EbookmError, Result};
|
||||
use crate::pipeline::BuiltEntry;
|
||||
|
||||
pub fn write_epub(
|
||||
manifest: &crate::manifest::Manifest,
|
||||
built: &[BuiltEntry],
|
||||
output_path: &Path,
|
||||
cover_bytes: Option<(String, Vec<u8>)>,
|
||||
) -> Result<()> {
|
||||
if let Some(parent) = output_path.parent() {
|
||||
std::fs::create_dir_all(parent).map_err(|source| EbookmError::Io {
|
||||
path: parent.display().to_string(),
|
||||
source,
|
||||
})?;
|
||||
}
|
||||
|
||||
let file = File::create(output_path).map_err(|source| EbookmError::Io {
|
||||
path: output_path.display().to_string(),
|
||||
source,
|
||||
})?;
|
||||
let mut zip = ZipWriter::new(file);
|
||||
|
||||
let stored = SimpleFileOptions::default().compression_method(CompressionMethod::Stored);
|
||||
zip.start_file("mimetype", stored)
|
||||
.map_err(|error| EbookmError::Epub {
|
||||
message: error.to_string(),
|
||||
})?;
|
||||
zip.write_all(b"application/epub+zip")
|
||||
.map_err(|error| EbookmError::Epub {
|
||||
message: error.to_string(),
|
||||
})?;
|
||||
|
||||
let deflated = SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
|
||||
write_file(&mut zip, "META-INF/container.xml", deflated, CONTAINER_XML)?;
|
||||
write_file(&mut zip, "OEBPS/styles/book.css", deflated, DEFAULT_STYLES)?;
|
||||
|
||||
let nav = build_nav(manifest, built);
|
||||
let ncx = build_ncx(manifest, built);
|
||||
let opf = build_opf(
|
||||
manifest,
|
||||
built,
|
||||
cover_bytes.as_ref().map(|(href, _)| href.as_str()),
|
||||
);
|
||||
|
||||
write_file(&mut zip, "OEBPS/nav.xhtml", deflated, &nav)?;
|
||||
write_file(&mut zip, "OEBPS/toc.ncx", deflated, &ncx)?;
|
||||
write_file(&mut zip, "OEBPS/content.opf", deflated, &opf)?;
|
||||
|
||||
if let Some((href, bytes)) = cover_bytes {
|
||||
write_bytes(&mut zip, &format!("OEBPS/{href}"), deflated, &bytes)?;
|
||||
}
|
||||
|
||||
let mut seen_assets = BTreeSet::new();
|
||||
for entry in built {
|
||||
write_file(
|
||||
&mut zip,
|
||||
&format!("OEBPS/text/{}.xhtml", entry.id),
|
||||
deflated,
|
||||
&entry.chapter.xhtml,
|
||||
)?;
|
||||
for asset in &entry.assets {
|
||||
if seen_assets.insert(asset.href.clone()) {
|
||||
write_bytes(
|
||||
&mut zip,
|
||||
&format!("OEBPS/{}", asset.href),
|
||||
deflated,
|
||||
&asset.bytes,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
zip.finish().map_err(|error| EbookmError::Epub {
|
||||
message: error.to_string(),
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_file(
|
||||
zip: &mut ZipWriter<File>,
|
||||
path: &str,
|
||||
options: SimpleFileOptions,
|
||||
contents: &str,
|
||||
) -> Result<()> {
|
||||
write_bytes(zip, path, options, contents.as_bytes())
|
||||
}
|
||||
|
||||
fn write_bytes(
|
||||
zip: &mut ZipWriter<File>,
|
||||
path: &str,
|
||||
options: SimpleFileOptions,
|
||||
contents: &[u8],
|
||||
) -> Result<()> {
|
||||
zip.start_file(path, options)
|
||||
.map_err(|error| EbookmError::Epub {
|
||||
message: error.to_string(),
|
||||
})?;
|
||||
zip.write_all(contents).map_err(|error| EbookmError::Epub {
|
||||
message: error.to_string(),
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn build_nav(manifest: &crate::manifest::Manifest, built: &[BuiltEntry]) -> String {
|
||||
let mut nav_points = String::new();
|
||||
for section in &manifest.sections {
|
||||
let section_target = section
|
||||
.entries
|
||||
.iter()
|
||||
.find_map(|entry_id| built.iter().find(|candidate| &candidate.id == entry_id))
|
||||
.map(|entry| format!("text/{}.xhtml", entry.id));
|
||||
nav_points.push_str("<li>");
|
||||
if let Some(target) = section_target {
|
||||
nav_points.push_str(&format!(
|
||||
"<a href=\"{}\">{}</a><ol>",
|
||||
escape(&target),
|
||||
escape(§ion.title)
|
||||
));
|
||||
} else {
|
||||
nav_points.push_str(&format!("<span>{}</span><ol>", escape(§ion.title)));
|
||||
}
|
||||
for entry_id in §ion.entries {
|
||||
if let Some(entry) = built.iter().find(|candidate| &candidate.id == entry_id) {
|
||||
if entry.hidden_from_toc {
|
||||
continue;
|
||||
}
|
||||
nav_points.push_str(&format!(
|
||||
"<li><a href=\"text/{}.xhtml\">{}</a></li>",
|
||||
escape(&entry.id),
|
||||
escape(&entry.chapter.nav_title)
|
||||
));
|
||||
}
|
||||
}
|
||||
nav_points.push_str("</ol></li>");
|
||||
}
|
||||
|
||||
format!(
|
||||
r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
|
||||
<head>
|
||||
<title>{}</title>
|
||||
<link rel="stylesheet" type="text/css" href="styles/book.css"/>
|
||||
</head>
|
||||
<body>
|
||||
<nav epub:type="toc" id="toc">
|
||||
<h1>{}</h1>
|
||||
<ol>{}</ol>
|
||||
</nav>
|
||||
</body>
|
||||
</html>"#,
|
||||
escape(&manifest.book.title),
|
||||
escape(&manifest.book.title),
|
||||
nav_points
|
||||
)
|
||||
}
|
||||
|
||||
fn build_ncx(manifest: &crate::manifest::Manifest, built: &[BuiltEntry]) -> String {
|
||||
let mut play_order = 1usize;
|
||||
let mut nav_points = String::new();
|
||||
for section in &manifest.sections {
|
||||
let section_entries: Vec<_> = section
|
||||
.entries
|
||||
.iter()
|
||||
.filter_map(|entry_id| built.iter().find(|candidate| &candidate.id == entry_id))
|
||||
.filter(|entry| !entry.hidden_from_toc)
|
||||
.collect();
|
||||
|
||||
if section_entries.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let section_play_order = play_order;
|
||||
play_order += 1;
|
||||
|
||||
let mut child_points = String::new();
|
||||
for entry in §ion_entries {
|
||||
child_points.push_str(&format!(
|
||||
"<navPoint id=\"nav-{}\" playOrder=\"{}\"><navLabel><text>{}</text></navLabel><content src=\"text/{}.xhtml\"/></navPoint>",
|
||||
escape(&entry.id),
|
||||
play_order,
|
||||
escape(&entry.chapter.nav_title),
|
||||
escape(&entry.id)
|
||||
));
|
||||
play_order += 1;
|
||||
}
|
||||
|
||||
nav_points.push_str(&format!(
|
||||
"<navPoint id=\"section-{}\" playOrder=\"{}\"><navLabel><text>{}</text></navLabel><content src=\"text/{}.xhtml\"/>{}</navPoint>",
|
||||
escape(§ion.id),
|
||||
section_play_order,
|
||||
escape(§ion.title),
|
||||
escape(§ion_entries[0].id),
|
||||
child_points
|
||||
));
|
||||
}
|
||||
|
||||
format!(
|
||||
r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
|
||||
<head>
|
||||
<meta name="dtb:uid" content="{}"/>
|
||||
</head>
|
||||
<docTitle><text>{}</text></docTitle>
|
||||
<navMap>{}</navMap>
|
||||
</ncx>"#,
|
||||
escape(&manifest.book.identifier),
|
||||
escape(&manifest.book.title),
|
||||
nav_points
|
||||
)
|
||||
}
|
||||
|
||||
fn build_opf(
|
||||
manifest: &crate::manifest::Manifest,
|
||||
built: &[BuiltEntry],
|
||||
cover_href: Option<&str>,
|
||||
) -> String {
|
||||
let mut manifest_items = String::from(
|
||||
r#"<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
|
||||
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
|
||||
<item id="css" href="styles/book.css" media-type="text/css"/>"#,
|
||||
);
|
||||
let mut spine_items = String::new();
|
||||
|
||||
for entry in built {
|
||||
manifest_items.push_str(&format!(
|
||||
"<item id=\"{}\" href=\"text/{}.xhtml\" media-type=\"application/xhtml+xml\"/>",
|
||||
escape(&entry.id),
|
||||
escape(&entry.id)
|
||||
));
|
||||
spine_items.push_str(&format!("<itemref idref=\"{}\"/>", escape(&entry.id)));
|
||||
for asset in &entry.assets {
|
||||
manifest_items.push_str(&format!(
|
||||
"<item id=\"{}\" href=\"{}\" media-type=\"{}\"/>",
|
||||
escape(&asset.id),
|
||||
escape(&asset.href),
|
||||
escape(&asset.media_type)
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(cover_href) = cover_href {
|
||||
manifest_items.push_str(&format!(
|
||||
"<item id=\"cover\" href=\"{}\" media-type=\"image/jpeg\" properties=\"cover-image\"/>",
|
||||
escape(cover_href)
|
||||
));
|
||||
}
|
||||
|
||||
let author = manifest
|
||||
.book
|
||||
.author
|
||||
.clone()
|
||||
.unwrap_or_else(|| "Unknown".to_string());
|
||||
let description = manifest.book.description.clone().unwrap_or_default();
|
||||
format!(
|
||||
r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="bookid">
|
||||
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<dc:identifier id="bookid">{}</dc:identifier>
|
||||
<dc:title>{}</dc:title>
|
||||
<dc:creator>{}</dc:creator>
|
||||
<dc:language>{}</dc:language>
|
||||
<dc:description>{}</dc:description>
|
||||
</metadata>
|
||||
<manifest>{}</manifest>
|
||||
<spine toc="ncx">{}</spine>
|
||||
</package>"#,
|
||||
escape(&manifest.book.identifier),
|
||||
escape(&manifest.book.title),
|
||||
escape(&author),
|
||||
escape(&manifest.book.language),
|
||||
escape(&description),
|
||||
manifest_items,
|
||||
spine_items
|
||||
)
|
||||
}
|
||||
|
||||
/// Contents of `META-INF/container.xml`: declares `OEBPS/content.opf` as the
/// package document (root file) of the publication.
const CONTAINER_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>"#;
|
||||
|
||||
/// Stylesheet written to `OEBPS/styles/book.css` and linked from the
/// navigation document.
const DEFAULT_STYLES: &str = r#"body { font-family: serif; line-height: 1.5; margin: 5%; }
h1 { margin-bottom: 0.2em; }
.chapter-meta { color: #555; font-size: 0.9em; margin-bottom: 1.5em; }
img { max-width: 100%; height: auto; }
a { color: #0b4f7a; text-decoration: none; }
"#;
|
||||
@@ -0,0 +1,39 @@
|
||||
use thiserror::Error;
|
||||
|
||||
/// Crate-wide result alias whose error type is [`EbookmError`].
pub type Result<T> = std::result::Result<T, EbookmError>;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum EbookmError {
|
||||
#[error("failed to read file {path}: {source}")]
|
||||
Io {
|
||||
path: String,
|
||||
#[source]
|
||||
source: std::io::Error,
|
||||
},
|
||||
#[error("failed to parse manifest {path}: {source}")]
|
||||
ManifestParse {
|
||||
path: String,
|
||||
#[source]
|
||||
source: serde_yaml::Error,
|
||||
},
|
||||
#[error("manifest validation failed: {issues:?}")]
|
||||
Validation { issues: Vec<String> },
|
||||
#[error("network request failed for {url}: {source}")]
|
||||
Request {
|
||||
url: String,
|
||||
#[source]
|
||||
source: reqwest::Error,
|
||||
},
|
||||
#[error("invalid source path: {path}")]
|
||||
InvalidSourcePath { path: String },
|
||||
#[error("failed to parse URL {value}: {source}")]
|
||||
UrlParse {
|
||||
value: String,
|
||||
#[source]
|
||||
source: url::ParseError,
|
||||
},
|
||||
#[error("article extraction failed for {input}")]
|
||||
Extraction { input: String },
|
||||
#[error("EPUB generation failed: {message}")]
|
||||
Epub { message: String },
|
||||
}
|
||||
@@ -0,0 +1,268 @@
|
||||
use chrono::{DateTime, NaiveDate};
|
||||
use scraper::{Html, Selector};
|
||||
use serde_json::Value;
|
||||
use url::Url;
|
||||
|
||||
use crate::error::{EbookmError, Result};
|
||||
use crate::source::{LoadedSource, SourceOrigin};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ExtractedArticle {
|
||||
pub title: String,
|
||||
pub author: Option<String>,
|
||||
pub published: Option<NaiveDate>,
|
||||
pub canonical_url: Option<Url>,
|
||||
pub body_html: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize)]
|
||||
pub struct InspectResult {
|
||||
pub title: Option<String>,
|
||||
pub author: Option<String>,
|
||||
pub published: Option<String>,
|
||||
pub canonical_url: Option<String>,
|
||||
}
|
||||
|
||||
pub fn extract_article(loaded: &LoadedSource) -> Result<ExtractedArticle> {
|
||||
let document = Html::parse_document(&loaded.html);
|
||||
let json_ld = extract_primary_json_ld(&document);
|
||||
let title = select_content(
|
||||
&document,
|
||||
&[
|
||||
r#"meta[property="og:title"]"#,
|
||||
r#"article .post-title"#,
|
||||
".post-title",
|
||||
"h1",
|
||||
"title",
|
||||
],
|
||||
"content",
|
||||
)
|
||||
.or_else(|| {
|
||||
select_text(
|
||||
&document,
|
||||
&[r#"article .post-title"#, ".post-title", "h1", "title"],
|
||||
)
|
||||
})
|
||||
.or_else(|| json_ld_string(&json_ld, "headline"))
|
||||
.ok_or_else(|| EbookmError::Extraction {
|
||||
input: origin_label(&loaded.origin),
|
||||
})?;
|
||||
|
||||
let author = select_content(
|
||||
&document,
|
||||
&[
|
||||
r#"meta[name="author"]"#,
|
||||
r#"meta[property="article:author"]"#,
|
||||
],
|
||||
"content",
|
||||
)
|
||||
.or_else(|| {
|
||||
select_text(
|
||||
&document,
|
||||
&[
|
||||
"[data-testid='author-name']",
|
||||
".byline",
|
||||
".byline-wrapper a",
|
||||
"address",
|
||||
],
|
||||
)
|
||||
})
|
||||
.or_else(|| json_ld_author(&json_ld));
|
||||
|
||||
let published = select_content(
|
||||
&document,
|
||||
&[r#"meta[property="article:published_time"]"#, "time"],
|
||||
"content",
|
||||
)
|
||||
.or_else(|| select_attr(&document, &["time"], "datetime"))
|
||||
.or_else(|| json_ld_string(&json_ld, "datePublished"))
|
||||
.and_then(parse_date);
|
||||
|
||||
let canonical_url = select_attr(&document, &[r#"link[rel="canonical"]"#], "href")
|
||||
.or_else(|| match &loaded.origin {
|
||||
SourceOrigin::Remote(url) => Some(url.to_string()),
|
||||
SourceOrigin::LocalFile(_) => None,
|
||||
})
|
||||
.and_then(|raw| Url::parse(&raw).ok());
|
||||
|
||||
let body_html = select_html(
|
||||
&document,
|
||||
&[
|
||||
".available-content .body.markup",
|
||||
".available-content .markup",
|
||||
"article .body.markup",
|
||||
".newsletter-post .body.markup",
|
||||
"article",
|
||||
"main",
|
||||
"body",
|
||||
],
|
||||
)
|
||||
.ok_or_else(|| EbookmError::Extraction {
|
||||
input: origin_label(&loaded.origin),
|
||||
})?;
|
||||
|
||||
Ok(ExtractedArticle {
|
||||
title,
|
||||
author,
|
||||
published,
|
||||
canonical_url,
|
||||
body_html,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn inspect_article(loaded: &LoadedSource) -> Result<InspectResult> {
|
||||
let extracted = extract_article(loaded)?;
|
||||
Ok(InspectResult {
|
||||
title: Some(extracted.title),
|
||||
author: extracted.author,
|
||||
published: extracted.published.map(|date| date.to_string()),
|
||||
canonical_url: extracted.canonical_url.map(|url| url.to_string()),
|
||||
})
|
||||
}
|
||||
|
||||
fn select_content(document: &Html, selectors: &[&str], attr: &str) -> Option<String> {
|
||||
selectors.iter().find_map(|selector| {
|
||||
let selector = Selector::parse(selector).ok()?;
|
||||
document
|
||||
.select(&selector)
|
||||
.next()
|
||||
.and_then(|node| node.value().attr(attr))
|
||||
.map(clean_text)
|
||||
})
|
||||
}
|
||||
|
||||
fn select_text(document: &Html, selectors: &[&str]) -> Option<String> {
|
||||
selectors.iter().find_map(|selector| {
|
||||
let selector = Selector::parse(selector).ok()?;
|
||||
document
|
||||
.select(&selector)
|
||||
.next()
|
||||
.map(|node| clean_text(&node.text().collect::<String>()))
|
||||
})
|
||||
}
|
||||
|
||||
fn select_attr(document: &Html, selectors: &[&str], attr: &str) -> Option<String> {
|
||||
selectors.iter().find_map(|selector| {
|
||||
let selector = Selector::parse(selector).ok()?;
|
||||
document
|
||||
.select(&selector)
|
||||
.next()
|
||||
.and_then(|node| node.value().attr(attr))
|
||||
.map(clean_text)
|
||||
})
|
||||
}
|
||||
|
||||
fn select_html(document: &Html, selectors: &[&str]) -> Option<String> {
|
||||
selectors.iter().find_map(|selector| {
|
||||
let selector = Selector::parse(selector).ok()?;
|
||||
document
|
||||
.select(&selector)
|
||||
.next()
|
||||
.map(|node| node.inner_html())
|
||||
})
|
||||
}
|
||||
|
||||
/// Collapses every run of whitespace in `value` into a single space and
/// strips leading and trailing whitespace.
fn clean_text(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
|
||||
|
||||
fn parse_date(value: String) -> Option<NaiveDate> {
|
||||
DateTime::parse_from_rfc3339(&value)
|
||||
.map(|parsed| parsed.date_naive())
|
||||
.ok()
|
||||
.or_else(|| NaiveDate::parse_from_str(&value, "%Y-%m-%d").ok())
|
||||
.or_else(|| NaiveDate::parse_from_str(&value, "%b %d, %Y").ok())
|
||||
}
|
||||
|
||||
fn origin_label(origin: &SourceOrigin) -> String {
|
||||
match origin {
|
||||
SourceOrigin::Remote(url) => url.to_string(),
|
||||
SourceOrigin::LocalFile(path) => path.display().to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_primary_json_ld(document: &Html) -> Option<Value> {
|
||||
let selector = Selector::parse(r#"script[type="application/ld+json"]"#).ok()?;
|
||||
for node in document.select(&selector) {
|
||||
let raw = node.inner_html();
|
||||
let Ok(value) = serde_json::from_str::<Value>(&raw) else {
|
||||
continue;
|
||||
};
|
||||
if value.get("@type").and_then(Value::as_str).is_some() {
|
||||
return Some(value);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn json_ld_string(json_ld: &Option<Value>, key: &str) -> Option<String> {
|
||||
json_ld
|
||||
.as_ref()?
|
||||
.get(key)?
|
||||
.as_str()
|
||||
.map(|value| value.to_string())
|
||||
}
|
||||
|
||||
fn json_ld_author(json_ld: &Option<Value>) -> Option<String> {
|
||||
let author = json_ld.as_ref()?.get("author")?;
|
||||
if let Some(author_name) = author.get(0).and_then(|entry| entry.get("name")).and_then(Value::as_str) {
|
||||
return Some(author_name.to_string());
|
||||
}
|
||||
if let Some(author_name) = author.get("name").and_then(Value::as_str) {
|
||||
return Some(author_name.to_string());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::source::LoadedSource;

    /// A Substack-shaped page: metadata comes from the head, and the body is
    /// limited to `.available-content .body.markup`, excluding the post
    /// header and footer.
    #[test]
    fn extracts_substack_article_body_without_page_chrome() {
        let html = r#"<!doctype html>
<html>
<head>
<meta property="og:title" content="A Contested Island" />
<meta name="author" content="John Gu" />
<link rel="canonical" href="https://ageofpeace.substack.com/p/a-contested-island" />
<script type="application/ld+json">{"@context":"https://schema.org","@type":"NewsArticle","headline":"A Contested Island","datePublished":"2026-03-03T23:37:00+00:00","author":[{"@type":"Person","name":"John Gu"}]}</script>
</head>
<body>
<article class="typography newsletter-post post">
<div class="post-header">
<h1 class="post-title">Chapter 1: A Contested Island</h1>
</div>
<div class="available-content">
<div class="body markup">
<p>First paragraph.</p>
<p>Second paragraph.</p>
</div>
</div>
<div class="post-footer">
<button>Share</button>
</div>
</article>
</body>
</html>"#;

        let loaded = LoadedSource {
            origin: SourceOrigin::Remote(
                Url::parse("https://ageofpeace.substack.com/p/a-contested-island").expect("url"),
            ),
            html: html.to_string(),
        };

        let extracted = extract_article(&loaded).expect("extract article");

        // og:title wins over the longer <h1> text.
        assert_eq!(extracted.title, "A Contested Island");
        assert_eq!(extracted.author.as_deref(), Some("John Gu"));
        assert_eq!(
            extracted.published,
            Some(NaiveDate::from_ymd_opt(2026, 3, 3).expect("date"))
        );
        assert!(extracted.body_html.contains("First paragraph."));
        assert!(!extracted.body_html.contains("post-header"));
        assert!(!extracted.body_html.contains("Share"));
    }
}
|
||||
@@ -0,0 +1,167 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use url::Url;
|
||||
|
||||
use crate::manifest::{BuildMode, LinkMatchMode, Manifest};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LinkPolicy {
|
||||
pub match_mode: LinkMatchMode,
|
||||
pub targets: BTreeSet<String>,
|
||||
}
|
||||
|
||||
pub fn build_link_policies(
|
||||
manifest: &Manifest,
|
||||
entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
|
||||
) -> BTreeMap<String, LinkPolicy> {
|
||||
entry_metadata
|
||||
.iter()
|
||||
.map(|(entry_id, _metadata)| {
|
||||
let entry = &manifest.entries[entry_id];
|
||||
let mode = entry
|
||||
.links
|
||||
.mode
|
||||
.clone()
|
||||
.unwrap_or(manifest.link_rules.mode.clone());
|
||||
let targets = resolve_targets(manifest, entry_id, &mode);
|
||||
let match_mode = select_match_mode(manifest, entry_id, &mode);
|
||||
(
|
||||
entry_id.clone(),
|
||||
LinkPolicy {
|
||||
match_mode,
|
||||
targets,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct EntryLinkMetadata {
|
||||
pub source_url: Option<Url>,
|
||||
pub canonical_url: Option<Url>,
|
||||
}
|
||||
|
||||
fn resolve_targets(manifest: &Manifest, entry_id: &str, mode: &BuildMode) -> BTreeSet<String> {
|
||||
let entry = &manifest.entries[entry_id];
|
||||
let mut targets = BTreeSet::new();
|
||||
|
||||
match mode {
|
||||
BuildMode::None => return targets,
|
||||
BuildMode::Auto => {
|
||||
for candidate in manifest.entries.keys() {
|
||||
if candidate != entry_id {
|
||||
targets.insert(candidate.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
BuildMode::Explicit => {
|
||||
for rule in &manifest.link_rules.rules {
|
||||
if rule.match_mode == LinkMatchMode::Disabled {
|
||||
continue;
|
||||
}
|
||||
if selector_matches_any(&rule.from, manifest, entry_id) {
|
||||
for target in expand_selectors(&rule.to, manifest) {
|
||||
if target != entry_id {
|
||||
targets.insert(target);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !entry.links.allow_to.is_empty() {
|
||||
targets.retain(|candidate| entry.links.allow_to.contains(candidate));
|
||||
}
|
||||
|
||||
for blocked in &entry.links.block_to {
|
||||
targets.remove(blocked);
|
||||
}
|
||||
|
||||
targets
|
||||
}
|
||||
|
||||
fn select_match_mode(manifest: &Manifest, entry_id: &str, mode: &BuildMode) -> LinkMatchMode {
|
||||
match mode {
|
||||
BuildMode::None => LinkMatchMode::Disabled,
|
||||
BuildMode::Auto => LinkMatchMode::CanonicalUrl,
|
||||
BuildMode::Explicit => manifest
|
||||
.link_rules
|
||||
.rules
|
||||
.iter()
|
||||
.find(|rule| selector_matches_any(&rule.from, manifest, entry_id))
|
||||
.map(|rule| rule.match_mode.clone())
|
||||
.unwrap_or(LinkMatchMode::CanonicalUrl),
|
||||
}
|
||||
}
|
||||
|
||||
fn selector_matches_any(selectors: &[String], manifest: &Manifest, entry_id: &str) -> bool {
|
||||
selectors
|
||||
.iter()
|
||||
.any(|selector| selector_matches(selector, manifest, entry_id))
|
||||
}
|
||||
|
||||
fn selector_matches(selector: &str, manifest: &Manifest, entry_id: &str) -> bool {
|
||||
if selector == "*" {
|
||||
return true;
|
||||
}
|
||||
if selector == entry_id {
|
||||
return true;
|
||||
}
|
||||
if let Some(section_id) = selector.strip_prefix("section:") {
|
||||
return manifest
|
||||
.sections
|
||||
.iter()
|
||||
.find(|section| section.id == section_id)
|
||||
.is_some_and(|section| section.entries.iter().any(|entry| entry == entry_id));
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn expand_selectors(selectors: &[String], manifest: &Manifest) -> BTreeSet<String> {
|
||||
let mut expanded = BTreeSet::new();
|
||||
for selector in selectors {
|
||||
if selector == "*" {
|
||||
expanded.extend(manifest.entries.keys().cloned());
|
||||
continue;
|
||||
}
|
||||
if let Some(section_id) = selector.strip_prefix("section:") {
|
||||
if let Some(section) = manifest
|
||||
.sections
|
||||
.iter()
|
||||
.find(|section| section.id == section_id)
|
||||
{
|
||||
expanded.extend(section.entries.iter().cloned());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if manifest.entries.contains_key(selector) {
|
||||
expanded.insert(selector.clone());
|
||||
}
|
||||
}
|
||||
expanded
|
||||
}
|
||||
|
||||
pub fn matches_target(
|
||||
href: &Url,
|
||||
policy: &LinkPolicy,
|
||||
target_id: &str,
|
||||
metadata: &EntryLinkMetadata,
|
||||
) -> bool {
|
||||
if !policy.targets.contains(target_id) {
|
||||
return false;
|
||||
}
|
||||
|
||||
match policy.match_mode {
|
||||
LinkMatchMode::Disabled => false,
|
||||
LinkMatchMode::CanonicalUrl => metadata
|
||||
.canonical_url
|
||||
.as_ref()
|
||||
.is_some_and(|candidate| candidate.as_str() == href.as_str()),
|
||||
LinkMatchMode::SourceUrl => metadata
|
||||
.source_url
|
||||
.as_ref()
|
||||
.is_some_and(|candidate| candidate.as_str() == href.as_str()),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
mod epub;
|
||||
mod error;
|
||||
pub mod extract;
|
||||
pub mod graph;
|
||||
pub mod manifest;
|
||||
pub mod normalize;
|
||||
mod pipeline;
|
||||
pub mod source;
|
||||
mod template;
|
||||
|
||||
pub use error::{EbookmError, Result};
|
||||
pub use extract::InspectResult;
|
||||
pub use manifest::{
|
||||
BuildMode, EntryDefinition, EntryLinkConfig, LinkMatchMode, LinkRule, Manifest,
|
||||
ProcessingDefaults, ProcessingOverrides,
|
||||
};
|
||||
pub use pipeline::{
|
||||
build_epub, inspect_source, load_manifest, render_init_manifest, validate_manifest,
|
||||
};
|
||||
@@ -0,0 +1,207 @@
|
||||
use chrono::NaiveDate;
|
||||
use indexmap::IndexMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Manifest {
|
||||
pub book: BookMetadata,
|
||||
pub output: OutputConfig,
|
||||
#[serde(default)]
|
||||
pub defaults: DefaultsConfig,
|
||||
#[serde(default)]
|
||||
pub sections: Vec<SectionDefinition>,
|
||||
#[serde(default)]
|
||||
pub entries: IndexMap<String, EntryDefinition>,
|
||||
#[serde(default)]
|
||||
pub link_rules: LinkRulesConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct BookMetadata {
|
||||
pub title: String,
|
||||
#[serde(default)]
|
||||
pub author: Option<String>,
|
||||
#[serde(default = "default_language")]
|
||||
pub language: String,
|
||||
#[serde(default = "default_identifier")]
|
||||
pub identifier: String,
|
||||
#[serde(default)]
|
||||
pub description: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct OutputConfig {
|
||||
pub path: String,
|
||||
#[serde(default)]
|
||||
pub cover_image: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct DefaultsConfig {
|
||||
#[serde(default = "default_true")]
|
||||
pub fetch_images: bool,
|
||||
#[serde(default = "default_true")]
|
||||
pub normalize_substack_embeds: bool,
|
||||
#[serde(default)]
|
||||
pub processing: ProcessingDefaults,
|
||||
#[serde(default)]
|
||||
pub metadata: MetadataOverrides,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SectionDefinition {
|
||||
pub id: String,
|
||||
pub title: String,
|
||||
#[serde(default)]
|
||||
pub entries: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct EntryDefinition {
|
||||
pub source: SourceDefinition,
|
||||
#[serde(default)]
|
||||
pub title: Option<String>,
|
||||
#[serde(default)]
|
||||
pub metadata: MetadataOverrides,
|
||||
#[serde(default)]
|
||||
pub processing: ProcessingOverrides,
|
||||
#[serde(default)]
|
||||
pub toc: TocConfig,
|
||||
#[serde(default)]
|
||||
pub links: EntryLinkConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "lowercase")]
|
||||
pub enum SourceDefinition {
|
||||
Substack { url: String },
|
||||
Html { path: String },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct MetadataOverrides {
|
||||
#[serde(default)]
|
||||
pub author: Option<String>,
|
||||
#[serde(default)]
|
||||
pub published: Option<NaiveDate>,
|
||||
#[serde(default)]
|
||||
pub subtitle: Option<String>,
|
||||
#[serde(default)]
|
||||
pub summary: Option<String>,
|
||||
#[serde(default)]
|
||||
pub tags: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ProcessingDefaults {
|
||||
#[serde(default = "default_true")]
|
||||
pub include_author: bool,
|
||||
#[serde(default = "default_true")]
|
||||
pub include_date: bool,
|
||||
#[serde(default = "default_true")]
|
||||
pub include_source_url: bool,
|
||||
#[serde(default)]
|
||||
pub skip_first_paragraphs: u32,
|
||||
}
|
||||
|
||||
impl Default for ProcessingDefaults {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
include_author: true,
|
||||
include_date: true,
|
||||
include_source_url: true,
|
||||
skip_first_paragraphs: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct ProcessingOverrides {
|
||||
#[serde(default)]
|
||||
pub include_author: Option<bool>,
|
||||
#[serde(default)]
|
||||
pub include_date: Option<bool>,
|
||||
#[serde(default)]
|
||||
pub include_source_url: Option<bool>,
|
||||
#[serde(default)]
|
||||
pub skip_first_paragraphs: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct TocConfig {
|
||||
#[serde(default)]
|
||||
pub title: Option<String>,
|
||||
#[serde(default)]
|
||||
pub hidden: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct EntryLinkConfig {
|
||||
#[serde(default)]
|
||||
pub mode: Option<BuildMode>,
|
||||
#[serde(default)]
|
||||
pub allow_to: Vec<String>,
|
||||
#[serde(default)]
|
||||
pub block_to: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum BuildMode {
|
||||
#[default]
|
||||
Auto,
|
||||
Explicit,
|
||||
None,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum LinkMatchMode {
|
||||
#[default]
|
||||
CanonicalUrl,
|
||||
SourceUrl,
|
||||
Disabled,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LinkRule {
|
||||
pub from: Vec<String>,
|
||||
pub to: Vec<String>,
|
||||
#[serde(default)]
|
||||
pub match_mode: LinkMatchMode,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LinkRulesConfig {
|
||||
#[serde(default)]
|
||||
pub mode: BuildMode,
|
||||
#[serde(default = "default_true")]
|
||||
pub rewrite_external_substack_links: bool,
|
||||
#[serde(default = "default_true")]
|
||||
pub preserve_other_external_links: bool,
|
||||
#[serde(default)]
|
||||
pub rules: Vec<LinkRule>,
|
||||
}
|
||||
|
||||
impl Default for LinkRulesConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
mode: BuildMode::Auto,
|
||||
rewrite_external_substack_links: true,
|
||||
preserve_other_external_links: true,
|
||||
rules: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Serde default provider for boolean options that are enabled unless the
/// manifest says otherwise.
fn default_true() -> bool {
    true
}
|
||||
|
||||
/// Serde default provider for the book language: `en`.
fn default_language() -> String {
    String::from("en")
}
|
||||
|
||||
fn default_identifier() -> String {
|
||||
format!("urn:uuid:{}", uuid::Uuid::new_v4())
|
||||
}
|
||||
@@ -0,0 +1,357 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::Path;
|
||||
|
||||
use kuchiki::traits::*;
|
||||
use regex::Regex;
|
||||
use sha1::{Digest, Sha1};
|
||||
use url::Url;
|
||||
|
||||
use crate::error::{EbookmError, Result};
|
||||
use crate::graph::{EntryLinkMetadata, LinkPolicy, matches_target};
|
||||
use crate::manifest::{DefaultsConfig, EntryDefinition};
|
||||
use crate::source::{SourceOrigin, resolve_relative_url};
|
||||
|
||||
/// A binary resource (currently an image) bundled with a chapter.
#[derive(Clone, Debug)]
pub struct Asset {
    /// Hex SHA-1 of the location the asset was loaded from.
    pub id: String,
    /// Package-relative path of the form `assets/<id>.<extension>`.
    pub href: String,
    /// MIME type inferred from the file extension.
    pub media_type: String,
    /// Raw file contents.
    pub bytes: Vec<u8>,
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NormalizedDocument {
|
||||
pub title: String,
|
||||
pub author: Option<String>,
|
||||
pub published: Option<chrono::NaiveDate>,
|
||||
pub canonical_url: Option<Url>,
|
||||
pub body_xhtml: String,
|
||||
pub assets: Vec<Asset>,
|
||||
}
|
||||
|
||||
pub fn normalize_document(
|
||||
entry_id: &str,
|
||||
entry: &EntryDefinition,
|
||||
defaults: &DefaultsConfig,
|
||||
origin: &SourceOrigin,
|
||||
extracted: crate::extract::ExtractedArticle,
|
||||
policy: &LinkPolicy,
|
||||
entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
|
||||
) -> Result<NormalizedDocument> {
|
||||
let mut document = kuchiki::parse_html().one(format!("<div>{}</div>", extracted.body_html));
|
||||
|
||||
remove_nodes(&mut document, "script,style,noscript,button,svg,source");
|
||||
if defaults.normalize_substack_embeds {
|
||||
remove_nodes(&mut document, "iframe");
|
||||
}
|
||||
skip_first_paragraphs(
|
||||
&mut document,
|
||||
entry
|
||||
.processing
|
||||
.skip_first_paragraphs
|
||||
.unwrap_or(defaults.processing.skip_first_paragraphs),
|
||||
);
|
||||
scrub_attributes(&mut document);
|
||||
|
||||
let mut assets = Vec::new();
|
||||
if defaults.fetch_images {
|
||||
collect_images(origin, &mut document, &mut assets)?;
|
||||
}
|
||||
|
||||
rewrite_links(entry_id, &mut document, origin, policy, entry_metadata);
|
||||
let body_xhtml = serialize_document(&document)?;
|
||||
|
||||
Ok(NormalizedDocument {
|
||||
title: entry.title.clone().unwrap_or(extracted.title),
|
||||
author: entry
|
||||
.metadata
|
||||
.author
|
||||
.clone()
|
||||
.or(extracted.author)
|
||||
.or(defaults.metadata.author.clone()),
|
||||
published: entry
|
||||
.metadata
|
||||
.published
|
||||
.or(extracted.published)
|
||||
.or(defaults.metadata.published),
|
||||
canonical_url: extracted.canonical_url,
|
||||
body_xhtml,
|
||||
assets,
|
||||
})
|
||||
}
|
||||
|
||||
fn remove_nodes(document: &mut kuchiki::NodeRef, selector: &str) {
|
||||
if let Ok(nodes) = document.select(selector) {
|
||||
let selected: Vec<_> = nodes.collect();
|
||||
for node in selected {
|
||||
node.as_node().detach();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_images(
|
||||
origin: &SourceOrigin,
|
||||
document: &mut kuchiki::NodeRef,
|
||||
assets: &mut Vec<Asset>,
|
||||
) -> Result<()> {
|
||||
let selected = document
|
||||
.select("img")
|
||||
.map(|items| items.collect::<Vec<_>>())
|
||||
.unwrap_or_default();
|
||||
|
||||
for node in selected {
|
||||
let mut attrs = node.attributes.borrow_mut();
|
||||
let src = attrs
|
||||
.get("src")
|
||||
.or_else(|| attrs.get("data-src"))
|
||||
.map(|value| value.to_string());
|
||||
let Some(src) = src else {
|
||||
continue;
|
||||
};
|
||||
|
||||
if let Ok(asset) = fetch_asset(origin, &src) {
|
||||
attrs.insert("src", format!("../{}", asset.href));
|
||||
assets.push(asset);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn fetch_asset(origin: &SourceOrigin, src: &str) -> Result<Asset> {
|
||||
match origin {
|
||||
SourceOrigin::LocalFile(base_path) => fetch_local_asset(base_path, src),
|
||||
SourceOrigin::Remote(base_url) => {
|
||||
let resolved = base_url.join(src).map_err(|source| EbookmError::UrlParse {
|
||||
value: src.to_string(),
|
||||
source,
|
||||
})?;
|
||||
fetch_remote_asset(&resolved)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_local_asset(base_path: &Path, src: &str) -> Result<Asset> {
|
||||
if let Ok(url) = Url::parse(src) {
|
||||
match url.scheme() {
|
||||
"http" | "https" => return fetch_remote_asset(&url),
|
||||
"file" => {
|
||||
let path = url
|
||||
.to_file_path()
|
||||
.map_err(|_| EbookmError::InvalidSourcePath {
|
||||
path: src.to_string(),
|
||||
})?;
|
||||
return build_asset_from_path(&path);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let path = if Path::new(src).is_absolute() {
|
||||
Path::new(src).to_path_buf()
|
||||
} else {
|
||||
base_path
|
||||
.parent()
|
||||
.unwrap_or_else(|| Path::new("."))
|
||||
.join(src)
|
||||
};
|
||||
build_asset_from_path(&path)
|
||||
}
|
||||
|
||||
fn fetch_remote_asset(url: &Url) -> Result<Asset> {
|
||||
let bytes = reqwest::blocking::get(url.clone())
|
||||
.and_then(|response| response.error_for_status())
|
||||
.map_err(|source| EbookmError::Request {
|
||||
url: url.to_string(),
|
||||
source,
|
||||
})?
|
||||
.bytes()
|
||||
.map_err(|source| EbookmError::Request {
|
||||
url: url.to_string(),
|
||||
source,
|
||||
})?
|
||||
.to_vec();
|
||||
|
||||
let extension = infer_extension_from_str(url.path());
|
||||
let media_type = infer_media_type(&extension);
|
||||
let digest = Sha1::digest(url.as_str().as_bytes());
|
||||
let id = format!("{:x}", digest);
|
||||
Ok(Asset {
|
||||
id: id.clone(),
|
||||
href: format!("assets/{}.{}", id, extension),
|
||||
media_type,
|
||||
bytes,
|
||||
})
|
||||
}
|
||||
|
||||
fn build_asset_from_path(path: &Path) -> Result<Asset> {
|
||||
let bytes = std::fs::read(path).map_err(|source| EbookmError::Io {
|
||||
path: path.display().to_string(),
|
||||
source,
|
||||
})?;
|
||||
let extension = infer_extension_from_path(path);
|
||||
let media_type = infer_media_type(&extension);
|
||||
let digest = Sha1::digest(path.display().to_string().as_bytes());
|
||||
let id = format!("{:x}", digest);
|
||||
Ok(Asset {
|
||||
id: id.clone(),
|
||||
href: format!("assets/{}.{}", id, extension),
|
||||
media_type,
|
||||
bytes,
|
||||
})
|
||||
}
|
||||
|
||||
fn rewrite_links(
|
||||
entry_id: &str,
|
||||
document: &mut kuchiki::NodeRef,
|
||||
origin: &SourceOrigin,
|
||||
policy: &LinkPolicy,
|
||||
entry_metadata: &BTreeMap<String, EntryLinkMetadata>,
|
||||
) {
|
||||
let selected = document
|
||||
.select("a[href]")
|
||||
.map(|items| items.collect::<Vec<_>>())
|
||||
.unwrap_or_default();
|
||||
|
||||
for node in selected {
|
||||
let mut attrs = node.attributes.borrow_mut();
|
||||
let href = attrs.get("href").map(|value| value.to_string());
|
||||
let Some(href) = href else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let Some(resolved) = resolve_relative_url(origin, &href) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
if let Some((target_id, _)) = entry_metadata.iter().find(|(target_id, metadata)| {
|
||||
*target_id != entry_id && matches_target(&resolved, policy, target_id, metadata)
|
||||
}) {
|
||||
attrs.insert("href", format!("../text/{}.xhtml", target_id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_document(document: &kuchiki::NodeRef) -> Result<String> {
|
||||
let wrapper = document
|
||||
.select_first("div")
|
||||
.map_err(|_| EbookmError::Epub {
|
||||
message: "failed to serialize normalized document".to_string(),
|
||||
})?;
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
for child in wrapper.as_node().children() {
|
||||
child
|
||||
.serialize(&mut bytes)
|
||||
.map_err(|error| EbookmError::Epub {
|
||||
message: error.to_string(),
|
||||
})?;
|
||||
}
|
||||
|
||||
let html = String::from_utf8(bytes).map_err(|error| EbookmError::Epub {
|
||||
message: error.to_string(),
|
||||
})?;
|
||||
Ok(to_xhtml_fragment(&html))
|
||||
}
|
||||
|
||||
fn scrub_attributes(document: &mut kuchiki::NodeRef) {
|
||||
if let Ok(nodes) = document.select("*") {
|
||||
let selected: Vec<_> = nodes.collect();
|
||||
for node in selected {
|
||||
let mut attrs = node.attributes.borrow_mut();
|
||||
let names: Vec<_> = attrs.map.keys().cloned().collect();
|
||||
for name in names {
|
||||
let local = name.local.to_string();
|
||||
let keep = match node.name.local.as_ref() {
|
||||
"a" => matches!(local.as_str(), "href" | "title"),
|
||||
"img" => matches!(local.as_str(), "src" | "alt"),
|
||||
_ => false,
|
||||
};
|
||||
if !keep {
|
||||
attrs.map.remove(&name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_first_paragraphs(document: &mut kuchiki::NodeRef, count: u32) {
|
||||
if count == 0 {
|
||||
return;
|
||||
}
|
||||
let selected = document
|
||||
.select("p")
|
||||
.map(|items| items.take(count as usize).collect::<Vec<_>>())
|
||||
.unwrap_or_default();
|
||||
for node in selected {
|
||||
node.as_node().detach();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the lowercase file extension of `path`, or `"bin"` when there is
/// none.
///
/// The extension ends up in asset hrefs and drives media-type inference, so
/// it is normalised: it is lowercased (`photo.JPG` yields `jpg`), and
/// anything that is empty, not valid UTF-8, or not purely ASCII alphanumeric
/// is replaced by `"bin"`.
fn infer_extension_from_path(path: &Path) -> String {
    path.extension()
        .and_then(|value| value.to_str())
        .filter(|value| {
            !value.is_empty() && value.chars().all(|ch| ch.is_ascii_alphanumeric())
        })
        .map(|value| value.to_ascii_lowercase())
        .unwrap_or_else(|| "bin".to_string())
}
|
||||
|
||||
/// Returns the lowercase extension of a URL path, or `"bin"` when there is
/// no usable one.
///
/// URL paths often end in a segment that merely contains a dot, e.g. a
/// percent-encoded nested URL such as `...amazonaws.com%2Fpublic%2Fabc`.
/// Taking everything after the last dot would then yield a garbage
/// "extension" that ends up in the asset href. Only purely ASCII
/// alphanumeric extensions are accepted; anything else becomes `"bin"`.
fn infer_extension_from_str(path: &str) -> String {
    Path::new(path)
        .extension()
        .and_then(|value| value.to_str())
        .filter(|value| {
            !value.is_empty() && value.chars().all(|ch| ch.is_ascii_alphanumeric())
        })
        .map(|value| value.to_ascii_lowercase())
        .unwrap_or_else(|| "bin".to_string())
}
|
||||
|
||||
/// Maps a file extension (without the leading dot) to a MIME type.
///
/// Matching is ASCII case-insensitive, so `JPG` and `jpg` are treated alike.
/// Unknown extensions map to `application/octet-stream`.
fn infer_media_type(extension: &str) -> String {
    let media_type = match extension.to_ascii_lowercase().as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "png" => "image/png",
        "gif" => "image/gif",
        "svg" => "image/svg+xml",
        "webp" => "image/webp",
        _ => "application/octet-stream",
    };
    media_type.to_string()
}
|
||||
|
||||
fn to_xhtml_fragment(html: &str) -> String {
|
||||
let img_re = Regex::new(r#"<img([^>]*)>"#).expect("valid img regex");
|
||||
let hr_re = Regex::new(r#"<hr([^>]*)>"#).expect("valid hr regex");
|
||||
let br_re = Regex::new(r#"<br([^>]*)>"#).expect("valid br regex");
|
||||
|
||||
let html = img_re.replace_all(html, "<img$1 />").into_owned();
|
||||
let html = hr_re.replace_all(&html, "<hr$1 />").into_owned();
|
||||
br_re.replace_all(&html, "<br$1 />").into_owned()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::to_xhtml_fragment;
    use quick_xml::Reader;
    use quick_xml::events::Event;

    /// Void tags must come out self-closed, and the result must be readable
    /// by an XML parser once wrapped in a root element.
    #[test]
    fn converts_void_html_tags_to_xhtml_self_closing_tags() {
        let input = r#"<p>Intro</p><picture><img alt="" src="a.jpg"></picture><hr><br>"#;
        let xhtml = to_xhtml_fragment(input);

        for expected in [r#"<img alt="" src="a.jpg" />"#, "<hr />", "<br />"] {
            assert!(xhtml.contains(expected));
        }

        let wrapped = format!(
            r#"<?xml version="1.0" encoding="UTF-8"?><root>{}</root>"#,
            xhtml
        );
        let mut reader = Reader::from_str(&wrapped);
        loop {
            let event = reader
                .read_event()
                .unwrap_or_else(|error| panic!("invalid XML generated: {error}"));
            if matches!(event, Event::Eof) {
                break;
            }
        }
    }
}
|
||||
@@ -0,0 +1,432 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::epub::write_epub;
|
||||
use crate::error::{EbookmError, Result};
|
||||
use crate::extract::{InspectResult, inspect_article};
|
||||
use crate::graph::{EntryLinkMetadata, build_link_policies};
|
||||
use crate::manifest::{Manifest, SourceDefinition};
|
||||
use crate::normalize::{Asset, NormalizedDocument};
|
||||
use crate::source::{SourceSpec, load_source};
|
||||
use crate::template::INIT_MANIFEST;
|
||||
|
||||
/// A fully rendered chapter.
#[derive(Clone, Debug)]
pub struct BuiltChapter {
    /// Title shown in navigation: the entry's TOC title override, or the
    /// document title when no override is set.
    pub nav_title: String,
    /// Complete XHTML document for the chapter.
    pub xhtml: String,
}
|
||||
|
||||
/// Which metadata lines appear under a chapter heading, after per-entry
/// overrides have been applied on top of the manifest defaults.
#[derive(Clone, Copy, Debug)]
struct ChapterHeaderOptions {
    /// Emit the author line.
    include_author: bool,
    /// Emit the publication date line.
    include_date: bool,
    /// Emit a link to the canonical URL.
    include_source_url: bool,
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BuiltEntry {
|
||||
pub id: String,
|
||||
pub hidden_from_toc: bool,
|
||||
pub chapter: BuiltChapter,
|
||||
pub assets: Vec<Asset>,
|
||||
}
|
||||
|
||||
pub fn load_manifest(path: &Path) -> Result<Manifest> {
|
||||
let contents = fs::read_to_string(path).map_err(|source| EbookmError::Io {
|
||||
path: path.display().to_string(),
|
||||
source,
|
||||
})?;
|
||||
serde_yaml::from_str(&contents).map_err(|source| EbookmError::ManifestParse {
|
||||
path: path.display().to_string(),
|
||||
source,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn validate_manifest(manifest: &Manifest) -> Result<Vec<String>> {
|
||||
let mut issues = Vec::new();
|
||||
let mut warnings = Vec::new();
|
||||
|
||||
if manifest.book.title.trim().is_empty() {
|
||||
issues.push("book.title must not be empty".to_string());
|
||||
}
|
||||
if manifest.output.path.trim().is_empty() {
|
||||
issues.push("output.path must not be empty".to_string());
|
||||
}
|
||||
if manifest.sections.is_empty() {
|
||||
issues.push("at least one section is required".to_string());
|
||||
}
|
||||
if manifest.entries.is_empty() {
|
||||
issues.push("at least one entry is required".to_string());
|
||||
}
|
||||
|
||||
for section in &manifest.sections {
|
||||
if section.entries.is_empty() {
|
||||
warnings.push(format!("section {} has no entries", section.id));
|
||||
}
|
||||
for entry_id in §ion.entries {
|
||||
if !manifest.entries.contains_key(entry_id) {
|
||||
issues.push(format!(
|
||||
"section {} references unknown entry {}",
|
||||
section.id, entry_id
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (entry_id, entry) in &manifest.entries {
|
||||
for target in &entry.links.allow_to {
|
||||
if !manifest.entries.contains_key(target) {
|
||||
issues.push(format!(
|
||||
"entry {entry_id} allow_to target {target} does not exist"
|
||||
));
|
||||
}
|
||||
}
|
||||
for target in &entry.links.block_to {
|
||||
if !manifest.entries.contains_key(target) {
|
||||
issues.push(format!(
|
||||
"entry {entry_id} block_to target {target} does not exist"
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for rule in &manifest.link_rules.rules {
|
||||
validate_selectors(manifest, &rule.from, "from", &mut issues);
|
||||
validate_selectors(manifest, &rule.to, "to", &mut issues);
|
||||
}
|
||||
|
||||
for entry_id in manifest.entries.keys() {
|
||||
if !manifest.sections.iter().any(|section| {
|
||||
section
|
||||
.entries
|
||||
.iter()
|
||||
.any(|candidate| candidate == entry_id)
|
||||
}) {
|
||||
warnings.push(format!("entry {entry_id} is not referenced by any section"));
|
||||
}
|
||||
}
|
||||
|
||||
if issues.is_empty() {
|
||||
Ok(warnings)
|
||||
} else {
|
||||
Err(EbookmError::Validation { issues })
|
||||
}
|
||||
}
|
||||
|
||||
pub fn inspect_source(source: &str) -> Result<InspectResult> {
|
||||
let spec = if source.starts_with("http://") || source.starts_with("https://") {
|
||||
SourceSpec::from_definition(
|
||||
&SourceDefinition::Substack {
|
||||
url: source.to_string(),
|
||||
},
|
||||
Path::new("."),
|
||||
)?
|
||||
} else {
|
||||
SourceSpec::from_definition(
|
||||
&SourceDefinition::Html {
|
||||
path: source.to_string(),
|
||||
},
|
||||
Path::new("."),
|
||||
)?
|
||||
};
|
||||
let loaded = load_source(&spec)?;
|
||||
inspect_article(&loaded)
|
||||
}
|
||||
|
||||
pub fn build_epub(manifest: &Manifest, manifest_path: &Path) -> Result<()> {
|
||||
let manifest_dir = manifest_path.parent().unwrap_or_else(|| Path::new("."));
|
||||
|
||||
let mut entry_specs = BTreeMap::new();
|
||||
let mut loaded_sources = BTreeMap::new();
|
||||
let mut extracted = BTreeMap::new();
|
||||
let mut metadata = BTreeMap::new();
|
||||
|
||||
for (entry_id, entry) in &manifest.entries {
|
||||
let spec = SourceSpec::from_definition(&entry.source, manifest_dir)?;
|
||||
let loaded = load_source(&spec)?;
|
||||
let article = crate::extract::extract_article(&loaded)?;
|
||||
let source_url = match &spec {
|
||||
SourceSpec::SubstackUrl(url) => Some(url.clone()),
|
||||
SourceSpec::LocalHtml(_) => None,
|
||||
};
|
||||
|
||||
metadata.insert(
|
||||
entry_id.clone(),
|
||||
EntryLinkMetadata {
|
||||
source_url,
|
||||
canonical_url: article.canonical_url.clone(),
|
||||
},
|
||||
);
|
||||
entry_specs.insert(entry_id.clone(), spec);
|
||||
loaded_sources.insert(entry_id.clone(), loaded);
|
||||
extracted.insert(entry_id.clone(), article);
|
||||
}
|
||||
|
||||
let policies = build_link_policies(manifest, &metadata);
|
||||
let mut built_entries = Vec::new();
|
||||
|
||||
for section in &manifest.sections {
|
||||
for entry_id in §ion.entries {
|
||||
let entry = &manifest.entries[entry_id];
|
||||
let loaded = loaded_sources.get(entry_id).expect("entry was loaded");
|
||||
let article = extracted
|
||||
.get(entry_id)
|
||||
.expect("entry was extracted")
|
||||
.clone();
|
||||
let policy = policies.get(entry_id).expect("policy was built");
|
||||
|
||||
let normalized = crate::normalize::normalize_document(
|
||||
entry_id,
|
||||
entry,
|
||||
&manifest.defaults,
|
||||
&loaded.origin,
|
||||
article,
|
||||
policy,
|
||||
&metadata,
|
||||
)?;
|
||||
|
||||
built_entries.push(BuiltEntry {
|
||||
id: entry_id.clone(),
|
||||
hidden_from_toc: entry.toc.hidden,
|
||||
chapter: build_chapter(entry_id, entry, &manifest.defaults, &normalized),
|
||||
assets: normalized.assets,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let cover = manifest
|
||||
.output
|
||||
.cover_image
|
||||
.as_ref()
|
||||
.map(|path| load_cover(path, manifest_dir))
|
||||
.transpose()?;
|
||||
let output_path = manifest_dir.join(&manifest.output.path);
|
||||
write_epub(manifest, &built_entries, &output_path, cover)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn render_init_manifest() -> &'static str {
|
||||
INIT_MANIFEST
|
||||
}
|
||||
|
||||
fn build_chapter(
|
||||
entry_id: &str,
|
||||
entry: &crate::manifest::EntryDefinition,
|
||||
defaults: &crate::manifest::DefaultsConfig,
|
||||
doc: &NormalizedDocument,
|
||||
) -> BuiltChapter {
|
||||
let nav_title = entry.toc.title.clone().unwrap_or_else(|| doc.title.clone());
|
||||
let header = resolve_header_options(entry, defaults);
|
||||
let author = doc.author.clone().unwrap_or_default();
|
||||
let published = doc
|
||||
.published
|
||||
.map(|date| date.to_string())
|
||||
.unwrap_or_default();
|
||||
let mut meta_lines = Vec::new();
|
||||
if header.include_author && !author.is_empty() {
|
||||
meta_lines.push(format!("<p>{}</p>", escape_html(&author)));
|
||||
}
|
||||
if header.include_date && !published.is_empty() {
|
||||
meta_lines.push(format!("<p>{}</p>", escape_html(&published)));
|
||||
}
|
||||
if header.include_source_url {
|
||||
if let Some(url) = doc.canonical_url.as_ref() {
|
||||
let escaped = escape_html(url.as_str());
|
||||
meta_lines.push(format!(r#"<p><a href="{0}">{0}</a></p>"#, escaped));
|
||||
}
|
||||
}
|
||||
|
||||
let meta_block = if meta_lines.is_empty() {
|
||||
String::new()
|
||||
} else {
|
||||
format!(r#"<div class="chapter-meta">{}</div>"#, meta_lines.join(""))
|
||||
};
|
||||
|
||||
let xhtml = format!(
|
||||
r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>{}</title>
|
||||
<link rel="stylesheet" type="text/css" href="../styles/book.css"/>
|
||||
</head>
|
||||
<body id="{}">
|
||||
<h1>{}</h1>
|
||||
{}
|
||||
{}
|
||||
</body>
|
||||
</html>"#,
|
||||
escape_html(&doc.title),
|
||||
escape_html(entry_id),
|
||||
escape_html(&doc.title),
|
||||
meta_block,
|
||||
doc.body_xhtml
|
||||
);
|
||||
|
||||
BuiltChapter { nav_title, xhtml }
|
||||
}
|
||||
|
||||
fn validate_selectors(
|
||||
manifest: &Manifest,
|
||||
selectors: &[String],
|
||||
field: &str,
|
||||
issues: &mut Vec<String>,
|
||||
) {
|
||||
for selector in selectors {
|
||||
if selector == "*" {
|
||||
continue;
|
||||
}
|
||||
if manifest.entries.contains_key(selector) {
|
||||
continue;
|
||||
}
|
||||
if let Some(section_id) = selector.strip_prefix("section:") {
|
||||
if manifest
|
||||
.sections
|
||||
.iter()
|
||||
.any(|section| section.id == section_id)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
issues.push(format!("unknown {field} selector {selector}"));
|
||||
}
|
||||
}
|
||||
|
||||
fn load_cover(path: &str, manifest_dir: &Path) -> Result<(String, Vec<u8>)> {
|
||||
let full_path = manifest_dir.join(path);
|
||||
let bytes = fs::read(&full_path).map_err(|source| EbookmError::Io {
|
||||
path: full_path.display().to_string(),
|
||||
source,
|
||||
})?;
|
||||
let extension = full_path
|
||||
.extension()
|
||||
.and_then(|value| value.to_str())
|
||||
.unwrap_or("jpg");
|
||||
Ok((format!("assets/cover.{extension}"), bytes))
|
||||
}
|
||||
|
||||
/// Escapes text for use in XHTML element content and double-quoted
/// attribute values.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and
/// `&quot;`. The input is processed in a single pass, so an ampersand
/// produced by one replacement is never escaped a second time.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
|
||||
|
||||
fn resolve_header_options(
|
||||
entry: &crate::manifest::EntryDefinition,
|
||||
defaults: &crate::manifest::DefaultsConfig,
|
||||
) -> ChapterHeaderOptions {
|
||||
ChapterHeaderOptions {
|
||||
include_author: entry
|
||||
.processing
|
||||
.include_author
|
||||
.unwrap_or(defaults.processing.include_author),
|
||||
include_date: entry
|
||||
.processing
|
||||
.include_date
|
||||
.unwrap_or(defaults.processing.include_date),
|
||||
include_source_url: entry
|
||||
.processing
|
||||
.include_source_url
|
||||
.unwrap_or(defaults.processing.include_source_url),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::fs;
    use std::io::Read;

    use tempfile::tempdir;
    use zip::ZipArchive;

    use super::*;

    /// End-to-end check with a local HTML source: the manifest loads and
    /// validates, the EPUB is written, the chapter carries the author line
    /// but neither the date nor the book identifier, and the referenced
    /// image is bundled under `OEBPS/assets/`.
    #[test]
    fn validates_and_builds_local_html_manifest() {
        let temp = tempdir().expect("tempdir");
        let root = temp.path();

        fs::write(
            root.join("article.html"),
            r#"<!doctype html>
<html>
  <head>
    <title>Local Essay</title>
    <meta name="author" content="Local Author" />
    <meta property="article:published_time" content="2025-01-10T00:00:00Z" />
  </head>
  <body>
    <article>
      <p>Hello world.</p>
      <img src="author.jpg" alt="Author" />
    </article>
  </body>
</html>"#,
        )
        .expect("write html");
        fs::write(root.join("author.jpg"), b"fake-jpeg-data").expect("write image");

        // YAML nesting is significant: keep the indentation below intact.
        let manifest_path = root.join("book.yaml");
        fs::write(
            &manifest_path,
            r#"book:
  title: "Local Book"
  author: "Editor"
  language: "en"
  identifier: "urn:uuid:test-book"
output:
  path: "dist/test.epub"
defaults:
  fetch_images: true
  normalize_substack_embeds: true
  processing:
    include_author: true
    include_date: false
    include_source_url: false
    skip_first_paragraphs: 0
sections:
  - id: "part-1"
    title: "Part 1"
    entries:
      - "essay"
entries:
  essay:
    source:
      kind: "html"
      path: "article.html"
link_rules:
  mode: "auto"
"#,
        )
        .expect("write manifest");

        let manifest = load_manifest(&manifest_path).expect("manifest");
        validate_manifest(&manifest).expect("manifest valid");
        build_epub(&manifest, &manifest_path).expect("build epub");

        let epub_path = root.join("dist/test.epub");
        assert!(epub_path.exists());

        let file = fs::File::open(&epub_path).expect("epub file");
        let mut archive = ZipArchive::new(file).expect("zip");
        assert!(archive.by_name("mimetype").is_ok());
        assert!(archive.by_name("OEBPS/content.opf").is_ok());

        let mut chapter = archive
            .by_name("OEBPS/text/essay.xhtml")
            .expect("chapter file");
        let mut chapter_contents = String::new();
        chapter
            .read_to_string(&mut chapter_contents)
            .expect("read chapter");
        assert!(chapter_contents.contains("<p>Local Author</p>"));
        assert!(!chapter_contents.contains("<p>2025-01-10</p>"));
        assert!(!chapter_contents.contains("urn:uuid:test-book"));
        assert!(chapter_contents.contains("../assets/"));
        // Release the borrow on the archive before listing its entries.
        drop(chapter);

        assert!(archive
            .file_names()
            .any(|name| name.starts_with("OEBPS/assets/") && name.ends_with(".jpg")));
    }
}
|
||||
@@ -0,0 +1,97 @@
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use url::Url;
|
||||
|
||||
use crate::error::{EbookmError, Result};
|
||||
use crate::manifest::SourceDefinition;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum SourceSpec {
|
||||
SubstackUrl(Url),
|
||||
LocalHtml(PathBuf),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum SourceOrigin {
|
||||
Remote(Url),
|
||||
LocalFile(PathBuf),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LoadedSource {
|
||||
pub origin: SourceOrigin,
|
||||
pub html: String,
|
||||
}
|
||||
|
||||
impl SourceSpec {
|
||||
pub fn from_definition(definition: &SourceDefinition, manifest_dir: &Path) -> Result<Self> {
|
||||
match definition {
|
||||
SourceDefinition::Substack { url } => Ok(SourceSpec::SubstackUrl(
|
||||
Url::parse(url).map_err(|source| EbookmError::UrlParse {
|
||||
value: url.clone(),
|
||||
source,
|
||||
})?,
|
||||
)),
|
||||
SourceDefinition::Html { path } => {
|
||||
let joined = manifest_dir.join(path);
|
||||
Ok(SourceSpec::LocalHtml(joined))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn load_source(spec: &SourceSpec) -> Result<LoadedSource> {
|
||||
match spec {
|
||||
SourceSpec::SubstackUrl(url) => {
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.user_agent("ebookm/0.1")
|
||||
.build()
|
||||
.map_err(|source| EbookmError::Request {
|
||||
url: url.to_string(),
|
||||
source,
|
||||
})?;
|
||||
let html = client
|
||||
.get(url.clone())
|
||||
.send()
|
||||
.and_then(|response| response.error_for_status())
|
||||
.map_err(|source| EbookmError::Request {
|
||||
url: url.to_string(),
|
||||
source,
|
||||
})?
|
||||
.text()
|
||||
.map_err(|source| EbookmError::Request {
|
||||
url: url.to_string(),
|
||||
source,
|
||||
})?;
|
||||
Ok(LoadedSource {
|
||||
origin: SourceOrigin::Remote(url.clone()),
|
||||
html,
|
||||
})
|
||||
}
|
||||
SourceSpec::LocalHtml(path) => {
|
||||
let html = fs::read_to_string(path).map_err(|source| EbookmError::Io {
|
||||
path: path.display().to_string(),
|
||||
source,
|
||||
})?;
|
||||
Ok(LoadedSource {
|
||||
origin: SourceOrigin::LocalFile(path.clone()),
|
||||
html,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn resolve_relative_url(origin: &SourceOrigin, href: &str) -> Option<Url> {
|
||||
match origin {
|
||||
SourceOrigin::Remote(base) => base.join(href).ok(),
|
||||
SourceOrigin::LocalFile(path) => {
|
||||
if let Ok(url) = Url::parse(href) {
|
||||
return Some(url);
|
||||
}
|
||||
let parent = path.parent()?;
|
||||
let joined = parent.join(href);
|
||||
Url::from_file_path(joined).ok()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
/// Starter manifest returned by `render_init_manifest`.
///
/// This is YAML, so the indentation inside the literal is significant: it
/// defines which keys belong to `book`, `output`, `defaults`, each section,
/// each entry and `link_rules`.
pub const INIT_MANIFEST: &str = r#"book:
  title: "Collected Substack Essays"
  author: "Author Name"
  language: "en"
  identifier: "urn:uuid:11111111-2222-3333-4444-555555555555"
  description: "A compiled EPUB built by ebookm"

output:
  path: "dist/collection.epub"

defaults:
  fetch_images: true
  normalize_substack_embeds: true
  processing:
    include_author: true
    include_date: true
    include_source_url: true
    skip_first_paragraphs: 0
  metadata:
    author: "Author Name"

sections:
  - id: "essays"
    title: "Essays"
    entries:
      - "opening-post"
      - "saved-html"

entries:
  opening-post:
    source:
      kind: "substack"
      url: "https://example.substack.com/p/opening-post"
    processing:
      skip_first_paragraphs: 1
    toc:
      title: "Opening Post"

  saved-html:
    source:
      kind: "html"
      path: "articles/saved-post.html"
    title: "Saved Local Article"
    links:
      mode: "explicit"
      allow_to: ["opening-post"]

link_rules:
  mode: "auto"
  rewrite_external_substack_links: true
  preserve_other_external_links: true
  rules:
    - from: ["section:essays"]
      to: ["section:essays"]
      match_mode: "canonical-url"
"#;
|
||||
Reference in New Issue
Block a user