Compare commits

..

2 Commits

Author SHA1 Message Date
dan 2b66132d0d fix title extraction and epub errors 2026-05-25 17:48:52 +02:00
dan d8c59dacc1 update Age of Peace 2026-05-25 17:48:37 +02:00
5 changed files with 210 additions and 47 deletions
+5 -3
View File
@@ -2,7 +2,7 @@ book:
title: "Age of Peace"
author: "John Gu"
language: "en"
identifier: "urn:uuid:ageofpeace:johngu"
identifier: "urn:book:ageofpeace:johngu"
description: "Age of Peace: a novel"
output:
@@ -36,7 +36,9 @@ sections:
- "biridana"
- id: "part-3"
title: "Nameless Country"
entries: []
entries:
- "in_the_east"
- "finale"
entries:
@@ -79,7 +81,7 @@ entries:
finale:
source:
kind: "substack"
url: "https://ageofpeace.substack.com/p/finale"
url: "https://ageofpeace.substack.com/p/age-of-peace"
toc:
title: "Finale"
processing:
+52 -9
View File
@@ -3,6 +3,7 @@ use std::fs::File;
use std::io::Write;
use std::path::Path;
use chrono::Utc;
use quick_xml::escape::escape;
use zip::CompressionMethod;
use zip::write::{SimpleFileOptions, ZipWriter};
@@ -181,11 +182,16 @@ fn build_ncx(manifest: &crate::manifest::Manifest, built: &[BuiltEntry]) -> Stri
let section_play_order = play_order;
play_order += 1;
let section_target = section_entries[0]
.section_anchor
.as_ref()
.map(|anchor| format!("text/{}.xhtml#{}", section_entries[0].id, anchor))
.unwrap_or_else(|| format!("text/{}.xhtml", section_entries[0].id));
let mut child_points = String::new();
for entry in &section_entries {
child_points.push_str(&format!(
"<navPoint id=\"nav-{}\" playOrder=\"{}\"><navLabel><text>{}</text></navLabel><content src=\"text/{}.xhtml\"/></navPoint>",
escape(&entry.id),
escape(&xml_id("nav", &entry.id)),
play_order,
escape(&entry.chapter.nav_title),
escape(&entry.id)
@@ -194,11 +200,11 @@ fn build_ncx(manifest: &crate::manifest::Manifest, built: &[BuiltEntry]) -> Stri
}
nav_points.push_str(&format!(
"<navPoint id=\"section-{}\" playOrder=\"{}\"><navLabel><text>{}</text></navLabel><content src=\"text/{}.xhtml\"/>{}</navPoint>",
escape(&section.id),
"<navPoint id=\"{}\" playOrder=\"{}\"><navLabel><text>{}</text></navLabel><content src=\"{}\"/>{}</navPoint>",
escape(&xml_id("section", &section.id)),
section_play_order,
escape(&section.title),
escape(&section_entries[0].id),
escape(&section_target),
child_points
));
}
@@ -233,14 +239,17 @@ fn build_opf(
for entry in built {
manifest_items.push_str(&format!(
"<item id=\"{}\" href=\"text/{}.xhtml\" media-type=\"application/xhtml+xml\"/>",
escape(&entry.id),
escape(&xml_id("entry", &entry.id)),
escape(&entry.id)
));
spine_items.push_str(&format!("<itemref idref=\"{}\"/>", escape(&entry.id)));
spine_items.push_str(&format!(
"<itemref idref=\"{}\"/>",
escape(&xml_id("entry", &entry.id))
));
for asset in &entry.assets {
manifest_items.push_str(&format!(
"<item id=\"{}\" href=\"{}\" media-type=\"{}\"/>",
escape(&asset.id),
escape(&xml_id("asset", &asset.id)),
escape(&asset.href),
escape(&asset.media_type)
));
@@ -249,8 +258,9 @@ fn build_opf(
if let Some(cover_href) = cover_href {
manifest_items.push_str(&format!(
"<item id=\"cover\" href=\"{}\" media-type=\"image/jpeg\" properties=\"cover-image\"/>",
escape(cover_href)
"<item id=\"cover\" href=\"{}\" media-type=\"{}\" properties=\"cover-image\"/>",
escape(cover_href),
escape(&media_type_from_href(cover_href))
));
}
@@ -260,6 +270,7 @@ fn build_opf(
.clone()
.unwrap_or_else(|| "Unknown".to_string());
let description = manifest.book.description.clone().unwrap_or_default();
let modified = Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string();
format!(
r#"<?xml version="1.0" encoding="UTF-8"?>
<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="bookid">
@@ -269,6 +280,7 @@ fn build_opf(
<dc:creator>{}</dc:creator>
<dc:language>{}</dc:language>
<dc:description>{}</dc:description>
<meta property="dcterms:modified">{}</meta>
</metadata>
<manifest>{}</manifest>
<spine toc="ncx">{}</spine>
@@ -278,11 +290,42 @@ fn build_opf(
escape(&author),
escape(&manifest.book.language),
escape(&description),
escape(&modified),
manifest_items,
spine_items
)
}
/// Builds an XML-safe identifier of the form `{prefix}-{sanitized value}`.
///
/// ASCII letters, ASCII digits, `_`, `-` and `.` are copied from `value`
/// unchanged; every other character is replaced by a single `-`.
fn xml_id(prefix: &str, value: &str) -> String {
    let sanitized = value.chars().map(|c| match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-' | '.' => c,
        _ => '-',
    });

    let mut out = String::with_capacity(prefix.len() + 1 + value.len());
    out.push_str(prefix);
    out.push('-');
    out.extend(sanitized);
    out
}
/// Guesses an image media type from the file extension of `href`.
///
/// Only the final `/`-separated path segment is inspected, and the extension
/// is whatever follows its last `.` (compared case-insensitively). A segment
/// without any `.` has no extension, so a bare name such as `png` is not
/// mistaken for a PNG file.
///
/// Returns `application/octet-stream` when the extension is missing or not
/// one of the recognised image types.
fn media_type_from_href(href: &str) -> String {
    // `rsplit` always yields at least one item, so the fallback is only a formality.
    let file_name = href.rsplit('/').next().unwrap_or(href);
    let extension = file_name
        .rsplit_once('.')
        .map(|(_, ext)| ext.to_ascii_lowercase());

    let media_type = match extension.as_deref() {
        Some("jpg") | Some("jpeg") => "image/jpeg",
        Some("png") => "image/png",
        Some("gif") => "image/gif",
        Some("svg") => "image/svg+xml",
        Some("webp") => "image/webp",
        Some("avif") => "image/avif",
        _ => "application/octet-stream",
    };
    media_type.to_string()
}
const CONTAINER_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
+21 -21
View File
@@ -26,25 +26,7 @@ pub struct InspectResult {
pub fn extract_article(loaded: &LoadedSource) -> Result<ExtractedArticle> {
let document = Html::parse_document(&loaded.html);
let json_ld = extract_primary_json_ld(&document);
let title = select_content(
&document,
&[
r#"meta[property="og:title"]"#,
r#"article .post-title"#,
".post-title",
"h1",
"title",
],
"content",
)
.or_else(|| {
select_text(
&document,
&[r#"article .post-title"#, ".post-title", "h1", "title"],
)
})
.or_else(|| json_ld_string(&json_ld, "headline"))
.ok_or_else(|| EbookmError::Extraction {
let title = extract_title(&document, &json_ld).ok_or_else(|| EbookmError::Extraction {
input: origin_label(&loaded.origin),
})?;
@@ -131,6 +113,20 @@ fn select_content(document: &Html, selectors: &[&str], attr: &str) -> Option<Str
})
}
/// Resolves the article title from the parsed page.
///
/// The CSS selectors below are handed to `select_text` in order of
/// preference; when none of them produces text, the JSON-LD `headline`
/// value is used instead. Returns `None` when neither source yields a title.
fn extract_title(document: &Html, json_ld: &Option<Value>) -> Option<String> {
    const TITLE_SELECTORS: [&str; 5] = [
        "h1.post-title",
        ".post-title",
        "article .post-title",
        "title",
        "h1",
    ];

    if let Some(from_markup) = select_text(document, &TITLE_SELECTORS) {
        return Some(from_markup);
    }
    json_ld_string(json_ld, "headline")
}
fn select_text(document: &Html, selectors: &[&str]) -> Option<String> {
selectors.iter().find_map(|selector| {
let selector = Selector::parse(selector).ok()?;
@@ -205,7 +201,11 @@ fn json_ld_string(json_ld: &Option<Value>, key: &str) -> Option<String> {
fn json_ld_author(json_ld: &Option<Value>) -> Option<String> {
let author = json_ld.as_ref()?.get("author")?;
if let Some(author_name) = author.get(0).and_then(|entry| entry.get("name")).and_then(Value::as_str) {
if let Some(author_name) = author
.get(0)
.and_then(|entry| entry.get("name"))
.and_then(Value::as_str)
{
return Some(author_name.to_string());
}
if let Some(author_name) = author.get("name").and_then(Value::as_str) {
@@ -255,7 +255,7 @@ mod tests {
};
let extracted = extract_article(&loaded).expect("extract article");
assert_eq!(extracted.title, "A Contested Island");
assert_eq!(extracted.title, "Chapter 1: A Contested Island");
assert_eq!(extracted.author.as_deref(), Some("John Gu"));
assert_eq!(
extracted.published,
+105 -10
View File
@@ -3,6 +3,7 @@ use std::path::Path;
use kuchiki::traits::*;
use regex::Regex;
use reqwest::header::CONTENT_TYPE;
use sha1::{Digest, Sha1};
use url::Url;
@@ -159,12 +160,18 @@ fn fetch_local_asset(base_path: &Path, src: &str) -> Result<Asset> {
}
fn fetch_remote_asset(url: &Url) -> Result<Asset> {
let bytes = reqwest::blocking::get(url.clone())
let response = reqwest::blocking::get(url.clone())
.and_then(|response| response.error_for_status())
.map_err(|source| EbookmError::Request {
url: url.to_string(),
source,
})?
})?;
let content_type = response
.headers()
.get(CONTENT_TYPE)
.and_then(|value| value.to_str().ok())
.map(|value| value.to_string());
let bytes = response
.bytes()
.map_err(|source| EbookmError::Request {
url: url.to_string(),
@@ -172,7 +179,7 @@ fn fetch_remote_asset(url: &Url) -> Result<Asset> {
})?
.to_vec();
let extension = infer_extension_from_str(url.path());
let extension = infer_extension_from_response(url.path(), content_type.as_deref(), &bytes);
let media_type = infer_media_type(&extension);
let digest = Sha1::digest(url.as_str().as_bytes());
let id = format!("{:x}", digest);
@@ -189,7 +196,8 @@ fn build_asset_from_path(path: &Path) -> Result<Asset> {
path: path.display().to_string(),
source,
})?;
let extension = infer_extension_from_path(path);
let path_extension = infer_extension_from_path(path);
let extension = infer_extension_from_bytes(Some(path_extension.as_str()), &bytes);
let media_type = infer_media_type(&extension);
let digest = Sha1::digest(path.display().to_string().as_bytes());
let id = format!("{:x}", digest);
@@ -292,8 +300,8 @@ fn infer_extension_from_path(path: &Path) -> String {
path.extension()
.and_then(|value| value.to_str())
.filter(|value| !value.is_empty())
.unwrap_or("bin")
.to_string()
.map(|value| value.to_ascii_lowercase())
.unwrap_or_else(|| "bin".to_string())
}
fn infer_extension_from_str(path: &str) -> String {
@@ -301,8 +309,59 @@ fn infer_extension_from_str(path: &str) -> String {
.extension()
.and_then(|value| value.to_str())
.filter(|value| !value.is_empty())
.unwrap_or("bin")
.to_string()
.map(|value| value.to_ascii_lowercase())
.unwrap_or_else(|| "bin".to_string())
}
/// Chooses a file extension for a downloaded asset.
///
/// A recognised `Content-Type` header wins outright. Otherwise the decision
/// is delegated to `infer_extension_from_bytes`, with the extension taken
/// from the URL path as its fallback.
fn infer_extension_from_response(path: &str, content_type: Option<&str>, bytes: &[u8]) -> String {
    extension_from_content_type(content_type).unwrap_or_else(|| {
        let from_path = infer_extension_from_str(path);
        infer_extension_from_bytes(Some(from_path.as_str()), bytes)
    })
}
/// Picks an extension from the payload itself, falling back to the path.
///
/// A signature recognised by `sniff_image_extension` takes precedence over
/// `path_extension`; when neither is available the result is `"bin"`.
fn infer_extension_from_bytes(path_extension: Option<&str>, bytes: &[u8]) -> String {
    let chosen: &str = match sniff_image_extension(bytes) {
        Some(sniffed) => sniffed,
        None => path_extension.unwrap_or("bin"),
    };
    chosen.to_string()
}
/// Maps a `Content-Type` header value to a file extension.
///
/// Parameters after the first `;` are ignored, surrounding whitespace is
/// trimmed and the comparison is ASCII case-insensitive. Returns `None` when
/// the header is absent or names a type outside the supported image set.
fn extension_from_content_type(content_type: Option<&str>) -> Option<String> {
    const KNOWN_TYPES: [(&str, &str); 6] = [
        ("image/jpeg", "jpeg"),
        ("image/png", "png"),
        ("image/gif", "gif"),
        ("image/svg+xml", "svg"),
        ("image/webp", "webp"),
        ("image/avif", "avif"),
    ];

    let header = content_type?;
    let mime = match header.split_once(';') {
        Some((essence, _parameters)) => essence,
        None => header,
    }
    .trim();

    KNOWN_TYPES
        .iter()
        .find(|(known, _)| mime.eq_ignore_ascii_case(known))
        .map(|(_, extension)| (*extension).to_string())
}
/// Identifies a few image formats from the leading bytes of a payload.
///
/// Returns `"jpeg"`, `"png"`, `"gif"`, `"webp"` or `"avif"` when the
/// corresponding signature is present, and `None` otherwise (including for
/// empty or truncated input).
fn sniff_image_extension(bytes: &[u8]) -> Option<&'static str> {
    const PNG_SIGNATURE: [u8; 8] = [0x89, b'P', b'N', b'G', 0x0d, 0x0a, 0x1a, 0x0a];

    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("jpeg");
    }
    if bytes.starts_with(&PNG_SIGNATURE) {
        return Some("png");
    }
    if bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a") {
        return Some("gif");
    }
    // RIFF container: the form type sits after the 4-byte chunk size.
    if bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]) {
        return Some("webp");
    }
    if has_avif_brand(bytes) {
        return Some("avif");
    }
    None
}

/// Reports whether `bytes` starts with an ISO-BMFF `ftyp` box that declares
/// the `avif` brand, either as the major brand or as a compatible brand.
///
/// Only the `ftyp` box is examined. Searching the whole payload for the bytes
/// `avif` would misclassify other ISO-BMFF files (HEIC, MP4, ...) that merely
/// contain those bytes somewhere in their data.
fn has_avif_brand(bytes: &[u8]) -> bool {
    if bytes.len() < 12 || &bytes[4..8] != b"ftyp" {
        return false;
    }
    if &bytes[8..12] == b"avif" {
        return true;
    }

    // Box layout: size(4) "ftyp"(4) major_brand(4) minor_version(4) compatible_brands(4 each).
    // u32 -> usize is a widening conversion on 32- and 64-bit targets.
    let declared_size = u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize;
    let box_end = declared_size.min(bytes.len());

    // `get` yields `None` when the box is too short to hold compatible brands
    // (this also covers the special size values 0 and 1, for which only the
    // major brand is considered).
    match bytes.get(16..box_end) {
        Some(compatible) => compatible.chunks_exact(4).any(|brand| brand == b"avif"),
        None => false,
    }
}
fn infer_media_type(extension: &str) -> String {
@@ -312,6 +371,7 @@ fn infer_media_type(extension: &str) -> String {
"gif" => "image/gif",
"svg" => "image/svg+xml",
"webp" => "image/webp",
"avif" => "image/avif",
_ => "application/octet-stream",
}
.to_string()
@@ -329,9 +389,12 @@ fn to_xhtml_fragment(html: &str) -> String {
#[cfg(test)]
mod tests {
use super::to_xhtml_fragment;
use quick_xml::events::Event;
use super::{
extension_from_content_type, infer_extension_from_bytes, infer_extension_from_response,
to_xhtml_fragment,
};
use quick_xml::Reader;
use quick_xml::events::Event;
#[test]
fn converts_void_html_tags_to_xhtml_self_closing_tags() {
@@ -354,4 +417,36 @@ mod tests {
}
}
}
/// With no Content-Type available, a JPEG signature in the payload must win
/// over a misleading `.png` path extension.
#[test]
fn prefers_actual_jpeg_bytes_over_png_path() {
    let jpeg_magic: &[u8] = &[0xff, 0xd8, 0xff, 0xdb];
    let inferred = infer_extension_from_response("image.png", None, jpeg_magic);
    assert_eq!(inferred, "jpeg");
}
/// A recognised Content-Type header decides the extension even when the URL
/// path suggests a different one; header parameters are ignored.
#[test]
fn prefers_content_type_for_remote_assets() {
    let png_signature = [0x89, b'P', b'N', b'G', 0x0d, 0x0a, 0x1a, 0x0a];
    let from_response = infer_extension_from_response(
        "image.jpeg",
        Some("image/png; charset=binary"),
        &png_signature,
    );
    assert_eq!(from_response, "png");

    let from_header = extension_from_content_type(Some("image/avif"));
    assert_eq!(from_header, Some("avif".to_string()));
}
/// When the payload matches no known signature, the extension supplied by
/// the caller is returned untouched.
#[test]
fn preserves_path_extension_when_bytes_are_unknown() {
    let unrecognized: &[u8] = b"not an image";
    let kept = infer_extension_from_bytes(Some("jpg"), unrecognized);
    assert_eq!(kept, "jpg");
}
}
+26 -3
View File
@@ -28,6 +28,7 @@ struct ChapterHeaderOptions {
/// A manifest entry after it has been turned into chapter output, as
/// assembled in `build_epub`.
pub struct BuiltEntry {
/// Entry id; the OPF/NCX builders also use it for the chapter file name
/// (`text/{id}.xhtml`).
pub id: String,
/// Copied from the entry's `toc.hidden` setting.
pub hidden_from_toc: bool,
/// `section-{section id}` when this entry is the first one listed in its
/// section, otherwise `None`. The NCX builder appends it as a `#fragment`
/// to the section's target.
pub section_anchor: Option<String>,
/// Result of `build_chapter` for this entry.
pub chapter: BuiltChapter,
/// Assets taken from the normalized document for this entry.
pub assets: Vec<Asset>,
}
@@ -186,10 +187,23 @@ pub fn build_epub(manifest: &Manifest, manifest_path: &Path) -> Result<()> {
&metadata,
)?;
let section_anchor = section
.entries
.first()
.filter(|first_entry_id| *first_entry_id == entry_id)
.map(|_| format!("section-{}", section.id));
built_entries.push(BuiltEntry {
id: entry_id.clone(),
hidden_from_toc: entry.toc.hidden,
chapter: build_chapter(entry_id, entry, &manifest.defaults, &normalized),
section_anchor: section_anchor.clone(),
chapter: build_chapter(
entry_id,
entry,
&manifest.defaults,
&normalized,
section_anchor.as_deref(),
),
assets: normalized.assets,
});
}
@@ -215,6 +229,7 @@ fn build_chapter(
entry: &crate::manifest::EntryDefinition,
defaults: &crate::manifest::DefaultsConfig,
doc: &NormalizedDocument,
section_anchor: Option<&str>,
) -> BuiltChapter {
let nav_title = entry.toc.title.clone().unwrap_or_else(|| doc.title.clone());
let header = resolve_header_options(entry, defaults);
@@ -243,6 +258,10 @@ fn build_chapter(
format!(r#"<div class="chapter-meta">{}</div>"#, meta_lines.join(""))
};
let section_anchor_markup = section_anchor
.map(|anchor| format!(r#"<div id="{}"></div>"#, escape_html(anchor)))
.unwrap_or_default();
let xhtml = format!(
r#"<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
@@ -251,6 +270,7 @@ fn build_chapter(
<link rel="stylesheet" type="text/css" href="../styles/book.css"/>
</head>
<body id="{}">
{}
<h1>{}</h1>
{}
{}
@@ -258,6 +278,7 @@ fn build_chapter(
</html>"#,
escape_html(&doc.title),
escape_html(entry_id),
section_anchor_markup,
escape_html(&doc.title),
meta_block,
doc.body_xhtml
@@ -425,8 +446,10 @@ link_rules:
assert!(!chapter_contents.contains("urn:uuid:test-book"));
assert!(chapter_contents.contains("../assets/"));
drop(chapter);
assert!(archive
assert!(
archive
.file_names()
.any(|name| name.starts_with("OEBPS/assets/") && name.ends_with(".jpg")));
.any(|name| name.starts_with("OEBPS/assets/") && name.ends_with(".jpg"))
);
}
}