diff --git a/ebookm-core/src/epub.rs b/ebookm-core/src/epub.rs index b419387..1e9576b 100644 --- a/ebookm-core/src/epub.rs +++ b/ebookm-core/src/epub.rs @@ -3,6 +3,7 @@ use std::fs::File; use std::io::Write; use std::path::Path; +use chrono::Utc; use quick_xml::escape::escape; use zip::CompressionMethod; use zip::write::{SimpleFileOptions, ZipWriter}; @@ -181,11 +182,16 @@ fn build_ncx(manifest: &crate::manifest::Manifest, built: &[BuiltEntry]) -> Stri let section_play_order = play_order; play_order += 1; + let section_target = section_entries[0] + .section_anchor + .as_ref() + .map(|anchor| format!("text/{}.xhtml#{}", section_entries[0].id, anchor)) + .unwrap_or_else(|| format!("text/{}.xhtml", section_entries[0].id)); let mut child_points = String::new(); for entry in §ion_entries { child_points.push_str(&format!( "{}", - escape(&entry.id), + escape(&xml_id("nav", &entry.id)), play_order, escape(&entry.chapter.nav_title), escape(&entry.id) @@ -194,11 +200,11 @@ fn build_ncx(manifest: &crate::manifest::Manifest, built: &[BuiltEntry]) -> Stri } nav_points.push_str(&format!( - "{}{}", - escape(§ion.id), + "{}{}", + escape(&xml_id("section", §ion.id)), section_play_order, escape(§ion.title), - escape(§ion_entries[0].id), + escape(§ion_target), child_points )); } @@ -233,14 +239,17 @@ fn build_opf( for entry in built { manifest_items.push_str(&format!( "", - escape(&entry.id), + escape(&xml_id("entry", &entry.id)), escape(&entry.id) )); - spine_items.push_str(&format!("", escape(&entry.id))); + spine_items.push_str(&format!( + "", + escape(&xml_id("entry", &entry.id)) + )); for asset in &entry.assets { manifest_items.push_str(&format!( "", - escape(&asset.id), + escape(&xml_id("asset", &asset.id)), escape(&asset.href), escape(&asset.media_type) )); @@ -249,8 +258,9 @@ fn build_opf( if let Some(cover_href) = cover_href { manifest_items.push_str(&format!( - "", - escape(cover_href) + "", + escape(cover_href), + escape(&media_type_from_href(cover_href)) )); } @@ -260,6 +270,7 @@ fn build_opf( .clone() .unwrap_or_else(|| "Unknown".to_string()); let description = manifest.book.description.clone().unwrap_or_default(); + let modified = Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string(); format!( r#" @@ -269,6 +280,7 @@ fn build_opf( {} {} {} + {} {} {} @@ -278,11 +290,42 @@ fn build_opf( escape(&author), escape(&manifest.book.language), escape(&description), + escape(&modified), manifest_items, spine_items ) } +fn xml_id(prefix: &str, value: &str) -> String { + let mut id = String::with_capacity(prefix.len() + value.len() + 1); + id.push_str(prefix); + id.push('-'); + for ch in value.chars() { + if ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | '.') { + id.push(ch); + } else { + id.push('-'); + } + } + id +} + +fn media_type_from_href(href: &str) -> String { + match href.rsplit('.').next().map(|ext| ext.to_ascii_lowercase()) { + Some(extension) => match extension.as_str() { + "jpg" | "jpeg" => "image/jpeg", + "png" => "image/png", + "gif" => "image/gif", + "svg" => "image/svg+xml", + "webp" => "image/webp", + "avif" => "image/avif", + _ => "application/octet-stream", + } + .to_string(), + None => "application/octet-stream".to_string(), + } +} + const CONTAINER_XML: &str = r#" diff --git a/ebookm-core/src/extract.rs b/ebookm-core/src/extract.rs index d2a4ac9..a7ea7e6 100644 --- a/ebookm-core/src/extract.rs +++ b/ebookm-core/src/extract.rs @@ -26,25 +26,7 @@ pub struct InspectResult { pub fn extract_article(loaded: &LoadedSource) -> Result { let document = Html::parse_document(&loaded.html); let json_ld = extract_primary_json_ld(&document); - let title = select_content( - &document, - &[ - r#"meta[property="og:title"]"#, - r#"article .post-title"#, - ".post-title", - "h1", - "title", - ], - "content", - ) - .or_else(|| { - select_text( - &document, - &[r#"article .post-title"#, ".post-title", "h1", "title"], - ) - }) - .or_else(|| json_ld_string(&json_ld, "headline")) - .ok_or_else(|| EbookmError::Extraction { + let title = extract_title(&document, &json_ld).ok_or_else(|| EbookmError::Extraction { input: origin_label(&loaded.origin), })?; @@ -131,6 +113,20 @@ fn select_content(document: &Html, selectors: &[&str], attr: &str) -> Option) -> Option { + select_text( + document, + &[ + "h1.post-title", + ".post-title", + r#"article .post-title"#, + "title", + "h1", + ], + ) + .or_else(|| json_ld_string(json_ld, "headline")) +} + fn select_text(document: &Html, selectors: &[&str]) -> Option { selectors.iter().find_map(|selector| { let selector = Selector::parse(selector).ok()?; @@ -205,7 +201,11 @@ fn json_ld_string(json_ld: &Option, key: &str) -> Option { fn json_ld_author(json_ld: &Option) -> Option { let author = json_ld.as_ref()?.get("author")?; - if let Some(author_name) = author.get(0).and_then(|entry| entry.get("name")).and_then(Value::as_str) { + if let Some(author_name) = author + .get(0) + .and_then(|entry| entry.get("name")) + .and_then(Value::as_str) + { return Some(author_name.to_string()); } if let Some(author_name) = author.get("name").and_then(Value::as_str) { @@ -255,7 +255,7 @@ mod tests { }; let extracted = extract_article(&loaded).expect("extract article"); - assert_eq!(extracted.title, "A Contested Island"); + assert_eq!(extracted.title, "Chapter 1: A Contested Island"); assert_eq!(extracted.author.as_deref(), Some("John Gu")); assert_eq!( extracted.published, diff --git a/ebookm-core/src/normalize.rs b/ebookm-core/src/normalize.rs index 4bf8d93..905d643 100644 --- a/ebookm-core/src/normalize.rs +++ b/ebookm-core/src/normalize.rs @@ -3,6 +3,7 @@ use std::path::Path; use kuchiki::traits::*; use regex::Regex; +use reqwest::header::CONTENT_TYPE; use sha1::{Digest, Sha1}; use url::Url; @@ -159,12 +160,18 @@ fn fetch_local_asset(base_path: &Path, src: &str) -> Result { } fn fetch_remote_asset(url: &Url) -> Result { - let bytes = reqwest::blocking::get(url.clone()) + let response = reqwest::blocking::get(url.clone()) .and_then(|response| response.error_for_status()) .map_err(|source| EbookmError::Request { url: url.to_string(), source, - })? + })?; + let content_type = response + .headers() + .get(CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .map(|value| value.to_string()); + let bytes = response .bytes() .map_err(|source| EbookmError::Request { url: url.to_string(), @@ -172,7 +179,7 @@ fn fetch_remote_asset(url: &Url) -> Result { })? .to_vec(); - let extension = infer_extension_from_str(url.path()); + let extension = infer_extension_from_response(url.path(), content_type.as_deref(), &bytes); let media_type = infer_media_type(&extension); let digest = Sha1::digest(url.as_str().as_bytes()); let id = format!("{:x}", digest); @@ -189,7 +196,8 @@ fn build_asset_from_path(path: &Path) -> Result { path: path.display().to_string(), source, })?; - let extension = infer_extension_from_path(path); + let path_extension = infer_extension_from_path(path); + let extension = infer_extension_from_bytes(Some(path_extension.as_str()), &bytes); let media_type = infer_media_type(&extension); let digest = Sha1::digest(path.display().to_string().as_bytes()); let id = format!("{:x}", digest); @@ -292,8 +300,8 @@ fn infer_extension_from_path(path: &Path) -> String { path.extension() .and_then(|value| value.to_str()) .filter(|value| !value.is_empty()) - .unwrap_or("bin") - .to_string() + .map(|value| value.to_ascii_lowercase()) + .unwrap_or_else(|| "bin".to_string()) } fn infer_extension_from_str(path: &str) -> String { @@ -301,8 +309,59 @@ fn infer_extension_from_str(path: &str) -> String { .extension() .and_then(|value| value.to_str()) .filter(|value| !value.is_empty()) - .unwrap_or("bin") - .to_string() + .map(|value| value.to_ascii_lowercase()) + .unwrap_or_else(|| "bin".to_string()) +} + +fn infer_extension_from_response(path: &str, content_type: Option<&str>, bytes: &[u8]) -> String { + let path_extension = infer_extension_from_str(path); + if let Some(extension) = extension_from_content_type(content_type) { + return extension; + } + infer_extension_from_bytes(Some(path_extension.as_str()), bytes) +} + +fn infer_extension_from_bytes(path_extension: Option<&str>, bytes: &[u8]) -> String { + if let Some(extension) = sniff_image_extension(bytes) { + return extension.to_string(); + } + path_extension.unwrap_or("bin").to_string() +} + +fn extension_from_content_type(content_type: Option<&str>) -> Option { + let normalized = content_type?.split(';').next()?.trim().to_ascii_lowercase(); + let extension = match normalized.as_str() { + "image/jpeg" => "jpeg", + "image/png" => "png", + "image/gif" => "gif", + "image/svg+xml" => "svg", + "image/webp" => "webp", + "image/avif" => "avif", + _ => return None, + }; + Some(extension.to_string()) +} + +fn sniff_image_extension(bytes: &[u8]) -> Option<&'static str> { + if bytes.len() >= 3 && bytes.starts_with(&[0xff, 0xd8, 0xff]) { + return Some("jpeg"); + } + if bytes.len() >= 8 && bytes.starts_with(&[0x89, b'P', b'N', b'G', 0x0d, 0x0a, 0x1a, 0x0a]) { + return Some("png"); + } + if bytes.len() >= 6 && (bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a")) { + return Some("gif"); + } + if bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP" { + return Some("webp"); + } + if bytes.len() >= 12 + && &bytes[4..8] == b"ftyp" + && bytes.windows(4).any(|window| window == b"avif") + { + return Some("avif"); + } + None } fn infer_media_type(extension: &str) -> String { @@ -312,6 +371,7 @@ fn infer_media_type(extension: &str) -> String { "gif" => "image/gif", "svg" => "image/svg+xml", "webp" => "image/webp", + "avif" => "image/avif", _ => "application/octet-stream", } .to_string() @@ -329,9 +389,12 @@ fn to_xhtml_fragment(html: &str) -> String { #[cfg(test)] mod tests { - use super::to_xhtml_fragment; - use quick_xml::events::Event; + use super::{ + extension_from_content_type, infer_extension_from_bytes, infer_extension_from_response, + to_xhtml_fragment, + }; use quick_xml::Reader; + use quick_xml::events::Event; #[test] fn converts_void_html_tags_to_xhtml_self_closing_tags() { @@ -354,4 +417,36 @@ mod tests { } } } + + #[test] + fn prefers_actual_jpeg_bytes_over_png_path() { + let jpeg_bytes = [0xff, 0xd8, 0xff, 0xdb]; + assert_eq!( + infer_extension_from_response("image.png", None, &jpeg_bytes), + "jpeg" + ); + } + + #[test] + fn prefers_content_type_for_remote_assets() { + let png_bytes = [0x89, b'P', b'N', b'G', 0x0d, 0x0a, 0x1a, 0x0a]; + assert_eq!( + infer_extension_from_response( + "image.jpeg", + Some("image/png; charset=binary"), + &png_bytes + ), + "png" + ); + assert_eq!( + extension_from_content_type(Some("image/avif")), + Some("avif".to_string()) + ); + } + + #[test] + fn preserves_path_extension_when_bytes_are_unknown() { + let bytes = b"not an image"; + assert_eq!(infer_extension_from_bytes(Some("jpg"), bytes), "jpg"); + } } diff --git a/ebookm-core/src/pipeline.rs b/ebookm-core/src/pipeline.rs index 0a20df2..c6329ae 100644 --- a/ebookm-core/src/pipeline.rs +++ b/ebookm-core/src/pipeline.rs @@ -28,6 +28,7 @@ struct ChapterHeaderOptions { pub struct BuiltEntry { pub id: String, pub hidden_from_toc: bool, + pub section_anchor: Option, pub chapter: BuiltChapter, pub assets: Vec, } @@ -186,10 +187,23 @@ pub fn build_epub(manifest: &Manifest, manifest_path: &Path) -> Result<()> { &metadata, )?; + let section_anchor = section + .entries + .first() + .filter(|first_entry_id| *first_entry_id == entry_id) + .map(|_| format!("section-{}", section.id)); + built_entries.push(BuiltEntry { id: entry_id.clone(), hidden_from_toc: entry.toc.hidden, - chapter: build_chapter(entry_id, entry, &manifest.defaults, &normalized), + section_anchor: section_anchor.clone(), + chapter: build_chapter( + entry_id, + entry, + &manifest.defaults, + &normalized, + section_anchor.as_deref(), + ), assets: normalized.assets, }); } @@ -215,6 +229,7 @@ fn build_chapter( entry: &crate::manifest::EntryDefinition, defaults: &crate::manifest::DefaultsConfig, doc: &NormalizedDocument, + section_anchor: Option<&str>, ) -> BuiltChapter { let nav_title = entry.toc.title.clone().unwrap_or_else(|| doc.title.clone()); let header = resolve_header_options(entry, defaults); @@ -243,6 +258,10 @@ fn build_chapter( format!(r#"
{}
"#, meta_lines.join("")) }; + let section_anchor_markup = section_anchor + .map(|anchor| format!(r#"
"#, escape_html(anchor))) + .unwrap_or_default(); + let xhtml = format!( r#" @@ -251,6 +270,7 @@ fn build_chapter( + {}

{}

{} {} @@ -258,6 +278,7 @@ fn build_chapter( "#, escape_html(&doc.title), escape_html(entry_id), + section_anchor_markup, escape_html(&doc.title), meta_block, doc.body_xhtml @@ -425,8 +446,10 @@ link_rules: assert!(!chapter_contents.contains("urn:uuid:test-book")); assert!(chapter_contents.contains("../assets/")); drop(chapter); - assert!(archive - .file_names() - .any(|name| name.starts_with("OEBPS/assets/") && name.ends_with(".jpg"))); + assert!( + archive + .file_names() + .any(|name| name.starts_with("OEBPS/assets/") && name.ends_with(".jpg")) + ); } }