diff --git a/ebookm-core/src/epub.rs b/ebookm-core/src/epub.rs
index b419387..1e9576b 100644
--- a/ebookm-core/src/epub.rs
+++ b/ebookm-core/src/epub.rs
@@ -3,6 +3,7 @@ use std::fs::File;
use std::io::Write;
use std::path::Path;
+use chrono::Utc;
use quick_xml::escape::escape;
use zip::CompressionMethod;
use zip::write::{SimpleFileOptions, ZipWriter};
@@ -181,11 +182,16 @@ fn build_ncx(manifest: &crate::manifest::Manifest, built: &[BuiltEntry]) -> Stri
let section_play_order = play_order;
play_order += 1;
+ let section_target = section_entries[0]
+ .section_anchor
+ .as_ref()
+ .map(|anchor| format!("text/{}.xhtml#{}", section_entries[0].id, anchor))
+ .unwrap_or_else(|| format!("text/{}.xhtml", section_entries[0].id));
let mut child_points = String::new();
for entry in &section_entries {
child_points.push_str(&format!(
"{}",
- escape(&entry.id),
+ escape(&xml_id("nav", &entry.id)),
play_order,
escape(&entry.chapter.nav_title),
escape(&entry.id)
@@ -194,11 +200,11 @@ fn build_ncx(manifest: &crate::manifest::Manifest, built: &[BuiltEntry]) -> Stri
}
nav_points.push_str(&format!(
- "{}{}",
- escape(&section.id),
+ "{}{}",
+ escape(&xml_id("section", &section.id)),
section_play_order,
escape(&section.title),
- escape(&section_entries[0].id),
+ escape(&section_target),
child_points
));
}
@@ -233,14 +239,17 @@ fn build_opf(
for entry in built {
manifest_items.push_str(&format!(
"- ",
- escape(&entry.id),
+ escape(&xml_id("entry", &entry.id)),
escape(&entry.id)
));
- spine_items.push_str(&format!("", escape(&entry.id)));
+ spine_items.push_str(&format!(
+ "",
+ escape(&xml_id("entry", &entry.id))
+ ));
for asset in &entry.assets {
manifest_items.push_str(&format!(
"
- ",
- escape(&asset.id),
+ escape(&xml_id("asset", &asset.id)),
escape(&asset.href),
escape(&asset.media_type)
));
@@ -249,8 +258,9 @@ fn build_opf(
if let Some(cover_href) = cover_href {
manifest_items.push_str(&format!(
- "
- ",
- escape(cover_href)
+ "
- ",
+ escape(cover_href),
+ escape(&media_type_from_href(cover_href))
));
}
@@ -260,6 +270,7 @@ fn build_opf(
.clone()
.unwrap_or_else(|| "Unknown".to_string());
let description = manifest.book.description.clone().unwrap_or_default();
+ let modified = Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string();
format!(
r#"
@@ -269,6 +280,7 @@ fn build_opf(
{}
{}
{}
+ {}
{}
{}
@@ -278,11 +290,42 @@ fn build_opf(
escape(&author),
escape(&manifest.book.language),
escape(&description),
+ escape(&modified),
manifest_items,
spine_items
)
}
+fn xml_id(prefix: &str, value: &str) -> String {
+ let mut id = String::with_capacity(prefix.len() + value.len() + 1);
+ id.push_str(prefix);
+ id.push('-');
+ for ch in value.chars() {
+ if ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | '.') {
+ id.push(ch);
+ } else {
+ id.push('-');
+ }
+ }
+ id
+}
+
+fn media_type_from_href(href: &str) -> String {
+ match href.rsplit('.').next().map(|ext| ext.to_ascii_lowercase()) {
+ Some(extension) => match extension.as_str() {
+ "jpg" | "jpeg" => "image/jpeg",
+ "png" => "image/png",
+ "gif" => "image/gif",
+ "svg" => "image/svg+xml",
+ "webp" => "image/webp",
+ "avif" => "image/avif",
+ _ => "application/octet-stream",
+ }
+ .to_string(),
+ None => "application/octet-stream".to_string(),
+ }
+}
+
const CONTAINER_XML: &str = r#"
diff --git a/ebookm-core/src/extract.rs b/ebookm-core/src/extract.rs
index d2a4ac9..a7ea7e6 100644
--- a/ebookm-core/src/extract.rs
+++ b/ebookm-core/src/extract.rs
@@ -26,25 +26,7 @@ pub struct InspectResult {
pub fn extract_article(loaded: &LoadedSource) -> Result {
let document = Html::parse_document(&loaded.html);
let json_ld = extract_primary_json_ld(&document);
- let title = select_content(
- &document,
- &[
- r#"meta[property="og:title"]"#,
- r#"article .post-title"#,
- ".post-title",
- "h1",
- "title",
- ],
- "content",
- )
- .or_else(|| {
- select_text(
- &document,
- &[r#"article .post-title"#, ".post-title", "h1", "title"],
- )
- })
- .or_else(|| json_ld_string(&json_ld, "headline"))
- .ok_or_else(|| EbookmError::Extraction {
+ let title = extract_title(&document, &json_ld).ok_or_else(|| EbookmError::Extraction {
input: origin_label(&loaded.origin),
})?;
@@ -131,6 +113,20 @@ fn select_content(document: &Html, selectors: &[&str], attr: &str) -> Option<String> {
+fn extract_title(document: &Html, json_ld: &Option<Value>) -> Option<String> {
+ select_text(
+ document,
+ &[
+ "h1.post-title",
+ ".post-title",
+ r#"article .post-title"#,
+ "title",
+ "h1",
+ ],
+ )
+ .or_else(|| json_ld_string(json_ld, "headline"))
+}
+
fn select_text(document: &Html, selectors: &[&str]) -> Option<String> {
selectors.iter().find_map(|selector| {
let selector = Selector::parse(selector).ok()?;
@@ -205,7 +201,11 @@ fn json_ld_string(json_ld: &Option<Value>, key: &str) -> Option<String> {
fn json_ld_author(json_ld: &Option<Value>) -> Option<String> {
let author = json_ld.as_ref()?.get("author")?;
- if let Some(author_name) = author.get(0).and_then(|entry| entry.get("name")).and_then(Value::as_str) {
+ if let Some(author_name) = author
+ .get(0)
+ .and_then(|entry| entry.get("name"))
+ .and_then(Value::as_str)
+ {
return Some(author_name.to_string());
}
if let Some(author_name) = author.get("name").and_then(Value::as_str) {
@@ -255,7 +255,7 @@ mod tests {
};
let extracted = extract_article(&loaded).expect("extract article");
- assert_eq!(extracted.title, "A Contested Island");
+ assert_eq!(extracted.title, "Chapter 1: A Contested Island");
assert_eq!(extracted.author.as_deref(), Some("John Gu"));
assert_eq!(
extracted.published,
diff --git a/ebookm-core/src/normalize.rs b/ebookm-core/src/normalize.rs
index 4bf8d93..905d643 100644
--- a/ebookm-core/src/normalize.rs
+++ b/ebookm-core/src/normalize.rs
@@ -3,6 +3,7 @@ use std::path::Path;
use kuchiki::traits::*;
use regex::Regex;
+use reqwest::header::CONTENT_TYPE;
use sha1::{Digest, Sha1};
use url::Url;
@@ -159,12 +160,18 @@ fn fetch_local_asset(base_path: &Path, src: &str) -> Result {
}
fn fetch_remote_asset(url: &Url) -> Result {
- let bytes = reqwest::blocking::get(url.clone())
+ let response = reqwest::blocking::get(url.clone())
.and_then(|response| response.error_for_status())
.map_err(|source| EbookmError::Request {
url: url.to_string(),
source,
- })?
+ })?;
+ let content_type = response
+ .headers()
+ .get(CONTENT_TYPE)
+ .and_then(|value| value.to_str().ok())
+ .map(|value| value.to_string());
+ let bytes = response
.bytes()
.map_err(|source| EbookmError::Request {
url: url.to_string(),
@@ -172,7 +179,7 @@ fn fetch_remote_asset(url: &Url) -> Result {
})?
.to_vec();
- let extension = infer_extension_from_str(url.path());
+ let extension = infer_extension_from_response(url.path(), content_type.as_deref(), &bytes);
let media_type = infer_media_type(&extension);
let digest = Sha1::digest(url.as_str().as_bytes());
let id = format!("{:x}", digest);
@@ -189,7 +196,8 @@ fn build_asset_from_path(path: &Path) -> Result {
path: path.display().to_string(),
source,
})?;
- let extension = infer_extension_from_path(path);
+ let path_extension = infer_extension_from_path(path);
+ let extension = infer_extension_from_bytes(Some(path_extension.as_str()), &bytes);
let media_type = infer_media_type(&extension);
let digest = Sha1::digest(path.display().to_string().as_bytes());
let id = format!("{:x}", digest);
@@ -292,8 +300,8 @@ fn infer_extension_from_path(path: &Path) -> String {
path.extension()
.and_then(|value| value.to_str())
.filter(|value| !value.is_empty())
- .unwrap_or("bin")
- .to_string()
+ .map(|value| value.to_ascii_lowercase())
+ .unwrap_or_else(|| "bin".to_string())
}
fn infer_extension_from_str(path: &str) -> String {
@@ -301,8 +309,59 @@ fn infer_extension_from_str(path: &str) -> String {
.extension()
.and_then(|value| value.to_str())
.filter(|value| !value.is_empty())
- .unwrap_or("bin")
- .to_string()
+ .map(|value| value.to_ascii_lowercase())
+ .unwrap_or_else(|| "bin".to_string())
+}
+
+fn infer_extension_from_response(path: &str, content_type: Option<&str>, bytes: &[u8]) -> String {
+ let path_extension = infer_extension_from_str(path);
+ if let Some(extension) = extension_from_content_type(content_type) {
+ return extension;
+ }
+ infer_extension_from_bytes(Some(path_extension.as_str()), bytes)
+}
+
+fn infer_extension_from_bytes(path_extension: Option<&str>, bytes: &[u8]) -> String {
+ if let Some(extension) = sniff_image_extension(bytes) {
+ return extension.to_string();
+ }
+ path_extension.unwrap_or("bin").to_string()
+}
+
+fn extension_from_content_type(content_type: Option<&str>) -> Option<String> {
+ let normalized = content_type?.split(';').next()?.trim().to_ascii_lowercase();
+ let extension = match normalized.as_str() {
+ "image/jpeg" => "jpeg",
+ "image/png" => "png",
+ "image/gif" => "gif",
+ "image/svg+xml" => "svg",
+ "image/webp" => "webp",
+ "image/avif" => "avif",
+ _ => return None,
+ };
+ Some(extension.to_string())
+}
+
+fn sniff_image_extension(bytes: &[u8]) -> Option<&'static str> {
+ if bytes.len() >= 3 && bytes.starts_with(&[0xff, 0xd8, 0xff]) {
+ return Some("jpeg");
+ }
+ if bytes.len() >= 8 && bytes.starts_with(&[0x89, b'P', b'N', b'G', 0x0d, 0x0a, 0x1a, 0x0a]) {
+ return Some("png");
+ }
+ if bytes.len() >= 6 && (bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a")) {
+ return Some("gif");
+ }
+ if bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP" {
+ return Some("webp");
+ }
+ if bytes.len() >= 12
+ && &bytes[4..8] == b"ftyp"
+ && bytes.windows(4).any(|window| window == b"avif")
+ {
+ return Some("avif");
+ }
+ None
}
fn infer_media_type(extension: &str) -> String {
@@ -312,6 +371,7 @@ fn infer_media_type(extension: &str) -> String {
"gif" => "image/gif",
"svg" => "image/svg+xml",
"webp" => "image/webp",
+ "avif" => "image/avif",
_ => "application/octet-stream",
}
.to_string()
@@ -329,9 +389,12 @@ fn to_xhtml_fragment(html: &str) -> String {
#[cfg(test)]
mod tests {
- use super::to_xhtml_fragment;
- use quick_xml::events::Event;
+ use super::{
+ extension_from_content_type, infer_extension_from_bytes, infer_extension_from_response,
+ to_xhtml_fragment,
+ };
use quick_xml::Reader;
+ use quick_xml::events::Event;
#[test]
fn converts_void_html_tags_to_xhtml_self_closing_tags() {
@@ -354,4 +417,36 @@ mod tests {
}
}
}
+
+ #[test]
+ fn prefers_actual_jpeg_bytes_over_png_path() {
+ let jpeg_bytes = [0xff, 0xd8, 0xff, 0xdb];
+ assert_eq!(
+ infer_extension_from_response("image.png", None, &jpeg_bytes),
+ "jpeg"
+ );
+ }
+
+ #[test]
+ fn prefers_content_type_for_remote_assets() {
+ let png_bytes = [0x89, b'P', b'N', b'G', 0x0d, 0x0a, 0x1a, 0x0a];
+ assert_eq!(
+ infer_extension_from_response(
+ "image.jpeg",
+ Some("image/png; charset=binary"),
+ &png_bytes
+ ),
+ "png"
+ );
+ assert_eq!(
+ extension_from_content_type(Some("image/avif")),
+ Some("avif".to_string())
+ );
+ }
+
+ #[test]
+ fn preserves_path_extension_when_bytes_are_unknown() {
+ let bytes = b"not an image";
+ assert_eq!(infer_extension_from_bytes(Some("jpg"), bytes), "jpg");
+ }
}
diff --git a/ebookm-core/src/pipeline.rs b/ebookm-core/src/pipeline.rs
index 0a20df2..c6329ae 100644
--- a/ebookm-core/src/pipeline.rs
+++ b/ebookm-core/src/pipeline.rs
@@ -28,6 +28,7 @@ struct ChapterHeaderOptions {
pub struct BuiltEntry {
pub id: String,
pub hidden_from_toc: bool,
+ pub section_anchor: Option<String>,
pub chapter: BuiltChapter,
pub assets: Vec,
}
@@ -186,10 +187,23 @@ pub fn build_epub(manifest: &Manifest, manifest_path: &Path) -> Result<()> {
&metadata,
)?;
+ let section_anchor = section
+ .entries
+ .first()
+ .filter(|first_entry_id| *first_entry_id == entry_id)
+ .map(|_| format!("section-{}", section.id));
+
built_entries.push(BuiltEntry {
id: entry_id.clone(),
hidden_from_toc: entry.toc.hidden,
- chapter: build_chapter(entry_id, entry, &manifest.defaults, &normalized),
+ section_anchor: section_anchor.clone(),
+ chapter: build_chapter(
+ entry_id,
+ entry,
+ &manifest.defaults,
+ &normalized,
+ section_anchor.as_deref(),
+ ),
assets: normalized.assets,
});
}
@@ -215,6 +229,7 @@ fn build_chapter(
entry: &crate::manifest::EntryDefinition,
defaults: &crate::manifest::DefaultsConfig,
doc: &NormalizedDocument,
+ section_anchor: Option<&str>,
) -> BuiltChapter {
let nav_title = entry.toc.title.clone().unwrap_or_else(|| doc.title.clone());
let header = resolve_header_options(entry, defaults);
@@ -243,6 +258,10 @@ fn build_chapter(
format!(r#"
{}
"#, meta_lines.join(""))
};
+ let section_anchor_markup = section_anchor
+ .map(|anchor| format!(r#""#, escape_html(anchor)))
+ .unwrap_or_default();
+
let xhtml = format!(
r#"
@@ -251,6 +270,7 @@ fn build_chapter(
+ {}
{}
{}
{}
@@ -258,6 +278,7 @@ fn build_chapter(
"#,
escape_html(&doc.title),
escape_html(entry_id),
+ section_anchor_markup,
escape_html(&doc.title),
meta_block,
doc.body_xhtml
@@ -425,8 +446,10 @@ link_rules:
assert!(!chapter_contents.contains("urn:uuid:test-book"));
assert!(chapter_contents.contains("../assets/"));
drop(chapter);
- assert!(archive
- .file_names()
- .any(|name| name.starts_with("OEBPS/assets/") && name.ends_with(".jpg")));
+ assert!(
+ archive
+ .file_names()
+ .any(|name| name.starts_with("OEBPS/assets/") && name.ends_with(".jpg"))
+ );
}
}