From 6d0e693ed70f7d92827bd7b3752d532ea2df8efd Mon Sep 17 00:00:00 2001 From: Nick Groenen Date: Sun, 4 Aug 2024 15:16:33 +0200 Subject: [PATCH 1/3] Upgrade to pulldown-cmark 0.11 --- Cargo.lock | 17 ++- Cargo.toml | 4 +- changelog.d/252.breaking.md | 12 ++ src/lib.rs | 115 +++++++++++------- .../main-samples/pure-markdown-examples.md | 2 +- 5 files changed, 101 insertions(+), 49 deletions(-) create mode 100644 changelog.d/252.breaking.md diff --git a/Cargo.lock b/Cargo.lock index e6ed408..a8db518 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -456,21 +456,28 @@ dependencies = [ [[package]] name = "pulldown-cmark" -version = "0.9.6" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57206b407293d2bcd3af849ce869d52068623f19e1b5ff8e8778e3309439682b" +checksum = "8746739f11d39ce5ad5c2520a9b75285310dbfe78c541ccf832d38615765aec0" dependencies = [ "bitflags 2.6.0", "getopts", "memchr", + "pulldown-cmark-escape", "unicase", ] [[package]] -name = "pulldown-cmark-to-cmark" -version = "11.2.0" +name = "pulldown-cmark-escape" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd464f32d7631035e849fcd969a603e9bb17ceaebe8467352a7728147f34e42" +checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae" + +[[package]] +name = "pulldown-cmark-to-cmark" +version = "15.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9c77db841443d89a57ae94f22d29c022f6d9f41b00bddbf1f4024dbaf4bdce1" dependencies = [ "pulldown-cmark", ] diff --git a/Cargo.toml b/Cargo.toml index 98234ab..1109e75 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,8 +31,8 @@ ignore = "0.4.22" matter = "0.1.0-alpha4" pathdiff = "0.2.1" percent-encoding = "2.3.1" -pulldown-cmark = "0.9.3" -pulldown-cmark-to-cmark = "11.0.2" +pulldown-cmark = "0.11.0" +pulldown-cmark-to-cmark = "15.0.0" rayon = "1.10.0" regex = "1.10.5" serde_yaml = "0.9.34" diff --git a/changelog.d/252.breaking.md b/changelog.d/252.breaking.md new file mode 100644 index 0000000..1e1fe55 --- /dev/null +++ b/changelog.d/252.breaking.md @@ -0,0 +1,12 @@ +Upgrade [pulldown-cmark](https://crates.io/crates/pulldown-cmark) from 0.9 to 0.11 + +pulldown-cmark is the Markdown/CommonMark parser that is used to read and convert notes (together with [pulldown-cmark-to-cmark](https://crates.io/crates/pulldown-cmark-to-cmark)). + +For end-users that call the obsidian-export CLI this upgrade will be mostly transparent, except that Math blocks are now properly processed without getting mangled. + +People who use the library directly may face more significant breaking changes if they have custom postprocessors, as pulldown-cmark's events have gone through various breaking changes. +For more information, see: + +- +- +- diff --git a/src/lib.rs b/src/lib.rs index 6a2d821..7d3e570 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,7 +19,7 @@ use frontmatter::{frontmatter_from_str, frontmatter_to_str}; pub use frontmatter::{Frontmatter, FrontmatterStrategy}; use pathdiff::diff_paths; use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; -use pulldown_cmark::{CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag}; +use pulldown_cmark::{CodeBlockKind, CowStr, Event, HeadingLevel, Options, Parser, Tag, TagEnd}; use pulldown_cmark_to_cmark::cmark_with_options; use rayon::prelude::*; use references::{ObsidianNoteReference, RefParser, RefParserState, RefType}; @@ -662,20 +662,18 @@ impl<'a> Exporter<'a> { // into an image reference instead. Slightly hacky, but avoids needing // to keep another utility function around for this, or introducing an // extra parameter on make_link_to_file. - Event::Start(Tag::Link(linktype, cowstr1, cowstr2)) => { - Event::Start(Tag::Image( - linktype, - CowStr::from(cowstr1.into_string()), - CowStr::from(cowstr2.into_string()), - )) - } - Event::End(Tag::Link(linktype, cowstr1, cowstr2)) => { - Event::End(Tag::Image( - linktype, - CowStr::from(cowstr1.into_string()), - CowStr::from(cowstr2.into_string()), - )) - } + Event::Start(Tag::Link { + link_type, + dest_url, + title, + id, + }) => Event::Start(Tag::Image { + link_type, + dest_url: CowStr::from(dest_url.into_string()), + title: CowStr::from(title.into_string()), + id: CowStr::from(id.into_string()), + }), + Event::End(TagEnd::Link) => Event::End(TagEnd::Image), _ => event, }) .collect() @@ -707,7 +705,7 @@ impl<'a> Exporter<'a> { return vec![ Event::Start(Tag::Emphasis), Event::Text(CowStr::from(reference.display())), - Event::End(Tag::Emphasis), + Event::End(TagEnd::Emphasis), ]; } let target_file = target_file.unwrap(); @@ -731,16 +729,17 @@ impl<'a> Exporter<'a> { link.push_str(&slugify(section)); } - let link_tag = Tag::Link( - pulldown_cmark::LinkType::Inline, - CowStr::from(link), - CowStr::from(""), - ); + let link_tag = Tag::Link { + link_type: pulldown_cmark::LinkType::Inline, + dest_url: CowStr::from(link), + title: CowStr::from(""), + id: CowStr::from(""), + }; vec![ - Event::Start(link_tag.clone()), + Event::Start(link_tag), Event::Text(CowStr::from(reference.display())), - Event::End(link_tag.clone()), + Event::End(TagEnd::Link), ] } } @@ -841,8 +840,7 @@ fn reduce_to_section<'a>(events: MarkdownEvents<'a>, section: &str) -> MarkdownE for event in events { filtered_events.push(event.clone()); match event { - // FIXME: This should propagate fragment_identifier and classes. - Event::Start(Tag::Heading(level, _fragment_identifier, _classes)) => { + Event::Start(Tag::Heading { level, .. }) => { last_tag_was_heading = true; last_level = level; if currently_in_target_section && level <= section_level { @@ -881,10 +879,11 @@ fn reduce_to_section<'a>(events: MarkdownEvents<'a>, section: &str) -> MarkdownE fn event_to_owned<'a>(event: Event<'_>) -> Event<'a> { match event { Event::Start(tag) => Event::Start(tag_to_owned(tag)), - Event::End(tag) => Event::End(tag_to_owned(tag)), + Event::End(tag) => Event::End(tag), Event::Text(cowstr) => Event::Text(CowStr::from(cowstr.into_string())), Event::Code(cowstr) => Event::Code(CowStr::from(cowstr.into_string())), Event::Html(cowstr) => Event::Html(CowStr::from(cowstr.into_string())), + Event::InlineHtml(cowstr) => Event::InlineHtml(CowStr::from(cowstr.into_string())), Event::FootnoteReference(cowstr) => { Event::FootnoteReference(CowStr::from(cowstr.into_string())) } @@ -892,17 +891,37 @@ fn event_to_owned<'a>(event: Event<'_>) -> Event<'a> { Event::HardBreak => Event::HardBreak, Event::Rule => Event::Rule, Event::TaskListMarker(checked) => Event::TaskListMarker(checked), + Event::InlineMath(cowstr) => Event::InlineMath(CowStr::from(cowstr.into_string())), + Event::DisplayMath(cowstr) => Event::DisplayMath(CowStr::from(cowstr.into_string())), } } fn tag_to_owned<'a>(tag: Tag<'_>) -> Tag<'a> { match tag { Tag::Paragraph => Tag::Paragraph, - Tag::Heading(level, _fragment_identifier, _classes) => { - // FIXME: This should propagate fragment_identifier and classes. - Tag::Heading(level, None, Vec::new()) - } - Tag::BlockQuote => Tag::BlockQuote, + Tag::Heading { + level: heading_level, + id, + classes, + attrs, + } => Tag::Heading { + level: heading_level, + id: id.map(|cowstr| CowStr::from(cowstr.into_string())), + classes: classes + .into_iter() + .map(|cowstr| CowStr::from(cowstr.into_string())) + .collect(), + attrs: attrs + .into_iter() + .map(|(attr, value)| { + ( + CowStr::from(attr.into_string()), + value.map(|cowstr| CowStr::from(cowstr.into_string())), + ) + }) + .collect(), + }, + Tag::BlockQuote(blockquote_kind) => Tag::BlockQuote(blockquote_kind), Tag::CodeBlock(codeblock_kind) => Tag::CodeBlock(codeblock_kind_to_owned(codeblock_kind)), Tag::List(optional) => Tag::List(optional), Tag::Item => Tag::Item, @@ -916,16 +935,30 @@ fn tag_to_owned<'a>(tag: Tag<'_>) -> Tag<'a> { Tag::Emphasis => Tag::Emphasis, Tag::Strong => Tag::Strong, Tag::Strikethrough => Tag::Strikethrough, - Tag::Link(linktype, cowstr1, cowstr2) => Tag::Link( - linktype, - CowStr::from(cowstr1.into_string()), - CowStr::from(cowstr2.into_string()), - ), - Tag::Image(linktype, cowstr1, cowstr2) => Tag::Image( - linktype, - CowStr::from(cowstr1.into_string()), - CowStr::from(cowstr2.into_string()), - ), + Tag::Link { + link_type, + dest_url, + title, + id, + } => Tag::Link { + link_type, + dest_url: CowStr::from(dest_url.into_string()), + title: CowStr::from(title.into_string()), + id: CowStr::from(id.into_string()), + }, + Tag::Image { + link_type, + dest_url, + title, + id, + } => Tag::Image { + link_type, + dest_url: CowStr::from(dest_url.into_string()), + title: CowStr::from(title.into_string()), + id: CowStr::from(id.into_string()), + }, + Tag::HtmlBlock => Tag::HtmlBlock, + Tag::MetadataBlock(metadata_block_kind) => Tag::MetadataBlock(metadata_block_kind), } } diff --git a/tests/testdata/expected/main-samples/pure-markdown-examples.md b/tests/testdata/expected/main-samples/pure-markdown-examples.md index 0d94b45..54c7298 100644 --- a/tests/testdata/expected/main-samples/pure-markdown-examples.md +++ b/tests/testdata/expected/main-samples/pure-markdown-examples.md @@ -37,7 +37,7 @@ ~~Strikethrough~~ |Table|| -|-----|--| +|-----|-| |Foo|Bar| [link text](link-location.md) From e2ef435f04d0525dc2cfd83e51a6b41840e87f05 Mon Sep 17 00:00:00 2001 From: Nick Groenen Date: Sun, 4 Aug 2024 20:38:45 +0200 Subject: [PATCH 2/3] Enable math extension This ensures LaTeX/MathJax blocks don't get mangled as seen in https://github.com/zoni/obsidian-export/issues/14. --- changelog.d/14.breaking.md | 1 + changelog.d/14.fix.md | 4 ++++ changelog.d/252.fix.md | 1 + changelog.d/259.breaking.md | 1 + src/lib.rs | 10 +++++----- tests/testdata/expected/main-samples/math.md | 11 +++++++++++ tests/testdata/input/main-samples/math.md | 10 ++++++++++ 7 files changed, 33 insertions(+), 5 deletions(-) create mode 120000 changelog.d/14.breaking.md create mode 100644 changelog.d/14.fix.md create mode 120000 changelog.d/252.fix.md create mode 120000 changelog.d/259.breaking.md create mode 100644 tests/testdata/expected/main-samples/math.md create mode 100644 tests/testdata/input/main-samples/math.md diff --git a/changelog.d/14.breaking.md b/changelog.d/14.breaking.md new file mode 120000 index 0000000..1748068 --- /dev/null +++ b/changelog.d/14.breaking.md @@ -0,0 +1 @@ +252.breaking.md \ No newline at end of file diff --git a/changelog.d/14.fix.md b/changelog.d/14.fix.md new file mode 100644 index 0000000..6779103 --- /dev/null +++ b/changelog.d/14.fix.md @@ -0,0 +1,4 @@ +Don't escape square brackets in math expressions + +The upgrade to [pulldown-cmark](https://crates.io/crates/pulldown-cmark) 0.11 (see Backwards-incompatible Changes) includes official support for LaTeX-style math expressions. +With the markdown parser supporting this syntax natively, math expressions are now processed correctly without edge-cases. diff --git a/changelog.d/252.fix.md b/changelog.d/252.fix.md new file mode 120000 index 0000000..56ead1e --- /dev/null +++ b/changelog.d/252.fix.md @@ -0,0 +1 @@ +14.fix.md \ No newline at end of file diff --git a/changelog.d/259.breaking.md b/changelog.d/259.breaking.md new file mode 120000 index 0000000..1748068 --- /dev/null +++ b/changelog.d/259.breaking.md @@ -0,0 +1 @@ +252.breaking.md \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 7d3e570..bab9cc2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -483,11 +483,11 @@ impl<'a> Exporter<'a> { let frontmatter = frontmatter_from_str(&frontmatter).context(FrontMatterDecodeSnafu { path })?; - let mut parser_options = Options::empty(); - parser_options.insert(Options::ENABLE_TABLES); - parser_options.insert(Options::ENABLE_FOOTNOTES); - parser_options.insert(Options::ENABLE_STRIKETHROUGH); - parser_options.insert(Options::ENABLE_TASKLISTS); + let parser_options = Options::ENABLE_TABLES + | Options::ENABLE_FOOTNOTES + | Options::ENABLE_STRIKETHROUGH + | Options::ENABLE_TASKLISTS + | Options::ENABLE_MATH; let mut ref_parser = RefParser::new(); let mut events = vec![]; diff --git a/tests/testdata/expected/main-samples/math.md b/tests/testdata/expected/main-samples/math.md new file mode 100644 index 0000000..5da3a9c --- /dev/null +++ b/tests/testdata/expected/main-samples/math.md @@ -0,0 +1,11 @@ +This sentence uses `$` delimiters to show math inline: $\sqrt{3x-1}+(1+x)^2$ + +This is the same math expression expressed as a block element: +$$\sqrt{3x-1}+(1+x)^2$$ + + + +With square brackets (inline): $[0, 2\pi]$ + +With square brackets (block): +$$[0, 2\pi]$$ diff --git a/tests/testdata/input/main-samples/math.md b/tests/testdata/input/main-samples/math.md new file mode 100644 index 0000000..de6923e --- /dev/null +++ b/tests/testdata/input/main-samples/math.md @@ -0,0 +1,10 @@ +This sentence uses `$` delimiters to show math inline: $\sqrt{3x-1}+(1+x)^2$ + +This is the same math expression expressed as a block element: +$$\sqrt{3x-1}+(1+x)^2$$ + + +With square brackets (inline): $[0, 2\pi]$ + +With square brackets (block): +$$[0, 2\pi]$$ From 3afab84d697d82df1d4dfcc1948dd72dcdbe4c56 Mon Sep 17 00:00:00 2001 From: Nick Groenen Date: Sun, 4 Aug 2024 23:58:15 +0200 Subject: [PATCH 3/3] Get frontmatter directly from pulldown-cmark pulldown-cmark 0.10.0 introduced metadata blocks in pulldown-cmark/pulldown-cmark#641. Using this we can drop the dependency on matter and grab the frontmatter as we're consuming markdown events from the note being parsed, instead of having to extract the frontmatter separately. --- Cargo.lock | 17 ----------------- Cargo.toml | 1 - src/lib.rs | 33 ++++++++++++++++++++++++++------- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a8db518..7416b7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -331,12 +331,6 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "libc" version = "0.2.155" @@ -355,16 +349,6 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" -[[package]] -name = "matter" -version = "0.1.0-alpha4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc16e839c57e0ad77957c42d39baab3692a1c6fa47692066470cddc24a5b0cd0" -dependencies = [ - "lazy_static", - "regex", -] - [[package]] name = "memchr" version = "2.7.4" @@ -379,7 +363,6 @@ dependencies = [ "filetime", "gumdrop", "ignore", - "matter", "pathdiff", "percent-encoding", "pretty_assertions", diff --git a/Cargo.toml b/Cargo.toml index 1109e75..1d23138 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,6 @@ doc = false eyre = "0.6.12" gumdrop = "0.8.1" ignore = "0.4.22" -matter = "0.1.0-alpha4" pathdiff = "0.2.1" percent-encoding = "2.3.1" pulldown-cmark = "0.11.0" diff --git a/src/lib.rs b/src/lib.rs index bab9cc2..3dfbdbf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -467,6 +467,7 @@ impl<'a> Exporter<'a> { #[allow(clippy::too_many_lines)] #[allow(clippy::panic_in_result_fn)] + #[allow(clippy::shadow_unrelated)] fn parse_obsidian_note<'b>( &self, path: &Path, @@ -478,23 +479,40 @@ impl<'a> Exporter<'a> { }); } let content = fs::read_to_string(path).context(ReadSnafu { path })?; - let (frontmatter, content) = - matter::matter(&content).unwrap_or((String::new(), content.clone())); - let frontmatter = - frontmatter_from_str(&frontmatter).context(FrontMatterDecodeSnafu { path })?; + let mut frontmatter = String::new(); let parser_options = Options::ENABLE_TABLES | Options::ENABLE_FOOTNOTES | Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TASKLISTS - | Options::ENABLE_MATH; + | Options::ENABLE_MATH + | Options::ENABLE_YAML_STYLE_METADATA_BLOCKS; let mut ref_parser = RefParser::new(); let mut events = vec![]; // Most of the time, a reference triggers 5 events: [ or ![, [, , ], ] let mut buffer = Vec::with_capacity(5); - for event in Parser::new_ext(&content, parser_options) { + let mut parser = Parser::new_ext(&content, parser_options); + 'outer: while let Some(event) = parser.next() { + // When encountering a metadata block (frontmatter), collect all events until getting + // to the end of the block, at which point the nested loop will break out to the outer + // loop again. + if matches!(event, Event::Start(Tag::MetadataBlock(_kind))) { + for event in parser.by_ref() { + match event { + Event::Text(cowstr) => frontmatter.push_str(&cowstr), + Event::End(TagEnd::MetadataBlock(_kind)) => { + continue 'outer; + }, + _ => panic!( + "Encountered an unexpected event while processing frontmatter in {}. Please report this as a bug with a copy of the note contents and this text: \n\nEvent: {:?}\n", + path.display(), + event + ), + } + } + } if ref_parser.state == RefParserState::Resetting { events.append(&mut buffer); buffer.clear(); @@ -583,8 +601,9 @@ impl<'a> Exporter<'a> { if !buffer.is_empty() { events.append(&mut buffer); } + Ok(( - frontmatter, + frontmatter_from_str(&frontmatter).context(FrontMatterDecodeSnafu { path })?, events.into_iter().map(event_to_owned).collect(), )) }