more grammar fuckery and some implementation of markdown parser

This commit is contained in:
Pagwin 2025-08-12 00:03:11 -04:00
parent 7c13bdc4af
commit 94d7047534
No known key found for this signature in database
GPG key ID: 81137023740CA260
2 changed files with 127 additions and 94 deletions

View file

@ -37,14 +37,23 @@ htmlBlock = pure $ HTML $ Raw ""
paragraph :: Parser Element paragraph :: Parser Element
paragraph = do paragraph = do
first_text <- inlineText first <- paragraphLine
rem_text <- many (endOfLine >> inlineText) rem <- many paragraphContinuation
pure $ Paragraph $ P [] pure $ Paragraph $ P []
inlineText :: Parser InlineText paragraphLine :: Parser [InlineText]
inlineText = choi paragraphLine = many inlineText <* endOfLine
blankline :: Parser Element paragraphContinuation :: Parser [InlineText]
blankline = do paragraphContinuation = notFollowedBy blockElemStart *> paragraphLine
inlineText :: Parser InlineText
inlineText = choice [emphasis, strong, inlineCode, link, image, inlineHTML, paragraphLineBreak, escapedChar, plainText]
plainText :: Parser Text
plainText =
blankLine :: Parser Element
blankLine = do
endOfLine endOfLine
pure $ BlankLine BL pure $ BlankLine BL

View file

@ -1,135 +1,158 @@
; Left-Factored ABNF Grammar for Markdown with Embedded HTML ; Left-Factored ABNF Grammar for Markdown with Embedded HTML
; Based on RFC 5234 (ABNF) syntax ; Based on RFC 5234 (ABNF) syntax - no negative lookahead operators
; Top-level document structure ; Top-level document structure
document = *( block-element / CRLF ) document = *( block-element / CRLF )
; Block-level elements (left-factored by common prefixes) ; Block-level elements (true left-factoring with unique prefixes)
block-element = heading / code-block / blockquote / list / horizontal-rule / block-element = heading-block /
html-block / paragraph blockquote-block /
unordered-list-block /
ordered-list-block /
fenced-code-block /
indented-code-block /
horizontal-rule-block /
html-block /
paragraph-block
; Headings (ATX-style: # ## ### etc.) ; Headings (must start with #)
heading = heading-prefix heading-content heading-block = "#" heading-rest
heading-prefix = 1*6"#" *WSP heading-rest = [ 1*5"#" ] *WSP *( inline-element ) line-ending
heading-content = *( inline-element ) line-ending
; Code blocks (fenced with ``` or indented) ; Blockquotes (must start with >)
code-block = fenced-code-block / indented-code-block blockquote-block = ">" blockquote-rest
fenced-code-block = code-fence [ language-info ] line-ending blockquote-rest = [ WSP ] *( inline-element ) line-ending
*( code-line ) *( ">" [ WSP ] *( inline-element ) line-ending )
code-fence line-ending
code-fence = "```"
language-info = 1*( ALPHA / DIGIT / "-" / "+" / "." )
indented-code-block = 1*( indented-code-line )
indented-code-line = 4*WSP 1*VCHAR line-ending
code-line = *VCHAR line-ending
; Blockquotes ; Unordered lists (must start with *, -, or + followed by space)
blockquote = 1*( blockquote-line ) unordered-list-block = unordered-list-item 1*( unordered-list-item )
blockquote-line = ">" [ WSP ] *( inline-element ) line-ending
; Lists (left-factored by list marker)
list = unordered-list / ordered-list
unordered-list = 1*( unordered-list-item )
ordered-list = 1*( ordered-list-item )
unordered-list-item = unordered-marker list-item-content unordered-list-item = unordered-marker list-item-content
ordered-list-item = ordered-marker list-item-content
unordered-marker = ( "*" / "-" / "+" ) WSP unordered-marker = ( "*" / "-" / "+" ) WSP
ordered-marker = 1*DIGIT "." WSP
list-item-content = *( inline-element ) line-ending list-item-content = *( inline-element ) line-ending
*( list-continuation ) *( list-continuation )
list-continuation = 2*WSP *( inline-element ) line-ending list-continuation = 2*WSP *( inline-element ) line-ending
; Horizontal rule ; Ordered lists (must start with digit(s) followed by . and space)
horizontal-rule = hr-dashes / hr-asterisks / hr-underscores ordered-list-block = ordered-list-item 1*( ordered-list-item )
hr-dashes = 3*( "-" ) *( WSP / "-" ) line-ending ordered-list-item = ordered-marker list-item-content
hr-asterisks = 3*( "*" ) *( WSP / "*" ) line-ending ordered-marker = 1*DIGIT "." WSP
hr-underscores = 3*( "_" ) *( WSP / "_" ) line-ending
; HTML blocks ; Fenced code blocks (must start with ``` or ~~~)
html-block = html-block-tag / html-comment-block / html-declaration fenced-code-block = code-fence fenced-code-rest
html-block-tag = "<" tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" fenced-code-rest = [ language-info ] line-ending
*( code-line )
code-fence line-ending
code-fence = "```" / "~~~"
language-info = 1*( ALPHA / DIGIT / "-" / "+" / "." )
code-line = *VCHAR line-ending
; Indented code blocks (must start with 4+ spaces followed by non-space)
indented-code-block = 4*WSP VCHAR indented-code-rest
indented-code-rest = *VCHAR line-ending
*( 4*WSP *VCHAR line-ending )
; Horizontal rules (must start with 3+ of same character)
horizontal-rule-block = hr-pattern hr-rest
hr-pattern = 3*"*" / 3*"-" / 3*"_"
hr-rest = *( WSP / "*" / "-" / "_" ) line-ending
; HTML blocks (must start with <)
html-block = "<" html-block-rest
html-block-rest = html-tag-block / html-comment-block / html-declaration-block
html-tag-block = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">"
*( html-content ) *( html-content )
[ "</" tag-name ">" ] [ "</" tag-name ">" ]
line-ending line-ending
html-comment-block = "<!--" *( CHAR ) "-->" line-ending html-comment-block = "!--" *( CHAR ) "-->" line-ending
html-declaration = "<!" 1*ALPHA *( WSP / VCHAR ) ">" line-ending html-declaration-block = "!" 1*ALPHA *( WSP / VCHAR ) ">" line-ending
; Paragraphs (catch-all for regular text) ; Paragraphs (everything else that doesn't match above patterns)
paragraph = paragraph-line 1*( paragraph-continuation ) paragraph-block = paragraph-text-line 1*( paragraph-continuation )
paragraph-line = *( inline-element ) line-ending paragraph-text-line = paragraph-first-element *( inline-element ) line-ending
paragraph-continuation = !block-element-start *( inline-element ) line-ending paragraph-continuation = paragraph-first-element *( inline-element ) line-ending
block-element-start = "#" / ">" / ( 1*DIGIT "." WSP ) /
( ( "*" / "-" / "+" ) WSP ) /
"```" / 4*WSP / "<"
; Inline elements (left-factored by opening characters) ; First element of paragraph - anything that's not a block starter
paragraph-first-element = emphasis / strong / code-span / link / image /
html-inline / line-break / escaped-char / plain-text
; Inline elements
inline-element = emphasis / strong / code-span / link / image / inline-element = emphasis / strong / code-span / link / image /
html-inline / line-break / escaped-char / plain-text html-inline / line-break / escaped-char / plain-text
; Emphasis and strong (left-factored by delimiter) ; Emphasis and strong (left-factored by delimiter)
emphasis = emphasis-asterisk / emphasis-underscore emphasis = emphasis-asterisk / emphasis-underscore
strong = strong-asterisk / strong-underscore strong = strong-asterisk / strong-underscore
emphasis-asterisk = "*" emphasis-content "*" emphasis-asterisk = "*" emphasis-content-asterisk "*"
emphasis-underscore = "_" emphasis-content "_" emphasis-underscore = "_" emphasis-content-underscore "_"
strong-asterisk = "**" strong-content "**" strong-asterisk = "**" strong-content-asterisk "**"
strong-underscore = "__" strong-content "__" strong-underscore = "__" strong-content-underscore "__"
emphasis-content = 1*( !( "*" / line-ending ) inline-element )
strong-content = 1*( !( "**" / line-ending ) inline-element ) ; Content within emphasis/strong (simplified - no nesting for clarity)
emphasis-content-asterisk = 1*( escaped-char / plain-text-no-asterisk )
emphasis-content-underscore = 1*( escaped-char / plain-text-no-underscore )
strong-content-asterisk = 1*( escaped-char / plain-text-no-double-asterisk )
strong-content-underscore = 1*( escaped-char / plain-text-no-double-underscore )
; Code spans ; Code spans
code-span = code-delimiter code-span-content code-delimiter code-span = "`" code-span-content "`" /
code-delimiter = 1*"`" "``" code-span-content-double "``" /
code-span-content = 1*( !"`" CHAR ) "```" code-span-content-triple "```"
code-span-content = 1*( %x20-5F / %x61-10FFFF ) ; Everything except `
code-span-content-double = *( %x20-5F / %x61-10FFFF / "`" )
code-span-content-triple = *( %x20-5F / %x61-10FFFF / "`" / "``" )
; Links and images (left-factored by opening bracket) ; Links and images (left-factored by opening bracket)
link = link-reference / link-inline link = "[" link-content "]" link-destination
image = image-reference / image-inline image = "!" "[" alt-text "]" link-destination
link-reference = "[" link-text "]" "[" reference-id "]" link-content = 1*( escaped-char / plain-text-no-bracket )
link-inline = "[" link-text "]" "(" url [ WSP title ] ")" alt-text = 1*( escaped-char / plain-text-no-bracket )
image-reference = "!" "[" alt-text "]" "[" reference-id "]" link-destination = "(" url [ WSP title ] ")" /
image-inline = "!" "[" alt-text "]" "(" url [ WSP title ] ")" "[" reference-id "]"
link-text = 1*( !( "]" / line-ending ) inline-element ) reference-id = 1*( ALPHA / DIGIT / WSP )
alt-text = 1*( !( "]" / line-ending ) CHAR ) url = 1*( %x21-28 / %x2A-10FFFF ) ; Everything except space and )
reference-id = 1*( !( "]" / line-ending ) CHAR ) title = DQUOTE title-content-dquote DQUOTE /
url = 1*( !( WSP / ")" / line-ending ) CHAR ) "'" title-content-squote "'" /
title = ( DQUOTE title-content DQUOTE ) / "(" title-content-paren ")"
( "'" title-content "'" ) / title-content-dquote = *( %x20-21 / %x23-10FFFF ) ; Everything except "
( "(" title-content ")" ) title-content-squote = *( %x20-26 / %x28-10FFFF ) ; Everything except '
title-content = *( !( DQUOTE / "'" / ")" ) CHAR ) title-content-paren = *( %x20-28 / %x2A-10FFFF ) ; Everything except )
; Inline HTML ; Inline HTML
html-inline = html-tag-inline / html-entity html-inline = "<" html-inline-content
html-tag-inline = "<" tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" html-inline-content = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" /
html-entity = "&" entity-name ";" entity-name ";"
entity-name = 1*( ALPHA / DIGIT ) entity-name = 1*( ALPHA / DIGIT )
; HTML attributes and content ; HTML attributes and tag content
tag-name = ALPHA *( ALPHA / DIGIT / "-" / ":" ) tag-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
attribute = attribute-name [ "=" attribute-value ] attribute = attribute-name [ "=" attribute-value ]
attribute-name = ( ALPHA / "_" / ":" ) *( ALPHA / DIGIT / "-" / "_" / ":" / "." ) attribute-name = ( ALPHA / "_" / ":" ) *( ALPHA / DIGIT / "-" / "_" / ":" / "." )
attribute-value = ( DQUOTE *( !DQUOTE CHAR ) DQUOTE ) / attribute-value = DQUOTE attribute-value-dquote DQUOTE /
( "'" *( !"'" CHAR ) "'" ) / "'" attribute-value-squote "'" /
( 1*( !( WSP / ">" ) VCHAR ) ) attribute-value-unquoted
html-content = *( !( "</" ) CHAR ) attribute-value-dquote = *( %x20-21 / %x23-10FFFF ) ; Everything except "
attribute-value-squote = *( %x20-26 / %x28-10FFFF ) ; Everything except '
attribute-value-unquoted = 1*( %x21-22 / %x24-26 / %x28-2F / %x30-3D / %x3F-10FFFF )
html-content = *( %x20-3B / %x3D-10FFFF ) ; Everything except <
; Line breaks and escaped characters ; Line breaks and escaped characters
line-break = hard-line-break / soft-line-break line-break = 2*WSP line-ending
hard-line-break = 2*WSP line-ending escaped-char = "\" VCHAR
soft-line-break = line-ending
escaped-char = "\" ( VCHAR / WSP )
; Plain text (everything else) ; Plain text variations (to avoid conflicts)
plain-text = 1*( !special-char CHAR ) plain-text = 1*plain-text-char
special-char = "*" / "_" / "`" / "[" / "]" / "(" / ")" / plain-text-char = %x20-21 / %x23-29 / %x2B-2F / %x30-3B / %x3D /
"<" / ">" / "#" / "+" / "-" / "." / "!" / %x3F-40 / %x41-5A / %x5C / %x5E-5F / %x61-7A /
"&" / "\" / line-ending %x7C / %x7E-10FFFF
plain-text-no-asterisk = %x20-29 / %x2B-10FFFF
plain-text-no-underscore = %x20-5E / %x60-10FFFF
plain-text-no-double-asterisk = 1*( %x20-29 / %x2B-10FFFF ) ; Simplified
plain-text-no-double-underscore = 1*( %x20-5E / %x60-10FFFF ) ; Simplified
plain-text-no-bracket = %x20-5A / %x5C / %x5E-10FFFF
; Basic definitions ; Basic definitions
line-ending = CRLF / LF / CR line-ending = CRLF / LF / CR
WSP = SP / HTAB WSP = SP / HTAB
CHAR = %x00-10FFFF ; Any Unicode character
VCHAR = %x21-7E ; Visible ASCII characters
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
DIGIT = %x30-39 ; 0-9 DIGIT = %x30-39 ; 0-9
SP = %x20 ; Space SP = %x20 ; Space
@ -138,3 +161,4 @@ CR = %x0D ; Carriage return
LF = %x0A ; Line feed LF = %x0A ; Line feed
CRLF = CR LF ; Internet standard newline CRLF = CR LF ; Internet standard newline
DQUOTE = %x22 ; Double quote DQUOTE = %x22 ; Double quote
VCHAR = %x21-7E ; Visible ASCII characters