Compare commits
No commits in common. "bc0475fde4dadc7e12c5cb3b8db657466128bf76" and "5f7856dad095ed6b781c58a2ff1fac78f97258b1" have entirely different histories.
bc0475fde4
...
5f7856dad0
2 changed files with 191 additions and 28 deletions
|
|
@ -12,7 +12,7 @@ import Data.Text (Text)
|
|||
import qualified Data.Text as T
|
||||
import Data.Void (Void)
|
||||
import IR
|
||||
import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try, (<?>))
|
||||
import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try)
|
||||
import qualified Text.Megaparsec as MP
|
||||
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string)
|
||||
|
||||
|
|
@ -50,16 +50,16 @@ document = Doc <$> many element <* eof
|
|||
element :: Parser Element
|
||||
element =
|
||||
choice
|
||||
[ try headingBlock <?> "Element Heading",
|
||||
try fencedCodeBlock <?> "Fenced Code Block",
|
||||
try indentedCodeBlock <?> "Indented Code Block",
|
||||
try blockquoteBlock <?> "BlockQuote",
|
||||
try unorderedListBlock <?> "Unordered List",
|
||||
try orderedListBlock <?> "Ordered List",
|
||||
try horizontalRuleBlock <?> "Horizontal Rule",
|
||||
try htmlBlock <?> "HTML Block",
|
||||
try blankLines <?> "Blank Lines", -- Consume blank lines but don't add to AST
|
||||
paragraphBlock <?> "Paragarph"
|
||||
[ try headingBlock,
|
||||
try fencedCodeBlock,
|
||||
try indentedCodeBlock,
|
||||
try blockquoteBlock,
|
||||
try unorderedListBlock,
|
||||
try orderedListBlock,
|
||||
try horizontalRuleBlock,
|
||||
try htmlBlock,
|
||||
try blankLines, -- Consume blank lines but don't add to AST
|
||||
paragraphBlock
|
||||
]
|
||||
|
||||
-- Blank lines (consumed but not stored)
|
||||
|
|
@ -77,11 +77,11 @@ blankLine = do
|
|||
-- Heading Block
|
||||
headingBlock :: Parser Element
|
||||
headingBlock = do
|
||||
hashes <- some (char '#') <?> "Heading Hashes"
|
||||
hashes <- some (char '#')
|
||||
let level = length hashes
|
||||
guard (level <= 6) <?> "Higher than level 6"
|
||||
many (char ' ' <|> char '\t') <?> "Pre-Text Whitespace"
|
||||
content <- manyTill (inlineElement <?> "Header Text") (try lineEnding <?> "Header Ending")
|
||||
guard (level <= 6)
|
||||
many (char ' ' <|> char '\t')
|
||||
content <- manyTill inlineElement (try lineEnding)
|
||||
pure $ Heading $ H level content
|
||||
|
||||
-- Fenced Code Block
|
||||
|
|
@ -246,15 +246,15 @@ paragraphBlock = do
|
|||
inlineElement :: Parser InlineText
|
||||
inlineElement =
|
||||
choice
|
||||
[ try strong <?> "Inline Strong Text",
|
||||
try emphasis <?> "Inline Italic Text",
|
||||
try crossedText <?> "Inline Crossed Text",
|
||||
try codeSpan <?> "Inline Code",
|
||||
try image <?> "Inline Image",
|
||||
try link <?> "Inline Link",
|
||||
try htmlInline <?> "Inline HTML",
|
||||
try escapedChar <?> "Escaped Character",
|
||||
plainText <?> "Inline Plain Text"
|
||||
[ try strong,
|
||||
try emphasis,
|
||||
try crossedText,
|
||||
try codeSpan,
|
||||
try image,
|
||||
try link,
|
||||
try htmlInline,
|
||||
try escapedChar,
|
||||
plainText
|
||||
]
|
||||
|
||||
-- Strong (Bold)
|
||||
|
|
@ -314,8 +314,8 @@ inlineElementNo c =
|
|||
|
||||
plainTextNo :: [Char] -> Parser InlineText
|
||||
plainTextNo disallow = do
|
||||
firstChar <- noneOf disallow <?> "Plain Text Initial Disallow"
|
||||
remChars <- manyTill (plainTextCharNo disallow) lineEnding <?> "Remaining Characters"
|
||||
firstChar <- noneOf disallow
|
||||
remChars <- some $ plainTextCharNo disallow <* notFollowedBy lineEnding
|
||||
pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars
|
||||
where
|
||||
wspHandler '\n' = ' '
|
||||
|
|
@ -422,7 +422,7 @@ escapedChar = do
|
|||
|
||||
-- Plain Text
|
||||
plainText :: Parser InlineText
|
||||
plainText = plainTextNo [] <?> "Baseline Plain Text"
|
||||
plainText = plainTextNo []
|
||||
|
||||
plainTextBaseDisallow :: [Char]
|
||||
plainTextBaseDisallow = "[~`_*<"
|
||||
|
|
@ -441,7 +441,7 @@ plainTextNoBracket = plainTextNo "[]"
|
|||
|
||||
-- Helper Parsers
|
||||
lineEnding :: Parser ()
|
||||
lineEnding = void (try $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")) <|> eof
|
||||
lineEnding = void $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")
|
||||
|
||||
wsParser :: Parser ()
|
||||
wsParser = void $ some (char ' ' <|> char '\t')
|
||||
|
|
|
|||
163
markdown.abnf
Normal file
163
markdown.abnf
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
; Left-Factored ABNF Grammar for Markdown with Embedded HTML
|
||||
; Based on RFC 5234 (ABNF) syntax - no negative lookahead operators
|
||||
|
||||
; Top-level document structure
|
||||
; A document is any sequence of block elements and blank lines.
; Blank lines use line-ending (CRLF / LF / CR) so that LF-only and CR-only
; files are accepted, matching how every block rule terminates its lines.
document = *( block-element / line-ending )
|
||||
|
||||
; Block-level elements (true left-factoring with unique prefixes)
|
||||
block-element = heading-block /
|
||||
blockquote-block /
|
||||
unordered-list-block /
|
||||
ordered-list-block /
|
||||
fenced-code-block /
|
||||
indented-code-block /
|
||||
horizontal-rule-block /
|
||||
html-block /
|
||||
paragraph-block
|
||||
|
||||
; Headings (must start with #)
|
||||
heading-block = "#" heading-rest
|
||||
heading-rest = [ 1*5"#" ] *WSP *( inline-element ) line-ending
|
||||
|
||||
; Blockquotes (must start with >)
|
||||
blockquote-block = ">" blockquote-rest
|
||||
blockquote-rest = [ WSP ] *( inline-element ) line-ending
|
||||
*( ">" [ WSP ] *( inline-element ) line-ending )
|
||||
|
||||
; Unordered lists (must start with *, -, or + followed by space)
|
||||
; One or more items: the first item is mandatory, further items are optional,
; so a single-item list is still recognised as a list.
unordered-list-block = unordered-list-item *( unordered-list-item )
|
||||
unordered-list-item = unordered-marker list-item-content
|
||||
unordered-marker = ( "*" / "-" / "+" ) WSP
|
||||
list-item-content = *( inline-element ) line-ending
|
||||
*( list-continuation )
|
||||
list-continuation = 2*WSP *( inline-element ) line-ending
|
||||
|
||||
; Ordered lists (must start with digit(s) followed by . and space)
|
||||
; One or more items: the first item is mandatory, further items are optional,
; so a single-item list is still recognised as a list.
ordered-list-block = ordered-list-item *( ordered-list-item )
|
||||
ordered-list-item = ordered-marker list-item-content
|
||||
ordered-marker = 1*DIGIT "." WSP
|
||||
|
||||
; Fenced code blocks (must start with ``` or ~~~)
|
||||
fenced-code-block = code-fence fenced-code-rest
|
||||
fenced-code-rest = [ language-info ] line-ending
|
||||
*( code-line )
|
||||
code-fence line-ending
|
||||
code-fence = "```" / "~~~"
|
||||
language-info = 1*( ALPHA / DIGIT / "-" / "+" / "." )
|
||||
code-line = *VCHAR line-ending
|
||||
|
||||
; Indented code blocks (must start with 4+ spaces followed by non-space)
|
||||
indented-code-block = 4*WSP VCHAR indented-code-rest
|
||||
indented-code-rest = *VCHAR line-ending
|
||||
*( 4*WSP *VCHAR line-ending )
|
||||
|
||||
; Horizontal rules (must start with 3+ of same character)
|
||||
horizontal-rule-block = hr-pattern hr-rest
|
||||
hr-pattern = 3*"*" / 3*"-" / 3*"_"
|
||||
hr-rest = *( WSP / "*" / "-" / "_" ) line-ending
|
||||
|
||||
; HTML blocks (must start with <)
|
||||
html-block = "<" html-block-rest
|
||||
html-block-rest = html-tag-block / html-comment-block / html-declaration-block
|
||||
html-tag-block = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">"
|
||||
*( html-content )
|
||||
[ "</" tag-name ">" ]
|
||||
line-ending
|
||||
html-comment-block = "!--" *( CHAR ) "-->" line-ending
|
||||
html-declaration-block = "!" 1*ALPHA *( WSP / VCHAR ) ">" line-ending
|
||||
|
||||
; Paragraphs (everything else that doesn't match above patterns)
|
||||
; A paragraph is one text line followed by zero or more continuation lines;
; a single-line paragraph must match, since paragraph-block is the fallback
; alternative of block-element.
paragraph-block = paragraph-text-line *( paragraph-continuation )
|
||||
paragraph-text-line = paragraph-first-element *( inline-element ) line-ending
|
||||
paragraph-continuation = paragraph-first-element *( inline-element ) line-ending
|
||||
|
||||
; First element of paragraph - anything that's not a block starter
|
||||
paragraph-first-element = emphasis / strong / code-span / link / image /
|
||||
html-inline / escaped-char / plain-text
|
||||
|
||||
; Inline elements
|
||||
inline-element = emphasis / strong / code-span / link / image /
|
||||
html-inline / escaped-char / plain-text
|
||||
|
||||
; Emphasis and strong (left-factored by delimiter)
|
||||
emphasis = emphasis-asterisk / emphasis-underscore
|
||||
strong = strong-asterisk / strong-underscore
|
||||
emphasis-asterisk = "*" emphasis-content-asterisk "*"
|
||||
emphasis-underscore = "_" emphasis-content-underscore "_"
|
||||
strong-asterisk = "**" strong-content-asterisk "**"
|
||||
strong-underscore = "__" strong-content-underscore "__"
|
||||
|
||||
; Content within emphasis/strong (simplified - no nesting for clarity)
|
||||
emphasis-content-asterisk = 1*( escaped-char / plain-text-no-asterisk )
|
||||
emphasis-content-underscore = 1*( escaped-char / plain-text-no-underscore )
|
||||
strong-content-asterisk = 1*( escaped-char / plain-text-no-double-asterisk )
|
||||
strong-content-underscore = 1*( escaped-char / plain-text-no-double-underscore )
|
||||
|
||||
; Code spans
|
||||
code-span = "`" code-span-content "`" /
|
||||
"``" code-span-content-double "``" /
|
||||
"```" code-span-content-triple "```"
|
||||
code-span-content = 1*( %x20-5F / %x61-10FFFF ) ; Everything except `
|
||||
code-span-content-double = *( %x20-5F / %x61-10FFFF / "`" )
|
||||
code-span-content-triple = *( %x20-5F / %x61-10FFFF / "`" / "``" )
|
||||
|
||||
; Links and images (left-factored by opening bracket)
|
||||
link = "[" link-content "]" link-destination
|
||||
image = "!" "[" alt-text "]" link-destination
|
||||
link-content = 1*( escaped-char / plain-text-no-bracket )
|
||||
alt-text = 1*( escaped-char / plain-text-no-bracket )
|
||||
link-destination = "(" url [ WSP title ] ")" /
|
||||
"[" reference-id "]"
|
||||
reference-id = 1*( ALPHA / DIGIT / WSP )
|
||||
; ")" is %x29, so the first range stops at %x28 and the second resumes at %x2A
; ("*"), which is a legal URL character. Space (%x20) is excluded by starting
; at %x21.
url = 1*( %x21-28 / %x2A-10FFFF )       ; Everything except space and )
|
||||
title = DQUOTE title-content-dquote DQUOTE /
|
||||
"'" title-content-squote "'" /
|
||||
"(" title-content-paren ")"
|
||||
title-content-dquote = *( %x20-21 / %x23-10FFFF ) ; Everything except "
|
||||
title-content-squote = *( %x20-26 / %x28-10FFFF ) ; Everything except '
|
||||
title-content-paren = *( %x20-28 / %x2A-10FFFF ) ; Everything except )
|
||||
|
||||
; Inline HTML
|
||||
html-inline = "<" html-inline-content
|
||||
html-inline-content = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" /
|
||||
entity-name ";"
|
||||
entity-name = 1*( ALPHA / DIGIT )
|
||||
|
||||
; HTML attributes and tag content
|
||||
tag-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
|
||||
attribute = attribute-name [ "=" attribute-value ]
|
||||
attribute-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
|
||||
|
||||
attribute-value = DQUOTE attribute-value-dquote DQUOTE /
|
||||
"'" attribute-value-squote "'" /
|
||||
attribute-value-unquoted
|
||||
attribute-value-dquote = *( %x20-21 / %x23-10FFFF ) ; Everything except "
|
||||
attribute-value-squote = *( %x20-26 / %x28-10FFFF ) ; Everything except '
|
||||
attribute-value-unquoted = 1*( %x21-22 / %x24-26 / %x28-2F / %x30-3D / %x3F-10FFFF )
|
||||
html-content = *( %x20-3B / %x3D-10FFFF ) ; Everything except <
|
||||
|
||||
escaped-char = "\" VCHAR
|
||||
|
||||
; Plain text variations (to avoid conflicts)
|
||||
plain-text = 1*plain-text-char
|
||||
plain-text-char = %x20-21 / %x23-29 / %x2B-2F / %x30-3B / %x3D /
|
||||
%x3F-40 / %x41-5A / %x5C / %x5E-5F / %x61-7A /
|
||||
%x7C / %x7E-10FFFF
|
||||
plain-text-no-asterisk = %x20-29 / %x2B-10FFFF
|
||||
plain-text-no-underscore = %x20-5E / %x60-10FFFF
|
||||
plain-text-no-double-asterisk = 1*( %x20-29 / %x2B-10FFFF ) ; Simplified
|
||||
plain-text-no-double-underscore = 1*( %x20-5E / %x60-10FFFF ) ; Simplified
|
||||
; Excludes both "[" (%x5B) and "]" (%x5D); "\" (%x5C) between them stays allowed.
; link-content and alt-text are closed by "]", so it must not be consumed as text.
plain-text-no-bracket = %x20-5A / %x5C / %x5E-10FFFF
|
||||
|
||||
; Basic definitions
|
||||
line-ending = CRLF / LF / CR
|
||||
WSP = SP / HTAB
|
||||
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
|
||||
DIGIT = %x30-39 ; 0-9
|
||||
SP = %x20 ; Space
|
||||
HTAB = %x09 ; Horizontal tab
|
||||
CR = %x0D ; Carriage return
|
||||
LF = %x0A ; Line feed
|
||||
CRLF = CR LF ; Internet standard newline
|
||||
DQUOTE = %x22 ; Double quote
|
||||
VCHAR = %x21-7E ; Visible ASCII characters
|
||||
Loading…
Reference in a new issue