Compare commits

..

2 commits

2 changed files with 28 additions and 191 deletions

View file

@ -12,7 +12,7 @@ import Data.Text (Text)
import qualified Data.Text as T
import Data.Void (Void)
import IR
import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try)
import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try, (<?>))
import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string)
@ -50,16 +50,16 @@ document = Doc <$> many element <* eof
-- | Parse a single block-level element by trying each block parser in the
-- order listed. Every alternative except 'paragraphBlock' is wrapped in
-- 'try'; 'paragraphBlock' is the final alternative. The '<?>' labels name
-- each alternative in parse-error "expecting ..." messages.
--
-- NOTE(review): this span is a rendered diff hunk. The unlabelled list
-- appears to be the pre-change version and the '<?>'-labelled list its
-- replacement; confirm against the repository.
element :: Parser Element
element =
choice
[ try headingBlock,
try fencedCodeBlock,
try indentedCodeBlock,
try blockquoteBlock,
try unorderedListBlock,
try orderedListBlock,
try horizontalRuleBlock,
try htmlBlock,
try blankLines, -- Consume blank lines but don't add to AST
paragraphBlock
[ try headingBlock <?> "Element Heading",
try fencedCodeBlock <?> "Fenced Code Block",
try indentedCodeBlock <?> "Indented Code Block",
try blockquoteBlock <?> "BlockQuote",
try unorderedListBlock <?> "Unordered List",
try orderedListBlock <?> "Ordered List",
try horizontalRuleBlock <?> "Horizontal Rule",
try htmlBlock <?> "HTML Block",
try blankLines <?> "Blank Lines", -- Consume blank lines but don't add to AST
paragraphBlock <?> "Paragraph"
]
-- Blank lines (consumed but not stored)
@ -77,11 +77,11 @@ blankLine = do
-- Heading Block
-- | Parse a heading line: one or more '#' characters, optional spaces or
-- tabs, then inline elements up to a 'lineEnding'. The heading level is the
-- number of '#' characters and must not exceed 6. The result is
-- @Heading (H level content)@.
--
-- NOTE(review): this span is a rendered diff hunk. Each unlabelled statement
-- appears to be the pre-change line and the '<?>'-labelled statement after
-- it the replacement; confirm against the repository.
headingBlock :: Parser Element
headingBlock = do
hashes <- some (char '#')
hashes <- some (char '#') <?> "Heading Hashes"
let level = length hashes
guard (level <= 6)
many (char ' ' <|> char '\t')
content <- manyTill inlineElement (try lineEnding)
-- '<?>' names what the parser expected, so the label states the accepted
-- range rather than the rejected one.
guard (level <= 6) <?> "Heading Level Of At Most 6"
many (char ' ' <|> char '\t') <?> "Pre-Text Whitespace"
content <- manyTill (inlineElement <?> "Header Text") (try lineEnding <?> "Header Ending")
pure $ Heading $ H level content
-- Fenced Code Block
@ -246,15 +246,15 @@ paragraphBlock = do
-- | Parse one inline element by trying each inline parser in the order
-- listed. All alternatives except 'plainText' are wrapped in 'try';
-- 'plainText' is the last alternative.
--
-- NOTE(review): this span is a rendered diff hunk. The unlabelled list
-- appears to be the pre-change version and the '<?>'-labelled list its
-- replacement; confirm against the repository.
inlineElement :: Parser InlineText
inlineElement =
choice
[ try strong,
try emphasis,
try crossedText,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainText
[ try strong <?> "Inline Strong Text",
try emphasis <?> "Inline Italic Text",
try crossedText <?> "Inline Crossed Text",
try codeSpan <?> "Inline Code",
try image <?> "Inline Image",
try link <?> "Inline Link",
try htmlInline <?> "Inline HTML",
try escapedChar <?> "Escaped Character",
plainText <?> "Inline Plain Text"
]
-- Strong (Bold)
@ -314,8 +314,8 @@ inlineElementNo c =
plainTextNo :: [Char] -> Parser InlineText
plainTextNo disallow = do
firstChar <- noneOf disallow
remChars <- some $ plainTextCharNo disallow <* notFollowedBy lineEnding
firstChar <- noneOf disallow <?> "Plain Text Initial Disallow"
remChars <- manyTill (plainTextCharNo disallow) lineEnding <?> "Remaining Characters"
pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars
where
wspHandler '\n' = ' '
@ -422,7 +422,7 @@ escapedChar = do
-- Plain Text
-- | Plain text with nothing disallowed: 'plainTextNo' applied to an empty
-- character list.
--
-- NOTE(review): two equations appear because this span is a rendered diff
-- hunk. The one labelled "Baseline Plain Text" appears to be the
-- replacement; confirm against the repository.
plainText :: Parser InlineText
plainText = plainTextNo []
plainText = plainTextNo [] <?> "Baseline Plain Text"
-- | The characters '[', '~', '`', '_', '*' and '<'.
--
-- NOTE(review): presumably the characters that open inline markup, passed to
-- 'plainTextNo' by callers not visible here; confirm.
plainTextBaseDisallow :: [Char]
plainTextBaseDisallow = "[~`_*<"
@ -441,7 +441,7 @@ plainTextNoBracket = plainTextNo "[]"
-- Helper Parsers
-- | Line-ending parser; the parsed text is discarded with 'void'.
--
-- Both equations require two consecutive terminators ('count' 2), each being
-- "\r\n", "\n" or "\r". The second equation wraps that in 'try' and also
-- accepts end of input ('eof').
--
-- NOTE(review): two equations appear because this span is a rendered diff
-- hunk. The second appears to be the replacement; confirm against the
-- repository.
-- NOTE(review): with the 'eof' alternative this parser can succeed without
-- consuming input; check any repetition combinator that is applied to it.
lineEnding :: Parser ()
lineEnding = void $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")
lineEnding = void (try $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")) <|> eof
-- | Consume one or more space or tab characters and discard them.
wsParser :: Parser ()
wsParser = skipSome blank
  where
    -- A single horizontal whitespace character.
    blank = char ' ' <|> char '\t'

View file

@ -1,163 +0,0 @@
; Left-Factored ABNF Grammar for Markdown with Embedded HTML
; Based on RFC 5234 (ABNF) syntax - no negative lookahead operators
; Top-level document structure
; Blank lines between blocks are bare line endings. line-ending (CRLF / LF /
; CR) is used rather than CRLF alone so that LF- and CR-terminated input is
; accepted, matching the terminator every block rule already uses.
document = *( block-element / line-ending )
; Block-level elements (true left-factoring with unique prefixes)
; paragraph-block is listed last and is the fallback alternative.
; NOTE(review): the prefixes are not all distinct. unordered-marker and
; hr-pattern can both start with "*" or "-", so choosing between them needs
; more than one character of lookahead; confirm this is intended.
block-element = heading-block /
blockquote-block /
unordered-list-block /
ordered-list-block /
fenced-code-block /
indented-code-block /
horizontal-rule-block /
html-block /
paragraph-block
; Headings (must start with #)
; The mandatory "#" plus up to five optional "#" gives one to six "#" in
; total, followed by optional whitespace and the heading's inline content.
heading-block = "#" heading-rest
heading-rest = [ 1*5"#" ] *WSP *( inline-element ) line-ending
; Blockquotes (must start with >)
; One quoted line, then zero or more further lines that each start with ">".
; A single optional WSP is allowed after each ">".
blockquote-block = ">" blockquote-rest
blockquote-rest = [ WSP ] *( inline-element ) line-ending
*( ">" [ WSP ] *( inline-element ) line-ending )
; Unordered lists (must start with *, -, or + followed by space)
; A list is one item followed by zero or more further items, so a
; single-item list is accepted.
unordered-list-block = unordered-list-item *( unordered-list-item )
unordered-list-item = unordered-marker list-item-content
unordered-marker = ( "*" / "-" / "+" ) WSP
; Item text on the marker line, then optional continuation lines that are
; indented by at least two WSP.
list-item-content = *( inline-element ) line-ending
                    *( list-continuation )
list-continuation = 2*WSP *( inline-element ) line-ending
; Ordered lists (must start with digit(s) followed by . and space)
; A list is one item followed by zero or more further items, so a
; single-item list is accepted.
ordered-list-block = ordered-list-item *( ordered-list-item )
ordered-list-item = ordered-marker list-item-content
ordered-marker = 1*DIGIT "." WSP
; Fenced code blocks (must start with ``` or ~~~)
; Opening fence with optional language tag, zero or more code lines, then a
; closing fence on its own line.
fenced-code-block = code-fence fenced-code-rest
fenced-code-rest = [ language-info ] line-ending
                   *( code-line )
                   code-fence line-ending
code-fence = "```" / "~~~"
language-info = 1*( ALPHA / DIGIT / "-" / "+" / "." )
; VCHAR (%x21-7E) excludes space and tab, so WSP is allowed explicitly;
; without it any code line containing whitespace would not match.
code-line = *( WSP / VCHAR ) line-ending
; Indented code blocks (must start with 4+ spaces followed by non-space)
indented-code-block = 4*WSP VCHAR indented-code-rest
; After the first visible character the rest of a line may contain spaces and
; tabs. VCHAR (%x21-7E) excludes them, so WSP is allowed explicitly.
indented-code-rest = *( WSP / VCHAR ) line-ending
                     *( 4*WSP *( WSP / VCHAR ) line-ending )
; Horizontal rules (must start with 3+ of same character)
; hr-pattern is three or more of a single marker character; hr-rest then
; allows any mix of WSP and the three marker characters up to the line ending.
horizontal-rule-block = hr-pattern hr-rest
hr-pattern = 3*"*" / 3*"-" / 3*"_"
hr-rest = *( WSP / "*" / "-" / "_" ) line-ending
; HTML blocks (must start with <)
; html-block consumes the leading "<"; the three alternatives are then told
; apart by what follows: a tag name, "!--", or "!" plus letters.
html-block = "<" html-block-rest
html-block-rest = html-tag-block / html-comment-block / html-declaration-block
; Opening tag with optional attributes and optional "/" before ">", then
; content, an optional closing tag, and the line ending.
html-tag-block = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">"
*( html-content )
[ "</" tag-name ">" ]
line-ending
; CHAR is the RFC 5234 core rule (%x01-7F).
html-comment-block = "!--" *( CHAR ) "-->" line-ending
html-declaration-block = "!" 1*ALPHA *( WSP / VCHAR ) ">" line-ending
; Paragraphs (everything else that doesn't match above patterns)
; A paragraph is a first line followed by zero or more continuation lines,
; so a single-line paragraph is accepted.
paragraph-block = paragraph-text-line *( paragraph-continuation )
paragraph-text-line = paragraph-first-element *( inline-element ) line-ending
paragraph-continuation = paragraph-first-element *( inline-element ) line-ending
; First element of paragraph - anything that's not a block starter
paragraph-first-element = emphasis / strong / code-span / link / image /
                          html-inline / escaped-char / plain-text
; Inline elements
; Same alternatives as paragraph-first-element.
; NOTE(review): emphasis is listed before strong. An implementation that
; tries alternatives in order would need strong ("**" / "__") first so the
; single-delimiter form does not match its prefix; confirm against the parser.
inline-element = emphasis / strong / code-span / link / image /
html-inline / escaped-char / plain-text
; Emphasis and strong (left-factored by delimiter)
; Emphasis uses a single "*" or "_" on each side; strong uses the doubled
; delimiter. The opening and closing delimiters must be the same character.
emphasis = emphasis-asterisk / emphasis-underscore
strong = strong-asterisk / strong-underscore
emphasis-asterisk = "*" emphasis-content-asterisk "*"
emphasis-underscore = "_" emphasis-content-underscore "_"
strong-asterisk = "**" strong-content-asterisk "**"
strong-underscore = "__" strong-content-underscore "__"
; Content within emphasis/strong (simplified - no nesting for clarity)
; Each content rule is one or more escaped characters or runs of text that
; exclude the surrounding delimiter character.
emphasis-content-asterisk = 1*( escaped-char / plain-text-no-asterisk )
emphasis-content-underscore = 1*( escaped-char / plain-text-no-underscore )
strong-content-asterisk = 1*( escaped-char / plain-text-no-double-asterisk )
strong-content-underscore = 1*( escaped-char / plain-text-no-double-underscore )
; Code spans
; Delimited by one, two or three backticks; the closing run has the same
; length as the opening run.
code-span = "`" code-span-content "`" /
"``" code-span-content-double "``" /
"```" code-span-content-triple "```"
; The ranges %x20-5F / %x61-10FFFF skip %x60, the backtick.
code-span-content = 1*( %x20-5F / %x61-10FFFF ) ; Everything except `
; The double and triple forms add shorter backtick runs back in, so their
; content may contain backticks.
code-span-content-double = *( %x20-5F / %x61-10FFFF / "`" )
code-span-content-triple = *( %x20-5F / %x61-10FFFF / "`" / "``" )
; Links and images (left-factored by opening bracket)
; An image is a link form prefixed with "!"; both share link-destination.
link = "[" link-content "]" link-destination
image = "!" "[" alt-text "]" link-destination
link-content = 1*( escaped-char / plain-text-no-bracket )
alt-text = 1*( escaped-char / plain-text-no-bracket )
; Either an inline "(url "title")" destination or a "[reference]" id.
link-destination = "(" url [ WSP title ] ")" /
                   "[" reference-id "]"
reference-id = 1*( ALPHA / DIGIT / WSP )
; %x29 is ")" and must be excluded so the closing parenthesis of
; link-destination terminates the URL; the range starts at %x21 to exclude
; space.
url = 1*( %x21-28 / %x2A-10FFFF ) ; Everything except space and )
title = DQUOTE title-content-dquote DQUOTE /
        "'" title-content-squote "'" /
        "(" title-content-paren ")"
title-content-dquote = *( %x20-21 / %x23-10FFFF ) ; Everything except "
title-content-squote = *( %x20-26 / %x28-10FFFF ) ; Everything except '
title-content-paren = *( %x20-28 / %x2A-10FFFF ) ; Everything except )
; Inline HTML
; After the leading "<", either a tag (name, optional attributes, optional
; "/" and closing ">") or an entity name terminated by ";".
; NOTE(review): the entity alternative is reached after "<", not "&", so it
; matches text such as "<amp;"; confirm this is intended.
html-inline = "<" html-inline-content
html-inline-content = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" /
entity-name ";"
entity-name = 1*( ALPHA / DIGIT )
; HTML attributes and tag content
; Tag and attribute names start with a letter and continue with letters,
; digits, "-" or ":".
tag-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
; An attribute is a name with an optional "=value".
attribute = attribute-name [ "=" attribute-value ]
attribute-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
; A value is double-quoted, single-quoted, or unquoted.
attribute-value = DQUOTE attribute-value-dquote DQUOTE /
"'" attribute-value-squote "'" /
attribute-value-unquoted
attribute-value-dquote = *( %x20-21 / %x23-10FFFF ) ; Everything except "
attribute-value-squote = *( %x20-26 / %x28-10FFFF ) ; Everything except '
; The unquoted ranges skip space (%x20), "#" (%x23), "'" (%x27) and ">" (%x3E).
attribute-value-unquoted = 1*( %x21-22 / %x24-26 / %x28-2F / %x30-3D / %x3F-10FFFF )
html-content = *( %x20-3B / %x3D-10FFFF ) ; Everything except <
; A backslash followed by any visible ASCII character.
escaped-char = "\" VCHAR
; Plain text variations (to avoid conflicts)
plain-text = 1*plain-text-char
; From %x20 upward, everything except: " (%x22), * (%x2A), < (%x3C),
; > (%x3E), [ (%x5B), ] (%x5D), ` (%x60), { (%x7B) and } (%x7D).
plain-text-char = %x20-21 / %x23-29 / %x2B-2F / %x30-3B / %x3D /
                  %x3F-40 / %x41-5A / %x5C / %x5E-5F / %x61-7A /
                  %x7C / %x7E-10FFFF
plain-text-no-asterisk = %x20-29 / %x2B-10FFFF
plain-text-no-underscore = %x20-5E / %x60-10FFFF
plain-text-no-double-asterisk = 1*( %x20-29 / %x2B-10FFFF ) ; Simplified
plain-text-no-double-underscore = 1*( %x20-5E / %x60-10FFFF ) ; Simplified
; Excludes both "[" (%x5B) and "]" (%x5D) so that the "]" closing
; link-content / alt-text is never consumed as text; "\" (%x5C) lies between
; them and stays allowed.
plain-text-no-bracket = %x20-5A / %x5C / %x5E-10FFFF
; Basic definitions
; These restate the RFC 5234 core rules the grammar relies on, so the file
; can be read on its own. CHAR is included because html-comment-block uses it.
line-ending = CRLF / LF / CR
WSP = SP / HTAB
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
DIGIT = %x30-39 ; 0-9
SP = %x20 ; Space
HTAB = %x09 ; Horizontal tab
CR = %x0D ; Carriage return
LF = %x0A ; Line feed
CRLF = CR LF ; Internet standard newline
DQUOTE = %x22 ; Double quote
VCHAR = %x21-7E ; Visible ASCII characters
CHAR = %x01-7F ; Any 7-bit US-ASCII character, excluding NUL