Compare commits

..

2 commits

2 changed files with 28 additions and 191 deletions

View file

@ -12,7 +12,7 @@ import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Void (Void) import Data.Void (Void)
import IR import IR
import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try) import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try, (<?>))
import qualified Text.Megaparsec as MP import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string) import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string)
@ -50,16 +50,16 @@ document = Doc <$> many element <* eof
element :: Parser Element element :: Parser Element
element = element =
choice choice
[ try headingBlock, [ try headingBlock <?> "Element Heading",
try fencedCodeBlock, try fencedCodeBlock <?> "Fenced Code Block",
try indentedCodeBlock, try indentedCodeBlock <?> "Indented Code Block",
try blockquoteBlock, try blockquoteBlock <?> "BlockQuote",
try unorderedListBlock, try unorderedListBlock <?> "Unordered List",
try orderedListBlock, try orderedListBlock <?> "Ordered List",
try horizontalRuleBlock, try horizontalRuleBlock <?> "Horizontal Rule",
try htmlBlock, try htmlBlock <?> "HTML Block",
try blankLines, -- Consume blank lines but don't add to AST try blankLines <?> "Blank Lines", -- Consume blank lines but don't add to AST
paragraphBlock paragraphBlock <?> "Paragarph"
] ]
-- Blank lines (consumed but not stored) -- Blank lines (consumed but not stored)
@ -77,11 +77,11 @@ blankLine = do
-- Heading Block -- Heading Block
headingBlock :: Parser Element headingBlock :: Parser Element
headingBlock = do headingBlock = do
hashes <- some (char '#') hashes <- some (char '#') <?> "Heading Hashes"
let level = length hashes let level = length hashes
guard (level <= 6) guard (level <= 6) <?> "Higher than level 6"
many (char ' ' <|> char '\t') many (char ' ' <|> char '\t') <?> "Pre-Text Whitespace"
content <- manyTill inlineElement (try lineEnding) content <- manyTill (inlineElement <?> "Header Text") (try lineEnding <?> "Header Ending")
pure $ Heading $ H level content pure $ Heading $ H level content
-- Fenced Code Block -- Fenced Code Block
@ -246,15 +246,15 @@ paragraphBlock = do
inlineElement :: Parser InlineText inlineElement :: Parser InlineText
inlineElement = inlineElement =
choice choice
[ try strong, [ try strong <?> "Inline Strong Text",
try emphasis, try emphasis <?> "Inline Italic Text",
try crossedText, try crossedText <?> "Inline Crossed Text",
try codeSpan, try codeSpan <?> "Inline Code",
try image, try image <?> "Inline Image",
try link, try link <?> "Inline Link",
try htmlInline, try htmlInline <?> "Inline HTML",
try escapedChar, try escapedChar <?> "Escaped Character",
plainText plainText <?> "Inline Plain Text"
] ]
-- Strong (Bold) -- Strong (Bold)
@ -314,8 +314,8 @@ inlineElementNo c =
plainTextNo :: [Char] -> Parser InlineText plainTextNo :: [Char] -> Parser InlineText
plainTextNo disallow = do plainTextNo disallow = do
firstChar <- noneOf disallow firstChar <- noneOf disallow <?> "Plain Text Initial Disallow"
remChars <- some $ plainTextCharNo disallow <* notFollowedBy lineEnding remChars <- manyTill (plainTextCharNo disallow) lineEnding <?> "Remaining Characters"
pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars
where where
wspHandler '\n' = ' ' wspHandler '\n' = ' '
@ -422,7 +422,7 @@ escapedChar = do
-- Plain Text -- Plain Text
plainText :: Parser InlineText plainText :: Parser InlineText
plainText = plainTextNo [] plainText = plainTextNo [] <?> "Baseline Plain Text"
plainTextBaseDisallow :: [Char] plainTextBaseDisallow :: [Char]
plainTextBaseDisallow = "[~`_*<" plainTextBaseDisallow = "[~`_*<"
@ -441,7 +441,7 @@ plainTextNoBracket = plainTextNo "[]"
-- Helper Parsers -- Helper Parsers
lineEnding :: Parser () lineEnding :: Parser ()
lineEnding = void $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r") lineEnding = void (try $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")) <|> eof
wsParser :: Parser () wsParser :: Parser ()
wsParser = void $ some (char ' ' <|> char '\t') wsParser = void $ some (char ' ' <|> char '\t')

View file

@ -1,163 +0,0 @@
; Left-Factored ABNF Grammar for Markdown with Embedded HTML
; Based on RFC 5234 (ABNF) syntax - no negative lookahead operators
; Top-level document structure
; A document is any sequence of block elements and blank lines. Blank lines
; are matched with line-ending (CRLF / LF / CR) instead of CRLF alone, so that
; LF-only and CR-only input is accepted between blocks exactly as it is at the
; end of every block.
document = *( block-element / line-ending )
; Block-level elements (true left-factoring with unique prefixes)
; One alternative per block kind; paragraph-block is listed last.
; NOTE(review): several alternatives can open with the same character (for
; example "*" or "-" may start an unordered-marker or an hr-pattern) - confirm
; that the "unique prefixes" claim holds for whatever consumes this grammar.
block-element = heading-block /
blockquote-block /
unordered-list-block /
ordered-list-block /
fenced-code-block /
indented-code-block /
horizontal-rule-block /
html-block /
paragraph-block
; Headings (must start with #)
; heading-block: one mandatory "#" followed by heading-rest.
; heading-rest: up to five further "#" (one to six in total), optional
; whitespace, zero or more inline elements, then a terminating line-ending.
heading-block = "#" heading-rest
heading-rest = [ 1*5"#" ] *WSP *( inline-element ) line-ending
; Blockquotes (must start with >)
; The first quoted line is ">" plus the head of blockquote-rest; the tail of
; blockquote-rest repeats the same line shape (">", at most one WSP, inline
; elements, line-ending) for each following quoted line.
blockquote-block = ">" blockquote-rest
blockquote-rest = [ WSP ] *( inline-element ) line-ending
*( ">" [ WSP ] *( inline-element ) line-ending )
; Unordered lists (must start with *, -, or + followed by space)
; A list is one item followed by zero or more further items, so a list with a
; single item is a valid unordered-list-block.
unordered-list-block = unordered-list-item *( unordered-list-item )
; An item is its marker followed by the item content.
unordered-list-item = unordered-marker list-item-content
; Marker character followed by exactly one WSP (space or tab).
unordered-marker = ( "*" / "-" / "+" ) WSP
; First line of the item, then any number of continuation lines.
list-item-content = *( inline-element ) line-ending
*( list-continuation )
; Continuation line: at least two WSP, inline elements, line-ending.
list-continuation = 2*WSP *( inline-element ) line-ending
; Ordered lists (must start with digit(s) followed by . and space)
; A list is one item followed by zero or more further items, so a list with a
; single item is a valid ordered-list-block.
ordered-list-block = ordered-list-item *( ordered-list-item )
; An item is its marker followed by the item content.
ordered-list-item = ordered-marker list-item-content
; One or more digits, a literal ".", then exactly one WSP (space or tab).
ordered-marker = 1*DIGIT "." WSP
; Fenced code blocks (must start with ``` or ~~~)
; Opening fence, optional language tag, the code lines, then a closing fence
; on its own line.
; NOTE(review): the opening and closing code-fence are matched independently,
; so this grammar does not require them to use the same fence characters.
fenced-code-block = code-fence fenced-code-rest
fenced-code-rest = [ language-info ] line-ending
*( code-line )
code-fence line-ending
code-fence = "```" / "~~~"
; Language tag: letters, digits, "-", "+" and "." only.
language-info = 1*( ALPHA / DIGIT / "-" / "+" / "." )
; VCHAR is %x21-7E and therefore excludes space and tab; WSP is admitted
; explicitly so that code lines containing whitespace or indentation match.
code-line = *( WSP / VCHAR ) line-ending
; Indented code blocks (must start with 4+ spaces followed by non-space)
; First line: at least four WSP, one visible character, then the rest.
indented-code-block = 4*WSP VCHAR indented-code-rest
; Remainder of the first line followed by any number of further lines that are
; indented by at least four WSP. VCHAR is %x21-7E and excludes space and tab,
; so WSP is admitted explicitly to let code lines contain interior whitespace.
indented-code-rest = *( WSP / VCHAR ) line-ending
*( 4*WSP *( WSP / VCHAR ) line-ending )
; Horizontal rules (must start with 3+ of same character)
; hr-pattern: three or more "*", or three or more "-", or three or more "_".
; hr-rest: any further mix of WSP, "*", "-" and "_" up to the line-ending; the
; trailing characters are not required to match the one used in hr-pattern.
horizontal-rule-block = hr-pattern hr-rest
hr-pattern = 3*"*" / 3*"-" / 3*"_"
hr-rest = *( WSP / "*" / "-" / "_" ) line-ending
; HTML blocks (must start with <)
; After the leading "<" the block is one of: a tag, a comment ("!--" ... "-->")
; or a declaration ("!" followed by letters), each ended by a line-ending.
html-block = "<" html-block-rest
html-block-rest = html-tag-block / html-comment-block / html-declaration-block
; Tag form: name, attributes, optional "/" before ">", then content and an
; optional closing tag. The closing tag-name is matched independently of the
; opening one, so the grammar does not force the two names to agree.
html-tag-block = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">"
*( html-content )
[ "</" tag-name ">" ]
line-ending
; NOTE(review): CHAR is not defined in this file - presumably the RFC 5234
; core rule (%x01-7F); confirm.
html-comment-block = "!--" *( CHAR ) "-->" line-ending
html-declaration-block = "!" 1*ALPHA *( WSP / VCHAR ) ">" line-ending
; Paragraphs (everything else that doesn't match above patterns)
; A paragraph is one text line followed by zero or more continuation lines, so
; a single-line paragraph is a valid paragraph-block.
paragraph-block = paragraph-text-line *( paragraph-continuation )
; Both line rules have the same shape: a first element, further inline
; elements, and a terminating line-ending.
paragraph-text-line = paragraph-first-element *( inline-element ) line-ending
paragraph-continuation = paragraph-first-element *( inline-element ) line-ending
; First element of paragraph - anything that's not a block starter
; NOTE(review): the alternatives below are the same list as inline-element, so
; nothing here excludes block-starting characters - confirm the intent.
paragraph-first-element = emphasis / strong / code-span / link / image /
html-inline / escaped-char / plain-text
; Inline elements
; Every construct that may appear inside a line of a heading, blockquote,
; list item or paragraph; plain-text is the last alternative.
inline-element = emphasis / strong / code-span / link / image /
html-inline / escaped-char / plain-text
; Emphasis and strong (left-factored by delimiter)
; Each construct has an asterisk form and an underscore form.
emphasis = emphasis-asterisk / emphasis-underscore
strong = strong-asterisk / strong-underscore
; Emphasis uses a single delimiter character on each side; strong uses the
; delimiter doubled on each side.
emphasis-asterisk = "*" emphasis-content-asterisk "*"
emphasis-underscore = "_" emphasis-content-underscore "_"
strong-asterisk = "**" strong-content-asterisk "**"
strong-underscore = "__" strong-content-underscore "__"
; Content within emphasis/strong (simplified - no nesting for clarity)
; Content is one or more escaped characters or text runs that exclude the
; construct's own delimiter character.
emphasis-content-asterisk = 1*( escaped-char / plain-text-no-asterisk )
emphasis-content-underscore = 1*( escaped-char / plain-text-no-underscore )
strong-content-asterisk = 1*( escaped-char / plain-text-no-double-asterisk )
strong-content-underscore = 1*( escaped-char / plain-text-no-double-underscore )
; Code spans
; Three forms, delimited by one, two or three backticks on each side.
code-span = "`" code-span-content "`" /
"``" code-span-content-double "``" /
"```" code-span-content-triple "```"
; %x60 is the backtick. The single-backtick form needs at least one character
; and admits no backtick; the double and triple forms may be empty and admit
; backticks in their content. All three exclude characters below %x20.
code-span-content = 1*( %x20-5F / %x61-10FFFF ) ; Everything except `
code-span-content-double = *( %x20-5F / %x61-10FFFF / "`" )
code-span-content-triple = *( %x20-5F / %x61-10FFFF / "`" / "``" )
; Links and images (left-factored by opening bracket)
; link: bracketed text followed by a destination.
link = "[" link-content "]" link-destination
; image: same shape as link, with a leading "!" and alt text in the brackets.
image = "!" "[" alt-text "]" link-destination
; Bracketed text is one or more escaped characters or non-bracket characters.
link-content = 1*( escaped-char / plain-text-no-bracket )
alt-text = 1*( escaped-char / plain-text-no-bracket )
; Destination: inline "(url)" with an optional title, or a "[reference-id]".
link-destination = "(" url [ WSP title ] ")" /
"[" reference-id "]"
reference-id = 1*( ALPHA / DIGIT / WSP )
; %x29 is ")" and %x2A is "*": the ranges stop at %x28 and resume at %x2A so
; that ")" is excluded (it terminates link-destination) while "*" stays legal.
url = 1*( %x21-28 / %x2A-10FFFF ) ; Everything except space and )
; A title is wrapped in double quotes, single quotes or parentheses; each
; content rule excludes only its own closing delimiter.
title = DQUOTE title-content-dquote DQUOTE /
"'" title-content-squote "'" /
"(" title-content-paren ")"
title-content-dquote = *( %x20-21 / %x23-10FFFF ) ; Everything except "
title-content-squote = *( %x20-26 / %x28-10FFFF ) ; Everything except '
title-content-paren = *( %x20-28 / %x2A-10FFFF ) ; Everything except )
; Inline HTML
; After the leading "<", the content is either a tag (name, attributes,
; optional "/" and closing ">") or an entity-name terminated by ";".
; NOTE(review): as written an entity is introduced by "<" rather than "&" -
; confirm whether that is intended.
html-inline = "<" html-inline-content
html-inline-content = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" /
entity-name ";"
entity-name = 1*( ALPHA / DIGIT )
; HTML attributes and tag content
; Tag and attribute names: a letter followed by letters, digits, "-" or ":".
tag-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
; An attribute is a name with an optional "=" value.
attribute = attribute-name [ "=" attribute-value ]
attribute-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
; A value is double-quoted, single-quoted or unquoted; the quoted forms
; exclude only their own closing quote and may be empty.
attribute-value = DQUOTE attribute-value-dquote DQUOTE /
"'" attribute-value-squote "'" /
attribute-value-unquoted
attribute-value-dquote = *( %x20-21 / %x23-10FFFF ) ; Everything except "
attribute-value-squote = *( %x20-26 / %x28-10FFFF ) ; Everything except '
; The ranges below leave out space (%x20), "#" (%x23), "'" (%x27) and ">"
; (%x3E) while still admitting DQUOTE (%x22), "<" (%x3C) and "=" (%x3D).
; NOTE(review): confirm that this is the intended exclusion set.
attribute-value-unquoted = 1*( %x21-22 / %x24-26 / %x28-2F / %x30-3D / %x3F-10FFFF )
html-content = *( %x20-3B / %x3D-10FFFF ) ; Everything except <
; A backslash followed by any single visible ASCII character.
escaped-char = "\" VCHAR
; Plain text variations (to avoid conflicts)
; plain-text: one or more characters from plain-text-char.
plain-text = 1*plain-text-char
; The ranges below leave out DQUOTE (%x22), "*" (%x2A), "<" (%x3C), ">" (%x3E),
; "[" (%x5B), "]" (%x5D), the backtick (%x60), "{" (%x7B), "}" (%x7D) and
; everything below %x20.
plain-text-char = %x20-21 / %x23-29 / %x2B-2F / %x30-3B / %x3D /
%x3F-40 / %x41-5A / %x5C / %x5E-5F / %x61-7A /
%x7C / %x7E-10FFFF
; The -no-asterisk, -no-underscore and -no-bracket rules match ONE character;
; the -no-double-* rules match a run of one or more characters.
plain-text-no-asterisk = %x20-29 / %x2B-10FFFF
plain-text-no-underscore = %x20-5E / %x60-10FFFF
plain-text-no-double-asterisk = 1*( %x20-29 / %x2B-10FFFF ) ; Simplified
plain-text-no-double-underscore = 1*( %x20-5E / %x60-10FFFF ) ; Simplified
; Excludes both "[" (%x5B) and "]" (%x5D); "\" (%x5C), which sits between
; them, stays allowed. Bracketed link text and alt text are closed by "]", so
; the closing bracket must not be part of the text itself.
plain-text-no-bracket = %x20-5A / %x5C / %x5E-10FFFF
; Basic definitions
; line-ending accepts any of the three newline conventions.
line-ending = CRLF / LF / CR
; The remaining rules are single-character (or, for CRLF, two-character)
; terminals given as hexadecimal code points.
WSP = SP / HTAB
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
DIGIT = %x30-39 ; 0-9
SP = %x20 ; Space
HTAB = %x09 ; Horizontal tab
CR = %x0D ; Carriage return
LF = %x0A ; Line feed
CRLF = CR LF ; Internet standard newline
DQUOTE = %x22 ; Double quote
; VCHAR does not include space (%x20) or tab (%x09).
VCHAR = %x21-7E ; Visible ASCII characters