trying to figure out how the hell to make this work for new lines but now the header seems to include the follow on line

removing abnf, it isn't representative so no point
2025-11-13 17:01:08 -05:00 · 2025-11-13 17:00:47 -05:00
2 changed files with 28 additions and 191 deletions
--- a/app/Markdown.hs
+++ b/app/Markdown.hs
@ -12,7 +12,7 @@ import Data.Text (Text)
 import qualified Data.Text as T
 import Data.Void (Void)
 import IR
-import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try)
+import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try, (<?>))
 import qualified Text.Megaparsec as MP
 import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string)
@ -50,16 +50,16 @@ document = Doc <$> many element <* eof
 element :: Parser Element
 element =
  choice
-    [ try headingBlock,
+    [ try headingBlock <?> "Element Heading",
-      try fencedCodeBlock,
+      try fencedCodeBlock <?> "Fenced Code Block",
-      try indentedCodeBlock,
+      try indentedCodeBlock <?> "Indented Code Block",
-      try blockquoteBlock,
+      try blockquoteBlock <?> "BlockQuote",
-      try unorderedListBlock,
+      try unorderedListBlock <?> "Unordered List",
-      try orderedListBlock,
+      try orderedListBlock <?> "Ordered List",
-      try horizontalRuleBlock,
+      try horizontalRuleBlock <?> "Horizontal Rule",
-      try htmlBlock,
+      try htmlBlock <?> "HTML Block",
-      try blankLines, -- Consume blank lines but don't add to AST
+      try blankLines <?> "Blank Lines", -- Consume blank lines but don't add to AST
-      paragraphBlock
+      paragraphBlock <?> "Paragarph"
    ]
 -- Blank lines (consumed but not stored)
@ -77,11 +77,11 @@ blankLine = do
 -- Heading Block
 headingBlock :: Parser Element
 headingBlock = do
-  hashes <- some (char '#')
+  hashes <- some (char '#') <?> "Heading Hashes"
  let level = length hashes
-  guard (level <= 6)
+  guard (level <= 6) <?> "Higher than level 6"
-  many (char ' ' <|> char '\t')
+  many (char ' ' <|> char '\t') <?> "Pre-Text Whitespace"
-  content <- manyTill inlineElement (try lineEnding)
+  content <- manyTill (inlineElement <?> "Header Text") (try lineEnding <?> "Header Ending")
  pure $ Heading $ H level content
 -- Fenced Code Block
@ -246,15 +246,15 @@ paragraphBlock = do
 inlineElement :: Parser InlineText
 inlineElement =
  choice
-    [ try strong,
+    [ try strong <?> "Inline Strong Text",
-      try emphasis,
+      try emphasis <?> "Inline Italic Text",
-      try crossedText,
+      try crossedText <?> "Inline Crossed Text",
-      try codeSpan,
+      try codeSpan <?> "Inline Code",
-      try image,
+      try image <?> "Inline Image",
-      try link,
+      try link <?> "Inline Link",
-      try htmlInline,
+      try htmlInline <?> "Inline HTML",
-      try escapedChar,
+      try escapedChar <?> "Escaped Character",
-      plainText
+      plainText <?> "Inline Plain Text"
    ]
 -- Strong (Bold)
@ -314,8 +314,8 @@ inlineElementNo c =
 plainTextNo :: [Char] -> Parser InlineText
 plainTextNo disallow = do
-  firstChar <- noneOf disallow
+  firstChar <- noneOf disallow <?> "Plain Text Initial Disallow"
-  remChars <- some $ plainTextCharNo disallow <* notFollowedBy lineEnding
+  remChars <- manyTill (plainTextCharNo disallow) lineEnding <?> "Remaining Characters"
  pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars
  where
    wspHandler '\n' = ' '
@ -422,7 +422,7 @@ escapedChar = do
 -- Plain Text
 plainText :: Parser InlineText
-plainText = plainTextNo []
+plainText = plainTextNo [] <?> "Baseline Plain Text"
 plainTextBaseDisallow :: [Char]
 plainTextBaseDisallow = "[~`_*<"
@ -441,7 +441,7 @@ plainTextNoBracket = plainTextNo "[]"
 -- Helper Parsers
 lineEnding :: Parser ()
-lineEnding = void $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")
+lineEnding = void (try $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")) <|> eof
 wsParser :: Parser ()
 wsParser = void $ some (char ' ' <|> char '\t')
--- a/markdown.abnf
+++ b/markdown.abnf
@ -1,163 +0,0 @@
 ; Left-Factored ABNF Grammar for Markdown with Embedded HTML
 ; Based on RFC 5234 (ABNF) syntax - no negative lookahead operators
 ; Top-level document structure
 document = *( block-element / CRLF )
 ; Block-level elements (true left-factoring with unique prefixes)
 block-element = heading-block / 
                blockquote-block / 
                unordered-list-block /
                ordered-list-block /
                fenced-code-block /
                indented-code-block /
                horizontal-rule-block /
                html-block /
                paragraph-block
 ; Headings (must start with #)
 heading-block = "#" heading-rest
 heading-rest = [ 1*5"#" ] *WSP *( inline-element ) line-ending
 ; Blockquotes (must start with >)
 blockquote-block = ">" blockquote-rest
 blockquote-rest = [ WSP ] *( inline-element ) line-ending
                 *( ">" [ WSP ] *( inline-element ) line-ending )
 ; Unordered lists (must start with *, -, or + followed by space)
 unordered-list-block = unordered-list-item 1*( unordered-list-item )
 unordered-list-item = unordered-marker list-item-content
 unordered-marker = ( "*" / "-" / "+" ) WSP
 list-item-content = *( inline-element ) line-ending
                   *( list-continuation )
 list-continuation = 2*WSP *( inline-element ) line-ending
 ; Ordered lists (must start with digit(s) followed by . and space)
 ordered-list-block = ordered-list-item 1*( ordered-list-item )
 ordered-list-item = ordered-marker list-item-content
 ordered-marker = 1*DIGIT "." WSP
 ; Fenced code blocks (must start with ``` or ~~~)
 fenced-code-block = code-fence fenced-code-rest
 fenced-code-rest = [ language-info ] line-ending
                  *( code-line )
                  code-fence line-ending
 code-fence = "```" / "~~~"
 language-info = 1*( ALPHA / DIGIT / "-" / "+" / "." )
 code-line = *VCHAR line-ending
 ; Indented code blocks (must start with 4+ spaces followed by non-space)
 indented-code-block = 4*WSP VCHAR indented-code-rest
 indented-code-rest = *VCHAR line-ending 
                    *( 4*WSP *VCHAR line-ending )
 ; Horizontal rules (must start with 3+ of same character)
 horizontal-rule-block = hr-pattern hr-rest
 hr-pattern = 3*"*" / 3*"-" / 3*"_"
 hr-rest = *( WSP / "*" / "-" / "_" ) line-ending
 ; HTML blocks (must start with <)
 html-block = "<" html-block-rest
 html-block-rest = html-tag-block / html-comment-block / html-declaration-block
 html-tag-block = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">"
                *( html-content )
                [ "</" tag-name ">" ]
                line-ending
 html-comment-block = "!--" *( CHAR ) "-->" line-ending
 html-declaration-block = "!" 1*ALPHA *( WSP / VCHAR ) ">" line-ending
 ; Paragraphs (everything else that doesn't match above patterns)
 paragraph-block = paragraph-text-line 1*( paragraph-continuation )
 paragraph-text-line = paragraph-first-element *( inline-element ) line-ending
 paragraph-continuation = paragraph-first-element *( inline-element ) line-ending
 ; First element of paragraph - anything that's not a block starter
 paragraph-first-element = emphasis / strong / code-span / link / image / 
                         html-inline  / escaped-char / plain-text
 ; Inline elements
 inline-element = emphasis / strong / code-span / link / image / 
                html-inline / escaped-char / plain-text
 ; Emphasis and strong (left-factored by delimiter)
 emphasis = emphasis-asterisk / emphasis-underscore
 strong = strong-asterisk / strong-underscore
 emphasis-asterisk = "*" emphasis-content-asterisk "*"
 emphasis-underscore = "_" emphasis-content-underscore "_"
 strong-asterisk = "**" strong-content-asterisk "**"
 strong-underscore = "__" strong-content-underscore "__"
 ; Content within emphasis/strong (simplified - no nesting for clarity)
 emphasis-content-asterisk = 1*( escaped-char / plain-text-no-asterisk )
 emphasis-content-underscore = 1*( escaped-char / plain-text-no-underscore )
 strong-content-asterisk = 1*( escaped-char / plain-text-no-double-asterisk )
 strong-content-underscore = 1*( escaped-char / plain-text-no-double-underscore )
 ; Code spans
 code-span = "`" code-span-content "`" /
           "``" code-span-content-double "``" /
           "```" code-span-content-triple "```"
 code-span-content = 1*( %x20-5F / %x61-10FFFF )  ; Everything except `
 code-span-content-double = *( %x20-5F / %x61-10FFFF / "`" )
 code-span-content-triple = *( %x20-5F / %x61-10FFFF / "`" / "``" )
 ; Links and images (left-factored by opening bracket)
 link = "[" link-content "]" link-destination
 image = "!" "[" alt-text "]" link-destination
 link-content = 1*( escaped-char / plain-text-no-bracket )
 alt-text = 1*( escaped-char / plain-text-no-bracket )
 link-destination = "(" url [ WSP title ] ")" / 
                  "[" reference-id "]"
 reference-id = 1*( ALPHA / DIGIT / WSP )
 url = 1*( %x21-29 / %x2B-10FFFF )  ; Everything except space and )
 title = DQUOTE title-content-dquote DQUOTE / 
        "'" title-content-squote "'" /
        "(" title-content-paren ")"
 title-content-dquote = *( %x20-21 / %x23-10FFFF )  ; Everything except "
 title-content-squote = *( %x20-26 / %x28-10FFFF )  ; Everything except '
 title-content-paren = *( %x20-28 / %x2A-10FFFF )   ; Everything except )
 ; Inline HTML
 html-inline = "<" html-inline-content
 html-inline-content = tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" /
                     entity-name ";"
 entity-name = 1*( ALPHA / DIGIT )
 ; HTML attributes and tag content
 tag-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
 attribute = attribute-name [ "=" attribute-value ]
 attribute-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
 attribute-value = DQUOTE attribute-value-dquote DQUOTE /
                  "'" attribute-value-squote "'" /
                  attribute-value-unquoted
 attribute-value-dquote = *( %x20-21 / %x23-10FFFF )  ; Everything except "
 attribute-value-squote = *( %x20-26 / %x28-10FFFF )  ; Everything except '
 attribute-value-unquoted = 1*( %x21-22 / %x24-26 / %x28-2F / %x30-3D / %x3F-10FFFF )
 html-content = *( %x20-3B / %x3D-10FFFF )  ; Everything except <
 escaped-char = "\" VCHAR
 ; Plain text variations (to avoid conflicts)
 plain-text = 1*plain-text-char
 plain-text-char = %x20-21 / %x23-29 / %x2B-2F / %x30-3B / %x3D / 
                 %x3F-40 / %x41-5A / %x5C / %x5E-5F / %x61-7A / 
                 %x7C / %x7E-10FFFF
 plain-text-no-asterisk = %x20-29 / %x2B-10FFFF
 plain-text-no-underscore = %x20-5E / %x60-10FFFF
 plain-text-no-double-asterisk = 1*( %x20-29 / %x2B-10FFFF ) ; Simplified
 plain-text-no-double-underscore = 1*( %x20-5E / %x60-10FFFF ) ; Simplified
 plain-text-no-bracket = %x20-5A / %x5C-10FFFF
 ; Basic definitions
 line-ending = CRLF / LF / CR
 WSP = SP / HTAB
 ALPHA = %x41-5A / %x61-7A  ; A-Z / a-z  
 DIGIT = %x30-39     ; 0-9
 SP = %x20           ; Space
 HTAB = %x09         ; Horizontal tab
 CR = %x0D           ; Carriage return
 LF = %x0A           ; Line feed
 CRLF = CR LF        ; Internet standard newline
 DQUOTE = %x22       ; Double quote
 VCHAR = %x21-7E     ; Visible ASCII characters
Author	SHA1	Message	Date
Pagwin	bc0475fde4	trying to figure out how the hell to make this work for new lines but now the header seems to include the follow on line	2025-11-13 17:01:08 -05:00
Pagwin	f3e9c4c8b2	removing abnf, it isn't representative so no point	2025-11-13 17:00:47 -05:00