realized there was backtracking and didn't want to backtrack so asked LLM for left factored ABNF

This commit is contained in:
Pagwin 2025-08-11 20:58:30 -04:00
parent 0e4613f27c
commit 7c13bdc4af
No known key found for this signature in database
GPG key ID: 81137023740CA260
4 changed files with 152 additions and 467 deletions

View file

@ -1,3 +1,4 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RankNTypes #-}
@ -8,7 +9,7 @@ import IR
import Text.Parsec import Text.Parsec
import Text.Parsec.Combinator import Text.Parsec.Combinator
type Parser a = forall s u m t. (Stream s m t) => ParsecT s u m a type Parser a = forall s u m. (Stream s m Char) => ParsecT s u m a
markdownParser :: Parser Document markdownParser :: Parser Document
markdownParser = Doc <$> many block markdownParser = Doc <$> many block
@ -35,7 +36,15 @@ htmlBlock :: Parser Element
htmlBlock = pure $ HTML $ Raw "" htmlBlock = pure $ HTML $ Raw ""
paragraph :: Parser Element paragraph :: Parser Element
paragraph = pure $ Paragraph $ P "" paragraph = do
first_text <- inlineText
rem_text <- many (endOfLine >> inlineText)
pure $ Paragraph $ P []
inlineText :: Parser InlineText
inlineText = choi
blankline :: Parser Element blankline :: Parser Element
blankline = pure $ BlankLine BL blankline = do
endOfLine
pure $ BlankLine BL

140
markdown.abnf Normal file
View file

@ -0,0 +1,140 @@
; Left-Factored ABNF Grammar for Markdown with Embedded HTML
; Based on RFC 5234 (ABNF) syntax
; Top-level document structure
document = *( block-element / CRLF )
; Block-level elements (left-factored by common prefixes)
block-element = heading / code-block / blockquote / list / horizontal-rule /
html-block / paragraph
; Headings (ATX-style: # ## ### etc.)
heading = heading-prefix heading-content
heading-prefix = 1*6"#" *WSP
heading-content = *( inline-element ) line-ending
; Code blocks (fenced with ``` or indented)
code-block = fenced-code-block / indented-code-block
fenced-code-block = code-fence [ language-info ] line-ending
*( code-line )
code-fence line-ending
code-fence = "```"
language-info = 1*( ALPHA / DIGIT / "-" / "+" / "." )
indented-code-block = 1*( indented-code-line )
indented-code-line = 4*WSP 1*VCHAR line-ending
code-line = *VCHAR line-ending
; Blockquotes
blockquote = 1*( blockquote-line )
blockquote-line = ">" [ WSP ] *( inline-element ) line-ending
; Lists (left-factored by list marker)
list = unordered-list / ordered-list
unordered-list = 1*( unordered-list-item )
ordered-list = 1*( ordered-list-item )
unordered-list-item = unordered-marker list-item-content
ordered-list-item = ordered-marker list-item-content
unordered-marker = ( "*" / "-" / "+" ) WSP
ordered-marker = 1*DIGIT "." WSP
list-item-content = *( inline-element ) line-ending
*( list-continuation )
list-continuation = 2*WSP *( inline-element ) line-ending
; Horizontal rule
horizontal-rule = hr-dashes / hr-asterisks / hr-underscores
hr-dashes = 3*( "-" ) *( WSP / "-" ) line-ending
hr-asterisks = 3*( "*" ) *( WSP / "*" ) line-ending
hr-underscores = 3*( "_" ) *( WSP / "_" ) line-ending
; HTML blocks
html-block = html-block-tag / html-comment-block / html-declaration
html-block-tag = "<" tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">"
*( html-content )
[ "</" tag-name ">" ]
line-ending
html-comment-block = "<!--" *( CHAR ) "-->" line-ending
html-declaration = "<!" 1*ALPHA *( WSP / VCHAR ) ">" line-ending
; Paragraphs (catch-all for regular text)
paragraph = paragraph-line 1*( paragraph-continuation )
paragraph-line = *( inline-element ) line-ending
paragraph-continuation = !block-element-start *( inline-element ) line-ending
block-element-start = "#" / ">" / ( 1*DIGIT "." WSP ) /
( ( "*" / "-" / "+" ) WSP ) /
"```" / 4*WSP / "<"
; Inline elements (left-factored by opening characters)
inline-element = emphasis / strong / code-span / link / image /
html-inline / line-break / escaped-char / plain-text
; Emphasis and strong (left-factored by delimiter)
emphasis = emphasis-asterisk / emphasis-underscore
strong = strong-asterisk / strong-underscore
emphasis-asterisk = "*" emphasis-content "*"
emphasis-underscore = "_" emphasis-content "_"
strong-asterisk = "**" strong-content "**"
strong-underscore = "__" strong-content "__"
emphasis-content = 1*( !( "*" / line-ending ) inline-element )
strong-content = 1*( !( "**" / line-ending ) inline-element )
; Code spans
code-span = code-delimiter code-span-content code-delimiter
code-delimiter = 1*"`"
code-span-content = 1*( !"`" CHAR )
; Links and images (left-factored by opening bracket)
link = link-reference / link-inline
image = image-reference / image-inline
link-reference = "[" link-text "]" "[" reference-id "]"
link-inline = "[" link-text "]" "(" url [ WSP title ] ")"
image-reference = "!" "[" alt-text "]" "[" reference-id "]"
image-inline = "!" "[" alt-text "]" "(" url [ WSP title ] ")"
link-text = 1*( !( "]" / line-ending ) inline-element )
alt-text = 1*( !( "]" / line-ending ) CHAR )
reference-id = 1*( !( "]" / line-ending ) CHAR )
url = 1*( !( WSP / ")" / line-ending ) CHAR )
title = ( DQUOTE title-content DQUOTE ) /
( "'" title-content "'" ) /
( "(" title-content ")" )
title-content = *( !( DQUOTE / "'" / ")" ) CHAR )
; Inline HTML
html-inline = html-tag-inline / html-entity
html-tag-inline = "<" tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">"
html-entity = "&" entity-name ";"
entity-name = 1*( ALPHA / DIGIT )
; HTML attributes and content
tag-name = ALPHA *( ALPHA / DIGIT / "-" / ":" )
attribute = attribute-name [ "=" attribute-value ]
attribute-name = ( ALPHA / "_" / ":" ) *( ALPHA / DIGIT / "-" / "_" / ":" / "." )
attribute-value = ( DQUOTE *( !DQUOTE CHAR ) DQUOTE ) /
( "'" *( !"'" CHAR ) "'" ) /
( 1*( !( WSP / ">" ) VCHAR ) )
html-content = *( !( "</" ) CHAR )
; Line breaks and escaped characters
line-break = hard-line-break / soft-line-break
hard-line-break = 2*WSP line-ending
soft-line-break = line-ending
escaped-char = "\" ( VCHAR / WSP )
; Plain text (everything else)
plain-text = 1*( !special-char CHAR )
special-char = "*" / "_" / "`" / "[" / "]" / "(" / ")" /
"<" / ">" / "#" / "+" / "-" / "." / "!" /
"&" / "\" / line-ending
; Basic definitions
line-ending = CRLF / LF / CR
WSP = SP / HTAB
CHAR = %x00-10FFFF ; Any Unicode character
VCHAR = %x21-7E ; Visible ASCII characters
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
DIGIT = %x30-39 ; 0-9
SP = %x20 ; Space
HTAB = %x09 ; Horizontal tab
CR = %x0D ; Carriage return
LF = %x0A ; Line feed
CRLF = CR LF ; Internet standard newline
DQUOTE = %x22 ; Double quote

View file

@ -1,179 +0,0 @@
(* Markdown EBNF Grammar with HTML Support *)
document = { block } ;
block = heading
| code_block
| quote_block
| list
| table
| html_block
| paragraph
| blank_line ;
(* Headings *)
heading = atx_heading ;
atx_heading = "#" { "#" } [ " " ] inline_text newline ;
(* Code Blocks *)
code_block = fenced_code_block;
fenced_code_block = "```" [ language_identifier ] newline
{ code_line }
"```" newline ;
code_line = { character - newline } newline ;
language_identifier = { letter | digit | "-" | "+" } ;
(* Quote Blocks *)
quote_block = { ">" [ " " ] ( inline_text | "" ) newline } ;
(* Lists *)
list = unordered_list | ordered_list ;
unordered_list = { unordered_list_item } ;
ordered_list = { ordered_list_item } ;
unordered_list_item = [ " " { " " } ] ( "*" | "+" | "-" ) " " inline_text newline
{ continuation_line } ;
ordered_list_item = [ " " { " " } ] digit { digit } "." " " inline_text newline
{ continuation_line } ;
continuation_line = " " inline_text newline ;
(* Tables *)
table = table_header table_separator { table_row } ;
table_header = "|" { table_cell "|" } newline ;
table_separator = "|" { table_align_spec "|" } newline ;
table_row = "|" { table_cell "|" } newline ;
table_cell = { character - ( "|" | newline ) } ;
table_align_spec = [ ":" ] "-" { "-" } [ ":" ] ;
(* HTML Support *)
html_block = html_block_element;
html_block_element = html_open_tag { html_content } html_close_tag newline
| html_self_closing_tag newline
| html_void_tag newline ;
html_open_tag = "<" html_tag_name { " " html_attribute } [ " " ] ">" ;
html_close_tag = "</" html_tag_name ">" ;
html_self_closing_tag = "<" html_tag_name { " " html_attribute } [ " " ] "/>" ;
html_void_tag = "<" html_void_tag_name { " " html_attribute } [ " " ] [ "/" ] ">" ;
html_void_tag_name = "area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input"
| "link" | "meta" | "param" | "source" | "track" | "wbr" ;
html_tag_name = letter { letter | digit | "-" | "_" | ":" | "." } ;
html_attribute = html_attribute_name [ "=" html_attribute_value ] ;
html_attribute_name = ( letter | "_" | ":" ) { letter | digit | "-" | "_" | ":" | "." } ;
html_attribute_value = html_quoted_value | html_unquoted_value ;
html_quoted_value = ( '"' { character - '"' } '"' )
| ( "'" { character - "'" } "'" ) ;
html_unquoted_value = { character - ( " " | "\t" | "\n" | "\r" | ">" | "/" | "=" ) } ;
html_content = html_text | html_element | html_comment | html_processing_instruction ;
html_text = { character - "<" } ;
html_element = html_open_tag { html_content } html_close_tag
| html_self_closing_tag
| html_void_tag ;
html_comment = "<!--" { character - ( "-" "-" ">" ) | "-" character - ( "-" ">" ) | "-" "-" character - ">" } "-->" ;
(* Paragraphs *)
paragraph = inline_text { newline inline_text } newline ;
(* Inline Elements *)
inline_text = { inline_element } ;
inline_element = emphasis
| strong
| code_span
| link
| image
| autolink
| line_break
| plain_text ;
emphasis = ( "*" non_asterisk_text "*" )
| ( "_" non_underscore_text "_" ) ;
strong = ( "**" non_asterisk_text "**" )
| ( "__" non_underscore_text "__" ) ;
code_span = "`" { "`" } non_backtick_text { "`" } "`" ;
link = "[" link_text "]" "(" link_url [ " " link_title ] ")" ;
image = "!" "[" alt_text "]" "(" image_url [ " " image_title ] ")" ;
autolink = "<" ( url | email ) ">" ;
line_break = " " newline | "\\" newline ;
(* Text Content *)
plain_text = { character - special_char } ;
non_asterisk_text = { character - "*" } ;
non_underscore_text = { character - "_" } ;
non_backtick_text = { character - "`" } ;
link_text = { character - ( "[" | "]" ) } ;
alt_text = { character - ( "[" | "]" ) } ;
link_url = { character - ( "(" | ")" | " " ) } ;
image_url = { character - ( "(" | ")" | " " ) } ;
link_title = quote_string ;
image_title = quote_string ;
quote_string = ( '"' { character - '"' } '"' )
| ( "'" { character - "'" } "'" ) ;
url = "http" [ "s" ] "://" { character - ">" } ;
email = { character - ( "@" | ">" ) } "@" { character - ">" } ;
(* Utilities *)
blank_line = newline ;
special_char = "*" | "_" | "`" | "[" | "]" | "(" | ")" | "#" | ">" | "|" | "!" | "\\" | "<" ;
newline = "\n" | "\r\n" ;
character = letter | digit | symbol | " " ;
letter = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m"
| "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
| "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M"
| "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ;
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
symbol = "!" | "@" | "#" | "$" | "%" | "^" | "&" | "*" | "(" | ")" | "-" | "_" | "="
| "+" | "[" | "]" | "{" | "}" | "\\" | "|" | ";" | ":" | "'" | '"' | "," | "."
| "<" | ">" | "/" | "?" | "~" | "`" ;

View file

@ -1,285 +0,0 @@
(* reStructuredText EBNF Grammar *)
document = { block } ;
block = section
| transition
| paragraph
| literal_block
| line_block
| block_quote
| doctest_block
| table
| bullet_list
| enumerated_list
| definition_list
| field_list
| option_list
| directive
| comment
| substitution_definition
| target
| blank_line ;
(* Sections *)
section = section_title section_underline [ section_overline ] ;
section_title = inline_text newline ;
section_underline = section_adornment newline ;
section_overline = section_adornment newline ;
section_adornment = adornment_char { adornment_char } ;
adornment_char = "!" | '"' | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," | "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "[" | "\\" | "]" | "^" | "_" | "`" | "{" | "|" | "}" | "~" ;
(* Transitions *)
transition = transition_marker newline ;
transition_marker = transition_char { transition_char } ;
transition_char = adornment_char ;
(* Paragraphs *)
paragraph = inline_text { newline inline_text } newline ;
(* Literal Blocks *)
literal_block = literal_block_marker newline { indented_line } ;
literal_block_marker = "::" | paragraph "::" ;
indented_line = indent line_content newline ;
indent = " " | "\t" ;
line_content = { character - newline } ;
(* Line Blocks *)
line_block = { line_block_line } ;
line_block_line = "|" [ " " ] inline_text newline ;
(* Block Quotes *)
block_quote = { indented_paragraph } [ attribution ] ;
indented_paragraph = indent inline_text { newline indent inline_text } newline ;
attribution = indent "-- " inline_text newline ;
(* Doctest Blocks *)
doctest_block = { doctest_line } ;
doctest_line = ">>>" " " line_content newline
| "..." " " line_content newline ;
(* Tables *)
table = simple_table | grid_table ;
simple_table = simple_table_row { simple_table_row } simple_table_separator { simple_table_row } ;
simple_table_row = { table_cell } newline ;
simple_table_separator = "=" { ( "=" | " " ) } newline ;
grid_table = grid_table_border { grid_table_row grid_table_border } ;
grid_table_border = "+" { ( "-" | "+" ) } newline ;
grid_table_row = "|" { table_cell "|" } newline ;
table_cell = { character - ( "|" | newline ) } ;
(* Lists *)
bullet_list = { bullet_list_item } ;
bullet_list_item = bullet_marker " " list_item_content ;
bullet_marker = "*" | "+" | "-" | "•" | "‣" | "" ;
enumerated_list = { enumerated_list_item } ;
enumerated_list_item = enumeration_marker " " list_item_content ;
enumeration_marker = ( digit { digit } "." )
| ( digit { digit } ")" )
| ( "(" digit { digit } ")" )
| ( letter "." )
| ( letter ")" )
| ( "(" letter ")" )
| ( roman "." )
| ( roman ")" )
| ( "(" roman ")" )
| "#." | "#)" | "(#)" ;
list_item_content = inline_text { newline [ indent ] inline_text } newline ;
definition_list = { definition_list_item } ;
definition_list_item = term newline indent definition newline ;
term = inline_text ;
definition = inline_text { newline indent inline_text } ;
(* Field Lists *)
field_list = { field_list_item } ;
field_list_item = ":" field_name ":" " " field_body newline ;
field_name = { letter | digit | " " | "-" | "_" } ;
field_body = inline_text { newline indent inline_text } ;
(* Option Lists *)
option_list = { option_list_item } ;
option_list_item = option_group " " option_description newline ;
option_group = option { ", " option } ;
option = short_option | long_option ;
short_option = "-" letter [ " " option_argument ] ;
long_option = "--" { letter | digit | "-" } [ "=" option_argument ] ;
option_argument = { letter | digit | "-" | "_" } ;
option_description = inline_text { newline indent inline_text } ;
(* Directives *)
directive = ".." " " directive_name "::" [ " " directive_arguments ] newline
[ directive_options ]
[ blank_line ]
[ directive_content ] ;
directive_name = { letter | digit | "-" | "_" } ;
directive_arguments = { character - newline } ;
directive_options = { directive_option } ;
directive_option = indent ":" option_name ":" [ " " option_value ] newline ;
option_name = { letter | digit | "-" | "_" } ;
option_value = { character - newline } ;
directive_content = { indented_line } ;
(* Comments *)
comment = ".." [ " " comment_text ] newline { indented_line } ;
comment_text = { character - newline } ;
(* Substitution Definitions *)
substitution_definition = ".." " " "|" substitution_name "|" " " directive_name "::" [ " " directive_arguments ] newline
[ directive_options ]
[ directive_content ] ;
substitution_name = { character - ( "|" | newline ) } ;
(* Targets *)
target = internal_target | external_target ;
internal_target = ".." " " "_" target_name ":" newline ;
external_target = ".." " " "_" target_name ":" " " target_url newline ;
target_name = { character - ( ":" | newline ) } ;
target_url = { character - newline } ;
(* Inline Elements *)
inline_text = { inline_element } ;
inline_element = emphasis
| strong
| literal
| interpreted_text
| phrase_reference
| substitution_reference
| inline_internal_target
| hyperlink_reference
| footnote_reference
| citation_reference
| inline_literal
| plain_text ;
emphasis = "*" emphasis_text "*" ;
strong = "**" strong_text "**" ;
literal = "``" literal_text "``" ;
interpreted_text = "`" interpreted_text_content "`" [ role_suffix ]
| role_prefix "`" interpreted_text_content "`" ;
role_prefix = ":" role_name ":" ;
role_suffix = ":" role_name ":" ;
role_name = { letter | digit | "-" | "_" | "." } ;
interpreted_text_content = { character - "`" } ;
phrase_reference = "`" phrase_reference_text "`_" [ "_" ] ;
phrase_reference_text = { character - ( "`" | "<" ) } [ " " "<" target_url ">" ] ;
substitution_reference = "|" substitution_name "|" [ "_" [ "_" ] ] ;
inline_internal_target = "_`" target_text "`" ;
target_text = { character - "`" } ;
hyperlink_reference = reference_name "_" [ "_" ] ;
reference_name = { letter | digit | "-" | "_" | "." } ;
footnote_reference = "[" footnote_label "]_" ;
footnote_label = digit { digit } | "#" [ footnote_name ] | "*" ;
footnote_name = { letter | digit | "-" | "_" } ;
citation_reference = "[" citation_label "]_" ;
citation_label = { letter | digit | "-" | "_" | "." } ;
inline_literal = "`" "`" literal_content "`" "`" ;
literal_content = { character - "`" } ;
(* Text Content *)
plain_text = { character - markup_char } ;
emphasis_text = { character - ( "*" | newline ) } ;
strong_text = { character - ( "*" | newline ) } ;
literal_text = { character - ( "`" | newline ) } ;
markup_char = "*" | "`" | "_" | "|" | "[" | "]" | ":" | "." | ">" | "<" ;
(* Utilities *)
blank_line = newline ;
roman = "i" | "ii" | "iii" | "iv" | "v" | "vi" | "vii" | "viii" | "ix" | "x"
| "I" | "II" | "III" | "IV" | "V" | "VI" | "VII" | "VIII" | "IX" | "X" ;
newline = "\n" | "\r\n" ;
character = letter | digit | symbol | " " ;
letter = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m"
| "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
| "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M"
| "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ;
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
symbol = "!" | "@" | "#" | "$" | "%" | "^" | "&" | "*" | "(" | ")" | "-" | "_" | "="
| "+" | "[" | "]" | "{" | "}" | "\\" | "|" | ";" | ":" | "'" | '"' | "," | "."
| "<" | ">" | "/" | "?" | "~" | "`" ;