From 7c13bdc4afe2376a31b808d1d5377eb4b7b57c9b Mon Sep 17 00:00:00 2001 From: Pagwin Date: Mon, 11 Aug 2025 20:58:30 -0400 Subject: [PATCH] realized there was backtracking and didn't want to backtrack so asked LLM for left factored ABNF --- app/Markdown.hs | 15 ++- markdown.abnf | 140 +++++++++++++++++++++++ markdown.ebnf | 179 ----------------------------- restructured.ebnf | 285 ---------------------------------------------- 4 files changed, 152 insertions(+), 467 deletions(-) create mode 100644 markdown.abnf delete mode 100644 markdown.ebnf delete mode 100644 restructured.ebnf diff --git a/app/Markdown.hs b/app/Markdown.hs index 6faf95c..5fd56bb 100644 --- a/app/Markdown.hs +++ b/app/Markdown.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} @@ -8,7 +9,7 @@ import IR import Text.Parsec import Text.Parsec.Combinator -type Parser a = forall s u m t. (Stream s m t) => ParsecT s u m a +type Parser a = forall s u m. (Stream s m Char) => ParsecT s u m a markdownParser :: Parser Document markdownParser = Doc <$> many block @@ -35,7 +36,15 @@ htmlBlock :: Parser Element htmlBlock = pure $ HTML $ Raw "" paragraph :: Parser Element -paragraph = pure $ Paragraph $ P "" +paragraph = do + first_text <- inlineText + rem_text <- many (endOfLine >> inlineText) + pure $ Paragraph $ P [] + +inlineText :: Parser InlineText +inlineText = choi blankline :: Parser Element -blankline = pure $ BlankLine BL +blankline = do + endOfLine + pure $ BlankLine BL diff --git a/markdown.abnf b/markdown.abnf new file mode 100644 index 0000000..ed85af0 --- /dev/null +++ b/markdown.abnf @@ -0,0 +1,140 @@ +; Left-Factored ABNF Grammar for Markdown with Embedded HTML +; Based on RFC 5234 (ABNF) syntax + +; Top-level document structure +document = *( block-element / CRLF ) + +; Block-level elements (left-factored by common prefixes) +block-element = heading / code-block / blockquote / list / horizontal-rule / + html-block / paragraph + +; Headings (ATX-style: # ## ### etc.) +heading = heading-prefix heading-content +heading-prefix = 1*6"#" *WSP +heading-content = *( inline-element ) line-ending + +; Code blocks (fenced with ``` or indented) +code-block = fenced-code-block / indented-code-block +fenced-code-block = code-fence [ language-info ] line-ending + *( code-line ) + code-fence line-ending +code-fence = "```" +language-info = 1*( ALPHA / DIGIT / "-" / "+" / "." ) +indented-code-block = 1*( indented-code-line ) +indented-code-line = 4*WSP 1*VCHAR line-ending +code-line = *VCHAR line-ending + +; Blockquotes +blockquote = 1*( blockquote-line ) +blockquote-line = ">" [ WSP ] *( inline-element ) line-ending + +; Lists (left-factored by list marker) +list = unordered-list / ordered-list +unordered-list = 1*( unordered-list-item ) +ordered-list = 1*( ordered-list-item ) +unordered-list-item = unordered-marker list-item-content +ordered-list-item = ordered-marker list-item-content +unordered-marker = ( "*" / "-" / "+" ) WSP +ordered-marker = 1*DIGIT "." WSP +list-item-content = *( inline-element ) line-ending + *( list-continuation ) +list-continuation = 2*WSP *( inline-element ) line-ending + +; Horizontal rule +horizontal-rule = hr-dashes / hr-asterisks / hr-underscores +hr-dashes = 3*( "-" ) *( WSP / "-" ) line-ending +hr-asterisks = 3*( "*" ) *( WSP / "*" ) line-ending +hr-underscores = 3*( "_" ) *( WSP / "_" ) line-ending + +; HTML blocks +html-block = html-block-tag / html-comment-block / html-declaration +html-block-tag = "<" tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" + *( html-content ) + [ "" ] + line-ending +html-comment-block = "" line-ending +html-declaration = "" line-ending + +; Paragraphs (catch-all for regular text) +paragraph = paragraph-line 1*( paragraph-continuation ) +paragraph-line = *( inline-element ) line-ending +paragraph-continuation = !block-element-start *( inline-element ) line-ending +block-element-start = "#" / ">" / ( 1*DIGIT "." WSP ) / + ( ( "*" / "-" / "+" ) WSP ) / + "```" / 4*WSP / "<" + +; Inline elements (left-factored by opening characters) +inline-element = emphasis / strong / code-span / link / image / + html-inline / line-break / escaped-char / plain-text + +; Emphasis and strong (left-factored by delimiter) +emphasis = emphasis-asterisk / emphasis-underscore +strong = strong-asterisk / strong-underscore +emphasis-asterisk = "*" emphasis-content "*" +emphasis-underscore = "_" emphasis-content "_" +strong-asterisk = "**" strong-content "**" +strong-underscore = "__" strong-content "__" +emphasis-content = 1*( !( "*" / line-ending ) inline-element ) +strong-content = 1*( !( "**" / line-ending ) inline-element ) + +; Code spans +code-span = code-delimiter code-span-content code-delimiter +code-delimiter = 1*"`" +code-span-content = 1*( !"`" CHAR ) + +; Links and images (left-factored by opening bracket) +link = link-reference / link-inline +image = image-reference / image-inline +link-reference = "[" link-text "]" "[" reference-id "]" +link-inline = "[" link-text "]" "(" url [ WSP title ] ")" +image-reference = "!" "[" alt-text "]" "[" reference-id "]" +image-inline = "!" "[" alt-text "]" "(" url [ WSP title ] ")" +link-text = 1*( !( "]" / line-ending ) inline-element ) +alt-text = 1*( !( "]" / line-ending ) CHAR ) +reference-id = 1*( !( "]" / line-ending ) CHAR ) +url = 1*( !( WSP / ")" / line-ending ) CHAR ) +title = ( DQUOTE title-content DQUOTE ) / + ( "'" title-content "'" ) / + ( "(" title-content ")" ) +title-content = *( !( DQUOTE / "'" / ")" ) CHAR ) + +; Inline HTML +html-inline = html-tag-inline / html-entity +html-tag-inline = "<" tag-name *( WSP attribute ) [ WSP ] [ "/" ] ">" +html-entity = "&" entity-name ";" +entity-name = 1*( ALPHA / DIGIT ) + +; HTML attributes and content +tag-name = ALPHA *( ALPHA / DIGIT / "-" / ":" ) +attribute = attribute-name [ "=" attribute-value ] +attribute-name = ( ALPHA / "_" / ":" ) *( ALPHA / DIGIT / "-" / "_" / ":" / "." ) +attribute-value = ( DQUOTE *( !DQUOTE CHAR ) DQUOTE ) / + ( "'" *( !"'" CHAR ) "'" ) / + ( 1*( !( WSP / ">" ) VCHAR ) ) +html-content = *( !( "" / "#" / "+" / "-" / "." / "!" / + "&" / "\" / line-ending + +; Basic definitions +line-ending = CRLF / LF / CR +WSP = SP / HTAB +CHAR = %x00-10FFFF ; Any Unicode character +VCHAR = %x21-7E ; Visible ASCII characters +ALPHA = %x41-5A / %x61-7A ; A-Z / a-z +DIGIT = %x30-39 ; 0-9 +SP = %x20 ; Space +HTAB = %x09 ; Horizontal tab +CR = %x0D ; Carriage return +LF = %x0A ; Line feed +CRLF = CR LF ; Internet standard newline +DQUOTE = %x22 ; Double quote diff --git a/markdown.ebnf b/markdown.ebnf deleted file mode 100644 index 5304cc5..0000000 --- a/markdown.ebnf +++ /dev/null @@ -1,179 +0,0 @@ -(* Markdown EBNF Grammar with HTML Support *) - -document = { block } ; - -block = heading - | code_block - | quote_block - | list - | table - | html_block - | paragraph - | blank_line ; - -(* Headings *) -heading = atx_heading ; - -atx_heading = "#" { "#" } [ " " ] inline_text newline ; - -(* Code Blocks *) -code_block = fenced_code_block; - -fenced_code_block = "```" [ language_identifier ] newline - { code_line } - "```" newline ; - -code_line = { character - newline } newline ; - -language_identifier = { letter | digit | "-" | "+" } ; - -(* Quote Blocks *) -quote_block = { ">" [ " " ] ( inline_text | "" ) newline } ; - -(* Lists *) -list = unordered_list | ordered_list ; - -unordered_list = { unordered_list_item } ; - -ordered_list = { ordered_list_item } ; - -unordered_list_item = [ " " { " " } ] ( "*" | "+" | "-" ) " " inline_text newline - { continuation_line } ; - -ordered_list_item = [ " " { " " } ] digit { digit } "." " " inline_text newline - { continuation_line } ; - -continuation_line = " " inline_text newline ; - -(* Tables *) -table = table_header table_separator { table_row } ; - -table_header = "|" { table_cell "|" } newline ; - -table_separator = "|" { table_align_spec "|" } newline ; - -table_row = "|" { table_cell "|" } newline ; - -table_cell = { character - ( "|" | newline ) } ; - -table_align_spec = [ ":" ] "-" { "-" } [ ":" ] ; - -(* HTML Support *) -html_block = html_block_element; - -html_block_element = html_open_tag { html_content } html_close_tag newline - | html_self_closing_tag newline - | html_void_tag newline ; - -html_open_tag = "<" html_tag_name { " " html_attribute } [ " " ] ">" ; - -html_close_tag = "" ; - -html_self_closing_tag = "<" html_tag_name { " " html_attribute } [ " " ] "/>" ; - -html_void_tag = "<" html_void_tag_name { " " html_attribute } [ " " ] [ "/" ] ">" ; - -html_void_tag_name = "area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input" - | "link" | "meta" | "param" | "source" | "track" | "wbr" ; - -html_tag_name = letter { letter | digit | "-" | "_" | ":" | "." } ; - -html_attribute = html_attribute_name [ "=" html_attribute_value ] ; - -html_attribute_name = ( letter | "_" | ":" ) { letter | digit | "-" | "_" | ":" | "." } ; - -html_attribute_value = html_quoted_value | html_unquoted_value ; - -html_quoted_value = ( '"' { character - '"' } '"' ) - | ( "'" { character - "'" } "'" ) ; - -html_unquoted_value = { character - ( " " | "\t" | "\n" | "\r" | ">" | "/" | "=" ) } ; - -html_content = html_text | html_element | html_comment | html_processing_instruction ; - -html_text = { character - "<" } ; - -html_element = html_open_tag { html_content } html_close_tag - | html_self_closing_tag - | html_void_tag ; - -html_comment = "" ; - -(* Paragraphs *) -paragraph = inline_text { newline inline_text } newline ; - -(* Inline Elements *) -inline_text = { inline_element } ; - -inline_element = emphasis - | strong - | code_span - | link - | image - | autolink - | line_break - | plain_text ; - -emphasis = ( "*" non_asterisk_text "*" ) - | ( "_" non_underscore_text "_" ) ; - -strong = ( "**" non_asterisk_text "**" ) - | ( "__" non_underscore_text "__" ) ; - -code_span = "`" { "`" } non_backtick_text { "`" } "`" ; - -link = "[" link_text "]" "(" link_url [ " " link_title ] ")" ; - -image = "!" "[" alt_text "]" "(" image_url [ " " image_title ] ")" ; - -autolink = "<" ( url | email ) ">" ; - -line_break = " " newline | "\\" newline ; - -(* Text Content *) -plain_text = { character - special_char } ; - -non_asterisk_text = { character - "*" } ; - -non_underscore_text = { character - "_" } ; - -non_backtick_text = { character - "`" } ; - -link_text = { character - ( "[" | "]" ) } ; - -alt_text = { character - ( "[" | "]" ) } ; - -link_url = { character - ( "(" | ")" | " " ) } ; - -image_url = { character - ( "(" | ")" | " " ) } ; - -link_title = quote_string ; - -image_title = quote_string ; - -quote_string = ( '"' { character - '"' } '"' ) - | ( "'" { character - "'" } "'" ) ; - -url = "http" [ "s" ] "://" { character - ">" } ; - -email = { character - ( "@" | ">" ) } "@" { character - ">" } ; - -(* Utilities *) -blank_line = newline ; - -special_char = "*" | "_" | "`" | "[" | "]" | "(" | ")" | "#" | ">" | "|" | "!" | "\\" | "<" ; - -newline = "\n" | "\r\n" ; - -character = letter | digit | symbol | " " ; - -letter = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" - | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" - | "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" - | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ; - -digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; - -symbol = "!" | "@" | "#" | "$" | "%" | "^" | "&" | "*" | "(" | ")" | "-" | "_" | "=" - | "+" | "[" | "]" | "{" | "}" | "\\" | "|" | ";" | ":" | "'" | '"' | "," | "." - | "<" | ">" | "/" | "?" | "~" | "`" ; diff --git a/restructured.ebnf b/restructured.ebnf deleted file mode 100644 index 1d22040..0000000 --- a/restructured.ebnf +++ /dev/null @@ -1,285 +0,0 @@ -(* reStructuredText EBNF Grammar *) - -document = { block } ; - -block = section - | transition - | paragraph - | literal_block - | line_block - | block_quote - | doctest_block - | table - | bullet_list - | enumerated_list - | definition_list - | field_list - | option_list - | directive - | comment - | substitution_definition - | target - | blank_line ; - -(* Sections *) -section = section_title section_underline [ section_overline ] ; - -section_title = inline_text newline ; - -section_underline = section_adornment newline ; - -section_overline = section_adornment newline ; - -section_adornment = adornment_char { adornment_char } ; - -adornment_char = "!" | '"' | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," | "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "[" | "\\" | "]" | "^" | "_" | "`" | "{" | "|" | "}" | "~" ; - -(* Transitions *) -transition = transition_marker newline ; - -transition_marker = transition_char { transition_char } ; - -transition_char = adornment_char ; - -(* Paragraphs *) -paragraph = inline_text { newline inline_text } newline ; - -(* Literal Blocks *) -literal_block = literal_block_marker newline { indented_line } ; - -literal_block_marker = "::" | paragraph "::" ; - -indented_line = indent line_content newline ; - -indent = " " | "\t" ; - -line_content = { character - newline } ; - -(* Line Blocks *) -line_block = { line_block_line } ; - -line_block_line = "|" [ " " ] inline_text newline ; - -(* Block Quotes *) -block_quote = { indented_paragraph } [ attribution ] ; - -indented_paragraph = indent inline_text { newline indent inline_text } newline ; - -attribution = indent "-- " inline_text newline ; - -(* Doctest Blocks *) -doctest_block = { doctest_line } ; - -doctest_line = ">>>" " " line_content newline - | "..." " " line_content newline ; - -(* Tables *) -table = simple_table | grid_table ; - -simple_table = simple_table_row { simple_table_row } simple_table_separator { simple_table_row } ; - -simple_table_row = { table_cell } newline ; - -simple_table_separator = "=" { ( "=" | " " ) } newline ; - -grid_table = grid_table_border { grid_table_row grid_table_border } ; - -grid_table_border = "+" { ( "-" | "+" ) } newline ; - -grid_table_row = "|" { table_cell "|" } newline ; - -table_cell = { character - ( "|" | newline ) } ; - -(* Lists *) -bullet_list = { bullet_list_item } ; - -bullet_list_item = bullet_marker " " list_item_content ; - -bullet_marker = "*" | "+" | "-" | "•" | "‣" | "⁃" ; - -enumerated_list = { enumerated_list_item } ; - -enumerated_list_item = enumeration_marker " " list_item_content ; - -enumeration_marker = ( digit { digit } "." ) - | ( digit { digit } ")" ) - | ( "(" digit { digit } ")" ) - | ( letter "." ) - | ( letter ")" ) - | ( "(" letter ")" ) - | ( roman "." ) - | ( roman ")" ) - | ( "(" roman ")" ) - | "#." | "#)" | "(#)" ; - -list_item_content = inline_text { newline [ indent ] inline_text } newline ; - -definition_list = { definition_list_item } ; - -definition_list_item = term newline indent definition newline ; - -term = inline_text ; - -definition = inline_text { newline indent inline_text } ; - -(* Field Lists *) -field_list = { field_list_item } ; - -field_list_item = ":" field_name ":" " " field_body newline ; - -field_name = { letter | digit | " " | "-" | "_" } ; - -field_body = inline_text { newline indent inline_text } ; - -(* Option Lists *) -option_list = { option_list_item } ; - -option_list_item = option_group " " option_description newline ; - -option_group = option { ", " option } ; - -option = short_option | long_option ; - -short_option = "-" letter [ " " option_argument ] ; - -long_option = "--" { letter | digit | "-" } [ "=" option_argument ] ; - -option_argument = { letter | digit | "-" | "_" } ; - -option_description = inline_text { newline indent inline_text } ; - -(* Directives *) -directive = ".." " " directive_name "::" [ " " directive_arguments ] newline - [ directive_options ] - [ blank_line ] - [ directive_content ] ; - -directive_name = { letter | digit | "-" | "_" } ; - -directive_arguments = { character - newline } ; - -directive_options = { directive_option } ; - -directive_option = indent ":" option_name ":" [ " " option_value ] newline ; - -option_name = { letter | digit | "-" | "_" } ; - -option_value = { character - newline } ; - -directive_content = { indented_line } ; - -(* Comments *) -comment = ".." [ " " comment_text ] newline { indented_line } ; - -comment_text = { character - newline } ; - -(* Substitution Definitions *) -substitution_definition = ".." " " "|" substitution_name "|" " " directive_name "::" [ " " directive_arguments ] newline - [ directive_options ] - [ directive_content ] ; - -substitution_name = { character - ( "|" | newline ) } ; - -(* Targets *) -target = internal_target | external_target ; - -internal_target = ".." " " "_" target_name ":" newline ; - -external_target = ".." " " "_" target_name ":" " " target_url newline ; - -target_name = { character - ( ":" | newline ) } ; - -target_url = { character - newline } ; - -(* Inline Elements *) -inline_text = { inline_element } ; - -inline_element = emphasis - | strong - | literal - | interpreted_text - | phrase_reference - | substitution_reference - | inline_internal_target - | hyperlink_reference - | footnote_reference - | citation_reference - | inline_literal - | plain_text ; - -emphasis = "*" emphasis_text "*" ; - -strong = "**" strong_text "**" ; - -literal = "``" literal_text "``" ; - -interpreted_text = "`" interpreted_text_content "`" [ role_suffix ] - | role_prefix "`" interpreted_text_content "`" ; - -role_prefix = ":" role_name ":" ; - -role_suffix = ":" role_name ":" ; - -role_name = { letter | digit | "-" | "_" | "." } ; - -interpreted_text_content = { character - "`" } ; - -phrase_reference = "`" phrase_reference_text "`_" [ "_" ] ; - -phrase_reference_text = { character - ( "`" | "<" ) } [ " " "<" target_url ">" ] ; - -substitution_reference = "|" substitution_name "|" [ "_" [ "_" ] ] ; - -inline_internal_target = "_`" target_text "`" ; - -target_text = { character - "`" } ; - -hyperlink_reference = reference_name "_" [ "_" ] ; - -reference_name = { letter | digit | "-" | "_" | "." } ; - -footnote_reference = "[" footnote_label "]_" ; - -footnote_label = digit { digit } | "#" [ footnote_name ] | "*" ; - -footnote_name = { letter | digit | "-" | "_" } ; - -citation_reference = "[" citation_label "]_" ; - -citation_label = { letter | digit | "-" | "_" | "." } ; - -inline_literal = "`" "`" literal_content "`" "`" ; - -literal_content = { character - "`" } ; - -(* Text Content *) -plain_text = { character - markup_char } ; - -emphasis_text = { character - ( "*" | newline ) } ; - -strong_text = { character - ( "*" | newline ) } ; - -literal_text = { character - ( "`" | newline ) } ; - -markup_char = "*" | "`" | "_" | "|" | "[" | "]" | ":" | "." | ">" | "<" ; - -(* Utilities *) -blank_line = newline ; - -roman = "i" | "ii" | "iii" | "iv" | "v" | "vi" | "vii" | "viii" | "ix" | "x" - | "I" | "II" | "III" | "IV" | "V" | "VI" | "VII" | "VIII" | "IX" | "X" ; - -newline = "\n" | "\r\n" ; - -character = letter | digit | symbol | " " ; - -letter = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" - | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" - | "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" - | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ; - -digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; - -symbol = "!" | "@" | "#" | "$" | "%" | "^" | "&" | "*" | "(" | ")" | "-" | "_" | "=" - | "+" | "[" | "]" | "{" | "}" | "\\" | "|" | ";" | ":" | "'" | '"' | "," | "." - | "<" | ">" | "/" | "?" | "~" | "`" ;