began scaffolding down to markdown parser

2025-07-24 19:39:44 -04:00 · 2025-07-24 19:39:44 -04:00 · 0e4613f27c
commit 0e4613f27c
parent 016d50aff1
3 changed files with 68 additions and 30 deletions
--- a/app/IR.hs
+++ b/app/IR.hs
@ -2,13 +2,36 @@ module IR where

 import Data.Text

-- Html and Math tags come with their data because they are leaves for us
-- We aren't parsing that if we can avoid it
-data Tag = Heading {level :: Int} | Paragraph | Blockquote | Code | Html {html :: Text} | Anchor | Italic | Bold | Math {mathML :: Text}
+newtype Document = Doc [Element]

-data Data = Ast {ast :: AST} | Text {text :: Text}
+data Element = Heading Heading | Code Code | BlockQuote BlockQuote | List List | Table Table | HTML HTML | Paragraph Paragraph | BlankLine BlankLine

-data AST = AST {tag :: Tag, child :: [Data]}
+data Heading = H {level :: Int, text :: Text}
+
+data Code = C {language :: Text, code :: Text}
+
+newtype BlockQuote = Q Text
+
+data ListType = Ordered | Unordered
+
+data ListItem = LI {content :: Text, children :: [List]}
+
+data List = L {list_type :: ListType, items :: [ListItem]}
+
+data Table = T {header :: TableHeader, rows :: [TableRow]}
+
+-- TODO: layout/sizing info?
+newtype TableHeader = TH [Text]
+
+newtype TableRow = TR Text
+
+newtype HTML = Raw Text
+
+newtype Paragraph = P [InlineText]
+
+data InlineText = Normal Text | Bold InlineText | Italic InlineText | CodeLine Text | Link {nest :: InlineText, href :: Text}
+
+data BlankLine = BL

 -- for processing math
 -- https://hackage.haskell.org/package/typst-0.6.1/docs/Typst-Parse.html#v:parseTypst
--- a/app/Markdown.hs
+++ b/app/Markdown.hs
@ -1,4 +1,41 @@
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RankNTypes #-}

-module Markdown () where
+module Markdown (markdownParser) where

+import Data.Text
+import IR
+import Text.Parsec
+import Text.Parsec.Combinator
+
+type Parser a = forall s u m t. (Stream s m t) => ParsecT s u m a
+
+markdownParser :: Parser Document
+markdownParser = Doc <$> many block
+
+block :: Parser Element
+block = choice [heading, codeBlock, quoteBlock, list, table, htmlBlock, paragraph, blankLine]
+
+heading :: Parser Element
+heading = pure $ Heading $ H {level = 1, text = ""}
+
+codeBlock :: Parser Element
+codeBlock = pure $ Code $ C {language = "", code = ""}
+
+quoteBlock :: Parser Element
+quoteBlock = pure $ BlockQuote $ Q ""
+
+list :: Parser Element
+list = pure $ List $ L {list_type = Ordered, items = []}
+
+table :: Parser Element
+table = pure $ Table $ T {header = TH [], rows = []}
+
+htmlBlock :: Parser Element
+htmlBlock = pure $ HTML $ Raw ""
+
+paragraph :: Parser Element
+paragraph = pure $ Paragraph $ P ""
+
+blankline :: Parser Element
+blankline = pure $ BlankLine BL
--- a/markdown.ebnf
+++ b/markdown.ebnf
@ -3,7 +3,6 @@
 document = { block } ;

 block = heading
-      | horizontal_rule
      | code_block
      | quote_block
      | list
@ -13,27 +12,17 @@ block = heading
      | blank_line ;

 (* Headings *)
-heading = atx_heading | setext_heading ;
+heading = atx_heading ;

 atx_heading = "#" { "#" } [ " " ] inline_text newline ;

-setext_heading = inline_text newline
-                 ( ( "=" { "=" } ) | ( "-" { "-" } ) ) newline ;
-
-(* Horizontal Rule *)
-horizontal_rule = ( ( "*" [ " " ] "*" [ " " ] "*" { [ " " ] "*" } )
-                  | ( "-" [ " " ] "-" [ " " ] "-" { [ " " ] "-" } )
-                  | ( "_" [ " " ] "_" [ " " ] "_" { [ " " ] "_" } ) ) newline ;
-
 (* Code Blocks *)
-code_block = fenced_code_block | indented_code_block ;
+code_block = fenced_code_block;

 fenced_code_block = "```" [ language_identifier ] newline
                    { code_line }
                    "```" newline ;

-indented_code_block = { "    " code_line } ;
-
 code_line = { character - newline } newline ;

 language_identifier = { letter | digit | "-" | "+" } ;
@ -122,20 +111,9 @@ inline_element = emphasis
               | link
               | image
               | autolink
-               | html_inline
               | line_break
               | plain_text ;

-html_inline = html_inline_element | html_comment | html_processing_instruction ;
-
-html_inline_element = html_open_tag { html_inline_content } html_close_tag
-                    | html_self_closing_tag
-                    | html_void_tag ;
-
-html_inline_content = html_inline_text | html_inline_element | html_comment | html_processing_instruction ;
-
-html_inline_text = { character - "<" } ;
-
 emphasis = ( "*" non_asterisk_text "*" )
         | ( "_" non_underscore_text "_" ) ;