From 0e4613f27cbaa096ababc8e123fdd3a618672519 Mon Sep 17 00:00:00 2001 From: Pagwin Date: Thu, 24 Jul 2025 19:39:44 -0400 Subject: [PATCH] began scaffolding down to markdown parser --- app/IR.hs | 33 ++++++++++++++++++++++++++++----- app/Markdown.hs | 39 ++++++++++++++++++++++++++++++++++++++- markdown.ebnf | 26 ++------------------------ 3 files changed, 68 insertions(+), 30 deletions(-) diff --git a/app/IR.hs b/app/IR.hs index 9a78e84..321baba 100644 --- a/app/IR.hs +++ b/app/IR.hs @@ -2,13 +2,36 @@ module IR where import Data.Text --- Html and Math tags come with their data because they are leaves for us --- We aren't parsing that if we can avoid it -data Tag = Heading {level :: Int} | Paragraph | Blockquote | Code | Html {html :: Text} | Anchor | Italic | Bold | Math {mathML :: Text} +newtype Document = Doc [Element] -data Data = Ast {ast :: AST} | Text {text :: Text} +data Element = Heading Heading | Code Code | BlockQuote BlockQuote | List List | Table Table | HTML HTML | Paragraph Paragraph | BlankLine BlankLine -data AST = AST {tag :: Tag, child :: [Data]} +data Heading = H {level :: Int, text :: Text} + +data Code = C {language :: Text, code :: Text} + +newtype BlockQuote = Q Text + +data ListType = Ordered | Unordered + +data ListItem = LI {content :: Text, children :: [List]} + +data List = L {list_type :: ListType, items :: [ListItem]} + +data Table = T {header :: TableHeader, rows :: [TableRow]} + +-- TODO: layout/sizing info? +newtype TableHeader = TH [Text] + +newtype TableRow = TR Text + +newtype HTML = Raw Text + +newtype Paragraph = P [InlineText] + +data InlineText = Normal Text | Bold InlineText | Italic InlineText | CodeLine Text | Link {nest :: InlineText, href :: Text} + +data BlankLine = BL -- for processing math -- https://hackage.haskell.org/package/typst-0.6.1/docs/Typst-Parse.html#v:parseTypst diff --git a/app/Markdown.hs b/app/Markdown.hs index 5e73599..6faf95c 100644 --- a/app/Markdown.hs +++ b/app/Markdown.hs @@ -1,4 +1,41 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RankNTypes #-} -module Markdown () where +module Markdown (markdownParser) where +import Data.Text +import IR +import Text.Parsec +import Text.Parsec.Combinator + +type Parser a = forall s u m t. (Stream s m t) => ParsecT s u m a + +markdownParser :: Parser Document +markdownParser = Doc <$> many block + +block :: Parser Element +block = choice [heading, codeBlock, quoteBlock, list, table, htmlBlock, paragraph, blankLine] + +heading :: Parser Element +heading = pure $ Heading $ H {level = 1, text = ""} + +codeBlock :: Parser Element +codeBlock = pure $ Code $ C {language = "", code = ""} + +quoteBlock :: Parser Element +quoteBlock = pure $ BlockQuote $ Q "" + +list :: Parser Element +list = pure $ List $ L {list_type = Ordered, items = []} + +table :: Parser Element +table = pure $ Table $ T {header = TH [], rows = []} + +htmlBlock :: Parser Element +htmlBlock = pure $ HTML $ Raw "" + +paragraph :: Parser Element +paragraph = pure $ Paragraph $ P "" + +blankline :: Parser Element +blankline = pure $ BlankLine BL diff --git a/markdown.ebnf b/markdown.ebnf index 9c39e98..5304cc5 100644 --- a/markdown.ebnf +++ b/markdown.ebnf @@ -3,7 +3,6 @@ document = { block } ; block = heading - | horizontal_rule | code_block | quote_block | list @@ -13,27 +12,17 @@ block = heading | blank_line ; (* Headings *) -heading = atx_heading | setext_heading ; +heading = atx_heading ; atx_heading = "#" { "#" } [ " " ] inline_text newline ; -setext_heading = inline_text newline - ( ( "=" { "=" } ) | ( "-" { "-" } ) ) newline ; - -(* Horizontal Rule *) -horizontal_rule = ( ( "*" [ " " ] "*" [ " " ] "*" { [ " " ] "*" } ) - | ( "-" [ " " ] "-" [ " " ] "-" { [ " " ] "-" } ) - | ( "_" [ " " ] "_" [ " " ] "_" { [ " " ] "_" } ) ) newline ; - (* Code Blocks *) -code_block = fenced_code_block | indented_code_block ; +code_block = fenced_code_block; fenced_code_block = "```" [ language_identifier ] newline { code_line } "```" newline ; -indented_code_block = { " " code_line } ; - code_line = { character - newline } newline ; language_identifier = { letter | digit | "-" | "+" } ; @@ -122,20 +111,9 @@ inline_element = emphasis | link | image | autolink - | html_inline | line_break | plain_text ; -html_inline = html_inline_element | html_comment | html_processing_instruction ; - -html_inline_element = html_open_tag { html_inline_content } html_close_tag - | html_self_closing_tag - | html_void_tag ; - -html_inline_content = html_inline_text | html_inline_element | html_comment | html_processing_instruction ; - -html_inline_text = { character - "<" } ; - emphasis = ( "*" non_asterisk_text "*" ) | ( "_" non_underscore_text "_" ) ;