began scaffolding down to markdown parser

This commit is contained in:
Pagwin 2025-07-24 19:39:44 -04:00
parent 016d50aff1
commit 0e4613f27c
3 changed files with 68 additions and 30 deletions

View file

@ -2,13 +2,36 @@ module IR where
import Data.Text
-- Html and Math tags come with their data because they are leaves for us
-- We aren't parsing that if we can avoid it
data Tag = Heading {level :: Int} | Paragraph | Blockquote | Code | Html {html :: Text} | Anchor | Italic | Bold | Math {mathML :: Text}
newtype Document = Doc [Element]
data Data = Ast {ast :: AST} | Text {text :: Text}
data Element = Heading Heading | Code Code | BlockQuote BlockQuote | List List | Table Table | HTML HTML | Paragraph Paragraph | BlankLine BlankLine
data AST = AST {tag :: Tag, child :: [Data]}
data Heading = H {level :: Int, text :: Text}
data Code = C {language :: Text, code :: Text}
newtype BlockQuote = Q Text
data ListType = Ordered | Unordered
data ListItem = LI {content :: Text, children :: [List]}
data List = L {list_type :: ListType, items :: [ListItem]}
data Table = T {header :: TableHeader, rows :: [TableRow]}
-- TODO: layout/sizing info?
newtype TableHeader = TH [Text]
newtype TableRow = TR Text
newtype HTML = Raw Text
newtype Paragraph = P [InlineText]
data InlineText = Normal Text | Bold InlineText | Italic InlineText | CodeLine Text | Link {nest :: InlineText, href :: Text}
data BlankLine = BL
-- for processing math
-- https://hackage.haskell.org/package/typst-0.6.1/docs/Typst-Parse.html#v:parseTypst

View file

@ -1,4 +1,41 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
module Markdown () where
module Markdown (markdownParser) where
import Data.Text
import IR
import Text.Parsec
import Text.Parsec.Combinator
type Parser a = forall s u m t. (Stream s m t) => ParsecT s u m a
markdownParser :: Parser Document
markdownParser = Doc <$> many block
block :: Parser Element
block = choice [heading, codeBlock, quoteBlock, list, table, htmlBlock, paragraph, blankLine]
heading :: Parser Element
heading = pure $ Heading $ H {level = 1, text = ""}
codeBlock :: Parser Element
codeBlock = pure $ Code $ C {language = "", code = ""}
quoteBlock :: Parser Element
quoteBlock = pure $ BlockQuote $ Q ""
list :: Parser Element
list = pure $ List $ L {list_type = Ordered, items = []}
table :: Parser Element
table = pure $ Table $ T {header = TH [], rows = []}
htmlBlock :: Parser Element
htmlBlock = pure $ HTML $ Raw ""
paragraph :: Parser Element
paragraph = pure $ Paragraph $ P ""
blankline :: Parser Element
blankline = pure $ BlankLine BL

View file

@ -3,7 +3,6 @@
document = { block } ;
block = heading
| horizontal_rule
| code_block
| quote_block
| list
@ -13,27 +12,17 @@ block = heading
| blank_line ;
(* Headings *)
heading = atx_heading | setext_heading ;
heading = atx_heading ;
atx_heading = "#" { "#" } [ " " ] inline_text newline ;
setext_heading = inline_text newline
( ( "=" { "=" } ) | ( "-" { "-" } ) ) newline ;
(* Horizontal Rule *)
horizontal_rule = ( ( "*" [ " " ] "*" [ " " ] "*" { [ " " ] "*" } )
| ( "-" [ " " ] "-" [ " " ] "-" { [ " " ] "-" } )
| ( "_" [ " " ] "_" [ " " ] "_" { [ " " ] "_" } ) ) newline ;
(* Code Blocks *)
code_block = fenced_code_block | indented_code_block ;
code_block = fenced_code_block;
fenced_code_block = "```" [ language_identifier ] newline
{ code_line }
"```" newline ;
indented_code_block = { " " code_line } ;
code_line = { character - newline } newline ;
language_identifier = { letter | digit | "-" | "+" } ;
@ -122,20 +111,9 @@ inline_element = emphasis
| link
| image
| autolink
| html_inline
| line_break
| plain_text ;
html_inline = html_inline_element | html_comment | html_processing_instruction ;
html_inline_element = html_open_tag { html_inline_content } html_close_tag
| html_self_closing_tag
| html_void_tag ;
html_inline_content = html_inline_text | html_inline_element | html_comment | html_processing_instruction ;
html_inline_text = { character - "<" } ;
emphasis = ( "*" non_asterisk_text "*" )
| ( "_" non_underscore_text "_" ) ;