began scaffolding down to markdown parser

This commit is contained in:
Pagwin 2025-07-24 19:39:44 -04:00
parent 016d50aff1
commit 0e4613f27c
3 changed files with 68 additions and 30 deletions

View file

@ -2,13 +2,36 @@ module IR where
import Data.Text import Data.Text
-- Html and Math tags come with their data because they are leaves for us newtype Document = Doc [Element]
-- We aren't parsing that if we can avoid it
data Tag = Heading {level :: Int} | Paragraph | Blockquote | Code | Html {html :: Text} | Anchor | Italic | Bold | Math {mathML :: Text}
data Data = Ast {ast :: AST} | Text {text :: Text} data Element = Heading Heading | Code Code | BlockQuote BlockQuote | List List | Table Table | HTML HTML | Paragraph Paragraph | BlankLine BlankLine
data AST = AST {tag :: Tag, child :: [Data]} data Heading = H {level :: Int, text :: Text}
data Code = C {language :: Text, code :: Text}
newtype BlockQuote = Q Text
data ListType = Ordered | Unordered
data ListItem = LI {content :: Text, children :: [List]}
data List = L {list_type :: ListType, items :: [ListItem]}
data Table = T {header :: TableHeader, rows :: [TableRow]}
-- TODO: layout/sizing info?
newtype TableHeader = TH [Text]
newtype TableRow = TR Text
newtype HTML = Raw Text
newtype Paragraph = P [InlineText]
data InlineText = Normal Text | Bold InlineText | Italic InlineText | CodeLine Text | Link {nest :: InlineText, href :: Text}
data BlankLine = BL
-- for processing math -- for processing math
-- https://hackage.haskell.org/package/typst-0.6.1/docs/Typst-Parse.html#v:parseTypst -- https://hackage.haskell.org/package/typst-0.6.1/docs/Typst-Parse.html#v:parseTypst

View file

@ -1,4 +1,41 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
module Markdown () where module Markdown (markdownParser) where
import Data.Text
import IR
import Text.Parsec
import Text.Parsec.Combinator
type Parser a = forall s u m t. (Stream s m t) => ParsecT s u m a
markdownParser :: Parser Document
markdownParser = Doc <$> many block
block :: Parser Element
block = choice [heading, codeBlock, quoteBlock, list, table, htmlBlock, paragraph, blankLine]
heading :: Parser Element
heading = pure $ Heading $ H {level = 1, text = ""}
codeBlock :: Parser Element
codeBlock = pure $ Code $ C {language = "", code = ""}
quoteBlock :: Parser Element
quoteBlock = pure $ BlockQuote $ Q ""
list :: Parser Element
list = pure $ List $ L {list_type = Ordered, items = []}
table :: Parser Element
table = pure $ Table $ T {header = TH [], rows = []}
htmlBlock :: Parser Element
htmlBlock = pure $ HTML $ Raw ""
paragraph :: Parser Element
paragraph = pure $ Paragraph $ P ""
blankline :: Parser Element
blankline = pure $ BlankLine BL

View file

@ -3,7 +3,6 @@
document = { block } ; document = { block } ;
block = heading block = heading
| horizontal_rule
| code_block | code_block
| quote_block | quote_block
| list | list
@ -13,27 +12,17 @@ block = heading
| blank_line ; | blank_line ;
(* Headings *) (* Headings *)
heading = atx_heading | setext_heading ; heading = atx_heading ;
atx_heading = "#" { "#" } [ " " ] inline_text newline ; atx_heading = "#" { "#" } [ " " ] inline_text newline ;
setext_heading = inline_text newline
( ( "=" { "=" } ) | ( "-" { "-" } ) ) newline ;
(* Horizontal Rule *)
horizontal_rule = ( ( "*" [ " " ] "*" [ " " ] "*" { [ " " ] "*" } )
| ( "-" [ " " ] "-" [ " " ] "-" { [ " " ] "-" } )
| ( "_" [ " " ] "_" [ " " ] "_" { [ " " ] "_" } ) ) newline ;
(* Code Blocks *) (* Code Blocks *)
code_block = fenced_code_block | indented_code_block ; code_block = fenced_code_block;
fenced_code_block = "```" [ language_identifier ] newline fenced_code_block = "```" [ language_identifier ] newline
{ code_line } { code_line }
"```" newline ; "```" newline ;
indented_code_block = { " " code_line } ;
code_line = { character - newline } newline ; code_line = { character - newline } newline ;
language_identifier = { letter | digit | "-" | "+" } ; language_identifier = { letter | digit | "-" | "+" } ;
@ -122,20 +111,9 @@ inline_element = emphasis
| link | link
| image | image
| autolink | autolink
| html_inline
| line_break | line_break
| plain_text ; | plain_text ;
html_inline = html_inline_element | html_comment | html_processing_instruction ;
html_inline_element = html_open_tag { html_inline_content } html_close_tag
| html_self_closing_tag
| html_void_tag ;
html_inline_content = html_inline_text | html_inline_element | html_comment | html_processing_instruction ;
html_inline_text = { character - "<" } ;
emphasis = ( "*" non_asterisk_text "*" ) emphasis = ( "*" non_asterisk_text "*" )
| ( "_" non_underscore_text "_" ) ; | ( "_" non_underscore_text "_" ) ;