naive refactor of markdown generation for the new IR

This commit is contained in:
Pagwin 2026-04-06 15:17:39 -04:00
parent 24606a9204
commit 22f4f89137
No known key found for this signature in database
GPG key ID: 81137023740CA260
4 changed files with 91 additions and 46 deletions

View file

@ -36,7 +36,7 @@ test-suite test-markdown-parse
hs-source-dirs: tests
type: exitcode-stdio-1.0
main-is: Markdown/Parse.hs
build-depends: base >=4.20 && < 4.21, text >= 2.1.2 , megaparsec >= 9.7.0 , transformers >= 0.6.2 , hedgehog >= 1.7 , time, psb
build-depends: base, text, megaparsec, transformers, hedgehog, time, psb
default-extensions: ApplicativeDo DataKinds NamedFieldPuns DerivingVia LambdaCase TypeApplications DeriveGeneric OverloadedRecordDot NamedFieldPuns DuplicateRecordFields DisambiguateRecordFields FlexibleInstances
default-language: Haskell2010

View file

@ -34,37 +34,63 @@ genHeaderId header =
textSub ' ' = '-'
textSub c = c
genHeaderClasses :: Heading -> T.Text
genHeaderClasses = const ""
tshow :: (Show s) => s -> T.Text
tshow = T.pack . show
compileToHTML :: Document -> T.Text
compileToHTML (Doc elements) = T.concat $ map elementToHTML elements
elementsToHTML :: [Element] -> T.Text
elementsToHTML = T.concat . map elementToHTML
elementToHTML :: Element -> T.Text
elementToHTML (Heading header) = T.concat ["<h", tshow header.level, genHeaderId header, genHeaderClasses header, ">", serializeInlineToHTML header.text, "</h", tshow header.level, ">"]
elementToHTML (Heading header attrs) = T.concat ["<h", tshow header.level, headerAttrs header attrs, ">", serializeInlineToHTML header.text, "</h", tshow header.level, ">"]
--
elementToHTML (Code code_block) = T.concat ["<pre class=\"sourceCode ", language, "\"><code class=\"sourceCode ", language, "\">", escapeText code_block.code, "</code>", "</pre>"]
where
language = fromMaybe "" code_block.language
elementToHTML (BlockQuote (Q elems)) = T.concat ["<blockquote>", serializeInlineToHTML elems, "</blockquote>"]
elementToHTML (List (L {list_type = Ordered, items})) = T.concat ["<ol>", generateLiElems items, "</ol>"]
elementToHTML (List (L {list_type = Unordered, items})) = T.concat ["<ul>", generateLiElems items, "</ul>"]
elementToHTML (BlockQuote (Q elems)) = T.concat ["<blockquote>", elementsToHTML elems, "</blockquote>"]
elementToHTML (List (L {list_type = Ordered {start_number, style}, items}) attrs) = T.concat ["<ol", maybe "" handleStart start_number, maybe "" handleStyle style, ">", generateLiElems items, "</ol>"]
elementToHTML (List (L {list_type = Unordered {style}, items}) attrs) = T.concat ["<ul", maybe "" handleStyle style, ">", generateLiElems items, "</ul>"]
elementToHTML (HTML (HTMLTag {html_content})) = html_content
elementToHTML (Paragraph (P snippets)) = T.concat ["<p>", serializeInlineToHTML snippets, "</p>"]
elementToHTML (Paragraph (P snippets) attrs) = T.concat ["<p", handleAttrs attrs, ">", serializeInlineToHTML snippets, "</p>"]
elementToHTML HorizontalRule = "<hr>"
elementToHTML (Table _ _) = error "TODO"
elementToHTML (Container _ _) = error "TODO"
elementToHTML (Footnote _ _) = error "TODO"
elementToHTML (DescriptionList _ _) = error "TODO"
elementToHTML (RawBlock _ _) = error "TODO"
elementToHTML (TaskList _ _) = error "TODO"
handleStyle :: T.Text -> T.Text
handleStyle style = T.concat ["type=\"", style, "\""]
handleStart :: Int -> T.Text
handleStart start = T.concat ["start=\"", T.show start, "\""]
handleAttrs :: Attrs -> T.Text
handleAttrs Attrs {attrId, attrClasses, attrKV} =
T.concat
[ maybe "" (\id -> "id=\"" <> id <> "\"") attrId,
"class=\"" <> T.intercalate " " attrClasses <> "\"",
T.intercalate " " $ map (\(key, val) -> key <> "=\"" <> val <> "\"") attrKV
]
headerAttrs :: Heading -> Attrs -> T.Text
headerAttrs header Attrs {attrId, attrClasses, attrKV} =
T.concat
-- id is overcomplicated due to header having id logic written I don't wanna bother with
[ maybe (genHeaderId header) (\id -> "id=\"" <> id <> "\"") attrId,
"class=\"" <> T.intercalate " " attrClasses <> "\"",
T.intercalate " " $ map (\(key, val) -> key <> "=\"" <> val <> "\"") attrKV
]
generateLiElems :: [ListItem] -> T.Text
generateLiElems [] = ""
generateLiElems (element : remainder) =
T.concat
[ "<li>",
-- We assume child lists are stricly after our contents
-- if they aren't this is fucked
serializeInlineToHTML element.content,
fromMaybe "" $ fmap (elementToHTML . List) element.child,
elementsToHTML element.content,
"</li>",
generateLiElems remainder
]
@ -72,10 +98,19 @@ generateLiElems (element : remainder) =
serializeInlineToHTML :: [InlineText] -> T.Text
serializeInlineToHTML [] = ""
serializeInlineToHTML (Text t : remaining) = escapeText t <> serializeInlineToHTML remaining
serializeInlineToHTML (Bold elems : remaining) = T.concat ["<b>", serializeInlineToHTML elems, "</b>", serializeInlineToHTML remaining]
serializeInlineToHTML (Italic elems : remaining) = T.concat ["<i>", serializeInlineToHTML elems, "</i>", serializeInlineToHTML remaining]
serializeInlineToHTML (Crossed elems : remaining) = T.concat ["<s>", serializeInlineToHTML elems, "</s>", serializeInlineToHTML remaining]
serializeInlineToHTML (InlineCode code : remaining) = T.concat ["<code>", escapeText code, "</code>", serializeInlineToHTML remaining]
serializeInlineToHTML (Link {linkText, url, title} : remaining) = T.concat ["<a href=\"", url, "\"", maybe "" (\t -> T.concat ["title=\"", escapeText t, "\""]) title, ">", serializeInlineToHTML linkText, "</a>", serializeInlineToHTML remaining]
serializeInlineToHTML (Image {altText, url, title} : remaining) = T.concat ["<img src=\"", url, "\" alt=\"", escapeText altText, "\"", maybe "" (\t -> T.concat ["title=\"", escapeText t, "\""]) title, ">", serializeInlineToHTML remaining]
serializeInlineToHTML ((Bold elems attrs) : remaining) = T.concat ["<b", handleAttrs attrs, ">", serializeInlineToHTML elems, "</b>", serializeInlineToHTML remaining]
serializeInlineToHTML ((Italic elems attrs) : remaining) = T.concat ["<i", handleAttrs attrs, ">", serializeInlineToHTML elems, "</i>", serializeInlineToHTML remaining]
serializeInlineToHTML ((Crossed elems attrs) : remaining) = T.concat ["<s", handleAttrs attrs, ">", serializeInlineToHTML elems, "</s>", serializeInlineToHTML remaining]
serializeInlineToHTML ((InlineCode code attrs) : remaining) = T.concat ["<code", handleAttrs attrs, ">", escapeText code, "</code>", serializeInlineToHTML remaining]
serializeInlineToHTML (Link {linkText, url, title, misc_attrs} : remaining) = T.concat ["<a href=\"", url, "\"", handleAttrs misc_attrs, maybe "" (\t -> T.concat ["title=\"", escapeText t, "\""]) title, ">", serializeInlineToHTML linkText, "</a>", serializeInlineToHTML remaining]
serializeInlineToHTML (Image {altText, url, title, misc_attrs} : remaining) = T.concat ["<img src=\"", url, "\" alt=\"", escapeText altText, "\"", maybe "" (\t -> T.concat ["title=\"", escapeText t, "\""]) title, handleAttrs misc_attrs, ">", serializeInlineToHTML remaining]
serializeInlineToHTML (HTMLInline {inline_html_content} : remaining) = inline_html_content <> serializeInlineToHTML remaining
serializeInlineToHTML ((Superscript _ _) : remaining) = error "TODO"
serializeInlineToHTML ((Subscript _ _) : remaining) = error "TODO"
serializeInlineToHTML ((Highlighted _ _) : remaining) = error "TODO"
serializeInlineToHTML ((Insert _ _) : remaining) = error "TODO"
serializeInlineToHTML ((Math _ _) : remaining) = error "TODO"
serializeInlineToHTML ((FootnoteReference {}) : remaining) = error "TODO"
serializeInlineToHTML ((Symbol _) : remaining) = error "TODO"
serializeInlineToHTML ((RawInline _ _) : remaining) = error "TODO"
serializeInlineToHTML ((Span _ _) : remaining) = error "TODO"

View file

@ -14,12 +14,10 @@ data Element
HTML HTML
| Paragraph Paragraph Attrs
| HorizontalRule
| -- TODO
Table Table Attrs
| Table Table Attrs
| -- Djot :::
Container [Element] Attrs
| -- TODO
Footnote Footnote Attrs
| Footnote Footnote Attrs
| DescriptionList DescriptionList Attrs
| RawBlock RawBlock Attrs
| TaskList TaskList Attrs
@ -39,15 +37,15 @@ data Code = C
}
deriving (Show)
data BlockQuote = Q [Element] deriving (Show)
newtype BlockQuote = Q [Element] deriving (Show)
data ListItem = LI
{ content :: [Element], -- Flatten continuations into here
child :: Maybe List
newtype ListItem = LI
-- children are just more elements
{ content :: [Element] -- Flatten continuations into here
}
deriving (Show)
data ListType = Ordered {start_number :: Text, style :: Text} | Unordered {style :: Text} deriving (Show)
data ListType = Ordered {start_number :: Maybe Int, style :: Maybe Text} | Unordered {style :: Maybe Text} deriving (Show)
data List = L
{ list_type :: ListType,
@ -64,7 +62,7 @@ newtype HTML
newtype Paragraph = P [InlineText] deriving (Show)
data InlineText
= Text Text Attrs -- Combined Normal and Escaped
= Text Text -- Combined Normal and Escaped
| Bold [InlineText] Attrs
| Italic [InlineText] Attrs
| Crossed [InlineText] Attrs

View file

@ -65,16 +65,24 @@ inlineText' disallow = choice [try $ strikethrough disallow, try $ bold disallow
where
between' disallow start end middle_piece = between start end $ many ((notFollowedBy ((try $ void end) <|> disallow)) *> middle_piece)
strikethrough disallow = Crossed <$> (between' disallow (string "~~") (void $ string "~~") (inlineText' (disallow <|> (void $ string "~~"))))
strikethrough disallow = do
content <- between' disallow (string "~~") (void $ string "~~") (inlineText' (disallow <|> (void $ string "~~")))
pure $ Crossed content emptyAttrs
bold disallow = Bold <$> (between' disallow (string "**") (disallow <|> (void $ string "**")) (inlineText' (disallow <|> (void $ string "**"))))
bold disallow = do
content <- between' disallow (string "**") (disallow <|> (void $ string "**")) (inlineText' (disallow <|> (void $ string "**")))
pure $ Bold content emptyAttrs
italic :: (HasCallStack) => Parser s m () -> Parser s m InlineText
italic disallow = Italic <$> (between' disallow (char '*') ((void $ char '*') <|> disallow) (inlineText' (disallow <|> (void $ char '*'))))
italic disallow = do
content <- between' disallow (char '*') ((void $ char '*') <|> disallow) (inlineText' (disallow <|> (void $ char '*')))
pure $ Italic content emptyAttrs
underlined disallow = Underlined <$> (between' disallow (string "__") ((void $ string "__") <|> disallow) (inlineText' (disallow <|> (void $ string "__"))))
code = InlineCode . T.pack <$> (between' disallow (char '`') (char '`') ((notFollowedBy lineEnding) *> anySingle))
code = do
contents <- T.pack <$> between' disallow (char '`') (char '`') (notFollowedBy lineEnding *> anySingle)
pure $ InlineCode contents emptyAttrs
link :: (HasCallStack) => Parser s m () -> Parser s m InlineText
link disallow = do
linkText <- between' disallow (char '[') ((void $ char ']') <|> disallow) (inlineText' (disallow <|> (void $ char ']')))
@ -88,7 +96,7 @@ inlineText' disallow = choice [try $ strikethrough disallow, try $ bold disallow
Nothing -> pure Nothing
char ')'
pure (url, title)
pure Link {linkText, url, title}
pure Link {linkText, url, title, misc_attrs = emptyAttrs}
image disallow = do
logDebug "image:before excl"
@ -102,7 +110,7 @@ inlineText' disallow = choice [try $ strikethrough disallow, try $ bold disallow
Link {linkText = [Text altText], url, title} -> pure (altText, url, title)
Link {linkText = [], url, title} -> pure ("", url, title)
_ -> fail "Image alt text must be normal text, cannot be stylized in any way"
pure Image {altText, url, title}
pure Image {altText, url, title, misc_attrs = emptyAttrs}
inline_html =
HTMLInline <$> do
@ -125,7 +133,7 @@ headingBlock = do
heading_level <- length <$> (some $ char '#')
optional spaceChar
text <- many ((notFollowedBy blockEnding) *> inlineText)
pure $ Heading $ H {level = heading_level, text}
pure $ Heading (H {level = heading_level, text}) emptyAttrs
fencedCodeBlock :: (Logger m, Characters s) => Parser s m Element
fencedCodeBlock = between (string "```") (string "```") $ do
@ -136,7 +144,9 @@ fencedCodeBlock = between (string "```") (string "```") $ do
pure $ Code $ C {language, code}
blockquoteBlock :: (Logger m, Characters s) => Parser s m Element
blockquoteBlock = BlockQuote . Q . concat <$> (some blockquoteLine)
blockquoteBlock = do
content <- Q . concat <$> some blockquoteLine
pure $ BlockQuote content
where
blockquoteLine = do
char '>'
@ -145,7 +155,7 @@ blockquoteBlock = BlockQuote . Q . concat <$> (some blockquoteLine)
-- this dance with optional and notFollowedBy is done so we
-- aren't accidentally consuming part of a block ending
(optional ((notFollowedBy blockEnding) *> lineEnding))
pure ret
pure [(Paragraph $ P ret) emptyAttrs]
-- type of list the parser returns
-- parser which grabs the prefix for each item of the list
@ -154,7 +164,7 @@ blockquoteBlock = BlockQuote . Q . concat <$> (some blockquoteLine)
listBlock :: (Logger m, Characters s) => ListType -> Parser s m prefix -> (Int -> Parser s m List) -> Int -> Parser s m Element
listBlock list_type prefix child_parser_factory nest_level = do
items <- some $ listItem
pure $ List $ L {list_type, items}
pure $ List (L {list_type, items}) emptyAttrs
where
listItem = do
count nest_level ((try $ void $ char '\t') <|> (void $ (count 4 $ char ' ')))
@ -164,23 +174,23 @@ listBlock list_type prefix child_parser_factory nest_level = do
optional ((notFollowedBy blockEnding) *> lineEnding)
child <- optional $ child_parser_factory $ nest_level + 1
pure $ LI {content, child}
error "TODO, child list handling works different now, child needs to be combined into content"
pure $ LI {content = [Paragraph (P content) emptyAttrs]}
unorderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
unorderedListBlock = listBlock Unordered unordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
unorderedListBlock = listBlock Unordered {style = Nothing} unordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
where
unordered_prefix = (choice $ map char "*-+") *> (notFollowedBy newline *> spaceChar)
-- not exhaustive but we know listBlock is returning a List
unwrap (List l) = l
unwrap (List l _) = l
orderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
orderedListBlock = listBlock Ordered ordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
orderedListBlock = listBlock Ordered {style = Nothing, start_number = Nothing} ordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
where
-- regex equivalent: [0-9]+[.)]\s?
ordered_prefix = (some digitChar) *> (char '.' <|> char ')') *> (notFollowedBy newline *> spaceChar)
-- not exhaustive but we know listBlock is returning a List
unwrap (List l) = l
unwrap (List l _) = l
htmlBlock :: forall m s. (Logger m, Characters s) => Parser s m Element
htmlBlock = do
@ -216,4 +226,6 @@ htmlBlock = do
pure $ mconcat [name, "=\"", value, "\""]
paragraphBlock :: (Logger m, Characters s) => Parser s m Element
paragraphBlock = Paragraph . P <$> (many ((notFollowedBy blockEnding) *> inlineText))
paragraphBlock = do
content <- P <$> many ((notFollowedBy blockEnding) *> inlineText)
pure $ Paragraph content emptyAttrs