escaped html stuff that needs escaping, added crossing out text and fixed how plain text consumed everything alongside setup for some future refactoring of parsing

This commit is contained in:
Pagwin 2025-11-07 21:38:48 -05:00
parent 24606b1e83
commit b834cd6dbb
No known key found for this signature in database
GPG key ID: 81137023740CA260
3 changed files with 48 additions and 9 deletions

View file

@ -6,6 +6,19 @@ import Data.Maybe (fromMaybe)
import qualified Data.Text as T import qualified Data.Text as T
import IR import IR
escapeChar :: Char -> T.Text
escapeChar '<' = "&lt;"
escapeChar '>' = "&gt;"
escapeChar '"' = "&quot;"
escapeChar '\'' = "&#39;"
escapeChar '&' = "&amp;"
escapeChar c = T.singleton c
-- not effiecient, main optimization would be to have unpack go to
-- some haskell vector or array impl rather than the list impl
escapeText :: T.Text -> T.Text
escapeText = T.concat . map escapeChar . T.unpack
tshow :: (Show s) => s -> T.Text tshow :: (Show s) => s -> T.Text
tshow = T.pack . show tshow = T.pack . show
@ -15,7 +28,7 @@ compileToHTML (Doc elements) = T.concat $ map elementToHTML elements
elementToHTML :: Element -> T.Text elementToHTML :: Element -> T.Text
elementToHTML (Heading header) = T.concat ["<h", tshow header.level, ">", serializeInlineToHTML header.text, "</h", tshow header.level, ">"] elementToHTML (Heading header) = T.concat ["<h", tshow header.level, ">", serializeInlineToHTML header.text, "</h", tshow header.level, ">"]
-- --
elementToHTML (Code code_block) = T.concat ["<pre class=\"sourceCode ", language, "\"><code class=\"sourceCode ", language, "\">", code_block.code, "</code>", "</pre>"] elementToHTML (Code code_block) = T.concat ["<pre class=\"sourceCode ", language, "\"><code class=\"sourceCode ", language, "\">", escapeText code_block.code, "</code>", "</pre>"]
where where
language = fromMaybe "" code_block.language language = fromMaybe "" code_block.language
elementToHTML (BlockQuote (Q elems)) = T.concat ["<blockquote>", serializeInlineToHTML elems, "</blockquote>"] elementToHTML (BlockQuote (Q elems)) = T.concat ["<blockquote>", serializeInlineToHTML elems, "</blockquote>"]
@ -40,10 +53,11 @@ generateLiElems (element : remainder) =
serializeInlineToHTML :: [InlineText] -> T.Text serializeInlineToHTML :: [InlineText] -> T.Text
serializeInlineToHTML [] = "" serializeInlineToHTML [] = ""
serializeInlineToHTML (Text t : remaining) = t <> serializeInlineToHTML remaining serializeInlineToHTML (Text t : remaining) = escapeText t <> serializeInlineToHTML remaining
serializeInlineToHTML (Bold elems : remaining) = T.concat ["<b>", serializeInlineToHTML elems, "</b>", serializeInlineToHTML remaining] serializeInlineToHTML (Bold elems : remaining) = T.concat ["<b>", serializeInlineToHTML elems, "</b>", serializeInlineToHTML remaining]
serializeInlineToHTML (Italic elems : remaining) = T.concat ["<i>", serializeInlineToHTML elems, "</i>", serializeInlineToHTML remaining] serializeInlineToHTML (Italic elems : remaining) = T.concat ["<i>", serializeInlineToHTML elems, "</i>", serializeInlineToHTML remaining]
serializeInlineToHTML (InlineCode code : remaining) = T.concat ["<code>", code, "</code>", serializeInlineToHTML remaining] serializeInlineToHTML (Crossed elems : remaining) = T.concat ["<s>", serializeInlineToHTML elems, "</s>", serializeInlineToHTML remaining]
serializeInlineToHTML (Link {linkText, url, title} : remaining) = T.concat ["<a href=\"", url, "\" ", maybe "" (\t -> T.concat ["title=\"", t, "\""]) title, "\">", serializeInlineToHTML linkText, "</a>", serializeInlineToHTML remaining] serializeInlineToHTML (InlineCode code : remaining) = T.concat ["<code>", escapeText code, "</code>", serializeInlineToHTML remaining]
serializeInlineToHTML (Image {altText, url, title} : remaining) = T.concat ["<img src=\">", url, "\" alt=\"", altText, "\"", maybe "" (\t -> T.concat ["title=\"", t, "\""]) title, ">", serializeInlineToHTML remaining] serializeInlineToHTML (Link {linkText, url, title} : remaining) = T.concat ["<a href=\"", url, "\" ", maybe "" (\t -> T.concat ["title=\"", escapeText t, "\""]) title, "\">", serializeInlineToHTML linkText, "</a>", serializeInlineToHTML remaining]
serializeInlineToHTML (Image {altText, url, title} : remaining) = T.concat ["<img src=\">", url, "\" alt=\"", escapeText altText, "\"", maybe "" (\t -> T.concat ["title=\"", escapeText t, "\""]) title, ">", serializeInlineToHTML remaining]
serializeInlineToHTML (HTMLInline {inline_html_content} : remaining) = inline_html_content <> serializeInlineToHTML remaining serializeInlineToHTML (HTMLInline {inline_html_content} : remaining) = inline_html_content <> serializeInlineToHTML remaining

View file

@ -57,6 +57,7 @@ data InlineText
= Text Text -- Combined Normal and Escaped = Text Text -- Combined Normal and Escaped
| Bold [InlineText] | Bold [InlineText]
| Italic [InlineText] | Italic [InlineText]
| Crossed [InlineText]
| InlineCode Text | InlineCode Text
| Link | Link
{ linkText :: [InlineText], { linkText :: [InlineText],

View file

@ -223,6 +223,7 @@ inlineElement =
choice choice
[ try strong, [ try strong,
try emphasis, try emphasis,
try crossedText,
try codeSpan, try codeSpan,
try image, try image,
try link, try link,
@ -249,6 +250,13 @@ strongUnderscore = do
string "__" string "__"
pure $ Bold content pure $ Bold content
crossedText :: Parser InlineText
crossedText = do
string "~~"
content <- some (notFollowedBy (string "~~") >> inlineElementNo '~')
string "~~"
pure $ Crossed content
-- Emphasis (Italic) -- Emphasis (Italic)
emphasis :: Parser InlineText emphasis :: Parser InlineText
emphasis = emphasisAsterisk <|> emphasisUnderscore emphasis = emphasisAsterisk <|> emphasisUnderscore
@ -267,6 +275,21 @@ emphasisUnderscore = do
char '_' char '_'
pure $ Italic content pure $ Italic content
inlineElementNo :: Char -> Parser InlineText
inlineElementNo c =
choice
[ try strong,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainTextNo c
]
plainTextNo :: Char -> Parser InlineText
plainTextNo c = fmap (Text . T.pack) $ some $ noneOf [c, '\n']
inlineElementNoAsterisk :: Parser InlineText inlineElementNoAsterisk :: Parser InlineText
inlineElementNoAsterisk = inlineElementNoAsterisk =
choice choice
@ -276,7 +299,7 @@ inlineElementNoAsterisk =
try link, try link,
try htmlInline, try htmlInline,
try escapedChar, try escapedChar,
plainTextNoAsterisk plainTextNo '*'
] ]
inlineElementNoUnderscore :: Parser InlineText inlineElementNoUnderscore :: Parser InlineText
@ -288,7 +311,7 @@ inlineElementNoUnderscore =
try link, try link,
try htmlInline, try htmlInline,
try escapedChar, try escapedChar,
plainTextNoUnderscore plainTextNo '_'
] ]
-- Code Span -- Code Span
@ -385,11 +408,12 @@ escapedChar = do
pure $ Text (T.singleton c) pure $ Text (T.singleton c)
-- Plain Text -- Plain Text
-- TODO: this eats stuff it shouldn't, inefficient solution is to try other inline elements and exit if they succeed
plainText :: Parser InlineText plainText :: Parser InlineText
plainText = Text . T.pack <$> some plainTextChar plainText = fmap (Text . T.pack) (liftA2 (:) (noneOf "\n") $ many plainTextChar)
plainTextChar :: Parser Char plainTextChar :: Parser Char
plainTextChar = noneOf "\n" plainTextChar = noneOf "\n[~`_*"
plainTextNoAsterisk :: Parser InlineText plainTextNoAsterisk :: Parser InlineText
plainTextNoAsterisk = fmap (Text . T.pack) $ some $ noneOf "*\n" plainTextNoAsterisk = fmap (Text . T.pack) $ some $ noneOf "*\n"