redoing markdown parsing and adding underlining

This commit is contained in:
Pagwin 2025-12-08 21:18:22 -05:00
parent 82c9fd8799
commit 115270a70b
No known key found for this signature in database
GPG key ID: 81137023740CA260
2 changed files with 78 additions and 380 deletions

View file

@ -58,6 +58,7 @@ data InlineText
| Bold [InlineText]
| Italic [InlineText]
| Crossed [InlineText]
| Underlined [InlineText]
| InlineCode Text
| Link
{ linkText :: [InlineText],

View file

@ -18,413 +18,110 @@ import IR
import Logger (Logger (logDebug))
import Text.Megaparsec (ParsecT, Stream, Token, Tokens, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try, (<?>))
import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string)
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, newline, string)
type ParserTG = ParsecT Void
type Parser = ParsecT Void
type ParserT m = ParserTG T.Text m
class (Token s ~ Char, Stream s, IsString (Tokens s), Show s) => Characters s
type Parser = ParserT Identity
instance Characters Text
anyChar :: (Token s ~ Char, Stream s) => ParserTG s m Char
anyChar = anySingle
instance Characters String
alphaNum :: (Token s ~ Char, Stream s) => ParserTG s m Char
alphaNum = alphaNumChar
digit :: (Token s ~ Char, Stream s) => ParserTG s m Char
digit = digitChar
noneOf :: (Token s ~ Char, Stream s) => [Char] -> ParserTG s m Char
noneOf = MP.noneOf
oneOf :: (Token s ~ Char, Stream s) => [Char] -> ParserTG s m Char
oneOf = MP.oneOf
optionMaybe :: (Token s ~ Char, Stream s) => ParserTG s m a -> ParserTG s m (Maybe a)
optionMaybe = optional
skipMany1 :: (Token s ~ Char, Stream s) => ParserTG s m a -> ParserTG s m ()
skipMany1 = skipSome
metadata :: (Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Text
metadata :: (Logger m, Characters s) => Parser s m Text
metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> anySingleBut '-') <* bound
where
bound = string "---"
document :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Document
document = do
logDebug "document"
Doc <$> many element <* eof
document :: (Logger m, Characters s) => Parser s m Document
document = Doc <$> many element
element :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
element :: (Logger m, Characters s) => Parser s m Element
element =
choice
[ try headingBlock <?> "Element Heading",
try fencedCodeBlock <?> "Fenced Code Block",
try indentedCodeBlock <?> "Indented Code Block",
try blockquoteBlock <?> "BlockQuote",
try unorderedListBlock <?> "Unordered List",
try orderedListBlock <?> "Ordered List",
try horizontalRuleBlock <?> "Horizontal Rule",
(try $ unorderedListBlock 0) <?> "Unordered List",
(try $ orderedListBlock 0) <?> "Ordered List",
try htmlBlock <?> "HTML Block",
try blankLines <?> "Blank Lines", -- Consume blank lines but don't add to AST
paragraphBlock <?> "Paragarph"
]
<* blockEnding
-- Blank lines (consumed but not stored)
blankLines :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
blankLines = do
skipMany1 (blankLine *> notFollowedBy eof)
element <|> fmap (const $ HTML HTMLTag {html_content = ""}) eof -- Parse the next element (or handle eof)
lineEnding :: (Logger m, Characters s) => Parser s m ()
lineEnding = (try eof) <|> void newline
blankLine :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m ()
blankLine = do
many (char ' ' <|> char '\t')
lineEnding
pure ()
blockEnding :: (Logger m, Characters s) => Parser s m ()
blockEnding = (try (lineEnding *> lineEnding)) <|> eof
-- Heading Block
headingBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
headingBlock = do
hashes <- some (char '#') <?> "Heading Hashes"
let level = length hashes
guard (level <= 6) <?> "Higher than level 6"
many (char ' ' <|> char '\t') <?> "Pre-Text Whitespace"
content <- manyTill (inlineElement <?> "Header Text") (try lineEnding <?> "Header Ending")
pure $ Heading $ H level content
-- Fenced Code Block
fencedCodeBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
fencedCodeBlock = do
logDebug "fenced_coding_block"
fence <- string "```" <|> string "~~~"
logDebug "fence"
lang <- optionMaybe languageInfo
logDebug "langInfo"
lineEnding'
logDebug "lineEnding"
codeLines <- manyTill (codeLine fence) (try $ string fence)
logDebug "lines"
pure $ Code $ C lang (T.pack $ unlines codeLines)
languageInfo :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Text
languageInfo = T.pack <$> some (alphaNum <|> char '-' <|> char '+' <|> char '.')
codeLine :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => (Tokens s) -> ParserTG s m String
codeLine fence = do
-- this is a hack which can only haunt me if I continue using markdown
line <- many $ (notFollowedBy $ string fence) *> noneOf "\n\r"
lineEnding'
pure line
-- Indented Code Block
indentedCodeBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
indentedCodeBlock = do
lines' <- some indentedLine
pure $ Code $ C Nothing (T.pack $ unlines lines')
-- TODO: check if inlineHTML needs to be handled in any markdown posts
-- TODO: link impl
-- TODO: image impl
inlineText :: (Logger m, Characters s) => Parser s m InlineText
inlineText = choice [try strikethrough, try bold, try italic, try underlined, try code, try link, try image, plain_text]
where
indentedLine = do
count 4 (char ' ' <|> char '\t')
line <- many $ noneOf "\n\r"
lineEnding
pure line
between' start end middle_piece = between start end $ many ((notFollowedBy end) *> middle_piece)
-- Blockquote Block
blockquoteBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
blockquoteBlock = do
lines' <- some blockquoteLine
pure $ BlockQuote $ Q (concat lines')
strikethrough = Crossed <$> (between' (string "~~") (string "~~") inlineText)
bold = Bold <$> (between' (string "**") (string "**") inlineText)
italic = Italic <$> (between' (char '*') (char '*') inlineText)
underlined = Underlined <$> (between' (string "__") (string "__") inlineText)
code = InlineCode . T.pack <$> (between' (char '`') (char '`') (notFollowedBy lineEnding *> anySingle))
link = do
linkText <- error "linkText parser"
url <- error "url parser"
title <- error "title parser"
pure Link {linkText, url, title}
image = do
altText <- error "altText"
url <- error "url"
title <- error "title"
pure Image {altText, url, title}
plain_text = Text . T.pack <$> (many ((notFollowedBy blockEnding) *> anySingle))
headingBlock :: (Logger m, Characters s) => Parser s m Element
headingBlock = do
heading_level <- length <$> (some $ char '#')
optional $ char ' '
text <- many $ inlineText
pure $ Heading $ H {level = heading_level, text}
fencedCodeBlock :: (Logger m, Characters s) => Parser s m Element
fencedCodeBlock = between (string "```") (string "```") $ do
language' <- T.pack <$> (many (notFollowedBy lineEnding *> anySingle))
lineEnding
code <- T.pack <$> (many ((notFollowedBy $ string "```") *> anySingle))
let language = if language' == "" then Just language' else Nothing
pure $ Code $ C {language, code}
blockquoteBlock :: (Logger m, Characters s) => Parser s m Element
blockquoteBlock = BlockQuote . Q . concat <$> (some blockquoteLine)
where
blockquoteLine = do
char '>'
optional (char ' ')
content <- many $ notFollowedBy lineEnding' *> inlineElement
pure content
optional $ char ' '
ret <- (many ((notFollowedBy lineEnding) *> inlineText))
-- this dance with optional and notFollowedBy is done so we
-- aren't accidentally consuming part of a block ending
(optional ((notFollowedBy blockEnding) *> lineEnding))
pure ret
-- Horizontal Rule Block
horizontalRuleBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
horizontalRuleBlock = do
choice
[ try (count 3 (char '*') >> many (char ' ' <|> char '*')),
try (count 3 (char '-') >> many (char ' ' <|> char '-')),
try (count 3 (char '_') >> many (char ' ' <|> char '_'))
]
lineEnding
pure HorizontalRule
unorderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
unorderedListBlock indent_level = error "TODO: unorderedListBlock"
-- Unordered List Block
unorderedListBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
unorderedListBlock = do
items <- some unorderedListItem
lineEnding'
pure $ List $ L Unordered items
orderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
orderedListBlock indent_level = error "TODO: orderedListBlock"
unorderedListItem :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m ListItem
unorderedListItem = do
oneOf "*-+"
optional (char ' ' <|> char '\t')
content <- many $ notFollowedBy lineEnding' *> inlineElement
lineEnding'
-- continuations <- many listContinuation
children <- maybeToList <$> (optional $ indentedList 1)
pure $ LI content children
htmlBlock :: (Logger m, Characters s) => Parser s m Element
htmlBlock = error "TODO: htmlBlock"
-- TODO: handle list indentation at all levels
indentedList :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => Int -> ParserTG s m List
indentedList n = do
void $ (count (4 * n) (char ' ')) <|> count n (char '\t')
ret <- (try indentedUnorderedList) <|> indentedOrderedList
pure ret <* lineEnding'
indentedUnorderedList :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m List
indentedUnorderedList = do
items <- some (indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t')))
pure $ L Unordered items
indentedOrderedList :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m List
indentedOrderedList = do
items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t')))
pure $ L Ordered items
indentedListItem :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m () -> ParserTG s m ListItem
indentedListItem marker = do
marker
content <- many $ notFollowedBy lineEnding' *> inlineElement
pure $ LI content []
-- Ordered List Block
orderedListBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
orderedListBlock = do
items <- some orderedListItem
lineEnding'
pure $ List $ L Ordered items
orderedListItem :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m ListItem
orderedListItem = do
some digit
char '.' <|> char ')'
optional (char ' ' <|> char '\t')
content <- many $ notFollowedBy lineEnding' *> inlineElement
lineEnding'
-- continuations <- many listContinuation
children <- maybeToList <$> (optional $ indentedList 1)
pure $ LI content children
-- HTML Block
htmlBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
htmlBlock = do
char '<'
-- Capture the entire HTML block as raw text
rest <- manyTill anyChar (try $ char '>' >> lineEnding)
let content = '<' : (rest <> ">")
return $ HTML $ HTMLTag (T.pack content)
-- Paragraph Block
paragraphBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element
paragraphBlock = do
logDebug "paragraph"
content <- some (notFollowedBy lineEnding *> inlineElement)
lineEnding <|> eof
pure $ Paragraph $ P content
-- Inline Elements
inlineElement :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
inlineElement =
logDebug "inlineElement"
*> choice
[ try strong <?> "Inline Strong Text",
try emphasis <?> "Inline Italic Text",
try crossedText <?> "Inline Crossed Text",
try codeSpan <?> "Inline Code",
try image <?> "Inline Image",
try link <?> "Inline Link",
try htmlInline <?> "Inline HTML",
try escapedChar <?> "Escaped Character",
plainText <?> "Inline Plain Text"
]
-- Strong (Bold)
strong :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
strong = strongAsterisk <|> strongUnderscore
strongAsterisk :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
strongAsterisk = do
string "**"
content <- some (notFollowedBy (string "**") >> inlineElement)
string "**"
pure $ Bold content
strongUnderscore :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
strongUnderscore = do
string "__"
content <- some (notFollowedBy (string "__") >> inlineElement)
string "__"
pure $ Bold content
crossedText :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
crossedText = do
string "~~"
content <- some (notFollowedBy (string "~~") >> inlineElement)
string "~~"
pure $ Crossed content
-- Emphasis (Italic)
emphasis :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
emphasis = emphasisAsterisk <|> emphasisUnderscore
emphasisAsterisk :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
emphasisAsterisk = do
char '*'
content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk)
char '*'
pure $ Italic content
emphasisUnderscore :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
emphasisUnderscore = do
char '_'
content <- some inlineElementNoUnderscore
char '_'
pure $ Italic content
inlineElementNo :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => Char -> ParserTG s m InlineText
inlineElementNo c =
choice
[ try strong,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainTextNo [c]
]
plainTextNo :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => [Char] -> ParserTG s m InlineText
plainTextNo list = do
plainTextNo' False list
plainTextNo' :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => Bool -> [Char] -> ParserTG s m InlineText
plainTextNo' block_whitespace disallow = do
logDebug $ "base plain Text: " <> T.show block_whitespace <> " " <> T.show disallow
firstChar <- noneOf (disallow <> if block_whitespace then " \t\r\n" else []) <?> "Plain Text Initial Disallow"
remChars <- many $ notFollowedBy lineEnding' *> plainTextCharNo disallow
pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars
where
wspHandler '\n' = ' '
wspHandler c = c
inlineElementNoAsterisk :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
inlineElementNoAsterisk = inlineElementNo '*'
inlineElementNoUnderscore :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
inlineElementNoUnderscore = inlineElementNo '_'
-- Code Span
codeSpan :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
codeSpan =
choice
[ try tripleBacktick,
try doubleBacktick,
singleBacktick
]
where
singleBacktick = do
char '`'
content <- many $ noneOf "`\n\r"
char '`'
pure $ InlineCode (T.pack content)
doubleBacktick = do
string "``"
content <- manyTill anyChar (try $ string "``")
pure $ InlineCode (T.pack content)
tripleBacktick = do
string "```"
content <- manyTill anyChar (try $ string "```")
pure $ InlineCode (T.pack content)
-- Image
image :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m InlineText
image = do
char '!'
char '['
alt <- T.pack <$> many (noneOf "]\n\r")
char ']'
(url, title) <- linkDestination
return $ Image {altText = alt, url = url, title = title}
-- Link
link :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
link = do
char '['
content <- some (notFollowedBy (char ']') >> inlineElementNoBracket)
char ']'
(url, title) <- linkDestination
pure $ Link content url title
inlineElementNoBracket :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
inlineElementNoBracket =
choice
[ try strong,
try emphasis,
try codeSpan,
try htmlInline,
try escapedChar,
plainTextNo "[]"
]
linkDestination :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m (Text, Maybe Text)
linkDestination = directLink <|> referenceLink
where
directLink = do
char '('
url <- many $ noneOf " \t\n\r)"
title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser)
char ')'
pure (T.pack url, title)
referenceLink = do
char '['
ref <- some (alphaNum <|> char ' ' <|> char '\t')
char ']'
-- For simplicity, we're not resolving references here
-- In a real implementation, you'd look up the reference
pure (T.pack $ "[" ++ ref ++ "]", Nothing)
titleParser :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m Text
titleParser =
T.pack
<$> choice
[ between (char '"') (char '"') (many $ anySingleBut '"'),
between (char '\'') (char '\'') (many $ anySingleBut '\''),
between (char '(') (char ')') (many $ anySingleBut ')')
]
-- HTML Inline
htmlInline :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m InlineText
htmlInline = do
start <- char '<'
content <- manyTill anyChar (try $ char '>')
return $ HTMLInline (T.pack (start : content ++ ">"))
-- Escaped Character
escapedChar :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m InlineText
escapedChar = do
char '\\'
c <- satisfy (\x -> x >= '!' && x <= '~')
pure $ Text (T.singleton c)
-- Plain Text
plainText :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText
plainText = plainTextNo' False [] <?> "Baseline Plain Text"
plainTextBaseDisallow :: [Char]
plainTextBaseDisallow = "[~`_*<"
plainTextCharNo :: (Logger m, Token s ~ Char, Stream s) => [Char] -> ParserTG s m Char
plainTextCharNo additional = noneOf $ additional <> plainTextBaseDisallow
-- Helper Parsers
lineEnding :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m ()
lineEnding = void (try $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")) <|> eof
lineEnding' :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m ()
lineEnding' = void (try (string "\r\n") <|> try (string "\n") <|> string "\r") <|> eof
paragraphBlock :: (Logger m, Characters s) => Parser s m Element
paragraphBlock = Paragraph . P <$> (many inlineText)