From 115270a70b50c9047c6ba8dbd808cc4594faa4ab Mon Sep 17 00:00:00 2001 From: Pagwin Date: Mon, 8 Dec 2025 21:18:22 -0500 Subject: [PATCH] redoing markdown parsing and adding underlining --- src/IR.hs | 1 + src/Markdown.hs | 457 ++++++++---------------------------------------- 2 files changed, 78 insertions(+), 380 deletions(-) diff --git a/src/IR.hs b/src/IR.hs index 29f3cd7..3aa30b0 100644 --- a/src/IR.hs +++ b/src/IR.hs @@ -58,6 +58,7 @@ data InlineText | Bold [InlineText] | Italic [InlineText] | Crossed [InlineText] + | Underlined [InlineText] | InlineCode Text | Link { linkText :: [InlineText], diff --git a/src/Markdown.hs b/src/Markdown.hs index 0ec8b8b..5710973 100644 --- a/src/Markdown.hs +++ b/src/Markdown.hs @@ -18,413 +18,110 @@ import IR import Logger (Logger (logDebug)) import Text.Megaparsec (ParsecT, Stream, Token, Tokens, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try, ()) import qualified Text.Megaparsec as MP -import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string) +import Text.Megaparsec.Char (alphaNumChar, char, digitChar, newline, string) -type ParserTG = ParsecT Void +type Parser = ParsecT Void -type ParserT m = ParserTG T.Text m +class (Token s ~ Char, Stream s, IsString (Tokens s), Show s) => Characters s -type Parser = ParserT Identity +instance Characters Text -anyChar :: (Token s ~ Char, Stream s) => ParserTG s m Char -anyChar = anySingle +instance Characters String -alphaNum :: (Token s ~ Char, Stream s) => ParserTG s m Char -alphaNum = alphaNumChar - -digit :: (Token s ~ Char, Stream s) => ParserTG s m Char -digit = digitChar - -noneOf :: (Token s ~ Char, Stream s) => [Char] -> ParserTG s m Char -noneOf = MP.noneOf - -oneOf :: (Token s ~ Char, Stream s) => [Char] -> ParserTG s m Char -oneOf = MP.oneOf - -optionMaybe :: (Token s ~ Char, Stream s) => ParserTG s m a -> ParserTG s m (Maybe a) -optionMaybe = optional - -skipMany1 :: (Token s ~ Char, Stream s) => ParserTG s m a -> ParserTG s m () -skipMany1 = skipSome - -metadata :: (Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Text +metadata :: (Logger m, Characters s) => Parser s m Text metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> anySingleBut '-') <* bound where bound = string "---" -document :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Document -document = do - logDebug "document" - Doc <$> many element <* eof +document :: (Logger m, Characters s) => Parser s m Document +document = Doc <$> many element -element :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element +element :: (Logger m, Characters s) => Parser s m Element element = choice [ try headingBlock "Element Heading", try fencedCodeBlock "Fenced Code Block", - try indentedCodeBlock "Indented Code Block", try blockquoteBlock "BlockQuote", - try unorderedListBlock "Unordered List", - try orderedListBlock "Ordered List", - try horizontalRuleBlock "Horizontal Rule", + (try $ unorderedListBlock 0) "Unordered List", + (try $ orderedListBlock 0) "Ordered List", try htmlBlock "HTML Block", - try blankLines "Blank Lines", -- Consume blank lines but don't add to AST paragraphBlock "Paragarph" ] + <* blockEnding --- Blank lines (consumed but not stored) -blankLines :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -blankLines = do - skipMany1 (blankLine *> notFollowedBy eof) - element <|> fmap (const $ HTML HTMLTag {html_content = ""}) eof -- Parse the next element (or handle eof) +lineEnding :: (Logger m, Characters s) => Parser s m () +lineEnding = (try eof) <|> void newline -blankLine :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m () -blankLine = do - many (char ' ' <|> char '\t') - lineEnding - pure () +blockEnding :: (Logger m, Characters s) => Parser s m () +blockEnding = (try (lineEnding *> lineEnding)) <|> eof --- Heading Block -headingBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -headingBlock = do - hashes <- some (char '#') "Heading Hashes" - let level = length hashes - guard (level <= 6) "Higher than level 6" - many (char ' ' <|> char '\t') "Pre-Text Whitespace" - content <- manyTill (inlineElement "Header Text") (try lineEnding "Header Ending") - pure $ Heading $ H level content - --- Fenced Code Block -fencedCodeBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -fencedCodeBlock = do - logDebug "fenced_coding_block" - fence <- string "```" <|> string "~~~" - logDebug "fence" - lang <- optionMaybe languageInfo - logDebug "langInfo" - lineEnding' - logDebug "lineEnding" - codeLines <- manyTill (codeLine fence) (try $ string fence) - logDebug "lines" - pure $ Code $ C lang (T.pack $ unlines codeLines) - -languageInfo :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Text -languageInfo = T.pack <$> some (alphaNum <|> char '-' <|> char '+' <|> char '.') - -codeLine :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => (Tokens s) -> ParserTG s m String -codeLine fence = do - -- this is a hack which can only haunt me if I continue using markdown - line <- many $ (notFollowedBy $ string fence) *> noneOf "\n\r" - lineEnding' - pure line - --- Indented Code Block -indentedCodeBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -indentedCodeBlock = do - lines' <- some indentedLine - pure $ Code $ C Nothing (T.pack $ unlines lines') +-- TODO: check if inlineHTML needs to be handled in any markdown posts +-- TODO: link impl +-- TODO: image impl +inlineText :: (Logger m, Characters s) => Parser s m InlineText +inlineText = choice [try strikethrough, try bold, try italic, try underlined, try code, try link, try image, plain_text] where - indentedLine = do - count 4 (char ' ' <|> char '\t') - line <- many $ noneOf "\n\r" - lineEnding - pure line + between' start end middle_piece = between start end $ many ((notFollowedBy end) *> middle_piece) --- Blockquote Block -blockquoteBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -blockquoteBlock = do - lines' <- some blockquoteLine - pure $ BlockQuote $ Q (concat lines') + strikethrough = Crossed <$> (between' (string "~~") (string "~~") inlineText) + + bold = Bold <$> (between' (string "**") (string "**") inlineText) + + italic = Italic <$> (between' (char '*') (char '*') inlineText) + + underlined = Underlined <$> (between' (string "__") (string "__") inlineText) + + code = InlineCode . T.pack <$> (between' (char '`') (char '`') (notFollowedBy lineEnding *> anySingle)) + + link = do + linkText <- error "linkText parser" + url <- error "url parser" + title <- error "title parser" + pure Link {linkText, url, title} + + image = do + altText <- error "altText" + url <- error "url" + title <- error "title" + pure Image {altText, url, title} + + plain_text = Text . T.pack <$> (many ((notFollowedBy blockEnding) *> anySingle)) + +headingBlock :: (Logger m, Characters s) => Parser s m Element +headingBlock = do + heading_level <- length <$> (some $ char '#') + optional $ char ' ' + text <- many $ inlineText + pure $ Heading $ H {level = heading_level, text} + +fencedCodeBlock :: (Logger m, Characters s) => Parser s m Element +fencedCodeBlock = between (string "```") (string "```") $ do + language' <- T.pack <$> (many (notFollowedBy lineEnding *> anySingle)) + lineEnding + code <- T.pack <$> (many ((notFollowedBy $ string "```") *> anySingle)) + let language = if language' == "" then Just language' else Nothing + pure $ Code $ C {language, code} + +blockquoteBlock :: (Logger m, Characters s) => Parser s m Element +blockquoteBlock = BlockQuote . Q . concat <$> (some blockquoteLine) where blockquoteLine = do char '>' - optional (char ' ') - content <- many $ notFollowedBy lineEnding' *> inlineElement - pure content + optional $ char ' ' + ret <- (many ((notFollowedBy lineEnding) *> inlineText)) + -- this dance with optional and notFollowedBy is done so we + -- aren't accidentally consuming part of a block ending + (optional ((notFollowedBy blockEnding) *> lineEnding)) + pure ret --- Horizontal Rule Block -horizontalRuleBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -horizontalRuleBlock = do - choice - [ try (count 3 (char '*') >> many (char ' ' <|> char '*')), - try (count 3 (char '-') >> many (char ' ' <|> char '-')), - try (count 3 (char '_') >> many (char ' ' <|> char '_')) - ] - lineEnding - pure HorizontalRule +unorderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element +unorderedListBlock indent_level = error "TODO: unorderedListBlock" --- Unordered List Block -unorderedListBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -unorderedListBlock = do - items <- some unorderedListItem - lineEnding' - pure $ List $ L Unordered items +orderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element +orderedListBlock indent_level = error "TODO: orderedListBlock" -unorderedListItem :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m ListItem -unorderedListItem = do - oneOf "*-+" - optional (char ' ' <|> char '\t') - content <- many $ notFollowedBy lineEnding' *> inlineElement - lineEnding' - -- continuations <- many listContinuation - children <- maybeToList <$> (optional $ indentedList 1) - pure $ LI content children +htmlBlock :: (Logger m, Characters s) => Parser s m Element +htmlBlock = error "TODO: htmlBlock" --- TODO: handle list indentation at all levels -indentedList :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => Int -> ParserTG s m List -indentedList n = do - void $ (count (4 * n) (char ' ')) <|> count n (char '\t') - ret <- (try indentedUnorderedList) <|> indentedOrderedList - pure ret <* lineEnding' - -indentedUnorderedList :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m List -indentedUnorderedList = do - items <- some (indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t'))) - pure $ L Unordered items - -indentedOrderedList :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m List -indentedOrderedList = do - items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t'))) - pure $ L Ordered items - -indentedListItem :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m () -> ParserTG s m ListItem -indentedListItem marker = do - marker - content <- many $ notFollowedBy lineEnding' *> inlineElement - pure $ LI content [] - --- Ordered List Block -orderedListBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -orderedListBlock = do - items <- some orderedListItem - lineEnding' - pure $ List $ L Ordered items - -orderedListItem :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m ListItem -orderedListItem = do - some digit - char '.' <|> char ')' - optional (char ' ' <|> char '\t') - content <- many $ notFollowedBy lineEnding' *> inlineElement - lineEnding' - -- continuations <- many listContinuation - children <- maybeToList <$> (optional $ indentedList 1) - pure $ LI content children - --- HTML Block -htmlBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -htmlBlock = do - char '<' - -- Capture the entire HTML block as raw text - rest <- manyTill anyChar (try $ char '>' >> lineEnding) - let content = '<' : (rest <> ">") - return $ HTML $ HTMLTag (T.pack content) - --- Paragraph Block -paragraphBlock :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m Element -paragraphBlock = do - logDebug "paragraph" - content <- some (notFollowedBy lineEnding *> inlineElement) - lineEnding <|> eof - pure $ Paragraph $ P content - --- Inline Elements -inlineElement :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -inlineElement = - logDebug "inlineElement" - *> choice - [ try strong "Inline Strong Text", - try emphasis "Inline Italic Text", - try crossedText "Inline Crossed Text", - try codeSpan "Inline Code", - try image "Inline Image", - try link "Inline Link", - try htmlInline "Inline HTML", - try escapedChar "Escaped Character", - plainText "Inline Plain Text" - ] - --- Strong (Bold) -strong :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -strong = strongAsterisk <|> strongUnderscore - -strongAsterisk :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -strongAsterisk = do - string "**" - content <- some (notFollowedBy (string "**") >> inlineElement) - string "**" - pure $ Bold content - -strongUnderscore :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -strongUnderscore = do - string "__" - content <- some (notFollowedBy (string "__") >> inlineElement) - string "__" - pure $ Bold content - -crossedText :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -crossedText = do - string "~~" - content <- some (notFollowedBy (string "~~") >> inlineElement) - string "~~" - pure $ Crossed content - --- Emphasis (Italic) -emphasis :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -emphasis = emphasisAsterisk <|> emphasisUnderscore - -emphasisAsterisk :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -emphasisAsterisk = do - char '*' - content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk) - char '*' - pure $ Italic content - -emphasisUnderscore :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -emphasisUnderscore = do - char '_' - content <- some inlineElementNoUnderscore - char '_' - pure $ Italic content - -inlineElementNo :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => Char -> ParserTG s m InlineText -inlineElementNo c = - choice - [ try strong, - try codeSpan, - try image, - try link, - try htmlInline, - try escapedChar, - plainTextNo [c] - ] - -plainTextNo :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => [Char] -> ParserTG s m InlineText -plainTextNo list = do - plainTextNo' False list - -plainTextNo' :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => Bool -> [Char] -> ParserTG s m InlineText -plainTextNo' block_whitespace disallow = do - logDebug $ "base plain Text: " <> T.show block_whitespace <> " " <> T.show disallow - firstChar <- noneOf (disallow <> if block_whitespace then " \t\r\n" else []) "Plain Text Initial Disallow" - remChars <- many $ notFollowedBy lineEnding' *> plainTextCharNo disallow - pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars - where - wspHandler '\n' = ' ' - wspHandler c = c - -inlineElementNoAsterisk :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -inlineElementNoAsterisk = inlineElementNo '*' - -inlineElementNoUnderscore :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -inlineElementNoUnderscore = inlineElementNo '_' - --- Code Span -codeSpan :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -codeSpan = - choice - [ try tripleBacktick, - try doubleBacktick, - singleBacktick - ] - where - singleBacktick = do - char '`' - content <- many $ noneOf "`\n\r" - char '`' - pure $ InlineCode (T.pack content) - doubleBacktick = do - string "``" - content <- manyTill anyChar (try $ string "``") - pure $ InlineCode (T.pack content) - tripleBacktick = do - string "```" - content <- manyTill anyChar (try $ string "```") - pure $ InlineCode (T.pack content) - --- Image -image :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m InlineText -image = do - char '!' - char '[' - alt <- T.pack <$> many (noneOf "]\n\r") - char ']' - (url, title) <- linkDestination - return $ Image {altText = alt, url = url, title = title} - --- Link -link :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -link = do - char '[' - content <- some (notFollowedBy (char ']') >> inlineElementNoBracket) - char ']' - (url, title) <- linkDestination - pure $ Link content url title - -inlineElementNoBracket :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -inlineElementNoBracket = - choice - [ try strong, - try emphasis, - try codeSpan, - try htmlInline, - try escapedChar, - plainTextNo "[]" - ] - -linkDestination :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m (Text, Maybe Text) -linkDestination = directLink <|> referenceLink - where - directLink = do - char '(' - url <- many $ noneOf " \t\n\r)" - title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser) - char ')' - pure (T.pack url, title) - referenceLink = do - char '[' - ref <- some (alphaNum <|> char ' ' <|> char '\t') - char ']' - -- For simplicity, we're not resolving references here - -- In a real implementation, you'd look up the reference - pure (T.pack $ "[" ++ ref ++ "]", Nothing) - -titleParser :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m Text -titleParser = - T.pack - <$> choice - [ between (char '"') (char '"') (many $ anySingleBut '"'), - between (char '\'') (char '\'') (many $ anySingleBut '\''), - between (char '(') (char ')') (many $ anySingleBut ')') - ] - --- HTML Inline -htmlInline :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m InlineText -htmlInline = do - start <- char '<' - content <- manyTill anyChar (try $ char '>') - return $ HTMLInline (T.pack (start : content ++ ">")) - --- Escaped Character -escapedChar :: (Logger m, Token s ~ Char, Stream s) => ParserTG s m InlineText -escapedChar = do - char '\\' - c <- satisfy (\x -> x >= '!' && x <= '~') - pure $ Text (T.singleton c) - --- Plain Text -plainText :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m InlineText -plainText = plainTextNo' False [] "Baseline Plain Text" - -plainTextBaseDisallow :: [Char] -plainTextBaseDisallow = "[~`_*<" - -plainTextCharNo :: (Logger m, Token s ~ Char, Stream s) => [Char] -> ParserTG s m Char -plainTextCharNo additional = noneOf $ additional <> plainTextBaseDisallow - --- Helper Parsers -lineEnding :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m () -lineEnding = void (try $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")) <|> eof - -lineEnding' :: (Logger m, Token s ~ Char, Stream s, IsString (Tokens s)) => ParserTG s m () -lineEnding' = void (try (string "\r\n") <|> try (string "\n") <|> string "\r") <|> eof +paragraphBlock :: (Logger m, Characters s) => Parser s m Element +paragraphBlock = Paragraph . P <$> (many inlineText)