{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} -- (document, metadata) module Markdown (document, metadata) where import Control.Applicative (many, optional, some, (<|>)) import Control.Monad (guard, void) import Data.Char (isAlpha) import Data.Functor.Identity (Identity) import Data.Text (Text) import qualified Data.Text as T import Data.Void (Void) import IR import Logger (Logger (logDebug)) import Text.Megaparsec (ParsecT, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try, ()) import qualified Text.Megaparsec as MP import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string) import Utilities (tee) type ParserTG = ParsecT Void type ParserT m = ParserTG T.Text m type Parser = ParserT Identity logP :: (Logger m, Show s) => ParserTG s m s -> ParserTG s m s logP = tee (logDebug . T.show) anyChar :: ParserTG s m Char anyChar = anySingle alphaNum :: ParserTG s m Char alphaNum = alphaNumChar digit :: ParserTG s m Char digit = digitChar noneOf :: [Char] -> ParserTG s m Char noneOf = MP.noneOf oneOf :: [Char] -> ParserTG s m Char oneOf = MP.oneOf optionMaybe :: ParserTG s m a -> ParserTG s m (Maybe a) optionMaybe = optional skipMany1 :: ParserTG s m a -> ParserTG s m () skipMany1 = skipSome metadata :: ParserTG s m Text metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> anySingleBut '-') <* bound where bound = string "---" document :: (Logger m) => ParserTG s m Document document = Doc <$> many element <* eof element :: (Logger m) => ParserTG s m Element element = choice [ try headingBlock "Element Heading", try fencedCodeBlock "Fenced Code Block", try indentedCodeBlock "Indented Code Block", try blockquoteBlock "BlockQuote", try unorderedListBlock "Unordered List", try orderedListBlock "Ordered List", try horizontalRuleBlock "Horizontal Rule", try htmlBlock "HTML Block", try blankLines "Blank Lines", -- Consume blank lines but don't add to AST paragraphBlock "Paragarph" ] -- Blank lines (consumed but not stored) blankLines :: (Logger m) => ParserTG s m Element blankLines = do skipMany1 (blankLine *> notFollowedBy eof) element <|> fmap (const $ HTML HTMLTag {html_content = ""}) eof -- Parse the next element (or handle eof) blankLine :: (Logger m) => ParserTG s m () blankLine = do many (char ' ' <|> char '\t') lineEnding pure () -- Heading Block headingBlock :: (Logger m) => ParserTG s m Element headingBlock = do hashes <- some (char '#') "Heading Hashes" let level = length hashes guard (level <= 6) "Higher than level 6" many (char ' ' <|> char '\t') "Pre-Text Whitespace" content <- manyTill (inlineElement "Header Text") (try lineEnding "Header Ending") pure $ Heading $ H level content -- Fenced Code Block fencedCodeBlock :: (Logger m) => ParserTG s m Element fencedCodeBlock = do fence <- string "```" <|> string "~~~" lang <- optionMaybe languageInfo lineEnding codeLines <- manyTill codeLine (try $ string fence) pure $ Code $ C lang (T.pack $ unlines codeLines) languageInfo :: (Logger m) => ParserTG s m Text languageInfo = T.pack <$> some (alphaNum <|> char '-' <|> char '+' <|> char '.') codeLine :: (Logger m) => ParserTG s m String codeLine = do line <- many $ noneOf "\n\r" lineEnding pure line -- Indented Code Block indentedCodeBlock :: (Logger m) => ParserTG s m Element indentedCodeBlock = do lines' <- some indentedLine pure $ Code $ C Nothing (T.pack $ unlines lines') where indentedLine = do count 4 (char ' ' <|> char '\t') line <- many $ noneOf "\n\r" lineEnding pure line -- Blockquote Block blockquoteBlock :: (Logger m) => ParserTG s m Element blockquoteBlock = do lines' <- some blockquoteLine pure $ BlockQuote $ Q (concat lines') where blockquoteLine = do char '>' optional (char ' ') content <- manyTill inlineElement (try lineEnding) pure content -- Horizontal Rule Block horizontalRuleBlock :: (Logger m) => ParserTG s m Element horizontalRuleBlock = do choice [ try (count 3 (char '*') >> many (char ' ' <|> char '*')), try (count 3 (char '-') >> many (char ' ' <|> char '-')), try (count 3 (char '_') >> many (char ' ' <|> char '_')) ] lineEnding pure HorizontalRule -- Unordered List Block unorderedListBlock :: (Logger m) => ParserTG s m Element unorderedListBlock = do items <- some unorderedListItem pure $ List $ L Unordered items unorderedListItem :: (Logger m) => ParserTG s m ListItem unorderedListItem = do oneOf "*-+" char ' ' <|> char '\t' content <- manyTill inlineElement (try lineEnding) -- continuations <- many listContinuation children <- many (try indentedList) pure $ LI content children listContinuation :: (Logger m) => ParserTG s m [InlineText] listContinuation = do count 2 (char ' ' <|> char '\t') many (char ' ' <|> char '\t') notFollowedBy (oneOf "*-+") notFollowedBy (digit >> char '.') content <- manyTill inlineElement (try lineEnding) pure content -- TODO: handle list indentation at all levels indentedList :: (Logger m) => ParserTG s m List indentedList = do let n = 1 void $ count (4 * n) (char ' ') <|> count n (char '\t') choice [try indentedUnorderedList, indentedOrderedList] indentedUnorderedList :: (Logger m) => ParserTG s m List indentedUnorderedList = do items <- some (try $ indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t'))) pure $ L Unordered items indentedOrderedList :: (Logger m) => ParserTG s m List indentedOrderedList = do items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t'))) pure $ L Ordered items indentedListItem :: (Logger m) => ParserTG s m () -> ParserTG s m ListItem indentedListItem marker = do marker content <- manyTill inlineElement (try $ lineEnding <|> eof) pure $ LI content [] -- Ordered List Block orderedListBlock :: (Logger m) => ParserTG s m Element orderedListBlock = do items <- some orderedListItem pure $ List $ L Ordered items orderedListItem :: (Logger m) => ParserTG s m ListItem orderedListItem = do some digit char '.' char ' ' <|> char '\t' content <- manyTill inlineElement (try lineEnding) continuations <- many listContinuation children <- many (try indentedList) pure $ LI (content ++ concat continuations) children -- HTML Block htmlBlock :: (Logger m) => ParserTG s m Element htmlBlock = do char '<' -- Capture the entire HTML block as raw text rest <- manyTill anyChar (try $ char '>' >> lineEnding) let content = '<' : rest return $ HTML $ HTMLTag (T.pack content) tagName :: (Logger m) => ParserTG s m String tagName = do first <- satisfy isAlpha rest <- many (alphaNum <|> char '-' <|> char ':') pure (first : rest) attribute :: (Logger m) => ParserTG s m (Text, Maybe Text) attribute = do name <- attributeName value <- optionMaybe (char '=' >> attributeValue) pure (T.pack name, fmap T.pack value) attributeName :: (Logger m) => ParserTG s m String attributeName = do first <- satisfy isAlpha rest <- many (alphaNum <|> char '-' <|> char ':') pure (first : rest) attributeValue :: (Logger m) => ParserTG s m String attributeValue = choice [ between (char '"') (char '"') (many $ anySingleBut '"'), between (char '\'') (char '\'') (many $ anySingleBut '\''), some $ noneOf " \t\n\r>\"'=<`" ] -- Paragraph Block paragraphBlock :: (Logger m) => ParserTG s m Element paragraphBlock = do content <- some inlineElement lineEnding <|> eof pure $ Paragraph $ P content -- Inline Elements inlineElement :: (Logger m) => ParserTG s m InlineText inlineElement = choice [ try strong "Inline Strong Text", try emphasis "Inline Italic Text", try crossedText "Inline Crossed Text", try codeSpan "Inline Code", try image "Inline Image", try link "Inline Link", try htmlInline "Inline HTML", try escapedChar "Escaped Character", plainText "Inline Plain Text" ] -- Strong (Bold) strong :: (Logger m) => ParserTG s m InlineText strong = strongAsterisk <|> strongUnderscore strongAsterisk :: (Logger m) => ParserTG s m InlineText strongAsterisk = do string "**" content <- some (notFollowedBy (string "**") >> inlineElement) string "**" pure $ Bold content strongUnderscore :: (Logger m) => ParserTG s m InlineText strongUnderscore = do string "__" content <- some (notFollowedBy (string "__") >> inlineElement) string "__" pure $ Bold content crossedText :: (Logger m) => ParserTG s m InlineText crossedText = do string "~~" content <- some (notFollowedBy (string "~~") >> inlineElement) string "~~" pure $ Crossed content -- Emphasis (Italic) emphasis :: (Logger m) => ParserTG s m InlineText emphasis = emphasisAsterisk <|> emphasisUnderscore emphasisAsterisk :: (Logger m) => ParserTG s m InlineText emphasisAsterisk = do char '*' content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk) char '*' pure $ Italic content emphasisUnderscore :: (Logger m) => ParserTG s m InlineText emphasisUnderscore = do char '_' content <- some inlineElementNoUnderscore char '_' pure $ Italic content inlineElementNo :: (Logger m) => Char -> ParserTG s m InlineText inlineElementNo c = choice [ try strong, try codeSpan, try image, try link, try htmlInline, try escapedChar, plainTextNo [c] ] plainTextNo :: (Logger m) => [Char] -> ParserTG s m InlineText plainTextNo = plainTextNo' False plainTextNo' :: (Logger m) => Bool -> [Char] -> ParserTG s m InlineText plainTextNo' block_whitespace disallow = do firstChar <- noneOf (disallow <> if block_whitespace then " \t\r\n" else []) "Plain Text Initial Disallow" remChars <- many $ notFollowedBy lineEnding *> plainTextCharNo disallow pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars where wspHandler '\n' = ' ' wspHandler c = c inlineElementNoAsterisk :: (Logger m) => ParserTG s m InlineText inlineElementNoAsterisk = inlineElementNo '*' inlineElementNoUnderscore :: (Logger m) => ParserTG s m InlineText inlineElementNoUnderscore = inlineElementNo '_' -- Code Span codeSpan :: (Logger m) => ParserTG s m InlineText codeSpan = choice [ try tripleBacktick, try doubleBacktick, singleBacktick ] where singleBacktick = do char '`' content <- many $ noneOf "`\n\r" char '`' pure $ InlineCode (T.pack content) doubleBacktick = do string "``" content <- manyTill anyChar (try $ string "``") pure $ InlineCode (T.pack content) tripleBacktick = do string "```" content <- manyTill anyChar (try $ string "```") pure $ InlineCode (T.pack content) -- Image image :: (Logger m) => ParserTG s m InlineText image = do char '!' char '[' alt <- T.pack <$> many (noneOf "]\n\r") char ']' (url, title) <- linkDestination return $ Image {altText = alt, url = url, title = title} -- Link link :: (Logger m) => ParserTG s m InlineText link = do char '[' content <- some (notFollowedBy (char ']') >> inlineElementNoBracket) char ']' (url, title) <- linkDestination pure $ Link content url title inlineElementNoBracket :: (Logger m) => ParserTG s m InlineText inlineElementNoBracket = choice [ try strong, try emphasis, try codeSpan, try htmlInline, try escapedChar, plainTextNoBracket ] linkDestination :: (Logger m) => ParserTG s m (Text, Maybe Text) linkDestination = directLink <|> referenceLink where directLink = do char '(' url <- many $ noneOf " \t\n\r)" title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser) char ')' pure (T.pack url, title) referenceLink = do char '[' ref <- some (alphaNum <|> char ' ' <|> char '\t') char ']' -- For simplicity, we're not resolving references here -- In a real implementation, you'd look up the reference pure (T.pack $ "[" ++ ref ++ "]", Nothing) titleParser :: (Logger m) => ParserTG s m Text titleParser = T.pack <$> choice [ between (char '"') (char '"') (many $ anySingleBut '"'), between (char '\'') (char '\'') (many $ anySingleBut '\''), between (char '(') (char ')') (many $ anySingleBut ')') ] -- HTML Inline htmlInline :: (Logger m) => ParserTG s m InlineText htmlInline = do start <- char '<' content <- manyTill anyChar (try $ char '>') return $ HTMLInline (T.pack (start : content ++ ">")) -- Escaped Character escapedChar :: (Logger m) => ParserTG s m InlineText escapedChar = do char '\\' c <- satisfy (\x -> x >= '!' && x <= '~') pure $ Text (T.singleton c) -- Plain Text plainText :: (Logger m) => ParserTG s m InlineText plainText = plainTextNo' True [] "Baseline Plain Text" plainTextBaseDisallow :: [Char] plainTextBaseDisallow = "[~`_*<" plainTextCharNo :: (Logger m) => [Char] -> ParserTG s m Char plainTextCharNo additional = noneOf $ additional <> plainTextBaseDisallow plainTextNoAsterisk :: (Logger m) => ParserTG s m InlineText plainTextNoAsterisk = plainTextNo "*" plainTextNoUnderscore :: (Logger m) => ParserTG s m InlineText plainTextNoUnderscore = plainTextNo "_" plainTextNoBracket :: (Logger m) => ParserTG s m InlineText plainTextNoBracket = plainTextNo "[]" -- Helper Parsers lineEnding :: (Logger m) => ParserTG s m () lineEnding = void (try $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")) <|> eof wsParser :: (Logger m) => ParserTG s m () wsParser = void $ some (char ' ' <|> char '\t')