{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} -- (document, metadata) module Markdown where import Control.Applicative (many, optional, some, (<|>)) import Control.Monad (guard, void) import Data.Char (isAlpha) import Data.Text (Text) import qualified Data.Text as T import IR import Text.Parsec hiding (Line, many, optional, (<|>)) import Text.Parsec.String (Parser) metadata :: Parser Text metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> noneOf "-") <* bound where bound = string "---" document :: Parser Document document = Doc <$> many element <* eof element :: Parser Element element = choice [ try headingBlock, try fencedCodeBlock, try indentedCodeBlock, try blockquoteBlock, try unorderedListBlock, try orderedListBlock, try horizontalRuleBlock, try htmlBlock, try blankLines, -- Consume blank lines but don't add to AST paragraphBlock ] -- Blank lines (consumed but not stored) blankLines :: Parser Element blankLines = do skipMany1 blankLine element -- Parse the next element blankLine :: Parser () blankLine = do many (char ' ' <|> char '\t') lineEnding pure () -- Heading Block headingBlock :: Parser Element headingBlock = do hashes <- some (char '#') let level = length hashes guard (level <= 6) many (char ' ' <|> char '\t') content <- manyTill inlineElement (try lineEnding) pure $ Heading $ H level content -- Fenced Code Block fencedCodeBlock :: Parser Element fencedCodeBlock = do fence <- string "```" <|> string "~~~" lang <- optionMaybe languageInfo lineEnding codeLines <- manyTill codeLine (try $ string fence >> lineEnding) pure $ Code $ C lang (T.pack $ unlines codeLines) languageInfo :: Parser Text languageInfo = T.pack <$> many1 (alphaNum <|> char '-' <|> char '+' <|> char '.') codeLine :: Parser String codeLine = do line <- many (noneOf "\n\r") lineEnding pure line -- Indented Code Block indentedCodeBlock :: Parser Element indentedCodeBlock = do lines' <- some indentedLine pure $ Code $ C Nothing (T.pack $ unlines lines') where indentedLine = do count 4 (char ' ' <|> char '\t') line <- many (noneOf "\n\r") lineEnding pure line -- Blockquote Block blockquoteBlock :: Parser Element blockquoteBlock = do lines' <- some blockquoteLine pure $ BlockQuote $ Q (concat lines') where blockquoteLine = do char '>' optional (char ' ') content <- manyTill inlineElement (try lineEnding) pure content -- Horizontal Rule Block horizontalRuleBlock :: Parser Element horizontalRuleBlock = do choice [ try (count 3 (char '*') >> many (char ' ' <|> char '*')), try (count 3 (char '-') >> many (char ' ' <|> char '-')), try (count 3 (char '_') >> many (char ' ' <|> char '_')) ] lineEnding pure HorizontalRule -- Unordered List Block unorderedListBlock :: Parser Element unorderedListBlock = do items <- some unorderedListItem pure $ List $ L Unordered items unorderedListItem :: Parser ListItem unorderedListItem = do oneOf "*-+" char ' ' <|> char '\t' content <- manyTill inlineElement (try lineEnding) continuations <- many listContinuation children <- many (try indentedList) pure $ LI (content ++ concat continuations) children listContinuation :: Parser [InlineText] listContinuation = do count 2 (char ' ' <|> char '\t') many (char ' ' <|> char '\t') notFollowedBy (oneOf "*-+") notFollowedBy (digit >> char '.') content <- manyTill inlineElement (try lineEnding) pure content indentedList :: Parser List indentedList = do void $ count 2 (char ' ' <|> char '\t') choice [try indentedUnorderedList, indentedOrderedList] indentedUnorderedList :: Parser List indentedUnorderedList = do items <- some (try $ indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t'))) pure $ L Unordered items indentedOrderedList :: Parser List indentedOrderedList = do items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t'))) pure $ L Ordered items indentedListItem :: Parser () -> Parser ListItem indentedListItem marker = do marker content <- manyTill inlineElement (try lineEnding) pure $ LI content [] -- Ordered List Block orderedListBlock :: Parser Element orderedListBlock = do items <- some orderedListItem pure $ List $ L Ordered items orderedListItem :: Parser ListItem orderedListItem = do some digit char '.' char ' ' <|> char '\t' content <- manyTill inlineElement (try lineEnding) continuations <- many listContinuation children <- many (try indentedList) pure $ LI (content ++ concat continuations) children -- HTML Block htmlBlock :: Parser Element htmlBlock = do char '<' -- Capture the entire HTML block as raw text rest <- manyTill anyChar (try $ char '>' >> lineEnding) let content = '<' : rest return $ HTML $ HTMLTag (T.pack content) tagName :: Parser String tagName = do first <- satisfy isAlpha rest <- many (alphaNum <|> char '-' <|> char ':') pure (first : rest) attribute :: Parser (Text, Maybe Text) attribute = do name <- attributeName value <- optionMaybe (char '=' >> attributeValue) pure (T.pack name, fmap T.pack value) attributeName :: Parser String attributeName = do first <- satisfy isAlpha rest <- many (alphaNum <|> char '-' <|> char ':') pure (first : rest) attributeValue :: Parser String attributeValue = choice [ between (char '"') (char '"') (many $ noneOf "\""), between (char '\'') (char '\'') (many $ noneOf "'"), some (noneOf " \t\n\r>\"'=<`") ] -- Paragraph Block paragraphBlock :: Parser Element paragraphBlock = do -- Make sure we're not starting another block element notFollowedBy (char '#') notFollowedBy (char '>') notFollowedBy (oneOf "*-+" >> (char ' ' <|> char '\t')) notFollowedBy (digit >> char '.') notFollowedBy (string "```" <|> string "~~~") notFollowedBy (count 4 (char ' ' <|> char '\t')) notFollowedBy (count 3 (char '*') <|> count 3 (char '-') <|> count 3 (char '_')) -- notFollowedBy (char '<') content <- some inlineElement lineEnding <|> eof pure $ Paragraph $ P content -- Inline Elements inlineElement :: Parser InlineText inlineElement = choice [ try strong, try emphasis, try codeSpan, try image, try link, try htmlInline, try escapedChar, plainText ] -- Strong (Bold) strong :: Parser InlineText strong = strongAsterisk <|> strongUnderscore strongAsterisk :: Parser InlineText strongAsterisk = do string "**" content <- some (notFollowedBy (string "**") >> inlineElement) string "**" pure $ Bold content strongUnderscore :: Parser InlineText strongUnderscore = do string "__" content <- some (notFollowedBy (string "__") >> inlineElement) string "__" pure $ Bold content -- Emphasis (Italic) emphasis :: Parser InlineText emphasis = emphasisAsterisk <|> emphasisUnderscore emphasisAsterisk :: Parser InlineText emphasisAsterisk = do char '*' notFollowedBy (char '*') content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk) char '*' pure $ Italic content emphasisUnderscore :: Parser InlineText emphasisUnderscore = do char '_' notFollowedBy (char '_') content <- some (notFollowedBy (char '_') >> inlineElementNoUnderscore) char '_' pure $ Italic content inlineElementNoAsterisk :: Parser InlineText inlineElementNoAsterisk = choice [ try strong, try codeSpan, try image, try link, try htmlInline, try escapedChar, plainTextNoAsterisk ] inlineElementNoUnderscore :: Parser InlineText inlineElementNoUnderscore = choice [ try strong, try codeSpan, try image, try link, try htmlInline, try escapedChar, plainTextNoUnderscore ] -- Code Span codeSpan :: Parser InlineText codeSpan = choice [ try tripleBacktick, try doubleBacktick, singleBacktick ] where singleBacktick = do char '`' content <- many (noneOf "`\n\r") char '`' pure $ InlineCode (T.pack content) doubleBacktick = do string "``" content <- manyTill anyChar (try $ string "``") pure $ InlineCode (T.pack content) tripleBacktick = do string "```" content <- manyTill anyChar (try $ string "```") pure $ InlineCode (T.pack content) -- Image image :: Parser InlineText image = do char '!' char '[' alt <- T.pack <$> many (noneOf "]\n\r") char ']' (url, title) <- linkDestination return $ Image {altText = alt, url = url, title = title} -- Link link :: Parser InlineText link = do char '[' content <- some (notFollowedBy (char ']') >> inlineElementNoBracket) char ']' (url, title) <- linkDestination pure $ Link content url title inlineElementNoBracket :: Parser InlineText inlineElementNoBracket = choice [ try strong, try emphasis, try codeSpan, try htmlInline, try escapedChar, plainTextNoBracket ] linkDestination :: Parser (Text, Maybe Text) linkDestination = directLink <|> referenceLink where directLink = do char '(' url <- many (noneOf " \t\n\r)") title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser) char ')' pure (T.pack url, title) referenceLink = do char '[' ref <- some (alphaNum <|> char ' ' <|> char '\t') char ']' -- For simplicity, we're not resolving references here -- In a real implementation, you'd look up the reference pure (T.pack $ "[" ++ ref ++ "]", Nothing) titleParser :: Parser Text titleParser = T.pack <$> choice [ between (char '"') (char '"') (many $ noneOf "\""), between (char '\'') (char '\'') (many $ noneOf "'"), between (char '(') (char ')') (many $ noneOf ")") ] -- HTML Inline htmlInline :: Parser InlineText htmlInline = do start <- char '<' content <- manyTill anyChar (try $ char '>') return $ HTMLInline (T.pack (start : content ++ ">")) -- Escaped Character escapedChar :: Parser InlineText escapedChar = do char '\\' c <- satisfy (\x -> x >= '!' && x <= '~') pure $ Text (T.singleton c) -- Plain Text plainText :: Parser InlineText plainText = Text . T.pack <$> some plainTextChar plainTextChar :: Parser Char plainTextChar = satisfy $ \c -> (c `notElem` ("*_`[<\\\n\r" :: String)) && c >= ' ' plainTextNoAsterisk :: Parser InlineText plainTextNoAsterisk = Text . T.pack <$> some ( satisfy $ \c -> (c `notElem` ("*_`[<\\\n\r" :: String)) && c >= ' ' ) plainTextNoUnderscore :: Parser InlineText plainTextNoUnderscore = Text . T.pack <$> some ( satisfy $ \c -> not (c `elem` ("_*`[<\\\n\r" :: String)) && c >= ' ' ) plainTextNoBracket :: Parser InlineText plainTextNoBracket = Text . T.pack <$> some ( satisfy $ \c -> not (c `elem` ("]_*`[<\\\n\r" :: String)) && c >= ' ' ) -- Helper Parsers lineEnding :: Parser () lineEnding = void (try (string "\r\n") <|> try (string "\n") <|> string "\r") wsParser :: Parser () wsParser = void $ some (char ' ' <|> char '\t')