psb/src/Markdown.hs

213 lines
9.5 KiB
Haskell

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeOperators #-}
-- (document, metadata)
module Markdown (document, metadata) where
import Control.Applicative (many, optional, some, (<|>))
import Control.Monad (guard, void)
import Data.Functor.Identity (Identity)
import Data.Maybe (fromMaybe, maybeToList)
import Data.Proxy (Proxy (Proxy))
import Data.String (IsString)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Void (Void)
import GHC.Stack (HasCallStack, callStack, prettyCallStack)
import IR
import Logger (Logger (logCallStack, logDebug))
import Text.Megaparsec (ParsecT, Stream, Token, Tokens, anySingle, anySingleBut, between, choice, chunk, count, eof, lookAhead, manyTill, notFollowedBy, satisfy, sepBy, skipSome, try, (<?>))
import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, newline, space, spaceChar)
import qualified Text.Megaparsec.Stream as MPS
import Utilities.Parsing
string :: (MP.MonadParsec e s m) => Tokens s -> m (Tokens s)
string = chunk
metadata :: (Logger m, Characters s) => Parser s m Text
metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (chunk "--")) <|> anySingleBut '-') <* bound
where
bound = string "---"
document :: (Logger m, Characters s) => Parser s m Document
document = Doc <$> many ((notFollowedBy eof) *> element)
element :: (Logger m, Characters s) => Parser s m Element
element =
choice
[ try headingBlock <?> "Element Heading",
try fencedCodeBlock <?> "Fenced Code Block",
try blockquoteBlock <?> "BlockQuote",
(try $ unorderedListBlock 0) <?> "Unordered List",
(try $ orderedListBlock 0) <?> "Ordered List",
try htmlBlock <?> "HTML Block",
paragraphBlock <?> "Paragarph"
]
<* blockEnding
lineEnding :: (Logger m, Characters s, HasCallStack) => Parser s m ()
lineEnding = {-logCallStack *>-} ((try eof) <|> void newline)
-- we don't need to parse eof, lineEnding does that, eof *> eof works just fine in place of eof
blockEnding :: (Logger m, Characters s, HasCallStack) => Parser s m ()
blockEnding = lineEnding *> lineEnding
inlineText :: forall m s. (HasCallStack, Logger m, Characters s) => Parser s m InlineText
inlineText = inlineText' blockEnding
inlineText' :: forall m s. (HasCallStack, Logger m, Characters s) => Parser s m () -> Parser s m InlineText
inlineText' disallow = choice [try $ strikethrough disallow, try $ bold disallow, try $ italic disallow, try $ underlined disallow, try code, try $ link disallow, try $ image disallow, try inline_html, plain_text disallow]
where
between' disallow start end middle_piece = between start end $ many ((notFollowedBy ((try $ void end) <|> disallow)) *> middle_piece)
strikethrough disallow = Crossed <$> (between' disallow (string "~~") (void $ string "~~") (inlineText' (disallow <|> (void $ string "~~"))))
-- TODO: bold and italic eat a lineEnding that they shouldn't for some reason
bold disallow = Bold <$> (between' disallow (string "**") (disallow <|> (void $ string "**")) (inlineText' (disallow <|> (void $ string "**"))))
italic :: (HasCallStack) => Parser s m () -> Parser s m InlineText
italic disallow = Italic <$> (between' disallow (char '*') ((void $ char '*') <|> disallow) (inlineText' (disallow <|> (void $ char '*'))))
underlined disallow = Underlined <$> (between' disallow (string "__") ((void $ string "__") <|> disallow) (inlineText' (disallow <|> (void $ string "__"))))
code = InlineCode . T.pack <$> (between' disallow (char '`') (char '`') ((notFollowedBy lineEnding) *> anySingle))
link :: (HasCallStack) => Parser s m () -> Parser s m InlineText
link disallow = do
linkText <- between' disallow (char '[') ((void $ char ']') <|> disallow) (inlineText' (disallow <|> (void $ char ']')))
(url, title) <- do
char '('
-- might fail on newline char situation
url <- T.pack <$> (many (notFollowedBy (char ')' <|> spaceChar) *> anySingle))
hasTitle <- optional spaceChar
title <- case hasTitle of
Just _ -> Just . T.pack <$> (many (notFollowedBy ((void $ char ')') <|> lineEnding) *> anySingle))
Nothing -> pure Nothing
char ')'
pure (url, title)
pure Link {linkText, url, title}
image disallow = do
char '!'
-- Is this a hack? Yes. Bite me
link_hack <- link disallow
(altText, url, title) <- case link_hack of
Link {linkText = [Text altText], url, title} -> pure (altText, url, title)
_ -> fail "Image alt text must be normal text, cannot be stylized in any way"
pure Image {altText, url, title}
inline_html =
HTMLInline <$> do
char '<'
inner <- toText . MPS.tokensToChunk (Proxy :: Proxy s) <$> (many (anySingleBut '>'))
char '>'
pure $ mconcat ["<", inner, ">"]
plain_text :: Parser s m () -> Parser s m InlineText
plain_text disallow = do
first <- optional $ ((notFollowedBy disallow) *> anySingle)
rem <- many ((notFollowedBy (disallow <|> (void $ choice $ (map char "`*[~")))) *> anySingle)
pure $ Text $ T.pack $ case first of
Nothing -> []
Just c -> (c : rem)
headingBlock :: (Logger m, Characters s) => Parser s m Element
headingBlock = do
heading_level <- length <$> (some $ char '#')
optional spaceChar
text <- many ((notFollowedBy blockEnding) *> inlineText)
pure $ Heading $ H {level = heading_level, text}
fencedCodeBlock :: (Logger m, Characters s) => Parser s m Element
fencedCodeBlock = between (string "```") (string "```") $ do
language' <- T.pack <$> (many (notFollowedBy lineEnding *> anySingle))
lineEnding
code <- T.pack <$> (many ((notFollowedBy $ string "```") *> anySingle))
let language = if language' == "" then Just language' else Nothing
pure $ Code $ C {language, code}
blockquoteBlock :: (Logger m, Characters s) => Parser s m Element
blockquoteBlock = BlockQuote . Q . concat <$> (some blockquoteLine)
where
blockquoteLine = do
char '>'
optional $ char ' '
ret <- (many ((notFollowedBy lineEnding) *> inlineText))
-- this dance with optional and notFollowedBy is done so we
-- aren't accidentally consuming part of a block ending
(optional ((notFollowedBy blockEnding) *> lineEnding))
pure ret
-- type of list the parser returns
-- parser which grabs the prefix for each item of the list
-- parser used for child lists
-- nesting amount
listBlock :: (Logger m, Characters s) => ListType -> Parser s m prefix -> (Int -> Parser s m List) -> Int -> Parser s m Element
listBlock list_type prefix child_parser_factory nest_level = do
items <- some $ listItem
pure $ List $ L {list_type, items}
where
listItem = do
count nest_level ((try $ void $ char '\t') <|> (void $ (count 4 $ char ' ')))
prefix
content <- many ((notFollowedBy lineEnding) *> inlineText' lineEnding)
optional ((notFollowedBy blockEnding) *> lineEnding)
child <- optional $ child_parser_factory $ nest_level + 1
pure $ LI {content, child}
unorderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
unorderedListBlock = listBlock Unordered unordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
where
unordered_prefix = (choice $ map char "*-+") *> (notFollowedBy newline *> spaceChar)
-- not exhaustive but we know listBlock is returning a List
unwrap (List l) = l
orderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
orderedListBlock = listBlock Ordered ordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
where
-- regex equivalent: [0-9]+[.)]\s?
ordered_prefix = (some digitChar) *> (char '.' <|> char ')') *> (notFollowedBy newline *> spaceChar)
-- not exhaustive but we know listBlock is returning a List
unwrap (List l) = l
htmlBlock :: forall m s. (Logger m, Characters s) => Parser s m Element
htmlBlock = do
char '<'
tagName <- MPS.tokensToChunk (Proxy :: Proxy s) <$> (some ((notFollowedBy ((try $ void tagNameEnd) <|> blockEnding)) *> (anySingle :: Parser s m (Token s))))
notFollowedBy blockEnding
ending <- tagNameEnd
hasEnded <- case ending of
'>' -> pure True
_ -> pure False
attrs <-
if not hasEnded
then
Just . toText . mconcat <$> htmlAttrs
else pure Nothing
-- technically not standard markdown but I don't want to write a full HTML parser in my
inside <- many (notFollowedBy ((chunk $ "</" <> tagName <> ">") <|> chunk "</>") *> anySingle)
end <- toText <$> ((chunk $ "</" <> tagName <> ">") <|> chunk "</>")
-- if a blockEnding after some whitespace isn't next when we should parse this as inline text/paragraph
many ((notFollowedBy lineEnding) *> spaceChar)
lookAhead blockEnding
pure $ HTML $ HTMLTag $ T.concat ["<", toText tagName, fromMaybe "" attrs, ">", T.pack inside, if end == "</>" then "" else end]
where
tagNameEnd = (lookAhead spaceChar <* space) <|> char '>'
htmlAttrs = ((notFollowedBy $ char '>') *> htmlAttr) `sepBy` space
htmlAttr = do
name <- many (notFollowedBy (lineEnding <|> (void $ char '=')) *> anySingle)
char '='
char '"'
value <- many (notFollowedBy (lineEnding <|> (void $ char '"')) *> anySingle)
char '"'
pure $ mconcat [name, "=\"", value, "\""]
paragraphBlock :: (Logger m, Characters s) => Parser s m Element
paragraphBlock = Paragraph . P <$> (many ((notFollowedBy blockEnding) *> inlineText))