213 lines
9.5 KiB
Haskell
213 lines
9.5 KiB
Haskell
{-# LANGUAGE FlexibleContexts #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
{-# LANGUAGE RankNTypes #-}
|
|
{-# LANGUAGE ScopedTypeVariables #-}
|
|
{-# LANGUAGE TypeOperators #-}
|
|
|
|
-- (document, metadata)
|
|
module Markdown (document, metadata) where
|
|
|
|
import Control.Applicative (many, optional, some, (<|>))
|
|
import Control.Monad (guard, void)
|
|
import Data.Functor.Identity (Identity)
|
|
import Data.Maybe (fromMaybe, maybeToList)
|
|
import Data.Proxy (Proxy (Proxy))
|
|
import Data.String (IsString)
|
|
import Data.Text (Text)
|
|
import qualified Data.Text as T
|
|
import Data.Void (Void)
|
|
import GHC.Stack (HasCallStack, callStack, prettyCallStack)
|
|
import IR
|
|
import Logger (Logger (logCallStack, logDebug))
|
|
import Text.Megaparsec (ParsecT, Stream, Token, Tokens, anySingle, anySingleBut, between, choice, chunk, count, eof, lookAhead, manyTill, notFollowedBy, satisfy, sepBy, skipSome, try, (<?>))
|
|
import qualified Text.Megaparsec as MP
|
|
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, newline, space, spaceChar)
|
|
import qualified Text.Megaparsec.Stream as MPS
|
|
import Utilities.Parsing
|
|
|
|
string :: (MP.MonadParsec e s m) => Tokens s -> m (Tokens s)
|
|
string = chunk
|
|
|
|
metadata :: (Logger m, Characters s) => Parser s m Text
|
|
metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (chunk "--")) <|> anySingleBut '-') <* bound
|
|
where
|
|
bound = string "---"
|
|
|
|
document :: (Logger m, Characters s) => Parser s m Document
|
|
document = Doc <$> many ((notFollowedBy eof) *> element)
|
|
|
|
element :: (Logger m, Characters s) => Parser s m Element
|
|
element =
|
|
choice
|
|
[ try headingBlock <?> "Element Heading",
|
|
try fencedCodeBlock <?> "Fenced Code Block",
|
|
try blockquoteBlock <?> "BlockQuote",
|
|
(try $ unorderedListBlock 0) <?> "Unordered List",
|
|
(try $ orderedListBlock 0) <?> "Ordered List",
|
|
try htmlBlock <?> "HTML Block",
|
|
paragraphBlock <?> "Paragarph"
|
|
]
|
|
<* blockEnding
|
|
|
|
lineEnding :: (Logger m, Characters s, HasCallStack) => Parser s m ()
|
|
lineEnding = {-logCallStack *>-} ((try eof) <|> void newline)
|
|
|
|
-- we don't need to parse eof, lineEnding does that, eof *> eof works just fine in place of eof
|
|
blockEnding :: (Logger m, Characters s, HasCallStack) => Parser s m ()
|
|
blockEnding = lineEnding *> lineEnding
|
|
|
|
inlineText :: forall m s. (HasCallStack, Logger m, Characters s) => Parser s m InlineText
|
|
inlineText = inlineText' blockEnding
|
|
|
|
inlineText' :: forall m s. (HasCallStack, Logger m, Characters s) => Parser s m () -> Parser s m InlineText
|
|
inlineText' disallow = choice [try $ strikethrough disallow, try $ bold disallow, try $ italic disallow, try $ underlined disallow, try code, try $ link disallow, try $ image disallow, try inline_html, plain_text disallow]
|
|
where
|
|
between' disallow start end middle_piece = between start end $ many ((notFollowedBy ((try $ void end) <|> disallow)) *> middle_piece)
|
|
|
|
strikethrough disallow = Crossed <$> (between' disallow (string "~~") (void $ string "~~") (inlineText' (disallow <|> (void $ string "~~"))))
|
|
|
|
-- TODO: bold and italic eat a lineEnding that they shouldn't for some reason
|
|
bold disallow = Bold <$> (between' disallow (string "**") (disallow <|> (void $ string "**")) (inlineText' (disallow <|> (void $ string "**"))))
|
|
|
|
italic :: (HasCallStack) => Parser s m () -> Parser s m InlineText
|
|
italic disallow = Italic <$> (between' disallow (char '*') ((void $ char '*') <|> disallow) (inlineText' (disallow <|> (void $ char '*'))))
|
|
|
|
underlined disallow = Underlined <$> (between' disallow (string "__") ((void $ string "__") <|> disallow) (inlineText' (disallow <|> (void $ string "__"))))
|
|
|
|
code = InlineCode . T.pack <$> (between' disallow (char '`') (char '`') ((notFollowedBy lineEnding) *> anySingle))
|
|
link :: (HasCallStack) => Parser s m () -> Parser s m InlineText
|
|
link disallow = do
|
|
linkText <- between' disallow (char '[') ((void $ char ']') <|> disallow) (inlineText' (disallow <|> (void $ char ']')))
|
|
(url, title) <- do
|
|
char '('
|
|
-- might fail on newline char situation
|
|
url <- T.pack <$> (many (notFollowedBy (char ')' <|> spaceChar) *> anySingle))
|
|
hasTitle <- optional spaceChar
|
|
title <- case hasTitle of
|
|
Just _ -> Just . T.pack <$> (many (notFollowedBy ((void $ char ')') <|> lineEnding) *> anySingle))
|
|
Nothing -> pure Nothing
|
|
char ')'
|
|
pure (url, title)
|
|
pure Link {linkText, url, title}
|
|
|
|
image disallow = do
|
|
char '!'
|
|
-- Is this a hack? Yes. Bite me
|
|
link_hack <- link disallow
|
|
(altText, url, title) <- case link_hack of
|
|
Link {linkText = [Text altText], url, title} -> pure (altText, url, title)
|
|
_ -> fail "Image alt text must be normal text, cannot be stylized in any way"
|
|
pure Image {altText, url, title}
|
|
|
|
inline_html =
|
|
HTMLInline <$> do
|
|
char '<'
|
|
inner <- toText . MPS.tokensToChunk (Proxy :: Proxy s) <$> (many (anySingleBut '>'))
|
|
char '>'
|
|
pure $ mconcat ["<", inner, ">"]
|
|
|
|
plain_text :: Parser s m () -> Parser s m InlineText
|
|
plain_text disallow = do
|
|
first <- optional $ ((notFollowedBy disallow) *> anySingle)
|
|
rem <- many ((notFollowedBy (disallow <|> (void $ choice $ (map char "`*[~")))) *> anySingle)
|
|
|
|
pure $ Text $ T.pack $ case first of
|
|
Nothing -> []
|
|
Just c -> (c : rem)
|
|
|
|
headingBlock :: (Logger m, Characters s) => Parser s m Element
|
|
headingBlock = do
|
|
heading_level <- length <$> (some $ char '#')
|
|
optional spaceChar
|
|
text <- many ((notFollowedBy blockEnding) *> inlineText)
|
|
pure $ Heading $ H {level = heading_level, text}
|
|
|
|
fencedCodeBlock :: (Logger m, Characters s) => Parser s m Element
|
|
fencedCodeBlock = between (string "```") (string "```") $ do
|
|
language' <- T.pack <$> (many (notFollowedBy lineEnding *> anySingle))
|
|
lineEnding
|
|
code <- T.pack <$> (many ((notFollowedBy $ string "```") *> anySingle))
|
|
let language = if language' == "" then Just language' else Nothing
|
|
pure $ Code $ C {language, code}
|
|
|
|
blockquoteBlock :: (Logger m, Characters s) => Parser s m Element
|
|
blockquoteBlock = BlockQuote . Q . concat <$> (some blockquoteLine)
|
|
where
|
|
blockquoteLine = do
|
|
char '>'
|
|
optional $ char ' '
|
|
ret <- (many ((notFollowedBy lineEnding) *> inlineText))
|
|
-- this dance with optional and notFollowedBy is done so we
|
|
-- aren't accidentally consuming part of a block ending
|
|
(optional ((notFollowedBy blockEnding) *> lineEnding))
|
|
pure ret
|
|
|
|
-- type of list the parser returns
|
|
-- parser which grabs the prefix for each item of the list
|
|
-- parser used for child lists
|
|
-- nesting amount
|
|
listBlock :: (Logger m, Characters s) => ListType -> Parser s m prefix -> (Int -> Parser s m List) -> Int -> Parser s m Element
|
|
listBlock list_type prefix child_parser_factory nest_level = do
|
|
items <- some $ listItem
|
|
pure $ List $ L {list_type, items}
|
|
where
|
|
listItem = do
|
|
count nest_level ((try $ void $ char '\t') <|> (void $ (count 4 $ char ' ')))
|
|
prefix
|
|
content <- many ((notFollowedBy lineEnding) *> inlineText' lineEnding)
|
|
|
|
optional ((notFollowedBy blockEnding) *> lineEnding)
|
|
|
|
child <- optional $ child_parser_factory $ nest_level + 1
|
|
|
|
pure $ LI {content, child}
|
|
|
|
unorderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
|
|
unorderedListBlock = listBlock Unordered unordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
|
|
where
|
|
unordered_prefix = (choice $ map char "*-+") *> (notFollowedBy newline *> spaceChar)
|
|
-- not exhaustive but we know listBlock is returning a List
|
|
unwrap (List l) = l
|
|
|
|
orderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
|
|
orderedListBlock = listBlock Ordered ordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
|
|
where
|
|
-- regex equivalent: [0-9]+[.)]\s?
|
|
ordered_prefix = (some digitChar) *> (char '.' <|> char ')') *> (notFollowedBy newline *> spaceChar)
|
|
-- not exhaustive but we know listBlock is returning a List
|
|
unwrap (List l) = l
|
|
|
|
htmlBlock :: forall m s. (Logger m, Characters s) => Parser s m Element
|
|
htmlBlock = do
|
|
char '<'
|
|
tagName <- MPS.tokensToChunk (Proxy :: Proxy s) <$> (some ((notFollowedBy ((try $ void tagNameEnd) <|> blockEnding)) *> (anySingle :: Parser s m (Token s))))
|
|
notFollowedBy blockEnding
|
|
ending <- tagNameEnd
|
|
hasEnded <- case ending of
|
|
'>' -> pure True
|
|
_ -> pure False
|
|
attrs <-
|
|
if not hasEnded
|
|
then
|
|
Just . toText . mconcat <$> htmlAttrs
|
|
else pure Nothing
|
|
-- technically not standard markdown but I don't want to write a full HTML parser in my
|
|
inside <- many (notFollowedBy ((chunk $ "</" <> tagName <> ">") <|> chunk "</>") *> anySingle)
|
|
end <- toText <$> ((chunk $ "</" <> tagName <> ">") <|> chunk "</>")
|
|
-- if a blockEnding after some whitespace isn't next when we should parse this as inline text/paragraph
|
|
many ((notFollowedBy lineEnding) *> spaceChar)
|
|
lookAhead blockEnding
|
|
pure $ HTML $ HTMLTag $ T.concat ["<", toText tagName, fromMaybe "" attrs, ">", T.pack inside, if end == "</>" then "" else end]
|
|
where
|
|
tagNameEnd = (lookAhead spaceChar <* space) <|> char '>'
|
|
htmlAttrs = ((notFollowedBy $ char '>') *> htmlAttr) `sepBy` space
|
|
htmlAttr = do
|
|
name <- many (notFollowedBy (lineEnding <|> (void $ char '=')) *> anySingle)
|
|
char '='
|
|
char '"'
|
|
value <- many (notFollowedBy (lineEnding <|> (void $ char '"')) *> anySingle)
|
|
char '"'
|
|
pure $ mconcat [name, "=\"", value, "\""]
|
|
|
|
paragraphBlock :: (Logger m, Characters s) => Parser s m Element
|
|
paragraphBlock = Paragraph . P <$> (many ((notFollowedBy blockEnding) *> inlineText))
|