psb/src/Markdown.hs

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeOperators #-}

-- (document, metadata)
module Markdown (document, metadata) where

import Control.Applicative (many, optional, some, (<|>))
import Control.Monad (guard, void)
import Data.Functor.Identity (Identity)
import Data.Maybe (fromMaybe, maybeToList)
import Data.Proxy (Proxy (Proxy))
import Data.String (IsString)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Void (Void)
import GHC.Stack (HasCallStack, callStack, prettyCallStack)
import IR
import Logger (Logger (logCallStack, logDebug))
import Text.Megaparsec (ParsecT, Stream, Token, Tokens, anySingle, anySingleBut, between, choice, chunk, count, eof, lookAhead, manyTill, notFollowedBy, satisfy, sepBy, skipSome, try, (<?>))
import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, newline, space, spaceChar)
import qualified Text.Megaparsec.Stream as MPS
import Utilities.Parsing

string :: (MP.MonadParsec e s m) => Tokens s -> m (Tokens s)
string = chunk

metadata :: (Logger m, Characters s) => Parser s m Text
metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (chunk "--")) <|> anySingleBut '-') <* bound
  where
    bound = string "---"

document :: (Logger m, Characters s) => Parser s m Document
document = Doc <$> many ((notFollowedBy eof) *> element)

element :: (Logger m, Characters s) => Parser s m Element
element =
  choice
    [ try headingBlock <?> "Element Heading",
      try fencedCodeBlock <?> "Fenced Code Block",
      try blockquoteBlock <?> "BlockQuote",
      (try $ unorderedListBlock 0) <?> "Unordered List",
      (try $ orderedListBlock 0) <?> "Ordered List",
      try htmlBlock <?> "HTML Block",
      paragraphBlock <?> "Paragarph"
    ]
    <* blockEnding

lineEnding :: (Logger m, Characters s, HasCallStack) => Parser s m ()
lineEnding = {-logCallStack *>-} ((try eof) <|> void newline)

-- we don't need to parse eof, lineEnding does that, eof *> eof works just fine in place of eof
blockEnding :: (Logger m, Characters s, HasCallStack) => Parser s m ()
blockEnding = lineEnding *> lineEnding

inlineText :: forall m s. (HasCallStack, Logger m, Characters s) => Parser s m InlineText
inlineText = inlineText' blockEnding

inlineText' :: forall m s. (HasCallStack, Logger m, Characters s) => Parser s m () -> Parser s m InlineText
inlineText' disallow = choice [try $ strikethrough disallow, try $ bold disallow, try $ italic disallow, try $ underlined disallow, try code, try $ link disallow, try $ image disallow, try inline_html, plain_text disallow]
  where
    between' disallow start end middle_piece = between start end $ many ((notFollowedBy ((try $ void end) <|> disallow)) *> middle_piece)

    strikethrough disallow = Crossed <$> (between' disallow (string "~~") (void $ string "~~") (inlineText' (disallow <|> (void $ string "~~"))))

    -- TODO: bold and italic eat a lineEnding that they shouldn't for some reason
    bold disallow = Bold <$> (between' disallow (string "**") (disallow <|> (void $ string "**")) (inlineText' (disallow <|> (void $ string "**"))))

    italic :: (HasCallStack) => Parser s m () -> Parser s m InlineText
    italic disallow = Italic <$> (between' disallow (char '*') ((void $ char '*') <|> disallow) (inlineText' (disallow <|> (void $ char '*'))))

    underlined disallow = Underlined <$> (between' disallow (string "__") ((void $ string "__") <|> disallow) (inlineText' (disallow <|> (void $ string "__"))))

    code = InlineCode . T.pack <$> (between' disallow (char '`') (char '`') ((notFollowedBy lineEnding) *> anySingle))
    link :: (HasCallStack) => Parser s m () -> Parser s m InlineText
    link disallow = do
      linkText <- between' disallow (char '[') ((void $ char ']') <|> disallow) (inlineText' (disallow <|> (void $ char ']')))
      (url, title) <- do
        char '('
        -- might fail on newline char situation
        url <- T.pack <$> (many (notFollowedBy (char ')' <|> spaceChar) *> anySingle))
        hasTitle <- optional spaceChar
        title <- case hasTitle of
          Just _ -> Just . T.pack <$> (many (notFollowedBy ((void $ char ')') <|> lineEnding) *> anySingle))
          Nothing -> pure Nothing
        char ')'
        pure (url, title)
      pure Link {linkText, url, title}

    image disallow = do
      char '!'
      -- Is this a hack? Yes. Bite me
      link_hack <- link disallow
      (altText, url, title) <- case link_hack of
        Link {linkText = [Text altText], url, title} -> pure (altText, url, title)
        _ -> fail "Image alt text must be normal text, cannot be stylized in any way"
      pure Image {altText, url, title}

    inline_html =
      HTMLInline <$> do
        char '<'
        inner <- toText . MPS.tokensToChunk (Proxy :: Proxy s) <$> (many (anySingleBut '>'))
        char '>'
        pure $ mconcat ["<", inner, ">"]

    plain_text :: Parser s m () -> Parser s m InlineText
    plain_text disallow = do
      first <- optional $ ((notFollowedBy disallow) *> anySingle)
      rem <- many ((notFollowedBy (disallow <|> (void $ choice $ (map char "`*[~")))) *> anySingle)

      pure $ Text $ T.pack $ case first of
        Nothing -> []
        Just c -> (c : rem)

headingBlock :: (Logger m, Characters s) => Parser s m Element
headingBlock = do
  heading_level <- length <$> (some $ char '#')
  optional spaceChar
  text <- many ((notFollowedBy blockEnding) *> inlineText)
  pure $ Heading $ H {level = heading_level, text}

fencedCodeBlock :: (Logger m, Characters s) => Parser s m Element
fencedCodeBlock = between (string "```") (string "```") $ do
  language' <- T.pack <$> (many (notFollowedBy lineEnding *> anySingle))
  lineEnding
  code <- T.pack <$> (many ((notFollowedBy $ string "```") *> anySingle))
  let language = if language' == "" then Just language' else Nothing
  pure $ Code $ C {language, code}

blockquoteBlock :: (Logger m, Characters s) => Parser s m Element
blockquoteBlock = BlockQuote . Q . concat <$> (some blockquoteLine)
  where
    blockquoteLine = do
      char '>'
      optional $ char ' '
      ret <- (many ((notFollowedBy lineEnding) *> inlineText))
      -- this dance with optional and notFollowedBy is done so we
      -- aren't accidentally consuming part of a block ending
      (optional ((notFollowedBy blockEnding) *> lineEnding))
      pure ret

-- type of list the parser returns
-- parser which grabs the prefix for each item of the list
-- parser used for child lists
-- nesting amount
listBlock :: (Logger m, Characters s) => ListType -> Parser s m prefix -> (Int -> Parser s m List) -> Int -> Parser s m Element
listBlock list_type prefix child_parser_factory nest_level = do
  items <- some $ listItem
  pure $ List $ L {list_type, items}
  where
    listItem = do
      count nest_level ((try $ void $ char '\t') <|> (void $ (count 4 $ char ' ')))
      prefix
      content <- many ((notFollowedBy lineEnding) *> inlineText' lineEnding)

      optional ((notFollowedBy blockEnding) *> lineEnding)

      child <- optional $ child_parser_factory $ nest_level + 1

      pure $ LI {content, child}

unorderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
unorderedListBlock = listBlock Unordered unordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
  where
    unordered_prefix = (choice $ map char "*-+") *> (notFollowedBy newline *> spaceChar)
    -- not exhaustive but we know listBlock is returning a List
    unwrap (List l) = l

orderedListBlock :: (Logger m, Characters s) => Int -> Parser s m Element
orderedListBlock = listBlock Ordered ordered_prefix (\level -> unwrap <$> ((try $ unorderedListBlock level) <|> orderedListBlock level))
  where
    -- regex equivalent: [0-9]+[.)]\s?
    ordered_prefix = (some digitChar) *> (char '.' <|> char ')') *> (notFollowedBy newline *> spaceChar)
    -- not exhaustive but we know listBlock is returning a List
    unwrap (List l) = l

htmlBlock :: forall m s. (Logger m, Characters s) => Parser s m Element
htmlBlock = do
  char '<'
  tagName <- MPS.tokensToChunk (Proxy :: Proxy s) <$> (some ((notFollowedBy ((try $ void tagNameEnd) <|> blockEnding)) *> (anySingle :: Parser s m (Token s))))
  notFollowedBy blockEnding
  ending <- tagNameEnd
  hasEnded <- case ending of
    '>' -> pure True
    _ -> pure False
  attrs <-
    if not hasEnded
      then
        Just . toText . mconcat <$> htmlAttrs
      else pure Nothing
  -- technically not standard markdown but I don't want to write a full HTML parser in my
  inside <- many (notFollowedBy ((chunk $ "</" <> tagName <> ">") <|> chunk "</>") *> anySingle)
  end <- toText <$> ((chunk $ "</" <> tagName <> ">") <|> chunk "</>")
  -- if a blockEnding after some whitespace isn't next when we should parse this as inline text/paragraph
  many ((notFollowedBy lineEnding) *> spaceChar)
  lookAhead blockEnding
  pure $ HTML $ HTMLTag $ T.concat ["<", toText tagName, fromMaybe "" attrs, ">", T.pack inside, if end == "</>" then "" else end]
  where
    tagNameEnd = (lookAhead spaceChar <* space) <|> char '>'
    htmlAttrs = ((notFollowedBy $ char '>') *> htmlAttr) `sepBy` space
    htmlAttr = do
      name <- many (notFollowedBy (lineEnding <|> (void $ char '=')) *> anySingle)
      char '='
      char '"'
      value <- many (notFollowedBy (lineEnding <|> (void $ char '"')) *> anySingle)
      char '"'
      pure $ mconcat [name, "=\"", value, "\""]

paragraphBlock :: (Logger m, Characters s) => Parser s m Element
paragraphBlock = Paragraph . P <$> (many ((notFollowedBy blockEnding) *> inlineText))