psb/app/Markdown.hs

460 lines
13 KiB
Haskell

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
-- (document, metadata)
module Markdown where
import Control.Applicative (many, optional, some, (<|>))
import Control.Monad (guard, void)
import Data.Char (isAlpha)
import Data.Functor.Identity (Identity)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Void (Void)
import IR
import Logger (Logger (logDebug))
import Text.Megaparsec (ParsecT, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try, (<?>))
import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string)
type ParserT m = ParsecT Void String m
type Parser = ParserT Identity
logP :: (Logger m, Show s) => ParserT m s -> ParserT m s
logP v = do
underlying <- v
logDebug $ T.show underlying
v
anyChar :: ParserT m Char
anyChar = anySingle
alphaNum :: ParserT m Char
alphaNum = alphaNumChar
digit :: ParserT m Char
digit = digitChar
noneOf :: [Char] -> ParserT m Char
noneOf = MP.noneOf
oneOf :: [Char] -> ParserT m Char
oneOf = MP.oneOf
optionMaybe :: ParserT m a -> ParserT m (Maybe a)
optionMaybe = optional
skipMany1 :: ParserT m a -> ParserT m ()
skipMany1 = skipSome
metadata :: ParserT m Text
metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> anySingleBut '-') <* bound
where
bound = string "---"
document :: (Logger m) => ParserT m Document
document = Doc <$> many element <* eof
element :: (Logger m) => ParserT m Element
element =
choice
[ try headingBlock <?> "Element Heading",
try fencedCodeBlock <?> "Fenced Code Block",
try indentedCodeBlock <?> "Indented Code Block",
try blockquoteBlock <?> "BlockQuote",
try unorderedListBlock <?> "Unordered List",
try orderedListBlock <?> "Ordered List",
try horizontalRuleBlock <?> "Horizontal Rule",
try htmlBlock <?> "HTML Block",
try blankLines <?> "Blank Lines", -- Consume blank lines but don't add to AST
paragraphBlock <?> "Paragarph"
]
-- Blank lines (consumed but not stored)
blankLines :: (Logger m) => ParserT m Element
blankLines = do
skipMany1 (blankLine *> notFollowedBy eof)
element <|> fmap (const $ HTML HTMLTag {html_content = ""}) eof -- Parse the next element (or handle eof)
blankLine :: (Logger m) => ParserT m ()
blankLine = do
many (char ' ' <|> char '\t')
lineEnding
pure ()
-- Heading Block
headingBlock :: (Logger m) => ParserT m Element
headingBlock = do
hashes <- some (char '#') <?> "Heading Hashes"
let level = length hashes
guard (level <= 6) <?> "Higher than level 6"
many (char ' ' <|> char '\t') <?> "Pre-Text Whitespace"
content <- manyTill (inlineElement <?> "Header Text") (try lineEnding <?> "Header Ending")
pure $ Heading $ H level content
-- Fenced Code Block
fencedCodeBlock :: (Logger m) => ParserT m Element
fencedCodeBlock = do
fence <- string "```" <|> string "~~~"
lang <- optionMaybe languageInfo
lineEnding
codeLines <- manyTill codeLine (try $ string fence)
pure $ Code $ C lang (T.pack $ unlines codeLines)
languageInfo :: (Logger m) => ParserT m Text
languageInfo = T.pack <$> some (alphaNum <|> char '-' <|> char '+' <|> char '.')
codeLine :: (Logger m) => ParserT m String
codeLine = do
line <- many $ noneOf "\n\r"
lineEnding
pure line
-- Indented Code Block
indentedCodeBlock :: (Logger m) => ParserT m Element
indentedCodeBlock = do
lines' <- some indentedLine
pure $ Code $ C Nothing (T.pack $ unlines lines')
where
indentedLine = do
count 4 (char ' ' <|> char '\t')
line <- many $ noneOf "\n\r"
lineEnding
pure line
-- Blockquote Block
blockquoteBlock :: (Logger m) => ParserT m Element
blockquoteBlock = do
lines' <- some blockquoteLine
pure $ BlockQuote $ Q (concat lines')
where
blockquoteLine = do
char '>'
optional (char ' ')
content <- manyTill inlineElement (try lineEnding)
pure content
-- Horizontal Rule Block
horizontalRuleBlock :: (Logger m) => ParserT m Element
horizontalRuleBlock = do
choice
[ try (count 3 (char '*') >> many (char ' ' <|> char '*')),
try (count 3 (char '-') >> many (char ' ' <|> char '-')),
try (count 3 (char '_') >> many (char ' ' <|> char '_'))
]
lineEnding
pure HorizontalRule
-- Unordered List Block
unorderedListBlock :: (Logger m) => ParserT m Element
unorderedListBlock = do
items <- some unorderedListItem
pure $ List $ L Unordered items
unorderedListItem :: (Logger m) => ParserT m ListItem
unorderedListItem = do
oneOf "*-+"
char ' ' <|> char '\t'
content <- manyTill inlineElement (try lineEnding)
-- continuations <- many listContinuation
children <- many (try indentedList)
pure $ LI content children
listContinuation :: (Logger m) => ParserT m [InlineText]
listContinuation = do
count 2 (char ' ' <|> char '\t')
many (char ' ' <|> char '\t')
notFollowedBy (oneOf "*-+")
notFollowedBy (digit >> char '.')
content <- manyTill inlineElement (try lineEnding)
pure content
-- TODO: handle list indentation at all levels
indentedList :: (Logger m) => ParserT m List
indentedList = do
let n = 1
void $ count (4 * n) (char ' ') <|> count n (char '\t')
choice [try indentedUnorderedList, indentedOrderedList]
indentedUnorderedList :: (Logger m) => ParserT m List
indentedUnorderedList = do
items <- some (try $ indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t')))
pure $ L Unordered items
indentedOrderedList :: (Logger m) => ParserT m List
indentedOrderedList = do
items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t')))
pure $ L Ordered items
indentedListItem :: (Logger m) => ParserT m () -> ParserT m ListItem
indentedListItem marker = do
marker
content <- manyTill inlineElement (try $ lineEnding <|> eof)
pure $ LI content []
-- Ordered List Block
orderedListBlock :: (Logger m) => ParserT m Element
orderedListBlock = do
items <- some orderedListItem
pure $ List $ L Ordered items
orderedListItem :: (Logger m) => ParserT m ListItem
orderedListItem = do
some digit
char '.'
char ' ' <|> char '\t'
content <- manyTill inlineElement (try lineEnding)
continuations <- many listContinuation
children <- many (try indentedList)
pure $ LI (content ++ concat continuations) children
-- HTML Block
htmlBlock :: (Logger m) => ParserT m Element
htmlBlock = do
char '<'
-- Capture the entire HTML block as raw text
rest <- manyTill anyChar (try $ char '>' >> lineEnding)
let content = '<' : rest
return $ HTML $ HTMLTag (T.pack content)
tagName :: (Logger m) => ParserT m String
tagName = do
first <- satisfy isAlpha
rest <- many (alphaNum <|> char '-' <|> char ':')
pure (first : rest)
attribute :: (Logger m) => ParserT m (Text, Maybe Text)
attribute = do
name <- attributeName
value <- optionMaybe (char '=' >> attributeValue)
pure (T.pack name, fmap T.pack value)
attributeName :: (Logger m) => ParserT m String
attributeName = do
first <- satisfy isAlpha
rest <- many (alphaNum <|> char '-' <|> char ':')
pure (first : rest)
attributeValue :: (Logger m) => ParserT m String
attributeValue =
choice
[ between (char '"') (char '"') (many $ anySingleBut '"'),
between (char '\'') (char '\'') (many $ anySingleBut '\''),
some $ noneOf " \t\n\r>\"'=<`"
]
-- Paragraph Block
paragraphBlock :: (Logger m) => ParserT m Element
paragraphBlock = do
content <- some inlineElement
lineEnding <|> eof
pure $ Paragraph $ P content
-- Inline Elements
inlineElement :: (Logger m) => ParserT m InlineText
inlineElement =
choice
[ try strong <?> "Inline Strong Text",
try emphasis <?> "Inline Italic Text",
try crossedText <?> "Inline Crossed Text",
try codeSpan <?> "Inline Code",
try image <?> "Inline Image",
try link <?> "Inline Link",
try htmlInline <?> "Inline HTML",
try escapedChar <?> "Escaped Character",
plainText <?> "Inline Plain Text"
]
-- Strong (Bold)
strong :: (Logger m) => ParserT m InlineText
strong = strongAsterisk <|> strongUnderscore
strongAsterisk :: (Logger m) => ParserT m InlineText
strongAsterisk = do
string "**"
content <- some (notFollowedBy (string "**") >> inlineElement)
string "**"
pure $ Bold content
strongUnderscore :: (Logger m) => ParserT m InlineText
strongUnderscore = do
string "__"
content <- some (notFollowedBy (string "__") >> inlineElement)
string "__"
pure $ Bold content
crossedText :: (Logger m) => ParserT m InlineText
crossedText = do
string "~~"
content <- some (notFollowedBy (string "~~") >> inlineElement)
string "~~"
pure $ Crossed content
-- Emphasis (Italic)
emphasis :: (Logger m) => ParserT m InlineText
emphasis = emphasisAsterisk <|> emphasisUnderscore
emphasisAsterisk :: (Logger m) => ParserT m InlineText
emphasisAsterisk = do
char '*'
content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk)
char '*'
pure $ Italic content
emphasisUnderscore :: (Logger m) => ParserT m InlineText
emphasisUnderscore = do
char '_'
content <- some inlineElementNoUnderscore
char '_'
pure $ Italic content
inlineElementNo :: (Logger m) => Char -> ParserT m InlineText
inlineElementNo c =
choice
[ try strong,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainTextNo [c]
]
plainTextNo :: (Logger m) => [Char] -> ParserT m InlineText
plainTextNo = plainTextNo' False
plainTextNo' :: (Logger m) => Bool -> [Char] -> ParserT m InlineText
plainTextNo' block_whitespace disallow = do
firstChar <- noneOf (disallow <> if block_whitespace then " \t\r\n" else []) <?> "Plain Text Initial Disallow"
remChars <- many $ notFollowedBy lineEnding *> plainTextCharNo disallow
pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars
where
wspHandler '\n' = ' '
wspHandler c = c
inlineElementNoAsterisk :: (Logger m) => ParserT m InlineText
inlineElementNoAsterisk = inlineElementNo '*'
inlineElementNoUnderscore :: (Logger m) => ParserT m InlineText
inlineElementNoUnderscore = inlineElementNo '_'
-- Code Span
codeSpan :: (Logger m) => ParserT m InlineText
codeSpan =
choice
[ try tripleBacktick,
try doubleBacktick,
singleBacktick
]
where
singleBacktick = do
char '`'
content <- many $ noneOf "`\n\r"
char '`'
pure $ InlineCode (T.pack content)
doubleBacktick = do
string "``"
content <- manyTill anyChar (try $ string "``")
pure $ InlineCode (T.pack content)
tripleBacktick = do
string "```"
content <- manyTill anyChar (try $ string "```")
pure $ InlineCode (T.pack content)
-- Image
image :: (Logger m) => ParserT m InlineText
image = do
char '!'
char '['
alt <- T.pack <$> many (noneOf "]\n\r")
char ']'
(url, title) <- linkDestination
return $ Image {altText = alt, url = url, title = title}
-- Link
link :: (Logger m) => ParserT m InlineText
link = do
char '['
content <- some (notFollowedBy (char ']') >> inlineElementNoBracket)
char ']'
(url, title) <- linkDestination
pure $ Link content url title
inlineElementNoBracket :: (Logger m) => ParserT m InlineText
inlineElementNoBracket =
choice
[ try strong,
try emphasis,
try codeSpan,
try htmlInline,
try escapedChar,
plainTextNoBracket
]
linkDestination :: (Logger m) => ParserT m (Text, Maybe Text)
linkDestination = directLink <|> referenceLink
where
directLink = do
char '('
url <- many $ noneOf " \t\n\r)"
title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser)
char ')'
pure (T.pack url, title)
referenceLink = do
char '['
ref <- some (alphaNum <|> char ' ' <|> char '\t')
char ']'
-- For simplicity, we're not resolving references here
-- In a real implementation, you'd look up the reference
pure (T.pack $ "[" ++ ref ++ "]", Nothing)
titleParser :: (Logger m) => ParserT m Text
titleParser =
T.pack
<$> choice
[ between (char '"') (char '"') (many $ anySingleBut '"'),
between (char '\'') (char '\'') (many $ anySingleBut '\''),
between (char '(') (char ')') (many $ anySingleBut ')')
]
-- HTML Inline
htmlInline :: (Logger m) => ParserT m InlineText
htmlInline = do
start <- char '<'
content <- manyTill anyChar (try $ char '>')
return $ HTMLInline (T.pack (start : content ++ ">"))
-- Escaped Character
escapedChar :: (Logger m) => ParserT m InlineText
escapedChar = do
char '\\'
c <- satisfy (\x -> x >= '!' && x <= '~')
pure $ Text (T.singleton c)
-- Plain Text
plainText :: (Logger m) => ParserT m InlineText
plainText = plainTextNo' True [] <?> "Baseline Plain Text"
plainTextBaseDisallow :: [Char]
plainTextBaseDisallow = "[~`_*<"
plainTextCharNo :: (Logger m) => [Char] -> ParserT m Char
plainTextCharNo additional = noneOf $ additional <> plainTextBaseDisallow
plainTextNoAsterisk :: (Logger m) => ParserT m InlineText
plainTextNoAsterisk = plainTextNo "*"
plainTextNoUnderscore :: (Logger m) => ParserT m InlineText
plainTextNoUnderscore = plainTextNo "_"
plainTextNoBracket :: (Logger m) => ParserT m InlineText
plainTextNoBracket = plainTextNo "[]"
-- Helper Parsers
lineEnding :: (Logger m) => ParserT m ()
lineEnding = void (try $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")) <|> eof
wsParser :: (Logger m) => ParserT m ()
wsParser = void $ some (char ' ' <|> char '\t')