psb/app/Markdown.hs

447 lines
11 KiB
Haskell

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
-- (document, metadata)
module Markdown where
import Control.Applicative (many, optional, some, (<|>))
import Control.Monad (guard, void)
import Data.Char (isAlpha)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Void (Void)
import IR
import Text.Megaparsec (Parsec, anySingle, anySingleBut, between, choice, count, eof, manyTill, notFollowedBy, satisfy, skipSome, try)
import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (alphaNumChar, char, digitChar, string)
type Parser = Parsec Void String
anyChar :: Parser Char
anyChar = anySingle
alphaNum :: Parser Char
alphaNum = alphaNumChar
digit :: Parser Char
digit = digitChar
noneOf :: [Char] -> Parser Char
noneOf = MP.noneOf
oneOf :: [Char] -> Parser Char
oneOf = MP.oneOf
optionMaybe :: Parser a -> Parser (Maybe a)
optionMaybe = optional
skipMany1 :: Parser a -> Parser ()
skipMany1 = skipSome
metadata :: Parser Text
metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> anySingleBut '-') <* bound
where
bound = string "---"
document :: Parser Document
document = Doc <$> many element <* eof
element :: Parser Element
element =
choice
[ try headingBlock,
try fencedCodeBlock,
try indentedCodeBlock,
try blockquoteBlock,
try unorderedListBlock,
try orderedListBlock,
try horizontalRuleBlock,
try htmlBlock,
try blankLines, -- Consume blank lines but don't add to AST
paragraphBlock
]
-- Blank lines (consumed but not stored)
blankLines :: Parser Element
blankLines = do
skipMany1 blankLine
element <|> fmap (const $ HTML HTMLTag {html_content = ""}) eof -- Parse the next element (or handle eof)
blankLine :: Parser ()
blankLine = do
many (char ' ' <|> char '\t')
lineEnding
pure ()
-- Heading Block
headingBlock :: Parser Element
headingBlock = do
hashes <- some (char '#')
let level = length hashes
guard (level <= 6)
many (char ' ' <|> char '\t')
content <- manyTill inlineElement (try lineEnding)
pure $ Heading $ H level content
-- Fenced Code Block
fencedCodeBlock :: Parser Element
fencedCodeBlock = do
fence <- string "```" <|> string "~~~"
lang <- optionMaybe languageInfo
lineEnding
codeLines <- manyTill codeLine (try $ string fence)
pure $ Code $ C lang (T.pack $ unlines codeLines)
languageInfo :: Parser Text
languageInfo = T.pack <$> some (alphaNum <|> char '-' <|> char '+' <|> char '.')
codeLine :: Parser String
codeLine = do
line <- many $ noneOf "\n\r"
lineEnding
pure line
-- Indented Code Block
indentedCodeBlock :: Parser Element
indentedCodeBlock = do
lines' <- some indentedLine
pure $ Code $ C Nothing (T.pack $ unlines lines')
where
indentedLine = do
count 4 (char ' ' <|> char '\t')
line <- many $ noneOf "\n\r"
lineEnding
pure line
-- Blockquote Block
blockquoteBlock :: Parser Element
blockquoteBlock = do
lines' <- some blockquoteLine
pure $ BlockQuote $ Q (concat lines')
where
blockquoteLine = do
char '>'
optional (char ' ')
content <- manyTill inlineElement (try lineEnding)
pure content
-- Horizontal Rule Block
horizontalRuleBlock :: Parser Element
horizontalRuleBlock = do
choice
[ try (count 3 (char '*') >> many (char ' ' <|> char '*')),
try (count 3 (char '-') >> many (char ' ' <|> char '-')),
try (count 3 (char '_') >> many (char ' ' <|> char '_'))
]
lineEnding
pure HorizontalRule
-- Unordered List Block
unorderedListBlock :: Parser Element
unorderedListBlock = do
items <- some unorderedListItem
pure $ List $ L Unordered items
unorderedListItem :: Parser ListItem
unorderedListItem = do
oneOf "*-+"
char ' ' <|> char '\t'
content <- manyTill inlineElement (try lineEnding)
-- continuations <- many listContinuation
children <- many (try indentedList)
pure $ LI content children
listContinuation :: Parser [InlineText]
listContinuation = do
count 2 (char ' ' <|> char '\t')
many (char ' ' <|> char '\t')
notFollowedBy (oneOf "*-+")
notFollowedBy (digit >> char '.')
content <- manyTill inlineElement (try lineEnding)
pure content
-- TODO: handle list indentation at all levels
indentedList :: Parser List
indentedList = do
let n = 1
void $ count (4 * n) (char ' ') <|> count n (char '\t')
choice [try indentedUnorderedList, indentedOrderedList]
indentedUnorderedList :: Parser List
indentedUnorderedList = do
items <- some (try $ indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t')))
pure $ L Unordered items
indentedOrderedList :: Parser List
indentedOrderedList = do
items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t')))
pure $ L Ordered items
indentedListItem :: Parser () -> Parser ListItem
indentedListItem marker = do
marker
content <- manyTill inlineElement (try $ lineEnding <|> eof)
pure $ LI content []
-- Ordered List Block
orderedListBlock :: Parser Element
orderedListBlock = do
items <- some orderedListItem
pure $ List $ L Ordered items
orderedListItem :: Parser ListItem
orderedListItem = do
some digit
char '.'
char ' ' <|> char '\t'
content <- manyTill inlineElement (try lineEnding)
continuations <- many listContinuation
children <- many (try indentedList)
pure $ LI (content ++ concat continuations) children
-- HTML Block
htmlBlock :: Parser Element
htmlBlock = do
char '<'
-- Capture the entire HTML block as raw text
rest <- manyTill anyChar (try $ char '>' >> lineEnding)
let content = '<' : rest
return $ HTML $ HTMLTag (T.pack content)
tagName :: Parser String
tagName = do
first <- satisfy isAlpha
rest <- many (alphaNum <|> char '-' <|> char ':')
pure (first : rest)
attribute :: Parser (Text, Maybe Text)
attribute = do
name <- attributeName
value <- optionMaybe (char '=' >> attributeValue)
pure (T.pack name, fmap T.pack value)
attributeName :: Parser String
attributeName = do
first <- satisfy isAlpha
rest <- many (alphaNum <|> char '-' <|> char ':')
pure (first : rest)
attributeValue :: Parser String
attributeValue =
choice
[ between (char '"') (char '"') (many $ anySingleBut '"'),
between (char '\'') (char '\'') (many $ anySingleBut '\''),
some $ noneOf " \t\n\r>\"'=<`"
]
-- Paragraph Block
paragraphBlock :: Parser Element
paragraphBlock = do
content <- some inlineElement
lineEnding <|> eof
pure $ Paragraph $ P content
-- Inline Elements
inlineElement :: Parser InlineText
inlineElement =
choice
[ try strong,
try emphasis,
try crossedText,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainText
]
-- Strong (Bold)
strong :: Parser InlineText
strong = strongAsterisk <|> strongUnderscore
strongAsterisk :: Parser InlineText
strongAsterisk = do
string "**"
content <- some (notFollowedBy (string "**") >> inlineElement)
string "**"
pure $ Bold content
strongUnderscore :: Parser InlineText
strongUnderscore = do
string "__"
content <- some (notFollowedBy (string "__") >> inlineElement)
string "__"
pure $ Bold content
crossedText :: Parser InlineText
crossedText = do
string "~~"
content <- some (notFollowedBy (string "~~") >> inlineElement)
string "~~"
pure $ Crossed content
-- Emphasis (Italic)
emphasis :: Parser InlineText
emphasis = emphasisAsterisk <|> emphasisUnderscore
emphasisAsterisk :: Parser InlineText
emphasisAsterisk = do
char '*'
content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk)
char '*'
pure $ Italic content
emphasisUnderscore :: Parser InlineText
emphasisUnderscore = do
char '_'
content <- some inlineElementNoUnderscore
char '_'
pure $ Italic content
inlineElementNo :: Char -> Parser InlineText
inlineElementNo c =
choice
[ try strong,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainTextNo [c]
]
plainTextNo :: [Char] -> Parser InlineText
plainTextNo disallow = do
firstChar <- noneOf disallow
remChars <- some $ plainTextCharNo disallow <* notFollowedBy lineEnding
pure $ Text $ T.map wspHandler $ T.pack $ firstChar : remChars
where
wspHandler '\n' = ' '
wspHandler c = c
inlineElementNoAsterisk :: Parser InlineText
inlineElementNoAsterisk = inlineElementNo '*'
inlineElementNoUnderscore :: Parser InlineText
inlineElementNoUnderscore = inlineElementNo '_'
-- Code Span
codeSpan :: Parser InlineText
codeSpan =
choice
[ try tripleBacktick,
try doubleBacktick,
singleBacktick
]
where
singleBacktick = do
char '`'
content <- many $ noneOf "`\n\r"
char '`'
pure $ InlineCode (T.pack content)
doubleBacktick = do
string "``"
content <- manyTill anyChar (try $ string "``")
pure $ InlineCode (T.pack content)
tripleBacktick = do
string "```"
content <- manyTill anyChar (try $ string "```")
pure $ InlineCode (T.pack content)
-- Image
image :: Parser InlineText
image = do
char '!'
char '['
alt <- T.pack <$> many (noneOf "]\n\r")
char ']'
(url, title) <- linkDestination
return $ Image {altText = alt, url = url, title = title}
-- Link
link :: Parser InlineText
link = do
char '['
content <- some (notFollowedBy (char ']') >> inlineElementNoBracket)
char ']'
(url, title) <- linkDestination
pure $ Link content url title
inlineElementNoBracket :: Parser InlineText
inlineElementNoBracket =
choice
[ try strong,
try emphasis,
try codeSpan,
try htmlInline,
try escapedChar,
plainTextNoBracket
]
linkDestination :: Parser (Text, Maybe Text)
linkDestination = directLink <|> referenceLink
where
directLink = do
char '('
url <- many $ noneOf " \t\n\r)"
title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser)
char ')'
pure (T.pack url, title)
referenceLink = do
char '['
ref <- some (alphaNum <|> char ' ' <|> char '\t')
char ']'
-- For simplicity, we're not resolving references here
-- In a real implementation, you'd look up the reference
pure (T.pack $ "[" ++ ref ++ "]", Nothing)
titleParser :: Parser Text
titleParser =
T.pack
<$> choice
[ between (char '"') (char '"') (many $ anySingleBut '"'),
between (char '\'') (char '\'') (many $ anySingleBut '\''),
between (char '(') (char ')') (many $ anySingleBut ')')
]
-- HTML Inline
htmlInline :: Parser InlineText
htmlInline = do
start <- char '<'
content <- manyTill anyChar (try $ char '>')
return $ HTMLInline (T.pack (start : content ++ ">"))
-- Escaped Character
escapedChar :: Parser InlineText
escapedChar = do
char '\\'
c <- satisfy (\x -> x >= '!' && x <= '~')
pure $ Text (T.singleton c)
-- Plain Text
plainText :: Parser InlineText
plainText = plainTextNo []
plainTextBaseDisallow :: [Char]
plainTextBaseDisallow = "[~`_*<"
plainTextCharNo :: [Char] -> Parser Char
plainTextCharNo additional = noneOf $ additional <> plainTextBaseDisallow
plainTextNoAsterisk :: Parser InlineText
plainTextNoAsterisk = plainTextNo "*"
plainTextNoUnderscore :: Parser InlineText
plainTextNoUnderscore = plainTextNo "_"
plainTextNoBracket :: Parser InlineText
plainTextNoBracket = plainTextNo "[]"
-- Helper Parsers
lineEnding :: Parser ()
lineEnding = void $ count 2 (try (string "\r\n") <|> try (string "\n") <|> string "\r")
wsParser :: Parser ()
wsParser = void $ some (char ' ' <|> char '\t')