psb/app/Markdown.hs

434 lines
11 KiB
Haskell

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
module Markdown (document, metadata) where
import Control.Applicative (many, optional, some, (<|>))
import Control.Monad (guard, void, when)
import Data.Char (isAlpha)
import Data.Text (Text)
import qualified Data.Text as T
import IR
import Text.Parsec hiding (Line, many, optional, (<|>))
import Text.Parsec.String (Parser)
metadata :: Parser Text
metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> noneOf "-") <* bound
where
bound = string "---"
document :: Parser Document
document = Doc <$> many element <* eof
element :: Parser Element
element =
choice
[ try headingBlock,
try fencedCodeBlock,
try indentedCodeBlock,
try blockquoteBlock,
try unorderedListBlock,
try orderedListBlock,
try horizontalRuleBlock,
try htmlBlock,
try blankLines, -- Consume blank lines but don't add to AST
paragraphBlock
]
-- Blank lines (consumed but not stored)
blankLines :: Parser Element
blankLines = do
skipMany1 blankLine
element -- Parse the next element
blankLine :: Parser ()
blankLine = do
many (char ' ' <|> char '\t')
lineEnding
pure ()
-- Heading Block
headingBlock :: Parser Element
headingBlock = do
hashes <- some (char '#')
let level = length hashes
guard (level <= 6)
many (char ' ' <|> char '\t')
content <- manyTill inlineElement (try lineEnding)
pure $ Heading $ H level content
-- Fenced Code Block
fencedCodeBlock :: Parser Element
fencedCodeBlock = do
fence <- string "```" <|> string "~~~"
lang <- optionMaybe languageInfo
lineEnding
codeLines <- manyTill codeLine (try $ string fence >> lineEnding)
pure $ Code $ C lang (T.pack $ unlines codeLines)
languageInfo :: Parser Text
languageInfo = T.pack <$> many1 (alphaNum <|> char '-' <|> char '+' <|> char '.')
codeLine :: Parser String
codeLine = do
line <- many (noneOf "\n\r")
lineEnding
pure line
-- Indented Code Block
indentedCodeBlock :: Parser Element
indentedCodeBlock = do
lines' <- some indentedLine
pure $ Code $ C Nothing (T.pack $ unlines lines')
where
indentedLine = do
count 4 (char ' ' <|> char '\t')
line <- many (noneOf "\n\r")
lineEnding
pure line
-- Blockquote Block
blockquoteBlock :: Parser Element
blockquoteBlock = do
lines' <- some blockquoteLine
pure $ BlockQuote $ Q (concat lines')
where
blockquoteLine = do
char '>'
optional (char ' ')
content <- manyTill inlineElement (try lineEnding)
pure content
-- Horizontal Rule Block
horizontalRuleBlock :: Parser Element
horizontalRuleBlock = do
choice
[ try (count 3 (char '*') >> many (char ' ' <|> char '*')),
try (count 3 (char '-') >> many (char ' ' <|> char '-')),
try (count 3 (char '_') >> many (char ' ' <|> char '_'))
]
lineEnding
pure HorizontalRule
-- Unordered List Block
unorderedListBlock :: Parser Element
unorderedListBlock = do
items <- some unorderedListItem
pure $ List $ L Unordered items
unorderedListItem :: Parser ListItem
unorderedListItem = do
oneOf "*-+"
char ' ' <|> char '\t'
content <- manyTill inlineElement (try lineEnding)
continuations <- many listContinuation
children <- many (try indentedList)
pure $ LI (content ++ concat continuations) children
listContinuation :: Parser [InlineText]
listContinuation = do
count 2 (char ' ' <|> char '\t')
many (char ' ' <|> char '\t')
notFollowedBy (oneOf "*-+")
notFollowedBy (digit >> char '.')
content <- manyTill inlineElement (try lineEnding)
pure content
indentedList :: Parser List
indentedList = do
void $ count 2 (char ' ' <|> char '\t')
choice [try indentedUnorderedList, indentedOrderedList]
indentedUnorderedList :: Parser List
indentedUnorderedList = do
items <- some (try $ indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t')))
pure $ L Unordered items
indentedOrderedList :: Parser List
indentedOrderedList = do
items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t')))
pure $ L Ordered items
indentedListItem :: Parser () -> Parser ListItem
indentedListItem marker = do
marker
content <- manyTill inlineElement (try lineEnding)
pure $ LI content []
-- Ordered List Block
orderedListBlock :: Parser Element
orderedListBlock = do
items <- some orderedListItem
pure $ List $ L Ordered items
orderedListItem :: Parser ListItem
orderedListItem = do
some digit
char '.'
char ' ' <|> char '\t'
content <- manyTill inlineElement (try lineEnding)
continuations <- many listContinuation
children <- many (try indentedList)
pure $ LI (content ++ concat continuations) children
-- HTML Block
htmlBlock :: Parser Element
htmlBlock = do
start <- getPosition
char '<'
-- Capture the entire HTML block as raw text
rest <- manyTill anyChar (try $ char '>' >> lineEnding)
let content = '<' : rest
return $ HTML $ HTMLTag (T.pack content)
tagName :: Parser String
tagName = do
first <- satisfy isAlpha
rest <- many (alphaNum <|> char '-' <|> char ':')
pure (first : rest)
attribute :: Parser (Text, Maybe Text)
attribute = do
name <- attributeName
value <- optionMaybe (char '=' >> attributeValue)
pure (T.pack name, fmap T.pack value)
attributeName :: Parser String
attributeName = do
first <- satisfy isAlpha
rest <- many (alphaNum <|> char '-' <|> char ':')
pure (first : rest)
attributeValue :: Parser String
attributeValue =
choice
[ between (char '"') (char '"') (many $ noneOf "\""),
between (char '\'') (char '\'') (many $ noneOf "'"),
some (noneOf " \t\n\r>\"'=<`")
]
-- Paragraph Block
paragraphBlock :: Parser Element
paragraphBlock = do
-- Make sure we're not starting another block element
notFollowedBy (char '#')
notFollowedBy (char '>')
notFollowedBy (oneOf "*-+" >> (char ' ' <|> char '\t'))
notFollowedBy (digit >> char '.')
notFollowedBy (string "```" <|> string "~~~")
notFollowedBy (count 4 (char ' ' <|> char '\t'))
notFollowedBy (count 3 (char '*') <|> count 3 (char '-') <|> count 3 (char '_'))
notFollowedBy (char '<')
content <- some inlineElement
lineEnding
pure $ Paragraph $ P content
-- Inline Elements
inlineElement :: Parser InlineText
inlineElement =
choice
[ try strong,
try emphasis,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainText
]
-- Strong (Bold)
strong :: Parser InlineText
strong = strongAsterisk <|> strongUnderscore
strongAsterisk :: Parser InlineText
strongAsterisk = do
string "**"
content <- some (notFollowedBy (string "**") >> inlineElement)
string "**"
pure $ Bold content
strongUnderscore :: Parser InlineText
strongUnderscore = do
string "__"
content <- some (notFollowedBy (string "__") >> inlineElement)
string "__"
pure $ Bold content
-- Emphasis (Italic)
emphasis :: Parser InlineText
emphasis = emphasisAsterisk <|> emphasisUnderscore
emphasisAsterisk :: Parser InlineText
emphasisAsterisk = do
char '*'
notFollowedBy (char '*')
content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk)
char '*'
pure $ Italic content
emphasisUnderscore :: Parser InlineText
emphasisUnderscore = do
char '_'
notFollowedBy (char '_')
content <- some (notFollowedBy (char '_') >> inlineElementNoUnderscore)
char '_'
pure $ Italic content
inlineElementNoAsterisk :: Parser InlineText
inlineElementNoAsterisk =
choice
[ try strong,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainTextNoAsterisk
]
inlineElementNoUnderscore :: Parser InlineText
inlineElementNoUnderscore =
choice
[ try strong,
try codeSpan,
try image,
try link,
try htmlInline,
try escapedChar,
plainTextNoUnderscore
]
-- Code Span
codeSpan :: Parser InlineText
codeSpan =
choice
[ try tripleBacktick,
try doubleBacktick,
singleBacktick
]
where
singleBacktick = do
char '`'
content <- many (noneOf "`\n\r")
char '`'
pure $ InlineCode (T.pack content)
doubleBacktick = do
string "``"
content <- manyTill anyChar (try $ string "``")
pure $ InlineCode (T.pack content)
tripleBacktick = do
string "```"
content <- manyTill anyChar (try $ string "```")
pure $ InlineCode (T.pack content)
-- Image
image :: Parser InlineText
image = do
char '!'
char '['
alt <- T.pack <$> many (noneOf "]\n\r")
char ']'
(url, title) <- linkDestination
return $ Image {altText = alt, url = url, title = title}
-- Link
link :: Parser InlineText
link = do
char '['
content <- some (notFollowedBy (char ']') >> inlineElementNoBracket)
char ']'
(url, title) <- linkDestination
pure $ Link content url title
inlineElementNoBracket :: Parser InlineText
inlineElementNoBracket =
choice
[ try strong,
try emphasis,
try codeSpan,
try htmlInline,
try escapedChar,
plainTextNoBracket
]
linkDestination :: Parser (Text, Maybe Text)
linkDestination = directLink <|> referenceLink
where
directLink = do
char '('
url <- many (noneOf " \t\n\r)")
title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser)
char ')'
pure (T.pack url, title)
referenceLink = do
char '['
ref <- some (alphaNum <|> char ' ' <|> char '\t')
char ']'
-- For simplicity, we're not resolving references here
-- In a real implementation, you'd look up the reference
pure (T.pack $ "[" ++ ref ++ "]", Nothing)
titleParser :: Parser Text
titleParser =
T.pack
<$> choice
[ between (char '"') (char '"') (many $ noneOf "\""),
between (char '\'') (char '\'') (many $ noneOf "'"),
between (char '(') (char ')') (many $ noneOf ")")
]
-- HTML Inline
htmlInline :: Parser InlineText
htmlInline = do
start <- char '<'
content <- manyTill anyChar (try $ char '>')
return $ HTMLInline (T.pack (start : content ++ ">"))
-- Escaped Character
escapedChar :: Parser InlineText
escapedChar = do
char '\\'
c <- satisfy (\x -> x >= '!' && x <= '~')
pure $ Text (T.singleton c)
-- Plain Text
plainText :: Parser InlineText
plainText = Text . T.pack <$> some plainTextChar
plainTextChar :: Parser Char
plainTextChar = satisfy $ \c ->
(c `notElem` ("*_`[!<\\\n\r" :: String)) && c >= ' '
plainTextNoAsterisk :: Parser InlineText
plainTextNoAsterisk =
Text . T.pack
<$> some
( satisfy $ \c ->
(c `notElem` ("*_`[!<\\\n\r" :: String)) && c >= ' '
)
plainTextNoUnderscore :: Parser InlineText
plainTextNoUnderscore =
Text . T.pack
<$> some
( satisfy $ \c ->
not (c `elem` ("_*`[!<\\\n\r" :: String)) && c >= ' '
)
plainTextNoBracket :: Parser InlineText
plainTextNoBracket =
Text . T.pack
<$> some
( satisfy $ \c ->
not (c `elem` ("]_*`[!<\\\n\r" :: String)) && c >= ' '
)
-- Helper Parsers
lineEnding :: Parser ()
lineEnding = void (try (string "\r\n") <|> try (string "\n") <|> string "\r")
wsParser :: Parser ()
wsParser = void $ some (char ' ' <|> char '\t')