434 lines
11 KiB
Haskell
434 lines
11 KiB
Haskell
{-# LANGUAGE FlexibleContexts #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
{-# LANGUAGE RankNTypes #-}
|
|
|
|
-- (document, metadata)
|
|
module Markdown where
|
|
|
|
import Control.Applicative (many, optional, some, (<|>))
|
|
import Control.Monad (guard, void)
|
|
import Data.Char (isAlpha)
|
|
import Data.Text (Text)
|
|
import qualified Data.Text as T
|
|
import IR
|
|
import Text.Parsec hiding (Line, many, optional, (<|>))
|
|
import Text.Parsec.String (Parser)
|
|
|
|
metadata :: Parser Text
|
|
metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> noneOf "-") <* bound
|
|
where
|
|
bound = string "---"
|
|
|
|
document :: Parser Document
|
|
document = Doc <$> many element <* eof
|
|
|
|
element :: Parser Element
|
|
element =
|
|
choice
|
|
[ try headingBlock,
|
|
try fencedCodeBlock,
|
|
try indentedCodeBlock,
|
|
try blockquoteBlock,
|
|
try unorderedListBlock,
|
|
try orderedListBlock,
|
|
try horizontalRuleBlock,
|
|
try htmlBlock,
|
|
try blankLines, -- Consume blank lines but don't add to AST
|
|
paragraphBlock
|
|
]
|
|
|
|
-- Blank lines (consumed but not stored)
|
|
blankLines :: Parser Element
|
|
blankLines = do
|
|
skipMany1 blankLine
|
|
element -- Parse the next element
|
|
|
|
blankLine :: Parser ()
|
|
blankLine = do
|
|
many (char ' ' <|> char '\t')
|
|
lineEnding
|
|
pure ()
|
|
|
|
-- Heading Block
|
|
headingBlock :: Parser Element
|
|
headingBlock = do
|
|
hashes <- some (char '#')
|
|
let level = length hashes
|
|
guard (level <= 6)
|
|
many (char ' ' <|> char '\t')
|
|
content <- manyTill inlineElement (try lineEnding)
|
|
pure $ Heading $ H level content
|
|
|
|
-- Fenced Code Block
|
|
fencedCodeBlock :: Parser Element
|
|
fencedCodeBlock = do
|
|
fence <- string "```" <|> string "~~~"
|
|
lang <- optionMaybe languageInfo
|
|
lineEnding
|
|
codeLines <- manyTill codeLine (try $ string fence >> lineEnding)
|
|
pure $ Code $ C lang (T.pack $ unlines codeLines)
|
|
|
|
languageInfo :: Parser Text
|
|
languageInfo = T.pack <$> many1 (alphaNum <|> char '-' <|> char '+' <|> char '.')
|
|
|
|
codeLine :: Parser String
|
|
codeLine = do
|
|
line <- many (noneOf "\n\r")
|
|
lineEnding
|
|
pure line
|
|
|
|
-- Indented Code Block
|
|
indentedCodeBlock :: Parser Element
|
|
indentedCodeBlock = do
|
|
lines' <- some indentedLine
|
|
pure $ Code $ C Nothing (T.pack $ unlines lines')
|
|
where
|
|
indentedLine = do
|
|
count 4 (char ' ' <|> char '\t')
|
|
line <- many (noneOf "\n\r")
|
|
lineEnding
|
|
pure line
|
|
|
|
-- Blockquote Block
|
|
blockquoteBlock :: Parser Element
|
|
blockquoteBlock = do
|
|
lines' <- some blockquoteLine
|
|
pure $ BlockQuote $ Q (concat lines')
|
|
where
|
|
blockquoteLine = do
|
|
char '>'
|
|
optional (char ' ')
|
|
content <- manyTill inlineElement (try lineEnding)
|
|
pure content
|
|
|
|
-- Horizontal Rule Block
|
|
horizontalRuleBlock :: Parser Element
|
|
horizontalRuleBlock = do
|
|
choice
|
|
[ try (count 3 (char '*') >> many (char ' ' <|> char '*')),
|
|
try (count 3 (char '-') >> many (char ' ' <|> char '-')),
|
|
try (count 3 (char '_') >> many (char ' ' <|> char '_'))
|
|
]
|
|
lineEnding
|
|
pure HorizontalRule
|
|
|
|
-- Unordered List Block
|
|
unorderedListBlock :: Parser Element
|
|
unorderedListBlock = do
|
|
items <- some unorderedListItem
|
|
pure $ List $ L Unordered items
|
|
|
|
unorderedListItem :: Parser ListItem
|
|
unorderedListItem = do
|
|
oneOf "*-+"
|
|
char ' ' <|> char '\t'
|
|
content <- manyTill inlineElement (try lineEnding)
|
|
continuations <- many listContinuation
|
|
children <- many (try indentedList)
|
|
pure $ LI (content ++ concat continuations) children
|
|
|
|
listContinuation :: Parser [InlineText]
|
|
listContinuation = do
|
|
count 2 (char ' ' <|> char '\t')
|
|
many (char ' ' <|> char '\t')
|
|
notFollowedBy (oneOf "*-+")
|
|
notFollowedBy (digit >> char '.')
|
|
content <- manyTill inlineElement (try lineEnding)
|
|
pure content
|
|
|
|
indentedList :: Parser List
|
|
indentedList = do
|
|
void $ count 2 (char ' ' <|> char '\t')
|
|
choice [try indentedUnorderedList, indentedOrderedList]
|
|
|
|
indentedUnorderedList :: Parser List
|
|
indentedUnorderedList = do
|
|
items <- some (try $ indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t')))
|
|
pure $ L Unordered items
|
|
|
|
indentedOrderedList :: Parser List
|
|
indentedOrderedList = do
|
|
items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t')))
|
|
pure $ L Ordered items
|
|
|
|
indentedListItem :: Parser () -> Parser ListItem
|
|
indentedListItem marker = do
|
|
marker
|
|
content <- manyTill inlineElement (try lineEnding)
|
|
pure $ LI content []
|
|
|
|
-- Ordered List Block
|
|
orderedListBlock :: Parser Element
|
|
orderedListBlock = do
|
|
items <- some orderedListItem
|
|
pure $ List $ L Ordered items
|
|
|
|
orderedListItem :: Parser ListItem
|
|
orderedListItem = do
|
|
some digit
|
|
char '.'
|
|
char ' ' <|> char '\t'
|
|
content <- manyTill inlineElement (try lineEnding)
|
|
continuations <- many listContinuation
|
|
children <- many (try indentedList)
|
|
pure $ LI (content ++ concat continuations) children
|
|
|
|
-- HTML Block
|
|
htmlBlock :: Parser Element
|
|
htmlBlock = do
|
|
char '<'
|
|
-- Capture the entire HTML block as raw text
|
|
rest <- manyTill anyChar (try $ char '>' >> lineEnding)
|
|
let content = '<' : rest
|
|
return $ HTML $ HTMLTag (T.pack content)
|
|
|
|
tagName :: Parser String
|
|
tagName = do
|
|
first <- satisfy isAlpha
|
|
rest <- many (alphaNum <|> char '-' <|> char ':')
|
|
pure (first : rest)
|
|
|
|
attribute :: Parser (Text, Maybe Text)
|
|
attribute = do
|
|
name <- attributeName
|
|
value <- optionMaybe (char '=' >> attributeValue)
|
|
pure (T.pack name, fmap T.pack value)
|
|
|
|
attributeName :: Parser String
|
|
attributeName = do
|
|
first <- satisfy isAlpha
|
|
rest <- many (alphaNum <|> char '-' <|> char ':')
|
|
pure (first : rest)
|
|
|
|
attributeValue :: Parser String
|
|
attributeValue =
|
|
choice
|
|
[ between (char '"') (char '"') (many $ noneOf "\""),
|
|
between (char '\'') (char '\'') (many $ noneOf "'"),
|
|
some (noneOf " \t\n\r>\"'=<`")
|
|
]
|
|
|
|
-- Paragraph Block
|
|
paragraphBlock :: Parser Element
|
|
paragraphBlock = do
|
|
-- Make sure we're not starting another block element
|
|
notFollowedBy (char '#')
|
|
notFollowedBy (char '>')
|
|
notFollowedBy (oneOf "*-+" >> (char ' ' <|> char '\t'))
|
|
notFollowedBy (digit >> char '.')
|
|
notFollowedBy (string "```" <|> string "~~~")
|
|
notFollowedBy (count 4 (char ' ' <|> char '\t'))
|
|
notFollowedBy (count 3 (char '*') <|> count 3 (char '-') <|> count 3 (char '_'))
|
|
-- notFollowedBy (char '<')
|
|
|
|
content <- some inlineElement
|
|
lineEnding <|> eof
|
|
pure $ Paragraph $ P content
|
|
|
|
-- Inline Elements
|
|
inlineElement :: Parser InlineText
|
|
inlineElement =
|
|
choice
|
|
[ try strong,
|
|
try emphasis,
|
|
try codeSpan,
|
|
try image,
|
|
try link,
|
|
try htmlInline,
|
|
try escapedChar,
|
|
plainText
|
|
]
|
|
|
|
-- Strong (Bold)
|
|
strong :: Parser InlineText
|
|
strong = strongAsterisk <|> strongUnderscore
|
|
|
|
strongAsterisk :: Parser InlineText
|
|
strongAsterisk = do
|
|
string "**"
|
|
content <- some (notFollowedBy (string "**") >> inlineElement)
|
|
string "**"
|
|
pure $ Bold content
|
|
|
|
strongUnderscore :: Parser InlineText
|
|
strongUnderscore = do
|
|
string "__"
|
|
content <- some (notFollowedBy (string "__") >> inlineElement)
|
|
string "__"
|
|
pure $ Bold content
|
|
|
|
-- Emphasis (Italic)
|
|
emphasis :: Parser InlineText
|
|
emphasis = emphasisAsterisk <|> emphasisUnderscore
|
|
|
|
emphasisAsterisk :: Parser InlineText
|
|
emphasisAsterisk = do
|
|
char '*'
|
|
notFollowedBy (char '*')
|
|
content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk)
|
|
char '*'
|
|
pure $ Italic content
|
|
|
|
emphasisUnderscore :: Parser InlineText
|
|
emphasisUnderscore = do
|
|
char '_'
|
|
notFollowedBy (char '_')
|
|
content <- some (notFollowedBy (char '_') >> inlineElementNoUnderscore)
|
|
char '_'
|
|
pure $ Italic content
|
|
|
|
inlineElementNoAsterisk :: Parser InlineText
|
|
inlineElementNoAsterisk =
|
|
choice
|
|
[ try strong,
|
|
try codeSpan,
|
|
try image,
|
|
try link,
|
|
try htmlInline,
|
|
try escapedChar,
|
|
plainTextNoAsterisk
|
|
]
|
|
|
|
inlineElementNoUnderscore :: Parser InlineText
|
|
inlineElementNoUnderscore =
|
|
choice
|
|
[ try strong,
|
|
try codeSpan,
|
|
try image,
|
|
try link,
|
|
try htmlInline,
|
|
try escapedChar,
|
|
plainTextNoUnderscore
|
|
]
|
|
|
|
-- Code Span
|
|
codeSpan :: Parser InlineText
|
|
codeSpan =
|
|
choice
|
|
[ try tripleBacktick,
|
|
try doubleBacktick,
|
|
singleBacktick
|
|
]
|
|
where
|
|
singleBacktick = do
|
|
char '`'
|
|
content <- many (noneOf "`\n\r")
|
|
char '`'
|
|
pure $ InlineCode (T.pack content)
|
|
doubleBacktick = do
|
|
string "``"
|
|
content <- manyTill anyChar (try $ string "``")
|
|
pure $ InlineCode (T.pack content)
|
|
tripleBacktick = do
|
|
string "```"
|
|
content <- manyTill anyChar (try $ string "```")
|
|
pure $ InlineCode (T.pack content)
|
|
|
|
-- Image
|
|
image :: Parser InlineText
|
|
image = do
|
|
char '!'
|
|
char '['
|
|
alt <- T.pack <$> many (noneOf "]\n\r")
|
|
char ']'
|
|
(url, title) <- linkDestination
|
|
return $ Image {altText = alt, url = url, title = title}
|
|
|
|
-- Link
|
|
link :: Parser InlineText
|
|
link = do
|
|
char '['
|
|
content <- some (notFollowedBy (char ']') >> inlineElementNoBracket)
|
|
char ']'
|
|
(url, title) <- linkDestination
|
|
pure $ Link content url title
|
|
|
|
inlineElementNoBracket :: Parser InlineText
|
|
inlineElementNoBracket =
|
|
choice
|
|
[ try strong,
|
|
try emphasis,
|
|
try codeSpan,
|
|
try htmlInline,
|
|
try escapedChar,
|
|
plainTextNoBracket
|
|
]
|
|
|
|
linkDestination :: Parser (Text, Maybe Text)
|
|
linkDestination = directLink <|> referenceLink
|
|
where
|
|
directLink = do
|
|
char '('
|
|
url <- many (noneOf " \t\n\r)")
|
|
title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser)
|
|
char ')'
|
|
pure (T.pack url, title)
|
|
referenceLink = do
|
|
char '['
|
|
ref <- some (alphaNum <|> char ' ' <|> char '\t')
|
|
char ']'
|
|
-- For simplicity, we're not resolving references here
|
|
-- In a real implementation, you'd look up the reference
|
|
pure (T.pack $ "[" ++ ref ++ "]", Nothing)
|
|
|
|
titleParser :: Parser Text
|
|
titleParser =
|
|
T.pack
|
|
<$> choice
|
|
[ between (char '"') (char '"') (many $ noneOf "\""),
|
|
between (char '\'') (char '\'') (many $ noneOf "'"),
|
|
between (char '(') (char ')') (many $ noneOf ")")
|
|
]
|
|
|
|
-- HTML Inline
|
|
htmlInline :: Parser InlineText
|
|
htmlInline = do
|
|
start <- char '<'
|
|
content <- manyTill anyChar (try $ char '>')
|
|
return $ HTMLInline (T.pack (start : content ++ ">"))
|
|
|
|
-- Escaped Character
|
|
escapedChar :: Parser InlineText
|
|
escapedChar = do
|
|
char '\\'
|
|
c <- satisfy (\x -> x >= '!' && x <= '~')
|
|
pure $ Text (T.singleton c)
|
|
|
|
-- Plain Text
|
|
plainText :: Parser InlineText
|
|
plainText = Text . T.pack <$> some plainTextChar
|
|
|
|
plainTextChar :: Parser Char
|
|
plainTextChar = satisfy $ \c ->
|
|
(c `notElem` ("*_`[<\\\n\r" :: String)) && c >= ' '
|
|
|
|
plainTextNoAsterisk :: Parser InlineText
|
|
plainTextNoAsterisk =
|
|
Text . T.pack
|
|
<$> some
|
|
( satisfy $ \c ->
|
|
(c `notElem` ("*_`[<\\\n\r" :: String)) && c >= ' '
|
|
)
|
|
|
|
plainTextNoUnderscore :: Parser InlineText
|
|
plainTextNoUnderscore =
|
|
Text . T.pack
|
|
<$> some
|
|
( satisfy $ \c ->
|
|
not (c `elem` ("_*`[<\\\n\r" :: String)) && c >= ' '
|
|
)
|
|
|
|
plainTextNoBracket :: Parser InlineText
|
|
plainTextNoBracket =
|
|
Text . T.pack
|
|
<$> some
|
|
( satisfy $ \c ->
|
|
not (c `elem` ("]_*`[<\\\n\r" :: String)) && c >= ' '
|
|
)
|
|
|
|
-- Helper Parsers
|
|
lineEnding :: Parser ()
|
|
lineEnding = void (try (string "\r\n") <|> try (string "\n") <|> string "\r")
|
|
|
|
wsParser :: Parser ()
|
|
wsParser = void $ some (char ' ' <|> char '\t')
|