hooked everything up to make use of new markdown parsing, just need to write the code to generate HTML

brought in LLM impl of markdown parsing
minor cleanup
2025-11-01 17:01:59 -04:00 · 2025-11-01 17:01:17 -04:00 · 2025-11-01 17:00:57 -04:00
5 changed files with 455 additions and 67 deletions
--- a/app/HTML.hs
+++ b/app/HTML.hs
@ -0,0 +1,9 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module HTML (compileToHTML) where
+
+import Data.Text
+import IR
+
+compileToHTML :: Document -> Text
+compileToHTML = const ""
--- a/app/Main.hs
+++ b/app/Main.hs
@ -148,10 +148,9 @@ rss =
 -- Shake.putInfo $ "Built " <> target

 readPost :: FilePath -> Action Post
-readPost postPath = do
-  case FP.takeExtension postPath of
-    ".md" -> readMarkdownPost postPath
-    _ -> error $ "unknown file extension for file" <> postPath
+readPost postPath = case FP.takeExtension postPath of
+  ".md" -> readMarkdownPost postPath
+  _ -> error $ "unknown file extension for file" <> postPath

 readMarkdownPost :: FilePath -> Action Post
 readMarkdownPost postPath = do
--- a/app/Markdown.hs
+++ b/app/Markdown.hs
@ -2,91 +2,461 @@
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE RankNTypes #-}

-module Markdown (markdownParser) where
+module Markdown (document, metadata) where

-import Data.Functor
-import Data.Text
+import Control.Applicative (many, optional, some, (<|>))
+import Control.Monad (guard, void, when)
+import Data.Char (isAlpha)
+import Data.Text (Text)
+import qualified Data.Text as T
 import IR
-import Text.Parsec
-import Text.Parsec.Combinator
+import Text.Parsec hiding (Line, many, optional, (<|>))
+import Text.Parsec.String (Parser)

-type Parser a = forall s u m. (Stream s m Char) => ParsecT s u m a
+metadata :: Parser Text
+metadata = bound *> fmap T.pack (many $ try (char '-' <* notFollowedBy (string "--")) <|> noneOf "-") <* bound
+  where
+    bound = string "---"

-markdownParser :: Parser Document
-markdownParser = Doc <$> many block
+document :: Parser Document
+document = Doc <$> many element <* eof

-block :: Parser Element
-block = choice [heading, codeBlock, quoteBlock, list, table, htmlBlock, paragraph, blankLine]
+element :: Parser Element
+element =
+  choice
+    [ try headingBlock,
+      try fencedCodeBlock,
+      try indentedCodeBlock,
+      try blockquoteBlock,
+      try unorderedListBlock,
+      try orderedListBlock,
+      try horizontalRuleBlock,
+      try htmlBlock,
+      try blankLines, -- Consume blank lines but don't add to AST
+      paragraphBlock
+    ]

-heading :: Parser Element
-heading = pure $ Heading $ H {level = 1, text = ""}
+-- Blank lines (consumed but not stored)
+blankLines :: Parser Element
+blankLines = do
+  skipMany1 blankLine
+  element -- Parse the next element

-codeBlock :: Parser Element
-codeBlock = pure $ Code $ C {language = "", code = ""}
+blankLine :: Parser ()
+blankLine = do
+  many (char ' ' <|> char '\t')
+  lineEnding
+  pure ()

-quoteBlock :: Parser Element
-quoteBlock = pure $ BlockQuote $ Q ""
+-- Heading Block
+headingBlock :: Parser Element
+headingBlock = do
+  hashes <- some (char '#')
+  let level = length hashes
+  guard (level <= 6)
+  many (char ' ' <|> char '\t')
+  content <- manyTill inlineElement (try lineEnding)
+  pure $ Heading $ H level content

-list :: Parser Element
-list = pure $ List $ L {list_type = Ordered, items = []}
+-- Fenced Code Block
+fencedCodeBlock :: Parser Element
+fencedCodeBlock = do
+  fence <- string "```" <|> string "~~~"
+  lang <- optionMaybe languageInfo
+  lineEnding
+  codeLines <- manyTill codeLine (try $ string fence >> lineEnding)
+  pure $ Code $ C lang (T.pack $ unlines codeLines)

-table :: Parser Element
-table = pure $ Table $ T {header = TH [], rows = []}
+languageInfo :: Parser Text
+languageInfo = T.pack <$> many1 (alphaNum <|> char '-' <|> char '+' <|> char '.')

+codeLine :: Parser String
+codeLine = do
+  line <- many (noneOf "\n\r")
+  lineEnding
+  pure line
+
+-- Indented Code Block
+indentedCodeBlock :: Parser Element
+indentedCodeBlock = do
+  lines' <- some indentedLine
+  pure $ Code $ C Nothing (T.pack $ unlines lines')
+  where
+    indentedLine = do
+      count 4 (char ' ' <|> char '\t')
+      line <- many (noneOf "\n\r")
+      lineEnding
+      pure line
+
+-- Blockquote Block
+blockquoteBlock :: Parser Element
+blockquoteBlock = do
+  lines' <- some blockquoteLine
+  pure $ BlockQuote $ Q (concat lines')
+  where
+    blockquoteLine = do
+      char '>'
+      optional (char ' ')
+      content <- manyTill inlineElement (try lineEnding)
+      pure content
+
+-- Horizontal Rule Block
+horizontalRuleBlock :: Parser Element
+horizontalRuleBlock = do
+  choice
+    [ try (count 3 (char '*') >> many (char ' ' <|> char '*')),
+      try (count 3 (char '-') >> many (char ' ' <|> char '-')),
+      try (count 3 (char '_') >> many (char ' ' <|> char '_'))
+    ]
+  lineEnding
+  pure HorizontalRule
+
+-- Unordered List Block
+unorderedListBlock :: Parser Element
+unorderedListBlock = do
+  items <- some unorderedListItem
+  pure $ List $ L Unordered items
+
+unorderedListItem :: Parser ListItem
+unorderedListItem = do
+  oneOf "*-+"
+  char ' ' <|> char '\t'
+  content <- manyTill inlineElement (try lineEnding)
+  continuations <- many listContinuation
+  children <- many (try indentedList)
+  pure $ LI (content ++ concat continuations) children
+
+listContinuation :: Parser [InlineText]
+listContinuation = do
+  count 2 (char ' ' <|> char '\t')
+  many (char ' ' <|> char '\t')
+  notFollowedBy (oneOf "*-+")
+  notFollowedBy (digit >> char '.')
+  content <- manyTill inlineElement (try lineEnding)
+  pure content
+
+indentedList :: Parser List
+indentedList = do
+  void $ count 2 (char ' ' <|> char '\t')
+  choice [try indentedUnorderedList, indentedOrderedList]
+
+indentedUnorderedList :: Parser List
+indentedUnorderedList = do
+  items <- some (try $ indentedListItem (oneOf "*-+" >> void (char ' ' <|> char '\t')))
+  pure $ L Unordered items
+
+indentedOrderedList :: Parser List
+indentedOrderedList = do
+  items <- some (try $ indentedListItem (some digit >> char '.' >> void (char ' ' <|> char '\t')))
+  pure $ L Ordered items
+
+indentedListItem :: Parser () -> Parser ListItem
+indentedListItem marker = do
+  marker
+  content <- manyTill inlineElement (try lineEnding)
+  pure $ LI content []
+
+-- Ordered List Block
+orderedListBlock :: Parser Element
+orderedListBlock = do
+  items <- some orderedListItem
+  pure $ List $ L Ordered items
+
+orderedListItem :: Parser ListItem
+orderedListItem = do
+  some digit
+  char '.'
+  char ' ' <|> char '\t'
+  content <- manyTill inlineElement (try lineEnding)
+  continuations <- many listContinuation
+  children <- many (try indentedList)
+  pure $ LI (content ++ concat continuations) children
+
+-- HTML Block
 htmlBlock :: Parser Element
-htmlBlock = pure $ HTML $ Raw ""
+htmlBlock = char '<' *> choice [try htmlCommentBlock, try htmlDeclarationBlock, htmlTagBlock]

-paragraph :: Parser Element
-paragraph = do
-  first <- paragraphLine
-  rem <- many paragraphContinuation
-  let combined = Prelude.concat (first : rem)
-  pure $ Paragraph $ P combined
+htmlCommentBlock :: Parser Element
+htmlCommentBlock = do
+  string "!--"
+  content <- manyTill anyChar (try $ string "-->")
+  lineEnding
+  pure $ HTML $ HTMLTag "!--" [] (T.pack content)

-paragraphLine :: Parser [InlineText]
-paragraphLine = many inlineText <* endOfLine
+htmlDeclarationBlock :: Parser Element
+htmlDeclarationBlock = do
+  char '!'
+  decl <- some (satisfy isAlpha)
+  rest <- many (noneOf ">\n\r")
+  char '>'
+  lineEnding
+  pure $ HTML $ HTMLTag (T.pack $ "!" ++ decl) [] (T.pack rest)

-paragraphContinuation :: Parser [InlineText]
-paragraphContinuation = notFollowedBy blockElemStart *> paragraphLine
+htmlTagBlock :: Parser Element
+htmlTagBlock = do
+  name <- Markdown.tagName
+  attrs <- many (try $ wsParser >> attribute)
+  optional wsParser
+  selfClose <- option False (char '/' >> pure True)
+  char '>'
+  content <-
+    if selfClose
+      then pure ""
+      else manyTill anyChar (try $ string "</" >> string name >> char '>')
+  when (not selfClose) lineEnding
+  pure $ HTML $ HTMLTag (T.pack name) attrs (T.pack content)

-inlineText :: Parser InlineText
-inlineText = choice [emphasis, strong, inlineCode, link, image, inlineHTML, escapedChar, plainText]
+tagName :: Parser String
+tagName = do
+  first <- satisfy isAlpha
+  rest <- many (alphaNum <|> char '-' <|> char ':')
+  pure (first : rest)

-plainText :: Parser InlineText
-- abnf is very specific about what's allowed due to actual ABNF not allowing negation but I'm lazy
-plainText = fmap (Normal . pack) $ many $ noneOf "*_`[]()<>#+-.!&\\\n"
+attribute :: Parser (Text, Maybe Text)
+attribute = do
+  name <- attributeName
+  value <- optionMaybe (char '=' >> attributeValue)
+  pure (T.pack name, fmap T.pack value)

-escapedChar :: Parser InlineText
-escapedChar = char '\\' *> fmap Escaped visibleChar
+attributeName :: Parser String
+attributeName = do
+  first <- satisfy isAlpha
+  rest <- many (alphaNum <|> char '-' <|> char ':')
+  pure (first : rest)

+attributeValue :: Parser String
+attributeValue =
+  choice
+    [ between (char '"') (char '"') (many $ noneOf "\""),
+      between (char '\'') (char '\'') (many $ noneOf "'"),
+      some (noneOf " \t\n\r>\"'=<`")
+    ]
+
+-- Paragraph Block
+paragraphBlock :: Parser Element
+paragraphBlock = do
+  -- Make sure we're not starting another block element
+  notFollowedBy (char '#')
+  notFollowedBy (char '>')
+  notFollowedBy (oneOf "*-+" >> (char ' ' <|> char '\t'))
+  notFollowedBy (digit >> char '.')
+  notFollowedBy (string "```" <|> string "~~~")
+  notFollowedBy (count 4 (char ' ' <|> char '\t'))
+  notFollowedBy (count 3 (char '*') <|> count 3 (char '-') <|> count 3 (char '_'))
+  notFollowedBy (char '<')
+
+  content <- some inlineElement
+  lineEnding
+  pure $ Paragraph $ P content
+
+-- Inline Elements
+inlineElement :: Parser InlineText
+inlineElement =
+  choice
+    [ try strong,
+      try emphasis,
+      try codeSpan,
+      try image,
+      try link,
+      try htmlInline,
+      try escapedChar,
+      plainText
+    ]
+
+-- Strong (Bold)
+strong :: Parser InlineText
+strong = strongAsterisk <|> strongUnderscore
+
+strongAsterisk :: Parser InlineText
+strongAsterisk = do
+  string "**"
+  content <- some (notFollowedBy (string "**") >> inlineElement)
+  string "**"
+  pure $ Bold content
+
+strongUnderscore :: Parser InlineText
+strongUnderscore = do
+  string "__"
+  content <- some (notFollowedBy (string "__") >> inlineElement)
+  string "__"
+  pure $ Bold content
+
+-- Emphasis (Italic)
+emphasis :: Parser InlineText
+emphasis = emphasisAsterisk <|> emphasisUnderscore
+
+emphasisAsterisk :: Parser InlineText
+emphasisAsterisk = do
+  char '*'
+  notFollowedBy (char '*')
+  content <- some (notFollowedBy (char '*') >> inlineElementNoAsterisk)
+  char '*'
+  pure $ Italic content
+
+emphasisUnderscore :: Parser InlineText
+emphasisUnderscore = do
+  char '_'
+  notFollowedBy (char '_')
+  content <- some (notFollowedBy (char '_') >> inlineElementNoUnderscore)
+  char '_'
+  pure $ Italic content
+
+inlineElementNoAsterisk :: Parser InlineText
+inlineElementNoAsterisk =
+  choice
+    [ try strong,
+      try codeSpan,
+      try image,
+      try link,
+      try htmlInline,
+      try escapedChar,
+      plainTextNoAsterisk
+    ]
+
+inlineElementNoUnderscore :: Parser InlineText
+inlineElementNoUnderscore =
+  choice
+    [ try strong,
+      try codeSpan,
+      try image,
+      try link,
+      try htmlInline,
+      try escapedChar,
+      plainTextNoUnderscore
+    ]
+
+-- Code Span
+codeSpan :: Parser InlineText
+codeSpan =
+  choice
+    [ try tripleBacktick,
+      try doubleBacktick,
+      singleBacktick
+    ]
+  where
+    singleBacktick = do
+      char '`'
+      content <- many (noneOf "`\n\r")
+      char '`'
+      pure $ InlineCode (T.pack content)
+    doubleBacktick = do
+      string "``"
+      content <- manyTill anyChar (try $ string "``")
+      pure $ InlineCode (T.pack content)
+    tripleBacktick = do
+      string "```"
+      content <- manyTill anyChar (try $ string "```")
+      pure $ InlineCode (T.pack content)
+
+-- Image
+image :: Parser InlineText
+image = do
+  char '!'
+  char '['
+  alt <- some (notFollowedBy (char ']') >> inlineElementNoBracket)
+  char ']'
+  (url, title) <- linkDestination
+  pure $ Image alt url title
+
+-- Link
+link :: Parser InlineText
+link = do
+  char '['
+  content <- some (notFollowedBy (char ']') >> inlineElementNoBracket)
+  char ']'
+  (url, title) <- linkDestination
+  pure $ Link content url title
+
+inlineElementNoBracket :: Parser InlineText
+inlineElementNoBracket =
+  choice
+    [ try strong,
+      try emphasis,
+      try codeSpan,
+      try htmlInline,
+      try escapedChar,
+      plainTextNoBracket
+    ]
+
+linkDestination :: Parser (Text, Maybe Text)
+linkDestination = directLink <|> referenceLink
+  where
+    directLink = do
+      char '('
+      url <- many (noneOf " \t\n\r)")
+      title <- optionMaybe (try $ some (char ' ' <|> char '\t') >> titleParser)
+      char ')'
+      pure (T.pack url, title)
+    referenceLink = do
+      char '['
+      ref <- some (alphaNum <|> char ' ' <|> char '\t')
+      char ']'
+      -- For simplicity, we're not resolving references here
+      -- In a real implementation, you'd look up the reference
+      pure (T.pack $ "[" ++ ref ++ "]", Nothing)
+
+titleParser :: Parser Text
+titleParser =
+  T.pack
+    <$> choice
+      [ between (char '"') (char '"') (many $ noneOf "\""),
+        between (char '\'') (char '\'') (many $ noneOf "'"),
+        between (char '(') (char ')') (many $ noneOf ")")
+      ]
+
+-- HTML Inline
 htmlInline :: Parser InlineText
 htmlInline = do
  char '<'
-  tagName <- name
-  remaining <- htmlInlineRemainder
-  whiteSpace
+  name <- Markdown.tagName
+  attrs <- many (try $ wsParser >> attribute)
+  optional wsParser
+  _ <- option False (char '/' >> pure True)
  char '>'
-  let remainingTagText = foldl' (\ongoing current -> ongoing ++ ' ' : current) "" remaining
+  pure $ HTMLInline (T.pack name) attrs

-  pure $ HTMLIn $ pack $ '<' : name ++ remaining
-  where
-    htmlInlineRemainder = many $ whiteSpace *> attribute
-    name = many $ choice [alphaNum, char '-', char ':']
-    value = do
-      char '"'
-      l <- letter
-      rem <- many $ choice [alphaNum, char '-', char ':']
-      char '"'
-      pure '"' : l : rem ++ "\""
-    attribute = do
-      attrName <- name
-      char '='
-      attrValue <- value
-      pure attrName ++ ('=' : attrValue)
+-- Escaped Character
+escapedChar :: Parser InlineText
+escapedChar = do
+  char '\\'
+  c <- satisfy (\x -> x >= '!' && x <= '~')
+  pure $ Text (T.singleton c)

-whiteSpace :: Parser Text
-whiteSpace = pack <$> many space
+-- Plain Text
+plainText :: Parser InlineText
+plainText = Text . T.pack <$> some plainTextChar

-visibleChar :: Parser Char
-- technically more strict but I'm just going to hope I never have to deal with that
-visibleChar = anyChar
+plainTextChar :: Parser Char
+plainTextChar = satisfy $ \c ->
+  (c `notElem` ("*_`[!<\\\n\r" :: String)) && c >= ' '
+
+plainTextNoAsterisk :: Parser InlineText
+plainTextNoAsterisk =
+  Text . T.pack
+    <$> some
+      ( satisfy $ \c ->
+          (c `notElem` ("*_`[!<\\\n\r" :: String)) && c >= ' '
+      )
+
+plainTextNoUnderscore :: Parser InlineText
+plainTextNoUnderscore =
+  Text . T.pack
+    <$> some
+      ( satisfy $ \c ->
+          not (c `elem` ("_*`[!<\\\n\r" :: String)) && c >= ' '
+      )
+
+plainTextNoBracket :: Parser InlineText
+plainTextNoBracket =
+  Text . T.pack
+    <$> some
+      ( satisfy $ \c ->
+          not (c `elem` ("]_*`[!<\\\n\r" :: String)) && c >= ' '
+      )
+
+-- Helper Parsers
+lineEnding :: Parser ()
+lineEnding = void (try (string "\r\n") <|> try (string "\n") <|> string "\r")
+
+wsParser :: Parser ()
+wsParser = void $ some (char ' ' <|> char '\t')
--- a/app/Utilities.hs
+++ b/app/Utilities.hs
@ -7,6 +7,7 @@ import qualified Data.Aeson as A
 import Data.List (find)
 import Data.Text (Text)
 import qualified Data.Text as T
+import Data.Text.Encoding (encodeUtf8)
 import Data.Time
 import Data.Time.Format.ISO8601 (iso8601Show)
 import Data.Yaml.Aeson
@ -14,8 +15,11 @@ import Development.Shake (Action)
 import qualified Development.Shake as Shake
 import Development.Shake.FilePath ((<.>), (</>))
 import qualified Development.Shake.FilePath as FP
+import HTML
+import Markdown
 import Text.Pandoc (Block (Plain), Meta (..), MetaValue (..), Pandoc (..))
 import qualified Text.Pandoc as Pandoc
+import Text.Parsec hiding (Error)
 import Types

 indexHtmlOutputPath :: FilePath -> FilePath
@ -36,6 +40,12 @@ indexHtmlMarkdownSourcePath =

 markdownToHtml :: (FromJSON a) => FilePath -> Action (a, Text)
 markdownToHtml filePath = do
+  content <- Shake.readFile' filePath
+  let Right (metadataText, document) = parse (liftA2 (,) Markdown.metadata Markdown.document) filePath content
+  let Right metadata = decodeEither' $ encodeUtf8 metadataText
+  pure (metadata, compileToHTML document)
+
+markdownToHtml_ filePath = do
  content <- Shake.readFile' filePath
  Shake.quietly . Shake.traced "Markdown to HTML" $ do
    pandoc@(Pandoc meta _) <-
--- a/psb.cabal
+++ b/psb.cabal
@ -29,7 +29,7 @@ executable psb
    -- .hs or .lhs file containing the Main module.
    main-is:          Main.hs

-    other-modules: Config Utilities Templates Types IR Markdown Restruct
+    other-modules: Config Utilities Templates Types IR Markdown Restruct HTML

    default-extensions: ApplicativeDo DataKinds NamedFieldPuns DerivingVia LambdaCase TypeApplications DeriveGeneric
Author	SHA1	Message	Date
Pagwin	d61fd86b76	hooked everything up to make use of new markdown parsing, just need to write the code to generate HTML	2025-11-01 17:01:59 -04:00
Pagwin	e1064e342f	brought in LLM impl of markdown parsing	2025-11-01 17:01:17 -04:00
Pagwin	b346c399f5	minor cleanup	2025-11-01 17:00:57 -04:00