Compare commits

...

2 commits

Author SHA1 Message Date
Pagwin
b6a689868b
Brought in code by Gemini for handling lists and Task lists
lists and task lists are a PITA to handle, I may need to handle them
myself in the future but that'll probably become clear via testing
rather than simply doing it as is because the parsing logic is annoying
2026-04-10 16:17:37 -04:00
Pagwin
1a2feb95e9
bunch of work on djot 2026-04-10 15:22:06 -04:00
6 changed files with 361 additions and 33 deletions

View file

@ -27,7 +27,7 @@ common warnings
library
hs-source-dirs: src
exposed-modules: Djot Markdown HTML Logger IR Logger.Shake Psb.Main Utilities Utilities.FilePath Utilities.Action Utilities.Javascript Utilities.CSS Templates Types Config Utilities.Bundling
exposed-modules: Djot Djot.Lists Markdown HTML Logger IR Logger.Shake Psb.Main Utilities Utilities.FilePath Utilities.Action Utilities.Javascript Utilities.CSS Templates Types Config Utilities.Bundling
other-modules: Utilities.Parsing
build-depends: base, mustache >=2.4.2, shake >= 0.19.8, deriving-aeson >= 0.2.9, aeson, text >= 2.1.2, time, unordered-containers, yaml, megaparsec >= 9.7.0, transformers >= 0.6.2, bytestring
default-extensions: ApplicativeDo DataKinds NamedFieldPuns DerivingVia LambdaCase TypeApplications DeriveGeneric OverloadedRecordDot NamedFieldPuns DuplicateRecordFields DisambiguateRecordFields FlexibleInstances

View file

@ -1,3 +1,4 @@
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
@ -10,14 +11,18 @@ module Djot
where
import Control.Applicative (many, optional, some, (<|>))
import Control.Monad.ST (runST)
import Data.Foldable (for_)
import Data.Functor (void, (<$>))
import Data.List (elemIndex)
import qualified Data.List.NonEmpty as NE
import Data.STRef (modifySTRef', newSTRef, readSTRef, writeSTRef)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Djot.Lists
import IR
import Logger (Logger (logCallStack, logDebug, logError))
import Text.Megaparsec (MonadParsec (lookAhead, notFollowedBy, parseError, try), ParseErrorBundle (ParseErrorBundle, bundleErrors), SourcePos (sourceColumn), anySingle, choice, errorOffset, getInput, getOffset, manyTill, parse, sepBy, setErrorOffset, someTill)
import Text.Megaparsec (MonadParsec (lookAhead, notFollowedBy, parseError, try), ParseErrorBundle (ParseErrorBundle, bundleErrors), SourcePos (sourceColumn), anySingle, choice, errorOffset, getInput, getOffset, manyTill, parse, satisfy, sepBy, setErrorOffset, someTill)
import Text.Megaparsec.Char (char, lowerChar, newline, numberChar, space, string, tab, upperChar)
import Utilities.Parsing
@ -36,9 +41,8 @@ blockElement accumulated_attributes =
choice
[ lookAhead (char '#') *> header accumulated_attributes,
lookAhead (char '>') *> blockQuote accumulated_attributes,
lookAhead listMarker
*> (lookAhead (bullet *> taskMarker) *> taskListBlock accumulated_attributes)
<|> listBlock accumulated_attributes,
try $ taskListBlock accumulated_attributes,
try $ listBlock accumulated_attributes,
lookAhead codeFence
*> (lookAhead (codeFence *> rawLang) *> rawBlock accumulated_attributes)
<|> codeBlock accumulated_attributes,
@ -46,10 +50,10 @@ blockElement accumulated_attributes =
try $ thematicBreak accumulated_attributes,
lookAhead (string ":::") *> containerBlock accumulated_attributes,
-- try used due to table having a non-trivial structure at the start
try tableBlock accumulated_attributes,
try $ tableBlock accumulated_attributes,
-- using try due to ambiguity between these and normal text until we've already done some amount of parsing
try footnoteDefinition accumulated_attributes,
try referenceDef accumulated_attributes,
try $ footnoteDefinition accumulated_attributes,
try $ referenceDef accumulated_attributes,
lookAhead (char '{') *> blockAttribute accumulated_attributes,
paragraph accumulated_attributes
]
@ -86,7 +90,9 @@ blockQuote attrs = do
parseError $ remap $ NE.head errs
where
blockQuote' = do
pure $ BlockQuote (error "todo") attrs
-- using document for convenience
(Doc contents) <- document
pure $ BlockQuote (Q contents) attrs
bq_line = do
char '>'
space
@ -94,21 +100,8 @@ blockQuote attrs = do
listBlock :: (Logger m, Characters s) => Attrs -> Parser s m Element
listBlock attrs = do
error "this is probably fundamentally wrong and needs to be redone to work"
startOffset <- getOffset
list_type <- listMarker
first_item <- list_item_content
rem_items <- manyTill (listMarker *> list_item_content) $ lookAhead blockSeparator
let items = first_item : rem_items
case parse listBlock' "" $ error "todo: figure out how to take our list of items and put them toegether for easy parsing" of
Right ret -> pure ret
Left (ParseErrorBundle errs _) -> do
logError "Error in blockQuote offset may be off"
let remap err = setErrorOffset (errorOffset err + startOffset) err
parseError $ remap $ NE.head errs
where
list_item_content = error "todo"
listBlock' = error "todo"
list <- Djot.Lists.djotList
error ""
taskListBlock :: (Logger m, Characters s) => Attrs -> Parser s m Element
taskListBlock attrs = do
@ -127,6 +120,41 @@ lowerAlphabet = ['a' .. 'z']
upperAlphabet :: [Char]
upperAlphabet = ['A' .. 'Z']
-- inner parsing of roman numerals done via Gemini due to lazy
romanValue :: Char -> Int
romanValue 'I' = 1
romanValue 'V' = 5
romanValue 'X' = 10
romanValue 'L' = 50
romanValue 'C' = 100
romanValue 'D' = 500
romanValue 'M' = 1000
romanValue _ = 0
romanToInt :: Text -> Int
romanToInt s = runST $ do
total <- newSTRef 0
prevVal <- newSTRef 0
for_ (T.unpack s) $ \char -> do
let curr = romanValue char
p <- readSTRef prevVal
-- If current > previous (e.g., IV), we subtract the previous twice
if curr > p
then modifySTRef' total (\t -> t + curr - 2 * p)
else modifySTRef' total (+ curr)
writeSTRef prevVal curr
readSTRef total
upperRoman :: [Char]
upperRoman = "IVXLCDM"
lowerRoman :: [Char]
lowerRoman = T.unpack $ T.toLower $ T.pack upperRoman
-- consumes whitespace as well for convenience in element parser
listMarker :: (Logger m, Characters s) => Parser s m ListType
listMarker = choice [bullet, decimal, try lower_letter, try upper_letter, try lower_roman_numeral, try upper_roman_numeral] <* space
@ -145,7 +173,14 @@ listMarker = choice [bullet, decimal, try lower_letter, try upper_letter, try lo
letter <- choice $ map ($ upperChar) [try . surroundParen, try . rightParen, point]
let start_number = elemIndex letter upperAlphabet
pure $ Ordered {start_number, style = Just "A"}
roman_numeral = error "todo"
lower_roman_numeral = do
numeral <- choice $ map ($ some (satisfy (`elem` lowerRoman))) [try . surroundParen, try . rightParen, point]
let start_number = Just $ romanToInt $ T.toUpper $ toText numeral
pure $ Ordered {start_number, style = Just "i"}
upper_roman_numeral = do
numeral <- choice $ map ($ some (satisfy (`elem` upperRoman))) [try . surroundParen, try . rightParen, point]
let start_number = Just $ romanToInt $ T.toUpper $ toText numeral
pure $ Ordered {start_number, style = Just "I"}
surroundParen :: (Logger m, Characters s) => Parser s m a -> Parser s m a
surroundParen parser = do
@ -180,7 +215,7 @@ codeBlock attrs = do
else
Just $ toText language'
code <- toText <$> manyTill anySingle codeFence
pure $ Code $ C {language, code}
pure $ Code (C {language, code}) attrs
blockAttribute :: (Logger m, Characters s) => Attrs -> Parser s m Element
blockAttribute attrs = do
@ -310,4 +345,76 @@ inlineContent' opened = someTill (inlineElement opened) $ lookAhead blockSeparat
inlineElement :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
inlineElement opened =
choice
[lookAhead (string "![") *> image, lookAhead]
-- spamming try because backtracking is easier than having state for partial parses
[ try $ image opened,
try $ Djot.link opened,
try $ autolink opened,
try $ verbatim opened,
try $ emphasis opened,
try $ highlight opened,
try $ superscript opened,
try $ subscript opened,
try $ insert opened,
try $ math opened,
try $ footnoteRef opened,
try $ linebreak opened,
try $ symbol opened,
try $ rawInline opened,
try $ Djot.span opened,
try $ inlineAttribute opened,
plainText
]
image :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
image opened = error "todo"
link :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
link opened = error "todo"
autolink :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
autolink opened = error "todo"
verbatim :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
verbatim opened = error "todo"
emphasis :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
emphasis opened = error "todo"
highlight :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
highlight opened = error "todo"
superscript :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
superscript opene = error "todo"
subscript :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
subscript opened = error "todo"
insert :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
insert opened = error "todo"
delete :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
delete = error "todo"
math :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
math opened = error "todo"
footnoteRef :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
footnoteRef opened = error "todo"
linebreak :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
linebreak opened = error "todo"
symbol :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
symbol opened = error "todo"
rawInline :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
rawInline opened = error "todo"
span :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
span opened = error "todo"
inlineAttribute :: (Logger m, Characters s) => [OpenInline] -> Parser s m InlineText
inlineAttribute opened = error "todo"
plainText :: (Logger m, Characters s) => Parser s m InlineText
plainText = error "todo"

221
src/Djot/Lists.hs Normal file
View file

@ -0,0 +1,221 @@
{-# LANGUAGE OverloadedStrings #-}
-- This is written by Gemini barring minor adjustments made manually to fit
-- into the Parsing utilities and logger stuff
module Djot.Lists (List, djotList, TaskList, djotTaskList) where
import Data.Functor (void)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Void
import Logger (Logger (logCallStack, logDebug, logError))
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
import Utilities.Parsing
-- | Djot supports three types of ordered list delimiters.
data Delimiter = Period | RightParen | Enclosed -- e.g., '1.', '1)', '(1)'
deriving (Show, Eq)
-- | The style of the list marker.
data MarkerStyle
= Bullet Char -- '*', '-', '+'
| Decimal Delimiter -- '1'
| LowerAlpha Delimiter -- 'a'
| UpperAlpha Delimiter -- 'A'
| LowerRoman Delimiter -- 'i'
| UpperRoman Delimiter -- 'I'
deriving (Show, Eq)
-- | A single item in a list.
data ListItem = ListItem
{ itemStyle :: MarkerStyle,
itemIndent :: Int, -- Column where the block content starts
itemContent :: [Text] -- Simplified: usually this would be [Block]
}
deriving (Show, Eq)
-- | A full list is a collection of items sharing the exact same MarkerStyle.
data List = List MarkerStyle [ListItem]
deriving (Show, Eq)
-- | Parses whitespace but does not consume newlines.
sc :: (Logger m, Characters s) => Parser s m ()
sc = hidden (void $ takeWhileP (Just "space") (== ' '))
-- | Parses the punctuation after/around an ordered list number/letter.
-- E.g., takes a parser for the numeral (like `lowerRoman`) and applies delimiters.
orderedDelimiter :: (Logger m, Characters s) => Parser s m a -> Parser s m (a, Delimiter)
orderedDelimiter inner =
(try (between (char '(') (char ')') inner) >>= \x -> return (x, Enclosed))
<|> (try (inner <* char ')') >>= \x -> return (x, RightParen))
<|> (try (inner <* char '.') >>= \x -> return (x, Period))
-- | Parses a bullet marker.
bulletMarker :: (Logger m, Characters s) => Parser s m MarkerStyle
bulletMarker = Bullet <$> oneOf ['*', '-', '+']
-- | Decimals: '1.', '1)', '(1)'
decimalMarker :: (Logger m, Characters s) => Parser s m MarkerStyle
decimalMarker = Decimal . snd <$> orderedDelimiter L.decimal
-- | Lower/Upper Alpha: 'a.', 'A)', etc.
-- (Note: In a full implementation, you must ensure these are single letters)
lowerAlphaMarker :: (Logger m, Characters s) => Parser s m MarkerStyle
lowerAlphaMarker = LowerAlpha . snd <$> orderedDelimiter lowerChar
upperAlphaMarker :: (Logger m, Characters s) => Parser s m MarkerStyle
upperAlphaMarker = UpperAlpha . snd <$> orderedDelimiter upperChar
-- | Lower/Upper Roman
-- Uses a simplified roman numeral regex/parser for illustration.
romanLower :: (Logger m, Characters s) => Parser s m Text
romanLower = toText <$> takeWhile1P (Just "lower roman") (`elem` ['i', 'v', 'x', 'l', 'c', 'd', 'm'])
romanUpper :: (Logger m, Characters s) => Parser s m Text
romanUpper = toText <$> takeWhile1P (Just "upper roman") (`elem` ['I', 'V', 'X', 'L', 'C', 'D', 'M'])
lowerRomanMarker :: (Logger m, Characters s) => Parser s m MarkerStyle
lowerRomanMarker = LowerRoman . snd <$> orderedDelimiter romanLower
upperRomanMarker :: (Logger m, Characters s) => Parser s m MarkerStyle
upperRomanMarker = UpperRoman . snd <$> orderedDelimiter romanUpper
-- | Attempts to parse any valid Djot list marker.
listMarker :: (Logger m, Characters s) => Parser s m MarkerStyle
listMarker =
choice
[ bulletMarker,
decimalMarker,
lowerRomanMarker, -- Order matters here to prevent 'i.' parsing as LowerAlpha
lowerAlphaMarker,
upperRomanMarker,
upperAlphaMarker
]
-- | Parses a single list item.
listItem :: (Logger m, Characters s) => Parser s m ListItem
listItem = do
-- 1. Grab the current indentation level before the item
startPos <- L.indentLevel
-- 2. Parse the marker
marker <- listMarker
-- 3. Consume spaces between marker and content (mandatory in Djot)
void $ some (char ' ')
-- 4. Find the column where the actual content begins.
-- Djot relies on this column to group subsequent lines.
contentPos <- L.indentLevel
-- 5. Parse the block content based on indentation.
-- We use `L.indentBlock` or manually grab lines that are indented >= contentPos.
content <- some (indentedLine contentPos)
return $ ListItem marker (unPos contentPos) content
-- | Helper to grab a line that respects the item's indentation level.
indentedLine :: (Logger m, Characters s) => Pos -> Parser s m Text
indentedLine minIndent = do
currentIndent <- L.indentLevel
if currentIndent >= minIndent
then do
line <- takeWhileP (Just "character") (/= '\n')
void (char '\n') <|> eof
return $ toText line
else fail "Line not indented enough for list item"
-- | Parses a complete list by ensuring all subsequent items share the same MarkerStyle.
djotList :: (Logger m, Characters s) => Parser s m List
djotList = do
firstItem <- listItem
let style = itemStyle firstItem
-- Try to parse more items, but ONLY if they have the exact same style.
restItems <- many (try $ itemWithStyle style)
return $ List style (firstItem : restItems)
-- | Parses an item only if it matches the expected style.
itemWithStyle :: (Logger m, Characters s) => MarkerStyle -> Parser s m ListItem
itemWithStyle expectedStyle = do
item <- listItem
if itemStyle item == expectedStyle
then return item
else fail "List marker style mismatch"
-- | Represents the state of the task checkbox.
data TaskStatus = Unchecked | Checked
deriving (Show, Eq)
-- | A specific list item that is guaranteed to be a task.
data TaskListItem = TaskListItem
{ taskStyle :: MarkerStyle,
taskIndent :: Int,
taskStatus :: TaskStatus,
taskContent :: [Text]
}
deriving (Show, Eq)
-- | A list composed entirely of TaskListItems.
data TaskList = TaskList MarkerStyle [TaskListItem]
deriving (Show, Eq)
-- | Parses the Djot task checkbox and returns its status.
taskCheckbox :: (Logger m, Characters s) => Parser s m TaskStatus
taskCheckbox = do
void $ char '['
status <-
(char ' ' *> pure Unchecked)
<|> (oneOf ['x', 'X'] *> pure Checked)
void $ char ']'
-- Djot requires a space after the checkbox (or a newline if the item is blank)
void (char ' ') <|> lookAhead (void $ char '\n') <|> eof
pure status
-- | Parses a single task list item.
taskListItem :: (Logger m, Characters s) => Parser s m TaskListItem
taskListItem = do
_startPos <- L.indentLevel
-- 1. Parse the list marker (bullets or ordered)
marker <- listMarker
-- 2. Consume mandatory spaces after the marker
void $ some (char ' ')
-- 3. Capture the indentation level right HERE.
-- The '[' of the checkbox dictates the column that subsequent lines must respect.
contentPos <- L.indentLevel
-- 4. Parse the mandatory checkbox (if this fails, it's not a task list item)
status <- taskCheckbox
-- 5. Parse the content, ensuring continuation lines respect the `contentPos`
-- (Using the same `indentedLine` helper from the previous parser)
content <- some (indentedLine contentPos)
pure $ TaskListItem marker (unPos contentPos) status content
-- | Parses a complete task list, ensuring consistent marker styles.
djotTaskList :: (Logger m, Characters s) => Parser s m TaskList
djotTaskList = do
firstItem <- taskListItem
let style = taskStyle firstItem
restItems <- many (try $ taskItemWithStyle style)
pure $ TaskList style (firstItem : restItems)
-- | Helper to strictly enforce uniform marker styles across the task list.
taskItemWithStyle :: (Logger m, Characters s) => MarkerStyle -> Parser s m TaskListItem
taskItemWithStyle expectedStyle = do
item <- taskListItem
if taskStyle item == expectedStyle
then pure item
else fail "Task list marker style mismatch"

View file

@ -49,16 +49,16 @@ spaceSep = T.intercalate " "
elementToHTML :: Element -> T.Text
elementToHTML (Heading header attrs) = T.concat ["<h", tshow header.level, headerAttrs header attrs, ">", serializeInlineToHTML header.text, "</h", tshow header.level, ">"]
--
elementToHTML (Code code_block) = T.concat ["<pre class=\"sourceCode ", language, "\"><code class=\"sourceCode ", language, "\">", escapeText code_block.code, "</code>", "</pre>"]
elementToHTML (Code code_block attrs) = T.concat ["<pre class=\"sourceCode ", language, "\" ", handleAttrs attrs, "><code class=\"sourceCode ", language, "\">", escapeText code_block.code, "</code>", "</pre>"]
where
language = fromMaybe "" code_block.language
elementToHTML (BlockQuote (Q elems)) = T.concat ["<blockquote>", elementsToHTML elems, "</blockquote>"]
elementToHTML (BlockQuote (Q elems) attrs) = T.concat ["<blockquote ", handleAttrs attrs, ">", elementsToHTML elems, "</blockquote>"]
elementToHTML (List (L {list_type = Ordered {start_number, style}, items}) attrs) = spaceSep ["<ol", maybe "" handleStart start_number, maybe "" handleStyle style, ">", generateLiElems items, "</ol>"]
elementToHTML (List (L {list_type = Unordered {style}, items}) attrs) = spaceSep ["<ul", maybe "" handleStyle style, ">", generateLiElems items, "</ul>"]
elementToHTML (HTML (HTMLTag {html_content})) = html_content
elementToHTML (Paragraph (P snippets) attrs) = spaceSep ["<p", handleAttrs attrs, ">", serializeInlineToHTML snippets, "</p>"]
elementToHTML (Transparent snippets) = serializeInlineToHTML snippets
elementToHTML HorizontalRule = "<hr>"
elementToHTML (HorizontalRule attrs) = "<hr " <> handleAttrs attrs <> ">"
elementToHTML (Table _ _) = error "TODO"
elementToHTML (Container _ _) = error "TODO"
elementToHTML (Footnote _ _) = error "TODO"

View file

@ -138,7 +138,7 @@ fencedCodeBlock = between (string "```") (string "```") $ do
lineEnding
code <- T.pack <$> (many ((notFollowedBy $ string "```") *> anySingle))
let language = if language' == "" then Just language' else Nothing
pure $ Code $ C {language, code}
pure $ Code (C {language, code}) mempty
blockquoteBlock :: (Logger m, Characters s) => Parser s m Element
blockquoteBlock = do

View file

@ -6,11 +6,11 @@ module Utilities.Parsing where
import Data.String (IsString)
import Data.Text (Text, pack, unpack)
import Data.Void (Void)
import Text.Megaparsec (ParsecT, Stream, Token, Tokens)
import Text.Megaparsec (ParsecT, Stream, Token, Tokens, TraversableStream)
type Parser = ParsecT Void
class (Token s ~ Char, Stream s, ToText (Tokens s), ToText s, IsString (Tokens s), IsString s, Monoid (Tokens s), ToChar (Token s), Eq (Tokens s), Show s, Monoid s) => Characters s
class (Token s ~ Char, Stream s, ToText (Tokens s), ToText s, IsString (Tokens s), IsString s, Monoid (Tokens s), ToChar (Token s), Eq (Tokens s), Show s, Monoid s, TraversableStream s) => Characters s
class ToText t where
toText :: t -> Text