psb/src/Utilities/Javascript.hs
2026-01-03 21:08:26 -05:00

520 lines
23 KiB
Haskell

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Utilities.Javascript
( minify,
minifyTokens,
toTokens,
displayToken,
)
where
import Control.Applicative (Alternative (many, some), empty, optional, (<|>))
import Control.Monad.Trans.Class (MonadTrans (lift))
import Control.Monad.Trans.State (StateT, evalStateT, get, put)
import Data.Data (Proxy (Proxy))
import Data.Functor (void, (<&>))
import Data.Functor.Identity (Identity (Identity, runIdentity))
import Data.Maybe (maybeToList)
import Data.String (IsString (fromString))
import Data.Void (Void)
import Logger
import Text.Megaparsec (MonadParsec (notFollowedBy, try), ParseErrorBundle, ParsecT, Stream (tokensToChunk), anySingle, between, choice, parse, runParserT)
import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (binDigitChar, char, digitChar, eol, hexDigitChar, hspace, letterChar, newline, octDigitChar, string)
import Utilities.Parsing (Characters, ToChar (fromChar, toChar), ToText (fromText, toString, toText))
data Possibility = ExprAllowed | ExprNotAllowed deriving (Eq)
type Parser s m = ParsecT Void s (StateT Possibility m)
minify :: forall s. (Characters s, MP.VisualStream s, MP.TraversableStream s) => s -> s
minify src = foldMap displayToken $ minifyTokens $ case runIdentity ((toTokens "" src) :: Identity (Either (ParseErrorBundle s Void) [Token s])) of
Left e -> error $ "Attempt to tokenize javascript file failed with: " <> MP.errorBundlePretty e
Right v -> v
minifyTokens :: (Characters s) => [Token s] -> [Token s]
minifyTokens = reduce_identifiers . remove_redundants
where
-- need to figure out how to add State into this
reduce_identifiers = map $ \token -> case token of
Identifier name -> Identifier name
v -> v
-- this could also use state so I can remove redundant newlines
remove_redundants = filter $ \token -> case token of
WhiteSpace -> False
_ -> True
toTokens :: (Characters s, Logger m) => String -> s -> m (Either (ParseErrorBundle s Void) [Token s])
toTokens src stream = evalStateT (runParserT tokens src stream) ExprAllowed
displayToken :: (ToText s) => Token s -> s
displayToken WhiteSpace = fromText " "
displayToken LineTerminator = fromText "\n"
displayToken (Identifier i) = i
displayToken (HashBangComment text) = fromText ("#!" <> toText text)
displayToken (SingleLineComment text) = fromText ("//" <> toText text)
displayToken (MultiLineComment text) = fromText ("/*" <> toText text <> "*/")
displayToken (PrivateIdentifier i) = fromText ("#" <> toText i)
displayToken (ReservedWord r) = displayReserved r
displayToken (Literal l) = displayLiteral l
displayToken (Punc p) = displayPunc p
displayReserved :: (ToText s) => Reserved -> s
displayReserved Await = fromText "await"
displayReserved Break = fromText "break"
displayReserved Case = fromText "case"
displayReserved Catch = fromText "catch"
displayReserved Class = fromText "class"
displayReserved Const = fromText "const"
displayReserved Continue = fromText "continue"
displayReserved Debugger = fromText "debugger"
displayReserved Default = fromText "default"
displayReserved Delete = fromText "delete"
displayReserved Do = fromText "do"
displayReserved Else = fromText "else"
displayReserved Enum = fromText "enum"
displayReserved Export = fromText "export"
displayReserved Extends = fromText "extends"
displayReserved FalseVal = fromText "false"
displayReserved Finally = fromText "finally"
displayReserved For = fromText "for"
displayReserved Function = fromText "function"
displayReserved If = fromText "if"
displayReserved Import = fromText "import"
displayReserved In = fromText "in"
displayReserved Instanceof = fromText "instanceof"
displayReserved New = fromText "new"
displayReserved Null = fromText "null"
displayReserved Return = fromText "return"
displayReserved Super = fromText "super"
displayReserved Switch = fromText "switch"
displayReserved This = fromText "this"
displayReserved Throw = fromText "throw"
displayReserved TrueVal = fromText "true"
displayReserved Try = fromText "try"
displayReserved Typeof = fromText "typeof"
displayReserved Var = fromText "var"
displayReserved Void = fromText "void"
displayReserved While = fromText "while"
displayReserved With = fromText "with"
displayReserved Yield = fromText "yield"
displayLiteral :: (ToText s) => Literal s -> s
displayLiteral (Number num) = num
displayLiteral (String s) = fromText $ "\"" <> toText s <> "\""
displayLiteral (Regex {body, flags}) = fromText $ "/" <> toText body <> "/" <> toText flags
displayLiteral (TemplateFragment frag) = displayTemplateFrag frag
displayTemplateFrag :: (ToText s) => TemplateFragment s -> s
displayTemplateFrag (NoSub s) = fromText $ "`" <> toText s <> "`"
displayTemplateFrag (TemplateHead s) = fromText $ "`" <> toText s <> "${"
displayTemplateFrag (TemplateMiddle s) = fromText $ "}" <> toText s <> "${"
displayTemplateFrag (TemplateTail s) = fromText $ "}" <> toText s <> "`"
displayPunc :: (ToText s) => Punctuator -> s
displayPunc Add = fromText "+"
displayPunc Sub = fromText "-"
displayPunc Mult = fromText "*"
displayPunc Div = fromText "/"
displayPunc Mod = fromText "%"
displayPunc Exp = fromText "**"
displayPunc Inc = fromText "++"
displayPunc Dec = fromText "--"
displayPunc Utilities.Javascript.LT = fromText "<"
displayPunc Utilities.Javascript.GT = fromText ">"
displayPunc LTEQ = fromText "<="
displayPunc GTEQ = fromText ">="
displayPunc DoubleEqual = fromText "=="
displayPunc NotEqual = fromText "!="
displayPunc TripleEqual = fromText "==="
displayPunc DoubleNotEqual = fromText "!=="
displayPunc LeftShift = fromText "<<"
displayPunc RightShift = fromText ">>"
displayPunc UnsignedRightShift = fromText ">>>"
displayPunc BitwiseAnd = fromText "&"
displayPunc BitwiseOr = fromText "|"
displayPunc BitwiseXor = fromText "^"
displayPunc BitwiseNot = fromText "~"
displayPunc LogicalAnd = fromText "&&"
displayPunc LogicalOr = fromText "||"
displayPunc LogicalNot = fromText "!"
displayPunc Nullish = fromText "??"
displayPunc Assign = fromText "="
displayPunc AddAssign = fromText "+="
displayPunc SubAssign = fromText "-="
displayPunc MultAssign = fromText "*="
displayPunc DivAssign = fromText "/="
displayPunc ModAssign = fromText "%="
displayPunc ExpAssign = fromText "**="
displayPunc LeftShiftAssign = fromText "<<="
displayPunc RightShiftAssign = fromText ">>="
displayPunc UnsignedRightShiftAssign = fromText ">>>="
displayPunc BitwiseAndAssign = fromText "&="
displayPunc BitwiseOrAssign = fromText "|="
displayPunc BitwiseXorAssign = fromText "^="
displayPunc LogicalAndAssign = fromText "&&="
displayPunc LogicalOrAssign = fromText "||="
displayPunc NullishAssign = fromText "??="
displayPunc LParen = fromText "("
displayPunc RParen = fromText ")"
displayPunc LCurly = fromText "{"
displayPunc RCurly = fromText "}"
displayPunc LSquare = fromText "["
displayPunc RSquare = fromText "]"
displayPunc Dot = fromText "."
displayPunc Spread = fromText "..."
displayPunc Semicolon = fromText ";"
displayPunc Comma = fromText ","
displayPunc OptionalChain = fromText "?."
-- yeah I guess I'm making a javascript tokenizer
-- s is either Text or String
-- Regex will be tokenized
data Token s
= WhiteSpace
| LineTerminator
| SingleLineComment s
| MultiLineComment s
| HashBangComment s
| Identifier s
| PrivateIdentifier s
| ReservedWord Reserved
| Literal (Literal s)
| Punc Punctuator
deriving (Eq)
data Reserved = Await | Break | Case | Catch | Class | Const | Continue | Debugger | Default | Delete | Do | Else | Enum | Export | Extends | FalseVal | Finally | For | Function | If | Import | In | Instanceof | New | Null | Return | Super | Switch | This | Throw | TrueVal | Try | Typeof | Var | Void | While | With | Yield deriving (Eq)
data Literal s = Number s | String s | Regex {body :: s, flags :: s} | TemplateFragment (TemplateFragment s) deriving (Eq)
data TemplateFragment s = NoSub s | TemplateHead s | TemplateMiddle s | TemplateTail s deriving (Eq)
data Punctuator = Add | Sub | Mult | Div | Mod | Exp | Inc | Dec | LT | GT | LTEQ | GTEQ | DoubleEqual | NotEqual | TripleEqual | DoubleNotEqual | LeftShift | RightShift {- >>> -} | UnsignedRightShift | BitwiseAnd | BitwiseOr | BitwiseXor | BitwiseNot | LogicalAnd | LogicalOr | LogicalNot {- ?? -} | Nullish | Assign | AddAssign | SubAssign | MultAssign | DivAssign | ModAssign | ExpAssign | LeftShiftAssign | RightShiftAssign | UnsignedRightShiftAssign | BitwiseAndAssign | BitwiseOrAssign | BitwiseXorAssign | LogicalAndAssign | LogicalOrAssign | NullishAssign | LParen | RParen | LCurly | RCurly | LSquare | RSquare | Dot | Spread | Semicolon | Comma | OptionalChain deriving (Eq)
tokens :: (Logger m, Characters s) => Parser s m [Token s]
tokens = do
hashbang <- (optional hashbang_comment)
tokens <- many token
pure $ (maybeToList hashbang) ++ tokens
exprAllowed :: (Stream s, Monad m) => Parser s m ()
exprAllowed = lift $ put ExprAllowed
exprNotAllowed :: (Stream s, Monad m) => Parser s m ()
exprNotAllowed = lift $ put ExprNotAllowed
exprNoop :: (Stream s, Monad m) => String -> Parser s m ()
-- string arg is just as a comment
exprNoop _ = pure ()
-- INFO: read https://github.com/jquery/esprima/blob/main/src/scanner.ts
-- and https://github.com/acornjs/acorn/blob/master/acorn/src/tokenize.js
-- specific logic at https://github.com/acornjs/acorn/blob/54097dcf8c08733695df7168692d0faac3a2f768/acorn/src/tokencontext.js#L92
-- https://astexplorer.net/
-- atm this is guesswork
token :: (Logger m, Characters s) => Parser s m (Token s)
token =
choice
[ try comment
<* exprNoop "comments don't change whether an expression is allowed or not",
try reserved_word
<* exprNoop "each reserved word can be different so it needs to be handled in there",
-- discovered when I realized yield can be an identifier and doesn't let you get away with nonsense
try identifier <* exprNotAllowed,
-- Assuming it's the same as identifier
try private_identifier <* exprNotAllowed,
-- briefly was concerned about {} but then realized that isn't a literal
try literal <* exprNotAllowed,
-- handled on a case by case basis
try punctuator <* exprNoop "Decided on a case by case basis",
try linebreak <* exprNoop "technically wrong due to semicolon insertion but hopefully that never comes up usage for this",
whitespace <* exprNoop "non linebreak whitespace doesn't change whether an expression is allowed or not, same as comments"
]
hashbang_comment :: (Logger m, Characters s) => Parser s m (Token s)
hashbang_comment = do
string "#!"
text <- many ((notFollowedBy newline) *> anySingle)
pure $ HashBangComment $ fromString text
comment :: (Logger m, Characters s) => Parser s m (Token s)
comment = (try singleline_com) <|> multiline_com
where
singleline_com = do
string "//"
text <- many ((notFollowedBy newline) *> anySingle)
pure $ SingleLineComment $ fromString text
multiline_com = do
string "/*"
text <- many ((notFollowedBy $ string "*/") *> anySingle)
pure $ MultiLineComment $ fromString text
reserved_word :: (Logger m, Characters s) => Parser s m (Token s)
reserved_word =
choice
[ try await <* exprAllowed,
try break <* exprNotAllowed,
try case_ <* exprAllowed,
try catch_ <* exprAllowed, -- literally invalid this is pure guess work
try class_ <* exprAllowed, -- literally invalid this is pure guess work
try const <* exprAllowed, -- literally invalid this is pure guess work
try continue <* exprNotAllowed,
try debugger <* exprNotAllowed,
try default_ <* exprNotAllowed, -- literally invalid this is pure guess work
try delete <* exprAllowed,
try do_ <* exprNotAllowed,
try if_ <* exprAllowed, -- literally invalid this is pure guess work
try else_ <* exprAllowed, -- literally invalid this is pure guess work
try enum <* exprNotAllowed, -- literally invalid this is pure guess work
try export <* exprNotAllowed, -- literally invalid this is pure guess work
try extends <* exprAllowed,
try false <* exprNotAllowed,
try finally_ <* exprNotAllowed, -- literally invalid this is pure guess work
try for_ <* exprNotAllowed, -- literally invalid this is pure guess work
try function <* exprNotAllowed, -- literally invalid this is pure guess work
try import_ <* exprNotAllowed, -- literally invalid this is pure guess work
try in_ <* exprAllowed,
try instanceof <* exprAllowed,
try new <* exprAllowed,
try null <* exprNotAllowed,
try return <* exprAllowed,
try super <* exprNotAllowed,
try switch <* exprNotAllowed,
try this <* exprNotAllowed,
try throw_ <* exprAllowed,
try true <* exprNotAllowed,
try try_ <* exprNotAllowed, -- literally invalid this is pure guess work
try typeof <* exprAllowed,
try var <* exprNotAllowed,
try void <* exprNotAllowed, -- literally invalid this is pure guess work
try while <* exprNotAllowed, -- literally invalid this is pure guess work
try with <* exprNotAllowed, -- literally invalid this is pure guess work
yield <* exprNotAllowed
]
where
await = string "await" *> pure (ReservedWord Await)
break = string "break" *> pure (ReservedWord Break)
case_ = string "case" *> pure (ReservedWord Case)
catch_ = string "catch" *> pure (ReservedWord Catch)
class_ = string "class" *> pure (ReservedWord Class)
const = string "const" *> pure (ReservedWord Const)
continue = string "continue" *> pure (ReservedWord Continue)
debugger = string "debugger" *> pure (ReservedWord Debugger)
default_ = string "default" *> pure (ReservedWord Default)
delete = string "delete" *> pure (ReservedWord Delete)
do_ = string "do" *> pure (ReservedWord Do)
else_ = string "else" *> pure (ReservedWord Else)
enum = string "enum" *> pure (ReservedWord Enum)
export = string "export" *> pure (ReservedWord Export)
extends = string "extends" *> pure (ReservedWord Extends)
false = string "false" *> pure (ReservedWord FalseVal)
finally_ = string "finally" *> pure (ReservedWord Finally)
for_ = string "for" *> pure (ReservedWord For)
function = string "function" *> pure (ReservedWord Function)
if_ = string "if" *> pure (ReservedWord If)
import_ = string "import" *> pure (ReservedWord Import)
in_ = string "in" *> pure (ReservedWord In)
instanceof = string "instanceof" *> pure (ReservedWord Instanceof)
new = string "new" *> pure (ReservedWord New)
null = string "null" *> pure (ReservedWord Null)
return = string "return" *> pure (ReservedWord Return)
super = string "super" *> pure (ReservedWord Super)
switch = string "switch" *> pure (ReservedWord Switch)
this = string "this" *> pure (ReservedWord This)
throw_ = string "throw" *> pure (ReservedWord Throw)
true = string "true" *> pure (ReservedWord TrueVal)
try_ = string "try" *> pure (ReservedWord Try)
typeof = string "typeof" *> pure (ReservedWord Typeof)
var = string "var" *> pure (ReservedWord Var)
void = string "void" *> pure (ReservedWord Void)
while = string "while" *> pure (ReservedWord While)
with = string "with" *> pure (ReservedWord With)
yield = string "yield" *> pure (ReservedWord Yield)
identifier :: forall s m. (Logger m, Characters s) => Parser s m (Token s)
identifier = do
first <- start_char
rem <- many rem_char
pure $ Identifier $ fromString (first : rem)
where
start_char :: Parser s m (MP.Token s)
start_char = (char '$') <|> char '_' <|> letterChar
rem_char :: Parser s m (MP.Token s)
rem_char = start_char <|> digitChar
private_identifier :: forall s m. (Logger m, Characters s) => Parser s m (Token s)
private_identifier =
char '#'
*> identifier
<&> \(Identifier i) -> PrivateIdentifier i
literal :: forall s m. (Logger m, Characters s) => Parser s m (Token s)
literal =
Literal
<$> ( choice
[ try template_fragment,
try string_lit,
num_lit
]
)
where
template_fragment :: Parser s m (Literal s)
template_fragment = TemplateFragment <$> (choice [try head_temp_frag, try mid_temp_frag, try tail_temp_frag, no_sub_temp_frag])
no_sub_temp_frag = do
char '`'
contents <- many template_char
char '`'
pure $ NoSub $ fromText $ mconcat $ map toText $ contents
head_temp_frag :: Parser s m (TemplateFragment s)
head_temp_frag = do
char '`'
contents <- many template_char
string "${"
pure $ TemplateHead $ fromText $ mconcat $ map toText $ contents
mid_temp_frag = do
char '}'
contents <- many template_char
string "${"
pure $ TemplateMiddle $ fromText $ mconcat $ map toText $ contents
tail_temp_frag = do
char '}'
contents <- many template_char
char '`'
pure $ TemplateTail $ fromText $ mconcat $ map toText $ contents
template_char :: Parser s m s
template_char =
fromText . toText
<$> choice
[ try (string "$" <* (notFollowedBy $ char '{')),
try escape_seq,
try ((optional $ char '\\') *> (eol)),
-- I'm sure this is doable without do but do makes it much easier
do
notFollowedBy (choice [void linebreak, void $ char '`', void $ char '\\', void $ char '$'])
c <- source_char
pure $ fromString $ c : []
]
source_char = anySingle
escape_seq = do
char '\\'
ret <- anySingle
pure $ fromString ('\\' : [ret])
num_lit = Number <$> (choice [try legacy_oct, try decimal_bigint, try decimal_literal, try hex_int, try oct_int, try bin_int, zero])
zero = char '0' *> pure "0"
decimal_literal = fromString <$> some (digitChar <|> char '_')
decimal_bigint = do
most <- decimal_literal
char 'n'
pure $ fromText $ toText most <> "n"
legacy_oct = char '0' *> (fromString <$> some (octDigitChar <|> char '_'))
oct_int = char '0' *> (char 'o' <|> char 'O') *> (fromString <$> some (octDigitChar <|> char '_'))
hex_int = char '0' *> (char 'x' <|> char 'X') *> (fromString <$> some (hexDigitChar <|> char '_'))
bin_int = char '0' *> (char 'b' <|> char 'B') *> (fromString <$> some (binDigitChar <|> char '_'))
string_lit = String <$> ((try $ g_string '\'') <|> g_string '"')
-- turn from [String] to String to s
g_string :: Char -> Parser s m s
g_string c = mconcat <$> between (char c) (char c) (many $ str_char c)
str_char :: Char -> Parser s m s
str_char c = choice [try escape_seq, try line_continuation, basic_str_char c]
basic_str_char :: Char -> Parser s m s
basic_str_char c = fromString . (: []) <$> (notFollowedBy ((void linebreak) <|> (void $ char '\\') <|> (void $ char c)) *> anySingle)
line_continuation :: Parser s m s
line_continuation = do
char '\\'
c <- linebreak
-- technically wrong but I'm lazy
pure $ fromText "\\\n"
fslash_handler :: forall s m. (Logger m, Characters s) => Parser s m (Token s)
fslash_handler = do
allowed <- lift $ get
let re = case allowed of
ExprNotAllowed -> empty
ExprAllowed -> regex_literal
choice [try re, try division_assign, division]
where
regex_literal :: Parser s m (Token s)
regex_literal = do
char '/'
error "TODO"
pure $ Literal $ Regex {}
division_assign :: Parser s m (Token s)
division_assign = (string "/=") *> (pure $ Punc $ DivAssign :: Parser s m (Token s))
division :: Parser s m (Token s)
division = char '/' *> (pure $ Punc $ Div :: Parser s m (Token s))
punctuator :: (Logger m, Characters s) => Parser s m (Token s)
punctuator =
Punc
<$> ( choice
[ try $ string ">>>=" *> pure UnsignedRightShiftAssign <* exprAllowed,
try $ string "..." *> pure Spread <* exprNotAllowed,
try $ string "===" *> pure TripleEqual <* exprAllowed,
try $ string "!==" *> pure DoubleNotEqual <* exprAllowed,
try $ string "<<=" *> pure LeftShiftAssign <* exprAllowed,
try $ string ">>=" *> pure RightShiftAssign <* exprAllowed,
try $ string ">>>" *> pure UnsignedRightShift <* exprAllowed,
try $ string "**=" *> pure ExpAssign <* exprAllowed,
try $ string "&&=" *> pure LogicalAndAssign <* exprAllowed,
try $ string "||=" *> pure LogicalOrAssign <* exprAllowed,
try $ string "??=" *> pure NullishAssign <* exprAllowed,
-- best effort guess based on my usage that it'll always be postcrement
-- Shouldn't come up in my use case though
try $ string "++" *> pure Inc <* exprNotAllowed,
try $ string "--" *> pure Dec <* exprNotAllowed,
try $ string "?." *> (notFollowedBy digitChar) *> pure OptionalChain <* exprNotAllowed,
try $ string "**" *> pure Exp <* exprAllowed,
try $ string "<=" *> pure LTEQ <* exprAllowed,
try $ string ">=" *> pure GTEQ <* exprAllowed,
try $ string "==" *> pure DoubleEqual <* exprAllowed,
try $ string "!=" *> pure NotEqual <* exprAllowed,
try $ string "<<" *> pure LeftShift <* exprAllowed,
try $ string ">>" *> pure RightShift <* exprAllowed,
try $ string "+=" *> pure AddAssign <* exprAllowed,
try $ string "-=" *> pure SubAssign <* exprAllowed,
try $ string "*=" *> pure MultAssign <* exprAllowed,
try $ string "%=" *> pure ModAssign <* exprAllowed,
try $ string "&=" *> pure BitwiseAndAssign <* exprAllowed,
try $ string "|=" *> pure BitwiseOrAssign <* exprAllowed,
try $ string "^=" *> pure BitwiseXorAssign <* exprAllowed,
try $ string "&&" *> pure LogicalAnd <* exprAllowed,
try $ string "||" *> pure LogicalOr <* exprAllowed,
try $ string "??" *> pure Nullish <* exprAllowed,
char '+' *> pure Add <* exprAllowed,
char '-' *> pure Sub <* exprAllowed,
char '*' *> pure Mult <* exprAllowed,
char '%' *> pure Mod <* exprAllowed,
char '<' *> pure Utilities.Javascript.LT <* exprAllowed,
char '>' *> pure Utilities.Javascript.GT <* exprAllowed,
char '&' *> pure BitwiseAnd <* exprAllowed,
char '|' *> pure BitwiseOr <* exprAllowed,
char '^' *> pure BitwiseXor <* exprAllowed,
char '~' *> pure BitwiseNot <* exprAllowed,
char '=' *> pure Assign <* exprAllowed,
char ';' *> pure Semicolon <* exprAllowed,
char ',' *> pure Comma <* exprAllowed,
char '!' *> pure LogicalNot <* exprAllowed,
-- Note: parens and curlies are unambiguously ambiguous
-- Opening ones will generally allow an expression and closing ones will generally not allow an expression
char '(' *> pure LParen <* exprAllowed,
char ')' *> pure RParen <* exprNotAllowed,
char '{' *> pure LCurly <* exprAllowed,
char '}' *> pure RCurly <* exprNotAllowed,
char '[' *> pure LSquare <* exprNotAllowed,
char ']' *> pure RSquare <* exprNotAllowed,
char '.' *> pure Dot <* exprNotAllowed
]
)
linebreak :: (Logger m, Characters s) => Parser s m (Token s)
linebreak = newline *> pure LineTerminator
whitespace :: (Logger m, Characters s) => Parser s m (Token s)
whitespace = hspace *> pure WhiteSpace