more js tokenization work

This commit is contained in:
Pagwin 2025-12-28 20:38:13 -05:00
parent 0f44431086
commit 7df40c237c
No known key found for this signature in database
GPG key ID: 81137023740CA260

View file

@ -10,12 +10,14 @@ where
import Control.Applicative (Alternative (many), optional, (<|>))
import Data.Data (Proxy (Proxy))
import Data.Functor ((<&>))
import Data.Maybe (maybeToList)
import Data.String (IsString (fromString))
import Data.Void (Void)
import Logger
import Text.Megaparsec (MonadParsec (notFollowedBy, try), ParseErrorBundle, Stream (tokensToChunk), anySingle, choice, parse)
import Text.Megaparsec.Char (hspace, newline, string)
import qualified Text.Megaparsec as MP
import Text.Megaparsec.Char (char, digitChar, hspace, letterChar, newline, string)
import Utilities.Parsing
minify :: (Characters s) => [Token s] -> [Token s]
@ -30,8 +32,8 @@ minify = reduce_identifiers . remove_redundants
WhiteSpace -> False
_ -> True
-- | Run the 'tokens' parser over an input stream.
--
-- The first argument is handed unchanged to 'parse' (megaparsec uses it as
-- the source name in rendered parse errors); the second is the input to
-- tokenize. The result is either the parse error bundle or the token list.
toTokens :: (Characters s) => String -> s -> Either (ParseErrorBundle s Void) [Token s]
toTokens = parse tokens
-- | Going by its name and type, intended to turn a token into its textual
-- form. Not implemented yet: for every token the result is @error "TODO"@.
displayToken :: (ToText s) => Token s -> s
displayToken _ = error "TODO"
@ -50,14 +52,15 @@ data Token s
| ReservedWord Reserved
| Literal (Literal s)
| Punc Punctuator
deriving (Eq)
-- | Keywords carried by the 'ReservedWord' token constructor; the
-- @reserved_word@ parser produces them (e.g. @"with"@ becomes 'With').
--
-- NOTE(review): 'FalseVal', 'TrueVal' and 'Null' presumably stand for the
-- @false@, @true@ and @null@ keywords -- confirm against @reserved_word@.
data Reserved
  = Await
  | Break
  | Case
  | Catch
  | Class
  | Const
  | Continue
  | Debugger
  | Default
  | Delete
  | Do
  | Else
  | Enum
  | Export
  | Extends
  | FalseVal
  | Finally
  | For
  | Function
  | If
  | Import
  | In
  | Instanceof
  | New
  | Null
  | Return
  | Super
  | Switch
  | This
  | Throw
  | TrueVal
  | Try
  | Typeof
  | Var
  | Void
  | While
  | With
  | Yield
  deriving (Eq)
-- | Payload of the 'Literal' token constructor, parameterised over the text
-- type @s@.
--
-- 'Number', 'String' and 'Regex' each carry a value of type @s@. The
-- 'TemplateFragment' constructor wraps a value of the separate
-- 'TemplateFragment' type of the same name.
data Literal s
  = Number s
  | String s
  | Regex s
  | TemplateFragment (TemplateFragment s)
  deriving (Eq)
-- | One piece of a template literal, each carrying a value of type @s@.
--
-- NOTE(review): the constructor names presumably mirror the ECMAScript
-- lexical tokens NoSubstitutionTemplate, TemplateHead, TemplateMiddle and
-- TemplateTail -- confirm once the template parser is written.
data TemplateFragment s
  = NoSub s
  | TemplateHead s
  | TemplateMiddle s
  | TemplateTail s
  deriving (Eq)
-- | Operators and delimiters carried by the 'Punc' token constructor.
--
-- The trailing comment on each constructor is the source spelling that the
-- @punctuator@ parser maps to it. 'LT' and 'GT' share their names with the
-- Prelude's 'Ordering' constructors, which is why the parser refers to them
-- qualified.
data Punctuator
  = Add -- `+`
  | Sub -- `-`
  | Mult -- `*`
  | Div -- presumably `/`; no alternative in @punctuator@, see the TODO in @fslash_handler@
  | Mod -- `%`
  | Exp -- `**`
  | Inc -- `++`
  | Dec -- `--`
  | LT -- `<`
  | GT -- `>`
  | LTEQ -- `<=`
  | GTEQ -- `>=`
  | DoubleEqual -- `==`
  | NotEqual -- `!=`
  | TripleEqual -- `===`
  | DoubleNotEqual -- `!==`
  | LeftShift -- `<<`
  | RightShift -- `>>`
  | UnsignedRightShift -- `>>>`
  | BitwiseAnd -- `&`
  | BitwiseOr -- `|`
  | BitwiseXor -- `^`
  | BitwiseNot -- `~`
  | LogicalAnd -- `&&`
  | LogicalOr -- `||`
  | LogicalNot -- `!`
  | Nullish -- `??`
  | Assign -- `=`
  | AddAssign -- `+=`
  | SubAssign -- `-=`
  | MultAssign -- `*=`
  | DivAssign -- presumably `/=`; no alternative in @punctuator@, see the TODO in @fslash_handler@
  | ModAssign -- `%=`
  | ExpAssign -- `**=`
  | LeftShiftAssign -- `<<=`
  | RightShiftAssign -- `>>=`
  | UnsignedRightShiftAssign -- `>>>=`
  | BitwiseAndAssign -- `&=`
  | BitwiseOrAssign -- `|=`
  | BitwiseXorAssign -- `^=`
  | LogicalAndAssign -- `&&=`
  | LogicalOrAssign -- `||=`
  | NullishAssign -- `??=`
  | LParen -- `(`
  | RParen -- `)`
  | LCurly -- `{`
  | RCurly -- `}`
  | LSquare -- `[`
  | RSquare -- `]`
  | Dot -- `.`
  | Spread -- `...`
  | Semicolon -- `;`
  | Comma -- `,`
  | OptionalChain -- `?.`
  deriving (Eq)
tokens :: (Logger m, Characters s) => Parser s m [Token s]
tokens = do
@ -138,26 +141,103 @@ reserved_word = choice [try await, try break, try case_, try catch_, try class_,
with = string "with" *> pure (ReservedWord With)
yield = string "yield" *> pure (ReservedWord Yield)
-- | Parse an identifier: one start character followed by zero or more
-- continuation characters, packed into an 'Identifier' token.
--
-- The explicit @forall@ brings @s@ and @m@ into scope for the local
-- signatures in the @where@ clause (this needs @ScopedTypeVariables@).
--
-- NOTE(review): @fromString (first : rest)@ only type-checks when
-- @MP.Token s@ is 'Char'; presumably the @Characters@ class guarantees
-- that -- confirm in @Utilities.Parsing@.
identifier :: forall s m. (Logger m, Characters s) => Parser s m (Token s)
identifier = do
  first <- start_char
  -- Named `rest` rather than `rem` so the Prelude's `rem` is not shadowed.
  rest <- many rem_char
  pure $ Identifier $ fromString (first : rest)
  where
    -- A dollar sign, an underscore, or anything 'letterChar' accepts.
    start_char :: Parser s m (MP.Token s)
    start_char = char '$' <|> char '_' <|> letterChar
    -- Any start character, or a digit.
    rem_char :: Parser s m (MP.Token s)
    rem_char = start_char <|> digitChar
-- | Parse a private name: a @#@ immediately followed by an identifier,
-- returned as a 'PrivateIdentifier' token carrying the identifier's text
-- (the @#@ itself is not included).
private_identifier :: (Logger m, Characters s) => Parser s m (Token s)
private_identifier = do
  _ <- char '#'
  tok <- identifier
  -- 'identifier' only ever builds 'Identifier', but matching with a total
  -- `case` avoids a partial lambda pattern: if that ever changes, the
  -- result is an ordinary parse failure instead of a runtime
  -- pattern-match exception.
  case tok of
    Identifier name -> pure (PrivateIdentifier name)
    _ -> fail "private_identifier: expected an identifier after '#'"
-- | Parse a literal and wrap it in the 'Literal' token constructor.
--
-- Alternatives are tried in order: template fragment, string literal,
-- numeric literal. The first two are wrapped in 'try', so a failed attempt
-- rewinds the input before the next alternative runs.
--
-- NOTE: every leaf parser in the @where@ clause is still an
-- @error "TODO"@ placeholder, so this parser cannot succeed yet.
literal :: (Logger m, Characters s) => Parser s m (Token s)
literal =
  Literal
    <$> choice
      [ try template_fragment,
        try string_lit,
        num_lit
      ]
  where
    template_fragment = TemplateFragment <$> error "TODO"
    string_lit = String <$> error "TODO"
    -- Numeric forms, tried in the listed order; every alternative except
    -- the last is wrapped in 'try'.
    num_lit = Number <$> choice [try decimal_literal, try decimal_bigint, try plain_bigint, try normal_integer, octal_int]
    decimal_literal = error "TODO"
    decimal_bigint = error "TODO"
    plain_bigint = error "TODO"
    normal_integer = error "TODO"
    octal_int = error "TODO"
-- | Placeholder for the parser that, per its own TODO message, is meant to
-- cover regex literals, division and division assignment. Not implemented
-- yet: the definition is a bare 'error' call.
fslash_handler :: (Logger m, Characters s) => Parser s m (Token s)
fslash_handler = error "TODO: Regex literal, division and division assignment"
-- | Parse one punctuator and wrap it in the 'Punc' token constructor.
--
-- The table is order-sensitive: 'choice' takes the first alternative that
-- succeeds, so every operator is listed before any operator that is a
-- prefix of it (four-character forms, then three, then two, then single
-- characters). Do not reorder entries without re-checking that property.
--
-- Anything starting with @/@ is absent on purpose; the TODO in
-- @fslash_handler@ names division and division assignment.
--
-- NOTE(review): there is no alternative here (and no 'Punctuator'
-- constructor) for a bare @?@, for @:@, or for @=>@. Input containing
-- those will not be tokenized by this parser -- confirm whether another
-- parser is meant to cover them.
punctuator :: (Logger m, Characters s) => Parser s m (Token s)
punctuator =
  Punc
    <$> choice
      [ try $ string ">>>=" *> pure UnsignedRightShiftAssign,
        try $ string "..." *> pure Spread,
        try $ string "===" *> pure TripleEqual,
        try $ string "!==" *> pure DoubleNotEqual,
        try $ string "<<=" *> pure LeftShiftAssign,
        try $ string ">>=" *> pure RightShiftAssign,
        try $ string ">>>" *> pure UnsignedRightShift,
        try $ string "**=" *> pure ExpAssign,
        try $ string "&&=" *> pure LogicalAndAssign,
        try $ string "||=" *> pure LogicalOrAssign,
        try $ string "??=" *> pure NullishAssign,
        -- "?." directly followed by a digit is rejected here, so the
        -- 'try' rewinds and "?." is not consumed as an optional chain.
        try $ string "?." *> notFollowedBy digitChar *> pure OptionalChain,
        try $ string "**" *> pure Exp,
        try $ string "++" *> pure Inc,
        try $ string "--" *> pure Dec,
        try $ string "<=" *> pure LTEQ,
        try $ string ">=" *> pure GTEQ,
        try $ string "==" *> pure DoubleEqual,
        try $ string "!=" *> pure NotEqual,
        try $ string "<<" *> pure LeftShift,
        try $ string ">>" *> pure RightShift,
        try $ string "+=" *> pure AddAssign,
        try $ string "-=" *> pure SubAssign,
        try $ string "*=" *> pure MultAssign,
        try $ string "%=" *> pure ModAssign,
        try $ string "&=" *> pure BitwiseAndAssign,
        try $ string "|=" *> pure BitwiseOrAssign,
        try $ string "^=" *> pure BitwiseXorAssign,
        try $ string "&&" *> pure LogicalAnd,
        try $ string "||" *> pure LogicalOr,
        try $ string "??" *> pure Nullish,
        char '+' *> pure Add,
        char '-' *> pure Sub,
        char '*' *> pure Mult,
        char '%' *> pure Mod,
        -- Qualified because the Prelude also exports constructors named
        -- LT and GT (from 'Ordering').
        char '<' *> pure Utilities.Javascript.LT,
        char '>' *> pure Utilities.Javascript.GT,
        char '&' *> pure BitwiseAnd,
        char '|' *> pure BitwiseOr,
        char '^' *> pure BitwiseXor,
        char '~' *> pure BitwiseNot,
        char '=' *> pure Assign,
        char '(' *> pure LParen,
        char ')' *> pure RParen,
        char '{' *> pure LCurly,
        char '}' *> pure RCurly,
        char '[' *> pure LSquare,
        char ']' *> pure RSquare,
        char '.' *> pure Dot,
        char ';' *> pure Semicolon,
        char ',' *> pure Comma,
        char '!' *> pure LogicalNot
      ]
-- | Consume a single newline and report it as a 'WhiteSpace' token.
linebreak :: (Logger m, Characters s) => Parser s m (Token s)
linebreak = WhiteSpace <$ newline