{-# LANGUAGE OverloadedStrings #-} module Lexer ( space, lexeme, symbol , number , identifier , charLiteral, stringLiteral , surroundWith, parens, brackets, sqBrackets ) where import Text.Megaparsec.Char ( space1, char, hexDigitChar, letterChar ) import qualified Text.Megaparsec.Char.Lexer as L import Text.Megaparsec import Parser.Util import Data.Scientific (Scientific) import Numeric (readHex) import Data.Text (Text, cons) import Data.Char (isAlphaNum) -- | Parses whitespace. space :: Parser () space = L.space space1 (L.skipLineComment "//") (L.skipBlockComment "/*" "*/") -- | Wrapper for lexemes. lexeme :: Parser a -> Parser a lexeme = L.lexeme space symbol :: Text -> Parser Text symbol = L.symbol space -- -- | Parse a number with an optional sign. -- signed :: Num a => Parser a -> Parser a -- signed = L.signed space -- | Parse a number. This number can have arbitrary integer and fractional digits. number :: Parser Scientific number = lexeme L.scientific -- TODO: parse binary and hexadecimal digits. identifier :: Parser Text identifier = lexeme $ cons <$> (letterChar <|> char '_') <*> takeWhileP (Just "letter") (\c -> isAlphaNum c || c == '_') -- | Parse an escape sequence. escapedChar :: Parser Char escapedChar = char '\\' *> choice [ '"' <$ char '"' -- Escaped double quote. , '\'' <$ char '\'' -- Escaped single quote. , '\\' <$ char '\\' -- Escaped backslash. , '\n' <$ char 'n' -- Newline. , '\t' <$ char 't' -- Tab. , '\r' <$ char 'r' -- Carriage return. , '\f' <$ char 'f' -- Form feed. , '\b' <$ char 'b' -- Backspace. , '\v' <$ char 'v' -- Vertical tab. , unicodeEscape -- Unicode escape sequences. ] -- TODO: better error message (say "not a valid escape sequence" instead of "unexpected... expecting ...") -- | Parse an escape sequence for unicode characters. unicodeEscape :: Parser Char unicodeEscape = check . readHex =<< (char 'u' *> count 4 hexDigitChar) where check [(n, "")] = return $ toEnum n check _ = fail "Invalid Unicode escape sequence" -- | Parse a string literal. stringLiteral :: Parser String stringLiteral = char '"' *> manyTill (escapedChar <|> noneOf ['\\', '"'] "character") (char '"') -- | Parse a character literal. charLiteral :: Parser Char charLiteral = char '\'' *> (escapedChar <|> anySingleBut '\\' "character") <* char '\'' surroundWith :: Char -> Char -> Parser a -> Parser a surroundWith l r p = char l *> p <* char r parens :: Parser a -> Parser a parens = surroundWith '(' ')' brackets :: Parser a -> Parser a brackets = surroundWith '{' '}' sqBrackets :: Parser a -> Parser a sqBrackets = surroundWith '[' ']'