{-# LANGUAGE DerivingStrategies #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE ViewPatterns #-} module DearImGui.Generator.Tokeniser where -- base import Control.Arrow ( first, second ) import Control.Applicative ( (<|>), some ) import Data.Char ( isAlpha, isAlphaNum, isDigit, isPunctuation, isSpace, isSymbol, toLower ) import Data.Function ( (&) ) import Data.Functor ( ($>) ) import Data.Monoid ( Sum(..) ) -- megaparsec import Text.Megaparsec ( MonadParsec, VisualStream(..) , chunk, parseMaybe, satisfy, try ) import Text.Megaparsec.Char.Lexer ( hexadecimal, scientific ) -- parser-combinators import Control.Monad.Combinators ( optional ) -- scientific import Data.Scientific ( Scientific ) -- text import Data.Text ( Text ) import qualified Data.Text as Text ( break, breakOn, cons, drop, dropWhile , head, last, length , pack, snoc, span, strip, tail, take , uncons, unpack ) -- unordered-containers import Data.HashSet ( HashSet ) import qualified Data.HashSet as HashSet ( fromList, member ) -------------------------------------------------------------------------------- data TokeniserError = Couldn'tParseNumber { problem :: !Text } | UnhandledCase { unhandled :: !( Char, Text ) } deriving stock ( Eq, Ord, Show ) data Tok = Keyword !Text | ReservedSymbol !Char | Symbolic !Text | Identifier !Text | Comment !Text | Char !Char | String !Text | Number !Scientific !Text | BeginCPP !Text | EndCPPLine deriving stock ( Show, Eq, Ord ) showToken :: Tok -> String showToken = \case Keyword t -> Text.unpack t ReservedSymbol c -> [c] Symbolic t -> Text.unpack t Identifier t -> Text.unpack t Comment t -> Text.unpack t Char c -> [c] String t -> Text.unpack t Number s t -> show s <> Text.unpack t BeginCPP t -> "#" <> Text.unpack t EndCPPLine -> "EndCppLine" tokenLength :: Tok -> Int tokenLength = \case Keyword t -> Text.length t ReservedSymbol _ -> 1 Symbolic t -> Text.length t Identifier t -> Text.length t Comment t -> Text.length t Char _ -> 1 String t -> Text.length t Number s t -> length ( show s ) + Text.length t BeginCPP t -> 1 + Text.length t EndCPPLine -> length ( "EndCPPLine" :: String ) instance VisualStream [Tok] where showTokens _ = foldMap showToken tokensLength _ = getSum . foldMap ( Sum . tokenLength ) keywords :: HashSet Text keywords = HashSet.fromList [ "auto", "break", "case", "char", "const", "continue", "default", "do", "double" , "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int", "long" , "register", "restrict", "return", "short", "signed", "sizeof", "static", "struct" , "switch", "typedef", "union", "unsigned", "void", "volatile", "while" ] reservedSymbols :: HashSet Char reservedSymbols = HashSet.fromList [ '(', ')', '{', '}', ',', ';', '=', '#' ] tokenise :: Text -> Either TokeniserError [ Tok ] tokenise ( Text.uncons -> Just ( c, cs ) ) | isSpace c = tokenise ( Text.dropWhile isSpace cs ) | isAlpha c || c == '_' , let this, rest :: Text ( this, rest ) = first ( c `Text.cons` ) $ Text.span ( \ x -> isAlphaNum x || x == '_' ) cs = if this `HashSet.member` keywords then ( Keyword this : ) <$> tokenise rest else ( Identifier this : ) <$> tokenise rest | isDigit c , let this, rest :: Text ( this, rest ) = continuePastExponent $ first ( c `Text.cons` ) $ Text.span ( \ x -> isAlphaNum x || x == '.' ) cs = case parseMaybe @() parseNumber this of Just numTok -> ( numTok : ) <$> tokenise rest Nothing -> Left ( Couldn'tParseNumber { problem = this } ) | c == '\'' , Just ( '\'', rest ) <- Text.uncons ( Text.drop 1 cs ) = ( Char ( Text.head cs ) : ) <$> tokenise rest | c == '\"' , let this, rest :: Text ( this, rest ) = second Text.tail $ Text.break ( == '"') cs = ( String this : ) <$> tokenise rest | c == '#' , let directive, line, rest :: Text ( directive, ( line, rest ) ) = cs & Text.break ( isSpace ) & second ( Text.break ( `elem` [ '\n', '\r' ] ) ) = do lineTokens <- tokenise line restTokens <- tokenise rest pure ( ( BeginCPP directive : lineTokens ) <> ( EndCPPLine : restTokens ) ) | c `HashSet.member` reservedSymbols = ( ReservedSymbol c : ) <$> tokenise cs | c == '/' = case Text.take 1 cs of "/" -> let comm, rest :: Text ( comm, rest ) = first Text.strip $ Text.break ( `elem` [ '\n', '\r' ] ) ( Text.drop 1 cs ) in ( Comment comm : ) <$> tokenise rest "*" -> let comm, rest :: Text ( comm, rest ) = Text.breakOn "*/" ( Text.drop 1 cs ) in ( Comment comm : ) <$> tokenise rest _ -> let this, rest :: Text ( this, rest ) = first ( c `Text.cons` ) $ Text.span ( \ x -> x /= '_' && ( isSymbol x || isPunctuation x ) ) cs in ( Symbolic this : ) <$> tokenise rest | isSymbol c || isPunctuation c , let this, rest :: Text ( this, rest ) = first ( c `Text.cons` ) $ Text.span ( \ x -> x /= '_' && ( isSymbol x || isPunctuation x ) ) cs = ( Symbolic this : ) <$> tokenise rest | otherwise = Left $ UnhandledCase { unhandled = ( c, cs ) } tokenise _ = Right [] continuePastExponent :: ( Text, Text ) -> ( Text, Text ) continuePastExponent ( this, rest ) | toLower ( Text.last this ) `elem` [ 'e', 'p' ] , Just ( r, rs ) <- Text.uncons rest , r `elem` [ '+', '-' ] , ( this', rest' ) <- Text.span isAlphaNum rs = ( this `Text.snoc` r <> this', rest' ) | otherwise = ( this, rest ) parseNumber :: MonadParsec e Text m => m Tok parseNumber = try ( chunk "0.f" $> Number 0 "f" ) <|> do value <- try ( chunk "0x" *> hexadecimal ) <|> scientific mbSuffix <- fmap ( maybe "" Text.pack ) . optional . some $ satisfy ( \ s -> toLower s `elem` ( "uflz" :: String ) ) pure ( Number value mbSuffix )