{
{-# LANGUAGE OverloadedStrings #-}

-- | Lexing logic for the Annah language

module Annah.Lexer (
    -- * Lexer
    lexExpr,

    -- * Types
    Token(..),
    Position(..),
    LocatedToken(..)
    ) where

import Control.Monad.Trans.State.Strict (State)
import Data.Bits (shiftR, (.&.))
import Data.Char (ord, digitToInt, isDigit)
import Data.Int (Int64)
import Data.Text.Lazy (Text)
import Data.Word (Word8)
import Filesystem.Path.CurrentOS (FilePath, fromText)
import Lens.Family.State.Strict ((.=), (+=))
import Pipes (Producer, for, lift, yield)
import Prelude hiding (FilePath)

import qualified Control.Monad.Trans.State.Strict as State
import qualified Data.Text.Lazy as Text
}

$digit = 0-9

-- Same as Haskell
$opchar = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]

$fst   = [A-Za-z_]
$label = [A-Za-z0-9_]

$nonwhite       = ~$white
$whiteNoNewline = $white # \n

$path = [$label \\\/\.]

tokens :-

    $whiteNoNewline+                    ;
    \n                                  { \_    -> lift (do
                                              line   += 1
                                              column .= 0 )                 }
    "--".*                              ;
    "("                                 { \_    -> yield OpenParen          }
    ")"                                 { \_    -> yield CloseParen         }
    "{"                                 { \_    -> yield OpenBrace          }
    "}"                                 { \_    -> yield CloseBrace         }
    "[nil"                              { \_    -> yield OpenList           }
    "[id"                               { \_    -> yield OpenPath           }
    "]"                                 { \_    -> yield CloseBracket       }
    ","                                 { \_    -> yield Comma              }
    ":"                                 { \_    -> yield Colon              }
    ";"                                 { \_    -> yield Semicolon          }
    "@"                                 { \_    -> yield At                 }
    "*"                                 { \_    -> yield Star               }
    "BOX" | "□"                         { \_    -> yield Box                }
    "->" | "→"                          { \_    -> yield Arrow              }
    "<-" | "←"                          { \_    -> yield LArrow             }
    "\/" | "|~|" | "forall" | "∀" | "Π" { \_    -> yield Pi                 }
    "\" | "λ"                           { \_    -> yield Lambda             }
    "type"                              { \_    -> yield Type               }
    "fold"                              { \_    -> yield Fold               }
    "data"                              { \_    -> yield Data               }
    "let"                               { \_    -> yield Let                }
    "="                                 { \_    -> yield Equals             }
    "in"                                { \_    -> yield In                 }
    "do"                                { \_    -> yield Do                 }
    $digit+                             { \text -> yield (Number (toInt text))  }
    $fst $label* | "(" $opchar+ ")"     { \text -> yield (Label text)           }
    "https://" $nonwhite+               { \text -> yield (URL text)             }
    "http://" $nonwhite+                { \text -> yield (URL text)             }
    "/" $nonwhite+                      { \text -> yield (File (toFile 0 text)) }
    "./" $nonwhite+                     { \text -> yield (File (toFile 2 text)) }
    "../" $nonwhite+                    { \text -> yield (File (toFile 0 text)) }

{
toInt :: Text -> Int
toInt = Text.foldl' (\x c -> 10 * x + digitToInt c) 0

toFile :: Int64 -> Text -> FilePath
toFile n = fromText . Text.toStrict . Text.drop n

trim :: Text -> Text
trim = Text.init . Text.tail

-- This was lifted almost intact from the @alex@ source code
encode :: Char -> (Word8, [Word8])
encode c = (fromIntegral h, map fromIntegral t)
  where
    (h, t) = go (ord c)

    go n
        | n <= 0x7f   = (n, [])
        | n <= 0x7ff  = (0xc0 + (n `shiftR` 6), [0x80 + n .&. 0x3f])
        | n <= 0xffff =
            (   0xe0 + (n `shiftR` 12)
            ,   [   0x80 + ((n `shiftR` 6) .&. 0x3f)
                ,   0x80 + n .&. 0x3f
                ]
            )
        | otherwise   =
            (   0xf0 + (n `shiftR` 18)
            ,   [   0x80 + ((n `shiftR` 12) .&. 0x3f)
                ,   0x80 + ((n `shiftR` 6) .&. 0x3f)
                ,   0x80 + n .&. 0x3f
                ]
            )

-- | The cursor's location while lexing the text
data Position = P
    { lineNo   :: {-# UNPACK #-} !Int
    , columnNo :: {-# UNPACK #-} !Int
    } deriving (Show)

-- line :: Lens' Position Int
line :: Functor f => (Int -> f Int) -> Position -> f Position
line k (P l c) = fmap (\l' -> P l' c) (k l)

-- column :: Lens' Position Int
column :: Functor f => (Int -> f Int) -> Position -> f Position
column k (P l c) = fmap (\c' -> P l c') (k c)

{- @alex@ does not provide a `Text` wrapper, so the following code just
   modifies the code from their @basic@ wrapper to work with `Text`

   I could not get the @basic-bytestring@ wrapper to work; it does not
   correctly recognize Unicode regular expressions.
-}
data AlexInput = AlexInput
    { prevChar  :: Char
    , currBytes :: [Word8]
    , currInput :: Text
    }

alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
alexGetByte (AlexInput c bytes text) = case bytes of
    b:ytes -> Just (b, AlexInput c ytes text)
    []     -> case Text.uncons text of
        Nothing       -> Nothing
        Just (t, ext) -> case encode t of
            (b, ytes) -> Just (b, AlexInput t ytes ext)

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar = prevChar

{-| Convert a text representation of an expression into a stream of tokens

    `lexExpr` keeps track of position and returns the remainder of the input
    if lexing fails.
-}
lexExpr :: Text -> Producer LocatedToken (State Position) (Maybe Text)
lexExpr text = for (go (AlexInput '\n' [] text)) tag
  where
    tag token = do
        pos <- lift State.get
        yield (LocatedToken token pos)

    go input = case alexScan input 0 of
        AlexEOF                        -> return Nothing
        AlexError (AlexInput _ _ text) -> return (Just text)
        AlexSkip  input' len           -> do
            lift (column += len)
            go input'
        AlexToken input' len act       -> do
            act (Text.take (fromIntegral len) (currInput input))
            lift (column += len)
            go input'
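
{- A usage sketch (illustrative addition, not part of the original lexer):
   drain the `Producer` returned by `lexExpr` into a list of located tokens
   plus any unconsumed input on failure.  This assumes the caller starts the
   lexer at line 0, column 0, and uses @Pipes.Prelude.toListM'@ to collect
   the stream while keeping the `Producer`'s return value:

> import qualified Pipes.Prelude as Pipes
>
> -- Hypothetical helper, not exported by this module
> tokenize :: Text -> ([LocatedToken], Maybe Text)
> tokenize text = State.evalState (Pipes.toListM' (lexExpr text)) (P 0 0)
-}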

-- | A `Token` augmented with `Position` information
data LocatedToken = LocatedToken
    { token    ::                !Token
    , position :: {-# UNPACK #-} !Position
    } deriving (Show)

-- | Token type, used to communicate between the lexer and parser
data Token
    = OpenParen
    | CloseParen
    | OpenBrace
    | CloseBrace
    | OpenList
    | OpenPath
    | CloseBracket
    | Period
    | Comma
    | Colon
    | Semicolon
    | At
    | Star
    | Box
    | Arrow
    | LArrow
    | Lambda
    | Pi
    | Type
    | Fold
    | Data
    | Let
    | Equals
    | In
    | Do
    | Label Text
    | Number Int
    | File FilePath
    | URL Text
    | EOF
    deriving (Eq, Show)
}