module Wumpus.Microprint.Tokenizer
(
TokenizerConfig(..)
, haskellTokenizer
, runTokenizer
) where
import Wumpus.Microprint.Datatypes
import Wumpus.Basic.Utils.HList
import Wumpus.Core
import Control.Applicative
import Control.Monad
import Data.Char ( isSpace )
import Data.List
data TokenizerConfig = TokenizerConfig
{ standard_colour :: RGBi
, sgl_comment_start :: String
, comment_start :: String
, comment_end :: String
, comment_colour :: RGBi
}
haskellTokenizer :: RGBi -> RGBi -> TokenizerConfig
haskellTokenizer std_rgb comment_rgb = TokenizerConfig
{ standard_colour = std_rgb
, sgl_comment_start = "--"
, comment_start = "{-"
, comment_end = "-}"
, comment_colour = comment_rgb
}
data TokState = CommentML | CommentSL | Normal
deriving (Eq,Ord,Show)
data St = St TokState (H Tile)
newtype Lexer a = Lexer { getLexer :: TokenizerConfig -> St -> (a,St) }
instance Functor Lexer where
fmap f m = Lexer $ \r s -> let (a,s1) = getLexer m r s in (f a, s1)
instance Applicative Lexer where
pure a = Lexer $ \_ s -> (a,s)
mf <*> ma = Lexer $ \r s -> let (f,s1) = getLexer mf r s
(a,s2) = getLexer ma r s1
in (f a, s2)
instance Monad Lexer where
return a = Lexer $ \_ s -> (a,s)
m >>= k = Lexer $ \r s -> let (a,s1) = getLexer m r s
in (getLexer . k) a r s1
tellSpaces :: Int -> Lexer ()
tellSpaces i = Lexer $ \_ (St ts ac) ->
let ac1 = snocH ac (Space i) in ((), St ts ac1)
tellChars :: Int -> RGBi -> Lexer ()
tellChars i rgb = Lexer $ \_ (St ts ac) ->
let ac1 = snocH ac (Word rgb i) in ((), St ts ac1)
askColour :: Lexer RGBi
askColour = Lexer $ \r s@(St ts _) ->
case ts of
Normal -> (standard_colour r, s)
_ -> (comment_colour r, s)
asksTC :: (TokenizerConfig -> a) -> Lexer a
asksTC fn = Lexer $ \r s -> (fn r, s)
setTokState :: TokState -> Lexer ()
setTokState st = Lexer $ \_ (St _ ac) -> ((),St st ac)
getTokState :: Lexer TokState
getTokState = Lexer $ \_ s@(St st _) -> (st,s)
runTokenizer :: TokenizerConfig -> String -> GreekText
runTokenizer cfg input = step Normal $ lines input
where
step _ [] = (0,[])
step st (s:ss) = let (st1,l1) = lexLine cfg st s
(h,rest) = step st1 ss
in (h+1,l1:rest)
lexLine :: TokenizerConfig -> TokState -> String -> (TokState,[Tile])
lexLine cfg st ss =
let (_,St st1 hf) = runLexer cfg st ss in (st1, toListH hf)
runLexer :: TokenizerConfig -> TokState -> String -> ((),St)
runLexer cfg ts ss = getLexer (lexer ss) cfg (St (normalize ts) emptyH)
where
normalize CommentSL = Normal
normalize a = a
lexer :: String -> Lexer ()
lexer (' ':xs) = spaces 1 xs
lexer ('\t':xs) = spaces 8 xs
lexer xs = word xs
spaces :: Int -> String -> Lexer ()
spaces n (' ':xs) = spaces (n+1) xs
spaces n ('\t':xs) = spaces (n+8) xs
spaces n xs = tellSpaces n >> word xs
word :: String -> Lexer ()
word [] = return ()
word xs = let (pre,rest) = break isSpace xs in do
st <- getTokState
when (st==Normal) (testPrefix pre)
rgb <- askColour
tellChars (length pre) rgb
when (st==CommentML) (testSuffix pre)
spaces 0 rest
testPrefix :: String -> Lexer ()
testPrefix ss =
asksTC sgl_comment_start >>= \a ->
if isPrefixOf a ss then setTokState CommentSL
else asksTC comment_start >>= \b ->
if isPrefixOf b ss then setTokState CommentML
else return ()
testSuffix :: String -> Lexer ()
testSuffix ss =
asksTC comment_end >>= \a ->
if isSuffixOf a ss then setTokState Normal
else return ()