{-# OPTIONS -Wall #-}

--------------------------------------------------------------------------------
-- |
-- Module      :  Wumpus.Microprint.Tokenizer
-- Copyright   :  (c) Stephen Tetley 2010
-- License     :  BSD3
--
-- Maintainer  :  stephen.tetley@gmail.com
-- Stability   :  unstable
-- Portability :  GHC
--
-- Simple tokenizing builder.
--
--------------------------------------------------------------------------------

module Wumpus.Microprint.Tokenizer
  ( 
    TokenizerConfig(..)
  , haskellTokenizer
  , runTokenizer 

  ) where


import Wumpus.Microprint.Datatypes

import Wumpus.Basic.Utils.HList			-- package: wumpus-basic
import Wumpus.Core				-- package: wumpus-core

import Control.Applicative
import Control.Monad
import Data.Char ( isSpace )
import Data.List

data TokenizerConfig = TokenizerConfig
      { standard_colour		:: RGBi
      , sgl_comment_start 	:: String   -- note - can be a prefix of a word 
      , comment_start	  	:: String
      , comment_end		:: String
      , comment_colour		:: RGBi
      }


haskellTokenizer :: RGBi -> RGBi -> TokenizerConfig
haskellTokenizer std_rgb comment_rgb = TokenizerConfig 
      { standard_colour		= std_rgb
      , sgl_comment_start 	= "--"  
      , comment_start	  	= "{-"
      , comment_end		= "-}"
      , comment_colour		= comment_rgb
      }
 

data TokState = CommentML | CommentSL | Normal
  deriving (Eq,Ord,Show)

data St = St TokState (H Tile)

newtype Lexer a  = Lexer { getLexer :: TokenizerConfig -> St -> (a,St) }

instance Functor Lexer where
  fmap f m = Lexer $ \r s -> let (a,s1) = getLexer m r s in (f a, s1)

instance Applicative Lexer where
  pure a    = Lexer $ \_ s -> (a,s)
  mf <*> ma = Lexer $ \r s -> let (f,s1) = getLexer mf r s 
     	      	      	    	  (a,s2) = getLexer ma r s1
			      in (f a, s2)

instance Monad Lexer where
  return a = Lexer $ \_ s -> (a,s)
  m >>= k  = Lexer $ \r s -> let (a,s1) = getLexer m r s
    	     	     	      in (getLexer . k) a r s1


tellSpaces :: Int -> Lexer ()
tellSpaces i = Lexer $ \_ (St ts ac) -> 
    let ac1 = snocH ac (Space i) in ((), St ts ac1) 


tellChars :: Int -> RGBi -> Lexer ()
tellChars i rgb = Lexer $ \_ (St ts ac) -> 
    let ac1 = snocH ac (Word rgb i) in ((), St ts ac1)

askColour :: Lexer RGBi
askColour = Lexer $ \r s@(St ts _) -> 
    case ts of
      Normal -> (standard_colour r, s)
      _      -> (comment_colour r,  s)


asksTC :: (TokenizerConfig -> a) -> Lexer a
asksTC fn = Lexer $ \r s -> (fn r, s) 

setTokState :: TokState -> Lexer ()
setTokState st = Lexer $ \_ (St _ ac) -> ((),St st ac)  

getTokState :: Lexer TokState 
getTokState = Lexer $ \_ s@(St st _) -> (st,s)

runTokenizer :: TokenizerConfig -> String -> GreekText
runTokenizer cfg input = step Normal $ lines input
  where
    step _  []      = (0,[])
    step st (s:ss)  = let (st1,l1) = lexLine cfg st s
    	 	   	  (h,rest) = step st1 ss
                      in (h+1,l1:rest) 

lexLine :: TokenizerConfig -> TokState -> String -> (TokState,[Tile])
lexLine cfg st ss = 
    let (_,St st1 hf) = runLexer cfg st ss in (st1, toListH hf) 
  

runLexer :: TokenizerConfig -> TokState -> String -> ((),St)
runLexer cfg ts ss = getLexer (lexer ss) cfg (St (normalize ts) emptyH) 
  where
    normalize CommentSL = Normal
    normalize a         = a

lexer :: String -> Lexer ()
lexer (' ':xs)      = spaces 1 xs
lexer ('\t':xs)     = spaces 8 xs
lexer xs            = word xs


spaces :: Int -> String -> Lexer ()
spaces n (' ':xs)   = spaces (n+1) xs
spaces n ('\t':xs)  = spaces (n+8) xs
spaces n xs         = tellSpaces n >> word xs

word :: String -> Lexer ()
word []	= return ()
word xs	= let (pre,rest) = break isSpace xs in do
    st <- getTokState
    when (st==Normal)  (testPrefix pre)
    rgb <- askColour
    tellChars (length pre) rgb
    when (st==CommentML) (testSuffix pre)
    spaces 0 rest

testPrefix :: String -> Lexer ()
testPrefix ss = 
    asksTC sgl_comment_start >>= \a -> 
    if isPrefixOf a ss then setTokState CommentSL
      else asksTC comment_start >>= \b ->
           if isPrefixOf b ss then setTokState CommentML
	      		      else return ()

testSuffix :: String -> Lexer ()
testSuffix ss = 
    asksTC comment_end >>= \a ->
    if isSuffixOf a ss then setTokState Normal
                       else return ()