module Text.Highlighter.Lexer where
import Control.Monad.Error
import Control.Monad.State
import Prelude hiding (lex)
import Text.Regex.PCRE.Light hiding (compile)
import Text.Regex.PCRE.Light.Char8 (compile)
import qualified Data.ByteString as BS
import Text.Highlighter.Types
data LexerState =
LexerState
{ lsLexer :: Lexer
, lsInput :: BS.ByteString
, lsState :: [TokenMatcher]
, lsLexed :: [Token]
}
deriving Show
type LexerM = ErrorT LexerError (State LexerState)
data LexerError
= NoMatchFor BS.ByteString
| OtherLexerError String
deriving Show
instance Error LexerError where
noMsg = OtherLexerError "unknown"
strMsg = OtherLexerError
runLexer :: Lexer -> BS.ByteString -> Either LexerError [Token]
runLexer l s = evalState (runErrorT lex) (LexerState l s [lStart l] [])
lex :: LexerM [Token]
lex = do
done <- gets (BS.null . lsInput)
if done
then gets lsLexed
else do
ms <- getState
ts <- tryAll ms
modify $ \ls -> ls { lsLexed = lsLexed ls ++ ts }
lex
where
getState = gets (head . lsState)
tryAll :: [Match] -> LexerM [Token]
tryAll [] = do
i <- gets lsInput
throwError (NoMatchFor i)
tryAll (AnyOf ms:ms') =
tryAll (ms ++ ms')
tryAll (m:ms) = do
atbol <- isBOL
fs <- gets (lFlags . lsLexer)
let opts
| atbol = [exec_anchored]
| otherwise = [exec_anchored, exec_notbol]
i <- gets lsInput
case match (compile (mRegexp m) fs) i opts of
Just [] -> do
nextState (mNextState m)
return []
Just (s:ss) -> do
modify $ \ls -> ls { lsInput = BS.drop (BS.length s) i }
nextState (mNextState m)
toTokens (s:ss) (mType m)
Nothing ->
tryAll ms `catchError` trySkipping
where
trySkipping e = do
case e of
NoMatchFor _ ->
tryAllFirst (m:ms)
_ -> throwError e
tryAllFirst :: [Match] -> LexerM [Token]
tryAllFirst [] = do
i <- gets lsInput
throwError (NoMatchFor i)
tryAllFirst (AnyOf ms:ms') =
tryAllFirst (ms ++ ms')
tryAllFirst (m:ms) = do
atbol <- isBOL
fs <- gets (lFlags . lsLexer)
let opts
| atbol = []
| otherwise = [exec_notbol]
i <- gets lsInput
case match (compile (mRegexp m) fs) i opts of
Just (s:ss) -> do
let (skipped, next) = skipFailed i s
modify $ \ls -> ls { lsInput = next }
ts <- toTokens (s:ss) (mType m)
return (Token Error skipped:ts)
_ -> tryAllFirst ms
isBOL :: LexerM Bool
isBOL = do
ld <- gets lsLexed
case ld of
[] -> return True
ts ->
let nonempty = dropWhile (BS.null . tText) (reverse ts)
in
case nonempty of
[] -> return True
(t:_) -> return (BS.last (tText t) == 10)
toTokens :: [BS.ByteString] -> TokenType -> LexerM [Token]
toTokens (s:_) (Using l) = do
either throwError return (runLexer l s)
toTokens (_:ss) (ByGroups ts) = liftM concat $ zipWithM (\s t -> toTokens [s] t) ss ts
toTokens (s:_) t = return [Token t s]
toTokens [] _ = return []
skipFailed :: BS.ByteString -> BS.ByteString -> (BS.ByteString, BS.ByteString)
skipFailed i r
| r `BS.isPrefixOf` i = (BS.empty, BS.drop (BS.length r) i)
| otherwise =
let (pre, next) = skipFailed (BS.tail i) r
in (BS.cons (BS.head i) pre, next)
nextState :: NextState -> LexerM ()
nextState Continue = return ()
nextState Pop =
modify $ \ls -> ls { lsState = tail (lsState ls) }
nextState (PopNum n) =
modify $ \ls -> ls { lsState = drop n (lsState ls) }
nextState Push =
modify $ \ls -> ls { lsState = head (lsState ls) : lsState ls }
nextState (GoTo n) =
modify $ \ls -> ls { lsState = n : lsState ls }
nextState (DoAll nss) = mapM_ nextState nss
nextState (Combined nss) =
modify $ \ls -> ls { lsState = concat nss : lsState ls }