{-# LANGUAGE ViewPatterns #-} -- | 'Text.Tokenify' is a module used for generating a -- tokenizer from a regex based grammar module Text.Tokenify ( tokenize, matchHead, module DSL, module Types ) where import Prelude hiding (head) import qualified Text.Tokenify.Response as Response import qualified Text.Tokenify.CharSeq as CSeq import qualified Text.Tokenify.Regex as Regex import qualified Text.Tokenify.Types as Types import qualified Text.Tokenify.DSL as DSL import Text.Tokenify.Response (Response) import Text.Tokenify.CharSeq (CharSeq) import Text.Tokenify.Regex (Regex) import Text.Tokenify.Types import qualified Data.Monoid as Monoid import qualified Data.Sequence as Seq import Data.Sequence ((|>), Seq) import Data.Monoid ((<>)) import Control.Applicative ((<|>)) {-- main functionality --} -- | 'tokenize' will transform a 'CharSeq' into a sequence of tokens tokenize :: (CharSeq s) => Tokenizer s a -> s -> Either String (Seq a) tokenize tokenizers input = impl tokenizers Seq.empty 0 input where inputInfo :: [Int] inputInfo = CSeq.lineInfo input getPos :: Int -> (Int, Int) getPos offset = getPosImpl 0 1 inputInfo where getPosImpl position row (lineLength:lines) -- in this line | offset >= position && offset <= position+lineLength = (row, offset - position) -- beyond this line | offset > position+lineLength = getPosImpl (position + lineLength) (row + 1) lines -- exit edge condition impl _ acc _ input | CSeq.null input = Right acc -- fail edge condition impl [] acc position _ = Left ("failed to match at" ++ show (getPos position)) -- normal loop impl ((rx, rs):ts) acc position input = case matchHead rx input of Nothing -> impl ts acc position input Just (matched, rest, moved) -> let position' = position+moved coordants = getPos position in case rs of Response.Error -> Left ("matched error at " ++ show position) Response.Ignore -> impl tokenizers acc position' rest Response.Display p -> impl tokenizers (acc |> p coordants) position' rest Response.Process p -> impl tokenizers (acc |> p matched coordants) position' rest -- | Attmpts to match the front of a 'CharSeq' with a 'Regex', -- if succeful, it returns a tuple containing -- -- * The matched 'CharSeq' -- * The remaining 'CharSeq' -- * The amount of characters consumed matchHead :: (CharSeq s) => Regex s -> s -> Maybe (s, s, Int) matchHead regex input = case regex of -- match nothing Regex.NoPass -> Nothing -- match the char 'c' Regex.Char c -> CSeq.head input >>= \head -> if head == c then return (CSeq.singleton head, CSeq.tail input, 1) else Nothing -- match the string 's' Regex.String s -> prefixTail s input >>= \(diff, dSize) -> return (s, diff, dSize) -- either 'l' or 'r' Regex.Alt l r -> matchHead l input <|> matchHead r input -- matches char between 's' & 'e' Regex.Range s e -> do head <- CSeq.head input if head >= s && e >= head then return (CSeq.singleton head, CSeq.tail input, 1) else Nothing -- chain 'l' & 'r' Regex.Append l r -> do (a, cont, ai) <- matchHead l input (b, cont, bi) <- matchHead r cont return (a <> b, cont, ai + bi) -- optionally matches Regex.Option o -> case matchHead o input of Nothing -> return (Monoid.mempty, input, 0) anythingElse -> anythingElse -- repeat 0 or more times Regex.Repeat r -> impl Monoid.mempty input 0 where impl acc cont@(matchHead r -> Nothing) i = Just (acc, cont, i) impl a (matchHead r -> Just (b, cont, ib)) ia = impl (a <> b) cont (ia + ib) -- repeat 1 or more times Regex.Repeat1 r -> do (a, cont, ai) <- matchHead r input (b, cont, bi) <- matchHead (Regex.Repeat r) input return (a <> b, cont, ai + bi) -- -- dumb helpers -- prefixTail :: (CharSeq s) => s -> s -> Maybe (s, Int) prefixTail prefix input = trySplit prefix input 0 where trySplit pre dec index | CSeq.null pre = Just (dec, index) -- if (dec.length != 0 && dec.head == pre.head) -- trySplit(pre.tail, dec.tail, ++index); | not (CSeq.null dec) && (CSeq.head dec == CSeq.head pre) = trySplit (CSeq.tail pre) (CSeq.tail dec) (index + 1) | otherwise = Nothing