{-# LANGUAGE FlexibleContexts, RankNTypes, ScopedTypeVariables, ExistentialQuantification #-} {- | Module: Text.AhoCorasick Copyright: Sergey S Lymar (c) 2012 License: BSD-3-Clause Maintainer: Sergey S Lymar Stability: experimental Portability: portable Aho-Corasick string matching algorithm = Simplest example @ example1 = mapM_ print $ findAll simpleSM \"ushers\" where simpleSM = makeSimpleStateMachine [\"he\",\"she\",\"his\",\"hers\"] @ @ Position {pIndex = 1, pLength = 3, pVal = \"she\"} Position {pIndex = 2, pLength = 2, pVal = \"he\"} Position {pIndex = 2, pLength = 4, pVal = \"hers\"} @ = With data @ example2 = mapM_ print $ findAll sm \"ushers\" where sm = makeStateMachine [(\"he\",0),(\"she\",1),(\"his\",2),(\"hers\",3)] @ @ Position {pIndex = 1, pLength = 3, pVal = 1} Position {pIndex = 2, pLength = 2, pVal = 0} Position {pIndex = 2, pLength = 4, pVal = 3} @ = Step-by-step state machine evaluation @ example3 = mapM_ print $ next sm \"ushers\" where sm = makeSimpleStateMachine [\"he\",\"she\",\"his\",\"hers\"] next _ [] = [] next sm (s:n) = let (SMStepRes match nextSM) = stateMachineStep sm s in (s, match) : next nextSM n @ @ (\'u\',[]) (\'s\',[]) (\'h\',[]) (\'e\',[(3,\"she\"),(2,\"he\")]) (\'r\',[]) (\'s\',[(4,\"hers\")]) @ -} module Text.AhoCorasick ( -- ** Basic interface makeStateMachine , makeSimpleStateMachine , findAll , Position(..) -- ** Low-level interface , stateMachineStep , SMStepRes(..) , resetStateMachine -- ** Types , StateMachine , KeyLength ) where import Control.Monad.State.Lazy (execStateT, get, put) import Control.Monad.ST.Strict (ST, runST) import Control.Monad.Trans (lift) import Data.Array.IArray (Array, array, (!)) import Data.Functor ((<&>)) import Data.Hashable (Hashable) import Data.Maybe (fromJust) import Data.STRef (STRef, newSTRef, readSTRef, writeSTRef, modifySTRef) import qualified Data.HashMap.Strict as M import Text.AhoCorasick.Internal.Deque (mkDQ, pushBack, popFront, dqLength, DQ) data (Eq keySymb, Hashable keySymb) => TNode keySymb s = TNode { tnId :: Int , tnLinks :: M.HashMap keySymb (STRef s (TNode keySymb s)) , tnFail :: Maybe (STRef s (TNode keySymb s)) , tnValuesIds :: [Int] } type KeyLength = Int data (Eq keySymb, Hashable keySymb) => TTree keySymb val s = TTree { ttRoot :: STRef s (TNode keySymb s) , ttLastId :: STRef s Int , ttValues :: DQ (KeyLength, val) s } type NodeIndex = Int data (Eq keySymb, Hashable keySymb) => SMElem keySymb = SMElem { smeLinks :: M.HashMap keySymb NodeIndex , smeFail :: NodeIndex , smeValuesIds :: [Int] } data (Eq keySymb, Hashable keySymb) => StateMachine keySymb val = StateMachine { smStates :: Array NodeIndex (SMElem keySymb) , smValues :: Array Int (KeyLength, val) , smState :: Int } data (Eq keySymb, Hashable keySymb) => SMStepRes keySymb val = SMStepRes { smsrMatch :: [(KeyLength, val)] , smsrNextSM :: StateMachine keySymb val } data Position val = Position { pIndex :: Int , pLength :: Int , pVal :: val } instance (Eq keySymb, Hashable keySymb, Show keySymb) => Show (SMElem keySymb) where show (SMElem l f v) = concat ["SMElem {smeLinks = ", show l, ", smeFail = ", show f,", smeValuesIds = ", show v, "}"] instance (Eq keySymb, Hashable keySymb, Show keySymb, Show val) => Show (StateMachine keySymb val) where show (StateMachine st vals state) = concat [ "StateMachine {smStates = ", show st, ", smValues = ", show vals, ", smState = ", show state,"}"] instance (Eq keySymb, Hashable keySymb, Show keySymb, Show val) => Show (SMStepRes keySymb val) where show (SMStepRes f n) = concat [ "StateMachineStepRes {smsrFound = ", show f, ", smsrNewSM = ", show n,"}"] instance (Show val) => Show (Position val) where show (Position i l v) = concat [ "Position {pIndex = ", show i, ", pLength = ", show l, ", pVal = ", show v,"}"] (~>) :: t1 -> (t1 -> t2) -> t2 x ~> f = f x infixl 9 ~> rootNodeId :: Int rootNodeId = 0 initNewTTree :: (Eq keySymb, Hashable keySymb) => ST s (TTree keySymb a s) initNewTTree = do root <- newSTRef $ TNode rootNodeId M.empty Nothing [] lid <- newSTRef rootNodeId TTree root lid <$> mkDQ mkNewTNode :: (Eq keySymb, Hashable keySymb) => TTree keySymb a s -> ST s (TNode keySymb s) mkNewTNode tree = do modifySTRef lid (+1) lv <- readSTRef lid return $ TNode lv M.empty Nothing [] where lid = ttLastId tree addKeyVal :: forall val s keySymb. (Eq keySymb, Hashable keySymb) => TTree keySymb val s -> [keySymb] -> val -> ST s () addKeyVal tree key val = addSymb (ttRoot tree) key where addSymb :: STRef s (TNode keySymb s) -> [keySymb] -> ST s () addSymb node [] = do vi <- dqLength (ttValues tree) pushBack (ttValues tree) (length key, val) modifySTRef node (\r -> r { tnValuesIds = [vi] }) addSymb node (c:nc) = do n <- readSTRef node let nlnks = tnLinks n case M.lookup c nlnks of Just tn -> addSymb tn nc Nothing -> do nnd <- mkNewTNode tree refNewN <- newSTRef nnd writeSTRef node (n {tnLinks = M.insert c refNewN nlnks}) addSymb refNewN nc findFailures :: (Eq keySymb, Hashable keySymb) => TTree keySymb val s -> ST s () findFailures tree = do modifySTRef root (\n -> n {tnFail = Just root}) dq <- mkDQ pushBack dq root procAll dq where root = ttRoot tree procAll dq = do n <- popFront dq case n of Nothing -> return () Just node -> do procNode dq node procAll dq procNode dq nodeRef = do node <- readSTRef nodeRef mapM_ (\(symb, link) -> do pushBack dq link fRef <- findParentFail link (tnFail node) symb f <- readSTRef fRef modifySTRef link (\n -> n {tnFail = Just fRef, tnValuesIds = tnValuesIds n ++ tnValuesIds f}) ) $ tnLinks node ~> M.toList findParentFail link (Just cfRef) symb = do cf <- readSTRef cfRef case (M.lookup symb (tnLinks cf), cfRef == root) of (Just nl, _) -> if nl == link then return root else return nl (Nothing, True) -> return root _ -> findParentFail link (tnFail cf) symb convertToStateMachine :: forall val s keySymb. (Eq keySymb, Hashable keySymb) => TTree keySymb val s -> ST s (StateMachine keySymb val) convertToStateMachine tree = do size <- readSTRef $ ttLastId tree nds <- execStateT (convertNode $ ttRoot tree) [] vlsSize <- dqLength $ ttValues tree vls <- mapM (\i -> do k <- popFront (ttValues tree) return (i,fromJust k) ) [0..(vlsSize-1)] StateMachine (array (0, size) nds) (array (0, vlsSize-1) vls) rootNodeId ~> return where convertNode node = do (n,l,fail) <- lift $ do n <- readSTRef node l <- tnLinks n ~> convertLinks fail <- (tnFail n ~> fromJust ~> readSTRef) <&> tnId return (n,l,fail) v <- get put $ (tnId n, SMElem l fail (tnValuesIds n)) : v M.toList (tnLinks n) ~> map snd ~> mapM_ convertNode convertLinks :: M.HashMap keySymb (STRef s (TNode keySymb s)) -> ST s (M.HashMap keySymb Int) convertLinks lnksMap = do nl <- mapM (\(symb, link) -> do l <- readSTRef link return (symb, tnId l) ) $ M.toList lnksMap return $ M.fromList nl resetStateMachine :: (Eq keySymb, Hashable keySymb) => StateMachine keySymb val -> StateMachine keySymb val resetStateMachine m = m { smState = rootNodeId } stateMachineStep :: (Eq keySymb, Hashable keySymb) => StateMachine keySymb val -> keySymb -> SMStepRes keySymb val stateMachineStep sm symb = case (M.lookup symb links, currentState == rootNodeId) of (Just nextState, _) -> SMStepRes (smStates sm ! nextState ~> smeValuesIds ~> convertToVals) (sm { smState = nextState }) (Nothing, True) -> SMStepRes [] sm (Nothing, False) -> stateMachineStep (sm { smState = smeFail currentNode}) symb where currentState = smState sm currentNode = smStates sm ! currentState links = smeLinks currentNode convertToVals = map (\i -> smValues sm ! i) findAll :: (Eq keySymb, Hashable keySymb) => StateMachine keySymb val -> [keySymb] -> [Position val] findAll sm str = step (resetStateMachine sm) (zip [0..] str) ~> concat where step _ [] = [] step csm ((idx,symb):next) = case stateMachineStep csm symb of SMStepRes [] newsm -> step newsm next SMStepRes r newsm -> map (cnvToPos idx) r : step newsm next cnvToPos idx (keyLength, val) = Position (idx - keyLength + 1) keyLength val -- | Returns search keys as values makeSimpleStateMachine :: (Eq keySymb, Hashable keySymb) => [[keySymb]] -> StateMachine keySymb [keySymb] makeSimpleStateMachine keys = runST $ do tree <- initNewTTree mapM_ (\s -> addKeyVal tree s s) keys findFailures tree convertToStateMachine tree -- | Associate custom values with the search keys makeStateMachine :: (Eq keySymb, Hashable keySymb) => [([keySymb], val)] -> StateMachine keySymb val makeStateMachine kv = runST $ do tree <- initNewTTree mapM_ (uncurry (addKeyVal tree)) kv findFailures tree convertToStateMachine tree