module Text.Regex.TDFA.UTF8(Utf8(Utf8,utf8)) where import Data.Array.IArray((!)) import Data.Maybe(listToMaybe) import qualified Data.ByteString.Lazy.Char8 as L(ByteString,empty) import qualified Data.ByteString.Lazy.UTF8 as U(take,drop,uncons,toString) import Text.Regex.Base(RegexLike(..),RegexMaker(..),Extract(..),MatchArray,RegexContext(..)) import Text.Regex.Base.Impl(polymatch,polymatchM) import Text.Regex.TDFA.String() -- instances only import Text.Regex.TDFA.Common(Regex(..),CompOption,ExecOption(captureGroups),Position) import Text.Regex.TDFA.NewDFA.Uncons(Uncons(uncons)) import qualified Text.Regex.TDFA.NewDFA.Engine as Engine(execMatch) import qualified Text.Regex.TDFA.NewDFA.Tester as Tester(matchTest) -- This is a newtype for the instances we are making. -- You will likely want to use a pre-existing newtype from your code. newtype Utf8 = Utf8 { utf8 :: L.ByteString } deriving (Show,Read,Eq,Ord) instance Extract Utf8 where {-# INLINE empty #-} empty = Utf8 L.empty {-# INLINE before #-} before i = Utf8 . U.take (fromIntegral i) . utf8 {-# INLINE after #-} after i = Utf8 . U.drop (fromIntegral i) . utf8 instance Uncons Utf8 where {-# INLINE uncons #-} uncons = fmap (fmap Utf8) . U.uncons . utf8 instance RegexMaker Regex CompOption ExecOption Utf8 where makeRegexOptsM c e source = makeRegexOptsM c e (U.toString (utf8 source)) {-# SPECIALIZE execMatch :: Regex -> Position -> Char -> Utf8 -> [MatchArray] #-} execMatch :: Uncons text => Regex -> Position -> Char -> text -> [MatchArray] execMatch = Engine.execMatch {-# SPECIALIZE myMatchTest :: Regex -> Utf8 -> Bool #-} myMatchTest :: Uncons text => Regex -> text -> Bool myMatchTest = Tester.matchTest instance RegexLike Regex Utf8 where matchOnce r s = listToMaybe (matchAll r s) matchAll r s = execMatch r 0 '\n' s matchCount r s = length (matchAll r' s) where r' = r { regex_execOptions = (regex_execOptions r) {captureGroups = False} } matchTest = myMatchTest matchOnceText regex source = fmap (\ ma -> let (o,l) = ma!0 in (before o source ,fmap (\ ol -> (extract ol source,ol)) ma ,after (o+l) source)) (matchOnce regex source) matchAllText regex source = let go i _ _ | i `seq` False = undefined go _i _t [] = [] go i t (x:xs) = let (off0,len0) = x!0 trans pair@(off,len) = (extract (off-i,len) t,pair) t' = after (off0+(len0-i)) t in fmap trans x : seq t' (go (off0+len0) t' xs) in go 0 source (matchAll regex source) instance RegexContext Regex Utf8 Utf8 where match = polymatch matchM = polymatchM