{-# LANGUAGE RankNTypes #-} {-# LANGUAGE MultiParamTypeClasses, FlexibleInstances #-} module Text.RegExp.Data where import Data.Semiring -- | -- Regular expressions are represented as values of type 'RegExp' @c@ -- where @c@ is the character type of the underlying alphabet. Values -- of type @RegExp@ @c@ can be matched against lists of type @[c]@. -- newtype RegExp c = RegExp (forall w . Semiring w => RegW w c) data RegW w c = RegW { active :: !Bool, empty :: !w, final_ :: !w, reg :: !(Reg w c) } final :: Semiring w => RegW w c -> w final r = if active r then final_ r else zero data Reg w c = Eps | Sym String (c -> w) | Alt (RegW w c) (RegW w c) | Seq (RegW w c) (RegW w c) | Rep (RegW w c) class Semiring w => Weight a b w where symWeight :: (a -> w) -> b -> w defaultSymWeight :: (a -> w) -> a -> w defaultSymWeight = id instance Weight c c Bool where symWeight = defaultSymWeight instance Num a => Weight c c (Numeric a) where symWeight = defaultSymWeight weighted :: Weight a b w => RegW w a -> RegW w b weighted (RegW a e f r) = case r of Eps -> RegW a e f Eps Sym s p -> RegW a e f (Sym s (symWeight p)) Alt p q -> RegW a e f (Alt (weighted p) (weighted q)) Seq p q -> RegW a e f (Seq (weighted p) (weighted q)) Rep p -> RegW a e f (Rep (weighted p)) -- | -- Matches the empty word. 'eps' has no direct string representation -- but is used to implement other constructs such as optional -- components like @a?@. -- eps :: RegExp c eps = RegExp epsW epsW :: Semiring w => RegW w c epsW = RegW False one zero Eps -- | Matches the given character. -- char :: Char -> RegExp Char char c = psym (quote c) (c==) -- | Matches the given symbol. -- sym :: (Eq c, Show c) => c -> RegExp c sym c = psym (show c) (c==) quote :: Char -> String quote c | c `elem` " \\|*+?.[]{}^" = '\\' : [c] | otherwise = [c] -- | Matches a symbol that satisfies the given predicate. -- psym :: String -> (c -> Bool) -> RegExp c psym s p = RegExp (symW s (fromBool . p)) symW :: Semiring w => String -> (c -> w) -> RegW w c symW s p = RegW False zero zero $ Sym s p -- | Matches an arbitrary symbol. -- anySym :: RegExp c anySym = psym "." (const True) -- | Does not match anything. 'noMatch' is an identity for 'alt'. -- noMatch :: RegExp c noMatch = psym "[]" (const False) -- | -- Matches either of two regular expressions. For example @a+b@ -- matches either the character @a@ or the character @b@. -- alt :: RegExp c -> RegExp c -> RegExp c alt (RegExp p) (RegExp q) = RegExp (RegW False (empty p .+. empty q) zero (Alt p q)) altW :: Semiring w => RegW w c -> RegW w c -> RegW w c altW p q = RegW (active p || active q) (empty p .+. empty q) (final p .+. final q) (Alt p q) -- | -- Matches the sequence of two regular expressions. For example the -- regular expressions @ab@ matches the word @ab@. -- seq_ :: RegExp c -> RegExp c -> RegExp c seq_ (RegExp p) (RegExp q) = RegExp (RegW False (empty p .*. empty q) zero (Seq p q)) seqW :: Semiring w => RegW w c -> RegW w c -> RegW w c seqW p q = RegW (active p || active q) (empty p .*. empty q) (final p .*. empty q .+. final q) (Seq p q) -- | Matches zero or more occurrences of the given regular -- expression. For example @a*@ matches the character @a@ zero or -- more times. -- rep :: RegExp c -> RegExp c rep (RegExp r) = RegExp (RegW False one zero (Rep r)) repW :: Semiring w => RegW w c -> RegW w c repW r = RegW (active r) one (final r) (Rep r) -- | Matches one or more occurrences of the given regular -- expression. For example @a+@ matches the character @a@ one or -- more times. -- rep1 :: RegExp c -> RegExp c rep1 r = r `seq_` rep r -- | -- Matches the given regular expression or the empty word. Optional -- expressions are usually written @a?@ but could also be written -- @(|a)@, that is, as alternative between 'eps' and @a@. -- opt :: RegExp c -> RegExp c opt r = eps `alt` r -- | -- Matches a regular expression a given number of times. For example, -- the regular expression @a{4,7}@ matches the character @a@ four to -- seven times. If the minimal and maximal occurences are identical, -- one can be left out, that is, @a{2}@ matches two occurrences of the -- character @a@. -- -- Numerical bounds are implemented via translation into ordinary -- regular expressions. For example, @a{4,7}@ is translated into -- @aaaa(a(a(a)?)?)?@. -- brep :: (Int,Int) -> RegExp c -> RegExp c brep (n,m) r | n < 0 || m < 0 || n > m = error msg | n == 0 && m == 0 = eps | n == m = foldr1 seq_ (replicate n r) | otherwise = foldr seq_ rest (replicate n r) where rest = foldr nestopt (opt r) (replicate (m-n-1) r) nestopt p q = opt (seq_ p q) msg = "Text.RegExp.brep: invalid repetition bounds: " ++ show (n,m) regW :: Semiring w => RegExp c -> RegW w c regW (RegExp r) = r instance Show (RegExp Char) where showsPrec n r = showsPrec n (regW r :: RegW Bool Char) instance Show (RegW Bool Char) where showsPrec n r = showsPrec n (reg r) instance Show (Reg Bool Char) where showsPrec _ Eps = showString "()" showsPrec _ (Sym s _) = showString s showsPrec n (Alt p q) = showParen (n > 0) $ showsPrec 1 p . showString "|" . shows q showsPrec n (Seq p q) = showParen (n > 1) $ showsPrec 2 p . showsPrec 1 q showsPrec _ (Rep r) = showsPrec 2 r . showString "*" instance Eq (RegExp Char) where p == q = regW p == (regW q :: RegW Bool Char) instance Eq (RegW Bool Char) where p == q = reg p == reg q instance Eq (Reg Bool Char) where Eps == Eps = True Sym s _ == Sym t _ = s==t Alt a b == Alt c d = a==c && b==d Seq a b == Seq c d = a==c && b==d Rep a == Rep b = a==b _ == _ = False