{-# LANGUAGE BangPatterns #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FunctionalDependencies #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE Safe #-} {-# LANGUAGE ScopedTypeVariables #-} module Kleene.RE ( RE (..), -- * Construction -- -- | Binary operators are -- -- * '<>' for append -- * '\/' for union -- empty, eps, char, charRange, anyChar, appends, unions, star, string, -- * Derivative nullable, derivate, -- * Transition map transitionMap, leadingChars, -- * Equivalence equivalent, -- * Generation generate, -- * Other isEmpty, ) where import Prelude () import Prelude.Compat import Algebra.Lattice (BoundedJoinSemiLattice (..), JoinSemiLattice (..)) import Control.Applicative (liftA2) import Data.Foldable (toList) import Data.List (foldl') import Data.Map (Map) import Data.RangeSet.Map (RSet) import Data.Set (Set) import Data.String (IsString (..)) import qualified Data.Function.Step.Discrete.Closed as SF import qualified Data.Map as Map import qualified Data.RangeSet.Map as RSet import qualified Data.Set as Set import qualified Test.QuickCheck as QC import qualified Test.QuickCheck.Gen as QC (unGen) import qualified Test.QuickCheck.Random as QC (mkQCGen) import qualified Kleene.Classes as C import qualified Kleene.Internal.Partition as P import Kleene.Internal.Pretty -- | Regular expression -- -- Constructors are exposed, but you should use -- smart constructors in this module to construct 'RE'. -- -- The 'Eq' and 'Ord' instances are structural. -- The 'Kleene' etc constructors do "weak normalisation", so for values -- constructed using those operations 'Eq' witnesses "weak equivalence". -- See 'equivalent' for regular-expression equivalence. -- -- Structure is exposed in "Kleene.RE" module but consider constructors as -- half-internal. There are soft-invariants, but violating them shouldn't -- break anything in the package. (e.g. 'transitionMap' will eventually -- terminate, but may create more redundant states if starting regexp is not -- "weakly normalised"). -- data RE c = REChars (RSet c) -- ^ Single character | REAppend [RE c] -- ^ Concatenation | REUnion (RSet c) (Set (RE c)) -- ^ Union | REStar (RE c) -- ^ Kleene star deriving (Eq, Ord, Show) ------------------------------------------------------------------------------- -- Smart constructor ------------------------------------------------------------------------------- -- | Empty regex. Doesn't accept anything. -- -- >>> putPretty (empty :: RE Char) -- ^[]$ -- -- >>> putPretty (bottom :: RE Char) -- ^[]$ -- -- prop> match (empty :: RE Char) (s :: String) === False -- empty :: RE c empty = REChars RSet.empty -- | Everything. -- -- >>> putPretty everything -- ^[^]*$ -- -- prop> match (everything :: RE Char) (s :: String) === True -- everything :: Bounded c => RE c everything = REStar (REChars RSet.full) -- | Empty string. /Note:/ different than 'empty'. -- -- >>> putPretty eps -- ^$ -- -- >>> putPretty (mempty :: RE Char) -- ^$ -- -- prop> match (eps :: RE Char) s === null (s :: String) -- eps :: RE c eps = REAppend [] -- | -- -- >>> putPretty (char 'x') -- ^x$ -- char :: c -> RE c char = REChars . RSet.singleton -- | -- -- >>> putPretty $ charRange 'a' 'z' -- ^[a-z]$ -- charRange :: Ord c => c -> c -> RE c charRange c c' = REChars $ RSet.singletonRange (c, c') -- | Any character. /Note:/ different than dot! -- -- >>> putPretty anyChar -- ^[^]$ -- anyChar :: Bounded c => RE c anyChar = REChars RSet.full -- | Concatenate regular expressions. -- -- prop> (asREChar r <> s) <> t === r <> (s <> t) -- -- prop> asREChar r <> empty === empty -- prop> empty <> asREChar r === empty -- -- prop> asREChar r <> eps === r -- prop> eps <> asREChar r === r -- appends :: Eq c => [RE c] -> RE c appends rs0 | elem empty rs1 = empty | otherwise = case rs1 of [r] -> r rs -> REAppend rs where -- flatten one level of REAppend rs1 = concatMap f rs0 f (REAppend rs) = rs f r = [r] -- | Union of regular expressions. -- -- prop> asREChar r \/ r === r -- prop> asREChar r \/ s === s \/ r -- prop> (asREChar r \/ s) \/ t === r \/ (s \/ t) -- -- prop> empty \/ asREChar r === r -- prop> asREChar r \/ empty === r -- -- prop> everything \/ asREChar r === everything -- prop> asREChar r \/ everything === everything -- unions :: (Ord c, Enum c, Bounded c) => [RE c] -> RE c unions = uncurry mk . foldMap f where mk cs rss | Set.null rss = REChars cs | Set.member everything rss = everything | RSet.null cs = case Set.toList rss of [] -> empty [r] -> r _ -> REUnion cs rss | otherwise = REUnion cs rss f (REUnion cs rs) = (cs, rs) f (REChars cs) = (cs, Set.empty) f r = (mempty, Set.singleton r) -- | Kleene star. -- -- prop> star (star r) === star (asREChar r) -- -- prop> star eps === asREChar eps -- prop> star empty === asREChar eps -- prop> star anyChar === asREChar everything -- -- prop> star (r \/ eps) === star (asREChar r) -- prop> star (char c \/ eps) === star (asREChar (char c)) -- prop> star (empty \/ eps) === asREChar eps -- star :: Ord c => RE c -> RE c star r = case r of REStar _ -> r REAppend [] -> eps REChars cs | RSet.null cs -> eps REUnion cs rs | Set.member eps rs -> case Set.toList rs' of [] -> star (REChars cs) [r'] | RSet.null cs -> star r' _ -> REStar (REUnion cs rs') where rs' = Set.delete eps rs _ -> REStar r -- | Literal string. -- -- >>> putPretty ("foobar" :: RE Char) -- ^foobar$ -- -- >>> putPretty ("(.)" :: RE Char) -- ^\(\.\)$ -- string :: [c] -> RE c string [] = eps string [c] = REChars (RSet.singleton c) string cs = REAppend $ map (REChars . RSet.singleton) cs instance (Ord c, Enum c, Bounded c) => C.Kleene c (RE c) where empty = empty eps = eps char = char appends = appends unions = unions star = star instance (Ord c, Enum c, Bounded c) => C.FiniteKleene c (RE c) where everything = everything charRange = charRange fromRSet = REChars anyChar = anyChar ------------------------------------------------------------------------------- -- derivative ------------------------------------------------------------------------------- -- | We say that a regular expression r is nullable if the language it defines -- contains the empty string. -- -- >>> nullable eps -- True -- -- >>> nullable (star "x") -- True -- -- >>> nullable "foo" -- False -- nullable :: RE c -> Bool nullable (REChars _) = False nullable (REAppend rs) = all nullable rs nullable (REUnion _cs rs) = any nullable rs nullable (REStar _) = True -- | Intuitively, the derivative of a language \(\mathcal{L} \subset \Sigma^\star\) -- with respect to a symbol \(a \in \Sigma\) is the language that includes only -- those suffixes of strings with a leading symbol \(a\) in \(\mathcal{L}\). -- -- >>> putPretty $ derivate 'f' "foobar" -- ^oobar$ -- -- >>> putPretty $ derivate 'x' $ "xyz" \/ "abc" -- ^yz$ -- -- >>> putPretty $ derivate 'x' $ star "xyz" -- ^yz(xyz)*$ -- derivate :: (Ord c, Enum c, Bounded c) => c -> RE c -> RE c derivate c (REChars cs) = derivateChars c cs derivate c (REUnion cs rs) = unions $ derivateChars c cs : [ derivate c r | r <- toList rs] derivate c (REAppend rs) = derivateAppend c rs derivate c rs@(REStar r) = derivate c r <> rs derivateAppend :: (Ord c, Enum c, Bounded c) => c -> [RE c] -> RE c derivateAppend _ [] = empty derivateAppend c [r] = derivate c r derivateAppend c (r:rs) | nullable r = unions [r' <> appends rs, rs'] | otherwise = r' <> appends rs where r' = derivate c r rs' = derivateAppend c rs derivateChars :: Ord c => c -> RSet c -> RE c derivateChars c cs | c `RSet.member` cs = eps | otherwise = empty instance (Ord c, Enum c, Bounded c) => C.Derivate c (RE c) where nullable = nullable derivate = derivate instance (Ord c, Enum c, Bounded c) => C.Match c (RE c) where match r = nullable . foldl' (flip derivate) r ------------------------------------------------------------------------------- -- isEmpty ------------------------------------------------------------------------------- -- | Whether 'RE' is (structurally) equal to 'empty'. -- -- prop> isEmpty r === all (not . nullable) (Map.keys $ transitionMap $ asREChar r) isEmpty :: RE c -> Bool isEmpty (REChars rs) = RSet.null rs isEmpty _ = False ------------------------------------------------------------------------------- -- States ------------------------------------------------------------------------------- -- | Transition map. Used to construct 'Kleene.DFA.DFA'. -- -- >>> void $ Map.traverseWithKey (\k v -> putStrLn $ pretty k ++ " : " ++ SF.showSF (fmap pretty v)) $ transitionMap ("ab" :: RE Char) -- ^[]$ : \_ -> "^[]$" -- ^b$ : \x -> if -- | x <= 'a' -> "^[]$" -- | x <= 'b' -> "^$" -- | otherwise -> "^[]$" -- ^$ : \_ -> "^[]$" -- ^ab$ : \x -> if -- | x <= '`' -> "^[]$" -- | x <= 'a' -> "^b$" -- | otherwise -> "^[]$" -- transitionMap :: forall c. (Ord c, Enum c, Bounded c) => RE c -> Map (RE c) (SF.SF c (RE c)) transitionMap re = go Map.empty [re] where go :: Map (RE c) (SF.SF c (RE c)) -> [RE c] -> Map (RE c) (SF.SF c (RE c)) go !acc [] = acc go acc (r : rs) | r `Map.member` acc = go acc rs | otherwise = go (Map.insert r pm acc) (SF.values pm ++ rs) where pm = P.toSF (\c -> derivate c r) (leadingChars r) instance (Ord c, Enum c, Bounded c) => C.TransitionMap c (RE c) where transitionMap = transitionMap -- | Leading character sets of regular expression. -- -- >>> leadingChars "foo" -- fromSeparators "ef" -- -- >>> leadingChars (star "b" <> star "e") -- fromSeparators "abde" -- -- >>> leadingChars (charRange 'b' 'z') -- fromSeparators "az" -- leadingChars :: (Ord c, Enum c, Bounded c) => RE c -> P.Partition c leadingChars (REChars cs) = P.fromRSet cs leadingChars (REUnion cs rs) = P.fromRSet cs <> foldMap leadingChars rs leadingChars (REStar r) = leadingChars r leadingChars (REAppend rs) = leadingCharsAppend rs leadingCharsAppend :: (Ord c, Enum c, Bounded c) => [RE c] -> P.Partition c leadingCharsAppend [] = P.whole leadingCharsAppend (r : rs) | nullable r = leadingChars r <> leadingCharsAppend rs | otherwise = leadingChars r ------------------------------------------------------------------------------- -- Equivalence ------------------------------------------------------------------------------- -- | Whether two regexps are equivalent. -- -- @ -- 'equivalent' re1 re2 <=> forall s. 'match' re1 s === 'match' re1 s -- @ -- -- >>> let re1 = star "a" <> "a" -- >>> let re2 = "a" <> star "a" -- -- These are different regular expressions, even we perform -- some normalisation-on-construction: -- -- >>> re1 == re2 -- False -- -- They are however equivalent: -- -- >>> equivalent re1 re2 -- True -- -- The algorithm works by executing 'states' on "product" regexp, -- and checking whether all resulting states are both accepting or rejecting. -- -- @ -- re1 == re2 ==> 'equivalent' re1 re2 -- @ -- -- === More examples -- -- >>> let example re1 re2 = putPretty re1 >> putPretty re2 >> return (equivalent re1 re2) -- >>> example re1 re2 -- ^a*a$ -- ^aa*$ -- True -- -- >>> example (star "aa") (star "aaa") -- ^(aa)*$ -- ^(aaa)*$ -- False -- -- >>> example (star "aa" <> star "aaa") (star "aaa" <> star "aa") -- ^(aa)*(aaa)*$ -- ^(aaa)*(aa)*$ -- True -- -- >>> example (star ("a" \/ "b")) (star $ star "a" <> star "b") -- ^[a-b]*$ -- ^(a*b*)*$ -- True -- equivalent :: forall c. (Ord c, Enum c, Bounded c) => RE c -> RE c -> Bool equivalent x0 y0 = go mempty [(x0, y0)] where go :: Set (RE c, RE c) -> [(RE c, RE c)] -> Bool go !_ [] = True go acc (p@(x, y) : zs) | p `Set.member` acc = go acc zs -- if two regexps are structurally the same, we don't need to recurse. | x == y = go (Set.insert p acc) zs | all agree ps = go (Set.insert p acc) (ps ++ zs) | otherwise = False where cs = toList $ P.examples $ leadingChars x `P.wedge` leadingChars y ps = map (\c -> (derivate c x, derivate c y)) cs agree :: (RE c, RE c) -> Bool agree (x, y) = nullable x == nullable y instance (Ord c, Enum c, Bounded c) => C.Equivalent c (RE c) where equivalent = equivalent ------------------------------------------------------------------------------- -- Generation ------------------------------------------------------------------------------- -- | Generate random strings of the language @RE c@ describes. -- -- >>> let example = traverse_ print . take 3 . generate (curry QC.choose) 42 -- >>> example "abc" -- "abc" -- "abc" -- "abc" -- -- >>> example $ star $ "a" \/ "b" -- "aaaaba" -- "bbba" -- "abbbbaaaa" -- -- >>> example empty -- -- prop> all (match r) $ take 10 $ generate (curry QC.choose) 42 (r :: RE Char) -- generate :: (c -> c -> QC.Gen c) -- ^ character range generator -> Int -- ^ seed -> RE c -> [[c]] -- ^ infinite list of results generate c seed re | isEmpty re = [] | otherwise = QC.unGen (QC.infiniteListOf (generator c re)) (QC.mkQCGen seed) 10 generator :: (c -> c -> QC.Gen c) -> RE c -> QC.Gen [c] generator c = go where go (REChars cs) = goChars cs go (REAppend rs) = concat <$> traverse go rs go (REUnion cs rs) | RSet.null cs = QC.oneof [ go r | r <- toList rs ] | otherwise = QC.oneof $ goChars cs : [ go r | r <- toList rs ] go (REStar r) = QC.sized $ \n -> do n' <- QC.choose (0, n) concat <$> sequence (replicate n' (go r)) goChars cs = pure <$> QC.oneof [ c x y | (x,y) <- RSet.toRangeList cs ] ------------------------------------------------------------------------------- -- Instances ------------------------------------------------------------------------------- instance Eq c => Semigroup (RE c) where r <> r' = appends [r, r'] instance Eq c => Monoid (RE c) where mempty = eps mappend = (<>) mconcat = appends instance (Ord c, Enum c, Bounded c) => JoinSemiLattice (RE c) where r \/ r' = unions [r, r'] instance (Ord c, Enum c, Bounded c) => BoundedJoinSemiLattice (RE c) where bottom = empty instance c ~ Char => IsString (RE c) where fromString = string instance (Ord c, Enum c, Bounded c, QC.Arbitrary c) => QC.Arbitrary (RE c) where arbitrary = QC.sized arb where c :: QC.Gen (RE c) c = REChars . RSet.fromRangeList <$> QC.arbitrary arb :: Int -> QC.Gen (RE c) arb n | n <= 0 = QC.oneof [c, fmap char QC.arbitrary, pure eps] | otherwise = QC.oneof [ c , pure eps , fmap char QC.arbitrary , liftA2 (<>) (arb n2) (arb n2) , liftA2 (\/) (arb n2) (arb n2) , fmap star (arb n2) ] where n2 = n `div` 2 instance (QC.CoArbitrary c) => QC.CoArbitrary (RE c) where coarbitrary (REChars cs) = QC.variant (0 :: Int) . QC.coarbitrary (RSet.toRangeList cs) coarbitrary (REAppend rs) = QC.variant (1 :: Int) . QC.coarbitrary rs coarbitrary (REUnion cs rs) = QC.variant (2 :: Int) . QC.coarbitrary (RSet.toRangeList cs, Set.toList rs) coarbitrary (REStar r) = QC.variant (3 :: Int) . QC.coarbitrary r ------------------------------------------------------------------------------- -- JavaScript ------------------------------------------------------------------------------- instance c ~ Char => Pretty (RE c) where prettyS x = showChar '^' . go False x . showChar '$' where go :: Bool -> RE Char -> ShowS go p (REStar a) = parens p $ go True a . showChar '*' go p (REAppend rs) = parens p $ goMany id rs go p (REUnion cs rs) | RSet.null cs = goUnion p rs | Set.null rs = prettyS cs | otherwise = goUnion p (Set.insert (REChars cs) rs) go _ (REChars cs) = prettyS cs goUnion p rs | Set.member eps rs = parens p $ goUnion' True . showChar '?' | otherwise = goUnion' p where goUnion' p' = case Set.toList (Set.delete eps rs) of [] -> go True empty [r] -> go p' r (r:rs') -> parens True $ goSome1 (showChar '|') r rs' goMany :: ShowS -> [RE Char] -> ShowS goMany sep = foldr (\a b -> go False a . sep . b) id goSome1 :: ShowS -> RE Char -> [RE Char] -> ShowS goSome1 sep r = foldl (\a b -> a . sep . go False b) (go False r) parens :: Bool -> ShowS -> ShowS parens True s = showString "(" . s . showChar ')' parens False s = s ------------------------------------------------------------------------------- -- Doctest ------------------------------------------------------------------------------- -- $setup -- >>> :set -XOverloadedStrings -- >>> import Control.Monad (void) -- >>> import Data.Foldable (traverse_) -- >>> import Data.List (sort) -- -- >>> import Test.QuickCheck ((===)) -- >>> import qualified Test.QuickCheck as QC -- -- >>> import Kleene.Classes (match) -- >>> let asREChar :: RE Char -> RE Char; asREChar = id