> {-# LANGUAGE GeneralizedNewtypeDeriving #-}
> {-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-}
> {-# LANGUAGE ScopedTypeVariables #-}
> {-# LANGUAGE OverloadedStrings #-}
> {-# OPTIONS_GHC -fno-warn-missing-methods -fno-warn-orphans #-}

We specify a `Monoid` instance for a `newtype` of lists.

> import Data.Monoid ( Monoid(..) )

We use QuickCheck version 1 for testing because version 2 cannot be
used in batch mode.

> import Test.QuickCheck
> import Test.QuickCheck.Batch
> import Control.Monad ( ap, replicateM )
> import Data.Char ( chr, ord )
> import Data.List ( permutations )

We import the semiring properties in order to check them for the
defined instances. We also define our own `sum` function for
semirings.

> import Data.Semiring.Properties
> import Prelude hiding ( sum )

Finally, we need the `RegExp` datatype, the `symWeight` function from
the `Weight` class, and the different semirings used for matching.

> import Text.RegExp
> import Text.RegExp.Data
> import Text.RegExp.Matching.Leftmost.Type ( Leftmost(..) )
> import Text.RegExp.Matching.Longest.Type  ( Longest(..) )
> import Text.RegExp.Matching.LeftLong.Type ( LeftLong(..) )
> import Text.RegExp.Matching.Leftmost ( getLeftmost )
> import Text.RegExp.Matching.Longest  ( getLongest )
> import Text.RegExp.Matching.LeftLong ( getLeftLong )
> import qualified Text.RegExp.Matching.Leftmost as Leftmost
> import qualified Text.RegExp.Matching.Longest  as Longest
> import qualified Text.RegExp.Matching.LeftLong as LeftLong

The `main` function runs all tests defined in this program.

> main :: IO ()
> main = 
>  do runChecks "semiring laws (Bool)" (semiring'laws :: Checks Bool)
>     runChecks "semiring laws (Int)" (semiring'laws :: Checks (Numeric Int))
>     runChecks "semiring laws (Leftmost)" (semiring'laws :: Checks Leftmost)
>     runChecks "semiring laws (Longest)" (semiring'laws :: Checks Longest)
>     runChecks "semiring laws (LeftLong)" semiring'laws'LeftLong
>     runTests (pad "full match") options $
>       checks (full'match'spec acceptFull id :: Checks Bool) ++
>       checks (full'match'spec matchingCount getNumeric
>               :: Checks (Numeric Int))
>     runTests (pad "partial match") options $
>       checks (partial'match'spec acceptPartial id :: Checks Bool) ++
>       checks (indexed'match'spec Leftmost.matching getLeftmost) ++
>       checks (partial'match'spec Longest.matching getLongest) ++
>       checks (indexed'match'spec LeftLong.matching getLeftLong)
>     runTests (pad "parse printed regexp") options [run parse'printed]
>     runChecks "lazy infinite regexps" infinite'regexp'checks
>     runTests "permutation parsing" options [run perm'parser'check]
>  where
>   options = defOpt { no_of_tests = 1000, length_of_tests = 60 }
>   runChecks s = runTests (pad s) options . checks
>   pad s = replicate (25-length s) ' ' ++ s

The `Arbitrary` instance for numeric types wraps the underlying
instance. We also provide one for `Char` which is not predefined.

> instance (Num a, Arbitrary a) => Arbitrary (Numeric a) where
>   arbitrary = Numeric `fmap` arbitrary
>
> instance Arbitrary Char where
>   arbitrary = elements "abcde \\|*+?.[]{}"

We provide generic `Semiring` instances for the semirings used for
matching.

> instance Arbitrary Leftmost where
>   arbitrary = frequency [ (1, return zero)
>                         , (1, return one)
>                         , (3, (Leftmost . abs) `fmap` arbitrary) ]
>
> instance Arbitrary Longest where
>   arbitrary = frequency [ (1, return zero) 
>                         , (1, return one)
>                         , (3, (Longest . succ . abs) `fmap` arbitrary) ]
>
> instance Arbitrary LeftLong where
>   arbitrary = frequency [ (1, return zero)
>                         , (1, return one)
>                         , (3, do x <- abs `fmap` arbitrary
>                                  y <- abs `fmap` arbitrary
>                                  return $ LeftLong (min x y) (max x y)) ]

We define a list of `Checks` for the semiring laws.

> semiring'laws :: (Arbitrary s, Show s, Semiring s) => Checks s
> semiring'laws = mconcat [ prop2 plus'comm
>                         , prop1 left'zero
>                         , prop3 add'assoc
>                         , prop1 left'one
>                         , prop1 right'one
>                         , prop3 mul'assoc
>                         , prop3 left'distr
>                         , prop3 right'distr
>                         , prop1 left'ann
>                         , prop1 right'ann
>                         ]

`Checks` is a `newtype` for a list of batch tests with a phantom type
that can be used in definitions of the properties.

> newtype Checks a = Checks { checks :: [TestOptions -> IO TestResult] }
>  deriving ( Monoid )

We define the auxiliary functions to create semiring properties with
different arities.

> prop1 :: (Arbitrary s, Show s, Testable a) => (s -> a) -> Checks s
> prop1 prop = Checks [run prop]
>
> prop2 :: (Arbitrary s, Show s, Testable a) => (s -> s -> a) -> Checks s
> prop2 prop = Checks [run prop]
>
> prop3 :: (Arbitrary s, Show s, Testable a) => (s-> s -> s -> a) -> Checks s
> prop3 prop = Checks [run prop]

The `LeftLong` type satisfies the distributive laws only with a
precondition on all involved multiplications: multiplied matches must
be adjacent and the start position must be smaller than the end
position. This precondition is satisfied for all multiplications
during regular expression matching.

We define a variant of `semiring'laws` with this precondition on the
distributive laws.

> semiring'laws'LeftLong :: Checks LeftLong
> semiring'laws'LeftLong = mconcat
>   [ prop2 plus'comm
>   , prop1 left'zero
>   , prop3 add'assoc
>   , prop1 left'one
>   , prop1 right'one
>   , prop3 mul'assoc
>   , prop3 left'distr'LeftLong
>   , prop3 right'distr'LeftLong
>   , prop1 left'ann
>   , prop1 right'ann
>   ]

For testing the distributive laws, we adjust the randomly generated
`LeftLong` values such that the arguments of multiplications are
adjacent.

> left'distr'LeftLong :: LeftLong -> LeftLong -> LeftLong -> Bool
> left'distr'LeftLong a b c = left'distr a (shift a b) (shift a c)
>  where
>   shift (LeftLong _ x) (LeftLong y z) = LeftLong (x+1) (z+x+1-y)
>   shift _              x              = x
>
> right'distr'LeftLong :: LeftLong -> LeftLong -> LeftLong -> Bool
> right'distr'LeftLong a b c = right'distr (shift a c) (shift b c) c
>  where
>   shift (LeftLong x y) (LeftLong z _) = LeftLong (x+z-1-y) (z-1)
>   shift x              _              = x

Now we turn to the correctness of the `match` function. In order to
check it, we compare it with a executable specification which is
correct by definition:

> full'match'spec :: (Show a, Weight Char Char s)
>                 => (RegExp Char -> String -> a)
>                 -> (s -> a)
>                 -> Checks s
> full'match'spec = match'spec fullMatchSpec
>
> partial'match'spec :: (Show a, Weight Char Char s)
>                    => (RegExp Char -> String -> a)
>                    -> (s -> a)
>                    -> Checks s
> partial'match'spec = match'spec partialMatchSpec
>
> indexed'match'spec :: (Show a, Weight Char (Int,Char) s)
>                    => (RegExp Char -> String -> a)
>                    -> (s -> a)
>                    -> Checks s
> indexed'match'spec = match'spec (\r -> partialMatchSpec r . zip [(0::Int)..])
>
> match'spec :: (Show a, Semiring s)
>            => (RegExp Char -> String -> s)
>            -> (RegExp Char -> String -> a)
>            -> (s -> a)
>            -> Checks s
> match'spec spec convmatch conv =
>   Checks [run (check'match'spec spec convmatch conv)]
>

> check'match'spec :: (Show a, Semiring s)
>                  => (RegExp Char -> String -> s)
>                  -> (RegExp Char -> String -> a)
>                  -> (s -> a)
>                  -> RegExp Char -> String -> Bool
> check'match'spec spec convmatch conv r s =
>   show (convmatch r s') == show (conv (spec r s'))
>  where s' = take 5 s

To make this work, we need an `Arbitrary` instance for regular
expressions.

> instance Arbitrary (RegExp Char) where
>   arbitrary = sized regexp
>
> regexp :: Int -> Gen (RegExp Char)
> regexp 0 = frequency [ (1, return eps)
>                      , (4, char `fmap` simpleChar) ]
> regexp n = frequency [ (3, regexp 0)
>                      , (1, alt  `fmap` subexp `ap` subexp)
>                      , (2, seq_ `fmap` subexp `ap` subexp)
>                      , (1, rep  `fmap` regexp (n-1))
>                      , (2, fromString `fmap` parsedRegExp n) ]
>  where subexp = regexp (n `div` 2)
>
> simpleChar :: Gen Char
> simpleChar = elements "abcde"
>
> parsedRegExp :: Int -> Gen String
> parsedRegExp n = frequency [ (4, symClass)
>                            , (2, (++"?") `fmap` subexp)
>                            , (2, (++"+") `fmap` subexp)
>                            , (1, mkBrep1 =<< subexp)
>                            , (1, mkBrep2 =<< subexp) ]
>  where
>   subexp = (($"") . showParen True . shows)
>     `fmap` (resize (n-1) arbitrary :: Gen (RegExp Char))
>
>   mkBrep1 r = do x <- elements [0..3] :: Gen Int
>                  return $ r ++ "{" ++ show x ++ "}"
>
>   mkBrep2 r = do x <- elements [0..2] :: Gen Int
>                  y <- elements [0..2] :: Gen Int
>                  return $ r ++ "{" ++ show x ++ "," ++ show (x+y) ++ "}"
>
> symClass :: Gen String
> symClass = frequency [ (1, specialChar)
>                      , (2, do n <- choose (0,3)
>                               cs <- replicateM n charClass
>                               s <- (["","^"]!!) `fmap` choose (0,1)
>                               return $ "[" ++ s ++ concat cs ++ "]") ]
>  where
>   specialChar = elements (map (:[]) "." ++
>                           map (\c -> '\\':[c]) "abcdewWdDsS \\|*+?.[]{}^")
>   charClass   = oneof [ (:[]) `fmap` simpleChar
>                       , specialChar
>                       , do x <- simpleChar
>                            y <- simpleChar
>                            return $ x : '-' : [chr (ord x+ord y-ord 'a')] ]

The specification of the matching function is defined inductively on
the structure of a regular expression. It uses exhaustive search to
find all possibilities to match a regexp against a word.

> fullMatchSpec :: Weight a b s => RegExp a -> [b] -> s
> fullMatchSpec (RegExp r) = matchSpec (reg (weighted r))
>
> matchSpec :: Semiring s => Reg s c -> [c] -> s
> matchSpec Eps        u  =  if null u then one else zero
> matchSpec (Sym _ f)  u  =  case u of [c] -> f c; _ -> zero
> matchSpec (Alt p q)  u  =  matchSpec (reg p) u .+. matchSpec (reg q) u
> matchSpec (Seq p q)  u  =
>   sum [ matchSpec (reg p) u1 .*. matchSpec (reg q) u2 | (u1,u2) <- split u ]
> matchSpec (Rep p)    u  =
>   sum [ prod [ matchSpec (reg p) ui | ui <- ps] | ps <- parts u ]
>
> sum, prod :: Semiring s => [s] -> s
> sum   =  foldr (.+.) zero
> prod  =  foldr (.*.) one
>
> split :: [a] -> [([a],[a])]
> split []      =  [([],[])]
> split (c:cs)  =  ([],c:cs) : [ (c:s1,s2) | (s1,s2) <- split cs ]
>
> parts :: [a] ->  [[[a]]]
> parts []      =  [[]]
> parts [c]     =  [[[c]]]
> parts (c:cs)  =  concat [ [(c:p):ps,[c]:p:ps] | p:ps <- parts cs ]

We can perform a similar test for partial instead of full matches.

> partialMatchSpec :: Weight a b s => RegExp a -> [b] -> s
> partialMatchSpec (RegExp r) =
>   matchSpec (reg (arb `seqW` weighted r `seqW` arb))
>  where RegExp arb = rep anySym

As a check for the parser, we check whether the representation
generated by the `Show` instance of regular expressions can be parsed
back and yields the original expression.

> parse'printed :: RegExp Char -> Bool
> parse'printed r = fromString (show r) == r

We can also match infinite regular expressions lazily to recognize
context-free or even context-sensitive languages.

> infinite'regexp'checks :: Checks Bool
> infinite'regexp'checks = Checks [run context'free, run context'sensitive]

As an example for a context-free language, we recognize the language
 ${a^nb^n | n >= 0}$.

> context'free :: String -> Bool
> context'free s = isInAnBn s == (anbn =~ s)
>
> isInAnBn :: String -> Bool
> isInAnBn s = all (=='a') xs && all (=='b') ys && length xs == length ys
>  where (xs,ys) = break (=='b') s
>
> anbn :: RegExp Char
> anbn = eps `alt` seq_ "a" (anbn `seq_` "b")

As an example for a context-sensitive language we use the language
${a^nb^nc^n | n >= 0}$. To show that the alphabet cannot only contain
characters, we use numbers instead of characters.

> context'sensitive :: [Int] -> Bool
> context'sensitive s = isInAnBnCn s == acceptFull anbncn s
>
> isInAnBnCn :: [Int] -> Bool
> isInAnBnCn s = all (==1) xs && all (==2) ys && all (==3) zs
>             && length xs == length ys && length ys == length zs
>  where (xs,l)  = break (==2) s
>        (ys,zs) = break (==3) l
>
> anbncn :: RegExp Int
> anbncn = mkAnBnCn 0
>  where
>   mkAnBnCn n = brep (n,n) (sym 2) `seq_` brep (n,n) (sym 3)
>          `alt` seq_ (sym 1) (mkAnBnCn (n+1))

The library provides a combinator that matches a list of regular
expressions in sequence, each occurring once in any order.

> perm'parser'check :: String -> Bool
> perm'parser'check cs = all (acceptFull (perm (map char s))) (permutations s)
>  where s = take 5 cs

We restrict the test to at most 5! (that is five factorial)
permutations because otherwise there are too many. Note that it is
possible to match much longer permutations:

    ghci> accept (perm (map char ['a'..'z'])) $ reverse ['a'..'z']
    True
    (0.05 secs, 8706356 bytes)

But matching `perm (map char ['a'..'z'])` against *all* permutations
of ['a'..'z'] takes too long.