{-# LANGUAGE FlexibleInstances,FlexibleContexts,DeriveGeneric,DeriveFunctor #-} module GenLanguage where import Data.String (IsString(..)) import Data.Text (Text) import qualified Data.Text as T import Data.List (transpose) import Control.Applicative (liftA2) import GHC.Generics (Generic) import Test.QuickCheck (Arbitrary(..)) import qualified Test.QuickCheck as Q import Data.Set (Set) import qualified Data.Set as Set import Data.Void (Void) import Text.Megaparsec -- * Generating random languages to parse -- ** Languages -- | The language is designed to test the basic -- parser operations concatenation and choice. data Language x = Word String | Choice x (Language x) (Language x) | Concat x (Language x) (Language x) deriving (Show,Eq,Ord,Generic,Functor) instance IsString (Language x) where fromString = Word instance Semigroup (Language ()) where (<>) = Concat () instance Semigroup (Language (Set Text)) where (Word w1) <> (Word w2) = Word (w1<>w2) x@(Word w) <> y@(Choice ws _ _) = let pfx = fromString w in Concat (Set.mapMonotonic (pfx<>) ws) x y x@(Word w) <> y@(Concat ws _ _) = let pfx = fromString w in Concat (Set.mapMonotonic (pfx<>) ws) x y x <> y = let ws = Set.map (uncurry (<>)) (Set.cartesianProduct (allWords x) (allWords y)) in Concat ws x y class Monad m => NonDeterministic m where nonDeterministically :: [m a] -> m a instance NonDeterministic [] where nonDeterministically = concat . transpose -- can enumerate finite choice of infinite lists instance NonDeterministic Q.Gen where nonDeterministically = Q.oneof class Conditionable m where suchThat :: m a -> (a -> Bool) -> m a instance Conditionable [] where suchThat = flip filter instance Conditionable Q.Gen where suchThat = Q.suchThat instance Conditionable Set where suchThat = flip Set.filter infixl 3 <||> -- | The choice operator of 'Language's (<||>) :: Language (Set Text) -> Language (Set Text) -> Language (Set Text) x <||> y = Choice (choiceWords (allWords x) (allWords y)) x y -- |Even with backtracking, -- a parser may fail to recognize a word of the language -- if the choices are ordered in a way such that -- an earlier choice contains a proper prefix of a later choice. -- Consider for example the regular expression -- -- @ -- (a|ab)c -- @ -- -- and the word @abc@ which is not accepted by the parser -- -- @ -- (try (chunk "a") <|> chunk "ab") <> chunk "c" -- @ -- -- but is accepted by the parser -- -- @ -- (try (chunk "ab") <|> chunk "a") <> chunk "c" -- @ choiceWords :: Set Text -> Set Text -> Set Text choiceWords left right = Set.union left (right `suchThat` notSuffixOf left) where notSuffixOf earlier = \w -> not (any (\a -> a `T.isPrefixOf` w) earlier) -- | 'nonDeterministically' sample from 'allWords' of a 'Language'. genWords :: NonDeterministic gen => Language (Set Text) -> gen Text genWords = nonDeterministically . fmap pure . Set.toList . allWords -- | We record the set of words in the constructor. allWords :: Language (Set Text) -> Set Text allWords (Word w) = Set.singleton (fromString w) allWords (Choice ws _ _) = ws allWords (Concat ws _ _) = ws -- ** parsing a language -- | generate a 'MonadParsec' parser for words of the given 'Language' genParser :: MonadParsec Void Text p => Language x -> p Text genParser (Word txt) = chunk (fromString txt) genParser (Choice _ x y) = try (genParser x) <|> genParser y -- backtracking choice genParser (Concat _ x y) = liftA2 (<>) (genParser x) (genParser y) -- ** non-deterministic language generation -- non-deterministically generate a language -- -- >>> mapM_ print $ fmap (const ()) $ genLanguage ["Foo","Bar"] 1 -- >>> Q.sample' (arbitrary :: Q.Gen (Language (Set Text))) >>= (print.fmap (const ()).head) genLanguage :: NonDeterministic gen => gen String -> Int -> gen (Language (Set Text)) genLanguage genWord = let sizedLang = \size -> if size <= 0 then fmap Word genWord else let lang' = sizedLang (size `div` 2) in nonDeterministically [ fmap Word genWord, liftA2 (<>) lang' lang', liftA2 (<||>) lang' lang' ] in sizedLang -- | We make single-letter alphanumeric words the basic building blocks of languages genAlphaChar :: NonDeterministic gen => gen Char genAlphaChar = nonDeterministically [return c | c <- ['a'..'z']] genWordQ :: Q.Gen String genWordQ = fmap pure genAlphaChar -- use single-letter words as building blocks -- genWordQ = let g = genAlphaChar in liftA2 (:) (fmap toUpper g) (Q.listOf g) instance Arbitrary (Language (Set Text)) where arbitrary = Q.sized (genLanguage genWordQ) shrink (Word _) = [] shrink (Concat _ lang1 lang2) = [lang1,lang2] ++ [x <> y | (x,y) <- shrink (lang1,lang2)] shrink (Choice _ lang1 lang2) = [lang1,lang2] ++ [x <||> y | (x,y) <- shrink (lang1,lang2)] -- maximum word length maxWord :: Language (Set Text) -> Int maxWord = Set.foldl' (\imum w -> max imum (T.length w)) 0 . allWords