-- | The data types that define a CFG. module FormalLanguage.CFG.Grammar.Types where import Control.Lens hiding (Index,index) import Data.Default import Data.Map.Strict (Map) import Data.Semigroup import Data.Set (Set) import Data.String import qualified Data.Map.Strict as M import qualified Data.Set as S import Data.Data (Data,Typeable) newtype IndexName = IndexName { _getIndexName :: String } deriving (Show,Eq,Ord,IsString,Data,Typeable) makeLenses ''IndexName data IOP = IPlus -- in rules | IMinus -- in rules | IEq -- in rules | INone -- grammar-global | ISymbol -- when creating the symbol, here @=n@ says to use @[0..n-1]@ deriving (Show,Eq,Ord,Data,Typeable) -- | Encode the index of the syntactic or terminal variable. -- -- In case of grammar-based indexing, keep @indexRange@ empty. The -- @indexStep@ keeps track of any @+k@ / @-k@ given in the production -- rules. -- -- We allow indexing terminals now, too. When glueing together terminals, -- one might want to be able to differentiate between terminals. data Index = Index { _indexName :: IndexName , _indexHere :: Integer , _indexOp :: IOP , _indexRange :: [Integer] , _indexStep :: Integer } -- TODO need a version, where we have figured out everything -- , i.e. replaced @i+2@ with, say, @1@ @(i==1+2 `mod` 3)@. -- Use the @_indexVar = j@ version in the set of syn-vars, but -- @_indexVar=x, x \in _indexRange@ in rules? deriving (Show,Eq,Ord,Data,Typeable) makeLenses ''Index -- | Newtype wrapper for symbol names. newtype SymbolName = SymbolName { _getSteName :: String } deriving (Show,Eq,Ord,IsString,Data,Typeable) makeLenses ''SymbolName -- | The tape, a terminal operates on. Terminals on different tapes could -- still have the same @SymbolName@ but different type and input! newtype Tape = Tape { _getTape :: Int } deriving (Show,Eq,Ord,Enum,Num,Data,Typeable) makeLenses ''Tape -- | Symbols, potentially with an index or more than one. data SynTermEps -- | Syntactic variables. = SynVar { _name :: SymbolName , _index :: [Index] , _splitN :: Integer , _splitK :: Integer } -- syntactic terminals. Inside-synvars used in an outside context. | SynTerm { _name :: SymbolName , _index :: [Index] } -- | Regular old terminal symbol -- reads stuff from the input. | Term { _name :: SymbolName , _index :: [Index] } -- | This sym denotes the case, where we have an @Deletion@ terminal, i.e. -- something is matched to nothing. This is actually just a regular -- terminal symbol, we just treat it differently. | Deletion -- | Finally, a real epsilon. Again, these are somewhat regular terminal -- symbols, but it is important to be able to recognize these, when -- trying to create outside variants of our algorithms. | Epsilon deriving (Show,Eq,Ord,Data,Typeable) makeLenses ''SynTermEps makePrisms ''SynTermEps -- | The length of the list encodes the dimension of the symbol. Forms a monoid -- over dimensional concatenation. newtype Symbol = Symbol { _getSymbolList :: [SynTermEps] } deriving (Show,Eq,Ord,Monoid,Semigroup,Data,Typeable) makeLenses ''Symbol -- | The name of an attribute function newtype AttributeFunction = Attr { _getAttr :: String } deriving (Show,Eq,Ord,IsString,Data,Typeable) makeLenses ''AttributeFunction -- | Production rules for at-most CFGs. data Rule = Rule { _lhs :: Symbol -- ^ the left-hand side of the rule , _attr :: [AttributeFunction] -- ^ the attribute for this rule , _rhs :: [Symbol] -- ^ the right-hand side with a collection of terminals and syntactic variables } deriving (Show,Eq,Ord,Data,Typeable) makeLenses ''Rule data DerivedGrammar = Inside | Outside String deriving (Show,Eq,Data,Typeable) isOutside (Outside _) = True isOutside _ = False instance Default DerivedGrammar where def = Inside makeLenses ''DerivedGrammar -- | Complete descrition of a grammar. In principle it would be enough to hold -- @_rules@ and the @_start@ symbol name. We also store dimensionless names for -- syntactiv variables, and terminals. This makes certain checks easier or -- possible. -- -- We store all single-tape symbol names dimensionless. This means that, for -- terminals, symbols with the same name have the same tape. This is slightly -- inconvenient for special applications (say Protein-DNA alignment) but one -- can easily rename terminals. -- -- TODO better way to handle indexed symbols? data Grammar = Grammar { _synvars :: Map SymbolName SynTermEps -- ^ regular syntactic variables, without dimension , _synterms :: Map SymbolName SynTermEps -- ^ Terminal synvars are somewhat weird. They are used in Outside -- grammars, and hold previously calculated inside values. , _termvars :: Map SymbolName SynTermEps -- ^ regular terminal symbols , _outside :: DerivedGrammar -- ^ Is this an automatically derived outside grammar , _rules :: Set Rule -- ^ set of production rules , _start :: Symbol -- ^ start symbol , _params :: Map IndexName Index -- ^ any global variables , _indices :: Map IndexName Index -- ^ active indices , _grammarName :: String -- ^ grammar name , _write :: Bool -- ^ some grammar file requested this grammar to be expanded into code -- -- TODO remove, we have an emission queue } deriving (Show,Data,Typeable) instance Default Grammar where def = Grammar { _synvars = M.empty , _synterms = M.empty , _termvars = M.empty , _outside = def , _rules = S.empty , _start = mempty , _params = M.empty , _indices = M.empty , _grammarName = "" , _write = False } makeLenses ''Grammar