{-# LANGUAGE DataKinds #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE StandaloneDeriving #-} -- | This module holds type definitions that describe the internal -- representation of language syntaxen as understood by nanopass. module Nanopass.Internal.Representation ( -- * Types for Base Languages -- $ir Language(..) , LanguageInfo(..) , Nonterm(..) , Production(..) , TypeDesc(..) -- * Types for Modifying Manguages , LangMod(..) , NontermsEdit(..) , ProductionsEdit(..) -- * Types for Passes , Pass(..) -- * Helper Types , UpName, toUpName, fromUpName , LowName, toLowName, fromLowName , UpDotName, toUpDotName, fromUpDotName, splitUpDotName , unDotted, upDotQualifier, upDotBase, upDotChBase , Name(..), Validate(..) ) where import Data.Char (isLower,isUpper,isAlphaNum) import Data.List (intercalate) import Data.Map (Map) import GHC.Records (HasField(..)) import qualified Language.Haskell.TH as TH ------ Names ------ -- | Strings matching @[A-Z][a-zA-Z0-9_]@ newtype UpName = UpName String deriving (Show,Eq,Ord) -- | Introduction form for 'UpName' toUpName :: String -> Maybe UpName toUpName (c:cs) | isUpper c && all isAlphaNum cs = Just $ UpName (c:cs) toUpName _ = Nothing -- | Elimination form for 'UpName' fromUpName :: UpName -> String fromUpName (UpName str) = str -- | Strings matching @[a-z][a-zA-Z0-9_]@ newtype LowName = LowName String deriving (Show,Eq,Ord) -- | Introduction form for 'LowName' toLowName :: String -> Maybe LowName toLowName (c:cs) | isLower c && all isAlphaNum cs = Just $ LowName (c:cs) toLowName _ = Nothing -- | Elimination form for 'LowName' fromLowName :: LowName -> String fromLowName (LowName str) = str -- | Strings matching @[A-Z][a-zA-Z0-9_]\(:[A-Z][a-zA-Z0-9_])*@ data UpDotName = UpDotName [UpName] UpName deriving (Show,Eq,Ord) -- | Introduction form for 'UpDotName' toUpDotName :: String -> Maybe UpDotName toUpDotName = loop [] where loop acc inp = case break (== '.') inp of ([], _) -> Nothing -- no leading dot, double dot, or empty string allowed (_, ".") -> Nothing -- no trailing dot allowed (str, []) -> do -- no more dots endName <- toUpName str pure $ UpDotName (reverse acc) endName (str, _:rest) -> do qual <- toUpName str loop (qual:acc) rest -- | Elimination form for 'UpDotName' fromUpDotName :: UpDotName -> String fromUpDotName (UpDotName strs str) = intercalate "." $ fromUpName <$> (strs ++ [str]) -- | Conversion from 'UpName' unDotted :: UpName -> UpDotName unDotted x = UpDotName [] x -- | Get the parts of a dotted name that come before the last dot upDotQualifier :: UpDotName -> [UpName] upDotQualifier (UpDotName xs _) = xs -- | Get the last part of a dotted name upDotBase :: UpDotName -> UpName upDotBase (UpDotName _ x) = x -- | Create a dotted name identical to the first, but with the last part replaced upDotChBase :: UpDotName -> UpName -> UpDotName upDotChBase (UpDotName xs _) y = UpDotName xs y -- | Get the last part of a dotted name and it's prefix splitUpDotName :: UpDotName -> ([UpName], UpName) splitUpDotName (UpDotName xs x) = (xs, x) data Validate = Valid | Unvalidated data Name v n where SourceName :: { name_ :: n } -> Name 'Unvalidated n ValidName :: { base_ :: n, th :: TH.Name } -> Name 'Valid n deriving instance (Show n) => Show (Name v n) deriving instance (Eq n) => Eq (Name v n) deriving instance (Ord n) => Ord (Name v n) instance HasField "name" (Name v n) n where getField (SourceName n) = n getField (ValidName n _) = n ---------------------------- ------ Base Languages ------ ---------------------------- -- $ir -- -- The types 'Language', 'Nonterm', 'Production' mediate between Haskell and the theory of context-free grammars (CFGs). -- Each of them is an intermediate representation that can be seen from two perspectives: -- -- * What Haskell concept do they map to? -- * What CFG concept do they map to? -- -- We use something like usual, minimal definition of a CFG as a 4-tuple G = (V, Σ, R, S) where -- -- 1. V is a set of non-terminals (named by 'nontermName') -- 2. Σ is a set of terminals (which are just ordinary Haskell data types) -- 3. R is a relation in V × (V ∪ Σ)*. Members of this relation are called rewrite rules (and map to the arguments of a Haskell data constructor). -- 4. S is the start symbol, though it is not used by nanopass. -- | This attributes a name to a set of grammatical types. -- Languages can have different names in different contexts; -- importantly, they must not be qualified when defining, but they may need to be dotted when referring to a language from another module. data Language v n = Language { langName :: Name v n , langInfo :: LanguageInfo v } deriving(Show) -- | Seen as a Haskell entity, each 'Language' is a set of mutually-recursive types. -- Seen from the persepctive of a CFG, each of these types is a non-terminal used to define the abstract grammar of a language. -- -- See 'Language' for attributing a name to a set of these types. data LanguageInfo v = LanguageInfo { langParams :: ![Name v LowName] -- ^ type parameters; these apply to each of the 'Nonterm' types , nonterms :: !(Map UpName (Nonterm v)) -- TODO make this a list , originalProgram :: !(Maybe String) , baseDefdLang :: !(Maybe (Language 'Valid UpDotName)) } deriving(Show) -- | Seen as a Haskell entity, each 'Nonterm' is a single type with some number of constructors. -- Seem from the perspective of a CFG, each 'Nonterm' is… well, a non-terminal symbol. -- -- 'Nonterm's are the primary constituent of a 'Language'. data Nonterm v = Nonterm { nontermName :: !(Name v UpName) , productions :: !(Map UpName (Production v)) } deriving(Show) -- | Seen as a Haskell entity, each 'Production' maps to a constructor for a 'Nonterm' data type. -- Seen from the perspective of a CFG, each 'Production' maps to a single rewrite rule. -- -- 'Production's are the primary constituent of 'Nonterm's. data Production v = Production { prodName :: !(Name v UpName) , subterms :: ![TypeDesc v] } deriving(Show) -- | Seen as a Haskell entity, a 'TypeDesc' gives the type of an argument of a constructor ('Production'). -- Seen from the perspective of a CFG, each 'TypeDesc' is a symbol (terminal or non-terminal) on the right-hand side of a rewrite rule. -- -- 'TypeDesc's are the primary constituent of 'Production's. data TypeDesc v = RecursiveType UpName -- ^ a non-terminal symbol/recursive use of a 'Nonterm' type -- -- These types need not be applied to any arguments, the language 'langParams' get auromatically applied. | VarType (Name v LowName) -- ^ allows the use of 'langParams' as terminal symbols | CtorType (Name v UpDotName) [TypeDesc v] -- ^ allows the use of plain (not defined by nanopass) Haskell types, -- either as terminal symbols, or as combinators over non-terminal and terminal symbols | ListType (TypeDesc v) -- because otherwise, you'd have to always be saying `type List a = [a]` -- ^ nanopass has built-in knowledge of lists, so they are represented specially as opposed to with 'CtorType' | MaybeType (TypeDesc v) -- ^ nanopass has built-in knowledge of optionals, so they are represented specially as opposed to with 'CtorType' | NonEmptyType (TypeDesc v) -- ^ nanopass has built-in knowledge of non-empty lists, so they are represented specially as opposed to with 'CtorType' | UnitType -- ^ nanopass has built-in knowledge of the unit type, so they are represented specially as opposed to with 'CtorType' | TupleType (TypeDesc v) (TypeDesc v) [TypeDesc v] -- ^ nanopass has built-in knowledge of the tuple types, so they are represented specially as opposed to with 'CtorType' deriving(Eq,Show) --------------------------------- ------ Modifying Languages ------ --------------------------------- data LangMod = LangMod { baseLang :: UpDotName , newLang :: UpName , newParams :: [Name 'Unvalidated LowName] , nontermsEdit :: [NontermsEdit] , originalModProgram :: Maybe String } deriving(Show) data NontermsEdit = AddNonterm (Nonterm 'Unvalidated) | ModNonterm UpName [ProductionsEdit] | DelNonterm UpName deriving(Show) data ProductionsEdit = AddProd (Production 'Unvalidated) | DelProd UpName deriving(Show) -------------------- ------ Passes ------ -------------------- data Pass = Pass { sourceLang :: Name 'Unvalidated UpDotName , targetLang :: Name 'Unvalidated UpDotName } deriving (Show)