-- | -- Module : Data.MultextEastMsd -- Copyright : (c) 2009 Jan Snajder -- License : BSD-3 (see the LICENSE file) -- -- Maintainer : Jan Snajder -- Stability : experimental -- Portability : portable -- -- Implementation of the MULTEXT-East morphosyntactic descriptors. -- -- MULTEXT-East encodes values of morphosyntatic attributes in a single string, -- using positional encoding. Each attribute is represented by a single letter -- at a predefined position, while non-applicable attributes are represented by -- hyphens. For example, @Ncmsg@ denotes a common noun (@Nc@) in masculine -- singular genitive (@msg@) case. For details, refer to . -- -- Currently, only MULTEXT-East Version 3 is supported. MULTEXT-East Version 3 -- covers morphosyntactic descriptions for Bulgarian, Croatian, Czech, English, -- Estonian, Hungarian, Lithuanian, Macedonian, Persian, Polish, Resian, -- Romanian, Russian, Serbian, Slovak, Slovene, and Ukrainian. For details, -- refer to . -- -- Usage example: -- -- >>> let Just d1 = fromString "Ncmsg" -- >>> pos d1 -- Noun -- >>> features d1 -- [NType Common,Gender Masculine,Number Singular,Case Genitive] -- >>> let d2 = unset NType d1 -- >>> toString d2 -- "N-msg" -- >>> d1 == d2 -- False -- >>> d1 =~= d2 -- True ------------------------------------------------------------------------------- module Data.MultextEastMsd ( -- * Datatype constructor Msd, msd, PoS (..), -- * Getting and setting values Attribute, get, set, unset, check, features, pos, -- * Wildcard matching (=~=), -- * From/to string conversion toString, fromString, validString, -- * Morphosyntactic features Feature (..), AType (..), Aspect (..), Case (..), Class (..), CoordType (..), CType (..), Definiteness (..), Degree (..), Formation (..), Gender (..), MForm (..), MType (..), NType (..), Number (..), Person (..), SType (..), SubType (..), Tense (..), VForm (..), Voice (..), VType (..)) where import Data.List (find,sort,delete,findIndex,deleteBy,intersectBy,nubBy) import Data.Maybe (catMaybes,isJust) import Control.Monad (liftM) data Msd = Msd PoS [Feature] deriving (Show) instance Eq Msd where Msd p1 fs1 == Msd p2 fs2 = p1==p2 && sort fs1==sort fs2 data PoS = Noun | Verb | Adjective | Adposition | Conjunction | Numeral deriving (Eq,Enum,Show) data Feature = Animate Bool | AType AType | Aspect Aspect | Case Case | Class Class | Clitic Bool | CliticS Bool | CoordType CoordType | Courtesy Bool | CType CType | Definiteness Definiteness | Degree Degree | Formation Formation | Gender Gender | MForm MForm | MType MType | Negative Bool | NType NType | Number Number | OwnedNumber Number | OwnerNumber Number | OwnerPerson Person | Person Person | SType SType | SubType SubType | Tense Tense | VForm VForm | Voice Voice | VType VType deriving (Show,Ord,Eq) data NType = Common | Proper deriving (Eq,Enum,Ord,Show) data Gender = Masculine | Feminine | Neuter deriving (Eq,Enum,Ord,Show) data Number = Singular | Plural | Dual | Count | Collective deriving (Eq,Enum,Ord,Show) data Case = Nominative | Genitive | Dative | Accusative | Vocative | Locative | Instrumental | Direct | Oblique | Partitive | Illative | Inessive | Elative | Allative | Adessive | Ablative | Translative | Terminative | Essive | Abessive | Komitative | Aditive | Temporalis | Causalis | Sublative | Delative | Sociative | Factive | Superessive | Distributive | EssiveFormal | Multiplicative deriving (Eq,Enum,Ord,Show) data Definiteness = No | Yes | ShortArt | FullArt | OneSTwoS deriving (Eq,Enum,Ord,Show) data VType = Main | Auxiliary | Modal | Copula | Base deriving (Eq,Enum,Ord,Show) data VForm = Indicative | Subjunctive | Imperative | Conditional | Infinitive | Participle | Gerund | Supine | Transgressive | Quotative deriving (Eq,Enum,Ord,Show) data Tense = Present | Imperfect | Future | Past | Pluperfect | Aorist deriving (Eq,Enum,Ord,Show) data Person = First | Second | Third deriving (Eq,Enum,Ord,Show) data Voice = Active | Passive deriving (Eq,Enum,Ord,Show) data Aspect = Progressive | Perfective deriving (Eq,Enum,Ord,Show) data AType = Qualificative | Indefinite | Possessive | OrdinalT deriving (Eq,Enum,Ord,Show) data Degree = Positive | Comparative | Superlative | ElativeD | Diminutive deriving (Eq,Enum,Ord,Show) data Formation = Nominal | Simple | Compound deriving (Eq,Enum,Ord,Show) data MType = Cardinal | Ordinal | Fractal | Multiple | Collect | Special deriving (Eq,Enum,Ord,Show) data MForm = Digit | Roman | Letter | Both | MForm_ | Approx deriving (Eq,Enum,Ord,Show) data Class = Definite1 | Definite2 | Definite34 | Definite | Demonstrative | IndefiniteC | Interrogative | Relative deriving (Eq,Enum,Ord,Show) data SType = Preposition | Postposition deriving (Eq,Enum,Ord,Show) data CType = Coordinating | Subordinating | Portmanteau deriving (Eq,Enum,Ord,Show) data CoordType = CTSimple | CTRepetit | CTCorrelat | CTSentence | CTWords | Initial | NonInitial deriving (Eq,Enum,Ord,Show) data SubType = STNegative | STPositive deriving (Eq,Enum,Ord,Show) -- A helper function to identify constructors in a list eq (Animate _) (Animate _) = True eq (AType _) (AType _) = True eq (Aspect _) (Aspect _) = True eq (Case _) (Case _) = True eq (Class _) (Class _) = True eq (Clitic _) (Clitic _) = True eq (CoordType _) (CoordType _) = True eq (Courtesy _) (Courtesy _) = True eq (CType _) (CType _) = True eq (Definiteness _) (Definiteness _) = True eq (Degree _) (Degree _) = True eq (Formation _) (Formation _) = True eq (Gender _) (Gender _) = True eq (MForm _) (MForm _) = True eq (MType _) (MType _) = True eq (Negative _) (Negative _) = True eq (NType _) (NType _) = True eq (Number _) (Number _) = True eq (OwnedNumber _) (OwnedNumber _) = True eq (OwnerNumber _) (OwnerNumber _) = True eq (OwnerPerson _) (OwnerPerson _) = True eq (Person _) (Person _) = True eq (SType _) (SType _) = True eq (SubType _) (SubType _) = True eq (Tense _) (Tense _) = True eq (VForm _) (VForm _) = True eq (Voice _) (Voice _) = True eq (VType _) (VType _) = True eq _ _ = False -- | Constructs a morphosyntactic descriptor (an abstract @Msd@ datatype) of -- a specified part-of-speech and with specified features (attribute-value -- pairs). Duplicated attributes and attributes not applicable to the given -- part-of-speech are ignored. msd :: PoS -> [Feature] -> Msd msd p fs = set fs $ Msd p [] class MsdPattern a where -- | A wildcard-matching operator between two Msd patterns. -- Relation @ msd1 =~= msd2 @ holds iff @msd1@ and @msd2@ are of the same -- part-of-speech and the attributes common to @msd1@ -- and @msd2@ have identical values. The attributes of @msd1@ that are not -- set in @msd2@ (and conversely) are ignored in the comparison. -- In MULTEXT-East notation, this is tantamount to -- having character code @-@ (hyphen) act as a wildcard. (=~=) :: a -> a -> Bool infix 4 =~= instance MsdPattern Msd where Msd p1 fs1 =~= Msd p2 fs2 = Msd p1 (intersectBy eq fs1 fs2) == Msd p2 (intersectBy eq fs2 fs1) instance MsdPattern a => MsdPattern (Maybe a) where (Just x) =~= (Just y) = x =~= y _ =~= _ = False instance MsdPattern a => MsdPattern [a] where xs =~= ys = and (zipWith (=~=) xs ys) && length xs == length ys type Attribute a = a -> Feature x_ :: (Enum a) => a x_ = toEnum 0 -- | Gets the value of a specified attribute. get :: (Enum a) => Attribute a -> Msd -> Maybe Feature get a (Msd _ fs) = find (`eq` a x_) fs -- | Sets the specified features (attribute-value pairs). Duplicated -- attributes and attributes not applicable to the given part-of-speech -- are ignored. set :: [Feature] -> Msd -> Msd set fs2 (Msd p fs1) = Msd p $ nubBy eq fs3 where fs3 = intersectBy eq fs2 (posFeatures p) ++ fs1 -- | Unsets the value of a specified attribute. unset :: (Enum a) => Attribute a -> Msd -> Msd unset a (Msd p fs) = Msd p $ deleteBy eq (a x_) fs -- | Checks whether the attributes are set to the specified values. check :: [Feature] -> Msd -> Bool check fs2 (Msd _ fs1) = all (\av -> isJust . find (==av) $ fs1) fs2 -- | Returns the features (attribute-value pairs) of a @Msd@. features :: Msd -> [Feature] features (Msd _ fs) = fs -- | Returns a part-of-speech ('PoS' value) of an @Msd@. pos :: Msd -> PoS pos (Msd pos _) = pos posCodes = "NVASCM" decodeWith :: (Enum a) => String -> Char -> Maybe a decodeWith cs c = toEnum `liftM` findIndex (==c) cs encodeWith :: (Enum a) => String -> a -> Char encodeWith cs x = cs !! fromEnum x -- | Converts an @Msd@ datatype into a MULTEXT-East string notation. toString :: Msd -> String toString (Msd p fs) = trim $ c : map (\av -> enc $ get av fs) (posFeatures p) where trim = reverse . dropWhile (=='-') . reverse enc Nothing = '-' enc (Just x) = encode x c = encodeWith posCodes p get av = find (`eq` av) -- | Converts a MULTEXT-East string notation into an @Msd@ datatype. -- Returns @Nothing@ if string is not a valid MULTEXT-East string. fromString :: String -> Maybe Msd fromString (c:cs) = do p <- decodeWith posCodes c let fs = zipWith decode (posFeatures p) cs if (all isJust fs) then Just $ Msd p (catMaybes . catMaybes $ fs) else Nothing -- | Checks whether the string conforms to the MULTEXT-East specification. -- Defined as: -- @ validString = isJust . fromString @ validString :: String -> Bool validString = isJust . fromString posFeatures Noun = [NType x_,Gender x_,Number x_,Case x_,Clitic x_,Definiteness x_, Animate x_,OwnerNumber x_,OwnerPerson x_,OwnedNumber x_] posFeatures Verb = [VType x_,VForm x_,Tense x_,Person x_,Number x_,Gender x_,Voice x_, Negative x_,Definiteness x_,Clitic x_,Case x_,Animate x_,CliticS x_, Aspect x_,Courtesy x_] posFeatures Adjective = [AType x_,Degree x_,Gender x_,Number x_,Case x_,Definiteness x_, Clitic x_,Animate x_,Formation x_,OwnerNumber x_,OwnerPerson x_, OwnedNumber x_] posFeatures Adposition = [SType x_,Formation x_,Case x_,Clitic x_] posFeatures Conjunction = [CType x_,Formation x_,CoordType x_,SubType x_,Clitic x_,Number x_, Person x_] posFeatures Numeral = [MType x_,Gender x_,Number x_,Case x_,MForm x_,Definiteness x_, Clitic x_,Class x_,Animate x_,OwnerNumber x_,OwnerPerson x_, OwnedNumber x_] -- Helper functions to unwrap the values encode (NType v) = enc NType v encode (Gender v) = enc Gender v encode (Number v) = enc Number v encode (Case v) = enc Case v encode (Definiteness v) = enc Definiteness v encode (Clitic v) = enc Clitic v encode (CliticS v) = enc CliticS v encode (VType v) = enc VType v encode (VForm v) = enc VForm v encode (Tense v) = enc Tense v encode (Person v) = enc Person v encode (AType v) = enc AType v encode (Voice v) = enc Voice v encode (Aspect v) = enc Aspect v encode (Degree v) = enc Degree v encode (Formation v) = enc Formation v encode (MType v) = enc MType v encode (MForm v) = enc MForm v encode (Class v) = enc Class v encode (SType v) = enc SType v encode (CType v) = enc CType v encode (CoordType v) = enc CoordType v encode (SubType v) = enc SubType v encode (Animate v) = enc Animate v encode (OwnerNumber v) = enc OwnerNumber v encode (OwnerPerson v) = enc OwnerPerson v encode (OwnedNumber v) = enc OwnedNumber v encode (Negative v) = enc Negative v encode (Courtesy v) = enc Courtesy v decode (NType _) = dec NType decode (Gender _) = dec Gender decode (Number _) = dec Number decode (Case _) = dec Case decode (Definiteness _) = dec Definiteness decode (Clitic _) = dec Clitic decode (CliticS _) = dec CliticS decode (VType _) = dec VType decode (VForm _) = dec VForm decode (Tense _) = dec Tense decode (Person _) = dec Person decode (AType _) = dec AType decode (Voice _) = dec Voice decode (Aspect _) = dec Aspect decode (Degree _) = dec Degree decode (Formation _) = dec Formation decode (MType _) = dec MType decode (MForm _) = dec MForm decode (Class _) = dec Class decode (SType _) = dec SType decode (CType _) = dec CType decode (CoordType _) = dec CoordType decode (SubType _) = dec SubType decode (Animate _) = dec Animate decode (OwnerNumber _) = dec OwnerNumber decode (OwnerPerson _) = dec OwnerPerson decode (OwnedNumber _) = dec OwnedNumber decode (Negative _) = dec Negative decode (Courtesy _) = dec Courtesy enc :: (Enum a) => Attribute a -> a -> Char enc a v = encodeWith (codes $ a x_) v dec :: (Enum a) => Attribute a -> Char -> Maybe (Maybe Feature) dec _ '-' = Just Nothing dec a c = decodeWith (codes $ a x_) c >>= return . Just . a -- MULTEXT-East Version 3 codes codes :: Feature -> String codes (NType x) = "cp" codes (Gender x_) = "mfn" codes (Number x_) = "spdtl" codes (Case x_) = "ngdavliro1x2et3b49w5k7mcshqypuf6" codes (Definiteness x_) = "nysf2" codes (Clitic x_) = "ny" codes (CliticS x_) = "ny" codes (VType x_) = "maoc" codes (VForm x_) = "ismcnpgutq" codes (Tense x_) = "pifsla" codes (Person x_) = "123" codes (AType x_) = "fiso" codes (Voice x_) = "ap" codes (Aspect x_) = "pe" codes (Degree x_) = "pcsed" codes (Formation x_) = "nsc" codes (MType x_) = "cofmls" codes (MForm x_) = "drlbma" codes (Class x_) = "123fdiqr" codes (SType x_) = "pt" codes (CType x_) = "csr" codes (CoordType x_) = "srcpwin" codes (SubType x_) = "zp" codes (Animate x_) = "ny" codes (Courtesy x_) = "ny" codes (Negative x_) = "ny" codes (OwnerNumber x_) = "spdtl" codes (OwnerPerson x_) = "123" codes (OwnedNumber x_) = "spdtl"