{-# LANGUAGE ScopedTypeVariables, ExplicitForAll, MultiParamTypeClasses, GeneralizedNewtypeDeriving, OverloadedStrings #-} {-| Module: Text.PhonotacticLearner.PhonotacticConstraints Description: Description of phonological features and consrtraints. Copyright: © 2016-2017 George Steel and Peter Jurgec License: GPL-2+ Maintainer: george.steel@gmail.com Data structures and functions for working with phonological features and natural classes. Feature tables are designed work with strings reperesented as lists of 'SegRef' indices into their internal segment lists, enabling processing with fast array lookups even with non-contiguous sets of segments. -} module Text.PhonotacticLearner.PhonotacticConstraints ( -- * Phonological Features FeatureState(..), SegRef, FeatureTable(..), srBounds, ftlook, segsToRefs, refsToSegs, csvToFeatureTable, -- * Natural Classes NaturalClass(..), classToSeglist, GlobReps(..), ClassGlob(..), classesToLists, cgMatchCounter ) where import Text.PhonotacticLearner.Util.Ring import Text.PhonotacticLearner.Util.Probability import Text.PhonotacticLearner.MaxentGrammar import Text.PhonotacticLearner.DFST import Text.CSV import Data.Array.IArray import Data.Maybe import Data.Tuple import Data.List import Data.Char import Data.Monoid import Control.Monad import Control.Applicative hiding (many, some) import Control.DeepSeq import Control.Arrow((***),(&&&),first,second) import qualified Data.Text as T import qualified Data.Map.Lazy as M import Text.ParserCombinators.ReadP import Text.Read(Read(..),lift,parens) -- | Enumeration for feature states (can be +,-,0) data FeatureState = FOff | FPlus | FMinus deriving (Enum, Eq, Ord, Read, Show) instance NFData FeatureState where rnf fs = fs `seq` () -- | Indices for segment lookup table newtype SegRef = Seg Int deriving (Eq, Ord, Read, Show, Ix, NFData) -- | Type for phonological feature table. Segments and features are referred to by indices so this structure includes lookup tables for those. data FeatureTable sigma = FeatureTable { featTable :: Array (SegRef,Int) FeatureState , featNames :: Array Int T.Text , segNames :: Array SegRef sigma , featLookup :: M.Map T.Text Int , segLookup :: M.Map sigma SegRef } deriving (Show) -- | Bounds for segment references srBounds :: FeatureTable sigma -> (SegRef, SegRef) srBounds ft = bounds (segNames ft) {-# INLINE srBounds #-} -- | Shortcut for feature table array access ftlook :: FeatureTable sigma -> SegRef -> Int -> FeatureState ftlook ft sr fi = featTable ft ! (sr,fi) {-# INLINE ftlook #-} -- | Convert a string of raw segments to a string of 'SegRef's segsToRefs :: (Ord sigma) => FeatureTable sigma -> [sigma] -> [SegRef] segsToRefs ft = mapMaybe (\x -> M.lookup x (segLookup ft)) -- | Convert a string of 'SegRef's back to segments refsToSegs :: FeatureTable sigma -> [SegRef] -> [sigma] refsToSegs ft = fmap (segNames ft !) . filter (inRange (srBounds ft)) -- | List of all features in table allFeatures :: FeatureTable sigma -> [T.Text] allFeatures ft = elems (featNames ft) lstrip :: String -> String lstrip (' ':xs) = lstrip xs lstrip ('\t':xs) = lstrip xs lstrip xs = xs {- | Parse feature table from CSV. To use a feature table other than the default IPA one, you may define it in CSV format (RFC 4180). The segment names are defined by the first row (they may be any strings as long as they are all distinct, i.e. no duplicate names) and the feature names are defined by the first column (they are not hard-coded). Data cells should contain @+@, @-@, or @0@ for binary features and @+@ or @0@ for privative features (where we do not want a minus set that could form classes). As a simple example, consider the following CSV file, defining three segments (a, n, and t), and two features (vowel and nasal). > ,a,n,t > vowel,+,-,- > nasal,0,+,- If a row contains a different number of cells (separated by commas) than the header line, is rejected as invalid and does not define a feature (and will not be dispayed in the formatted feature table). If the CSV which is entered has duplicate segment names, no segments, or no valid features, the entire table is rejected (indicated by a red border around the text area, green is normal) and the last valid table is used and displayed. -} csvToFeatureTable :: (Ord sigma) => (String -> sigma) -> String -> Maybe (FeatureTable sigma) csvToFeatureTable readSeg rawcsv = do Right parsedcsv <- return (parseCSV "" rawcsv) ((_:segcells) : rawfeatrecs) <- return (fmap (fmap lstrip) parsedcsv) let numsegs = length segcells guard (numsegs > 0) let seglist = listArray (Seg 1, Seg numsegs) (fmap readSeg segcells) featrecs = filter (\xs -> length xs == numsegs + 1) rawfeatrecs numfeats = length featrecs guard (numfeats > 0) let featlist = listArray (1, numfeats) (fmap (T.pack . head) featrecs) ft = array ((Seg 1,1), (Seg numsegs, numfeats)) $ do (featidx, _:featdata) <- zip [1..] featrecs (segidx, segfield) <- zip (fmap Seg [1..]) featdata let fstate = case segfield of "+" -> FPlus "✓" -> FPlus "√" -> FPlus "-" -> FMinus "−" -> FMinus _ -> FOff return ((segidx, featidx), fstate) let segmap = M.fromList (fmap swap (assocs seglist)) featmap = M.fromList (fmap swap (assocs featlist)) guard (M.size segmap == rangeSize (bounds seglist)) return (FeatureTable ft featlist seglist featmap segmap) -------------------------------------------------------------------------------- -- | Representation of a natural class as a list of features and their states. Can ahso handle inverted classes. data NaturalClass = NClass { isInverted :: Bool , featureList :: [(FeatureState, T.Text)] } deriving (Eq, Ord) instance NFData NaturalClass where rnf c@(NClass b fs) = b `seq` rnf fs -- | Uses SPE format instance Show NaturalClass where show (NClass isNegated feats) = (if isNegated then "[¬ " else "[") ++ unwords (fmap showfeat feats) ++ "]" where showfeat (fs, fn) = (case fs of FPlus -> "+" FMinus -> "−" FOff -> "0") ++ T.unpack fn isPrintNelem :: String -> Char -> Bool isPrintNelem s c = isPrint c && not (isSpace c) && c `notElem` s featP :: ReadP (FeatureState, T.Text) featP = do skipSpaces state <- choice [FPlus <$ char '+', FPlus <$ char '✓', FPlus <$ char '√', FMinus <$ char '-', FMinus <$ char '−', FOff <$ char '0'] <++ return FPlus feat <- (:) <$> satisfy (isPrintNelem "0123456789+-√−✓(){}[]¬^") <*> munch (isPrintNelem "(){}[]¬^") return (state, T.pack feat) classP = between (char '[') (char ']') (NClass <$> ((True <$ char '¬') +++ (True <$ char '^') +++ return False) <*> many featP) instance Read NaturalClass where readPrec = parens (lift (skipSpaces >> classP)) xor :: Bool -> Bool -> Bool xor False p = p xor True p = not p -- | Convert a class to a 'SegSet' classToSeglist :: FeatureTable sigma -> NaturalClass -> SegSet SegRef classToSeglist ft (NClass isNegated cls) = force $ fnArray (srBounds ft) (\c -> isNegated `xor` and [ftlook ft c fi == fs | (fs,fi) <- icls]) where icls = do (s,fn) <- cls Just fi <- return (M.lookup fn (featLookup ft)) return (s,fi) -- | Globs using 'NaturalClass' instead of 'SegSet' data ClassGlob = ClassGlob Bool Bool [(GlobReps, NaturalClass)] deriving (Eq, Ord) instance NFData ClassGlob where rnf (ClassGlob isinit isfin gparts) = isinit `seq` isfin `seq` rnf gparts -- | Uses SPE format instance Show ClassGlob where show (ClassGlob isinit isfin parts) = (guard isinit >> "#") ++ (showGP =<< parts) ++ (guard isfin >> "#") where showGP (GStar, NClass False []) = "…" showGP (rep, NClass False [(FPlus,"syllabic")]) = "V" ++ suf rep showGP (rep, NClass False [(FMinus,"syllabic")]) = "C" ++ suf rep showGP (rep, c) = show c ++ suf rep suf GSingle = "" suf GPlus = "₁" suf GStar = "₀" globRepsP :: ReadP GlobReps globRepsP = choice [GPlus <$ char '+', GPlus <$ char '₁', GStar <$ char '*', GStar <$ char '₀', return GSingle] classGlobP :: ReadP ClassGlob classGlobP = do isinit <- (True <$ char '#') +++ return False gparts <- many1 $ ((GStar, NClass False []) <$ char '…') +++ do cls <- classP +++ (NClass False [(FPlus,"syllabic")] <$ char 'V') +++ (NClass False [(FMinus,"syllabic")] <$ char 'C') rep <- globRepsP return (rep,cls) isfin <- (True <$ char '#') +++ return False return (ClassGlob isinit isfin gparts) instance Read ClassGlob where readPrec = parens (lift (skipSpaces >> classGlobP)) -- | Convert to a 'ListGlob' classesToLists :: FeatureTable sigma -> ClassGlob -> ListGlob SegRef classesToLists ft (ClassGlob isinit isfin gparts) = ListGlob isinit isfin (fmap (second (classToSeglist ft)) gparts) -- | Create a DFST which counts the matches of the glob. cgMatchCounter :: FeatureTable sigma -> ClassGlob -> ShortDFST SegRef cgMatchCounter ft = matchCounter . classesToLists ft