src/Text/PhonotacticLearner/PhonotacticConstraints/FileFormats.hs

{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, ParallelListComp #-}

{-|
Module: Text.PhonotacticLearner.PhonotacticConstraints.FileFormats
Description: Generation of candidate constraint sets.
Copyright: © 2016-2017 George Steel and Peter Jurgec
License: GPL-2+
Maintainer: george.steel@gmail.com

Functions for saving and loading lexicons and 'ClassGlob' constraint grammars in standard formats.
-}

module Text.PhonotacticLearner.PhonotacticConstraints.FileFormats where

import Control.Monad
import Data.Traversable
import Data.Monoid
import Data.Foldable
import Text.Read
import Data.List
import Data.Maybe
import qualified Data.Text as T
import qualified Data.Set as S
import qualified Data.Map as M
import Data.Array.IArray
import Numeric
import Control.DeepSeq

import Text.PhonotacticLearner.PhonotacticConstraints
import Text.PhonotacticLearner.MaxentGrammar
import Text.PhonotacticLearner.Util.Ring


-- | Given a set of possible segments and a string, break a string into segments.
-- Uses the rules in Fiero orthography (a phonetic writing system using ASCII characters) where the longest possible match is always taken and apostrophes are used as a digraph break.
segmentFiero :: S.Set String -- ^ All possible segments
             -> String -- ^ Raw text
             -> [String] -- ^ Segmented text
--segmentFiero [] = error "Empty segment list."
segmentFiero allsegs = go msl where
    msl = maximum . S.map length $ allsegs
    go _ [] = []
    go _ ('\'':xs) = go msl xs
    go 0 (x:xs) = go msl xs
    go len xs | S.member seg allsegs = seg : go msl rest
              | otherwise = go (len-1) xs
        where (seg,rest) = splitAt len xs

-- | Joins segments together using Fiero rules. Inserts apostrophes where necerssary.
joinFiero :: S.Set String -- ^ All possible segments
          -> [String] -- ^ Segmented text
          -> String -- ^ Raw text
joinFiero allsegs = go where
    msl = maximum . S.map length $ allsegs
    go [] = []
    go [x] = x
    go (x:xs@(y:_)) = let z = x++y
                      in  if any (\s -> isPrefixOf s z && not (isPrefixOf s x)) allsegs
                          then x ++ ('\'' : go xs)
                          else x ++ go xs

-- | Structure for reperesenting lexicon entries
data LexRow = LexRow [String] Int

-- | Parse a lexicon from a file. Segmentation of a word uses fiero rules (which will also decode space-separated segments and single-character segments).
-- Words may optionally be followed by a tab character and an integer indicating frequency (1 by default).
parseWordlist :: S.Set String -> T.Text -> [LexRow]
parseWordlist segs rawlist = do
    line <- T.lines rawlist
    let (rawword : rest) = T.split (== '\t') line
        w = segmentFiero segs (T.unpack rawword)
        fr = fromMaybe 1 $ do
            [f] <- return (rest >>= T.words)
            readMaybe (T.unpack f)
    guard (w /= [])
    return $ LexRow w fr

-- | Collate a list of words and frequencies from raw phonetic text.
collateWordlist :: S.Set String -> T.Text -> [LexRow]
collateWordlist segs rawtext = fmap (uncurry LexRow) . M.assocs . M.fromListWith (+) $ do
    rawword <- T.words rawtext
    let w = segmentFiero segs (T.unpack rawword)
    guard (w /= [])
    return (w, 1)

-- | Serializes a list of words and frequerncies to a string for decoding with 'parseWordlist'. Connects segments using Fiero rules.
serWordlist :: S.Set String -> [LexRow] -> T.Text
serWordlist segs = T.unlines . mapMaybe showRow where
    showRow (LexRow _ n) | n <= 0 = Nothing
    showRow (LexRow w 1) = Just . T.pack $ joinFiero segs w
    showRow (LexRow w n) = Just . T.pack $ joinFiero segs w ++ "\t" ++ show n

-- | Serializes a list of words and frequerncies to a string for decoding with 'parseWordlist'. Puts spaces between segments.
serWordlistSpaced :: [LexRow] -> T.Text
serWordlistSpaced = T.unlines . mapMaybe showRow where
    showRow (LexRow _ n) | n <= 0 = Nothing
    showRow (LexRow w 1) = Just . T.pack $ unwords w
    showRow (LexRow w n) = Just . T.pack $ unwords w ++ "\t" ++ show n

-- | Reperesentation of a 'ClassGlob' grammar.
data PhonoGrammar = PhonoGrammar {
    lengthDist :: (Array Length Int), -- ^ Distribution of word lengths
    constraintSet :: [ClassGlob], -- ^ Set of constraints
    weightSet :: Vec -- ^ Set of weights in same order as constraints
} deriving (Eq, Show)

instance NFData PhonoGrammar where
    rnf (PhonoGrammar lendist grammar weights) = rnf lendist `seq` rnf grammar `seq` rnf weights

-- | Parse a grammar from a file. Blank lines ans lines begining with # are ignored.
-- The first regular line must contain a list of (Length,Int) pairs and subsequent lines must contain a weight followed by a ClassGlob.
parseGrammar :: T.Text -> Maybe PhonoGrammar
parseGrammar rawgrammar = do
    let noncomment l = not (T.null l) && (T.head l /= '#')
    (fline:glines) <- return $ filter noncomment (T.lines rawgrammar)
    lenlist <- readMaybe (T.unpack fline)
    let maxlen = maximum (fmap fst lenlist)
        lenarr = accumArray (+) 0 (1,maxlen) (filter ((> 0) . fst) lenlist)
        readline l = do
            let (wt, ct') = T.breakOn " " l
            w::Double <- readMaybe (T.unpack wt)
            (' ', ct) <- T.uncons ct'
            c::ClassGlob <- readMaybe (T.unpack ct)
            return (c,w)
    cs <- traverse readline (reverse glines)
    return $ PhonoGrammar lenarr (fmap fst cs) (vec (fmap snd cs))

-- | Serialize a grammar without length distribution
serGrammarRules :: [ClassGlob] -> Vec -> T.Text
serGrammarRules grammar weights =
    (T.unlines . reverse) [T.pack $ showFFloat (Just 3) w "  " ++ show c | c <- grammar | w <- coords weights]

-- | Serialize a grammar including length distribution
serGrammar :: PhonoGrammar -> T.Text
serGrammar (PhonoGrammar lendist grammar weights) =
    "# Length Distribution:\n" <> (T.pack . show . assocs $ lendist) <> "\n\n# Constraints:\n" <> serGrammarRules grammar weights