module Text.PhonotacticLearner(
generateGrammarIO,
segmentFiero, joinFiero,
) where
import Text.PhonotacticLearner.Util.Ring
import Text.PhonotacticLearner.Util.Probability
import Text.PhonotacticLearner.DFST
import Text.PhonotacticLearner.MaxentGrammar
import System.Random
import Control.Monad.State
import Control.DeepSeq
import Data.Ix
import Numeric
import Data.IORef
import Data.List
import System.IO
import Control.Exception
stopsigint :: AsyncException -> IO ()
stopsigint e = case e of
UserInterrupt -> do
putStrLn "\n\nInterrupted!"
return ()
_ -> throw e
generateGrammarIO :: forall clabel sigma . (Show clabel, Ix sigma, NFData sigma, NFData clabel, Eq clabel)
=> Int
-> [Double]
-> [(clabel, ShortDFST sigma)]
-> [([sigma],Int)]
-> IO ([clabel], MulticountDFST sigma, Vec)
generateGrammarIO samplesize thresholds candidates wfs = do
let lwfs = sortLexicon wfs
cbound = psegBounds . snd . head $ candidates
blankdfa = nildfa cbound
lendist = lengthCdf lwfs
pwfs = packMultiText cbound (wordFreqs lwfs)
hashctr :: IORef Int <- newIORef 0
let mark500 = do
c <- readIORef hashctr
when (c `mod` 500 == 0) $ do
hPutStr stderr "#"
hFlush stderr
modifyIORef' hashctr (+1)
currentGrammar :: IORef ([clabel], MulticountDFST sigma, Vec) <- newIORef ([],pruneAndPack blankdfa ,zero)
let genSalad :: IO (PackedText sigma)
genSalad = do
(_,dfa,weights) <- readIORef currentGrammar
salad' <- getStdRandom . runState $ sampleWordSalad (fmap (maxentProb weights) (unpackDFA dfa)) lendist samplesize
return . packMultiText cbound . wordFreqs . sortLexicon . fmap (\x -> (x,1)) $ salad'
currentSalad <- newIORef undefined
handle stopsigint $ do
forM_ thresholds $ \accuracy -> do
putStrLn $ "\n\n\nStarting pass with threshold " ++ showFFloat (Just 3) accuracy ""
writeIORef currentSalad =<< genSalad
forM_ candidates $ \(cl,cdfa) -> do
mark500
(grammar, dfa, weights) <- readIORef currentGrammar
salad <- readIORef currentSalad
let o = fromIntegral $ transducePackedShort cdfa pwfs
o' = fromIntegral $ transducePackedShort cdfa salad
e = o' * fromIntegral (totalWords lwfs) / fromIntegral samplesize
score = upperConfidenceOE o e
when (score < accuracy && cl `notElem` grammar) $ do
hPutStrLn stderr ""
putStrLn $ "\nSelected Constraint " ++ show cl ++ " (score=" ++ showFFloat (Just 4) score [] ++ ", o=" ++ showFFloat (Just 1) o [] ++ ", e=" ++ showFFloat (Just 1) e [] ++ ")."
let newgrammar = cl:grammar
newdfa :: MulticountDFST sigma = pruneAndPack (rawIntersection consMC (unpackDFA cdfa) (unpackDFA dfa))
putStrLn $ "New grammar has " ++ show (length newgrammar) ++ " constraints and " ++ show (numStates newdfa) ++ " states."
let oldweights = consVec 0 weights
newweights <- evaluate . force $ llpOptimizeWeights (lengthFreqs lwfs) pwfs newdfa oldweights
hPutStrLn stderr ""
putStrLn $ "Recalculated weights: " ++ showFVec (Just 2) newweights
atomicWriteIORef currentGrammar . force $ (newgrammar, newdfa, newweights)
writeIORef currentSalad =<< genSalad
putStrLn "\n\n\nAll Pases Complete."
readIORef currentGrammar
segmentFiero :: [String]
-> String
-> [String]
segmentFiero [] = error "Empty segment list."
segmentFiero allsegs = go msl where
msl = maximum . fmap length $ allsegs
go _ [] = []
go _ ('\'':xs) = go msl xs
go 0 (x:xs) = go msl xs
go len xs | seg `elem` allsegs = seg : go msl rest
| otherwise = go (len1) xs
where (seg,rest) = splitAt len xs
joinFiero :: [String]
-> [String]
-> String
joinFiero allsegs = go where
msl = maximum . fmap length $ allsegs
go [] = []
go [x] = x
go (x:xs@(y:_)) = let z = x++y
in if any (\s -> isPrefixOf s z && not (isPrefixOf s x)) allsegs
then x ++ ('\'' : go xs)
else x ++ go xs