-- | -- -- TODO normalization of characters! (though it might be better to do this not in the importer, but -- a normalization function) module NLP.Scoring.Unigram.Import where import Control.Applicative import Control.Arrow (first, (***)) import Control.Lens import Control.Monad import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.State.Class import Control.Monad.Trans.State.Strict hiding (gets) import Control.Monad.Except import Data.ByteString (ByteString) import Data.Char import Data.HashMap.Strict (fromList, HashMap) import Data.HashSet (HashSet) import Data.Maybe import Data.Monoid import Data.String (IsString) import Data.Text (Text) import Debug.Trace import qualified Data.ByteString as BS import qualified Data.ByteString.UTF8 as UTF8 import qualified Data.HashMap.Strict as HM import qualified Data.HashSet as HS import qualified Data.Sequence as S import qualified Data.Text as T import qualified Data.Text.IO as T import System.Exit (exitFailure) import System.IO (stdout) import Text.Parser.LookAhead import Text.Parser.Token.Style import Text.PrettyPrint.ANSI.Leijen (displayIO, renderPretty, linebreak, displayS) import Text.Trifecta as TT import Text.Trifecta.Delta (Delta(..)) import Data.ByteString.Interned import NLP.Scoring.Unigram data Env = Env { _warnings ∷ !(S.Seq Text) , _defaults ∷ !(HashMap Text Double) , _charGroups ∷ !(HashMap Text (HashSet Text)) , _matchScores ∷ !(HashMap (Text,Text) Double) , _ignoredScoresFstK ∷ !(HashMap Text Double) , _ignoredScoresSndL ∷ !(HashMap Text Double) } deriving (Show) makeLenses ''Env defaultEnv = Env { _warnings = S.empty , _defaults = HM.empty , _charGroups = HM.empty , _matchScores = HM.empty , _ignoredScoresFstK = HM.empty , _ignoredScoresSndL = HM.empty } test = runExceptT $ fromFile True "scoring/unigramdefault.score" -- | This will prettyprint the error message and ungracefully exit prettyErrorAndExit ∷ MonadIO m ⇒ ErrInfo → m () prettyErrorAndExit e = do liftIO $ displayIO stdout $ renderPretty 0.8 80 $ (_errDoc e) <> linebreak liftIO $ exitFailure -- | Returns the error message, but will not exit. errorToString :: ErrInfo → String errorToString e = (displayS . renderPretty 0.8 80 $ _errDoc e) "" fromByteString ∷ ByteString → String → Except ErrInfo (UnigramScoring k l) fromByteString s fn = r where p = parseByteString (runP $ runStateT pUnigram defaultEnv) (Directed (UTF8.fromString fn) 0 0 0 0) s r = case p of Success (p',e) → return p' Failure e → throwError e fromFile ∷ Bool → FilePath → ExceptT ErrInfo IO (UnigramScoring k l) fromFile warn fp = do p' <- TT.parseFromFileEx (runP $ runStateT pUnigram defaultEnv) fp case p' of Success (p,e) → do let ws = e^.warnings unless (null ws || not warn) $ do liftIO $ mapM_ T.putStrLn ws return p Failure e → throwError e pUnigram ∷ UnigramParser (UnigramScoring k l) pUnigram = do whiteSpace many $ choice [pDefaults,pCharGroup,pSimilarity,pEquality,pIgnored] eof let uconstants ∷ Text → UnigramParser Double uconstants k = do kv ← use defaults case HM.lookup k kv of Nothing → do warnings %= (S.|> ("constant " <> k <> " not found, using default (-999999)")) return (-999999) Just v → return v usUnigramMatch ← (HM.fromList . map (first (ibsText *** ibsText)) . HM.toList) <$> use matchScores usUnigramInsertFstK ← (HM.fromList . map (first ibsText) . HM.toList) <$> use ignoredScoresFstK usUnigramInsertSndL ← (HM.fromList . map (first ibsText) . HM.toList) <$> use ignoredScoresSndL usGapLinear <- uconstants "GapLinear" usGapOpen <- uconstants "GapOpen" usGapExtension <- uconstants "GapExtension" usDefaultMatch <- uconstants "Match" usDefaultMismatch <- uconstants "Mismatch" usPrefixSuffixLinear <- uconstants "PrefixSuffixLinear" usPrefixSuffixOpen <- uconstants "PrefixSuffixOpen" usPrefixSuffixExtension <- uconstants "PrefixSuffixExtension" -- -- Given the @Env@, we can now construct the actual scoring system. return UnigramScoring{..} -- | Defaults are key-value pairs, of which there is only a small set. pDefaults :: UnigramParser () pDefaults = choice $ map pConstant cs where cs = [ "GapLinear", "GapOpen", "GapExtend", "PrefixSuffixOpen", "PrefixSuffixExtend" , "Match", "Mismatch" ] pConstant :: Text -> UnigramParser () pConstant r = do reserveText reserved r ds <- use defaults when (HM.member r ds) (fail $ show r ++ " already defined") v <- either fromIntegral id <$> integerOrDouble defaults %= HM.insert r v -- | Gives a name to a set of characters we want to work with later on. pCharGroup :: UnigramParser () pCharGroup = do reserve reserved "CharGroup" ty <- ident reserved -- TODO the {,} guys ls <- option [] . braces $ pExpansionOptions `sepEndBy` comma vs' <- runUnlined $ do gs <- HS.unions <$> (some $ (HS.singleton <$> pGrapheme) <|> pKnownCharGroup) rol <- restOfLine unless (rol == "\n") $ fail $ show (gs,rol) return gs someSpace let vs = applySpecialFunctions ls vs' charGroups %= HM.insert ty vs -- | Parses a similarity line and updates the scores for the pairs of -- characters. pSimilarity :: UnigramParser () pSimilarity = do reserve reserved "Similarity" ls1 <- option [] . braces $ pExpansionOptions `sepEndBy` comma ty1 <- runUnlined pKnownCharGroup ls2 <- option [] . braces $ pExpansionOptions `sepEndBy` comma ty2 <- runUnlined pKnownCharGroup v <- either fromIntegral id <$> integerOrDouble let xs = applySpecialFunctions ls1 ty1 let ys = applySpecialFunctions ls2 ty2 let vs = HM.fromList [ ((x,y),v) | x <- HS.toList xs, y <- HS.toList ys ] -- mapping from the first will be the result in clashes, hence @HS.union -- vs old@ ... matchScores %= HM.union vs -- | Parses an equality line and updates the scores for the pairs of -- characters. pEquality :: UnigramParser () pEquality = do reserve reserved "Equality" ls <- option [] . braces $ pExpansionOptions `sepEndBy` comma ty <- runUnlined pKnownCharGroup v <- either fromIntegral id <$> integerOrDouble let xss = map (applySpecialFunctions ls . HS.singleton) $ HS.toList ty let vs = HM.fromList [ ((x,y),v) | xs <- xss, x <- HS.toList xs, y <- HS.toList xs ] matchScores %= HM.union vs data FstKSndL = FstK | SndL deriving (Eq,Ord) pIgnored :: UnigramParser () pIgnored = (reserve reserved "Ignored" >> go [FstK,SndL]) <|> (reserve reserved "IgnoredFst" >> go [FstK] ) <|> (reserve reserved "IgnoredSnd" >> go [ SndL]) where go what = do ls <- option [] . braces $ pExpansionOptions `sepEndBy` comma ty <- runUnlined pKnownCharGroup v <- either fromIntegral id <$> integerOrDouble let xs = applySpecialFunctions ls ty let vs = HM.fromList [ (x,v) | x <- HS.toList xs ] when (FstK `elem` what) $ ignoredScoresFstK %= HM.union vs when (SndL `elem` what) $ ignoredScoresSndL %= HM.union vs -- | Defines what a grapheme is. Basically, don't be a whitespace and don't -- start with '$'. -- -- TODO we probably want to allow \$ to stand for '$'. pGrapheme :: (CharParsing p, TokenParsing p) => p Text pGrapheme = (T.pack <$> some (satisfy allowed) <* someSpace) "pGrapheme" where allowed x = (not $ isSpace x || x `elem` ("${}" :: String)) -- | Returns the set of characters from a known character group pKnownCharGroup :: Unlined UnigramParser (HS.HashSet Text) pKnownCharGroup = go "pKnownCharGroup" where go = do char '$' ty <- ident reserved cgs <- use charGroups case HM.lookup ty cgs of Nothing -> fail $ show ty ++ " is not a known CharGroup!" Just cg -> return cg -- | How we can expand a group with special functions. pExpansionOptions :: UnigramParser Text pExpansionOptions = choice $ map (text . fst) specialFunctions specialFunctions ∷ [(Text, Text → Text)] specialFunctions = [ ("ToUpper", T.toUpper) , ("ToLower", T.toLower) ] applySpecialFunctions ls xs = HS.unions $ xs : [ HS.map sf xs | (sfn,sf) <- specialFunctions, sfn `elem` ls ] -- | TODO only insert warning, not error, after seeing a character again! setIdent :: HashSet Text -> Unlined UnigramParser Text setIdent e = try $ do k <- ident reserved when (HS.member k e) $ fail "Character already present in EqualChars!" return k reserved :: TokenParsing m => IdentifierStyle m reserved = emptyIdents { _styleReserved = rs } where rs = HS.fromList [ -- "EqualChars", "SimilarChars", "EqualScore", "SimilarScore" -- , "IgnoredChars", "CharGroup" ] -- | This is just the trifecta parser, but with haskell-style comments enabled. newtype P a = P { runP :: Parser a } deriving ( Applicative , Monad , Functor , DeltaParsing , MonadPlus , Alternative , CharParsing , Parsing ) -- | This enables the haskell-style comments. instance TokenParsing P where someSpace = buildSomeSpaceParser (skipSome (satisfy isSpace)) haskellCommentStyle type UnigramParser = StateT Env P deriving instance DeltaParsing (Unlined UnigramParser)