module NLP.Scoring.Unigram.Import where
import Control.Applicative
import Control.Arrow (first, (***))
import Control.Lens
import Control.Monad
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.State.Class
import Control.Monad.Trans.State.Strict hiding (gets)
import Control.Monad.Except
import Data.ByteString (ByteString)
import Data.Char
import Data.HashMap.Strict (fromList, HashMap)
import Data.HashSet (HashSet)
import Data.Maybe
import Data.Monoid
import Data.String (IsString)
import Data.Text (Text)
import Debug.Trace
import qualified Data.ByteString as BS
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.HashMap.Strict as HM
import qualified Data.HashSet as HS
import qualified Data.Sequence as S
import qualified Data.Text as T
import qualified Data.Text.IO as T
import System.Exit (exitFailure)
import System.IO (stdout)
import Text.Parser.LookAhead
import Text.Parser.Token.Style
import Text.PrettyPrint.ANSI.Leijen (displayIO, renderPretty, linebreak, displayS)
import Text.Trifecta as TT
import Text.Trifecta.Delta (Delta(..))
import Data.ByteString.Interned
import NLP.Scoring.Unigram
data Env = Env
{ _warnings ∷ !(S.Seq Text)
, _defaults ∷ !(HashMap Text Double)
, _charGroups ∷ !(HashMap Text (HashSet Text))
, _matchScores ∷ !(HashMap (Text,Text) Double)
, _ignoredScoresFstK ∷ !(HashMap Text Double)
, _ignoredScoresSndL ∷ !(HashMap Text Double)
}
deriving (Show)
makeLenses ''Env
defaultEnv = Env
{ _warnings = S.empty
, _defaults = HM.empty
, _charGroups = HM.empty
, _matchScores = HM.empty
, _ignoredScoresFstK = HM.empty
, _ignoredScoresSndL = HM.empty
}
test = runExceptT $ fromFile True "scoring/unigramdefault.score"
prettyErrorAndExit ∷ MonadIO m ⇒ ErrInfo → m ()
prettyErrorAndExit e = do
liftIO $ displayIO stdout $ renderPretty 0.8 80 $ (_errDoc e) <> linebreak
liftIO $ exitFailure
errorToString :: ErrInfo → String
errorToString e = (displayS . renderPretty 0.8 80 $ _errDoc e) ""
fromByteString ∷ ByteString → String → Except ErrInfo (UnigramScoring k l)
fromByteString s fn = r where
p = parseByteString (runP $ runStateT pUnigram defaultEnv)
(Directed (UTF8.fromString fn) 0 0 0 0) s
r = case p of
Success (p',e) → return p'
Failure e → throwError e
fromFile ∷ Bool → FilePath → ExceptT ErrInfo IO (UnigramScoring k l)
fromFile warn fp = do
p' <- TT.parseFromFileEx (runP $ runStateT pUnigram defaultEnv) fp
case p' of
Success (p,e) → do
let ws = e^.warnings
unless (null ws || not warn) $ do
liftIO $ mapM_ T.putStrLn ws
return p
Failure e → throwError e
pUnigram ∷ UnigramParser (UnigramScoring k l)
pUnigram = do
whiteSpace
many $ choice [pDefaults,pCharGroup,pSimilarity,pEquality,pIgnored]
eof
let uconstants ∷ Text → UnigramParser Double
uconstants k = do
kv ← use defaults
case HM.lookup k kv of
Nothing → do
warnings %= (S.|> ("constant " <> k <> " not found, using default (-999999)"))
return (-999999)
Just v → return v
usUnigramMatch ← (HM.fromList . map (first (ibsText *** ibsText)) . HM.toList) <$> use matchScores
usUnigramInsertFstK ← (HM.fromList . map (first ibsText) . HM.toList) <$> use ignoredScoresFstK
usUnigramInsertSndL ← (HM.fromList . map (first ibsText) . HM.toList) <$> use ignoredScoresSndL
usGapLinear <- uconstants "GapLinear"
usGapOpen <- uconstants "GapOpen"
usGapExtension <- uconstants "GapExtension"
usDefaultMatch <- uconstants "Match"
usDefaultMismatch <- uconstants "Mismatch"
usPrefixSuffixLinear <- uconstants "PrefixSuffixLinear"
usPrefixSuffixOpen <- uconstants "PrefixSuffixOpen"
usPrefixSuffixExtension <- uconstants "PrefixSuffixExtension"
return UnigramScoring{..}
pDefaults :: UnigramParser ()
pDefaults = choice $ map pConstant cs
where
cs = [ "GapLinear", "GapOpen", "GapExtend", "PrefixSuffixOpen", "PrefixSuffixExtend"
, "Match", "Mismatch"
]
pConstant :: Text -> UnigramParser ()
pConstant r = do
reserveText reserved r
ds <- use defaults
when (HM.member r ds) (fail $ show r ++ " already defined")
v <- either fromIntegral id <$> integerOrDouble
defaults %= HM.insert r v
pCharGroup :: UnigramParser ()
pCharGroup = do
reserve reserved "CharGroup"
ty <- ident reserved
ls <- option [] . braces $ pExpansionOptions `sepEndBy` comma
vs' <- runUnlined $ do
gs <- HS.unions <$> (some $ (HS.singleton <$> pGrapheme) <|> pKnownCharGroup)
rol <- restOfLine
unless (rol == "\n") $ fail $ show (gs,rol)
return gs
someSpace
let vs = applySpecialFunctions ls vs'
charGroups %= HM.insert ty vs
pSimilarity :: UnigramParser ()
pSimilarity = do
reserve reserved "Similarity"
ls1 <- option [] . braces $ pExpansionOptions `sepEndBy` comma
ty1 <- runUnlined pKnownCharGroup
ls2 <- option [] . braces $ pExpansionOptions `sepEndBy` comma
ty2 <- runUnlined pKnownCharGroup
v <- either fromIntegral id <$> integerOrDouble
let xs = applySpecialFunctions ls1 ty1
let ys = applySpecialFunctions ls2 ty2
let vs = HM.fromList [ ((x,y),v) | x <- HS.toList xs, y <- HS.toList ys ]
matchScores %= HM.union vs
pEquality :: UnigramParser ()
pEquality = do
reserve reserved "Equality"
ls <- option [] . braces $ pExpansionOptions `sepEndBy` comma
ty <- runUnlined pKnownCharGroup
v <- either fromIntegral id <$> integerOrDouble
let xss = map (applySpecialFunctions ls . HS.singleton) $ HS.toList ty
let vs = HM.fromList [ ((x,y),v) | xs <- xss, x <- HS.toList xs, y <- HS.toList xs ]
matchScores %= HM.union vs
data FstKSndL = FstK | SndL
deriving (Eq,Ord)
pIgnored :: UnigramParser ()
pIgnored = (reserve reserved "Ignored" >> go [FstK,SndL])
<|> (reserve reserved "IgnoredFst" >> go [FstK] )
<|> (reserve reserved "IgnoredSnd" >> go [ SndL])
where
go what = do
ls <- option [] . braces $ pExpansionOptions `sepEndBy` comma
ty <- runUnlined pKnownCharGroup
v <- either fromIntegral id <$> integerOrDouble
let xs = applySpecialFunctions ls ty
let vs = HM.fromList [ (x,v) | x <- HS.toList xs ]
when (FstK `elem` what) $ ignoredScoresFstK %= HM.union vs
when (SndL `elem` what) $ ignoredScoresSndL %= HM.union vs
pGrapheme :: (CharParsing p, TokenParsing p) => p Text
pGrapheme = (T.pack <$> some (satisfy allowed) <* someSpace) <?> "pGrapheme"
where allowed x = (not $ isSpace x || x `elem` ("${}" :: String))
pKnownCharGroup :: Unlined UnigramParser (HS.HashSet Text)
pKnownCharGroup = go <?> "pKnownCharGroup" where
go = do
char '$'
ty <- ident reserved
cgs <- use charGroups
case HM.lookup ty cgs of
Nothing -> fail $ show ty ++ " is not a known CharGroup!"
Just cg -> return cg
pExpansionOptions :: UnigramParser Text
pExpansionOptions = choice $ map (text . fst) specialFunctions
specialFunctions ∷ [(Text, Text → Text)]
specialFunctions =
[ ("ToUpper", T.toUpper)
, ("ToLower", T.toLower)
]
applySpecialFunctions ls xs =
HS.unions $ xs : [ HS.map sf xs | (sfn,sf) <- specialFunctions, sfn `elem` ls ]
setIdent :: HashSet Text -> Unlined UnigramParser Text
setIdent e = try $ do
k <- ident reserved
when (HS.member k e) $ fail "Character already present in EqualChars!"
return k
reserved :: TokenParsing m => IdentifierStyle m
reserved = emptyIdents { _styleReserved = rs }
where rs = HS.fromList [
]
newtype P a = P { runP :: Parser a }
deriving ( Applicative
, Monad
, Functor
, DeltaParsing
, MonadPlus
, Alternative
, CharParsing
, Parsing
)
instance TokenParsing P where
someSpace = buildSomeSpaceParser
(skipSome (satisfy isSpace))
haskellCommentStyle
type UnigramParser = StateT Env P
deriving instance DeltaParsing (Unlined UnigramParser)