module Data.Alpino.Model ( FeatureValue(..),
TrainingInstance(..),
TrainingInstanceType(..),
bestScore,
bestScore',
bsToTrainingInstance,
filterFeatures,
filterFeaturesFunctor,
randomSample,
scoreToBinary,
scoreToBinaryNorm,
scoreToNorm,
trainingInstanceToBs
) where
import qualified Data.ByteString as B
import Data.ByteString.Internal (c2w)
import Data.ByteString.Lex.Double (readDouble)
import qualified Data.ByteString.UTF8 as BU
import Data.List (foldl')
import Data.Maybe (fromJust)
import qualified Data.Set as Set
import GHC.Word (Word8)
import System.Random (RandomGen)
import System.Random.Shuffle (shuffle')
import Text.Printf (printf)
data TrainingInstance = TrainingInstance {
instanceType :: TrainingInstanceType,
instanceKey :: B.ByteString,
instanceN :: B.ByteString,
instanceScore :: Double,
instanceFeatures :: Features
} deriving (Show, Eq)
data TrainingInstanceType = ParsingInstance
| GenerationInstance
deriving (Show, Eq)
data Features = FeaturesString B.ByteString
| FeaturesList [FeatureValue]
deriving (Show, Eq)
data FeatureValue = FeatureValue {
feature :: B.ByteString,
value :: Double
} deriving (Show, Eq)
bestScore :: [TrainingInstance] -> Double
bestScore = foldl (\acc e -> max acc $ instanceScore e) 0.0
bestScore' :: [TrainingInstance] -> Double
bestScore' = foldl' (\acc e -> max acc $ instanceScore e) 0.0
bsToTrainingInstance :: B.ByteString -> Maybe TrainingInstance
bsToTrainingInstance l
| length lineParts /= 5 = Nothing
| otherwise = Just $ TrainingInstance instType key n score features
where lineParts = B.split instanceFieldSep l
instType = bsToType $ lineParts !! 0
key = lineParts !! 1
n = lineParts !! 2
score = fst . fromJust . readDouble $ lineParts !! 3
features = FeaturesString $ lineParts !! 4
trainingInstanceToBs :: TrainingInstance -> B.ByteString
trainingInstanceToBs (TrainingInstance instType keyBS nBS sc fvals) =
B.intercalate fieldSep [typeBS, keyBS, nBS, scoreBS, fValsBS]
where typeBS = typeToBS instType
scoreBS = BU.fromString $ printf "%f" sc
fValsBS = featuresToBs fvals
fieldSep = BU.fromString "#"
instanceFieldSep :: GHC.Word.Word8
instanceFieldSep = c2w '#'
bsToType :: B.ByteString -> TrainingInstanceType
bsToType bs
| bs == parseMarker = ParsingInstance
| bs == generationMarker = GenerationInstance
| otherwise = error "Unknown marker."
typeToBS :: TrainingInstanceType -> B.ByteString
typeToBS ParsingInstance = parseMarker
typeToBS GenerationInstance = generationMarker
parseMarker :: BU.ByteString
parseMarker = BU.fromString "P"
generationMarker :: BU.ByteString
generationMarker = BU.fromString "G"
parsedFeatures :: Features -> [FeatureValue]
parsedFeatures (FeaturesList l) = l
parsedFeatures (FeaturesString s) = map fVal $ B.split fieldSep s
where fVal p = FeatureValue f (fst $ fromJust $ readDouble valBs)
where [valBs, f] = B.split fValSep p
fieldSep = c2w '|'
fValSep = c2w '@'
featuresToBs :: Features -> B.ByteString
featuresToBs (FeaturesString s) = s
featuresToBs (FeaturesList l) = B.intercalate fieldSep $ map toBs l
where toBs (FeatureValue f val) = B.intercalate fValSep
[BU.fromString $ printf "%f" val, f]
fieldSep = BU.fromString "|"
fValSep = BU.fromString "@"
filterFeatures :: (Bool -> Bool) -> Set.Set B.ByteString -> TrainingInstance ->
TrainingInstance
filterFeatures f keepFeatures i =
i { instanceFeatures = FeaturesList $ filter keep $
parsedFeatures $ instanceFeatures i}
where keep fv = f $ Set.member (feature fv) keepFeatures
filterFeaturesFunctor :: (Bool -> Bool) -> Set.Set B.ByteString ->
TrainingInstance -> TrainingInstance
filterFeaturesFunctor f keepFeatures i =
i { instanceFeatures = FeaturesList $ filter keep $ parsedFeatures $
instanceFeatures i}
where keep fv = f $ Set.member (functor $ feature fv) keepFeatures
functor func = B.split argOpen func !! 0
argOpen = c2w '('
randomSample :: RandomGen g => g -> Int -> [TrainingInstance] ->
[TrainingInstance]
randomSample g n i
| instLen <= n = i
| otherwise = take n $ shuffle' i instLen g
where instLen = length i
scoreToBinary :: [TrainingInstance] -> [TrainingInstance]
scoreToBinary ctx = map (rescoreEvt maxScore) ctx
where maxScore = bestScore ctx
rescoreEvt maxS evt
| instanceScore evt == maxS = evt { instanceScore = 1.0 }
| otherwise = evt { instanceScore = 0.0 }
scoreToBinaryNorm :: [TrainingInstance] -> [TrainingInstance]
scoreToBinaryNorm ctx = map (rescoreEvt maxScore) ctx
where maxScore = bestScore ctx
numMax = length . filter (\e -> instanceScore e == maxScore) $ ctx
correctScore = 1.0 / fromIntegral numMax
rescoreEvt maxS evt
| instanceScore evt == maxS =
evt { instanceScore = correctScore }
| otherwise = evt { instanceScore = 0.0 }
scoreToNorm :: [TrainingInstance] -> [TrainingInstance]
scoreToNorm ctx = map (rescoreEvt norm) ctx
where norm = sum $ map instanceScore ctx
rescoreEvt n evt =
evt { instanceScore = (instanceScore evt) / n }