-- | This module defines a simple scoring scheme based on pairs of unigrams. module NLP.Scoring.Unigram where import Data.Aeson import Data.Hashable import Data.HashMap.Strict import GHC.Generics import Data.ByteString.Interned -- | Score 'IBS's @x@ and @y@ based on the simple scoring system: (i) -- lookup (x,y) and use the score if found; (ii) if (x,y) is not in the -- database, then return the default matching 'defaultMatch' score if -- @x==y@, otherwise return the default mismatch 'defaultMismatch' score. -- Note that even though @IBS k@ and @IBS l@ have different types, -- mismatches are checked using the underlying @Int@ representation. matchUnigram ∷ UnigramScoring k l → IBS k → IBS l → Double matchUnigram UnigramScoring{..} x y = lookupDefault (if getIBS x == getIBS y then usDefaultMatch else usDefaultMismatch) (x,y) usUnigramMatch {-# Inline matchUnigram #-} -- | Provides a score for the unigram characters in an @in/del@ -- environment. In case the character @x@ in the pairing @x == '-'@ is -- found in the @unigramInsert@ database, that score is used, otherwise the -- @gapLinear@ score is used. insertUnigramFstK ∷ UnigramScoring k l → IBS k → Double insertUnigramFstK UnigramScoring{..} x = lookupDefault usGapLinear x usUnigramInsertFstK {-# Inline insertUnigramFstK #-} -- | Analog to 'insertUnigramSndL', but works on the @IBS l@ with phantom -- type @l@. insertUnigramSndL ∷ UnigramScoring k l → IBS l → Double insertUnigramSndL UnigramScoring{..} x = lookupDefault usGapLinear x usUnigramInsertSndL {-# Inline insertUnigramSndL #-} -- TODO $UTF-Vowels , etc in parsing ?! -- | Collect the hashtable and scalar values for simple scoring. -- -- TODO binary and cereal instances data UnigramScoring k l = UnigramScoring { usUnigramMatch ∷ !(HashMap (IBS k, IBS l) Double) -- ^ All known matching characters and associated scores. , usUnigramInsertFstK ∷ !(HashMap (IBS k) Double) -- ^ Characters that can be deleted with costs different from -- @gapOpen@/@gapExtension@. This is the insertion map, associated with -- the first type @k@. , usUnigramInsertSndL ∷ !(HashMap (IBS l) Double) -- ^ Characters that can be deleted with costs different from -- @gapOpen@/@gapExtension@. This is the insertion map, associated with -- the second type @l@. , usGapLinear ∷ !Double -- ^ linear gap scores , usGapOpen ∷ !Double -- ^ Gap opening costs for Gotoh-style grammars. , usGapExtension ∷ !Double -- ^ Gap extension costs for Gotoh-style grammars. , usDefaultMatch ∷ !Double -- ^ Default score for characters matching, i.e. @x==y@. , usDefaultMismatch ∷ !Double -- ^ Default score for characters not matching, i.e. @x/=y@. , usPrefixSuffixLinear ∷ !Double -- ^ Special gap score for a prefix or suffix. , usPrefixSuffixOpen ∷ !Double -- ^ Special gap opening score for a prefix or suffix. , usPrefixSuffixExtension ∷ !Double -- ^ Special gap extension score for a prefix or suffix. } deriving (Read,Show,Eq,Generic) instance Hashable (UnigramScoring k l) instance FromJSON (UnigramScoring k l) where parseJSON (Object v) = UnigramScoring <$> (fromList `fmap` (v .: "unigramMatch")) <*> (fromList `fmap` (v .: "unigramInsertFstK")) <*> (fromList `fmap` (v .: "unigramInsertSndL")) <*> v .: "gapLinear" <*> v .: "gapOpen" <*> v .: "gapExtension" <*> v .: "defaultMatch" <*> v .: "defaultMismatch" <*> v .: "prefixSuffixLinear" <*> v .: "prefixSuffixOpen" <*> v .: "prefixSuffixExtension" instance ToJSON (UnigramScoring k l) where toJSON UnigramScoring {..} = object [ "unigramMatch" .= toList usUnigramMatch , "unigramInsertFstK" .= toList usUnigramInsertFstK , "unigramInsertSndL" .= toList usUnigramInsertSndL , "gapLinear" .= usGapLinear , "gapOpen" .= usGapOpen , "gapExtension" .= usGapExtension , "defaultMatch" .= usDefaultMatch , "defaultMismatch" .= usDefaultMismatch , "prefixSuffixLinear" .= usPrefixSuffixLinear , "prefixSuffixOpen" .= usPrefixSuffixOpen , "prefixSuffixExtension" .= usPrefixSuffixExtension ]