{-# LANGUAGE OverloadedStrings #-} module NLP.Sequor.Features ( features , maybeFeatures , inputFeatures , outputFeatures , indexFeatures , eval ) where import qualified Helper.Text as Text import Helper.Text (Txt) import qualified Helper.ListZipper as LZ import Helper.ListZipper (ListZipper,at) import qualified Data.Char as Char import Data.List (group,sort) import qualified Data.IntSet as IntSet import qualified Data.IntMap as IntMap import Helper.Atom (MonadAtoms,AtomTable,from,toAtom,maybeToAtom) import Data.Maybe (catMaybes,isNothing) import Control.Monad (liftM2) import Data.Monoid (mappend) import NLP.Sequor.Config import qualified Data.Vector.Unboxed as V import NLP.Sequor.FeatureTemplate (Feature(..)) import Data.Word (Word,Word64) import qualified Hashable as H import Data.Int import NLP.Sequor.CoNLL toAtom' :: Int -> Txt -> Int toAtom' size s = fromIntegral ((H.hash s::Word64) `rem` fromIntegral size) iNDEX_SUFFIX :: Txt iNDEX_SUFFIX="::index" iNPUT_PREFIX :: Txt iNPUT_PREFIX="in:" oUTPUT_PREFIX :: Txt oUTPUT_PREFIX="out:" nULL_MARK :: Txt nULL_MARK = "" eval :: ListZipper Token -> Feature -> [Maybe Txt] eval z (Cell r c) = case z `at` r of [] -> [Nothing] fs -> [fs `index` c] eval z (Rect r c r' c') = concat [ eval z (Cell i j) | i <- [r..r'] , j <- [c..c'] ] eval z (Row r) = concat [ eval z (Cell r j) | j <- [0..length (z `at` 0)-1] ] eval z (MarkNull f) = [ maybe (Just nULL_MARK) Just fi | fi <- eval z f ] eval z (Index f) = [ fi+++Just iNDEX_SUFFIX | fi <- eval z f ] eval z (Cat fs) = concatMap (eval z) fs eval z (Cart f f') = [ fmap Text.normalize $ fi +++ Just "," +++ fi' | fi <- eval z f , fi' <- eval z f' ] eval z (Lower f) = [ fmap (Text.map Char.toLower) fi | fi <- eval z f ] eval z (Suffix i f) = [ fmap (Text.reverse . Text.take (fromIntegral i) . Text.reverse ) $ fi | fi <- eval z f ] eval z (Prefix i f) = [ fmap (Text.take (fromIntegral i)) $ fi | fi <- eval z f ] eval z (WordShape f) = [ fmap (spellingSpec) fi | fi <- eval z f ] spellingSpec = Text.fromString . map (\(x:xs) -> x) . group . map collapse . Text.toString collapse c | Char.isAlpha c && Char.isUpper c = 'X' | Char.isAlpha c && Char.isLower c = 'x' | Char.isDigit c = '0' | c == '-' = '-' | c == '_' = '_' | otherwise = '*' indexFeatures :: AtomTable -> IntSet.IntSet indexFeatures = IntMap.keysSet . IntMap.filter (iNDEX_SUFFIX `Text.isSuffixOf`) . from inputFeatures :: Config -> ListZipper Token -> [Txt] inputFeatures config x = catMaybes . prefixIndex iNPUT_PREFIX . eval x . featureTemplate $ config outputFeatures :: [Txt] -> [Txt] outputFeatures ys = catMaybes . prefixIndex oUTPUT_PREFIX . map Just $ case ys of (y:y':_) -> [y,y`Text.append`y'] [y] -> [y] [] -> [] features :: (Functor m, MonadAtoms m) => Maybe (Int,Int) -> Config -> ListZipper Token -> m (V.Vector Int) features bounds config = do case (flagHash . flags $ config,bounds) of (True,Just (_,size)) -> return . V.fromList . map (toAtom' size) . inputFeatures config (False,Nothing) -> fmap V.fromList . mapM toAtom . inputFeatures config maybeFeatures :: (Functor m, MonadAtoms m) => Maybe (Int,Int) -> Config -> ListZipper Token -> m (V.Vector Int) maybeFeatures bounds config = do case (flagHash . flags $ config,bounds) of (True,Just _) -> features bounds config (False,Nothing) -> fmap V.fromList . fmap catMaybes . mapM maybeToAtom . inputFeatures config prefixIndex :: Txt -> [Maybe Txt] -> [Maybe Txt] prefixIndex str = zipWith (\i x -> Just str +++ Just (Text.show i) +++ Just "=" +++ x ) [1..] (+++) = liftM2 (\s t -> Text.concat [s,t]) index [] _ = Nothing index (x:_) 0 = Just x index (_:xs) i = index xs (i-1) sent = LZ.fromList [["I","pro"],["like","v"],["Ike","pn"]] :: ListZipper Token