{-# LANGUAGE OverloadedStrings #-} module Features ( features , inputFeatures , outputFeatures , indexFeatures , maybeFeatures , eval ) where import qualified Text import Text (Txt) import qualified ListZipper as LZ import ListZipper (ListZipper,at) import CorpusReader (Token,fromWords) import qualified Data.Char as Char import Data.List (group,sort) import qualified Data.IntSet as IntSet import qualified Data.IntMap as IntMap import Atom (MonadAtoms,AtomTable,from,toAtom,maybeToAtom) import Data.Maybe (catMaybes,isNothing) import Control.Monad (liftM2) import Data.Monoid (mappend) import Config import qualified Data.Vector.Unboxed as V import FeatureTemplate (Feature(..)) iNDEX_SUFFIX :: Txt iNDEX_SUFFIX="::index" iNPUT_PREFIX :: Txt iNPUT_PREFIX="in:" oUTPUT_PREFIX :: Txt oUTPUT_PREFIX="out:" nULL_MARK :: Txt nULL_MARK = "" eval :: ListZipper Token -> Feature -> [Maybe Txt] eval z (Cell r c) = case z `at` r of [] -> [Nothing] fs -> [fs `index` c] eval z (Rect r c r' c') = concat [ eval z (Cell i j) | i <- [r..r'] , j <- [c..c'] ] eval z (Row r) = concat [ eval z (Cell r j) | j <- [0..length (z `at` 0)-1] ] eval z (MarkNull f) = [ maybe (Just nULL_MARK) Just fi | fi <- eval z f ] eval z (Index f) = [ fi+++Just iNDEX_SUFFIX | fi <- eval z f ] eval z (Cat fs) = concatMap (eval z) fs eval z (Cart f f') = [ fmap Text.normalize $ fi +++ Just "," +++ fi' | fi <- eval z f , fi' <- eval z f' ] eval z (Lower f) = [ fmap (Text.map Char.toLower) fi | fi <- eval z f ] eval z (Suffix i f) = [ fmap (Text.reverse . Text.take (fromIntegral i) . Text.reverse ) $ fi | fi <- eval z f ] eval z (Prefix i f) = [ fmap (Text.take (fromIntegral i)) $ fi | fi <- eval z f ] eval z (WordShape f) = [ fmap (spellingSpec) fi | fi <- eval z f ] spellingSpec = Text.fromString . map (\(x:xs) -> x) . group . map collapse . Text.toString collapse c | Char.isAlpha c && Char.isUpper c = 'X' | Char.isAlpha c && Char.isLower c = 'x' | Char.isDigit c = '0' | c == '-' = '-' | c == '_' = '_' | otherwise = '*' indexFeatures :: AtomTable -> IntSet.IntSet indexFeatures = IntMap.keysSet . IntMap.filter (iNDEX_SUFFIX `Text.isSuffixOf`) . from inputFeatures :: Config -> ListZipper Token -> [Txt] inputFeatures config x = catMaybes . prefixIndex oUTPUT_PREFIX . eval x . featureTemplate $ config outputFeatures :: [Txt] -> [Txt] outputFeatures ys = catMaybes . prefixIndex oUTPUT_PREFIX . map Just $ case ys of (y:y':_) -> [y,y`Text.append`y'] [y] -> [y] [] -> [] features :: (MonadAtoms m) => Config -> ListZipper Token -> m (V.Vector Int) features config x = do ifs <- mapM toAtom (inputFeatures config x) return $ V.fromList ifs maybeFeatures :: (MonadAtoms m) => Config -> ListZipper Token -> m (V.Vector Int) maybeFeatures config x = do ifs <- mapM maybeToAtom (inputFeatures config x) return (V.fromList $ catMaybes $ ifs) prefixIndex :: Txt -> [Maybe Txt] -> [Maybe Txt] prefixIndex str = zipWith (\i x -> Just str +++ Just (Text.show i) +++ Just "=" +++ x ) [1..] (+++) = liftM2 Text.append index [] _ = Nothing index (x:_) 0 = Just x index (_:xs) i = index xs (i-1) sent = LZ.fromList [["I","pro"],["like","v"],["Ike","pn"]] :: ListZipper Token