module NLP.Concraft.Polish.Morphosyntax
(
Tag
, Seg (..)
, Word (..)
, Interp (..)
, Space (..)
, select
, select'
, selectWMap
, Sent
, SentO (..)
, restore
, withOrig
, packSeg
, packSent
, packSentO
) where
import Control.Applicative ((<$>), (<*>))
import Control.Arrow (first)
import Data.Maybe (catMaybes)
import Data.Aeson
import Data.Binary (Binary, put, get, putWord8, getWord8)
import qualified Data.Aeson as Aeson
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Tagset.Positional as P
import qualified NLP.Concraft.Morphosyntax as X
type Tag = T.Text
data Seg t = Seg
{ word :: Word
, interps :: X.WMap (Interp t) }
deriving (Show, Eq, Ord)
instance (Ord t, Binary t) => Binary (Seg t) where
put Seg{..} = put word >> put interps
get = Seg <$> get <*> get
data Word = Word
{ orth :: T.Text
, space :: Space
, known :: Bool }
deriving (Show, Eq, Ord)
instance X.Word Word where
orth = orth
oov = not.known
instance ToJSON Word where
toJSON Word{..} = object
[ "orth" .= orth
, "space" .= space
, "known" .= known ]
instance FromJSON Word where
parseJSON (Object v) = Word
<$> v .: "orth"
<*> v .: "space"
<*> v .: "known"
parseJSON _ = error "parseJSON [Word]"
instance Binary Word where
put Word{..} = put orth >> put space >> put known
get = Word <$> get <*> get <*> get
data Interp t = Interp
{ base :: T.Text
, tag :: t }
deriving (Show, Eq, Ord)
instance (Ord t, Binary t) => Binary (Interp t) where
put Interp{..} = put base >> put tag
get = Interp <$> get <*> get
data Space
= None
| Space
| NewLine
deriving (Show, Eq, Ord)
instance Binary Space where
put x = case x of
None -> putWord8 1
Space -> putWord8 2
NewLine -> putWord8 3
get = getWord8 >>= \x -> return $ case x of
1 -> None
2 -> Space
_ -> NewLine
instance ToJSON Space where
toJSON x = Aeson.String $ case x of
None -> "none"
Space -> "space"
NewLine -> "newline"
instance FromJSON Space where
parseJSON (Aeson.String x) = return $ case x of
"none" -> None
"space" -> Space
"newline" -> NewLine
_ -> error "parseJSON [Space]"
parseJSON _ = error "parseJSON [Space]"
select :: Ord a => a -> Seg a -> Seg a
select = select' []
select' :: Ord a => [a] -> a -> Seg a -> Seg a
select' ys x = selectWMap . X.mkWMap $ (x, 1) : map (,0) ys
selectWMap :: Ord a => X.WMap a -> Seg a -> Seg a
selectWMap wMap seg =
seg { interps = newInterps }
where
wSet = S.fromList . map tag . M.keys . X.unWMap . interps $ seg
newInterps = X.mkWMap $
[ case M.lookup (tag interp) (X.unWMap wMap) of
Just x -> (interp, x)
Nothing -> (interp, 0)
| interp <- (M.keys . X.unWMap) (interps seg) ]
++ catMaybes
[ if tag `S.member` wSet
then Nothing
else Just (Interp lemma tag, x)
| let lemma = orth $ word seg
, (tag, x) <- M.toList (X.unWMap wMap) ]
type Sent t = [Seg t]
data SentO t = SentO
{ segs :: [Seg t]
, orig :: L.Text }
restore :: Sent t -> L.Text
restore =
let wordStr Word{..} = [spaceStr space, orth]
spaceStr None = ""
spaceStr Space = " "
spaceStr NewLine = "\n"
in L.fromChunks . concatMap (wordStr . word)
withOrig :: Sent t -> SentO t
withOrig s = SentO
{ segs = s
, orig = restore s }
packSeg_ :: Ord a => Seg a -> X.Seg Word a
packSeg_ Seg{..}
= X.Seg word
$ X.mkWMap
$ map (first tag)
$ M.toList
$ X.unWMap interps
packSeg :: P.Tagset -> Seg Tag -> X.Seg Word P.Tag
packSeg tagset = X.mapSeg (P.parseTag tagset) . packSeg_
packSent :: P.Tagset -> Sent Tag -> X.Sent Word P.Tag
packSent = map . packSeg
packSentO :: P.Tagset -> SentO Tag -> X.SentO Word P.Tag
packSentO tagset s = X.SentO
{ segs = packSent tagset (segs s)
, orig = orig s }