module Text.NKJP.Named
(
Cert (..)
, Ptr (..)
, Deriv (..)
, Para (..)
, Sent (..)
, NE (..)
, parseNamed
, readNamed
, readCorpus
, readTrees
, mkForest
) where
import Data.Maybe (mapMaybe)
import qualified Data.Map as M
import qualified Data.Tree as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
import Text.XML.PolySoup
import qualified Data.Named.Graph as Nd
import qualified Data.Named.Tree as Nd
import qualified Text.NKJP.Tar as Tar
import qualified Text.NKJP.Morphosyntax as Mx
data Cert
= High
| Medium
| Low
deriving (Show)
data Ptr t
= Local
{ target :: t }
| Global
{ target :: t
, location :: t }
deriving (Show, Functor)
data Deriv t = Deriv
{ derivType :: t
, derivFrom :: t }
deriving (Show, Functor)
data Para t = Para
{ paraID :: t
, sentences :: [Sent t] }
deriving (Show, Functor)
data Sent t = Sent
{ sentID :: t
, names :: [NE t] }
deriving (Show, Functor)
data NE t = NE
{ neID :: t
, derived :: Maybe (Deriv t)
, neType :: t
, subType :: Maybe t
, orth :: t
, base :: Either t t
, cert :: Cert
, certComment :: Maybe t
, ptrs :: [Ptr t] }
deriving (Show)
instance Functor NE where
fmap f NE{..} = NE
{ neID = f neID
, derived = fmap (fmap f) derived
, neType = f neType
, subType = fmap f subType
, orth = f orth
, base = case base of
Left x -> Left (f x)
Right x -> Right (f x)
, cert = cert
, certComment = fmap f certComment
, ptrs = map (fmap f) ptrs }
mkForest :: Ord t => [Mx.Seg t] -> [NE t]
-> T.Forest (Either (NE t) (Mx.Seg t))
mkForest xs ns =
Nd.mapForest decode (Nd.toForest graph)
where
pos = (M.!) $ M.fromList (zip (map Mx.segID xs) [0..])
word = (M.!) $ M.fromList (zip [0..] xs)
name = (M.!) $ M.fromList [(neID ne, ne) | ne <- ns]
graph = Nd.mkGraph (0, length xs 1)
[ ( neID ne
, map resolve (ptrs ne) )
| ne <- ns ]
resolve (Local ptr) = Left ptr
resolve (Global ptr _) = Right (pos ptr)
decode (Left neID) = Left (name neID)
decode (Right k) = Right (word k)
type P a = XmlParser L.Text a
namedP :: P [Para L.Text]
namedP = true //> paraP
paraP :: P (Para L.Text)
paraP = uncurry Para <$> (tag "p" *> getAttr "xml:id" </> sentP)
sentP :: P (Sent L.Text)
sentP = uncurry Sent <$> (tag "s" *> getAttr "xml:id" </> nameP)
nameP :: P (NE L.Text)
nameP = (tag "seg" *> getAttr "xml:id") `join` \_neID -> do
ne <- nameBodyP
_ptrs <- some namePtrP
<|> failBad ("no targets specified for " ++ L.unpack _neID)
return $ ne { neID = _neID, ptrs = _ptrs }
nameBodyP :: P (NE L.Text)
nameBodyP = (tag "fs" *> hasAttr "type" "named") `joinR` do
_deriv <- optional derivP
_neType <- fSymP "type"
_subType <- optional (fSymP "subtype")
_orth <- fStrP "orth"
_base <- (Left <$> fStrP "base") <|> (Right <$> fStrP "when")
_cert <- certP
_certComment <- optional (fStrP "comment")
return $ NE { neType = _neType, subType = _subType, orth = _orth
, base = _base, derived = _deriv, cert = _cert
, certComment = _certComment, neID = "", ptrs = [] }
derivP :: P (Deriv L.Text)
derivP = fP "derived" `joinR` ( fsP "derivation" `joinR` do
Deriv <$> fSymP "derivType" <*> fStrP "derivedFrom" )
fP :: L.Text -> TagPred L.Text ()
fP x = tag "f" *> hasAttr "name" x
fsP :: L.Text -> TagPred L.Text ()
fsP x = tag "fs" *> hasAttr "type" x
certP :: P Cert
certP =
mkCert <$> fSymP "certainty"
where
mkCert "high" = High
mkCert "medium" = Medium
mkCert "low" = Low
mkCert _ = Medium
namePtrP :: P (Ptr L.Text)
namePtrP = cut (tag "ptr" *> getAttr "target") >>= \x -> return $
case L.break (=='#') x of
(ptr, "") -> Local ptr
(loc, ptr) -> Global
{ location = loc
, target = (L.tail ptr) }
fStrP :: L.Text -> P L.Text
fStrP x =
let checkName = tag "f" *> hasAttr "name" x
safeHead [] = ""
safeHead xs = head xs
in safeHead <$> (checkName #> tag "string" /> text)
fSymP :: L.Text -> P L.Text
fSymP x =
let checkName = tag "f" *> hasAttr "name" x
p = cut (tag "symbol" *> getAttr "value")
in head <$> (checkName /> p)
parseNamed :: L.Text -> [Para L.Text]
parseNamed = parseXml namedP
readNamed :: FilePath -> IO [Para L.Text]
readNamed namedPath = parseNamed <$> L.readFile namedPath
readCorpus :: FilePath -> IO [(FilePath, Maybe [Para L.Text])]
readCorpus = Tar.readCorpus "ann_named" parseNamed
readTrees :: FilePath -> IO [T.Forest (Either (NE L.Text) (Mx.Seg L.Text))]
readTrees path = do
morph <- Mx.readCorpus path
named <- readCorpus path
return . concat $ map toTrees (sync morph named)
where
toTrees (_, xs, ys) = map toForest $ zip
(concatMap Mx.sentences xs)
(concatMap sentences ys)
toForest (x, y) = mkForest (Mx.segments x) (names y)
sync :: [(FilePath, Maybe a)] -> [(FilePath, Maybe b)] -> [(FilePath, a, b)]
sync as bs =
mapMaybe (uncurry just) (zip as bs)
where
just (dir, Just x) (_, Just y) = Just (dir, x, y)
just _ _ = Nothing