{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE RecordWildCards #-} -- | Parsing the NKJP named entity layer. module Text.NKJP.Named ( -- * Data types Cert (..) , Ptr (..) , Deriv (..) , Para (..) , Sent (..) , NE (..) -- * Parsing , parseNamed , readNamed , readCorpus , readTrees -- * Utilities , mkForest ) where import Data.Maybe (mapMaybe) import qualified Data.Map as M import qualified Data.Tree as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.IO as L import Text.XML.PolySoup import qualified Data.Named.Graph as Nd import qualified Data.Named.Tree as Nd import qualified Text.NKJP.Tar as Tar import qualified Text.NKJP.Morphosyntax as Mx -- | A certainty of an annotator. data Cert = High | Medium | Low deriving (Show) -- | A pointer. data Ptr t -- | Of #id form. = Local { target :: t } -- | Of loc#id form. | Global { target :: t , location :: t } deriving (Show, Functor) -- | A derivation structure. data Deriv t = Deriv { derivType :: t , derivFrom :: t } deriving (Show, Functor) -- | A paragraph. data Para t = Para { paraID :: t , sentences :: [Sent t] } deriving (Show, Functor) -- | A sentence. data Sent t = Sent { sentID :: t , names :: [NE t] } deriving (Show, Functor) -- | A segment element in a file. data NE t = NE { neID :: t , derived :: Maybe (Deriv t) , neType :: t , subType :: Maybe t , orth :: t -- | Left base or Right when. , base :: Either t t , cert :: Cert , certComment :: Maybe t , ptrs :: [Ptr t] } deriving (Show) instance Functor NE where fmap f NE{..} = NE { neID = f neID , derived = fmap (fmap f) derived , neType = f neType , subType = fmap f subType , orth = f orth , base = case base of Left x -> Left (f x) Right x -> Right (f x) , cert = cert , certComment = fmap f certComment , ptrs = map (fmap f) ptrs } -- | Make NE forest from a segment list and a list of NEs, both lists -- corresponding to the same sentence. mkForest :: Ord t => [Mx.Seg t] -> [NE t] -> T.Forest (Either (NE t) (Mx.Seg t)) mkForest xs ns = Nd.mapForest decode (Nd.toForest graph) where -- Position of segment ID pos = (M.!) $ M.fromList (zip (map Mx.segID xs) [0..]) -- Segment on the given position word = (M.!) $ M.fromList (zip [0..] xs) -- NE with given ID name = (M.!) $ M.fromList [(neID ne, ne) | ne <- ns] graph = Nd.mkGraph (0, length xs - 1) [ ( neID ne , map resolve (ptrs ne) ) | ne <- ns ] resolve (Local ptr) = Left ptr resolve (Global ptr _) = Right (pos ptr) decode (Left neID) = Left (name neID) decode (Right k) = Right (word k) -- | TEI NKJP ann_morphosyntax parser. type P a = XmlParser L.Text a namedP :: P [Para L.Text] namedP = true //> paraP paraP :: P (Para L.Text) paraP = uncurry Para <$> (tag "p" *> getAttr "xml:id" sentP) sentP :: P (Sent L.Text) sentP = uncurry Sent <$> (tag "s" *> getAttr "xml:id" nameP) nameP :: P (NE L.Text) nameP = (tag "seg" *> getAttr "xml:id") `join` \_neID -> do ne <- nameBodyP _ptrs <- some namePtrP <|> failBad ("no targets specified for " ++ L.unpack _neID) return $ ne { neID = _neID, ptrs = _ptrs } nameBodyP :: P (NE L.Text) nameBodyP = (tag "fs" *> hasAttr "type" "named") `joinR` do _deriv <- optional derivP _neType <- fSymP "type" _subType <- optional (fSymP "subtype") _orth <- fStrP "orth" _base <- (Left <$> fStrP "base") <|> (Right <$> fStrP "when") _cert <- certP _certComment <- optional (fStrP "comment") return $ NE { neType = _neType, subType = _subType, orth = _orth , base = _base, derived = _deriv, cert = _cert , certComment = _certComment, neID = "", ptrs = [] } derivP :: P (Deriv L.Text) derivP = fP "derived" `joinR` ( fsP "derivation" `joinR` do Deriv <$> fSymP "derivType" <*> fStrP "derivedFrom" ) fP :: L.Text -> TagPred L.Text () fP x = tag "f" *> hasAttr "name" x fsP :: L.Text -> TagPred L.Text () fsP x = tag "fs" *> hasAttr "type" x certP :: P Cert certP = mkCert <$> fSymP "certainty" where mkCert "high" = High mkCert "medium" = Medium mkCert "low" = Low mkCert _ = Medium -- It should not happen! namePtrP :: P (Ptr L.Text) namePtrP = cut (tag "ptr" *> getAttr "target") >>= \x -> return $ case L.break (=='#') x of (ptr, "") -> Local ptr (loc, ptr) -> Global { location = loc , target = (L.tail ptr) } fStrP :: L.Text -> P L.Text fStrP x = let checkName = tag "f" *> hasAttr "name" x -- Body sometimes is empty. safeHead [] = "" safeHead xs = head xs in safeHead <$> (checkName #> tag "string" /> text) fSymP :: L.Text -> P L.Text fSymP x = let checkName = tag "f" *> hasAttr "name" x p = cut (tag "symbol" *> getAttr "value") in head <$> (checkName /> p) -- | Parse textual contents of the ann_named.xml file. parseNamed :: L.Text -> [Para L.Text] parseNamed = parseXml namedP -- | Parse the stand-alone ann_named.xml file. readNamed :: FilePath -> IO [Para L.Text] readNamed namedPath = parseNamed <$> L.readFile namedPath -- | Parse all ann_named.xml files from the NCP .tar.gz file. readCorpus :: FilePath -> IO [(FilePath, Maybe [Para L.Text])] readCorpus = Tar.readCorpus "ann_named" parseNamed -- | Parse the NCP .tar.gz corpus, extract all NEs and translate them -- to the tree form using the 'mkForest' function. readTrees :: FilePath -> IO [T.Forest (Either (NE L.Text) (Mx.Seg L.Text))] readTrees path = do morph <- Mx.readCorpus path named <- readCorpus path return . concat $ map toTrees (sync morph named) where toTrees (_, xs, ys) = map toForest $ zip (concatMap Mx.sentences xs) (concatMap sentences ys) toForest (x, y) = mkForest (Mx.segments x) (names y) sync :: [(FilePath, Maybe a)] -> [(FilePath, Maybe b)] -> [(FilePath, a, b)] sync as bs = mapMaybe (uncurry just) (zip as bs) where just (dir, Just x) (_, Just y) = Just (dir, x, y) just _ _ = Nothing