{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module NLP.Nerf.XCES
( nerXCES
) where
import Prelude hiding (Word)
import qualified Data.Text.Lazy as L
import Data.List (intercalate, intersperse)
import Data.Char (isSpace)
import Text.HTML.TagSoup ((~==))
import qualified Text.HTML.TagSoup as S
import Text.XML.PolySoup hiding (Parser)
import Data.Named.Tree
import NLP.Nerf.Types
import qualified NLP.Nerf.Tokenize as Tok
type Tag = S.Tag L.Text
type Parser a = XmlParser L.Text a
sentOpen :: Tag
sentOpen = S.TagOpen "chunk" [("type", "s")]
sentClose :: Tag
sentClose = S.TagClose "chunk"
tokOpen :: Tag
tokOpen = S.TagOpen "tok" []
nsOpen :: Tag
nsOpen = S.TagOpen "ns" []
chunk :: [Tag] -> [Either Tag [Tag]]
chunk (x:xs)
| x ~== sentOpen =
let (sent, rest) = takeSent xs
in Right (x:sent) : chunk rest
| otherwise = Left x : chunk xs
chunk [] = []
takeSent :: [Tag] -> ([Tag], [Tag])
takeSent = go [] where
go acc (x:xs)
| x == sentClose = (reverse $ x:acc, xs)
| otherwise = go (x:acc) xs
go _ [] = error "takeSent: expected sentence closing tag"
unChunk :: [Either Tag [Tag]] -> [Tag]
unChunk = concatMap $ either (:[]) id
data SentI = SentI {
sentBegI :: Tag
, sentConI :: [(SegT, XmlTree)]
} deriving (Show)
data SegT = TokT | NsT | OtherT deriving (Show)
idTreeT :: XmlTree -> SegT
idTreeT (Node x _)
| x ~== tokOpen = TokT
| x ~== nsOpen = NsT
| otherwise = OtherT
sentIP :: Parser SentI
sentIP =
begP >^> \x -> SentI x <$> many elemP
where
begP = tag "chunk" *> hasAttr "type" "s" *> getTag
elemP = (\x -> (idTreeT x, x)) <$> xmlTreeP
data Sent t = Sent {
sentBeg :: Tag
, sentCon :: t Tok
, sentAdd :: [XmlTree] }
joinSent :: SentI -> Sent []
joinSent SentI{..} =
uncurry (Sent sentBegI) (go [] [] False sentConI)
where
go acc res hasNs ((typ, tagTree) : xs) = case typ of
TokT ->
let tok = Tok
{ orth = tagsParseXml tokOrthP (enumTree tagTree)
, nps = hasNs
, tagsIn = tagTree
, tagsBf = reverse acc }
in go [] (tok:res) False xs
NsT -> go (tagTree:acc) res True xs
OtherT -> go (tagTree:acc) res hasNs xs
go acc res _ [] = (reverse res, reverse acc)
parseSent :: [Tag] -> Sent []
parseSent = joinSent . tagsParseXml sentIP
newtype Ann a = Ann { unAnn :: NeForest NE a }
neOpen :: NE -> Tag
neOpen x = S.TagOpen "group" [("type", L.fromStrict x)]
neClose :: Tag
neClose = S.TagClose "group"
renderAnnSent :: Sent Ann -> [Tag]
renderAnnSent Sent{..} = between
[sentBeg, newline]
[newline, sentClose]
( interMap renderNeTree (unAnn sentCon) )
renderNeTree :: NeTree NE Tok -> [Tag]
renderNeTree (Node (Left v) xs)
= between
[neOpen v, newline]
[newline, neClose]
$ interMap renderNeTree xs
renderNeTree (Node (Right t) _) = renderTok t
data Tok = Tok
{ orth :: L.Text
, nps :: Bool
, tagsIn :: XmlTree
, tagsBf :: [XmlTree]
}
instance Tok.Word Tok where
word = Tok.word . orth
tokOrthP :: Parser L.Text
tokOrthP = maybe "" id <$> (tag "tok" ^> findIgnore (tag "orth" ^> text))
renderTok :: Tok -> [Tag]
renderTok Tok{..} = case before of
[] -> inside
_ -> intercalate [newline] [before, inside]
where
before = interMap enumTree tagsBf
inside =
let Node v xs = tagsIn
in between [v, newline] [newline, endFrom v]
(interMap enumTree xs)
type XmlTree = Tree Tag
xmlTreeP :: Parser XmlTree
xmlTreeP =
let commTag = satisfyPred ((,) <$> getTag <*> isTagComment)
textTag = satisfyPred ((,) <$> getTag <*> isTagText)
leafTag = fst <$> (textTag <|> commTag)
in trueXmlTreeP <|> (Node <$> leafTag <*> pure [])
trueXmlTreeP :: Parser XmlTree
trueXmlTreeP = do
(beg, name) <- satisfyPred ((,) <$> getTag <*> tagOpenName)
subForest <- beg `seq` name `seq` many xmlTreeP
satisfyPred $ isTagCloseName name
return $ Node beg subForest
enumTree :: XmlTree -> [Tag]
enumTree (Node v xs) = if S.isTagOpen v
then v : concatMap enumTree xs ++ [endFrom v]
else [v]
between :: [a] -> [a] -> [a] -> [a]
between p q xs = p ++ xs ++ q
newline :: Tag
newline = S.TagText "\n"
endFrom :: Tag -> Tag
endFrom (S.TagOpen x _) = S.TagClose x
endFrom _ = error "endFrom: not an opening tag"
interMap :: (a -> [Tag]) -> [a] -> [Tag]
interMap f = intercalate [newline] . map f
nerXCES :: (String -> NeForest NE Word) -> L.Text -> L.Text
nerXCES nerFun
= S.renderTagsOptions opts
. unChunk
. intersperse (Left newline)
. mapR
( renderAnnSent
. nerSent nerFun
. parseSent )
. chunk
. filter relevant
. S.parseTags
where
opts = S.renderOptions {S.optMinimize = const True}
mapR = map . fmap
relevant (S.TagWarning _) = False
relevant (S.TagPosition _ _) = False
relevant (S.TagText x) = not $ L.all isSpace x
relevant _ = True
nerSent :: (String -> NeForest NE Word) -> Sent [] -> Sent Ann
nerSent nerFun s@Sent{..} = s
{ sentCon = Ann $ Tok.sync
(nerFun $ restoreOrigSent sentCon)
sentCon }
restoreOrigSent :: [Tok] -> String
restoreOrigSent
= dropWhile isSpace
. concatMap tokStr
where
tokStr Tok{..} = (if nps then "" else " ") ++ (L.unpack orth)