{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}


-- | Support for the XCES format.


module NLP.Nerf.XCES
( nerXCES
) where


import           Prelude hiding (Word)

import qualified Data.Text.Lazy as L
import           Data.List (intercalate, intersperse)
import           Data.Char (isSpace)
import           Text.HTML.TagSoup ((~==))
import qualified Text.HTML.TagSoup as S
import           Text.XML.PolySoup hiding (Parser)

import           Data.Named.Tree
import           NLP.Nerf.Types
import qualified NLP.Nerf.Tokenize as Tok


---------------------------------------------------------------------
-- Core types
---------------------------------------------------------------------


-- | An XML tag.
type Tag = S.Tag L.Text


-- | An XML parser.
type Parser a = XmlParser L.Text a


---------------------------------------------------------------------
-- XML tags
---------------------------------------------------------------------


-- | A sentence opening tag.
sentOpen :: Tag
sentOpen = S.TagOpen "chunk" [("type", "s")]


-- | A sentence opening tag.
sentClose :: Tag
sentClose = S.TagClose "chunk"


-- | A sentence opening tag.
tokOpen :: Tag
tokOpen = S.TagOpen "tok" []


-- -- | A sentence opening tag.
-- tokClose :: Tag
-- tokClose = S.TagClose "tok"


-- | A sentence opening tag.
nsOpen :: Tag
nsOpen = S.TagOpen "ns" []


---------------------------------------------------------------------
-- XML chunking
---------------------------------------------------------------------


-- | Group tags corresponding to individual sentences as right elements.
chunk :: [Tag] -> [Either Tag [Tag]]
chunk (x:xs)
    | x ~== sentOpen =
        let (sent, rest) = takeSent xs
        in  Right (x:sent) : chunk rest
    | otherwise = Left x : chunk xs
chunk [] = []


-- | Take tags starting with a sentence.
takeSent :: [Tag] -> ([Tag], [Tag])
takeSent = go [] where
    go acc (x:xs)
        | x == sentClose    = (reverse $ x:acc, xs)
        | otherwise         = go (x:acc) xs
    go _ []                 = error "takeSent: expected sentence closing tag"


-- | Remove division into chunks.
unChunk :: [Either Tag [Tag]] -> [Tag]
unChunk = concatMap $ either (:[]) id


---------------------------------------------------------------------
-- XML sentence intermediate
---------------------------------------------------------------------


-- | Intermediate sentence representation.  The ending tag is not preserved
-- since it is always the same.  It should be remembered during the sentence
-- rendering process.
data SentI = SentI {
    -- | Beginning tag.
      sentBegI  :: Tag
    -- | Sentence contents.
    , sentConI  :: [(SegT, XmlTree)]
    } deriving (Show)


-- | Type of a sentence sub-element.
data SegT = TokT | NsT | OtherT deriving (Show)


-- | Identify type of a sub-tree.
idTreeT :: XmlTree -> SegT
idTreeT (Node x _)
    | x ~== tokOpen = TokT
    | x ~== nsOpen  = NsT
    | otherwise     = OtherT


-- | XML intermediate sentence parser.
sentIP :: Parser SentI
sentIP =
    begP >^> \x -> SentI x <$> many elemP
  where
    begP = tag "chunk" *> hasAttr "type" "s" *> getTag
    elemP = (\x -> (idTreeT x, x)) <$> xmlTreeP


---------------------------------------------------------------------
-- XML sentence
---------------------------------------------------------------------


-- | An XML sentence.  The ending tag is not preserved since it is always
-- the same.  It should be remembered during the sentence rendering process.
data Sent t = Sent {
    -- | Beginning tag of a sentence.
      sentBeg   :: Tag
    -- | Contents of a sentence.
    , sentCon   :: t Tok
    -- | Additional, non-token tags, placed after the last token
    , sentAdd   :: [XmlTree] }


-- | Translate sentence into its final representation.
joinSent :: SentI -> Sent []
joinSent SentI{..} =
    uncurry (Sent sentBegI) (go [] [] False sentConI)
  where
    -- TODO: could we represent this function as a fold?
    go acc res hasNs ((typ, tagTree) : xs) = case typ of
        TokT ->
            let tok = Tok
                    { orth   = tagsParseXml tokOrthP (enumTree tagTree)
                    , nps    = hasNs
                    , tagsIn = tagTree
                    , tagsBf = reverse acc }
            in  go [] (tok:res) False xs
        NsT    -> go (tagTree:acc) res True xs
        OtherT -> go (tagTree:acc) res hasNs xs
    go acc res _ [] = (reverse res, reverse acc)


-- | Parse a list of tags into a sentence.
parseSent :: [Tag] -> Sent []
parseSent = joinSent . tagsParseXml sentIP


---------------------------------------------------------------------
-- Annotated XML sentence
---------------------------------------------------------------------


-- | List of a elements annotated with NEs.
newtype Ann a = Ann { unAnn :: NeForest NE a }


-- | A sentence opening tag.
neOpen :: NE -> Tag
neOpen x = S.TagOpen "group" [("type", L.fromStrict x)]


-- | A sentence opening tag.
neClose :: Tag
neClose = S.TagClose "group"


-- | Render an annotated sentence.
renderAnnSent :: Sent Ann -> [Tag]
renderAnnSent Sent{..} = between
    [sentBeg, newline]
    [newline, sentClose]
        ( interMap renderNeTree (unAnn sentCon) )
        -- TODO: ponizej nie intersperse, trzeba dodac newline
        -- przed kazdym elementem.
        -- ++ intersperse newline (concatMap enumTree sentAdd) )


-- | Render an element of an annotated sentence.
renderNeTree :: NeTree NE Tok -> [Tag]
renderNeTree (Node (Left v) xs)
    = between
        [neOpen v, newline]
        [newline, neClose]
    $ interMap renderNeTree xs
renderNeTree (Node (Right t) _) = renderTok t


---------------------------------------------------------------------
-- XML Token
---------------------------------------------------------------------


-- | An XML token.
data Tok = Tok
    { orth      :: L.Text    -- ^ Orthographic form
    , nps       :: Bool      -- ^ No preceding space
    , tagsIn    :: XmlTree   -- ^ Token tags
    , tagsBf    :: [XmlTree] -- ^ Non-token tags before the token
    }


instance Tok.Word Tok where
    word = Tok.word . orth


tokOrthP :: Parser L.Text
tokOrthP = maybe "" id <$> (tag "tok" ^> findIgnore (tag "orth" ^> text))


-- | Render token.
renderTok :: Tok -> [Tag]
renderTok Tok{..} = case before of
    []  -> inside
    _   -> intercalate [newline] [before, inside]
  where
    before = interMap enumTree tagsBf
    inside =
        let Node v xs = tagsIn
        in  between [v, newline] [newline, endFrom v]
                (interMap enumTree xs)




---------------------------------------------------------------------
-- XML generic
---------------------------------------------------------------------


-- | A parsed XML tree.  In nodes the content/opening tags are preserved.
type XmlTree = Tree Tag


-- | Parse tags to an XML tree representation.
xmlTreeP :: Parser XmlTree
xmlTreeP =
    let commTag = satisfyPred ((,) <$> getTag <*> isTagComment)
        textTag = satisfyPred ((,) <$> getTag <*> isTagText)
        leafTag = fst <$> (textTag <|> commTag)
    in  trueXmlTreeP <|> (Node <$> leafTag <*> pure [])


trueXmlTreeP :: Parser XmlTree
trueXmlTreeP = do
    (beg, name) <- satisfyPred ((,) <$> getTag <*> tagOpenName)
    subForest <- beg `seq` name `seq` many xmlTreeP
    satisfyPred $ isTagCloseName name
    return $ Node beg subForest


-- | Enumerate tags present in the tree.
enumTree :: XmlTree -> [Tag]
enumTree (Node v xs) = if S.isTagOpen v
    then v : concatMap enumTree xs ++ [endFrom v]
    else [v]


---------------------------------------------------------------------
-- Misc
---------------------------------------------------------------------


-- | Put the list between the two elements.
between :: [a] -> [a] -> [a] -> [a]
between p q xs = p ++ xs ++ q


-- | A newline tag.
newline :: Tag
newline = S.TagText "\n"


-- | Make closing tag from the opening tag.
endFrom :: Tag -> Tag
endFrom (S.TagOpen x _) = S.TagClose x
endFrom _               = error "endFrom: not an opening tag"


-- | Map and intercalate with newlines.
interMap :: (a -> [Tag]) -> [a] -> [Tag]
interMap f = intercalate [newline] . map f


---------------------------------------------------------------------
-- NER
---------------------------------------------------------------------


-- | Annotate XCES (in a form of a tag list) with NEs with respect
-- to the given NER function.
-- nerXCES :: Nerf.Nerf -> L.Text -> L.Text
nerXCES :: (String -> NeForest NE Word) -> L.Text -> L.Text
nerXCES nerFun
    = S.renderTagsOptions opts
    . unChunk
    . intersperse (Left newline)
    . mapR
        ( renderAnnSent
        . nerSent nerFun
        . parseSent )
    . chunk
    . filter relevant
    . S.parseTags
  where
    opts = S.renderOptions {S.optMinimize = const True}
    mapR = map . fmap
    relevant (S.TagWarning _)       = False
    relevant (S.TagPosition _ _)    = False
    relevant (S.TagText x)          = not $ L.all isSpace x
    relevant _                      = True


-- | Annotate XCES sentence with NEs.
-- nerSent :: Nerf.Nerf -> Sent [] -> Sent Ann
nerSent :: (String -> NeForest NE Word) -> Sent [] -> Sent Ann
nerSent nerFun s@Sent{..} = s
    { sentCon = Ann $ Tok.sync
        (nerFun $ restoreOrigSent sentCon)
        sentCon }


-- | Restore original sentence.
restoreOrigSent :: [Tok] -> String
restoreOrigSent
    = dropWhile isSpace
    . concatMap tokStr
  where
    tokStr Tok{..} = (if nps then "" else " ") ++ (L.unpack orth)