{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

-- | Simple format for morphosyntax representation which
-- assumes that all tags have a textual representation
-- with no spaces inside and that one of the tags indicates
-- unknown words.

module NLP.Concraft.Format.Plain
( plainFormat
) where

import Control.Arrow (first)
import Data.Monoid (Monoid, mappend, mconcat)
import Data.Maybe (catMaybes)
import Data.List (groupBy)
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as L

import qualified NLP.Concraft.Morphosyntax as Mx
import qualified NLP.Concraft.Format as F

-- | No space, space or newline.
data Space
    = None
    | Space
    | NewLine
    deriving (Show, Eq, Ord)

-- | A token.
data Token = Token
    { orth      :: T.Text
    , space     :: Space
    , known     :: Bool
    -- | Interpretations of the token, each interpretation annotated
    -- with a /disamb/ Boolean value (if 'True', the interpretation
    -- is correct within the context).
    , interps   :: M.Map Interp Bool }
    deriving (Show, Eq, Ord)
    
data Interp = Interp
    { base  :: Maybe T.Text
    , tag   :: F.Tag }
    deriving (Show, Eq, Ord)

noneBase :: T.Text
noneBase = "None"

-- | Create document handler given value of the /ignore/ tag.
plainFormat :: F.Tag -> F.Doc [] [Token] Token
plainFormat ign = F.Doc (parsePlain ign) (showPlain ign) sentHandler

-- | Sentence handler.
sentHandler :: F.Sent [Token] Token
sentHandler = F.Sent id (\xs _ -> xs) wordHandler

-- | Word handler.
wordHandler :: F.Word Token
wordHandler = F.Word extract select

-- | Extract information relevant for tagging.
extract :: Token -> Mx.Word F.Tag
extract tok = Mx.Word
    { Mx.orth       = orth tok
    , Mx.tagWMap    = Mx.mkWMap
        [ (tag x, if disamb then 1 else 0)
        | (x, disamb) <- M.toList (interps tok) ]
    , Mx.oov        = not (known tok) }

-- | Select interpretations.
select :: Mx.WMap F.Tag -> Token -> Token
select wMap tok =
    tok { interps = newInterps }
  where
    wSet = M.fromList . map (first tag) . M.toList . interps
    asDmb x = if x > 0
        then True
        else False
    newInterps = M.fromList $
        [ case M.lookup (tag interp) (Mx.unWMap wMap) of
            Just x  -> (interp, asDmb x)
            Nothing -> (interp, False)
        | interp <- M.keys (interps tok) ]
            ++ catMaybes
        [ if tag `M.member` wSet tok
            then Nothing
            else Just (Interp Nothing tag, asDmb x)
        | (tag, x) <- M.toList (Mx.unWMap wMap) ]

parsePlain :: F.Tag -> L.Text -> [[Token]]
parsePlain ign = map (parseSent ign) . init . L.splitOn "\n\n"

parseSent :: F.Tag -> L.Text -> [Token]
parseSent ign
    = map (parseWord ignL)
    . groupBy (\_ x -> cond x)
    . L.lines
  where
    cond = ("\t" `L.isPrefixOf`)
    ignL = L.fromStrict ign

parseWord :: L.Text -> [L.Text] -> Token
parseWord ign xs =
    (Token _orth _space _known _interps)
  where
    (_orth, _space) = parseHeader (head xs)
    ys          = map (parseInterp ign) (tail xs)
    _known      = not (Nothing `elem` ys)
    _interps    = M.fromListWith max (catMaybes ys)

parseInterp :: L.Text -> L.Text -> Maybe (Interp, Bool)
parseInterp ign =
    doIt . tail . L.splitOn "\t"
  where
    doIt [form, tag]
        | tag == ign    = Nothing
        | otherwise     = Just $
            (mkInterp form tag, False)
    doIt [form, tag, "disamb"] = Just $
        (mkInterp form tag, True)
    doIt xs = error $ "parseInterp: " ++ show xs
    mkInterp form tag
        | formS == noneBase = Interp Nothing tagS
        | otherwise         = Interp (Just formS) tagS
      where
        formS   = L.toStrict form
        tagS    = L.toStrict tag

parseHeader :: L.Text -> (T.Text, Space)
parseHeader xs =
    let [_orth, space] = L.splitOn "\t" xs
    in  (L.toStrict _orth, parseSpace space)

parseSpace :: L.Text -> Space
parseSpace "none"    = None
parseSpace "space"   = Space
parseSpace "spaces"  = Space	-- Is it not a Maca bug?
parseSpace "newline" = NewLine
parseSpace "newlines" = NewLine -- TODO: Remove this temporary fix
parseSpace xs        = error ("parseSpace: " ++ L.unpack xs)

-- | Printing.

-- | An infix synonym for 'mappend'.
(<>) :: Monoid m => m -> m -> m
(<>) = mappend
{-# INLINE (<>) #-}

showPlain :: F.Tag -> [[Token]] -> L.Text
showPlain ign =
    L.toLazyText . mconcat  . map (\xs -> buildSent ign xs <> "\n")

buildSent :: F.Tag -> [Token] -> L.Builder
buildSent ign = mconcat . map (buildWord ign)

buildWord :: F.Tag -> Token -> L.Builder
buildWord ign tok
    =  L.fromText (orth tok) <> "\t"
    <> buildSpace (space tok) <> "\n"
    <> buildKnown ign (known tok)
    <> buildInterps (M.toList $ interps tok)

buildInterps :: [(Interp, Bool)] -> L.Builder
buildInterps interps = mconcat
    [ "\t" <> buildBase interp <>
      "\t" <> buildTag  interp <>
      if dmb
        then "\tdisamb\n"
        else "\n"
    | (interp, dmb) <- interps ]
  where
    buildTag    = L.fromText . tag
    buildBase x = case base x of
        Just b  -> L.fromText b
        Nothing -> L.fromText noneBase

buildSpace :: Space -> L.Builder
buildSpace None     = "none"
buildSpace Space    = "space"
buildSpace NewLine  = "newline"

buildKnown :: F.Tag -> Bool -> L.Builder
buildKnown _   True     = ""
buildKnown ign False    =  "\t" <> L.fromText noneBase
                        <> "\t" <> L.fromText ign <> "\n"