src/NLP/HistPL/Analyse.hs

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}


-- | The module provides functions for dictionary-driven analysis
-- of the input text.


module NLP.HistPL.Analyse
(
-- * Tokenization
  Token (..)
, Other (..)
, tokenize
, rmHyphen
-- * Analysis
, anaWord
, mapL
-- * JSON
, JConf (..)
, ShowCont (..)
, defaultJConf
, jsonAna
) where


import           Control.Applicative ((<$>), (<*>), pure)
import           Data.Maybe (fromJust)
import           Data.Monoid (Monoid, mappend, mconcat)
import           Data.Ord (comparing)
import           Data.List (sortBy, intersperse)
import qualified Data.Map as M
import qualified Data.Char as C
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as L
import           Data.Aeson

import qualified NLP.Morfeusz as R
import qualified NLP.HistPL.Lexicon as H


-----------------------------------------------------
-- Basic types
-----------------------------------------------------


-- | A token is an element of the analysis result.
data Token = Token {
    -- | Orthographic form.
      orth  :: T.Text
    -- | Historical interpretations.
    , hist  :: [(H.LexEntry, H.Code)]
    -- | Contemporary interpretations.
    , cont  :: [[R.Interp]] }
    deriving (Show)


-- | A punctuation or a space.
data Other
    -- | Punctuation
    = Pun T.Text
    -- | Space
    | Space T.Text
    deriving (Show)


-----------------------------------------------------
-- Tokenization
-----------------------------------------------------


-- | Remove all instances of the \"-\\n\" string.
rmHyphen :: L.Text -> L.Text
rmHyphen = L.concat . L.splitOn "-\n"


-- | Perform simple tokenization -- spaces and punctuation
-- characters are treated as token ending markers.
tokenize :: T.Text -> [Either T.Text Other]
tokenize =
    map mkElem . T.groupBy cmp
  where
    cmp x y
        | C.isPunctuation x = False
        | C.isPunctuation y = False
        | otherwise         = C.isSpace x == C.isSpace y
    mkElem x
        | T.any C.isSpace x         = Right (Space x)
        | T.any C.isPunctuation x   = Right (Pun x)
        | otherwise                 = Left x


-----------------------------------------------------
-- Analysis
-----------------------------------------------------
 
 
-- | Map the monadic function over the left elements of the input list.
mapL :: (Functor m, Monad m) => (a -> m a') -> [Either a b] -> m [Either a' b]
mapL f =
    let g (Left x)  = Left <$> f x
        g (Right y) = return (Right y)
    in  mapM g


-- | Analyse the word.
anaWord :: H.HistPL -> T.Text -> IO Token
anaWord hpl x = do
    _hist <- H.lookupMany hpl [x, T.toLower x]
    _cont <- return (anaCont x)
    return $ Token x _hist _cont


-- | Analyse word using the Morfeusz analyser for contemporary Polish.
anaCont :: T.Text -> [[R.Interp]]
anaCont = map R.interps . head . R.paths . R.analyse False


-----------------------------------------------------
-- JSON
-----------------------------------------------------


-- | When contemporary interpretations should be shown.
data ShowCont
    = NoShowCont
    | ShowCont
    | ForceShowCont
    deriving (Show, Eq, Ord)


-- | JSON serialization configuration.  Depending on the configuration,
-- different parts of the result will be converted to a JSON format.
data JConf = JConf
    { showCont  :: ShowCont
    -- ^ When to show cont. interpretations.
    , showDefs  :: Bool
    -- ^ Show definitions?
    } deriving (Show, Eq, Ord)


-- | Default JSON serialization configuration.
defaultJConf :: JConf
defaultJConf = JConf
    { showCont  = ShowCont
    , showDefs  = False }


-- | Build JSON value from a list of analysed sentences.
jsonAna :: JConf -> [Either Token Other] -> Value
jsonAna jc = toJSON . map (jsonSent  jc)


-- | JSON representation of a sentence.
jsonSent :: JConf -> Either Token Other -> Value
jsonSent jc = either (jsonTok jc) (jsonOther jc)


-- | JSON represesentation of other segment.
jsonOther :: JConf -> Other -> Value
jsonOther _ (Space _) = toJSON ("space" :: T.Text)
jsonOther _ (Pun t)   = toJSON $ "pun: " `T.append` t


-- | JSON represesentation of a token.
jsonTok :: JConf -> Token -> Value
jsonTok jc tok = object $
    [ "orth" .= orth tok
    , "hist" .= map (jsonHist jc) (hist tok) ] ++
    maybeCont
  where
    maybeCont
        | null (hist tok) && showCont jc /= NoShowCont  = [contElem]
        | showCont jc == ForceShowCont                  = [contElem]
        | otherwise                                     = []
    contElem = "cont" .= jsonCont jc (cont tok)
    

-- | JSON represesentation of a historical interpretation.
jsonHist :: JConf -> (H.LexEntry, H.Code) -> Value
jsonHist jc (entry, code) = object $
    [ "id"   .= jsonID (H.lexId entry, code)
    , "pos"  .= H.pos entry
    , "base" .= H.text (H.lemma entry) ]
    ++ if showDefs jc then [defsElem] else []
  where
    jsonID (id', cd') = toJSON (toJSON id', jsonCode cd')
    jsonCode code' = toJSON $ case code' of
        H.Orig -> "orig" :: T.Text
        H.Both -> "both"
        H.Copy -> "copy"
    defsElem = "defs" .= concatMap H.text (getDefs entry)
    getDefs  = concatMap H.defs . H.senses


-- | JSON represesentation of a contemporary interpretation.
jsonCont :: JConf -> [[R.Interp]] -> Value
jsonCont _ = toJSON . map (map jsonI) where
    jsonI x = object
        [ "base" .= R.base x
        , "msd"  .= R.msd x ]


-----------------------------------------------------
-- Utils
-----------------------------------------------------


(<>) :: Monoid m => m -> m -> m
(<>) = mappend