module NLP.HistPL.Analyse
( Token (..)
, Other (..)
, tokenize
, anaText
, anaWord
, mapL
, showAna
) where
import Control.Applicative ((<$>), (<*>), pure)
import Data.Maybe (fromJust)
import Data.Monoid (Monoid, mappend, mconcat)
import Data.Ord (comparing)
import Data.List (sortBy, intersperse)
import qualified Data.Map as M
import qualified Data.Char as C
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as L
import qualified NLP.Morfeusz as R
import qualified NLP.HistPL.Lexicon as H
data Token = Token {
orth :: T.Text
, hist :: [(H.LexEntry, H.Code)]
, cont :: [[R.Interp]] }
deriving (Show)
data Other
= Pun T.Text
| Space T.Text
deriving (Show)
tokenize :: T.Text -> [Either T.Text Other]
tokenize =
map mkElem . T.groupBy cmp
where
cmp x y
| C.isPunctuation x = False
| C.isPunctuation y = False
| otherwise = C.isSpace x == C.isSpace y
mkElem x
| T.any C.isSpace x = Right (Space x)
| T.any C.isPunctuation x = Right (Pun x)
| otherwise = Left x
anaText :: H.HistPL -> T.Text -> IO [Either Token Other]
anaText hpl = mapL (anaWord hpl) . tokenize
mapL :: (Functor m, Monad m) => (a -> m a') -> [Either a b] -> m [Either a' b]
mapL f =
let g (Left x) = Left <$> f x
g (Right y) = return (Right y)
in mapM g
anaWord :: H.HistPL -> T.Text -> IO Token
anaWord hpl x = do
_hist <- H.lookupMany hpl [x, T.toLower x]
_cont <- return (anaCont x)
return $ Token x _hist _cont
anaCont :: T.Text -> [[R.Interp]]
anaCont = map R.interps . head . R.paths . R.analyse False
showAna :: [Either Token Other] -> L.Text
showAna = L.toLazyText . mconcat . newlineSep . buildAna
buildAna :: [Either Token Other] -> [L.Builder]
buildAna xs = "sent:" :
map indent (concatMap (either buildTok buildOther) xs)
buildTok :: Token -> [L.Builder]
buildTok tok
= buildHead tok
: map (indent . buildHist) histInterps
where
histInterps = sortBy (comparing snd) (hist tok)
buildHead :: Token -> L.Builder
buildHead tok = "word: " <> L.fromText (orth tok)
buildHist :: (H.LexEntry, H.Code) -> L.Builder
buildHist (entry, code)
= "hist: " <> buildID (H.lexId entry, code)
<> " " <> buildPos
<> ": " <> commaRepr (H.lemma entry)
where
buildID (id', cd') = "[" <> L.fromText id' <> ", " <> buildCode cd' <> "]"
buildPos = case H.pos entry of
[] -> "-"
xs -> mconcat . commaSep . map L.fromText $ xs
buildCode code' = case code' of
H.Orig -> "orig"
H.Both -> "both"
H.Copy -> "copy"
buildOther :: Other -> [L.Builder]
buildOther (Space _) = ["<space>"]
buildOther (Pun t) = ["pun: " <> L.fromText t]
commaRepr :: H.HasRepr t => t -> L.Builder
commaRepr = mconcat . commaSep . map L.fromText . H.text
(<>) :: Monoid m => m -> m -> m
(<>) = mappend
indent :: L.Builder -> L.Builder
indent = (" " <>)
commaSep :: [L.Builder] -> [L.Builder]
commaSep = intersperse ", "
newlineSep :: [L.Builder] -> [L.Builder]
newlineSep = intersperse "\n"