{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}
module NLP.Concraft.Polish.DAG.Format.Base
(
ShowCfg (..)
, ProbType (..)
, showSent
, showData
, parseData
, parseSent
) where
import Prelude hiding (Word)
import Data.Monoid (mconcat, mappend)
import qualified Data.Map as M
import Data.List (intersperse, groupBy)
import Data.String (IsString)
import Data.Data (Data)
import Data.Typeable (Typeable)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as L
import Text.Printf (printf)
import Text.Read (readMaybe)
import qualified Data.DAG as DAG
import qualified NLP.Concraft.DAG.Morphosyntax as X
import NLP.Concraft.Polish.DAGSeg (AnnoSent(..))
import qualified NLP.Concraft.Polish.Morphosyntax as I
import NLP.Concraft.Polish.DAG.Morphosyntax hiding (tag, Tag)
import qualified NLP.Concraft.Polish.DAG.Morphosyntax as PolX
type Tag = PolX.Interp PolX.Tag
data ShowCfg = ShowCfg
{ probType :: ProbType
, numericDisamb :: Bool
}
data ProbType
= Marginals
| MaxProbs
| GuessedMarginals
deriving (Show, Eq, Ord, Enum, Typeable, Data)
showData :: ShowCfg -> [[AnnoSent]] -> L.Text
showData cfg
= flip L.append "\n"
. L.toLazyText
. mconcat
. intersperse "\n"
. map (buildSents cfg)
showSent :: ShowCfg -> [AnnoSent] -> L.Text
showSent cfg = L.toLazyText . buildSents cfg
buildSents :: ShowCfg -> [AnnoSent] -> L.Builder
buildSents cfg =
finalize . map (buildSent cfg)
where
finalize = mconcat
buildSent :: ShowCfg -> AnnoSent -> L.Builder
buildSent showCfg AnnoSent{..} = finalize $ do
let dag = guessSent
edgeID <- DAG.dagEdges dag
let tailNode = DAG.begsWith edgeID dag
headNode = DAG.endsWith edgeID dag
X.Seg{..} = DAG.edgeLabel edgeID dag
interpWeight <- map Just (M.toList (X.unWMap tags)) ++
if known word then [] else [Nothing]
return $ case interpWeight of
Just (interp@Interp{..}, weight) -> buildInterp
showCfg tailNode headNode word interp
(case probType showCfg of
Marginals ->
tagWeightIn edgeID interp marginals
MaxProbs ->
tagWeightIn edgeID interp maxProbs
GuessedMarginals ->
weight)
(tagLabelIn False edgeID interp disambs)
Nothing ->
let interp = Interp
{ base = "none"
, tag = ign
, commonness = Nothing
, qualifier = Nothing
, metaInfo = Nothing
, eos = False }
in buildInterp showCfg tailNode headNode word interp 0 False
where
finalize = (`mappend` "\n") . mconcat . intersperse "\n"
tagWeightIn = tagLabelIn 0
tagLabelIn def i x anno
= maybe def (tagLabel def x) (DAG.maybeEdgeLabel i anno)
tagLabel def x = maybe def id . M.lookup x
buildInterp
:: ShowCfg
-> DAG.NodeID
-> DAG.NodeID
-> Word
-> Interp PolX.Tag
-> Double
-> Bool
-> L.Builder
buildInterp ShowCfg{..} tailNode headNode word Interp{..} weight disamb =
mconcat $ intersperse "\t" $
[ buildNode tailNode
, buildNode headNode
, L.fromText $ orth word
, L.fromText $ if known word then base else orth word
, L.fromText tag
, buildMayText commonness
, buildMayText qualifier
, if numericDisamb
then buildDisamb disamb
else buildWeight weight
, buildMayText metaInfo
, if eos then "eos" else ""
] ++
if numericDisamb then [] else [buildDisamb disamb]
where
buildNode (DAG.NodeID i) = L.fromString (show i)
buildWeight = L.fromString . printf "%.4f"
buildDisamb True = if numericDisamb then "1.0000" else "disamb"
buildDisamb False = if numericDisamb then "0.0000" else ""
buildMayText Nothing = ""
buildMayText (Just x) = L.fromText x
parseData :: L.Text -> [Sent Tag]
parseData =
map parseSent . filter realSent . L.splitOn "\n\n"
where
realSent = not . L.null
parseSent :: L.Text -> Sent Tag
parseSent = fromRows . parseRows
data Row = Row
{ tailNode :: Int
, headNode :: Int
, orthForm :: T.Text
, baseForm :: T.Text
, theTag :: PolX.Tag
, commonness :: Maybe T.Text
, qualifier :: Maybe T.Text
, tagProb :: Double
, metaInfo :: Maybe T.Text
, eos :: Bool
}
fromRows :: [Row] -> Sent Tag
fromRows =
DAG.mapN (const I.None) . DAG.fromEdgesUnsafe . getEdges
where
getEdges = map mkEdge . groupBy theSameEdge
theSameEdge r1 r2
= tailNode r1 == tailNode r2
&& headNode r1 == headNode r2
mkEdge [] = error "Format.Base.fromRows: empty list"
mkEdge rows@(row0:_) = DAG.Edge
{ DAG.tailNode = DAG.NodeID $ tailNode row0
, DAG.headNode = DAG.NodeID $ headNode row0
, DAG.edLabel = edge }
where
edge = X.Seg
{ word = newWord
, tags = newTags }
newWord = Word
{ orth = orthForm row0
, known = not $ ign `elem` map theTag rows }
newTags = X.mkWMap
[ (interp, tagProb)
| Row{..} <- rows
, not $ theTag == ign
, let interp = Interp
{ base = baseForm
, tag = theTag
, commonness = commonness
, qualifier = qualifier
, metaInfo = metaInfo
, eos = eos }
]
parseRows :: L.Text -> [Row]
parseRows = map parseRow . L.splitOn "\n"
parseRow :: L.Text -> Row
parseRow =
doit . L.splitOn "\t"
where
doit (tlNode : hdNode : otForm : bsForm : tag :
comm : qual : prob : meta : eos : _) = Row
{ tailNode = readTyp "tail node" $ L.unpack tlNode
, headNode = readTyp "head node" $ L.unpack hdNode
, orthForm = L.toStrict otForm
, baseForm = L.toStrict bsForm
, theTag = L.toStrict tag
, commonness = nullIfEmpty comm
, qualifier = nullIfEmpty qual
, tagProb = readTyp "probability value" $ L.unpack prob
, metaInfo = nullIfEmpty meta
, eos = case eos of
"eos" -> True
_ -> False
}
doit _ = error "parseRow: unexpected number of row cells"
nullIfEmpty x = case x of
"" -> Nothing
_ -> Just (L.toStrict x)
readTyp :: (Read a) => String -> String -> a
readTyp typ x =
case readMaybe x of
Just y -> y
Nothing -> error $
"unable to parse \"" ++ x ++ "\" to a " ++ typ
ign :: IsString a => a
ign = "ign"
{-# INLINE ign #-}