{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE CPP #-}
module NLP.Concraft.Polish.Format.Plain
(
parsePlain
, parsePara
, parseSent
, ShowCfg (..)
, showPlain
, showPara
, showSent
) where
#if MIN_VERSION_base(4,11,0)
import Prelude hiding ((<>))
#endif
import Data.Monoid (Monoid, mappend, mconcat)
import Data.Maybe (catMaybes)
import Data.List (groupBy)
import Data.String (IsString)
import qualified Data.Char as C
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as L
import qualified Data.Text.Lazy.Read as R
import Text.Printf (printf)
import qualified NLP.Concraft.Morphosyntax as X
import NLP.Concraft.Polish.Morphosyntax
parsePlain :: L.Text -> [[Sent Tag]]
parsePlain =
map parsePara' . groupBy f . L.splitOn "\n\n"
where
f _ xs = case L.uncons xs of
Nothing -> False
Just (x, _) -> not (C.isSpace x)
parsePara :: L.Text -> [Sent Tag]
parsePara = parsePara' . L.splitOn "\n\n"
parsePara' :: [L.Text] -> [Sent Tag]
parsePara' = map (parseSent . L.strip) . filter (not.isEmpty)
isEmpty :: L.Text -> Bool
isEmpty = L.all C.isSpace
parseSent :: L.Text -> Sent Tag
parseSent
= map parseWord
. groupBy (\_ x -> cond x)
. L.lines
where
cond = ("\t" `L.isPrefixOf`)
parseWord :: [L.Text] -> Seg Tag
parseWord xs = Seg
(Word _orth _space _known)
_interps
where
(_orth, _space) = parseHeader (head xs)
ys = map parseInterp (tail xs)
_known = not (Nothing `elem` ys)
_interps = X.mkWMap $ catMaybes ys
parseInterp :: L.Text -> Maybe (Interp Tag, Double)
parseInterp =
doIt . tail . L.splitOn "\t"
where
doIt [form, tag]
| tag == ign = Nothing
| otherwise = Just $
(mkInterp form tag, 0)
doIt [form, tag, "disamb"] =
Just (mkInterp form tag, 1)
doIt [form, tag, weight] = case R.double weight of
Left er -> error $ "parseInterp (weight):" ++ show er
Right w -> Just (mkInterp form tag, fst w)
doIt xs = error $ "parseInterp: " ++ show xs
mkInterp form tag = Interp (L.toStrict form) (L.toStrict tag)
parseHeader :: L.Text -> (T.Text, Space)
parseHeader xs =
let [_orth, space] = L.splitOn "\t" xs
in (L.toStrict _orth, parseSpace space)
parseSpace :: L.Text -> Space
parseSpace "none" = None
parseSpace "space" = Space
parseSpace "spaces" = Space
parseSpace "newline" = NewLine
parseSpace "newlines" = NewLine
parseSpace xs = error ("parseSpace: " ++ L.unpack xs)
data ShowCfg = ShowCfg {
showWsCfg :: Bool }
showPlain :: ShowCfg ->[[Sent Tag]] -> L.Text
showPlain cfg =
L.intercalate "\n" . map (showPara cfg)
showPara :: ShowCfg -> [Sent Tag] -> L.Text
showPara cfg = L.toLazyText . mconcat . map (\xs -> buildSent cfg xs <> "\n")
showSent :: ShowCfg -> Sent Tag -> L.Text
showSent cfg xs = L.toLazyText $ buildSent cfg xs
buildSent :: ShowCfg -> Sent Tag -> L.Builder
buildSent cfg = mconcat . map (buildWord cfg)
buildWord :: ShowCfg -> Seg Tag -> L.Builder
buildWord cfg Seg{..}
= L.fromText orth <> "\t"
<> buildSpace space <> "\n"
<> buildKnown orth known
<> buildInterps cfg interps
where Word{..} = word
buildInterps :: ShowCfg -> X.WMap (Interp Tag) -> L.Builder
buildInterps ShowCfg{..} interps = mconcat
[ "\t" <> buildBase interp <>
"\t" <> buildTag interp <> buildDmb dmb
| (interp, dmb) <- M.toList (X.unWMap interps) ]
where
buildTag = L.fromText . tag
buildBase = L.fromText . base
buildDmb = case showWsCfg of
True -> \x -> between "\t" "\n"
$ L.fromString
$ printf "%.3f" x
False -> \x -> if x > 0
then "\tdisamb\n"
else "\n"
between x y z = x <> z <> y
buildSpace :: Space -> L.Builder
buildSpace None = "none"
buildSpace Space = "space"
buildSpace NewLine = "newline"
buildKnown :: T.Text -> Bool -> L.Builder
buildKnown _ True = ""
buildKnown lemma False
= "\t" <> L.fromText lemma
<> "\t" <> L.fromText ign
<> "\n"
(<>) :: Monoid m => m -> m -> m
(<>) = mappend
{-# INLINE (<>) #-}
ign :: IsString a => a
ign = "ign"
{-# INLINE ign #-}