module NLP.Concraft.Format.Plain
( plainFormat
) where
import Control.Arrow (first)
import Data.Monoid (Monoid, mappend, mconcat)
import Data.Maybe (catMaybes)
import Data.List (groupBy)
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as L
import qualified NLP.Concraft.Morphosyntax as Mx
import qualified NLP.Concraft.Format as F
data Space
= None
| Space
| NewLine
deriving (Show, Eq, Ord)
data Token = Token
{ orth :: T.Text
, space :: Space
, known :: Bool
, interps :: M.Map Interp Bool }
deriving (Show, Eq, Ord)
data Interp = Interp
{ base :: Maybe T.Text
, tag :: F.Tag }
deriving (Show, Eq, Ord)
noneBase :: T.Text
noneBase = "None"
plainFormat :: F.Tag -> F.Doc [] [Token] Token
plainFormat ign = F.Doc (parsePlain ign) (showPlain ign) sentHandler
sentHandler :: F.Sent [Token] Token
sentHandler = F.Sent id (\xs _ -> xs) wordHandler
wordHandler :: F.Word Token
wordHandler = F.Word extract select
extract :: Token -> Mx.Word F.Tag
extract tok = Mx.Word
{ Mx.orth = orth tok
, Mx.tagWMap = Mx.mkWMap
[ (tag x, if disamb then 1 else 0)
| (x, disamb) <- M.toList (interps tok) ]
, Mx.oov = not (known tok) }
select :: Mx.WMap F.Tag -> Token -> Token
select wMap tok =
tok { interps = newInterps }
where
wSet = M.fromList . map (first tag) . M.toList . interps
asDmb x = if x > 0
then True
else False
newInterps = M.fromList $
[ case M.lookup (tag interp) (Mx.unWMap wMap) of
Just x -> (interp, asDmb x)
Nothing -> (interp, False)
| interp <- M.keys (interps tok) ]
++ catMaybes
[ if tag `M.member` wSet tok
then Nothing
else Just (Interp Nothing tag, asDmb x)
| (tag, x) <- M.toList (Mx.unWMap wMap) ]
parsePlain :: F.Tag -> L.Text -> [[Token]]
parsePlain ign = map (parseSent ign) . init . L.splitOn "\n\n"
parseSent :: F.Tag -> L.Text -> [Token]
parseSent ign
= map (parseWord ignL)
. groupBy (\_ x -> cond x)
. L.lines
where
cond = ("\t" `L.isPrefixOf`)
ignL = L.fromStrict ign
parseWord :: L.Text -> [L.Text] -> Token
parseWord ign xs =
(Token _orth _space _known _interps)
where
(_orth, _space) = parseHeader (head xs)
ys = map (parseInterp ign) (tail xs)
_known = not (Nothing `elem` ys)
_interps = M.fromListWith max (catMaybes ys)
parseInterp :: L.Text -> L.Text -> Maybe (Interp, Bool)
parseInterp ign =
doIt . tail . L.splitOn "\t"
where
doIt [form, tag]
| tag == ign = Nothing
| otherwise = Just $
(mkInterp form tag, False)
doIt [form, tag, "disamb"] = Just $
(mkInterp form tag, True)
doIt xs = error $ "parseInterp: " ++ show xs
mkInterp form tag
| formS == noneBase = Interp Nothing tagS
| otherwise = Interp (Just formS) tagS
where
formS = L.toStrict form
tagS = L.toStrict tag
parseHeader :: L.Text -> (T.Text, Space)
parseHeader xs =
let [_orth, space] = L.splitOn "\t" xs
in (L.toStrict _orth, parseSpace space)
parseSpace :: L.Text -> Space
parseSpace "none" = None
parseSpace "space" = Space
parseSpace "spaces" = Space
parseSpace "newline" = NewLine
parseSpace "newlines" = NewLine
parseSpace xs = error ("parseSpace: " ++ L.unpack xs)
(<>) :: Monoid m => m -> m -> m
(<>) = mappend
showPlain :: F.Tag -> [[Token]] -> L.Text
showPlain ign =
L.toLazyText . mconcat . map (\xs -> buildSent ign xs <> "\n")
buildSent :: F.Tag -> [Token] -> L.Builder
buildSent ign = mconcat . map (buildWord ign)
buildWord :: F.Tag -> Token -> L.Builder
buildWord ign tok
= L.fromText (orth tok) <> "\t"
<> buildSpace (space tok) <> "\n"
<> buildKnown ign (known tok)
<> buildInterps (M.toList $ interps tok)
buildInterps :: [(Interp, Bool)] -> L.Builder
buildInterps interps = mconcat
[ "\t" <> buildBase interp <>
"\t" <> buildTag interp <>
if dmb
then "\tdisamb\n"
else "\n"
| (interp, dmb) <- interps ]
where
buildTag = L.fromText . tag
buildBase x = case base x of
Just b -> L.fromText b
Nothing -> L.fromText noneBase
buildSpace :: Space -> L.Builder
buildSpace None = "none"
buildSpace Space = "space"
buildSpace NewLine = "newline"
buildKnown :: F.Tag -> Bool -> L.Builder
buildKnown _ True = ""
buildKnown ign False = "\t" <> L.fromText noneBase
<> "\t" <> L.fromText ign <> "\n"