{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} -- | Simple format for morphosyntax representation which -- assumes that all tags have a textual representation -- with no spaces inside and that one of the tags indicates -- unknown words. module NLP.Concraft.Plain ( -- * Types Space (..) , Token (..) , Interp (..) -- * Interface , fromTok , choose , addInterps , addNones -- * Parsing , readPlain , parsePlain , parseSent -- * Showing , writePlain , showPlain , showSent , showWord ) where import Data.Monoid (Monoid, mappend, mconcat) import Data.Maybe (catMaybes) import Data.List (groupBy) import qualified Data.Set as S import qualified Data.Map as M import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.IO as L import qualified Data.Text.Lazy.Builder as L import qualified NLP.Concraft.Morphosyntax as Mx -- | No space, space or newline. data Space = None | Space | NewLine deriving (Show, Eq, Ord) -- | A token. data Token = Token { orth :: T.Text , space :: Space , known :: Bool -- | Interpretations with disambiguation info. , interps :: M.Map Interp Bool } deriving (Show, Eq, Ord) data Interp = Interp { base :: T.Text , tag :: T.Text } deriving (Show, Eq, Ord) -- | Extract information relevant for tagging. fromTok :: Token -> (Mx.Word T.Text, Mx.Choice T.Text) fromTok tok = (word, choice) where word = Mx.Word { Mx.orth = orth tok , Mx.tags = if known tok then S.fromList . map tag . M.keys $ interps tok else S.empty } choice = M.fromListWith (Mx.<+>) [ (tag x, Mx.mkPositive 1) | (x, True) <- M.toList (interps tok) ] -- | Mark all interpretations with tag component beeing a member of -- the given choice set with disamb annotations. choose :: Token -> S.Set T.Text -> Token choose tok choice = tok { interps = (M.fromList . map mark . M.keys) (interps tok) } where mark ip | tag ip `S.member` choice = (ip, True) | otherwise = (ip, False) -- | Add new interpretations with given disamb annotation. addInterps :: Bool -> Token -> [Interp] -> Token addInterps dmb tok xs = let newIps = M.fromList [(x, dmb) | x <- xs] in tok { interps = M.unionWith max newIps (interps tok) } -- | Add new interpretations with "None" base and given disamb annotation. addNones :: Bool -> Token -> [T.Text] -> Token addNones dmb tok = addInterps dmb tok . map (Interp "None") readPlain :: T.Text -> FilePath -> IO [[Token]] readPlain ign = fmap (parsePlain ign) . L.readFile parsePlain :: T.Text -> L.Text -> [[Token]] parsePlain ign = map (parseSent ign) . init . L.splitOn "\n\n" parseSent :: T.Text -> L.Text -> [Token] parseSent ign = map (parseWord ignL) . groupBy (\_ x -> cond x) . L.lines where cond = ("\t" `L.isPrefixOf`) ignL = L.fromStrict ign parseWord :: L.Text -> [L.Text] -> Token parseWord ign xs = (Token _orth _space _known _interps) where (_orth, _space) = parseHeader (head xs) ys = map (parseInterp ign) (tail xs) _known = not (Nothing `elem` ys) _interps = M.fromList (catMaybes ys) parseInterp :: L.Text -> L.Text -> Maybe (Interp, Bool) parseInterp ign = doIt . tail . L.splitOn "\t" where doIt [form, tag] | tag == ign = Nothing | otherwise = Just $ (mkInterp form tag, False) doIt [form, tag, "disamb"] = Just $ (mkInterp form tag, True) doIt xs = error $ "parseInterp: " ++ show xs mkInterp form tag = Interp (L.toStrict form) (L.toStrict tag) parseHeader :: L.Text -> (T.Text, Space) parseHeader xs = let [_orth, space] = L.splitOn "\t" xs in (L.toStrict _orth, parseSpace space) parseSpace :: L.Text -> Space parseSpace "none" = None parseSpace "space" = Space parseSpace "newline" = NewLine parseSpace "newlines" = NewLine -- TODO: Remove this temporary fix parseSpace xs = error ("parseSpace: " ++ L.unpack xs) -- | Printing. -- | An infix synonym for 'mappend'. (<>) :: Monoid m => m -> m -> m (<>) = mappend {-# INLINE (<>) #-} writePlain :: T.Text -> FilePath -> [[Token]] -> IO () writePlain ign path = L.writeFile path . showPlain ign showPlain :: T.Text -> [[Token]] -> L.Text showPlain ign = L.toLazyText . mconcat . map (\xs -> buildSent ign xs <> "\n") showSent :: T.Text -> [Token] -> L.Text showSent ign = L.toLazyText . buildSent ign showWord :: T.Text -> Token -> L.Text showWord ign = L.toLazyText . buildWord ign buildSent :: T.Text -> [Token] -> L.Builder buildSent ign = mconcat . map (buildWord ign) buildWord :: T.Text -> Token -> L.Builder buildWord ign tok = L.fromText (orth tok) <> "\t" <> buildSpace (space tok) <> "\n" <> buildKnown ign (known tok) <> buildInterps (M.toList $ interps tok) buildInterps :: [(Interp, Bool)] -> L.Builder buildInterps interps = mconcat [ "\t" <> L.fromText _base <> "\t" <> L.fromText _tag <> if dmb then "\tdisamb\n" else "\n" | (Interp _base _tag, dmb) <- interps ] buildSpace :: Space -> L.Builder buildSpace None = "none" buildSpace Space = "space" buildSpace NewLine = "newline" buildKnown :: T.Text -> Bool -> L.Builder buildKnown _ True = "" buildKnown ign False = "\tNone\t" <> L.fromText ign <> "\n"