{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} -- | Simple format for morphosyntax representation which -- assumes that all tags have a textual representation -- with no spaces within and that one of the tags indicates -- unknown words. module NLP.Concraft.Polish.Format.Plain ( -- * Parsing parsePlain , parsePara , parseSent -- * Printing , showPlain , showPara , showSent ) where import Data.Monoid (Monoid, mappend, mconcat) import Data.Maybe (catMaybes) import Data.List (groupBy) import Data.String (IsString) import qualified Data.Char as C import qualified Data.Map as M import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Builder as L import NLP.Concraft.Polish.Morphosyntax noneBase :: T.Text noneBase = "None" -- | Parse the text in the plain format. parsePlain :: L.Text -> [[Sent Tag]] parsePlain = map parsePara' . groupBy f . L.splitOn "\n\n" where f _ xs = case L.uncons xs of Nothing -> False Just (x, _) -> not (C.isSpace x) -- | Parse the paragraph in the plain format. parsePara :: L.Text -> [Sent Tag] parsePara = parsePara' . L.splitOn "\n\n" -- | Parse paragraph already divided into sentence chunks. parsePara' :: [L.Text] -> [Sent Tag] parsePara' = map (parseSent . L.strip) . filter (not.isEmpty) -- | Identify empty chunks of text. isEmpty :: L.Text -> Bool isEmpty = L.all C.isSpace -- | Parse the sentence in the plain format. parseSent :: L.Text -> Sent Tag parseSent = map parseWord . groupBy (\_ x -> cond x) . L.lines where cond = ("\t" `L.isPrefixOf`) parseWord :: [L.Text] -> Seg Tag parseWord xs = Seg (Word _orth _space _known) _interps where (_orth, _space) = parseHeader (head xs) ys = map parseInterp (tail xs) _known = not (Nothing `elem` ys) _interps = M.fromListWith max (catMaybes ys) parseInterp :: L.Text -> Maybe (Interp Tag, Bool) parseInterp = doIt . tail . L.splitOn "\t" where doIt [form, tag] | tag == ign = Nothing | otherwise = Just $ (mkInterp form tag, False) doIt [form, tag, "disamb"] = Just $ (mkInterp form tag, True) doIt xs = error $ "parseInterp: " ++ show xs mkInterp form tag | formS == noneBase = Interp Nothing tagS | otherwise = Interp (Just formS) tagS where formS = L.toStrict form tagS = L.toStrict tag parseHeader :: L.Text -> (T.Text, Space) parseHeader xs = let [_orth, space] = L.splitOn "\t" xs in (L.toStrict _orth, parseSpace space) parseSpace :: L.Text -> Space parseSpace "none" = None parseSpace "space" = Space parseSpace "spaces" = Space -- Is it not a Maca bug? parseSpace "newline" = NewLine parseSpace "newlines" = NewLine -- TODO: Remove this temporary fix parseSpace xs = error ("parseSpace: " ++ L.unpack xs) ----------- -- Printing ----------- -- | Show the plain data. showPlain :: [[Sent Tag]] -> L.Text showPlain = L.intercalate "\n" . map showPara -- | Show the paragraph. showPara :: [Sent Tag] -> L.Text showPara = L.toLazyText . mconcat . map (\xs -> buildSent xs <> "\n") -- | Show the sentence. showSent :: Sent Tag -> L.Text showSent xs = L.toLazyText $ buildSent xs buildSent :: Sent Tag -> L.Builder buildSent = mconcat . map buildWord buildWord :: Seg Tag -> L.Builder buildWord Seg{..} = L.fromText orth <> "\t" <> buildSpace space <> "\n" <> buildKnown known <> buildInterps (M.toList interps) where Word{..} = word buildInterps :: [(Interp Tag, Bool)] -> L.Builder buildInterps interps = mconcat [ "\t" <> buildBase interp <> "\t" <> buildTag interp <> if dmb then "\tdisamb\n" else "\n" | (interp, dmb) <- interps ] where buildTag = L.fromText . tag buildBase x = case base x of Just b -> L.fromText b Nothing -> L.fromText noneBase buildSpace :: Space -> L.Builder buildSpace None = "none" buildSpace Space = "space" buildSpace NewLine = "newline" buildKnown :: Bool -> L.Builder buildKnown True = "" buildKnown False = "\t" <> L.fromText noneBase <> "\t" <> L.fromText ign <> "\n" ----------- -- Utils ----------- -- | An infix synonym for 'mappend'. (<>) :: Monoid m => m -> m -> m (<>) = mappend {-# INLINE (<>) #-} -- | Tag which indicates unknown words. ign :: IsString a => a ign = "ign" {-# INLINE ign #-}