{-# LANGUAGE OverloadedStrings #-} {- | Parsing text in the Enamex data format. Each node is enclosed between opening and closing tags with tag name representing the label and contents representing children of the node. Both leaf and label values should be escaped by prepending the \\ character before special >, <, \\ and space characters. Example: >>> :m Text.Named.Enamex Data.Named.Tree Data.Text.Lazy >>> let drawIt = putStr . drawForest . mapForest show . parseForest >>> drawIt $ pack "w1.1\\ w1.2 w2 w3" Left "x" | `- Right "w1.1 w1.2" , Left "y" | +- Left "z" | | | `- Right "w2" | `- Right "w3" -} module Text.Named.Enamex ( -- * Parsing parseForest , parseEnamex -- * Printing , showForest , showEnamex ) where import Control.Applicative import Control.Monad ((<=<), when) import Data.Monoid import Data.Attoparsec.Text.Lazy import Data.List (intersperse) import Data.Function (on) import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Builder as L import qualified Data.Named.Tree as Tr pForest :: Parser (Tr.NeForest T.Text T.Text) pForest = pTree `sepBy` (space *> skipSpace) pTree :: Parser (Tr.NeTree T.Text T.Text) pTree = pNode <|> pLeaf pLeaf :: Parser (Tr.NeTree T.Text T.Text) pLeaf = Tr.Node <$> (Right <$> pWord) <*> pure [] pNode :: Parser (Tr.NeTree T.Text T.Text) pNode = do x <- pOpenTag kids <- pForest x' <- pCloseTag when (x /= x') (fail "Tag start/end mismatch") return $ Tr.Node (Left x) kids pOpenTag :: Parser T.Text pOpenTag = "<" .*> pWord <*. ">" pCloseTag :: Parser T.Text pCloseTag = " pWord <*. ">" pWord :: Parser T.Text pWord = unEscape <$> scan False special where special False c = case c == ' ' || c == '<' || c == '>' of True -> Nothing False -> if c == '\\' then Just True else Just False special True _ = Just False -- | TODO: Use lazy text builder to avoid slowness in the pessimistic case. unEscape :: T.Text -> T.Text unEscape xs = x `T.append` case drop1 rest of Just (y, ys) -> y `T.cons` unEscape ys Nothing -> "" where drop1 = T.uncons <=< return . snd <=< T.uncons (x, rest) = T.breakOn "\\" xs -- | TODO: Use lazy text builder to avoid slowness in the pessimistic case. escape :: T.Text -> T.Text escape x = case T.uncons z of Nothing -> y Just (c, q) -> y `T.append` ('\\' `T.cons` (c `T.cons` escape q)) where (y, z) = T.break special x special c = c == ' ' || c == '<' || c == '>' || c == '\\' -- | Parse the enamex forest. parseForest :: L.Text -> Tr.NeForest T.Text T.Text parseForest = either error id . eitherResult . parse (pForest <* endOfInput) -- | Parse the enamex file. parseEnamex :: L.Text -> [Tr.NeForest T.Text T.Text] parseEnamex = map parseForest . L.lines data Tag = Open | Close | Body -- | Function which determines between which two tag elements a space -- character should be inserted. noSpace :: Tag -> Tag -> Bool noSpace Open _ = True noSpace Body Close = True noSpace Close Close = True noSpace _ _ = False -- | We define our own groupBy because the standard version from Data.List -- assumes that the predicate is transitive. groupBy :: (a -> a -> Bool) -> [a] -> [[a]] groupBy p (x : y : xs) | p x y = join x $ groupBy p (y : xs) | otherwise = [x] : groupBy p (y : xs) where join z (zs : zss) = (z : zs) : zss join z [] = [[z]] groupBy _ [x] = [[x]] groupBy _ [] = [] buildForest :: Tr.NeForest t t -> [(t, Tag)] buildForest = concat . map buildTree buildTree :: Tr.NeTree t t -> [(t, Tag)] buildTree (Tr.Node (Left x) ts) = (x, Open) : buildForest ts ++ [(x, Close)] buildTree (Tr.Node (Right x) _) = [(x, Body)] buildStream :: [(T.Text, Tag)] -> L.Builder buildStream = mconcat . intersperse " " . map (mconcat . map buildTag) . groupBy (noSpace `on` snd) buildTag :: (T.Text, Tag) -> L.Builder buildTag (x, tag) = case tag of Open -> "<" `mappend` y `mappend` ">" Close -> "" _ -> y where y = L.fromText (escape x) -- | Show the forest. showForest :: Tr.NeForest T.Text T.Text -> L.Text showForest = L.toLazyText . buildStream . buildForest -- | Show the enamex file. showEnamex :: [Tr.NeForest T.Text T.Text] -> L.Text showEnamex = L.toLazyText . mconcat . map (L.fromLazyText . showForest)