{-# LANGUAGE OverloadedStrings #-} {- | Parsing text in the Enamex data format. Each node is enclosed between opening and closing tags with tag name representing the label and contents representing children of the node. Both leaf and label values should be escaped by prepending the \\ character before special >, <, \\ and space characters. Example: >>> :m Data.Tree Data.Text Text.Named.Enamex >>> let drawIt = putStr . drawForest . fmap (fmap unpack) . parseForest >>> drawIt $ pack "w1.1\\ w1.2 w2 w3" x | `- w1.1 w1.2 , y | +- z | | | `- w2 | `- w3 -} module Text.Named.Enamex ( parseForest , parseEnamex , mapTwo ) where import Control.Applicative import Control.Monad import Data.Attoparsec.Text import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Tree as Tree type Tree = Tree.Tree T.Text type Forest = Tree.Forest T.Text -- | Map the first function over internal nodes -- and the second one over leaves. mapTwo :: (a -> b) -> (a -> c) -> Tree.Tree a -> Tree.Tree (Either b c) mapTwo _ g (Tree.Node x []) = Tree.Node (Right $ g x) [] mapTwo f g (Tree.Node x kids) = Tree.Node (Left $ f x) (map (mapTwo f g) kids) pForest :: Parser Forest pForest = pTree `sepBy` (space *> skipSpace) pTree :: Parser Tree pTree = pNode <|> pLeaf pLeaf :: Parser Tree pLeaf = Tree.Node <$> pWord <*> pure [] pNode :: Parser Tree pNode = do x <- pOpenTag kids <- pForest x' <- pCloseTag when (x /= x') (fail "Tag start/end mismatch") return $ Tree.Node x kids pOpenTag :: Parser T.Text pOpenTag = "<" .*> pWord <*. ">" pCloseTag :: Parser T.Text pCloseTag = " pWord <*. ">" pWord :: Parser T.Text pWord = unEscape <$> scan False special where special False c = case c == ' ' || c == '<' || c == '>' of True -> Nothing False -> if c == '\\' then Just True else Just False special True _ = Just False -- | TODO: Use lazy text builder to avoid slowness in the pessimistic case. unEscape :: T.Text -> T.Text unEscape xs = x `T.append` case drop1 rest of Just (y, ys) -> y `T.cons` unEscape ys Nothing -> "" where drop1 = T.uncons <=< return . snd <=< T.uncons (x, rest) = T.breakOn "\\" xs -- | Parse the enamex forest. parseForest :: T.Text -> Forest parseForest = either error id . parseOnly (pForest <* endOfInput) -- | Parse the enamex file. parseEnamex :: L.Text -> [Forest] parseEnamex = map (parseForest . L.toStrict) . L.lines