{-# LANGUAGE OverloadedStrings #-}
{- |
Parsing text in the Enamex data format. Each node is enclosed between
opening and closing tags with tag name representing the label and contents
representing children of the node. Both leaf and label values should be
escaped by prepending the \\ character before special >, <, \\ and space
characters.
Example:
>>> :m Text.Named.Enamex Data.Named.Tree Data.Text.Lazy
>>> let drawIt = putStr . drawForest . mapForest show . parseForest
>>> drawIt $ pack "w1.1\\ w1.2 w2 w3"
Left "x"
|
`- Right "w1.1 w1.2"
,
Left "y"
|
+- Left "z"
| |
| `- Right "w2"
|
`- Right "w3"
-}
module Text.Named.Enamex
(
-- * Parsing
parseForest
, parseEnamex
-- * Printing
, showForest
, showEnamex
) where
import Control.Applicative
import Control.Monad ((<=<), when)
import Data.Monoid
import Data.Attoparsec.Text.Lazy
import Data.List (intersperse)
import Data.Function (on)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as L
import qualified Data.Named.Tree as Tr
pForest :: Parser (Tr.NeForest T.Text T.Text)
pForest = pTree `sepBy` (space *> skipSpace)
pTree :: Parser (Tr.NeTree T.Text T.Text)
pTree = pNode
<|> pLeaf
pLeaf :: Parser (Tr.NeTree T.Text T.Text)
pLeaf = Tr.Node <$> (Right <$> pWord) <*> pure []
pNode :: Parser (Tr.NeTree T.Text T.Text)
pNode = do
x <- pOpenTag
kids <- pForest
x' <- pCloseTag
when (x /= x') (fail "Tag start/end mismatch")
return $ Tr.Node (Left x) kids
pOpenTag :: Parser T.Text
pOpenTag = "<" .*> pWord <*. ">"
pCloseTag :: Parser T.Text
pCloseTag = "" .*> pWord <*. ">"
pWord :: Parser T.Text
pWord =
unEscape <$> scan False special
where
special False c =
case c == ' ' || c == '<' || c == '>' of
True -> Nothing
False -> if c == '\\'
then Just True
else Just False
special True _ = Just False
-- | TODO: Use lazy text builder to avoid slowness in the pessimistic case.
unEscape :: T.Text -> T.Text
unEscape xs = x `T.append` case drop1 rest of
Just (y, ys) -> y `T.cons` unEscape ys
Nothing -> ""
where
drop1 = T.uncons <=< return . snd <=< T.uncons
(x, rest) = T.breakOn "\\" xs
-- | TODO: Use lazy text builder to avoid slowness in the pessimistic case.
escape :: T.Text -> T.Text
escape x = case T.uncons z of
Nothing -> y
Just (c, q) -> y
`T.append` ('\\'
`T.cons` (c
`T.cons` escape q))
where
(y, z) = T.break special x
special c = c == ' ' || c == '<' || c == '>' || c == '\\'
-- | Parse the enamex forest.
parseForest :: L.Text -> Tr.NeForest T.Text T.Text
parseForest = either error id . eitherResult . parse (pForest <* endOfInput)
-- | Parse the enamex file.
parseEnamex :: L.Text -> [Tr.NeForest T.Text T.Text]
parseEnamex = map parseForest . L.lines
data Tag = Open | Close | Body
-- | Function which determines between which two tag elements a space
-- character should be inserted.
noSpace :: Tag -> Tag -> Bool
noSpace Open _ = True
noSpace Body Close = True
noSpace Close Close = True
noSpace _ _ = False
-- | We define our own groupBy because the standard version from Data.List
-- assumes that the predicate is transitive.
groupBy :: (a -> a -> Bool) -> [a] -> [[a]]
groupBy p (x : y : xs)
| p x y = join x $ groupBy p (y : xs)
| otherwise = [x] : groupBy p (y : xs)
where
join z (zs : zss) = (z : zs) : zss
join z [] = [[z]]
groupBy _ [x] = [[x]]
groupBy _ [] = []
buildForest :: Tr.NeForest t t -> [(t, Tag)]
buildForest = concat . map buildTree
buildTree :: Tr.NeTree t t -> [(t, Tag)]
buildTree (Tr.Node (Left x) ts) = (x, Open) : buildForest ts ++ [(x, Close)]
buildTree (Tr.Node (Right x) _) = [(x, Body)]
buildStream :: [(T.Text, Tag)] -> L.Builder
buildStream
= mconcat . intersperse " "
. map (mconcat . map buildTag)
. groupBy (noSpace `on` snd)
buildTag :: (T.Text, Tag) -> L.Builder
buildTag (x, tag) = case tag of
Open -> "<" `mappend` y `mappend` ">"
Close -> "" `mappend` y `mappend` ">"
_ -> y
where
y = L.fromText (escape x)
-- | Show the forest.
showForest :: Tr.NeForest T.Text T.Text -> L.Text
showForest = L.toLazyText . buildStream . buildForest
-- | Show the enamex file.
showEnamex :: [Tr.NeForest T.Text T.Text] -> L.Text
showEnamex = L.toLazyText . mconcat . map (L.fromLazyText . showForest)