-----------------------------------------------------------------------------
-- Copyright 2019, Ideas project team. This file is distributed under the
-- terms of the Apache License 2.0. For more information, see the files
-- "LICENSE.txt" and "NOTICE.txt", which are included in the distribution.
-----------------------------------------------------------------------------
-- |
-- Maintainer : bastiaan.heeren@ou.nl
-- Stability : provisional
-- Portability : portable (depends on ghc)
--
-- Collection of common operation on XML documents
--
-----------------------------------------------------------------------------
module Ideas.Text.XML
( -- * XML types
XML, Name, Attributes, Attribute(..)
-- * Parsing XML
, parseXML, parseXMLFile
-- * Building/constructing XML
, BuildXML(..), XMLBuilder, makeXML
-- * Pretty-printing XML
, prettyXML, compactXML
-- * Simple decoding queries
, name, attributes, findAttribute, children, findChildren, findChild
, getData, expecting
-- * Decoding XML
, decodeData, decodeAttribute, decodeChild, decodeFirstChild
-- * Type classes for converting to/from XML
, ToXML(..), builderXML, InXML(..)
-- * Processing XML
, foldXML, trimXML
-- * Deprecated functions
, content, emptyContent, fromBuilder
) where
import Control.Monad.State
import Data.Char (chr, ord, isSpace)
import Data.Foldable (toList)
import Data.List
import Data.Maybe
import Data.Semigroup as Sem
import Data.String
import Ideas.Text.XML.Document (escape, Name, prettyElement)
import Ideas.Text.XML.Parser (document)
import Ideas.Text.XML.Unicode
import Ideas.Utils.Decoding
import Ideas.Utils.Parsing (parseSimple)
import System.IO
import qualified Data.Map as M
import qualified Data.Sequence as Seq
import qualified Ideas.Text.XML.Document as D
-------------------------------------------------------------------------------
-- XML types
-- invariants content: no two adjacent Lefts, no Left with empty string,
-- valid tag/attribute names
data XML = Tag
{ name :: Name
, attributes :: Attributes
, content :: [Either String XML]
}
deriving Eq
instance Show XML where
show = compactXML
type Attributes = [Attribute]
data Attribute = Name := String
deriving Eq
-------------------------------------------------------------------------------
-- Parsing XML
parseXML :: String -> Either String XML
parseXML input = do
doc <- parseSimple document input
return (fromXMLDoc doc)
parseXMLFile :: FilePath -> IO XML
parseXMLFile file =
withBinaryFile file ReadMode $
hGetContents >=> either fail return . parseXML
fromXMLDoc :: D.XMLDoc -> XML
fromXMLDoc doc = fromElement (D.root doc)
where
fromElement (D.Element n as c) =
makeXML n (fromAttributes as <> fromContent c)
fromAttributes = mconcat . map fromAttribute
fromAttribute (n D.:= v) =
n .=. concatMap (either return refToString) v
fromContent :: D.Content -> XMLBuilder
fromContent = mconcat . map f
where
f :: D.XML -> XMLBuilder
f (D.Tagged e) = builder (fromElement e)
f (D.CharData s) = string s
f (D.CDATA s) = string s
f (D.Reference r) = fromReference r
refToString :: D.Reference -> String
refToString (D.CharRef i) = [chr i]
refToString (D.EntityRef s) = maybe "" return (lookup s general)
fromReference :: D.Reference -> XMLBuilder
fromReference (D.CharRef i) = char (chr i)
fromReference (D.EntityRef s) = fromMaybe mempty (lookup s entities)
entities :: [(String, XMLBuilder)]
entities =
[ (n, fromContent (snd ext)) | (n, ext) <- D.externals doc ] ++
-- predefined entities
[ (n, char c) | (n, c) <- general ]
general :: [(String, Char)]
general = [("lt",'<'), ("gt",'>'), ("amp",'&'), ("apos",'\''), ("quot",'"')]
-------------------------------------------------------------------------------
-- Building/constructing XML
infix 7 .=.
class (Sem.Semigroup a, Monoid a) => BuildXML a where
(.=.) :: String -> String -> a -- attribute
string :: String -> a -- (escaped) text
builder :: XML -> a -- (named) xml element
tag :: String -> a -> a -- tag (with content)
-- functions with a default
char :: Char -> a
text :: Show s => s -> a -- escaped text with Show class
element :: String -> [a] -> a
emptyTag :: String -> a
-- implementations
char c = string [c]
text = string . show
element s = tag s . mconcat
emptyTag s = tag s mempty
instance BuildXML a => BuildXML (Decoder env s a) where
n .=. s = pure (n .=. s)
string = pure . string
builder = pure . builder
tag = fmap . tag
data XMLBuilder = BS (Seq.Seq Attribute) (Seq.Seq (Either String XML))
instance Sem.Semigroup XMLBuilder where
BS as1 elts1 <> BS as2 elts2 = BS (as1 <> as2) (elts1 <> elts2)
instance Monoid XMLBuilder where
mempty = BS mempty mempty
mappend = (<>)
instance BuildXML XMLBuilder where
n .=. s = nameCheck n $ BS (Seq.singleton (n := s)) mempty
string s = BS mempty (if null s then mempty else Seq.singleton (Left s))
builder = BS mempty . Seq.singleton . Right
tag n = builder . uncurry (Tag n) . fromBS . nameCheck n
instance IsString XMLBuilder where
fromString = string
makeXML :: String -> XMLBuilder -> XML
makeXML s = uncurry (Tag s) . fromBS . nameCheck s
nameCheck :: String -> a -> a
nameCheck s = if isName s then id else fail $ "Invalid name " ++ s
isName :: String -> Bool
isName [] = False
isName (x:xs) = (isLetter x || x `elem` "_:") && all isNameChar xs
isNameChar :: Char -> Bool
isNameChar c = any ($ c) [isLetter, isDigit, isCombiningChar, isExtender, (`elem` ".-_:")]
-- local helper: merge attributes, but preserve order
fromBS :: XMLBuilder -> (Attributes, [Either String XML])
fromBS (BS as elts) = (attrList, merge (toList elts))
where
attrMap = foldr add M.empty as
add (k := v) = M.insertWith (\x y -> x ++ " " ++ y) k v
attrList = nubBy eqKey (map make (toList as))
make (k := _) = k := M.findWithDefault "" k attrMap
eqKey (k1 := _) (k2 := _) = k1 == k2
merge [] = []
merge (Left x:Left y:rest) = merge (Left (x++y):rest)
merge (Left x:rest) = Left x : merge rest
merge (Right y:rest) = Right y : merge rest
-------------------------------------------------------------------------------
-- Pretty-printing XML
prettyXML :: XML -> String
prettyXML = show . prettyElement False . toElement
compactXML :: XML -> String
compactXML = show . prettyElement True . toElement
toElement :: XML -> D.Element
toElement = foldXML make mkAttribute mkString
where
make n as = D.Element n as . concatMap (either id (return . D.Tagged))
mkAttribute :: Attribute -> D.Attribute
mkAttribute (m := s) = (D.:=) m (map Left s)
mkString :: String -> [D.XML]
mkString [] = []
mkString xs@(hd:tl)
| null xs1 = D.Reference (D.CharRef (ord hd)) : mkString tl
| otherwise = D.CharData xs1 : mkString xs2
where
(xs1, xs2) = break ((> 127) . ord) xs
-------------------------------------------------------------------------------
-- Simple decoding queries
findAttribute :: Monad m => String -> XML -> m String
findAttribute s (Tag _ as _) =
case [ t | n := t <- as, s==n ] of
[hd] -> return hd
_ -> fail $ "Invalid attribute: " ++ show s
children :: XML -> [XML]
children e = [ c | Right c <- content e ]
findChildren :: String -> XML -> [XML]
findChildren s = filter ((==s) . name) . children
findChild :: Monad m => String -> XML -> m XML
findChild s e =
case findChildren s e of
[] -> fail $ "Child not found: " ++ show s
[a] -> return a
_ -> fail $ "Multiple children found: " ++ show s
getData :: XML -> String
getData e = concat [ s | Left s <- content e ]
expecting :: Monad m => String -> XML -> m ()
expecting s xml =
unless (name xml == s) $ fail $ "Expecting element " ++ s ++ ", but found " ++ name xml
-------------------------------------------------------------------------------
-- Decoding XML
decodeData :: Decoder env XML String
decodeData = get >>= \xml ->
case content xml of
Left s:rest -> put xml {content = rest} >> return s
_ -> fail "Could not find data"
decodeAttribute :: String -> Decoder env XML String
decodeAttribute s = get >>= \xml ->
case break hasName (attributes xml) of
(xs, (_ := val):ys) -> put xml {attributes = xs ++ ys } >> return val
_ -> fail $ "Could not find attribute " ++ s
where
hasName (n := _) = n == s
decodeChild :: Name -> Decoder env XML a -> Decoder env XML a
decodeChild s p = get >>= \xml ->
case break hasName (content xml) of
(xs, Right y:ys) -> do
put y
a <- p
put xml { content = xs ++ ys }
return a
_ -> fail $ "Could not find child " ++ s
where
hasName = either (const False) ((==s) . name)
decodeFirstChild :: Name -> Decoder env XML a -> Decoder env XML a
decodeFirstChild s p = get >>= \xml ->
case content xml of
Right y:ys | name y == s -> do
put y
a <- p
put xml { content = ys }
return a
_ -> fail $ "Could not find first child " ++ s
-------------------------------------------------------------------------------
-- Type classes for converting to/from XML
class ToXML a where
toXML :: a -> XML
listToXML :: [a] -> XML
-- default definitions
listToXML = makeXML "list" . mconcat . map builderXML
instance ToXML () where
toXML _ = makeXML "Unit" mempty
instance ToXML a => ToXML (Maybe a) where
toXML = maybe (makeXML "Nothing" mempty) toXML
builderXML :: (ToXML a, BuildXML b) => a -> b
builderXML = builder . toXML
class ToXML a => InXML a where
fromXML :: Monad m => XML -> m a
listFromXML :: Monad m => XML -> m [a]
listFromXML xml
| name xml == "list" && null (attributes xml) =
mapM fromXML (children xml)
| otherwise = fail "expecting a list tag"
-------------------------------------------------------------------------------
-- Processing XML
foldXML :: (Name -> [a] -> [Either s e] -> e) -> (Attribute -> a) -> (String -> s) -> XML -> e
foldXML fe fa fs = rec
where
rec (Tag n as cs) = fe n (map fa as) (map (either (Left . fs) (Right . rec)) cs)
trimXML :: XML -> XML
trimXML = foldXML make fa (string . trim)
where
fa (n := s) = n .=. trim s
make :: String -> [XMLBuilder] -> [Either XMLBuilder XML] -> XML
make s as = makeXML s . mconcat . (as ++) . map (either id builder)
trim, trimLeft, trimRight :: String -> String
trim = trimLeft . trimRight
trimLeft = dropWhile isSpace
trimRight = reverse . trimLeft . reverse
-------------------------------------------------------------------------------
-- Deprecated functions
emptyContent :: XML -> Bool
emptyContent = null . content
fromBuilder :: XMLBuilder -> Maybe XML
fromBuilder m =
case fromBS m of
([], [Right a]) -> Just a
_ -> Nothing
-------------------------------------------------------------------------------
-- Tests
_runTests :: IO ()
_runTests = do
forM_ [testDataP, testAttrP, testDataB, testAttrB] $ \f ->
pp $ map f tests
forM_ [mkPD, mkPA, mkBD, mkBA] $ \f ->
pp $ map (testXML . f) tests
where
pp = putStrLn . map (\b -> if b then '.' else 'X')
tests :: [String]
tests =
[ "input"
, "<>&"'"
, "<>&'\""
, "p & q' => p"
, ""
, " "
, "eerste \n\n derde regel"
]
testDataP, testAttrP, testDataB, testAttrB :: String -> Bool
testDataP s = let xml = mkPD s in getData xml == s
testAttrP s = let xml = mkPA s in findAttribute "a" xml == Just s
testDataB s = let xml = mkBD s in getData xml == s
testAttrB s = let xml = mkBA s in findAttribute "a" xml == Just s
testXML :: XML -> Bool
testXML xml =
case parseXML (compactXML xml) of
Left msg -> error msg
Right a -> a == xml
mkPD, mkPA, mkBD, mkBA :: String -> XML
mkPD s = either error id $ parseXML $ "" ++ escape s ++ ""
mkPA s = either error id $ parseXML $ ""
mkBD s = makeXML "a" (string s)
mkBA s = makeXML "t" ("a".=. s)