{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Symantic.XML.Read ( module Symantic.XML.Read.Parser , module Symantic.XML.Read ) where import Control.Arrow (left) import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..), void, unless, forM, join) import Data.Bool import Data.Char (Char) import Data.Default.Class (Default(..)) import Data.Either (Either(..)) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.), const) import Data.Functor ((<$>), (<$)) import Data.Maybe (Maybe(..), maybe) import Data.Monoid (Monoid(..)) import Data.Ord (Ord(..)) import Data.Semigroup (Semigroup(..)) import Data.String (String) import Data.TreeSeq.Strict (Tree(..)) import Data.Tuple (snd) import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger) import System.IO (FilePath, IO) import Text.Megaparsec (()) import Text.Show (Show(..)) import qualified Control.Exception as Exn import qualified Control.Monad.Trans.Reader as R import qualified Data.ByteString.Lazy as BSL import qualified Data.Char as Char import qualified Data.Char.Properties.XMLCharProps as XC import qualified Data.HashMap.Strict as HM import qualified Data.List as List import qualified Data.Sequence as Seq import qualified Data.Text.Encoding.Error as TL import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TL import qualified Data.TreeSeq.Strict as TS import qualified System.IO.Error as IO import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import Symantic.XML.Document hiding (XML, XMLs) import Symantic.XML.Read.Parser readXML :: FilePath -> TL.Text -> Either (P.ParseErrorBundle TL.Text Error) XMLs readXML filePath stateInput = snd $ P.runParser' (R.runReaderT p_document def) P.State { P.stateInput , P.stateOffset = 0 , P.statePosState = P.PosState { P.pstateInput = stateInput , P.pstateOffset = 0 , P.pstateSourcePos = P.initialPos filePath , P.pstateTabWidth = P.pos1 , P.pstateLinePrefix = "" } } readFile :: FilePath -> IO (Either ErrorRead TL.Text) readFile fp = (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp) `Exn.catch` \e -> if IO.isAlreadyInUseError e || IO.isDoesNotExistError e || IO.isPermissionError e then return $ Left $ ErrorRead_IO e else IO.ioError e -- * Type 'ErrorRead' data ErrorRead = ErrorRead_IO IO.IOError | ErrorRead_Unicode TL.UnicodeException deriving (Show) -- * Document p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs p_document = do ps <- p_prolog e <- p_Element ms <- P.many p_Misc P.eof return (ps <> pure e <> join (Seq.fromList ms)) -- ** Prolog p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs p_prolog = do xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl ms <- P.many p_Misc return (xmlDecl <> join (Seq.fromList ms)) -- ** Misc p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs p_Misc = P.try (pure <$> p_Comment) <|> P.try (pure <$> p_PI) <|> pure <$> p_S -- ** XMLDecl p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML p_XMLDecl = P.label "XMLDecl" $ do Sourced src as <- p_Sourced $ P.between (P.string "") $ do vi <- pure <$> p_VersionInfo ed <- P.option Seq.empty $ pure <$> p_EncodingDecl sd <- P.option Seq.empty $ pure <$> p_SDDecl p_Spaces return $ vi <> ed <> sd return $ Tree (Sourced src $ NodePI "xml" "") as p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML p_VersionInfo = P.label "VersionInfo" $ do Sourced c v <- p_Sourced $ do P.try (() <$ p_Spaces1 <* P.string "version") p_Eq p_quoted $ const $ p_Sourced $ (<>) <$> P.string "1." <*> P.takeWhile1P Nothing Char.isDigit return $ Tree (Sourced c $ NodeAttr "version") $ pure $ TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML p_EncodingDecl = P.label "EncodingDecl" $ do Sourced c v <- p_Sourced $ do P.try (() <$ p_Spaces1 <* P.string "encoding") p_Eq p_quoted $ const $ p_Sourced p_EncName return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $ TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text p_EncName = P.label "EncName" $ do P.notFollowedBy (P.satisfy $ not . isAlpha) P.takeWhile1P Nothing $ \c -> isAlpha c || Char.isDigit c || c=='.' || c=='_' || c=='-' where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c -- *** SDDecl p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML p_SDDecl = P.label "SDDecl" $ do p_SourcedBegin $ do Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone") p_Eq v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no" return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $ TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v -- ** CharData p_CharData :: P.Tokens s ~ TL.Text => Parser e s EscapedText p_CharData = escapeText <$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>") -- ** Comment p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML p_Comment = p_SourcedBegin $ P.string "" cell <- p_SourcedEnd return $ TS.tree0 (cell $ NodeComment c) -- ** CDATA p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML p_CDSect = p_SourcedBegin $ P.string " p_CDSect__ p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML p_CDSect_ = P.string "[CDATA[" *> p_CDSect__ p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML p_CDSect__ = P.label "CDSect" $ do c <- p_until XC.isXmlChar (']', "]>") void $ P.string "]]>" cell <- p_SourcedEnd return $ TS.tree0 $ cell $ NodeCDATA c -- ** PI p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML p_PI = p_SourcedBegin $ P.string " p_PI__ p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML p_PI_ = P.char '?' *> p_PI__ p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML p_PI__ = P.label "PI" $ do n <- p_PITarget v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">") void $ P.string "?>" cell <- p_SourcedEnd return $ TS.tree0 $ cell $ NodePI n v p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName p_PITarget = do n <- p_PName case n of PName{pNameSpace=Nothing, pNameLocal=NCName l} | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n _ -> return n -- ** Element p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_) p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML p_Element_ = P.label "Element" p_STag -- *** STag p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML p_STag = do n <- p_PName as <- P.many $ P.try $ p_Spaces1 *> p_Attribute p_Spaces ro <- R.ask elemNS :: HM.HashMap NCName Namespace <- (HM.fromList . List.concat <$>) $ forM as $ \case Sourced _ (PName{..}, Sourced _ av) | ns <- Namespace $ unescapeText av , Nothing <- pNameSpace , NCName "xmlns" <- pNameLocal -> -- NOTE: default namespace declaration. case ns of _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace -> p_error $ Error_Namespace_reserved ns _ -> return [(NCName "" , ns)] | ns <- Namespace $ unescapeText av , Just (NCName "xmlns") <- pNameSpace -> -- NOTE: namespace prefix declaration. case unNCName pNameLocal of "xml" -- DOC: It MAY, but need not, be declared, -- and MUST NOT be bound to any other namespace name. | ns == xmlns_xml -> return [] | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal "xmlns" -- DOC: It MUST NOT be declared -> p_error $ Error_Namespace_reserved_prefix pNameLocal local | "xml" <- TL.toLower $ TL.take 3 local -> return [] -- DOC: All other prefixes beginning with the three-letter -- sequence x, m, l, in any case combination, are reserved. -- This means that: processors MUST NOT treat them as fatal errors. _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name. || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name. -> p_error $ Error_Namespace_reserved ns _ -> return [(pNameLocal, ns)] | otherwise -> return [] let scopeNS = elemNS <> reader_ns_scope ro let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS let lookupNamePrefix prefix = maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $ HM.lookup prefix scopeNS elemName :: QName <- -- NOTE: expand element's QName. case pNameSpace n of Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n} -- DOC: If there is a default namespace declaration in scope, -- the expanded name corresponding to an unprefixed element name -- has the URI of the default namespace as its namespace name. Just prefix | NCName "xmlns" <- prefix -> -- DOC: Element names MUST NOT have the prefix xmlns. p_error $ Error_Namespace_reserved_prefix prefix | otherwise -> do ns <- lookupNamePrefix prefix return QName{qNameSpace=ns, qNameLocal=pNameLocal n} elemAttrs :: [FileSourced (QName, FileSourced EscapedText)] <- -- NOTE: expand attributes' PName into QName. forM as $ \s@Sourced{unSourced=(an, av)} -> do ns <- maybe (return "") lookupNamePrefix $ pNameSpace an let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an} return s{unSourced=(qn, av)} -- NOTE: check for attribute collision. let attrsByQName :: HM.HashMap QName [FileSourced (QName, FileSourced EscapedText)] = HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a]) case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of (an, _):_ -> p_error $ Error_Attribute_collision an _ -> return () elemAttrsXML :: XMLs <- (Seq.fromList <$>) $ forM elemAttrs $ \(Sourced sa (an, av)) -> do return $ TS.Tree (Sourced sa $ NodeAttr an) $ pure $ TS.tree0 $ NodeText <$> av content :: XMLs <- elemAttrsXML <$ P.string "/>" <|> R.local (const ro { reader_ns_scope = scopeNS , reader_ns_default = defaultNS }) ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName) cell <- p_SourcedEnd return $ Tree (cell $ NodeElem elemName) content -- *** Attribute p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced (PName, FileSourced EscapedText)) p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced EscapedText) p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (FileSourced EscapedText) p_AttValueText q = p_Sourced $ EscapedText . Seq.fromList <$> P.many ( p_Reference <|> EscapedPlain <$> P.takeWhile1P Nothing (\c -> XC.isXmlChar c && c `List.notElem` (q:"<&'\">")) <|> EscapedEntityRef entityRef_gt <$ P.char '>' <|> (if q == '\'' then EscapedEntityRef entityRef_quot <$ P.char '"' else EscapedEntityRef entityRef_apos <$ P.char '\'') ) -- * content p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs p_content = (Seq.fromList <$>) $ P.many $ (p_SourcedBegin $ do P.try $ P.char '<' *> P.notFollowedBy (P.char '/') p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_)) ) <|> ((tree0 <$>) $ p_Sourced $ NodeText . mconcat <$> P.some (p_CharData <|> EscapedText . pure <$> p_Reference)) -- *** ETag p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s () p_ETag expected = do got <- P.string " p_QName <* p_Spaces <* P.char '>' unless (got == expected) $ p_error $ Error_Closing_tag_unexpected got expected -- * Name p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name p_Name = P.label "Name" $ Name <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar) <*> P.takeWhile1P Nothing XC.isXmlNameChar -- * PName p_PName :: P.Tokens s ~ TL.Text => Parser e s PName p_PName = P.label "PName" $ do n <- p_NCName s <- P.optional $ P.try $ P.char ':' *> p_NCName return $ case s of Nothing -> PName{pNameSpace=Nothing, pNameLocal=n} Just l -> PName{pNameSpace=Just n , pNameLocal=l} -- * QName p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName p_QName = P.label "QName" $ do n <- p_NCName s <- P.optional $ P.try $ P.char ':' *> p_NCName Reader{..} <- R.ask case s of Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n} Just l -> case HM.lookup n reader_ns_scope of Nothing -> p_error $ Error_Namespace_prefix_unknown n Just ns -> return QName{qNameSpace=ns, qNameLocal=l} -- ** NCName p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName p_NCName = P.label "NCName" $ NCName <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar) <*> P.takeWhile1P Nothing XC.isXmlNCNameChar -- * Reference p_Reference :: P.Tokens s ~ TL.Text => Parser Error s Escaped p_Reference = EscapedCharRef <$> p_CharRef <|> EscapedEntityRef <$> p_EntityRef -- ** EntityRef p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef p_EntityRef = P.label "EntityRef" $ do ref <- P.char '&' *> p_NCName <* P.char ';' EntityRef ref <$> lookupEntityRef ref where lookupEntityRef (NCName "lt" ) = pure "<" lookupEntityRef (NCName "gt" ) = pure ">" lookupEntityRef (NCName "amp" ) = pure "&" lookupEntityRef (NCName "apos") = pure "'" lookupEntityRef (NCName "quot") = pure "\"" lookupEntityRef n = p_error $ Error_EntityRef_unknown n -- ** CharRef p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef p_CharRef = P.label "CharRef" $ do ref <- readHexadecimal <$ P.string "&#x" <*> P.some P.hexDigitChar <* P.char ';' check ref <|> do ref <- readDecimal <$ P.string "&#" <*> P.some P.digitChar <* P.char ';' check ref where check i = let c = toEnum (fromInteger i) in if i <= toInteger (fromEnum (maxBound::Char)) && XC.isXmlChar c then pure $ CharRef c else p_error $ Error_CharRef_invalid i readInt :: Integer -> String -> Integer readInt base digits = sign * List.foldl' acc 0 (List.concatMap digToInt digits1) where acc q r = q*base + r (sign, digits1) = case digits of [] -> (1, digits) c:ds | c == '-' -> (-1, ds) | c == '+' -> ( 1, ds) | otherwise -> ( 1, digits) ord = toInteger . Char.ord digToInt c | Char.isDigit c = [ord c - ord '0'] | Char.isAsciiLower c = [ord c - ord 'a' + 10] | Char.isAsciiUpper c = [ord c - ord 'A' + 10] | otherwise = [] readDecimal :: String -> Integer readDecimal = readInt 10 readHexadecimal :: String -> Integer readHexadecimal = readInt 16 -- * Char p_Char :: P.Tokens s ~ TL.Text => Parser e s Char p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF {-# INLINE p_Char #-} -- ** Space -- | Map '\r' and '\r\n' to '\n'. p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char p_CRLF = P.label "CRLF" $ P.char '\r' *> P.option '\n' (P.char '\n') p_Space :: P.Tokens s ~ TL.Text => Parser e s Char p_Space = P.label "Space" $ P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF {-# INLINE p_Space #-} p_Spaces :: P.Tokens s ~ TL.Text => Parser e s () p_Spaces = P.label "Spaces" $ void $ P.takeWhileP Nothing XC.isXmlSpaceChar {-# INLINE p_Spaces #-} p_S :: P.Tokens s ~ TL.Text => Parser Error s XML p_S = P.label "Spaces" $ (\ts -> TS.tree0 (NodeText . EscapedText . pure . EscapedPlain . TL.concat <$> ts)) <$> p_Sourced (P.some $ P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|> TL.singleton <$> p_CRLF) p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s () p_Spaces1 = P.label "Spaces1" $ void $ P.takeWhile1P Nothing XC.isXmlSpaceChar {-# INLINE p_Spaces1 #-} -- * Eq p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s () p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces [c] p_Eq :: P.Tokens s ~ TL.Text => Parser e s () p_Eq = p_separator '=' "Eq"