{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Symantic.XML.Read
( module Symantic.XML.Read.Parser
, module Symantic.XML.Read
) where
import Control.Arrow (left)
import Control.Applicative (Applicative(..), Alternative(..))
import Control.Monad (Monad(..), void, unless, forM, join)
import Data.Bool
import Data.Char (Char)
import Data.Default.Class (Default(..))
import Data.Either (Either(..))
import Data.Eq (Eq(..))
import Data.Foldable (Foldable(..))
import Data.Function (($), (.), const)
import Data.Functor ((<$>), (<$))
import Data.Maybe (Maybe(..), maybe)
import Data.Monoid (Monoid(..))
import Data.Ord (Ord(..))
import Data.Semigroup (Semigroup(..))
import Data.String (String)
import Data.TreeSeq.Strict (Tree(..))
import Data.Tuple (snd)
import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
import System.IO (FilePath, IO)
import Text.Megaparsec ((<?>))
import Text.Show (Show(..))
import qualified Control.Exception as Exn
import qualified Control.Monad.Trans.Reader as R
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Char as Char
import qualified Data.Char.Properties.XMLCharProps as XC
import qualified Data.HashMap.Strict as HM
import qualified Data.List as List
import qualified Data.Sequence as Seq
import qualified Data.Text.Encoding.Error as TL
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.TreeSeq.Strict as TS
import qualified System.IO.Error as IO
import qualified Text.Megaparsec as P
import qualified Text.Megaparsec.Char as P
import Symantic.XML.Document hiding (XML, XMLs)
import Symantic.XML.Read.Parser
readXML :: FilePath -> TL.Text -> Either (P.ParseErrorBundle TL.Text Error) XMLs
readXML filePath stateInput =
snd $
P.runParser'
(R.runReaderT p_document def)
P.State
{ P.stateInput
, P.stateOffset = 0
, P.statePosState = P.PosState
{ P.pstateInput = stateInput
, P.pstateOffset = 0
, P.pstateSourcePos = P.initialPos filePath
, P.pstateTabWidth = P.pos1
, P.pstateLinePrefix = ""
}
}
readFile :: FilePath -> IO (Either ErrorRead TL.Text)
readFile fp =
(left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp)
`Exn.catch` \e ->
if IO.isAlreadyInUseError e
|| IO.isDoesNotExistError e
|| IO.isPermissionError e
then return $ Left $ ErrorRead_IO e
else IO.ioError e
data ErrorRead
= ErrorRead_IO IO.IOError
| ErrorRead_Unicode TL.UnicodeException
deriving (Show)
p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
p_document = do
ps <- p_prolog
e <- p_Element
ms <- P.many p_Misc
P.eof
return (ps <> pure e <> join (Seq.fromList ms))
p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
p_prolog = do
xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
ms <- P.many p_Misc
return (xmlDecl <> join (Seq.fromList ms))
p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
p_Misc =
P.try (pure <$> p_Comment)
<|> P.try (pure <$> p_PI)
<|> pure <$> p_S
p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
p_XMLDecl = P.label "XMLDecl" $ do
Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
vi <- pure <$> p_VersionInfo
ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
sd <- P.option Seq.empty $ pure <$> p_SDDecl
p_Spaces
return $ vi <> ed <> sd
return $ Tree (Sourced src $ NodePI "xml" "") as
p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
p_VersionInfo = P.label "VersionInfo" $ do
Sourced c v <- p_Sourced $ do
P.try (() <$ p_Spaces1 <* P.string "version")
p_Eq
p_quoted $ const $ p_Sourced $
(<>)
<$> P.string "1."
<*> P.takeWhile1P Nothing Char.isDigit
return $ Tree (Sourced c $ NodeAttr "version") $ pure $
TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
p_EncodingDecl = P.label "EncodingDecl" $ do
Sourced c v <- p_Sourced $ do
P.try (() <$ p_Spaces1 <* P.string "encoding")
p_Eq
p_quoted $ const $ p_Sourced p_EncName
return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
p_EncName = P.label "EncName" $ do
P.notFollowedBy (P.satisfy $ not . isAlpha)
P.takeWhile1P Nothing $ \c ->
isAlpha c || Char.isDigit c ||
c=='.' || c=='_' || c=='-'
where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
p_SDDecl = P.label "SDDecl" $ do
p_SourcedBegin $ do
Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone")
p_Eq
v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
p_CharData :: P.Tokens s ~ TL.Text => Parser e s EscapedText
p_CharData =
escapeText
<$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_Comment_ = P.string "--" *> p_Comment__
p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
p_Comment__ = P.label "Comment" $ do
c <- p_until XC.isXmlChar ('-', "-")
void $ P.string "-->"
cell <- p_SourcedEnd
return $ TS.tree0 (cell $ NodeComment c)
p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_CDSect__ = P.label "CDSect" $ do
c <- p_until XC.isXmlChar (']', "]>")
void $ P.string "]]>"
cell <- p_SourcedEnd
return $ TS.tree0 $ cell $ NodeCDATA c
p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_PI_ = P.char '?' *> p_PI__
p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_PI__ = P.label "PI" $ do
n <- p_PITarget
v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
void $ P.string "?>"
cell <- p_SourcedEnd
return $ TS.tree0 $ cell $ NodePI n v
p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
p_PITarget = do
n <- p_PName
case n of
PName{pNameSpace=Nothing, pNameLocal=NCName l}
| "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
_ -> return n
p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_Element_ = P.label "Element" p_STag
p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
p_STag = do
n <- p_PName
as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
p_Spaces
ro <- R.ask
elemNS :: HM.HashMap NCName Namespace <-
(HM.fromList . List.concat <$>) $ forM as $ \case
Sourced _ (PName{..}, Sourced _ av)
| ns <- Namespace $ unescapeText av
, Nothing <- pNameSpace
, NCName "xmlns" <- pNameLocal ->
case ns of
_ | ns == xmlns_xml
|| ns == xmlns_xmlns
-> p_error $ Error_Namespace_reserved ns
_ -> return [(NCName "" , ns)]
| ns <- Namespace $ unescapeText av
, Just (NCName "xmlns") <- pNameSpace ->
case unNCName pNameLocal of
"xml"
| ns == xmlns_xml -> return []
| otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
"xmlns"
-> p_error $ Error_Namespace_reserved_prefix pNameLocal
local | "xml" <- TL.toLower $ TL.take 3 local -> return []
_ | ns == xmlns_xml
|| ns == xmlns_xmlns
-> p_error $ Error_Namespace_reserved ns
_ -> return [(pNameLocal, ns)]
| otherwise -> return []
let scopeNS = elemNS <> reader_ns_scope ro
let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
let lookupNamePrefix prefix =
maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
HM.lookup prefix scopeNS
elemName :: QName <-
case pNameSpace n of
Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
Just prefix
| NCName "xmlns" <- prefix ->
p_error $ Error_Namespace_reserved_prefix prefix
| otherwise -> do
ns <- lookupNamePrefix prefix
return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
elemAttrs :: [FileSourced (QName, FileSourced EscapedText)] <-
forM as $ \s@Sourced{unSourced=(an, av)} -> do
ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
return s{unSourced=(qn, av)}
let attrsByQName :: HM.HashMap QName [FileSourced (QName, FileSourced EscapedText)] =
HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
(an, _):_ -> p_error $ Error_Attribute_collision an
_ -> return ()
elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
forM elemAttrs $ \(Sourced sa (an, av)) -> do
return $ TS.Tree (Sourced sa $ NodeAttr an) $
pure $ TS.tree0 $ NodeText <$> av
content :: XMLs <-
elemAttrsXML <$ P.string "/>" <|>
R.local
(const ro
{ reader_ns_scope = scopeNS
, reader_ns_default = defaultNS
})
((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
cell <- p_SourcedEnd
return $ Tree (cell $ NodeElem elemName) content
p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced (PName, FileSourced EscapedText))
p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced EscapedText)
p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (FileSourced EscapedText)
p_AttValueText q = p_Sourced $
EscapedText . Seq.fromList <$> P.many
( p_Reference
<|> EscapedPlain <$> P.takeWhile1P Nothing (\c ->
XC.isXmlChar c &&
c `List.notElem` (q:"<&'\">"))
<|> EscapedEntityRef entityRef_gt <$ P.char '>'
<|> (if q == '\''
then EscapedEntityRef entityRef_quot <$ P.char '"'
else EscapedEntityRef entityRef_apos <$ P.char '\'')
)
p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
p_content =
(Seq.fromList <$>) $ P.many $
(p_SourcedBegin $ do
P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
)
<|> ((tree0 <$>) $ p_Sourced $ NodeText . mconcat
<$> P.some (p_CharData <|> EscapedText . pure <$> p_Reference))
p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
p_ETag expected = do
got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
unless (got == expected) $
p_error $ Error_Closing_tag_unexpected got expected
p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
p_Name = P.label "Name" $
Name
<$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
<*> P.takeWhile1P Nothing XC.isXmlNameChar
p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
p_PName = P.label "PName" $ do
n <- p_NCName
s <- P.optional $ P.try $ P.char ':' *> p_NCName
return $ case s of
Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
Just l -> PName{pNameSpace=Just n , pNameLocal=l}
p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
p_QName = P.label "QName" $ do
n <- p_NCName
s <- P.optional $ P.try $ P.char ':' *> p_NCName
Reader{..} <- R.ask
case s of
Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
Just l ->
case HM.lookup n reader_ns_scope of
Nothing -> p_error $ Error_Namespace_prefix_unknown n
Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
p_NCName = P.label "NCName" $
NCName
<$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
<*> P.takeWhile1P Nothing XC.isXmlNCNameChar
p_Reference :: P.Tokens s ~ TL.Text => Parser Error s Escaped
p_Reference =
EscapedCharRef <$> p_CharRef <|>
EscapedEntityRef <$> p_EntityRef
p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
p_EntityRef = P.label "EntityRef" $ do
ref <- P.char '&' *> p_NCName <* P.char ';'
EntityRef ref <$> lookupEntityRef ref
where
lookupEntityRef (NCName "lt" ) = pure "<"
lookupEntityRef (NCName "gt" ) = pure ">"
lookupEntityRef (NCName "amp" ) = pure "&"
lookupEntityRef (NCName "apos") = pure "'"
lookupEntityRef (NCName "quot") = pure "\""
lookupEntityRef n = p_error $ Error_EntityRef_unknown n
p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
p_CharRef = P.label "CharRef" $
do
ref <- readHexadecimal
<$ P.string "&#x"
<*> P.some P.hexDigitChar
<* P.char ';'
check ref
<|> do
ref <- readDecimal
<$ P.string "&#"
<*> P.some P.digitChar
<* P.char ';'
check ref
where
check i =
let c = toEnum (fromInteger i) in
if i <= toInteger (fromEnum (maxBound::Char))
&& XC.isXmlChar c
then pure $ CharRef c
else p_error $ Error_CharRef_invalid i
readInt :: Integer -> String -> Integer
readInt base digits =
sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
where
acc q r = q*base + r
(sign, digits1) =
case digits of
[] -> (1, digits)
c:ds | c == '-' -> (-1, ds)
| c == '+' -> ( 1, ds)
| otherwise -> ( 1, digits)
ord = toInteger . Char.ord
digToInt c
| Char.isDigit c = [ord c - ord '0']
| Char.isAsciiLower c = [ord c - ord 'a' + 10]
| Char.isAsciiUpper c = [ord c - ord 'A' + 10]
| otherwise = []
readDecimal :: String -> Integer
readDecimal = readInt 10
readHexadecimal :: String -> Integer
readHexadecimal = readInt 16
p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
{-# INLINE p_Char #-}
p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
p_CRLF = P.label "CRLF" $
P.char '\r' *> P.option '\n' (P.char '\n')
p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
p_Space = P.label "Space" $
P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
{-# INLINE p_Space #-}
p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
p_Spaces = P.label "Spaces" $
void $ P.takeWhileP Nothing XC.isXmlSpaceChar
{-# INLINE p_Spaces #-}
p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
p_S = P.label "Spaces" $
(\ts -> TS.tree0 (NodeText . EscapedText . pure . EscapedPlain . TL.concat <$> ts))
<$> p_Sourced (P.some $
P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
TL.singleton <$> p_CRLF)
p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
p_Spaces1 = P.label "Spaces1" $
void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
{-# INLINE p_Spaces1 #-}
p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
p_Eq = p_separator '=' <?> "Eq"