{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Symantic.XML.Tree.Read where
import Control.Arrow (left)
import Control.Applicative (Applicative(..), Alternative(..))
import Control.Monad (Monad(..), void, unless, forM)
import Data.Bool
import Data.Char (Char)
import Data.Either (Either(..))
import Data.Eq (Eq(..))
import Data.Foldable (Foldable(..))
import Data.Function (($), (.), const)
import Data.Functor ((<$>), (<$))
import Data.Maybe (Maybe(..), maybe, catMaybes)
import Data.Monoid (Monoid(..))
import Data.Ord (Ord(..))
import Data.Semigroup (Semigroup(..))
import Data.List.NonEmpty (NonEmpty(..))
import Data.String (String, IsString(..))
import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
import System.IO (FilePath, IO)
import Text.Show (Show(..))
import qualified Control.Exception as Exn
import qualified Control.Monad.Trans.Reader as R
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Char as Char
import qualified Data.Char.Properties.XMLCharProps as XC
import qualified Data.HashMap.Strict as HM
import qualified Data.List as List
import qualified Data.Set as Set
import qualified Data.Sequence as Seq
import qualified Data.Text.Encoding.Error as TL
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.TreeSeq.Strict as TS
import qualified System.IO.Error as IO
import qualified Text.Megaparsec as P
import qualified Text.Megaparsec.Char as P
import Symantic.Base ()
import Symantic.XML.Language hiding (void)
import Symantic.XML.Tree.Source
import Symantic.XML.Tree.Data
readTree :: FilePath -> IO (Either String FileSourcedTrees)
readTree path =
readUtf8 path >>= \case
Left err -> return $ Left $ show err
Right txt -> return $
case runReadTree path txt of
Right a -> Right a
Left err -> Left $ P.errorBundlePretty err
runReadTree ::
FilePath -> TL.Text ->
Either (P.ParseErrorBundle TL.Text Error)
FileSourcedTrees
runReadTree = P.runParser $ R.runReaderT p_document defaultReadTreeInh
data ErrorRead
= ErrorRead_IO IO.IOError
| ErrorRead_Unicode TL.UnicodeException
deriving (Show)
readUtf8 :: FilePath -> IO (Either ErrorRead TL.Text)
readUtf8 path =
(left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile path)
`Exn.catch` \e ->
if IO.isAlreadyInUseError e
|| IO.isDoesNotExistError e
|| IO.isPermissionError e
then return $ Left $ ErrorRead_IO e
else IO.ioError e
type ReadTree e s a =
ReadTreeConstraints e s a =>
R.ReaderT ReadTreeInh (P.Parsec e s) a
type ReadTreeConstraints e s a =
( P.Stream s
, P.Token s ~ Char
, Ord e
, IsString (P.Tokens s)
, P.ShowErrorComponent e
)
data ReadTreeInh
= ReadTreeInh
{ readTreeInh_source :: FileSource Offset
, readTreeInh_ns_scope :: HM.HashMap NCName Namespace
, readTreeInh_ns_default :: Namespace
} deriving (Show)
defaultReadTreeInh :: ReadTreeInh
defaultReadTreeInh = ReadTreeInh
{ readTreeInh_source = FileSource $ pure $
FileRange mempty mempty mempty
, readTreeInh_ns_scope = HM.fromList
[ ("xml" , xmlns_xml)
, ("xmlns", xmlns_xmlns)
]
, readTreeInh_ns_default = ""
}
p_Offset :: ReadTree e s Offset
p_Offset = Offset <$> P.getOffset
{-# INLINE p_Offset #-}
p_Sourced :: ReadTree e s a -> ReadTree e s (Sourced (FileSource Offset) a)
p_Sourced pa = do
ReadTreeInh{readTreeInh_source} <- R.ask
b <- P.getParserState
let fileRange_path = P.sourceName $ P.pstateSourcePos $ P.statePosState b
let fileRange_begin = Offset $ P.stateOffset b
a <- pa
e <- P.getParserState
let fileRange_end = Offset $ P.stateOffset e
return $ Sourced (setSource FileRange{..} readTreeInh_source) a
setSource :: FileRange pos -> FileSource pos -> FileSource pos
setSource fileRange (FileSource (_curr:|next)) = FileSource (fileRange:|next)
p_SourcedBegin :: ReadTree e s a -> ReadTree e s a
p_SourcedBegin pa = do
b <- P.getParserState
let fileRange_path = P.sourceName $ P.pstateSourcePos $ P.statePosState b
let fileRange_begin = Offset $ P.stateOffset b
let fileRange_end = fileRange_begin
(`R.local` pa) $ \inh@ReadTreeInh{..} ->
inh{ readTreeInh_source = setSource FileRange{..} readTreeInh_source }
p_SourcedEnd :: ReadTree e s (a -> Sourced (FileSource Offset) a)
p_SourcedEnd = do
ReadTreeInh{..} <- R.ask
e <- P.getParserState
let fileRange_end = Offset $ P.stateOffset e
return $ Sourced $
(\(FileSource (curr:|path)) -> FileSource (curr{fileRange_end}:|path))
readTreeInh_source
data Error
= Error_CharRef_invalid Integer
| Error_EntityRef_unknown NCName
| Error_Closing_tag_unexpected QName QName
| Error_Attribute_collision QName
| Error_PI_reserved PName
| Error_Namespace_prefix_unknown NCName
| Error_Namespace_empty NCName
| Error_Namespace_reserved Namespace
| Error_Namespace_reserved_prefix NCName
deriving (Eq,Ord,Show)
instance P.ShowErrorComponent Error where
showErrorComponent = show
p_error :: e -> ReadTree e s a
p_error = P.fancyFailure . Set.singleton . P.ErrorCustom
p_quoted :: P.Tokens s ~ TL.Text => (Char -> ReadTree e s a) -> ReadTree e s a
p_quoted p =
P.between (P.char '"') (P.char '"') (p '"') <|>
P.between (P.char '\'') (P.char '\'') (p '\'')
p_until ::
P.Tokens s ~ TL.Text =>
(Char -> Bool) -> (Char, TL.Text) -> ReadTree e s TL.Text
p_until content (end, end_) =
(TL.concat <$>) $ P.many $
P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))
p_until1 ::
P.Tokens s ~ TL.Text =>
(Char -> Bool) -> (Char, TL.Text) -> ReadTree e s TL.Text
p_until1 content (end, end_) =
(TL.concat <$>) $ P.some $
P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))
p_document :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTrees
p_document = do
ps <- p_prolog
e <- p_Element
m <- p_Miscs
P.eof
return (ps <> pure e <> m)
p_prolog :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTrees
p_prolog = (<>)
<$> P.option Seq.empty (pure <$> p_XMLDecl)
<*> p_Miscs
p_Miscs :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTrees
p_Miscs = (Seq.fromList . catMaybes <$>) $ P.many $
Just <$> p_Comment <|>
Just <$> p_PI <|>
Nothing <$ p_Spaces1
p_XMLDecl :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_XMLDecl = do
Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
vi <- pure <$> p_VersionInfo
ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
sd <- P.option Seq.empty $ pure <$> p_SDDecl
p_Spaces
return $ vi <> ed <> sd
return $ TS.Tree (Sourced src $ NodePI "xml" "") as
p_VersionInfo :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_VersionInfo = do
Sourced src v <- p_Sourced $ do
P.try $ p_Spaces1 <* P.string "version"
p_Eq
p_quoted $ const $
(<>)
<$> P.string "1."
<*> P.takeWhile1P Nothing Char.isDigit
return $ TS.tree0 $ Sourced src $ NodePI "version" v
p_EncodingDecl :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_EncodingDecl = do
Sourced src v <- p_Sourced $ do
P.try $ p_Spaces1 <* P.string "encoding"
p_Eq
p_quoted $ const p_EncName
return $ TS.tree0 $ Sourced src $ NodePI "encoding" v
p_EncName :: P.Tokens s ~ TL.Text => ReadTree Error s TL.Text
p_EncName = P.label "EncName" $ do
P.notFollowedBy (P.satisfy $ not . isAlpha)
P.takeWhile1P Nothing $ \c ->
isAlpha c || Char.isDigit c ||
c=='.' || c=='_' || c=='-'
where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
p_SDDecl :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_SDDecl = do
Sourced src v <- p_Sourced $ do
P.try $ p_Spaces1 <* P.string "standalone"
p_Eq
p_quoted $ const $ P.string "yes" <|> P.string "no"
return $ TS.tree0 $ Sourced src $ NodePI "standalone" v
p_CharData :: P.Tokens s ~ TL.Text => ReadTree e s EscapedText
p_CharData = P.label "[^<&]" $ escapeText <$>
p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
p_Comment :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
p_Comment_ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_Comment_ = P.string "--" *> p_Comment__
p_Comment__:: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_Comment__ = do
c <- p_until XC.isXmlChar ('-', "-")
void $ P.string "-->"
src <- p_SourcedEnd
return $ TS.tree0 $ src $ NodeComment c
p_CDSect :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
p_CDSect_ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
p_CDSect__ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_CDSect__ = do
c <- p_until XC.isXmlChar (']', "]>")
void $ P.string "]]>"
src <- p_SourcedEnd
return $ TS.tree0 $ src $ NodeCDATA c
p_PI :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
p_PI_ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_PI_ = P.char '?' *> p_PI__
p_PI__ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_PI__ = do
n <- p_PITarget
v <- P.option "" $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
void $ P.string "?>"
src <- p_SourcedEnd
return $ TS.tree0 $ src $ NodePI n v
p_PITarget :: P.Tokens s ~ TL.Text => ReadTree Error s PName
p_PITarget = do
n <- p_PName
case n of
PName{pNameSpace=Nothing, pNameLocal=NCName l}
| "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
_ -> return n
p_Element :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
p_Element_ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_Element_ = p_STag
p_STag :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
p_STag = do
n <- p_PName
attrs <- P.many $ p_Attribute
p_Spaces
ro <- R.ask
elemNS :: HM.HashMap NCName Namespace <-
(HM.fromList . List.concat <$>) $ forM attrs $ \case
(PName{..}, Sourced _ av)
| ns <- Namespace $ unescapeAttr av
, Nothing <- pNameSpace
, NCName "xmlns" <- pNameLocal ->
case ns of
_ | ns == xmlns_xml
|| ns == xmlns_xmlns
-> p_error $ Error_Namespace_reserved ns
_ -> return [(NCName "" , ns)]
| ns <- Namespace $ unescapeAttr av
, Just (NCName "xmlns") <- pNameSpace ->
case unNCName pNameLocal of
"xml"
| ns == xmlns_xml -> return []
| otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
"xmlns"
-> p_error $ Error_Namespace_reserved_prefix pNameLocal
local | "xml" <- TL.toLower $ TL.take 3 local -> return []
_ | ns == xmlns_xml
|| ns == xmlns_xmlns
-> p_error $ Error_Namespace_reserved ns
_ -> return [(pNameLocal, ns)]
| otherwise -> return []
let scopeNS = elemNS <> readTreeInh_ns_scope ro
let defaultNS = HM.lookupDefault (readTreeInh_ns_default ro) (NCName "") scopeNS
let
lookupNamePrefix prefix =
maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
HM.lookup prefix scopeNS
elemName :: QName <-
case pNameSpace n of
Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
Just prefix
| NCName "xmlns" <- prefix ->
p_error $ Error_Namespace_reserved_prefix prefix
| otherwise -> do
ns <- lookupNamePrefix prefix
return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
elemAttrs :: [(QName, FileSourced EscapedAttr)] <-
forM attrs $ \(an, av) -> do
ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
return (qn, av)
let
attrsByQName :: HM.HashMap QName [FileSourced EscapedAttr] =
HM.fromListWith (<>) $ (<$> elemAttrs) $
\(an, av) -> (an, [av])
case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
(an, _):_ -> p_error $ Error_Attribute_collision an
_ -> return ()
content :: FileSourcedTrees <-
mempty <$ P.string "/>" <|>
R.local
(const ro
{ readTreeInh_ns_scope = scopeNS
, readTreeInh_ns_default = defaultNS
})
(P.char '>' *> p_content <* p_ETag elemName)
src <- p_SourcedEnd
return $ TS.Tree (src $ NodeElem elemName (List.head <$> attrsByQName)) content
p_Attribute :: P.Tokens s ~ TL.Text => ReadTree Error s (PName, FileSourced EscapedAttr)
p_Attribute =
p_SourcedBegin $ do
an <- P.try $ p_Spaces1 *> p_PName
void p_Eq
av <- p_AttrValue
src <- p_SourcedEnd
return (an, src av)
p_AttrValue :: P.Tokens s ~ TL.Text => ReadTree Error s EscapedAttr
p_AttrValue = p_quoted p_AttrValueText
p_AttrValueText :: P.Tokens s ~ TL.Text => Char -> ReadTree Error s EscapedAttr
p_AttrValueText q =
EscapedAttr . Seq.fromList <$> P.many (
p_Reference <|>
(if q /= '\"' then EscapedEntityRef entityRef_quot <$ P.char '"' else P.empty) <|>
EscapedPlain <$> P.label ("[^<&"<>[q]<>"]")
(P.takeWhile1P Nothing $ \c ->
XC.isXmlChar c &&
c `List.notElem` (q:"<&")
)
)
p_content :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTrees
p_content =
(Seq.fromList <$>) $ P.many $
(p_SourcedBegin $ do
P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
)
<|> (
(TS.tree0 <$>) $
p_Sourced $ NodeText . EscapedText . foldMap unEscapedText
<$> P.some (
p_CharData <|>
EscapedText . pure <$> p_Reference
)
)
p_ETag :: P.Tokens s ~ TL.Text => QName -> ReadTree Error s ()
p_ETag expected = do
got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
unless (got == expected) $
p_error $ Error_Closing_tag_unexpected got expected
p_PName :: P.Tokens s ~ TL.Text => ReadTree e s PName
p_PName = do
n <- p_NCName
s <- P.optional $ P.try $ P.char ':' *> p_NCName
return $ case s of
Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
Just l -> PName{pNameSpace=Just n , pNameLocal=l}
p_QName :: P.Tokens s ~ TL.Text => ReadTree Error s QName
p_QName = do
n <- p_NCName
s <- P.optional $ P.try $ P.char ':' *> p_NCName
ReadTreeInh{..} <- R.ask
case s of
Nothing -> return QName{qNameSpace=readTreeInh_ns_default, qNameLocal=n}
Just l ->
case HM.lookup n readTreeInh_ns_scope of
Nothing -> p_error $ Error_Namespace_prefix_unknown n
Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
p_NCName :: P.Tokens s ~ TL.Text => ReadTree e s NCName
p_NCName = P.label "NCName" $ NCName
<$ P.notFollowedBy (P.satisfy (not . XC.isXmlNCNameStartChar))
<*> P.takeWhile1P Nothing XC.isXmlNCNameChar
p_Reference :: P.Tokens s ~ TL.Text => ReadTree Error s Escaped
p_Reference =
EscapedCharRef <$> p_CharRef <|>
EscapedEntityRef <$> p_EntityRef
p_EntityRef :: P.Tokens s ~ TL.Text => ReadTree Error s EntityRef
p_EntityRef = do
ref <- P.char '&' *> p_NCName <* P.char ';'
EntityRef ref <$> lookupEntityRef ref
where
lookupEntityRef (NCName "lt" ) = pure "<"
lookupEntityRef (NCName "gt" ) = pure ">"
lookupEntityRef (NCName "amp" ) = pure "&"
lookupEntityRef (NCName "apos") = pure "'"
lookupEntityRef (NCName "quot") = pure "\""
lookupEntityRef n = p_error $ Error_EntityRef_unknown n
p_CharRef :: P.Tokens s ~ TL.Text => ReadTree Error s CharRef
p_CharRef =
do
ref <- readHexadecimal
<$ P.string "&#x"
<*> P.some P.hexDigitChar
<* P.char ';'
check ref
<|> do
ref <- readDecimal
<$ P.string "&#"
<*> P.some P.digitChar
<* P.char ';'
check ref
where
check i =
let c = toEnum (fromInteger i) in
if i <= toInteger (fromEnum (maxBound::Char))
&& XC.isXmlChar c
then pure $ CharRef c
else p_error $ Error_CharRef_invalid i
readInt :: Integer -> String -> Integer
readInt base digits =
sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
where
acc q r = q*base + r
(sign, digits1) =
case digits of
[] -> (1, digits)
c:ds | c == '-' -> (-1, ds)
| c == '+' -> ( 1, ds)
| otherwise -> ( 1, digits)
ord = toInteger . Char.ord
digToInt c
| Char.isDigit c = [ord c - ord '0']
| Char.isAsciiLower c = [ord c - ord 'a' + 10]
| Char.isAsciiUpper c = [ord c - ord 'A' + 10]
| otherwise = []
readDecimal :: String -> Integer
readDecimal = readInt 10
readHexadecimal :: String -> Integer
readHexadecimal = readInt 16
p_Char :: P.Tokens s ~ TL.Text => ReadTree e s Char
p_Char = P.label "XmlChar" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
{-# INLINE p_Char #-}
p_CRLF :: P.Tokens s ~ TL.Text => ReadTree e s Char
p_CRLF = P.char '\r' *> P.option '\n' (P.char '\n')
p_Space :: P.Tokens s ~ TL.Text => ReadTree e s Char
p_Space = P.label "space" $ P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
{-# INLINE p_Space #-}
p_Spaces :: P.Tokens s ~ TL.Text => ReadTree e s ()
p_Spaces = P.label "spaces" $ void $ P.takeWhileP Nothing XC.isXmlSpaceChar
{-# INLINE p_Spaces #-}
p_Spaces1 :: P.Tokens s ~ TL.Text => ReadTree e s ()
p_Spaces1 = P.label "spaces" $ void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
{-# INLINE p_Spaces1 #-}
p_separator :: P.Tokens s ~ TL.Text => Char -> ReadTree e s ()
p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces
p_Eq :: P.Tokens s ~ TL.Text => ReadTree e s ()
p_Eq = p_separator '='