{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Symantic.XML.Read
 ( module Symantic.XML.Read.Parser
 , module Symantic.XML.Read
 ) where

import Control.Arrow (left)
import Control.Applicative (Applicative(..), Alternative(..))
import Control.Monad (Monad(..), void, unless, forM, join)
import Data.Bool
import Data.Char (Char)
import Data.Default.Class (Default(..))
import Data.Either (Either(..))
import Data.Eq (Eq(..))
import Data.Foldable (Foldable(..))
import Data.Function (($), (.), const)
import Data.Functor ((<$>), (<$))
import Data.Maybe (Maybe(..), maybe)
import Data.Monoid (Monoid(..))
import Data.Ord (Ord(..))
import Data.Semigroup (Semigroup(..))
import Data.String (String)
import Data.TreeSeq.Strict (Tree(..))
import Data.Tuple (snd)
import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
import System.IO (FilePath, IO)
import Text.Megaparsec ((<?>))
import Text.Show (Show(..))
import qualified Control.Exception as Exn
import qualified Control.Monad.Trans.Reader as R
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Char as Char
import qualified Data.Char.Properties.XMLCharProps as XC
import qualified Data.HashMap.Strict as HM
import qualified Data.List as List
import qualified Data.Sequence as Seq
import qualified Data.Text.Encoding.Error as TL
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.TreeSeq.Strict as TS
import qualified System.IO.Error as IO
import qualified Text.Megaparsec as P
import qualified Text.Megaparsec.Char as P

import Symantic.XML.Document hiding (XML, XMLs)
import Symantic.XML.Read.Parser

readXML :: FilePath -> TL.Text -> Either (P.ParseErrorBundle TL.Text Error) XMLs
readXML filePath stateInput =
        snd $
        P.runParser'
         (R.runReaderT p_document def)
         P.State
         { P.stateInput
         , P.stateOffset = 0
         , P.statePosState = P.PosState
                 { P.pstateInput      = stateInput
                 , P.pstateOffset     = 0
                 , P.pstateSourcePos  = P.initialPos filePath
                 , P.pstateTabWidth   = P.pos1
                 , P.pstateLinePrefix = ""
                 }
         }

readFile :: FilePath -> IO (Either ErrorRead TL.Text)
readFile fp =
        (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp)
        `Exn.catch` \e ->
                if IO.isAlreadyInUseError e
                || IO.isDoesNotExistError e
                || IO.isPermissionError   e
                then return $ Left $ ErrorRead_IO e
                else IO.ioError e

-- * Type 'ErrorRead'
data ErrorRead
 =   ErrorRead_IO IO.IOError
 |   ErrorRead_Unicode TL.UnicodeException
 deriving (Show)

-- * Document
p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
p_document = do
        ps <- p_prolog
        e  <- p_Element
        ms <- P.many p_Misc
        P.eof
        return (ps <> pure e <> join (Seq.fromList ms))

-- ** Prolog
p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
p_prolog = do
        xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
        ms <- P.many p_Misc
        return (xmlDecl <> join (Seq.fromList ms))

-- ** Misc
p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
p_Misc =
        P.try (pure <$> p_Comment)
         <|> P.try (pure <$> p_PI)
         <|> pure <$> p_S

-- ** XMLDecl
p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
p_XMLDecl = P.label "XMLDecl" $ do
        Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
                vi <- pure <$> p_VersionInfo
                ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
                sd <- P.option Seq.empty $ pure <$> p_SDDecl
                p_Spaces
                return $ vi <> ed <> sd
        return $ Tree (Sourced src $ NodePI "xml" "") as

p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
p_VersionInfo = P.label "VersionInfo" $ do
        Sourced c v <- p_Sourced $ do
                P.try (() <$ p_Spaces1 <* P.string "version")
                p_Eq
                p_quoted $ const $ p_Sourced $
                        (<>)
                         <$> P.string "1."
                         <*> P.takeWhile1P Nothing Char.isDigit
        return $ Tree (Sourced c $ NodeAttr "version") $ pure $
                TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v

p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
p_EncodingDecl = P.label "EncodingDecl" $ do
        Sourced c v <- p_Sourced $ do
                P.try (() <$ p_Spaces1 <* P.string "encoding")
                p_Eq
                p_quoted $ const $ p_Sourced p_EncName
        return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
                TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v

p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
p_EncName = P.label "EncName" $ do
        P.notFollowedBy (P.satisfy $ not . isAlpha)
        P.takeWhile1P Nothing $ \c ->
                isAlpha c || Char.isDigit c ||
                c=='.' || c=='_' || c=='-'
        where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c

-- *** SDDecl
p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
p_SDDecl = P.label "SDDecl" $ do
        p_SourcedBegin $ do
                Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone")
                p_Eq
                v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
                return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
                        TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v

-- ** CharData
p_CharData :: P.Tokens s ~ TL.Text => Parser e s EscapedText
p_CharData =
        escapeText
         <$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")

-- ** Comment
p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_Comment_ = P.string "--" *> p_Comment__
p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
p_Comment__ = P.label "Comment" $ do
        c <- p_until XC.isXmlChar ('-', "-")
        void $ P.string "-->"
        cell <- p_SourcedEnd
        return $ TS.tree0 (cell $ NodeComment c)

-- ** CDATA
p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_CDSect__ = P.label "CDSect" $ do
        c <- p_until XC.isXmlChar (']', "]>")
        void $ P.string "]]>"
        cell <- p_SourcedEnd
        return $ TS.tree0 $ cell $ NodeCDATA c

-- ** PI
p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_PI_ = P.char '?' *> p_PI__
p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_PI__ = P.label "PI" $ do
        n <- p_PITarget
        v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
        void $ P.string "?>"
        cell <- p_SourcedEnd
        return $ TS.tree0 $ cell $ NodePI n v
p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
p_PITarget = do
        n <- p_PName
        case n of
         PName{pNameSpace=Nothing, pNameLocal=NCName l}
          | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
         _ -> return n

-- ** Element
p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
p_Element_ = P.label "Element" p_STag

-- *** STag
p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
p_STag = do
        n  <- p_PName
        as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
        p_Spaces
        ro <- R.ask
        elemNS :: HM.HashMap NCName Namespace <-
                (HM.fromList . List.concat <$>) $ forM as $ \case
                 Sourced _ (PName{..}, Sourced _ av)
                  | ns <- Namespace $ unescapeText av
                  , Nothing        <- pNameSpace
                  , NCName "xmlns" <- pNameLocal ->
                        -- NOTE: default namespace declaration.
                        case ns of
                         _ |  ns == xmlns_xml   -- DOC: it MUST NOT be declared as the default namespace
                           || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
                           -> p_error $ Error_Namespace_reserved ns
                         _ -> return [(NCName "" , ns)]
                  | ns <- Namespace $ unescapeText av
                  , Just (NCName "xmlns") <- pNameSpace ->
                        -- NOTE: namespace prefix declaration.
                        case unNCName pNameLocal of
                         "xml" -- DOC: It MAY, but need not, be declared,
                               -- and MUST NOT be bound to any other namespace name.
                               | ns == xmlns_xml -> return []
                               | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
                         "xmlns" -- DOC: It MUST NOT be declared
                                 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
                         local | "xml" <- TL.toLower $ TL.take 3 local -> return []
                               -- DOC: All other prefixes beginning with the three-letter
                               -- sequence x, m, l, in any case combination, are reserved.
                               -- This means that: processors MUST NOT treat them as fatal errors.
                         _ |  ns == xmlns_xml   -- DOC: Other prefixes MUST NOT be bound to this namespace name.
                           || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
                           -> p_error $ Error_Namespace_reserved ns
                         _ -> return [(pNameLocal, ns)]
                  | otherwise -> return []
        let scopeNS = elemNS <> reader_ns_scope ro
        let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
        let lookupNamePrefix prefix =
                maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
                HM.lookup prefix scopeNS
        elemName :: QName <-
                -- NOTE: expand element's QName.
                case pNameSpace n of
                 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
                  -- DOC: If there is a default namespace declaration in scope,
                  -- the expanded name corresponding to an unprefixed element name
                  -- has the URI of the default namespace as its namespace name.
                 Just prefix
                  | NCName "xmlns" <- prefix ->
                        -- DOC: Element names MUST NOT have the prefix xmlns.
                        p_error $ Error_Namespace_reserved_prefix prefix
                  | otherwise -> do
                        ns <- lookupNamePrefix prefix
                        return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
        elemAttrs :: [FileSourced (QName, FileSourced EscapedText)] <-
                -- NOTE: expand attributes' PName into QName.
                forM as $ \s@Sourced{unSourced=(an, av)} -> do
                        ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
                        let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
                        return s{unSourced=(qn, av)}
        -- NOTE: check for attribute collision.
        let attrsByQName :: HM.HashMap QName [FileSourced (QName, FileSourced EscapedText)] =
                HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
        case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
         (an, _):_ -> p_error $ Error_Attribute_collision an
         _ -> return ()
        elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
                forM elemAttrs $ \(Sourced sa (an, av)) -> do
                        return $ TS.Tree (Sourced sa $ NodeAttr an) $
                                pure $ TS.tree0 $ NodeText <$> av
        content :: XMLs <-
                elemAttrsXML <$ P.string "/>" <|>
                R.local
                 (const ro
                         { reader_ns_scope   = scopeNS
                         , reader_ns_default = defaultNS
                         })
                 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
        cell <- p_SourcedEnd
        return $ Tree (cell $ NodeElem elemName) content

-- *** Attribute
p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced (PName, FileSourced EscapedText))
p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue

p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced EscapedText)
p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText

p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (FileSourced EscapedText)
p_AttValueText q = p_Sourced $
        EscapedText . Seq.fromList <$> P.many
         ( p_Reference
         <|> EscapedPlain <$> P.takeWhile1P Nothing (\c ->
                XC.isXmlChar c &&
                c `List.notElem` (q:"<&'\">"))
         <|> EscapedEntityRef entityRef_gt <$ P.char '>'
         <|> (if q == '\''
                then EscapedEntityRef entityRef_quot <$ P.char '"'
                else EscapedEntityRef entityRef_apos <$ P.char '\'')
         )

-- * content
p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
p_content =
        (Seq.fromList <$>) $ P.many $
                (p_SourcedBegin $ do
                        P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
                        p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
                )
                <|> ((tree0 <$>) $ p_Sourced $ NodeText . mconcat
                         <$> P.some (p_CharData <|> EscapedText . pure <$> p_Reference))

-- *** ETag
p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
p_ETag expected = do
        got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
        unless (got == expected) $
                p_error $ Error_Closing_tag_unexpected got expected

-- * Name
p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
p_Name = P.label "Name" $
        Name
         <$  P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
         <*> P.takeWhile1P Nothing XC.isXmlNameChar

-- * PName
p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
p_PName = P.label "PName" $ do
        n <- p_NCName
        s <- P.optional $ P.try $ P.char ':' *> p_NCName
        return $ case s of
         Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
         Just l  -> PName{pNameSpace=Just n , pNameLocal=l}

-- * QName
p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
p_QName = P.label "QName" $ do
        n <- p_NCName
        s <- P.optional $ P.try $ P.char ':' *> p_NCName
        Reader{..} <- R.ask
        case s of
         Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
         Just l ->
                case HM.lookup n reader_ns_scope of
                 Nothing -> p_error $ Error_Namespace_prefix_unknown n
                 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}

-- ** NCName
p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
p_NCName = P.label "NCName" $
        NCName
         <$  P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
         <*> P.takeWhile1P Nothing XC.isXmlNCNameChar

-- * Reference
p_Reference :: P.Tokens s ~ TL.Text => Parser Error s Escaped
p_Reference =
        EscapedCharRef <$> p_CharRef <|>
        EscapedEntityRef <$> p_EntityRef

-- ** EntityRef
p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
p_EntityRef = P.label "EntityRef" $ do
        ref <- P.char '&' *> p_NCName <* P.char ';'
        EntityRef ref <$> lookupEntityRef ref
        where
        lookupEntityRef (NCName "lt"  ) = pure "<"
        lookupEntityRef (NCName "gt"  ) = pure ">"
        lookupEntityRef (NCName "amp" ) = pure "&"
        lookupEntityRef (NCName "apos") = pure "'"
        lookupEntityRef (NCName "quot") = pure "\""
        lookupEntityRef n = p_error $ Error_EntityRef_unknown n

-- ** CharRef
p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
p_CharRef = P.label "CharRef" $
        do
                ref <- readHexadecimal
                 <$  P.string "&#x"
                 <*> P.some P.hexDigitChar
                 <*  P.char ';'
                check ref
        <|> do
                ref <- readDecimal
                 <$  P.string "&#"
                 <*> P.some P.digitChar
                 <*  P.char ';'
                check ref
        where
        check i =
                let c = toEnum (fromInteger i) in
                if i <= toInteger (fromEnum (maxBound::Char))
                && XC.isXmlChar c
                then pure $ CharRef c
                else p_error $ Error_CharRef_invalid i

readInt :: Integer -> String -> Integer
readInt base digits =
        sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
        where
        acc q r = q*base + r
        (sign, digits1) =
                case digits of
                 [] -> (1, digits)
                 c:ds | c == '-'  -> (-1, ds)
                      | c == '+'  -> ( 1, ds)
                      | otherwise -> ( 1, digits)
        ord = toInteger . Char.ord
        digToInt c
         | Char.isDigit c      = [ord c - ord '0']
         | Char.isAsciiLower c = [ord c - ord 'a' + 10]
         | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
         | otherwise           = []

readDecimal :: String -> Integer
readDecimal = readInt 10

readHexadecimal :: String -> Integer
readHexadecimal = readInt 16

-- * Char
p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
{-# INLINE p_Char #-}

-- ** Space
-- | Map '\r' and '\r\n' to '\n'.
p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
p_CRLF = P.label "CRLF" $
        P.char '\r' *> P.option '\n' (P.char '\n')

p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
p_Space = P.label "Space" $
        P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
{-# INLINE p_Space #-}

p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
p_Spaces = P.label "Spaces" $
        void $ P.takeWhileP Nothing XC.isXmlSpaceChar
{-# INLINE p_Spaces #-}

p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
p_S = P.label "Spaces" $
        (\ts -> TS.tree0 (NodeText . EscapedText . pure . EscapedPlain . TL.concat <$> ts))
         <$> p_Sourced (P.some $
                P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
                TL.singleton <$> p_CRLF)

p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
p_Spaces1 = P.label "Spaces1" $
        void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
{-# INLINE p_Spaces1 #-}

-- * Eq
p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]

p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
p_Eq = p_separator '=' <?> "Eq"