-- | Validate a document against a dtd.
module Text.XML.HaXml.Validate
  ( validate
  , partialValidate
  ) where

import Prelude hiding (elem,rem,mod,sequence)
import qualified Prelude (elem)
import Text.XML.HaXml.Types
import Text.XML.HaXml.Combinators (multi,tag,iffind,literal,none,o)
import Text.XML.HaXml.XmlContent (attr2str)
import Maybe (fromMaybe,isNothing,fromJust)
import List (intersperse,nub,(\\))
import Char (isSpace)

#if __GLASGOW_HASKELL__ >= 604 || __NHC__ >= 118 || defined(__HUGS__)
-- emulate older finite map interface using Data.Map, if it is available
import qualified Data.Map as Map
type FiniteMap a b = Map.Map a b
listToFM :: Ord a => [(a,b)] -> FiniteMap a b
listToFM = Map.fromList
lookupFM :: Ord a => FiniteMap a b -> a -> Maybe b
lookupFM = flip Map.lookup
#elif __GLASGOW_HASKELL__ >= 504 || __NHC__ > 114
-- real finite map, if it is available
import Data.FiniteMap
#else
-- otherwise, a very simple and inefficient implementation of a finite map
type FiniteMap a b = [(a,b)]
listToFM :: Eq a => [(a,b)] -> FiniteMap a b
listToFM = id
lookupFM :: Eq a => FiniteMap a b -> a -> Maybe b
lookupFM fm k = lookup k fm
#endif

-- gather appropriate information out of the DTD
data SimpleDTD = SimpleDTD
    { elements   :: FiniteMap Name ContentSpec	-- content model of elem
    , attributes :: FiniteMap (Name,Name) AttType -- type of (elem,attr)
    , required   :: FiniteMap Name [Name]	-- required attributes of elem
    , ids        :: [(Name,Name)]	-- all (element,attr) with ID type
    , idrefs     :: [(Name,Name)]	-- all (element,attr) with IDREF type
    }

simplifyDTD :: DocTypeDecl -> SimpleDTD
simplifyDTD (DTD _ _ decls) =
    SimpleDTD
      { elements   = listToFM [ (name,content)
                              | Element (ElementDecl name content) <- decls ]
      , attributes = listToFM [ ((elem,attr),typ)
                              | AttList (AttListDecl elem attdefs) <- decls
                              , AttDef attr typ _ <- attdefs ]
      , required   = listToFM [ (elem, [ attr
                                       | AttDef attr _ REQUIRED <- attdefs ])
                              | AttList (AttListDecl elem attdefs) <- decls ]
      , ids        = [ (elem,attr)
                     | Element (ElementDecl elem _) <- decls
                     , AttList (AttListDecl name attdefs) <- decls
                     , elem == name
                     , AttDef attr (TokenizedType ID) _ <- attdefs ]
      , idrefs     = []	-- not implemented
      }

-- simple auxiliary to avoid lots of if-then-else with empty else clauses.
gives :: Bool -> a -> [a]
True `gives` x = [x]
False `gives` _ = []

-- | 'validate' takes a DTD and a tagged element, and returns a list of
--   errors in the document with respect to its DTD.
--
--   If you have several documents to validate against a single DTD,
--   then you will gain efficiency by freezing-in the DTD through partial
--   application, e.g. @checkMyDTD = validate myDTD@.
validate :: DocTypeDecl -> Element i -> [String]
validate dtd' elem = root dtd' elem ++ partialValidate dtd' elem
  where
    root (DTD name _ _) (Elem name' _ _) =
        (name/=name') `gives` ("Document type should be <"++name
                               ++"> but appears to be <"++name'++">.")

-- | 'partialValidate' is like validate, except that it does not check that
--   the element type matches that of the DTD's root element.
partialValidate :: DocTypeDecl -> Element i -> [String]
partialValidate dtd' elem = valid elem ++ checkIDs elem
  where
    dtd = simplifyDTD dtd'

    valid (Elem name attrs contents) =
        -- is the element defined in the DTD?
        let spec = lookupFM (elements dtd) name in 
        (isNothing spec) `gives` ("Element <"++name++"> not known.")
        -- is each attribute mentioned only once?
        ++ (let dups = duplicates (map fst attrs) in
            not (null dups) `gives`
               ("Element <"++name++"> has duplicate attributes: "
                ++concat (intersperse "," dups)++"."))
        -- does each attribute belong to this element?  value is in range?
        ++ concatMap (checkAttr name) attrs
        -- are all required attributes present?
        ++ concatMap (checkRequired name attrs)
                     (fromMaybe [] (lookupFM (required dtd) name))
        -- are its children in a permissible sequence?
        ++ checkContentSpec name (fromMaybe ANY spec) contents
        -- now recursively check the element children
        ++ concatMap valid [ elem | CElem elem _ <- contents ]

    checkAttr elem (attr, val) =
        let typ = lookupFM (attributes dtd) (elem,attr)
            attval = attr2str val in
        if isNothing typ then ["Attribute \""++attr
                               ++"\" not known for element <"++elem++">."]
        else
          case fromJust typ of
            EnumeratedType e ->
              case e of
                Enumeration es ->
                    (not (attval `Prelude.elem` es)) `gives`
                          ("Value \""++attval++"\" of attribute \""
                           ++attr++"\" in element <"++elem
                           ++"> is not in the required enumeration range: "
                           ++unwords es)
                _ -> []
            _ -> []

    checkRequired elem attrs req =
        (not (req `Prelude.elem` map fst attrs)) `gives`
            ("Element <"++elem++"> requires the attribute \""++req
             ++"\" but it is missing.")

    checkContentSpec _elem ANY _ = []
    checkContentSpec _elem EMPTY [] = []
    checkContentSpec  elem EMPTY (_:_) =
        ["Element <"++elem++"> is not empty but should be."]
    checkContentSpec  elem (Mixed PCDATA) cs = concatMap (checkMixed elem []) cs
    checkContentSpec  elem (Mixed (PCDATAplus names)) cs =
        concatMap (checkMixed elem names) cs
    checkContentSpec  elem (ContentSpec cp) cs = excludeText elem cs ++
        (let (errs,rest) = checkCP elem cp (flatten cs) in
         case rest of [] -> errs
                      _  -> errs++["Element <"++elem++"> contains extra "
                                  ++"elements beyond its content spec."])

    checkMixed  elem  permitted (CElem (Elem name _ _) _)
        | not (name `Prelude.elem` permitted) =
            ["Element <"++elem++"> contains an element <"++name
             ++"> but should not."]
    checkMixed _elem _permitted _ = []

    flatten (CElem (Elem name _ _) _: cs) = name: flatten cs
    flatten (_: cs)                       = flatten cs
    flatten []                            = []

    excludeText  elem (CElem _ _: cs) = excludeText elem cs
    excludeText  elem (CMisc _ _: cs) = excludeText elem cs
    excludeText  elem (CString _ s _: cs) | all isSpace s = excludeText elem cs
    excludeText  elem (_:  cs) =
        ["Element <"++elem++"> contains text/references but should not."]
    excludeText _elem [] = []

    -- This is a little parser really.  Returns any errors, plus the remainder
    -- of the input string.
    checkCP :: Name -> CP -> [Name] -> ([String],[Name])
    checkCP  elem cp@(TagName n None) [] = (cpError elem cp, [])
    checkCP  elem cp@(TagName n None) (n':ns)
        | n==n'     = ([], ns)
        | otherwise = (cpError elem cp, n':ns)
    checkCP _elem cp@(TagName n Query) [] = ([],[])
    checkCP _elem cp@(TagName n Query) (n':ns)
        | n==n'     = ([], ns)
        | otherwise = ([], n':ns)
    checkCP _elem cp@(TagName n Star) [] = ([],[])
    checkCP  elem cp@(TagName n Star) (n':ns)
        | n==n'     = checkCP elem (TagName n Star) ns
        | otherwise = ([], n':ns)
    checkCP  elem cp@(TagName n Plus) [] = (cpError elem cp, [])
    checkCP  elem cp@(TagName n Plus) (n':ns)
        | n==n'     = checkCP elem (TagName n Star) ns
        | otherwise = (cpError elem cp, n':ns)
 -- omit this clause, to permit (a?|b?) as a valid but empty choice
 -- checkCP elem cp@(Choice cps None) [] = (cpError elem cp, [])
    checkCP  elem cp@(Choice cps None) ns =
        let next = choice elem ns cps in
        if null next then (cpError elem cp, ns)
        else ([], head next)	-- choose the first alternative with no errors
    checkCP _elem cp@(Choice cps Query) [] = ([],[])
    checkCP  elem cp@(Choice cps Query) ns =
        let next = choice elem ns cps in
        if null next then ([],ns)
        else ([], head next)
    checkCP _elem cp@(Choice cps Star) [] = ([],[])
    checkCP  elem cp@(Choice cps Star) ns =
        let next = choice elem ns cps in
        if null next then ([],ns)
        else checkCP elem (Choice cps Star) (head next)
    checkCP  elem cp@(Choice cps Plus) [] = (cpError elem cp, [])
    checkCP  elem cp@(Choice cps Plus) ns =
        let next = choice elem ns cps in
        if null next then (cpError elem cp, ns)
        else checkCP elem (Choice cps Star) (head next)
 -- omit this clause, to permit (a?,b?) as a valid but empty sequence
 -- checkCP elem cp@(Seq cps None) [] = (cpError elem cp, [])
    checkCP  elem cp@(Seq cps None) ns =
        let (errs,next) = sequence elem ns cps in
        if null errs then ([],next)
        else (cpError elem cp++errs, ns)
    checkCP _elem cp@(Seq cps Query) [] = ([],[])
    checkCP  elem cp@(Seq cps Query) ns =
        let (errs,next) = sequence elem ns cps in
        if null errs then ([],next)
        else ([], ns)
    checkCP _elem cp@(Seq cps Star) [] = ([],[])
    checkCP  elem cp@(Seq cps Star) ns =
        let (errs,next) = sequence elem ns cps in
        if null errs then checkCP elem (Seq cps Star) next
        else ([], ns)
    checkCP  elem cp@(Seq cps Plus) [] = (cpError elem cp, [])
    checkCP  elem cp@(Seq cps Plus) ns =
        let (errs,next) = sequence elem ns cps in
        if null errs then checkCP elem (Seq cps Star) next
        else (cpError elem cp++errs, ns)

    choice elem ns cps =  -- return only those parses that don't give any errors
        [ rem | ([],rem) <- map (\cp-> checkCP elem (definite cp) ns) cps ]
        ++ [ ns | all possEmpty cps ]
        where definite (TagName n Query)  = TagName n None
              definite (Choice cps Query) = Choice cps None
              definite (Seq cps Query)    = Seq cps None
              definite (TagName n Star)   = TagName n Plus
              definite (Choice cps Star)  = Choice cps Plus
              definite (Seq cps Star)     = Seq cps Plus
              definite x                  = x
              possEmpty (TagName _ mod)   = mod `Prelude.elem` [Query,Star]
              possEmpty (Choice cps None) = all possEmpty cps
              possEmpty (Choice _ mod)    = mod `Prelude.elem` [Query,Star]
              possEmpty (Seq cps None)    = all possEmpty cps
              possEmpty (Seq _ mod)       = mod `Prelude.elem` [Query,Star]
    sequence elem ns cps =  -- accumulate errors down the sequence
        foldl (\(es,ns) cp-> let (es',ns') = checkCP elem cp ns
                             in (es++es', ns'))
              ([],ns) cps

    checkIDs elem =
        let celem = CElem elem undefined
            showAttr a = iffind a literal none
            idElems = concatMap (\(name,at)-> multi (showAttr at `o` tag name)
                                                    celem)
                                (ids dtd)
            badIds  = duplicates (map (\(CString _ s _)->s) idElems)
        in not (null badIds) `gives`
               ("These attribute values of type ID are not unique: "
                ++concat (intersperse "," badIds)++".")


cpError :: Name -> CP -> [String]
cpError elem cp =
    ["Element <"++elem++"> should contain "++display cp++" but does not."]


display :: CP -> String
display (TagName name mod) = name ++ modifier mod
display (Choice cps mod)   = "(" ++ concat (intersperse "|" (map display cps))
                             ++ ")" ++ modifier mod
display (Seq cps mod)      = "(" ++ concat (intersperse "," (map display cps))
                             ++ ")" ++ modifier mod

modifier :: Modifier -> String
modifier None  = ""
modifier Query = "?"
modifier Star  = "*"
modifier Plus  = "+"

duplicates :: Eq a => [a] -> [a]
duplicates xs = xs \\ (nub xs)