-- ------------------------------------------------------------

{- |
   Module     : Text.XML.HXT.Arrow.XmlRegex
   Copyright  : Copyright (C) 2008 Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
   Stability  : experimental
   Portability: portable

   Regular Expression Matcher working on lists of XmlTrees

   It's intended to import this module with an explicit
   import declaration for not spoiling the namespace
   with these somewhat special arrows

-}

-- ------------------------------------------------------------

module Text.XML.HXT.Arrow.XmlRegex
    ( XmlRegex
    , mkZero
    , mkUnit
    , mkPrim
    , mkPrim'
    , mkPrimA
    , mkDot
    , mkStar
    , mkAlt
    , mkAlts
    , mkSeq
    , mkSeqs
    , mkRep
    , mkRng
    , mkOpt
    , mkPerm
    , mkPerms
    , mkMerge
    , nullable
    , delta
    , matchXmlRegex
    , splitXmlRegex
    , scanXmlRegex
    , matchRegexA
    , splitRegexA
    , scanRegexA
    )
where

import Control.Arrow.ListArrows

import Data.Maybe

import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.DOM.ShowXml  ( xshow )

-- ------------------------------------------------------------
-- the exported regex arrows

-- | check whether a sequence of XmlTrees match an Xml regular expression
--
-- The arrow for 'matchXmlRegex'.
--
-- The expession is build up from simple arrows acting as predicate ('mkPrimA') for
-- an XmlTree and of the usual cobinators for sequence ('mkSeq'), repetition
-- ('mkStar', mkRep', 'mkRng') and choice ('mkAlt', 'mkOpt')

matchRegexA             :: XmlRegex -> LA XmlTree XmlTree -> LA XmlTree XmlTrees
matchRegexA re ts       = ts >>. (\ s -> maybe [s] (const []) . matchXmlRegex re $ s)

-- | split the sequence of trees computed by the filter a into
--
-- The arrow for 'splitXmlRegex'.
--
-- a first part matching the regex and a rest,
-- if a prefix of the input sequence does not match the regex, the arrow fails
-- else the pair containing the result lists is returned

splitRegexA             :: XmlRegex -> LA XmlTree XmlTree -> LA XmlTree (XmlTrees, XmlTrees)
splitRegexA re ts       = ts >>. (maybeToList . splitXmlRegex re)

-- | scan the input sequence with a regex and give the result as a list of lists of trees back
-- the regex must at least match one input tree, so the empty sequence should not match the regex
--
-- The arrow for 'scanXmlRegex'.

scanRegexA              :: XmlRegex -> LA XmlTree XmlTree -> LA XmlTree XmlTrees
scanRegexA re ts        = ts >>. (fromMaybe [] . scanXmlRegex re)

-- ------------------------------------------------------------

data XmlRegex   = Zero String
                | Unit
                | Sym  (XmlTree -> Bool) String    -- optional external repr. of predicate
                | Dot
                | Star  XmlRegex
                | Alt   XmlRegex XmlRegex
                | Seq   XmlRegex XmlRegex
                | Rep   Int      XmlRegex          -- 1 or more repetitions
                | Rng   Int Int  XmlRegex          -- n..m repetitions
                | Perm  XmlRegex XmlRegex
                | Merge XmlRegex XmlRegex

-- ------------------------------------------------------------

{- just for documentation

class Inv a where
    inv         :: a -> Bool

instance Inv XmlRegex where
    inv (Zero _)        = True
    inv Unit            = True
    inv (Sym p _)       = p holds for some XmlTrees
    inv Dot             = True
    inv (Star e)        = inv e
    inv (Alt e1 e2)     = inv e1 &&
                          inv e2
    inv (Seq e1 e2)     = inv e1 &&
                          inv e2
    inv (Rep i e)       = i > 0 && inv e
    inv (Rng i j e)     = (i < j || (i == j && i > 1)) &&
                          inv e
    inv (Perm e1 e2)    = inv e1 &&
                          inv e2
-}
-- ------------------------------------------------------------
--
-- smart constructors

mkZero          :: String -> XmlRegex
mkZero          = Zero

mkUnit          :: XmlRegex
mkUnit          = Unit

mkPrim          :: (XmlTree -> Bool) -> XmlRegex
mkPrim p        = Sym p ""

mkPrim'         :: (XmlTree -> Bool) -> String -> XmlRegex
mkPrim'         = Sym

mkPrimA         :: LA XmlTree XmlTree -> XmlRegex
mkPrimA a       = mkPrim (not . null . runLA a)

mkDot           :: XmlRegex
mkDot           = Dot

mkStar                  :: XmlRegex -> XmlRegex
mkStar (Zero _)         = mkUnit                -- {}* == ()
mkStar e@Unit           = e                     -- ()* == ()
mkStar e@(Star _e1)     = e                     -- (r*)* == r*
mkStar (Rep 1 e1)       = mkStar e1             -- (r+)* == r*
mkStar e@(Alt _ _)      = Star (rmStar e)       -- (a*|b)* == (a|b)*
mkStar e                = Star e

rmStar  :: XmlRegex -> XmlRegex
rmStar (Alt e1 e2)      = mkAlt (rmStar e1) (rmStar e2)
rmStar (Star e1)        = rmStar e1
rmStar (Rep 1 e1)       = rmStar e1
rmStar e1               = e1

mkAlt                                   :: XmlRegex -> XmlRegex -> XmlRegex
mkAlt e1            (Zero _)            = e1                            -- e1 u {} = e1
mkAlt (Zero _)      e2                  = e2                            -- {} u e2 = e2
mkAlt e1@(Star Dot) _e2                 = e1                            -- A* u e1 = A*
mkAlt _e1           e2@(Star Dot)       = e2                            -- e1 u A* = A*
mkAlt (Sym p1 e1)   (Sym p2 e2)         = mkPrim' (\ x -> p1 x || p2 x)  (e e1 e2) -- melting of predicates
                                          where
                                            e "" x2 = x2
                                            e x1 "" = x1
                                            e x1 x2 = x1 ++ "|" ++ x2
mkAlt e1            e2@(Sym _ _)        = mkAlt e2 e1                   -- symmetry: predicates always first
mkAlt e1@(Sym _ _)  (Alt e2@(Sym _ _) e3)
                                        = mkAlt (mkAlt e1 e2) e3        -- prepare melting of predicates
mkAlt (Alt e1 e2)   e3                  = mkAlt e1 (mkAlt e2 e3)        -- associativity
mkAlt e1 e2                             = Alt e1 e2

mkAlts                          :: [XmlRegex] -> XmlRegex
mkAlts                          = foldr mkAlt (mkZero "")

mkSeq                           :: XmlRegex -> XmlRegex -> XmlRegex
mkSeq e1@(Zero _) _e2           = e1
mkSeq _e1         e2@(Zero _)   = e2
mkSeq Unit        e2            = e2
mkSeq e1          Unit          = e1
mkSeq (Seq e1 e2) e3            = mkSeq e1 (mkSeq e2 e3)
mkSeq e1 e2                     = Seq e1 e2

mkSeqs                          :: [XmlRegex] -> XmlRegex
mkSeqs                          = foldr mkSeq mkUnit

mkRep           :: Int -> XmlRegex -> XmlRegex
mkRep 0 e                       = mkStar e
mkRep _ e@(Zero _)              = e
mkRep _ e@Unit                  = e
mkRep i e                       = Rep i e

mkRng   :: Int -> Int -> XmlRegex -> XmlRegex
mkRng 0  0  _e                  = mkUnit
mkRng 1  1  e                   = e
mkRng lb ub _e
    | lb > ub                   = Zero $
                                  "illegal range " ++
                                  show lb ++ ".." ++ show ub
mkRng _l _u e@(Zero _)          = e
mkRng _l _u e@Unit              = e
mkRng lb ub e                   = Rng lb ub e

mkOpt   :: XmlRegex -> XmlRegex
mkOpt   = mkRng 0 1

mkPerm                           :: XmlRegex -> XmlRegex -> XmlRegex
mkPerm e1@(Zero _) _             = e1
mkPerm _           e2@(Zero _)   = e2
mkPerm Unit        e2            = e2
mkPerm e1          Unit          = e1
mkPerm e1          e2            = Perm e1 e2

mkPerms                          :: [XmlRegex] -> XmlRegex
mkPerms                          = foldr mkPerm mkUnit

mkMerge                          :: XmlRegex -> XmlRegex -> XmlRegex
mkMerge e1@(Zero _) _            = e1
mkMerge _           e2@(Zero _)  = e2
mkMerge Unit        e2           = e2
mkMerge e1          Unit         = e1
mkMerge e1          e2           = Merge e1 e2

-- ------------------------------------------------------------

instance Show XmlRegex where
    show (Zero s)       = "{err:" ++ s ++ "}"
    show Unit           = "()"
    show (Sym _p "")    = "<pred>"
    show (Sym _p r )    = r
    show Dot            = "."
    show (Star e)       = "(" ++ show e ++ ")*"
    show (Alt e1 e2)    = "(" ++ show e1 ++ "|" ++ show e2 ++ ")"
    show (Seq e1 e2)    = show e1 ++ show e2
    show (Rep 1 e)      = "(" ++ show e ++ ")+"
    show (Rep i e)      = "(" ++ show e ++ "){" ++ show i ++ ",}"
    show (Rng 0 1 e)    = "(" ++ show e ++ ")?"
    show (Rng i j e)    = "(" ++ show e ++ "){" ++ show i ++ "," ++ show j ++ "}"
    show (Perm e1 e2)   = "(" ++ show e1 ++ show e2 ++ "|" ++ show e2 ++ show e1 ++ ")"
    show (Merge e1 e2)  = "(" ++ show e1 ++ "&" ++ show e2 ++ ")"

-- ------------------------------------------------------------

unexpected 		:: XmlTree -> String -> String
unexpected t e		= emsg e ++ (cut 80 . xshow) [t]
    where
      emsg ""           = "unexpected: "
      emsg s            = "expected: " ++ s ++ ", but got: "
      cut n s
          | null rest   = s'
          | otherwise   = s' ++ "..."
          where
            (s', rest)  = splitAt n s

-- ------------------------------------------------------------

nullable        :: XmlRegex -> Bool
nullable (Zero _)       = False
nullable Unit           = True
nullable (Sym _p _)     = False         -- assumption: p holds for at least one tree
nullable Dot            = False
nullable (Star _)       = True
nullable (Alt e1 e2)    = nullable e1 ||
                          nullable e2
nullable (Seq e1 e2)    = nullable e1 &&
                          nullable e2
nullable (Rep _i e)     = nullable e
nullable (Rng i _ e)    = i == 0 ||
                          nullable e
nullable (Perm e1 e2)   = nullable e1 &&
                          nullable e2
nullable (Merge e1 e2)  = nullable e1 &&
                          nullable e2

-- ------------------------------------------------------------

delta   :: XmlRegex -> XmlTree -> XmlRegex
delta e@(Zero _)   _    = e
delta Unit         c    = mkZero $ unexpected c ""
delta (Sym p e)    c
    | p c               = mkUnit
    | otherwise         = mkZero $ unexpected c e
delta Dot          _    = mkUnit
delta e@(Star e1)  c    = mkSeq (delta e1 c) e
delta (Alt e1 e2)  c    = mkAlt (delta e1 c) (delta e2 c)
delta (Seq e1 e2)  c
    | nullable e1       = mkAlt (mkSeq (delta e1 c) e2) (delta e2 c)
    | otherwise         = mkSeq (delta e1 c) e2
delta (Rep i e)    c    = mkSeq (delta e c) (mkRep (i-1) e)
delta (Rng i j e)  c    = mkSeq (delta e c) (mkRng ((i-1) `max` 0) (j-1) e)
delta (Perm e1 e2) c    = case e1' of
                            (Zero _) -> mkPerm e1 (delta e2 c)
                            _        -> mkPerm e1' e2
                          where
                          e1' = delta e1 c
delta (Merge e1 e2) c   = mkAlt (mkMerge (delta e1 c) e2)
                                (mkMerge e1 (delta e2 c))

-- ------------------------------------------------------------

delta'          :: XmlRegex -> XmlTrees -> XmlRegex
delta'          = foldl delta

-- | match a sequence of XML trees with a regular expression over trees
--
-- If the input matches, the result is Nothing, else Just an error message is returned

matchXmlRegex           :: XmlRegex -> XmlTrees -> Maybe String
matchXmlRegex e
    = res . delta' e
    where
    res (Zero er)       = Just er
    res re
        | nullable re   = Nothing       -- o.k.
        | otherwise     = Just $ "input does not match " ++ show e

-- ------------------------------------------------------------

-- | split a sequence of XML trees into a pair of a a matching prefix and a rest
--
-- If there is no matching prefix, Nothing is returned

splitXmlRegex           :: XmlRegex -> XmlTrees -> Maybe (XmlTrees, XmlTrees)
splitXmlRegex re        = splitXmlRegex' re []

splitXmlRegex'          :: XmlRegex -> XmlTrees -> XmlTrees -> Maybe (XmlTrees, XmlTrees)
splitXmlRegex' re res []
    | nullable re       = Just (reverse res, [])
    | otherwise         = Nothing

splitXmlRegex' (Zero _) _ _
                        = Nothing

splitXmlRegex' re res xs@(x:xs')
    | isJust res'       = res'
    | nullable re       = Just (reverse res, xs)
    | otherwise         = Nothing
    where
    re'  = delta re x
    res' = splitXmlRegex' re' (x:res) xs'

-- ------------------------------------------------------------

-- | scan a sequence of XML trees and split it into parts matching the given regex
--
-- If the parts cannot be split because of a missing match, or because of the
-- empty sequence as match, Nothing is returned

scanXmlRegex                            :: XmlRegex -> XmlTrees -> Maybe [XmlTrees]
scanXmlRegex re ts                      = scanXmlRegex' re (splitXmlRegex re ts)

scanXmlRegex'                           :: XmlRegex -> Maybe (XmlTrees, XmlTrees) -> Maybe [XmlTrees]
scanXmlRegex' _  Nothing                = Nothing
scanXmlRegex' _  (Just (rs, []))        = Just [rs]
scanXmlRegex' _  (Just ([], _))         = Nothing       -- re is nullable (the empty word matches), nothing split off
                                                        -- would give infinite list of empty lists
scanXmlRegex' re (Just (rs, rest))
    | isNothing res                     = Nothing
    | otherwise                         = Just (rs : fromJust res)
    where
    res = scanXmlRegex' re (splitXmlRegex re rest)

-- ------------------------------------------------------------