module Text.HTML.Tagchup.Process (
   Encoding, Encoded,
   evalDecodeAdaptive, decodeAdaptive, decodeTagAdaptive,
   getEmbeddedEncoding,
   getXMLEncoding,
   findMetaEncoding,
   getMetaHTTPHeaders,
   getHeadTags,
   partAttrs,
   parts,
   takeBeforeMatchingClose,
   takeUntilMatchingClose,
   ) where

import qualified Text.HTML.Tagchup.Tag as Tag
import qualified Text.HTML.Tagchup.Tag.Match as Match

import qualified Text.XML.Basic.ProcessingInstruction as PI
import qualified Text.XML.Basic.Attribute as Attr
import qualified Text.XML.Basic.Name as Name
import qualified Text.XML.Basic.Tag as TagX
import qualified Text.HTML.Basic.Tag as TagH
import qualified Text.HTML.Basic.Character as HTMLChar
import qualified Text.HTML.Basic.String as HTMLString

import Text.HTML.Basic.String (Encoded, )

import Control.Monad.Trans.State (State, put, get, evalState, )
import Control.Monad.HT ((<=<), )
import Control.Monad (msum, guard, )
import Control.Applicative ((<|>))

import qualified Data.NonEmpty as NonEmpty
import qualified Data.List.Match as ListMatch
import qualified Data.Foldable as Fold
import Data.Traversable (traverse, )
import Data.List.HT (viewL, takeUntil, switchR, )
import Data.Maybe.HT (toMaybe, )
import Data.Maybe (fromMaybe, mapMaybe, )


-- * analyse soup


type Encoding = String



evalDecodeAdaptive ::
   State (Encoded -> String) a -> a
evalDecodeAdaptive :: forall a. State (Encoded -> Encoded) a -> a
evalDecodeAdaptive =
   forall a b c. (a -> b -> c) -> b -> a -> c
flip forall s a. State s a -> s -> a
evalState forall a. a -> a
id

{- |
Selects a decoder dynamically according
to xml-encoding and meta-http-equiv tags.
The @?xml@ tag should only appear at the beginning of a document,
but we respect it at every occurence.

> import qualified Text.XML.HXT.DOM.Unicode as Unicode

> evalDecodeAdaptive .
> decodeAdaptive
>    (maybe Unicode.latin1ToUnicode (fst.) .
>     Unicode.getDecodingFct)
-}
decodeAdaptive ::
   (Name.Attribute name, Name.Tag name) =>
   (Encoding -> Encoded -> String) ->
   [Tag.T name [HTMLChar.T]] ->
   State (Encoded -> String) [Tag.T name String]
decodeAdaptive :: forall name.
(Attribute name, Tag name) =>
(Encoded -> Encoded -> Encoded)
-> [T name [T]] -> State (Encoded -> Encoded) [T name Encoded]
decodeAdaptive Encoded -> Encoded -> Encoded
getDecoder =
   forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
traverse (forall name.
(Attribute name, Tag name) =>
(Encoded -> Encoded -> Encoded)
-> T name [T] -> State (Encoded -> Encoded) (T name Encoded)
decodeTagAdaptive Encoded -> Encoded -> Encoded
getDecoder)

{- |
@decodeTagAdaptive decoderSelector tag@ generates a state monad,
with a decoder as state.
It decodes encoding specific byte sequences
using the current decoder
and XML references using a fixed table.
-}
decodeTagAdaptive ::
   (Name.Attribute name, Name.Tag name) =>
   (Encoding -> Encoded -> String) ->
   Tag.T name [HTMLChar.T] ->
   State (Encoded -> String) (Tag.T name String)
decodeTagAdaptive :: forall name.
(Attribute name, Tag name) =>
(Encoded -> Encoded -> Encoded)
-> T name [T] -> State (Encoded -> Encoded) (T name Encoded)
decodeTagAdaptive Encoded -> Encoded -> Encoded
getDecoder T name [T]
tag0 =
   do Encoded -> Encoded
decoder <- forall (m :: * -> *) s. Monad m => StateT s m s
get
      let tag1 :: T name Encoded
tag1 =
             -- this is less elegant than using maybeCData but lazier
             forall b a. b -> (a -> b) -> Maybe a -> b
maybe
                (forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Encoded -> Encoded) -> [T] -> Encoded
HTMLString.decode Encoded -> Encoded
decoder) T name [T]
tag0)
                (\(Name name
name, Encoded
s) ->
                   forall name string. Name name -> Encoded -> T name string
Tag.special Name name
name forall a b. (a -> b) -> a -> b
$
                      if forall name. Tag name => Name name
TagH.cdataName forall a. Eq a => a -> a -> Bool
== Name name
name
                        then Encoded -> Encoded
decoder Encoded
s
                        else Encoded
s)
                (forall name string. T name string -> Maybe (Name name, Encoded)
Tag.maybeSpecial T name [T]
tag0)
      forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
Fold.mapM_ (forall (m :: * -> *) s. Monad m => s -> StateT s m ()
put forall b c a. (b -> c) -> (a -> b) -> a -> c
. Encoded -> Encoded -> Encoded
getDecoder) forall a b. (a -> b) -> a -> b
$
         (do (Name name, [T name Encoded])
openTag <- forall name string.
T name string -> Maybe (Name name, [T name string])
Tag.maybeOpen T name Encoded
tag1
             forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name.
(Tag name, Attribute name) =>
Name name -> [T name Encoded] -> Maybe Encoded
TagH.maybeMetaEncoding (Name name, [T name Encoded])
openTag forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
                forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name string.
(Tag name, Attribute name) =>
Name name -> [T name string] -> Maybe string
TagH.maybeMetaCharset (Name name, [T name Encoded])
openTag)
         forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
         (forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name string.
(Tag name, Attribute name) =>
Name name -> T name string -> Maybe string
TagX.maybeXMLEncoding forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< forall name string.
T name string -> Maybe (Name name, T name string)
Tag.maybeProcessing T name Encoded
tag1)
      forall (m :: * -> *) a. Monad m => a -> m a
return T name Encoded
tag1


getEmbeddedEncoding ::
   (Name.Attribute name, Name.Tag name) =>
   [Tag.T name String] -> Maybe Encoding
getEmbeddedEncoding :: forall name.
(Attribute name, Tag name) =>
[T name Encoded] -> Maybe Encoded
getEmbeddedEncoding [T name Encoded]
leadingTags =
   let xmlEncoding :: Maybe Encoded
xmlEncoding = do
         (T name Encoded
t,[T name Encoded]
_) <- forall a. [a] -> Maybe (a, [a])
viewL [T name Encoded]
leadingTags
         (Name name
name, PI.Known [T name Encoded]
attrs) <- forall name string.
T name string -> Maybe (Name name, T name string)
Tag.maybeProcessing T name Encoded
t
         forall (f :: * -> *). Alternative f => Bool -> f ()
guard (forall name. Tag name => Name name
TagX.xmlName forall a. Eq a => a -> a -> Bool
== Name name
name)
         forall name string.
Attribute name =>
Name name -> [T name string] -> Maybe string
Attr.lookup forall name. Attribute name => Name name
Attr.encodingName [T name Encoded]
attrs

   in  forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum forall a b. (a -> b) -> a -> b
$
         Maybe Encoded
xmlEncoding forall a. a -> [a] -> [a]
:
         forall a b. (a -> b) -> [a] -> [b]
map
            (\(Name name, [T name Encoded])
tag ->
               forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name string.
(Tag name, Attribute name) =>
Name name -> [T name string] -> Maybe string
TagH.maybeMetaCharset (Name name, [T name Encoded])
tag forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
               forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name.
(Tag name, Attribute name) =>
Name name -> [T name Encoded] -> Maybe Encoded
TagH.maybeMetaEncoding (Name name, [T name Encoded])
tag)
            (forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe forall name string.
T name string -> Maybe (Name name, [T name string])
Tag.maybeOpen forall a b. (a -> b) -> a -> b
$ forall name string.
(Tag name, Attribute name) =>
[T name string] -> [T name string]
getHeadTags [T name Encoded]
leadingTags)


{- |
Check whether the first tag is an @xml@ processing instruction tag
and return the value of its @encoding@ attribute.
-}
getXMLEncoding ::
   (Name.Tag name, Name.Attribute name) =>
   [Tag.T name String] -> Maybe String
getXMLEncoding :: forall name.
(Tag name, Attribute name) =>
[T name Encoded] -> Maybe Encoded
getXMLEncoding [T name Encoded]
tags =
   do (T name Encoded
t,[T name Encoded]
_) <- forall a. [a] -> Maybe (a, [a])
viewL [T name Encoded]
tags
      forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name string.
(Tag name, Attribute name) =>
Name name -> T name string -> Maybe string
TagX.maybeXMLEncoding forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< forall name string.
T name string -> Maybe (Name name, T name string)
Tag.maybeProcessing T name Encoded
t

{- |
Rather the same as @wraxml:HTML.Tree.findMetaEncoding@
-}
findMetaEncoding ::
   (Name.Tag name, Name.Attribute name) =>
   [Tag.T name String] -> Maybe String
findMetaEncoding :: forall name.
(Tag name, Attribute name) =>
[T name Encoded] -> Maybe Encoded
findMetaEncoding =
   forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall a b. (a -> b) -> [a] -> [b]
map (forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name.
(Tag name, Attribute name) =>
Name name -> [T name Encoded] -> Maybe Encoded
TagH.maybeMetaEncoding forall (m :: * -> *) b c a.
Monad m =>
(b -> m c) -> (a -> m b) -> a -> m c
<=< forall name string.
T name string -> Maybe (Name name, [T name string])
Tag.maybeOpen) forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall name string.
(Tag name, Attribute name) =>
[T name string] -> [T name string]
getHeadTags


{- |
Extract META tags which contain HTTP-EQUIV attribute
and present these values like HTTP headers.
-}
getMetaHTTPHeaders ::
   (Name.Tag name, Name.Attribute name) =>
   [Tag.T name string] -> [(string, string)]
getMetaHTTPHeaders :: forall name string.
(Tag name, Attribute name) =>
[T name string] -> [(string, string)]
getMetaHTTPHeaders =
   forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe (forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name string.
(Tag name, Attribute name) =>
Name name -> [T name string] -> Maybe (string, string)
TagH.maybeMetaHTTPHeader forall (m :: * -> *) b c a.
Monad m =>
(b -> m c) -> (a -> m b) -> a -> m c
<=< forall name string.
T name string -> Maybe (Name name, [T name string])
Tag.maybeOpen) forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall name string.
(Tag name, Attribute name) =>
[T name string] -> [T name string]
getHeadTags


getHeadTags ::
   (Name.Tag name, Name.Attribute name) =>
   [Tag.T name string] -> [Tag.T name string]
getHeadTags :: forall name string.
(Tag name, Attribute name) =>
[T name string] -> [T name string]
getHeadTags =
   forall a. (a -> Bool) -> [a] -> [a]
takeWhile (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall name string. Tag name => Encoded -> T name string -> Bool
Match.closeLit Encoded
"head") forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall a. Int -> [a] -> [a]
drop Int
1 forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall a. (a -> Bool) -> [a] -> [a]
dropWhile (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall name string. Tag name => Encoded -> T name string -> Bool
Match.openNameLit Encoded
"head") forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall a. (a -> Bool) -> [a] -> [a]
takeWhile (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall name string. Tag name => Encoded -> T name string -> Bool
Match.openNameLit Encoded
"body")


-- * transform soup

{- |
Modify attributes and tags of certain parts.
For limitations, see 'parts'.
-}
partAttrs ::
   (Name.Tag name) =>
   (Tag.Name name -> Bool) ->
   (([Attr.T name string], [Tag.T name string]) ->
    ([Attr.T name string], [Tag.T name string])) ->
   [Tag.T name string] -> [Tag.T name string]
partAttrs :: forall name string.
Tag name =>
(Name name -> Bool)
-> (([T name string], [T name string])
    -> ([T name string], [T name string]))
-> [T name string]
-> [T name string]
partAttrs Name name -> Bool
p ([T name string], [T name string])
-> ([T name string], [T name string])
f =
   forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
      (forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either
          (\((Name name
name,[T name string]
attrs),[T name string]
part) ->
              let ([T name string]
newAttrs, [T name string]
newPart) = ([T name string], [T name string])
-> ([T name string], [T name string])
f ([T name string]
attrs, [T name string]
part)
              in  forall name string. Name name -> [T name string] -> T name string
Tag.Open Name name
name [T name string]
newAttrs forall a. a -> [a] -> [a]
: [T name string]
newPart forall a. [a] -> [a] -> [a]
++ [forall name string. Name name -> T name string
Tag.Close Name name
name])
          forall a. a -> a
id) forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall name string.
Tag name =>
(Name name -> Bool)
-> [T name string]
-> [Either
      ((Name name, [T name string]), [T name string]) [T name string]]
parts Name name -> Bool
p

{- |
Extract parts from the tag soup
that are enclosed in corresponding open and close tags.
If a close tag is missing, the soup end is considered as end of the part.
However nested tags are not supported,
e.g. in @\<a\>\<a\>\<\/a\>\<\/a\>@ the second @\<a\>@ is considered
to be enclosed in the first @\<a\>@ and the first @\<\/a\>@,
and the second @\<\/a\>@ is ignored.
-}
parts ::
   (Name.Tag name) =>
   (Tag.Name name -> Bool) ->
   [Tag.T name string] ->
   [Either
       ((Tag.Name name, [Attr.T name string]), [Tag.T name string])
       [Tag.T name string]]
parts :: forall name string.
Tag name =>
(Name name -> Bool)
-> [T name string]
-> [Either
      ((Name name, [T name string]), [T name string]) [T name string]]
parts Name name -> Bool
p =
   let recourse :: [T name string]
-> [Either
      ((Name name, [T name string]), [T name string]) [T name string]]
recourse [T name string]
ts =
          let ([T name string]
prefix0,[T name string]
suffix0) = forall a. (a -> Bool) -> [a] -> ([a], [a])
break (forall name string.
(Name name -> Bool)
-> ([T name string] -> Bool) -> T name string -> Bool
Match.open Name name -> Bool
p (forall a b. a -> b -> a
const Bool
True)) [T name string]
ts
          in  forall a b. b -> Either a b
Right [T name string]
prefix0 forall a. a -> [a] -> [a]
:
              forall a. a -> Maybe a -> a
fromMaybe []
                 (do (T name string
t, [T name string]
suffix1) <- forall a. [a] -> Maybe (a, [a])
viewL [T name string]
suffix0
                     (Name name
name, [T name string]
attrs) <- forall name string.
T name string -> Maybe (Name name, [T name string])
Tag.maybeOpen T name string
t
                     let ([T name string]
part,[T name string]
suffix2) = forall a. (a -> Bool) -> [a] -> ([a], [a])
break (forall name string. (Name name -> Bool) -> T name string -> Bool
Match.close (Name name
nameforall a. Eq a => a -> a -> Bool
==)) [T name string]
suffix1
                     forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a b. a -> Either a b
Left ((Name name
name, [T name string]
attrs), [T name string]
part) forall a. a -> [a] -> [a]
: [T name string]
-> [Either
      ((Name name, [T name string]), [T name string]) [T name string]]
recourse (forall a. Int -> [a] -> [a]
drop Int
1 [T name string]
suffix2))
   in  forall {string}.
[T name string]
-> [Either
      ((Name name, [T name string]), [T name string]) [T name string]]
recourse



nestDiff :: (Eq name) => TagH.Name name -> Tag.T name string -> Int
nestDiff :: forall name string. Eq name => Name name -> T name string -> Int
nestDiff Name name
name T name string
tag =
   forall a. Enum a => a -> Int
fromEnum (forall name string.
(Name name -> Bool)
-> ([T name string] -> Bool) -> T name string -> Bool
Match.open (Name name
nameforall a. Eq a => a -> a -> Bool
==) (forall a b. a -> b -> a
const Bool
True) T name string
tag)
   forall a. Num a => a -> a -> a
-
   forall a. Enum a => a -> Int
fromEnum (forall name string. (Name name -> Bool) -> T name string -> Bool
Match.close (Name name
nameforall a. Eq a => a -> a -> Bool
==) T name string
tag)

countNesting :: (a -> Int) -> [a] -> [Int]
countNesting :: forall a. (a -> Int) -> [a] -> [Int]
countNesting a -> Int
p = forall (f :: * -> *) a. T f a -> f a
NonEmpty.tail forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (f :: * -> *) b a.
Traversable f =>
(b -> a -> b) -> b -> f a -> T f b
NonEmpty.scanl forall a. Num a => a -> a -> a
(+) Int
0 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map a -> Int
p


{-
Could be moved to utility-ht, but there seem to be several useful variants
with respect to whether opening and closing element should be included.
-}
{- |
> Process> let parenDiff c = case c of '(' -> 1; ')' -> -1; _ -> 0
> Process> takeBeforeMatch parenDiff "((abc)de)f"
> "((abc)de"
-}
takeBeforeMatch :: (a -> Int) -> [a] -> [a]
takeBeforeMatch :: forall a. (a -> Int) -> [a] -> [a]
takeBeforeMatch a -> Int
p [a]
xs =
   forall a b c. (a -> b -> c) -> b -> a -> c
flip forall b a. [b] -> [a] -> [a]
ListMatch.take [a]
xs forall a b. (a -> b) -> a -> b
$ forall a. (a -> Bool) -> [a] -> [a]
takeWhile (forall a. Ord a => a -> a -> Bool
>Int
0) forall a b. (a -> b) -> a -> b
$ forall a. (a -> Int) -> [a] -> [Int]
countNesting a -> Int
p [a]
xs

{- |
Take all tags until the one that matches the opening tag.
The matching closing tag is not included in the list.
The list must begin with the according opening tag.
Nesting of the considered tag is respected,
but the nesting of other tags is ignored.
-}
takeBeforeMatchingClose ::
   (Eq name) => TagH.Name name -> [Tag.T name string] -> [Tag.T name string]
takeBeforeMatchingClose :: forall name string.
Eq name =>
Name name -> [T name string] -> [T name string]
takeBeforeMatchingClose Name name
name = forall a. (a -> Int) -> [a] -> [a]
takeBeforeMatch forall a b. (a -> b) -> a -> b
$ forall name string. Eq name => Name name -> T name string -> Int
nestDiff Name name
name


{- |
> Process> takeUntilMatch parenDiff "((abc)de)f"
> Just "((abc)de)"
-}
takeUntilMatch :: (a -> Int) -> [a] -> Maybe [a]
takeUntilMatch :: forall a. (a -> Int) -> [a] -> Maybe [a]
takeUntilMatch a -> Int
p [a]
xs =
   (\[Int]
ys -> forall b a. b -> ([a] -> a -> b) -> [a] -> b
switchR (forall a. a -> Maybe a
Just []) (\[Int]
_ Int
y -> forall a. Bool -> a -> Maybe a
toMaybe (Int
yforall a. Eq a => a -> a -> Bool
==Int
0) forall a b. (a -> b) -> a -> b
$ forall b a. [b] -> [a] -> [a]
ListMatch.take [Int]
ys [a]
xs) [Int]
ys) forall a b. (a -> b) -> a -> b
$
   forall a. (a -> Bool) -> [a] -> [a]
takeUntil (forall a. Eq a => a -> a -> Bool
==Int
0) forall a b. (a -> b) -> a -> b
$ forall a. (a -> Int) -> [a] -> [Int]
countNesting a -> Int
p [a]
xs

{- |
This is like 'takeBeforeMatchingClose'
but the matching close tag is included in the result.
-}
takeUntilMatchingClose ::
   (Eq name) =>
   TagH.Name name -> [Tag.T name string] -> Maybe [Tag.T name string]
takeUntilMatchingClose :: forall name string.
Eq name =>
Name name -> [T name string] -> Maybe [T name string]
takeUntilMatchingClose Name name
name = forall a. (a -> Int) -> [a] -> Maybe [a]
takeUntilMatch forall a b. (a -> b) -> a -> b
$ forall name string. Eq name => Name name -> T name string -> Int
nestDiff Name name
name