{-# OPTIONS #-}

-- ------------------------------------------------------------

module Holumbus.Crawler.Html
where

import           Data.Function.Selector

import           Data.List
import           Data.Maybe

import           Holumbus.Crawler.Types
import           Holumbus.Crawler.URIs

import           System.FilePath

import           Text.XML.HXT.Core              hiding ( when
                                                       , getState
                                                       )
{- just for debugging
import qualified Debug.Trace                    as D
-- -}
-- ------------------------------------------------------------

defaultHtmlCrawlerConfig        :: AccumulateDocResult a r ->
                                   MergeDocResults r -> CrawlerConfig a r
defaultHtmlCrawlerConfig op op2 = ( setS theSysConfig
                                             ( withValidate no
                                               >>>
                                               withParseHTML yes
                                               >>>
                                               withInputEncoding isoLatin1
                                               >>>
                                               withWarnings no
                                               >>>
                                               withIgnoreNoneXmlContents yes
                                             )
                                    >>>
                                    setS thePreRefsFilter this
                                    >>>
                                    setS theProcessRefs getHtmlReferences
                                    $ 
                                    defaultCrawlerConfig op op2
                                  )

-- ------------------------------------------------------------

-- | Collect all HTML references to other documents within a, frame and iframe elements

getHtmlReferences               :: ArrowXml a => a XmlTree URI
getHtmlReferences               = fromLA (getRefs $< computeDocBase)
    where
    getRefs base                = deep (hasNameWith
                                        ( (`elem` ["a","frame","iframe"]) . localPart )
                                       )
                                  >>>
                                  ( getAttrValue0 "href"
                                    <+>
                                    getAttrValue0 "src"
                                  )
                                  >>^ toAbsRef base

getDocReferences                :: ArrowXml a => a XmlTree URI
getDocReferences                = fromLA (getRefs $< computeDocBase)
    where
    getRefs base                = multi selRefs >>^ toAbsRef base
        where
          hasLocName n          = hasNameWith ((== n) . localPart)
          selRef en an          = hasLocName en :-> getAttrValue0 an
          selRefs               = choiceA $
                                  map (uncurry selRef) names
                                  ++
                                  [ appletRefs
                                  , objectRefs
                                  , this :-> none
                                  ]
          names                 = [ ("img",     "src")
                                  , ("input",   "src")          -- input type="image" scr="..."
                                  , ("link",    "href")
                                  , ("script",  "src")
                                  ]                             
          appletRefs            = hasLocName "applet"   :-> (getAppRef $< getAppBase)
              where
                getAppBase      = (getAttrValue0 "codebase" `withDefault` ".") >>^ toAbsRef base
                getAppRef ab    = getAttrValue0 "code" >>^ toAbsRef ab

          objectRefs            = hasLocName "object"   :-> none        -- TODO

-- | construct an absolute URI by a base URI and a possibly relative URI

toAbsRef                        :: URI -> URI -> URI
toAbsRef base ref               = ( expandURIString ref                 -- here >>> is normal function composition
                                    >>>
                                    fromMaybe ref
                                    >>>
                                    removeFragment
                                  ) base
    where
    removeFragment r
        | "#" `isPrefixOf` path = reverse . tail $ path
        | otherwise             = r
        where
        path                    = dropWhile (/='#') . reverse $ r 

-- ------------------------------------------------------------

-- | Compute the base URI of a HTML page with respect to a possibly
--   given base element in the head element of a html page.
--
--   Stolen from Uwe Schmidt, http:\/\/www.haskell.org\/haskellwiki\/HXT
--   and then stolen back again by Uwe from Holumbus.Utility

computeDocBase                  :: ArrowXml a => a XmlTree String
computeDocBase                  = ( ( ( getByPath ["html", "head", "base"]
                                        >>>
                                        getAttrValue "href"                     -- and compute document base with transfer uri and base
                                      )
                                      &&&
                                      getAttrValue transferURI
                                    )
                                    >>> expandURI
                                  )
                                  `orElse`
                                  getAttrValue transferURI              -- the default: take the transfer uri

-- ------------------------------------------------------------

getByPath                       :: ArrowXml a => [String] -> a XmlTree XmlTree
getByPath                       = seqA . map (\ n -> getChildren >>> hasName n)

getHtmlTitle                    :: ArrowXml a => a XmlTree String
getHtmlTitle                    = getAllText $
                                  getByPath ["html", "head", "title"]

getHtmlPlainText                :: ArrowXml a => a XmlTree String
getHtmlPlainText                = getAllText $
                                  getByPath ["html", "body"]

getAllText                      :: ArrowXml a => a XmlTree XmlTree -> a XmlTree String
getAllText getText'             = ( getText'
                                    >>>
                                    ( fromLA $ deep getText )
                                    >>^
                                    (" " ++)                            -- text parts are separated by a space
                                  )
                                  >. (concat >>> normalizeWS)           -- normalize Space

isHtmlContents                  :: ArrowXml a => a XmlTree XmlTree
isHtmlContents                  = ( getAttrValue transferMimeType
                                    >>>
                                    isA ( `elem` [text_html, application_xhtml] )
                                  ) `guards` this

isPdfContents                   :: ArrowXml a => a XmlTree XmlTree
isPdfContents                   = ( getAttrValue transferMimeType
                                    >>>
                                    isA ( == application_pdf )
                                  ) `guards` this

getTitleOrDocName               :: ArrowXml a => a XmlTree String
getTitleOrDocName               = ( getHtmlTitle >>> isA (not . null) )
                                  `orElse`
                                  ( getAttrValue transferURI >>^ takeFileName )

isElemWithAttr                  :: ArrowXml a => String -> String -> (String -> Bool) -> a XmlTree XmlTree
isElemWithAttr en an av         = isElem
                                  >>>
                                  hasName en
                                  >>>
                                  hasAttrValue an av

-- ------------------------------------------------------------

application_pdf                 :: String
application_pdf                 = "application/pdf"

-- ------------------------------------------------------------

-- | normalize whitespace by splitting a text into words and joining this together with unwords

normalizeWS                     :: String -> String
normalizeWS                     = words >>> unwords

-- | take the first n chars of a string, if the input
--   is too long the cut off is indicated by \"...\" at the end

limitLength     :: Int -> String -> String
limitLength n s
    | length s' <= n            = s
    | otherwise                 = take (n - 3) s' ++ "..."
    where
    s'                          = take (n + 1) s

-- ------------------------------------------------------------