{-DHUN| module for processing urls and downloading their content with respect to mediawiki DHUN-}

module UrlAnalyse
       (getpage, analyse, analyseFull, unify, WikiUrl, getLemma,
        FullWikiUrl, hostname, url, alternatives, lemma, wikiUrl, geturl,
        parses, geturl2, fullWikiUrlZero, getExpandedPage, getpage2)
       where

import qualified Network.Browser as Browser
import Network.HTTP.Conduit
import Data.ByteString.Lazy.Internal
import Network.URL as URL
import Network.HTTP
import Control.Monad
import Data.Maybe
import Data.List.Split
import Data.List
import Text.XML.HXT.Core
import Data.Tree.NTree.TypeDefs
import Codec.Binary.UTF8.String
import Control.Exception
import qualified Data.ByteString as BStr
import qualified Data.ByteString.UTF8 as UTF8Str

{-DHUN| This represents the main url of a wiki page, that is the url of the wiki page that should be converted to latex. It is a tuple wrapped in the Maybe monad, to deal with the case in which the url could not be parsed. The first element of the tuple is just the main url parsed with Network.URL. The second element is a list of urls. These urls are possible base urls for wiki pages. So en.wikipedia.org/wiki/Foobar has got the main url en.wikipedia.org/wiki/Foobar, and one of the base urls is en.wikipedia.org/wiki/. Base urls are important since the wiki source related to the main url might include subpages in wiki notation, that is [[JohnDow]]. The actual url to look up JohnDow is the baseurl plus JohnDow, so that is en.wikipedia.org/wiki/JohnDow. Since images on the commons and similar things are also possible, there are usually several baseurls to be looked at. This is why the base urls have to be a list. DHUN-}

type WikiUrl = Maybe (URL, [URL])

{-DHUN| same as WikiUrl, but with two additional fields: the hostname field is the name of the host, and lemma is the name of the page on the wiki DHUN-}

wikiUrl :: FullWikiUrl -> WikiUrl
wikiUrl fu = return (url fu, alternatives fu)

data FullWikiUrl = FullWikiUrl{url :: URL, alternatives :: [URL],
                               hostname :: String, lemma :: String}
                 deriving (Eq, Ord)

fullWikiUrlZero :: FullWikiUrl
fullWikiUrlZero
  = FullWikiUrl{url =
                  URL{URL.url_type =
                        Absolute
                          (Host{URL.protocol = HTTP False, URL.host = "",
                                URL.port = Nothing}),
                      url_path = "", url_params = []},
                alternatives = [], hostname = "", lemma = ""}

{-DHUN| returns the list of baseurls of a WikiUrl. The list may be empty if the WikiUrl has none. See the documentation on the type 'WikiUrl' to understand what a baseurl is. DHUN-}

parses :: WikiUrl -> [URL]
parses u
  = do uu <- maybeToList u
       snd uu

{-DHUN| takes a baseurl and a wiki lemma and creates the url under which the wiki source code of the lemma can be retrieved. The url en.wikipedia.org/wiki/FooBar has possibly many baseurls, one of which is en.wikipedia.org/wiki/. The wiki source of the lemma JohnDow can be retrieved from the wiki via the url en.wikipedia.org/wiki/Special:Export/JohnDow, which is just what this function returns. See also the documentation on the type 'WikiUrl' on what a baseurl is. DHUN-}

modpath :: String -> URL -> URL
modpath s u
  = u{url_path =
        if p /= [] then p ++ "/Special:Export/" ++ s else
          "Special:Export/" ++ s}
  where p = (url_path u)

modpathForExpansion :: URL -> URL
modpathForExpansion u
  = u{url_path =
        (if p /= [] then p ++ "/Special:ExpandTemplates" else
           "Special:ExpandTemplates"),
      url_params = []}
  where p = (url_path u)
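{-DHUN| Illustrative sketch, not part of the original module: a minimal example of how a lemma and a base url are combined into a Special:Export url with modpath, unify and exportURL, the same composition that getpage uses further below. The base url and the lemma JohnDow are assumptions chosen only for this example. DHUN-}

exportUrlExample :: Maybe String
exportUrlExample
  = do base <- importURL "http://en.wikipedia.org/wiki"
       -- expected to yield roughly Just "http://en.wikipedia.org/wiki/Special:Export/JohnDow"
       return (unify (exportURL (modpath "JohnDow" base)))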
{-DHUN| loads a webpage via http. The url of the webpage has to be given as a string in the first input parameter. The result of the request is returned as a String wrapped in the IO monad. DHUN-}

geturl :: String -> IO String
geturl u
  = Control.Exception.catch
      (do x <- Browser.browse $
                 do Browser.setAllowRedirects True
                    Browser.request $ getRequest u
          return (rspBody (snd x)))
      fun
  where fun :: ErrorCall -> IO String
        fun _ = return ""

geturl3 :: String -> String -> String -> IO String
geturl3 u d s
  = Control.Exception.catch
      (do req0 <- parseUrl u
          let req = urlEncodedBody
                      (map (\ (a, b) -> (UTF8Str.fromString a, UTF8Str.fromString b))
                         [("wpInput", d), ("removecomments", "1"), ("removenowiki", "1"),
                          ("generate_xml", "0"), ("contexttitle", s)])
                      req0
          res <- withManager $ httpLbs req
          return (unpackChars (responseBody res)))
      fun
  where fun :: ErrorCall -> IO String
        fun _ = return ""

geturl2 :: String -> IO BStr.ByteString
geturl2 u
  = Control.Exception.catch
      (do x <- Browser.browse $
                 do Browser.setAllowRedirects True
                    Browser.request $
                      Request{rqURI = rqURI dg, rqMethod = rqMethod dg,
                              rqHeaders = rqHeaders dg,
                              rqBody = UTF8Str.fromString (rqBody dg)}
          return (rspBody (snd x)))
      fun
  where fun :: ErrorCall -> IO BStr.ByteString
        fun _ = return (BStr.pack [])
        dg = getRequest u

{-DHUN| helper function to get the actual wiki source as a string out of a part of an xml tree returned by the xml parser. Only used by the function getTextContent DHUN-}

toText :: [NTree XNode] -> Maybe String
toText [NTree _ [NTree (XText l) []]] = Just l
toText _ = Nothing

{-DHUN| this function gets the actual wiki source of a wiki page out of the result String returned by the Special:Export function of mediawiki. You should not call this function directly since it may break the flow of control. Use the function getTextContent2 instead. DHUN-}

getTextContent :: String -> IO (Maybe String)
getTextContent z
  = do h <- runX
              ((readString [withValidate no, withParseHTML yes] z) >>>
                 (deep (isElem >>> hasName "text")))
       x <- return . toText $ h
       return (seq x x)

{-DHUN| this function gets the actual wiki source of a wiki page out of the result String returned by the Special:Export function of mediawiki. This function returns its result wrapped in the Maybe monad, so it can return the value Nothing in case of failure without breaking the flow of control. It is also wrapped in the IO monad since the xml parser used is bound to the IO monad. DHUN-}

getTextContent2 :: String -> IO (Maybe String)
getTextContent2 z
  = catchJust myfun (getTextContent z) (\ _ -> return Nothing)

getExpandedTextContent :: String -> IO (Maybe String)
getExpandedTextContent z
  = do h <- runX
              ((readString [withValidate no, withParseHTML yes] z) >>>
                 (deep
                    (isElem >>> hasName "textarea" >>>
                       (hasAttrValue "id" (\ g -> g == "output")))))
       x <- return . toText $ h
       return (seq x x)

getExpandedTextContent2 :: String -> IO (Maybe String)
getExpandedTextContent2 z
  = catchJust myfun (getExpandedTextContent z) (\ _ -> return Nothing)

{-DHUN| exception predicate interested in all exceptions. That means we catch all exceptions. This is needed in getTextContent2 so that the value Nothing is returned in case of an exception and the control flow is not interrupted DHUN-}

myfun :: SomeException -> Maybe ()
myfun _ = return ()
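{-DHUN| Illustrative sketch, not part of the original module: a minimal example of the download pipeline that getpage builds on, assuming network access and assuming the example export url below exists. It fetches the Special:Export xml for a single page with geturl and extracts the wiki source with getTextContent2. DHUN-}

fetchExample :: IO (Maybe String)
fetchExample
  = do xml <- geturl "http://en.wikipedia.org/wiki/Special:Export/Haskell"
       getTextContent2 xml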
{-DHUN| gets the wiki source code of a lemma on a wiki. The first parameter is the lemma, so JohnDow for en.wikipedia.org/wiki/JohnDow. The second parameter is the wikiurl. A wikiurl specifies the wiki from which the data should be downloaded. See the documentation on the type 'WikiUrl' for more information. Also see the documentation on the function 'analyse' to see how to create a 'WikiUrl'. This function returns the wiki source code of the lemma as a String. It is wrapped in the IO monad since it does an http request; it is also wrapped in the Maybe monad since it may not be able to retrieve the source. DHUN-}

getpage :: String -> WikiUrl -> IO (Maybe String)
getpage ss u
  = do l <- mapM (geturl . unify . exportURL . modpath ss) (parses u)
       ll <- mapM getTextContent2 l
       lll <- return (seq ll ll)
       return $ (listToMaybe $ concat (map maybeToList lll)) >>= (return . decodeString)

getpage2 :: String -> WikiUrl -> IO (Maybe (String, URL))
getpage2 ss u
  = do l <- mapM (geturl . unify . exportURL . modpath ss) (parses u)
       ll <- mapM getTextContent2 l
       lll <- return (seq ll ll)
       return $ (listToMaybe $ concat (map go (zip lll (parses u))))
  where go (Just xx, uu) = [(decodeString xx, uu)]
        go _ = []

getExpandedPage :: String -> String -> URL -> IO (Maybe String)
getExpandedPage ss d u
  = do l <- mapM ((\ x -> geturl3 x d ss) . unify . exportURL . modpathForExpansion) [u]
       ll <- mapM getExpandedTextContent2 l
       lll <- return (seq ll ll)
       return $ (listToMaybe $ concat (map maybeToList lll)) >>= (return . decodeString)

{-DHUN| unescapes the url escape sequences %20 (replaced by an underscore) and %25 (replaced by a percent sign) in url notation DHUN-}

unify :: [Char] -> [Char]
unify ('%' : ('2' : ('0' : xs))) = '_' : unify xs
unify ('%' : ('2' : ('5' : xs))) = '%' : unify xs
unify (x : xs) = x : (unify xs)
unify [] = []

{-DHUN| converts a url given as String into a WikiUrl. See the description of the type 'WikiUrl' on what that means. DHUN-}

analyse :: String -> WikiUrl
analyse s
  = do vv <- v
       ww <- importURL "http://commons.wikimedia.org/wiki"
       return (vv, z ++ [ww])
  where v = importURL s
        z :: [URL]
        z = do u <- maybeToList $ v
               l <- return $ splitOn "/" $ (unify (url_path u))
               x <- map (\ i -> if (length l) > i then [intercalate "/" (take i l)] else mzero) [0, 1, 2]
               map (\ i -> u{url_path = i}) x

{-DHUN| converts a url given as String into the lemma it points to on the wiki. DHUN-}

getLemma :: String -> Maybe String
getLemma s = z
  where v = importURL s
        z :: Maybe String
        z = do u <- v
               l <- return $ splitOn "/" $ (unify (url_path u))
               let x = if (length l) > 1 then drop 1 l else l
               let xx = if "index.php" `elem` x
                          then case dropWhile (/= "index.php") x of
                                 (_ : ys) -> ys
                                 _ -> []
                          else x
               return $ intercalate "/" xx

{-DHUN| converts a url given as String into the host (as String) it points to on the wiki. DHUN-}

getHost :: String -> Maybe String
getHost s = z
  where v = importURL s
        z :: Maybe String
        z = do u <- v
               case (url_type u) of
                 Absolute h -> return (URL.host h)
                 _ -> mzero

analyseFull :: String -> Maybe FullWikiUrl
analyseFull theUrl
  = do ana <- analyse theUrl
       l <- getLemma theUrl
       h <- getHost theUrl
       return $ FullWikiUrl{url = fst ana, alternatives = snd ana, hostname = h, lemma = l}
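{-DHUN| Illustrative sketch, not part of the original module: a hedged end-to-end example of the public interface. It analyses a wiki url (the example url is an assumption), prints the detected host and lemma, and downloads the wiki source of the page with getpage. DHUN-}

exampleMain :: IO ()
exampleMain
  = case analyseFull "http://en.wikipedia.org/wiki/Haskell" of
      Nothing -> putStrLn "could not parse the url"
      Just fu -> do putStrLn ("host:  " ++ hostname fu)
                    putStrLn ("lemma: " ++ lemma fu)
                    source <- getpage (lemma fu) (wikiUrl fu)
                    putStrLn (maybe "download failed" (take 200) source)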