-- ------------------------------------------------------------ {- | Module : Text.XML.HXT.IO.GetHTTPLibCurl Copyright : Copyright (C) 2008 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : stable Portability: portable GET for http access with libcurl -} -- ------------------------------------------------------------ module Text.XML.HXT.IO.GetHTTPLibCurl ( getCont ) where import Control.Arrow ( first , (>>>) ) import Control.Concurrent.MVar import Control.Monad ( when ) import Data.Char ( isDigit , isSpace ) import Data.List ( isPrefixOf ) import Network.Curl import System.IO import System.IO.Unsafe ( unsafePerformIO ) import Text.ParserCombinators.Parsec ( parse ) import Text.XML.HXT.DOM.Util ( stringToLower ) import Text.XML.HXT.DOM.XmlKeywords import Text.XML.HXT.DOM.XmlOptions ( isTrueValue ) import Text.XML.HXT.Parser.ProtocolHandlerUtil ( parseContentType ) import Text.XML.HXT.Version -- ------------------------------------------------------------ -- -- the global flag for initializing curl in the 1. call isInitCurl :: MVar Bool isInitCurl = unsafePerformIO $ newMVar False initCurl :: IO () initCurl = do i <- takeMVar isInitCurl when (not i) ( do curl_global_init 3 return () ) putMVar isInitCurl True -- ------------------------------------------------------------ -- -- the http protocol handler implemented by calling libcurl -- () -- via the curl binding -- -- This function tries to support mostly all curl options concerning HTTP requests. -- The naming convetion is as follows: A curl option must be prefixed by the string -- \"curl\" and then written exactly as described in the curl man page -- (). -- -- Example: -- -- > getCont [("curl--user-agent","My first HXT app"),("curl-e","http://the.referer.url/")] "http://..." -- -- will set the user agent and the referer URL for this request. getCont :: [(String, String)] -> String -> IO (Either ([(String, String)], String) ([(String, String)], String)) getCont options uri = do initCurl resp <- curlGetResponse_ uri curlOptions -- dumpResponse return $ evalResponse resp where _dumpResponse r = do hPutStrLn stderr $ show $ respCurlCode r hPutStrLn stderr $ show $ respStatus r hPutStrLn stderr $ respStatusLine r hPutStrLn stderr $ show $ respHeaders r hPutStrLn stderr $ respBody r curlOptions = defaultOptions ++ concatMap (uncurry copt) options ++ standardOptions defaultOptions -- these options may be overwritten = [ CurlUserAgent ("hxt/" ++ hxt_version ++ " via libcurl") , CurlFollowLocation True ] standardOptions -- these options can't be overwritten = [ CurlFailOnError False , CurlHeader False , CurlNoProgress True ] evalResponse r | rc /= CurlOK = Left ( [ mkH transferStatus "999" , mkH transferMessage $ "curl library rc: " ++ show rc ] , "curl library error when requesting URI " ++ show uri ++ ": (curl return code=" ++ show rc ++ ") " ) | rs < 200 && rs >= 300 = Left ( contentT rsh ++ headers , "http error when accessing URI " ++ show uri ++ ": " ++ show rsl ) | otherwise = Right ( contentT rsh ++ headers, respBody r ) where mkH x y = (x, dropWhile isSpace y) headers = map (\ (k, v) -> mkH (httpPrefix ++ stringToLower k) v) rsh ++ statusLine (words rsl) contentT = map (first stringToLower) -- all header names to lowercase >>> filter ((== "content-type") . fst) -- select content-type header >>> reverse -- when libcurl is called with automatic redirects, there are more than one content-type headers >>> take 1 -- take the last one, (if at leat one is found) >>> map snd -- select content-type value >>> map ( either (const []) id . parse parseContentType "" -- parse the content-type for mimetype and charset ) >>> concat statusLine (vers : _code : msg) -- the status line of the curl response can be an old one, e.g. in the case of a redirect, = [ mkH transferVersion vers -- so the return code is taken from the status field, which is contains the last status , mkH transferMessage $ unwords msg , mkH transferStatus $ show rs ] statusLine _ = [] rc = respCurlCode r rs = respStatus r rsl = respStatusLine r rsh = respHeaders r -- ------------------------------------------------------------ copt :: String -> String -> [CurlOption] copt k v | "curl" `isPrefixOf` k = opt2copt (drop 4 k) v | k `elem` [a_proxy, a_redirect] = opt2copt k v | k == a_options_curl = curlOptionString v | otherwise = [] opt2copt :: String -> String -> [CurlOption] opt2copt k v | k `elem` ["-A", "--user-agent"] = [CurlUserAgent v] | k `elem` ["-b", "--cookie"] = [CurlCookie v] | k == "--connect-timeout" && isIntArg v = [CurlConnectTimeout $ read v] | k == "--crlf" = [CurlCRLF $ isTrue v] | k `elem` ["-d", "--data"] = [CurlPostFields $ lines v] | k `elem` ["-e", "--referer"] = [CurlReferer v] | k `elem` ["-H", "--header"] = [CurlHttpHeaders $ lines v] | k == "--ignore-content-length" = [CurlIgnoreContentLength $ isTrue v] | k `elem` ["-I", "--head"] = [CurlNoBody $ isTrue v] | k `elem` ["-L", "--location", a_redirect] = [CurlFollowLocation $ isTrue v] | k == "--max-filesize" && isIntArg v = [CurlMaxFileSizeLarge $ read v] | k `elem` ["-m", "--max-time"] && isIntArg v = [CurlTimeoutMS $ read v] | k `elem` ["-n", "--netrc"] = [CurlNetrcFile v] | k `elem` ["--ssl-verify-peer"] = [CurlSSLVerifyPeer $ read v] | k `elem` ["-R", "--remote-time"] = [CurlFiletime $ isTrue v] | k `elem` ["-u", "--user"] = [CurlUserPwd v] | k `elem` ["-U", "--proxy-user"] = [CurlProxyUserPwd v] | k `elem` ["-x", "--proxy"] = proxyOptions | k `elem` ["-X", "--request"] = [CurlCustomRequest v] | k `elem` ["-y", "--speed-time"] && isIntArg v = [CurlLowSpeedTime $ read v] | k `elem` ["-Y", "--speed-limit"] && isIntArg v = [CurlLowSpeed $ read v] | k `elem` ["-z", "--time-cond"] = ifModifiedOptions | k == "--max-redirs" && isIntArg v = [CurlMaxRedirs $ read v] | k `elem` ["-0", "--http1.0"] = [CurlHttpVersion HttpVersion10] | otherwise = [] where ifModifiedOptions | "-" `isPrefixOf` v && isIntArg v' = [CurlTimeCondition TimeCondIfUnmodSince ,CurlTimeValue $ read v' ] | isIntArg v = [CurlTimeCondition TimeCondIfModSince ,CurlTimeValue $ read v' ] | otherwise = [] where v' = tail v proxyOptions = [ CurlProxyPort pport , CurlProxy phost ] where pport | isIntArg ppp = read v | otherwise = 1080 (phost, pp) = span (/=':') v ppp = drop 1 pp isTrue :: String -> Bool isTrue s = null s || isTrueValue s isIntArg :: String -> Bool isIntArg s = not (null s) && all isDigit s curlOptionString :: String -> [CurlOption] curlOptionString = concatMap (uncurry copt) . opts . words where opts l | null l = [] | not ("-" `isPrefixOf` k) = opts l1 -- k not an option: ignore | null l1 = opts (k:"":l1) -- last option | "-" `isPrefixOf` v = (k, "") : opts (v:l) -- k option without arg | otherwise = (k, v) : opts l2 -- k with value where (k:l1) = l (v:l2) = l1 -- ------------------------------------------------------------