{- - - Copyright (c) 2009-2010 Johnny Morrice - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without - restriction, including without limitation the rights to use, copy, - modify, merge, publish, distribute, sublicense, and/or sell copies - of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - -} module Network.Shpider.URL ( module Network.URL , isSameDomain , mkAbsoluteUrl , isAbsoluteUrl , isMailto , isHttp , getDomain , getFolder ) where import Network.URL import Network.Shpider.State import Text.Regex.Posix import Network.Shpider.TextUtils -- | is the second url on the same domain as the first? Note: this will return False if either URL is invalid. isSameDomain :: String -> String -> Bool isSameDomain urlYourOn urlYourNot = let url1 = importURL urlYourOn url2 = importURL urlYourNot in maybe False ( \ t1 -> maybe False ( \ t2 -> let urlType1 = url_type t1 urlType2 = url_type t2 isRelative ut = ( ut == PathRelative ) || ( ut == HostRelative ) ut1Relative = isRelative urlType1 ut2Relative = isRelative urlType2 utsSame = urlType1 == urlType2 in ( ut1Relative || ut2Relative || utsSame ) ) url2 ) url1 -- | Assumes the given URL is relative to `currentPage`. mkAbsoluteUrl :: String -> Shpider ( Maybe String ) mkAbsoluteUrl uncleanUrl = do if not $ isMailto uncleanUrl then do shpider <- get let cleanUrlStr = escapeSpaces uncleanUrl maybeUrl = importURL cleanUrlStr currentAddr = addr $ currentPage shpider maybe ( return Nothing ) ( \ url -> case url_type url of PathRelative -> return $ Just $ getFolder currentAddr ++ urlStr HostRelative -> return $ Just $ getDomain currentAddr ++ urlStr _ -> return $ Just urlStr ) maybeUrl else return Nothing where urlStr = escapeSpaces uncleanUrl -- | True if the url is absolute isAbsoluteUrl :: String -> Bool isAbsoluteUrl urlStr = case importURL urlStr of Just url -> case url_type url of Absolute _ -> True _ -> False _ -> False -- | is the given string of form \"mailto:person.com\"? isMailto :: String -> Bool isMailto = flip (=~) "mailto:.*" -- | is the url a http url? isHttp :: String -> Bool isHttp = flip (=~) "(http://|https://).*" -- | Get the protocol and domain from a URL eg -- -- @ -- getDomain \"widdle:\/\/owqueer.co.uk\/strangeanticsofsailors\/jimmy\" -- -- \"widdle:\/\/owqueer.co.uk\" -- @ getDomain :: String -> String getDomain = flip (=~) "[^:]+://[^/]+" -- | Get the whole url up to and including the current folder of the present document. -- -- @ -- getFolder \"widdle:\/\/owqueer.co.uk\/strangeanticsofsailors\/jimmy\" -- -- \"widdle:\/\/owqueer.co.uk\/strangeanticsofsailors\/\" -- @ getFolder :: String -> String getFolder url = let ms = url =~ "[^/]*/[^/]*" l = length ms in if l > 0 then ( concat $ take ( l - 1 ) $ map head ms ) ++ "/" else ""