module Network.Shpider.URL
( module Network.URL
, isSameDomain
, mkAbsoluteUrl
, isAbsoluteUrl
, isMailto
, isHttp
, getDomain
, getFolder
)
where
import Network.URL
import Network.Shpider.State
import Text.Regex.Posix
import Network.Shpider.TextUtils
isSameDomain :: String -> String -> Bool
isSameDomain urlYourOn urlYourNot =
let url1 =
importURL urlYourOn
url2 =
importURL urlYourNot
in
maybe False
( \ t1 ->
maybe False
( \ t2 ->
let urlType1 = url_type t1
urlType2 = url_type t2
isRelative ut = ( ut == PathRelative ) || ( ut == HostRelative )
ut1Relative = isRelative urlType1
ut2Relative = isRelative urlType2
utsSame = urlType1 == urlType2
in
( ut1Relative || ut2Relative || utsSame )
)
url2
)
url1
mkAbsoluteUrl :: String -> Shpider ( Maybe String )
mkAbsoluteUrl uncleanUrl = do
if not $ isMailto uncleanUrl
then do
shpider <- get
let cleanUrlStr = escapeSpaces uncleanUrl
maybeUrl = importURL cleanUrlStr
currentAddr = addr $ currentPage shpider
maybe ( return Nothing )
( \ url ->
case url_type url of
PathRelative ->
return $ Just $ getFolder currentAddr ++ urlStr
HostRelative ->
return $ Just $ getDomain currentAddr ++ urlStr
_ ->
return $ Just urlStr
)
maybeUrl
else
return Nothing
where
urlStr =
escapeSpaces uncleanUrl
isAbsoluteUrl :: String -> Bool
isAbsoluteUrl urlStr =
case importURL urlStr of
Just url ->
case url_type url of
Absolute _ ->
True
_ ->
False
_ ->
False
isMailto :: String -> Bool
isMailto =
flip (=~) "mailto:.*"
isHttp :: String -> Bool
isHttp =
flip (=~) "(http://|https://).*"
getDomain :: String -> String
getDomain =
flip (=~) "[^:]+://[^/]+"
getFolder :: String -> String
getFolder url =
let ms =
url =~ "[^/]*/[^/]*"
l =
length ms
in
if l > 0
then
( concat $ take ( l 1 ) $ map head ms ) ++ "/"
else
""