import Text.HTML.TagSoup as T
import Control.Monad
import Control.Monad.Reader
import Control.Monad.State
import Data.Maybe
import Data.Char
import Data.List
import System
import Network.Stream
import Network.URI
import Network.HTTP
import qualified Data.Map as M
-- Stolen from GHC.Exts
sortWith :: Ord b => (a -> b) -> [a] -> [a]
sortWith f = sortBy (\x y -> compare (f x) (f y))
main :: IO ()
main =
fmap makeConfig getArgs
>>= runReaderT (
(ask >>= liftIO . putStrLn . ("\n"++) . (++"\n") . show)
>> makeFilesList
>>= processFiles
>>= displayResults)
data Config = Config
{ configSeparator :: Char
, configMappings :: [(String,URI)]
, configNoReportCodes :: [Integer]
} deriving Show
-- For storing URIs in Data.Map
instance Ord URI where
x < y = show x < show y
x <= y = show x <= show y
type FileProcessingResult = (FilePath, Either String [HrefProcessingResult])
type HrefProcessingResult = (Href, URIProcessingResult)
type Href = String
type URIProcessingResult = Result (Response String)
type URIMap = M.Map URI URIProcessingResult
defaultConfig :: Config
defaultConfig =
Config { configSeparator = '\n'
, configMappings = []
, configNoReportCodes = [200]
}
makeConfig :: [String] -> Config
makeConfig arguments = foldl (flip ($)) defaultConfig (makeConfig' arguments)
where
makeConfig' :: [String] -> [Config -> Config]
makeConfig' args = case args of
[] -> []
("-0":rest) ->
(\x -> x { configSeparator = '\NUL' })
: makeConfig' rest
("--map" : fpath : uri : rest) ->
(\x -> x { configMappings = reverse $ sortWith fst $ configMappings x
++ [( fpath
, fromMaybe
(error $ "Not a valid URL: " ++ uri)
(parseAbsoluteURI uri)
)] })
: makeConfig' rest
("-n":xs) ->
let (consumed,rest) = span (\x -> all isDigit x && length x == 3) xs
in if null consumed
then error "No valid args for -n option (3-digit numbers)"
else (\x -> x { configNoReportCodes = map (read::String->Integer) consumed })
: makeConfig' rest
garbage -> error $ "Unconsumed commandline arguments: " ++ show garbage
makeFilesList :: ReaderT Config IO [String]
makeFilesList = do
config <- ask
liftIO $ fmap (linesWith $ configSeparator config) getContents
where
linesWith sep contents = let (l, s') = break (== sep) contents
in l : case s' of
[] -> []
(_:s'') -> lines s''
processFiles :: [FilePath] -> ReaderT Config IO [FileProcessingResult]
processFiles fpaths =
evalStateT (mapM (\p -> sanitize p $ processFile p) fpaths) M.empty
where
-- We must catch io errors per-file to avoid throwing away the other results
sanitize fp func = do
s <- get
config <- lift ask
liftIO $ catch
(runReaderT (evalStateT func s) config)
(\e -> return (fp, Left $ show e))
processFile :: FilePath -> StateT URIMap (ReaderT Config IO) FileProcessingResult
processFile fpath = do
(b, hrefs) <- liftIO $ extractHrefs fpath
config <- lift ask
let
-- take longest (first) match and create baseURI as combination of implicit and explicit base href
bestMatch = listToMaybe $ filter (\(str,_) -> str `isPrefixOf` fpath) (configMappings config)
implicitBaseURI = fromMaybe nullURI $ bestMatch >>= makeUpURI fpath
where
makeUpURI string mapping = ((`relativeTo` snd mapping) . fromMaybe nullURI . parseURIReference) $ drop (length $ fst mapping) string
baseURI = fromMaybe nullURI (fromMaybe nullURI (parseAbsoluteURI $ fromMaybe "" b) `relativeTo` implicitBaseURI)
ret <- mapM (processHref baseURI) hrefs
return (fpath, Right ret)
processHref :: URI -> Href -> StateT URIMap (ReaderT Config IO) HrefProcessingResult
processHref baseURI href = do
oldMap <- get
let maybeURI = parseURIReference href
if isNothing maybeURI then
return (href, Left $ ErrorParse "Couldn't parse href")
else if not (null $ uriScheme $ fromJust maybeURI) && uriScheme (fromJust maybeURI) /= "http:" then
return (href, Left $ ErrorParse "Not evaluating non-http locator")
else do
let
requestURI = fromJust $ fromJust maybeURI `relativeTo` baseURI -- relativeTo never returns Nothing. wtf
liftIO $ putStrLn $ "Checking: " ++ show requestURI
reqres <- maybe
(do
res <- liftIO $ simpleHTTP $ Request requestURI HEAD [] ""
put $ M.insert requestURI res oldMap
return res)
return
(M.lookup requestURI oldMap)
-- HACK: prepend requestURI to Href if not fully qualified
if (null $ uriScheme $ fromJust maybeURI) ||
(isNothing $ uriAuthority $ fromJust maybeURI)
then return ("(" ++ show requestURI ++ ") " ++ href, reqres)
else return (href, reqres)
extractHrefs :: FilePath -> IO (Maybe String,[String])
extractHrefs fpath = do
tags <- fmap parseTags $ readFile fpath
return $ collect $ map examine tags
where
examine x = case x of
TagOpen "base" attrs -> (listToMaybe $ findHrefs attrs, [])
TagOpen _ attrs -> (Nothing, findHrefs attrs)
_ -> (Nothing, [])
collect = foldl (\(a,b) (x,y) -> (x `mplus` a, b ++ y)) (Nothing,[])
findHrefs = mapMaybe $ \a -> case a of
("href",val) -> Just val
_ -> Nothing
displayResults :: [FileProcessingResult] -> ReaderT Config IO ()
displayResults = mapM_ displayResult where
displayResult :: FileProcessingResult -> ReaderT Config IO ()
displayResult (fpath, fileresult) = do
config <- ask
let reports = case fileresult of
Left err -> "Error: " ++ show err
Right hrefresults ->
let results = filter (mustReport config . snd) hrefresults in
if null results
then ""
else unlines $ map showHrefResult results
liftIO $ unless (null reports) $ do
putStrLn ""
putStrLn $ underline ("Results for " ++ fpath)
putStrLn reports
showHrefResult (hr,urires) = case urires of
Left connError -> hr ++ "\t" ++ show connError
Right response -> hr ++ "\t" ++ (head . lines . show) response
underline x = x ++ "\n" ++ map (\_ -> '~') x
mustReport :: Config -> URIProcessingResult -> Bool
mustReport config res =
case res of
Left _ -> True
Right response -> not $ (assembleRspCode (rspCode response) `elem` configNoReportCodes config)
where
assembleRspCode :: (Int,Int,Int) -> Integer
assembleRspCode (a,b,c) = fromIntegral (100 * a + 10 * b + c)