import Text.HTML.TagSoup as T import Control.Monad import Control.Monad.Reader import Control.Monad.State import Data.Maybe import Data.Char import Data.List import System import Network.Stream import Network.URI import Network.HTTP import qualified Data.Map as M -- Stolen from GHC.Exts sortWith :: Ord b => (a -> b) -> [a] -> [a] sortWith f = sortBy (\x y -> compare (f x) (f y)) main :: IO () main = fmap makeConfig getArgs >>= runReaderT ( (ask >>= liftIO . putStrLn . ("\n"++) . (++"\n") . show) >> makeFilesList >>= processFiles >>= displayResults) data Config = Config { configSeparator :: Char , configMappings :: [(String,URI)] , configNoReportCodes :: [Integer] } deriving Show -- For storing URIs in Data.Map instance Ord URI where x < y = show x < show y x <= y = show x <= show y type FileProcessingResult = (FilePath, Either String [HrefProcessingResult]) type HrefProcessingResult = (Href, URIProcessingResult) type Href = String type URIProcessingResult = Result (Response String) type URIMap = M.Map URI URIProcessingResult defaultConfig :: Config defaultConfig = Config { configSeparator = '\n' , configMappings = [] , configNoReportCodes = [200] } makeConfig :: [String] -> Config makeConfig arguments = foldl (flip ($)) defaultConfig (makeConfig' arguments) where makeConfig' :: [String] -> [Config -> Config] makeConfig' args = case args of [] -> [] ("-0":rest) -> (\x -> x { configSeparator = '\NUL' }) : makeConfig' rest ("--map" : fpath : uri : rest) -> (\x -> x { configMappings = reverse $ sortWith fst $ configMappings x ++ [( fpath , fromMaybe (error $ "Not a valid URL: " ++ uri) (parseAbsoluteURI uri) )] }) : makeConfig' rest ("-n":xs) -> let (consumed,rest) = span (\x -> all isDigit x && length x == 3) xs in if null consumed then error "No valid args for -n option (3-digit numbers)" else (\x -> x { configNoReportCodes = map (read::String->Integer) consumed }) : makeConfig' rest garbage -> error $ "Unconsumed commandline arguments: " ++ show garbage makeFilesList :: ReaderT Config IO [String] makeFilesList = do config <- ask liftIO $ fmap (linesWith $ configSeparator config) getContents where linesWith sep contents = let (l, s') = break (== sep) contents in l : case s' of [] -> [] (_:s'') -> lines s'' processFiles :: [FilePath] -> ReaderT Config IO [FileProcessingResult] processFiles fpaths = evalStateT (mapM (\p -> sanitize p $ processFile p) fpaths) M.empty where -- We must catch io errors per-file to avoid throwing away the other results sanitize fp func = do s <- get config <- lift ask liftIO $ catch (runReaderT (evalStateT func s) config) (\e -> return (fp, Left $ show e)) processFile :: FilePath -> StateT URIMap (ReaderT Config IO) FileProcessingResult processFile fpath = do (b, hrefs) <- liftIO $ extractHrefs fpath config <- lift ask let -- take longest (first) match and create baseURI as combination of implicit and explicit base href bestMatch = listToMaybe $ filter (\(str,_) -> str `isPrefixOf` fpath) (configMappings config) implicitBaseURI = fromMaybe nullURI $ bestMatch >>= makeUpURI fpath where makeUpURI string mapping = ((`relativeTo` snd mapping) . fromMaybe nullURI . parseURIReference) $ drop (length $ fst mapping) string baseURI = fromMaybe nullURI (fromMaybe nullURI (parseAbsoluteURI $ fromMaybe "" b) `relativeTo` implicitBaseURI) ret <- mapM (processHref baseURI) hrefs return (fpath, Right ret) processHref :: URI -> Href -> StateT URIMap (ReaderT Config IO) HrefProcessingResult processHref baseURI href = do oldMap <- get let maybeURI = parseURIReference href if isNothing maybeURI then return (href, Left $ ErrorParse "Couldn't parse href") else if not (null $ uriScheme $ fromJust maybeURI) && uriScheme (fromJust maybeURI) /= "http:" then return (href, Left $ ErrorParse "Not evaluating non-http locator") else do let requestURI = fromJust $ fromJust maybeURI `relativeTo` baseURI -- relativeTo never returns Nothing. wtf liftIO $ putStrLn $ "Checking: " ++ show requestURI reqres <- maybe (do res <- liftIO $ simpleHTTP $ Request requestURI HEAD [] "" put $ M.insert requestURI res oldMap return res) return (M.lookup requestURI oldMap) -- HACK: prepend requestURI to Href if not fully qualified if (null $ uriScheme $ fromJust maybeURI) || (isNothing $ uriAuthority $ fromJust maybeURI) then return ("(" ++ show requestURI ++ ") " ++ href, reqres) else return (href, reqres) extractHrefs :: FilePath -> IO (Maybe String,[String]) extractHrefs fpath = do tags <- fmap parseTags $ readFile fpath return $ collect $ map examine tags where examine x = case x of TagOpen "base" attrs -> (listToMaybe $ findHrefs attrs, []) TagOpen _ attrs -> (Nothing, findHrefs attrs) _ -> (Nothing, []) collect = foldl (\(a,b) (x,y) -> (x `mplus` a, b ++ y)) (Nothing,[]) findHrefs = mapMaybe $ \a -> case a of ("href",val) -> Just val _ -> Nothing displayResults :: [FileProcessingResult] -> ReaderT Config IO () displayResults = mapM_ displayResult where displayResult :: FileProcessingResult -> ReaderT Config IO () displayResult (fpath, fileresult) = do config <- ask let reports = case fileresult of Left err -> "Error: " ++ show err Right hrefresults -> let results = filter (mustReport config . snd) hrefresults in if null results then "" else unlines $ map showHrefResult results liftIO $ unless (null reports) $ do putStrLn "" putStrLn $ underline ("Results for " ++ fpath) putStrLn reports showHrefResult (hr,urires) = case urires of Left connError -> hr ++ "\t" ++ show connError Right response -> hr ++ "\t" ++ (head . lines . show) response underline x = x ++ "\n" ++ map (\_ -> '~') x mustReport :: Config -> URIProcessingResult -> Bool mustReport config res = case res of Left _ -> True Right response -> not $ (assembleRspCode (rspCode response) `elem` configNoReportCodes config) where assembleRspCode :: (Int,Int,Int) -> Integer assembleRspCode (a,b,c) = fromIntegral (100 * a + 10 * b + c)