-----------------------------------------------------------------------------
-- |
-- Module      : UrlCheck.hs
-- Copyright   : (c) Don Stewart 2006
-- License     : BSD3-style (see LICENSE)
--
-- Maintainer  : dons@cse.unsw.edu.au
-- Stability   : stable
-- Portability : portable
--
-----------------------------------------------------------------------------

import Data.Char (isControl)
import Data.List
import Text.Printf
import Data.ByteString.Lazy.Char8 (ByteString, pack, unpack)
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Set as S
import qualified Data.Map as M
import Control.Monad.State
import Control.Concurrent
import Control.Exception (handle, finally)
import System.Console.GetOpt
import System.Environment
import System.Exit
import System.IO
import System.Time
import TinyHTTP

data Job = Job ByteString | Done

main = time $ do
    (files,k) <- parseArgs
    proxy     <- getProxy
    let n = length files
    bad      <- newMVar (0 :: Int)
    errs     <- newChan
    jobs     <- newChan
    children <- newMVar []
    forkIO (writer errs)
    fork k children (thread errs jobs bad proxy)
    stats <- execStateT (mapM_ check files) (empty jobs)
    replicateM_ k (writeChan jobs Done)
    wait children
    broken <- takeMVar bad
    printf fmt broken (found stats) (S.size (cache stats)) n
           (if n > 1 then "s" else "")
  where
    empty = UC S.empty 0
    fmt   = "Found %d broken links. Checked %d links (%d unique) in %d file%s.\n"

--
-- fork k children threads
--
fork k cs f = flip mapM_ [1..k] $ \n -> do
    mv <- newEmptyMVar
    modifyMVar_ cs (return . (mv :))
    forkIO (f n `finally` putMVar mv ())

--
-- just print out failed urls as they arrive
--
writer c = getChanContents c >>= mapM_ (\s -> putStrLn s >> hFlush stdout)

--
-- wait on a list of children threads
--
wait cs = do
    xs <- takeMVar cs
    case xs of
        []   -> return ()
        m:ms -> do
            putMVar cs ms
            takeMVar m
            wait cs

--
-- extract a list of urls from a file, and write them into the job queue
-- need to do this in smp-parallel fashion
--
check f = do
    src <- io (B.readFile f)
    let urls = extract src
    bad <- filterM seenURI urls
    sendJobs bad
    updateStats (length urls)

--
-- read jobs from the queue, check if they work on the network
--
thread errs queue bad proxy n = loop
  where
    loop = do
        job <- readChan queue
        case job of
            Done  -> return ()
            Job x -> run (B.unpack x) >> loop

    inc = modifyMVar_ bad (return . (+1))

    run url = case parseURI url of
        Just uri -> do
            mn <- handle (return . Left . show) (getStatus uri proxy)
            case mn of
                Right 200 -> return ()
                Right n   -> next (show n)
                Left err  -> next err
        _ -> next "Invalid URL"
      where
        next s = inc >> writeChan errs (url ++ " " ++ s)

--
-- Url cache type and statistics
--
data UC = UC { cache :: S.Set ByteString
             , found :: Int
             , queue :: Chan Job }

updateStats a = modify $ \s -> s { found = found s + a }

insertURI c = modify $ \s -> s { cache = S.insert c (cache s) }

seenURI u = do
    v <- (not . S.member u) `fmap` gets cache
    insertURI u
    return v

sendJobs js = do
    c <- gets queue
    io $ mapM_ (writeChan c . Job) js

io = liftIO

--
-- URI extraction
--
extract :: ByteString -> [ByteString]
extract = concatMap split . B.lines

split :: ByteString -> [ByteString]
split ln = uris ln

uris s = filter (\s -> not (B.null s) && looksOk s) (B.splitWith isDelim s)
  where
    isDelim c = isControl c || c `elem` " <>\"{}|\\^[]`"
    looksOk s = http `B.isPrefixOf` s
    http      = pack "http:"

--
-- Argument handling
--
data Flag = Help | N Int deriving Eq

options =
    [ Option ['h'] ["help"] (NoArg Help)
        "Show this help message"
    , Option ['n'] []       (ReqArg (\s -> N (read s)) "N")
        "Number of concurrent connections (default 16)"
    ]

parseArgs = do
    argv <- getArgs
    case parse argv of
        ([], fs, []) -> return (nub fs, 16)
        (as, fs, [])
            | Help `elem` as              -> help
            | [N n] <- filter (/=Help) as -> return (nub fs, n)
        (_,_,errs) -> die errs
  where
    parse argv = getOpt Permute options argv
    header     = "Usage: checkuri [-h] [-n n] [file ...]"
    info       = usageInfo header options
    dump       = hPutStrLn stderr
    die errs   = dump (concat errs ++ info) >> exitWith (ExitFailure 1)
    help       = dump info >> exitWith ExitSuccess

getProxy = handle (\_ -> return Nothing) $ do
    env <- M.fromList `fmap` getEnvironment
    return $! do
        s <- M.lookup "http_proxy" env
        a <- parseURI s
        v <- uriAuthority a
        let host = uriRegName v
            port = read (tail (uriPort v))
        return (host, port)

--
-- Time a computation
--
time a = do
    start <- getClockTime
    a
    end   <- getClockTime
    let diff = diffClockTimes end start
        s    = timeDiffToString diff
    printf "Search time: %s\n" s
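
--
-- A possible build and invocation, sketched here as a comment. It assumes
-- a GHC with the threaded runtime available and that the TinyHTTP module
-- imported above (providing parseURI, uriAuthority and getStatus as used
-- in 'thread' and 'getProxy') is on the module search path. The binary
-- name "checkuri" simply follows the usage header in 'parseArgs':
--
--   ghc --make -threaded UrlCheck.hs -o checkuri
--   ./checkuri -n 32 *.html
--
-- Each broken link is printed as "<url> <status or error>" as soon as it
-- is detected, followed by the summary line and the elapsed search time.
--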