{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

module SeoCheck
  ( seoCheck,
    runSeoCheck,
  )
where

import Control.Applicative
import Control.Concurrent
import Control.Monad
import Control.Monad.IO.Class
import Control.Monad.Logger
import qualified Data.ByteString as SB
import qualified Data.ByteString.Lazy as LB
import qualified Data.CaseInsensitive as CI
import Data.IntMap (IntMap)
import qualified Data.IntMap.Strict as IM
import Data.List
import Data.Map (Map)
import qualified Data.Map as M
import Data.Maybe
import Data.Set (Set)
import qualified Data.Set as S
import Data.Text (Text)
import qualified Data.Text as T
import Data.Validity
import Network.HTTP.Client as HTTP
import Network.HTTP.Client.TLS as HTTP
import Network.HTTP.Types as HTTP
import Network.URI
import Rainbow
import SeoCheck.OptParse
import System.Exit
import Text.HTML.DOM as HTML
import Text.Show.Pretty (ppShow)
import Text.XML as XML
import UnliftIO hiding (link)

seoCheck :: IO ()
seoCheck :: IO ()
seoCheck = IO Settings
getSettings IO Settings -> (Settings -> IO ()) -> IO ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Settings -> IO ()
runSeoCheck

runSeoCheck :: Settings -> IO ()
runSeoCheck :: Settings -> IO ()
runSeoCheck settings :: Settings
settings@Settings {Maybe Int
Maybe Word
URI
LogLevel
setMaxDepth :: Settings -> Maybe Word
setFetchers :: Settings -> Maybe Int
setLogLevel :: Settings -> LogLevel
setUri :: Settings -> URI
setMaxDepth :: Maybe Word
setFetchers :: Maybe Int
setLogLevel :: LogLevel
setUri :: URI
..} = do
  Manager
man <- IO Manager
forall (m :: * -> *). MonadIO m => m Manager
HTTP.newTlsManager
  TQueue Link
queue <- IO (TQueue Link)
forall (m :: * -> *) a. MonadIO m => m (TQueue a)
newTQueueIO
  TVar (Set URI)
seen <- Set URI -> IO (TVar (Set URI))
forall (m :: * -> *) a. MonadIO m => a -> m (TVar a)
newTVarIO Set URI
forall a. Set a
S.empty
  TVar (Map Link Result)
results <- Map Link Result -> IO (TVar (Map Link Result))
forall (m :: * -> *) a. MonadIO m => a -> m (TVar a)
newTVarIO Map Link Result
forall k a. Map k a
M.empty
  let fetchers :: Int
fetchers = Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe Int
1 Maybe Int
setFetchers
      indexes :: [Int]
indexes = [Int
0 .. Int
fetchers Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]
  TVar (IntMap Bool)
fetcherStati <- IntMap Bool -> IO (TVar (IntMap Bool))
forall (m :: * -> *) a. MonadIO m => a -> m (TVar a)
newTVarIO (IntMap Bool -> IO (TVar (IntMap Bool)))
-> IntMap Bool -> IO (TVar (IntMap Bool))
forall a b. (a -> b) -> a -> b
$ [(Int, Bool)] -> IntMap Bool
forall a. [(Int, a)] -> IntMap a
IM.fromList ([(Int, Bool)] -> IntMap Bool) -> [(Int, Bool)] -> IntMap Bool
forall a b. (a -> b) -> a -> b
$ [Int] -> [Bool] -> [(Int, Bool)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int]
indexes (Bool -> [Bool]
forall a. a -> [a]
repeat Bool
True)
  STM () -> IO ()
forall (m :: * -> *) a. MonadIO m => STM a -> m a
atomically (STM () -> IO ()) -> STM () -> IO ()
forall a b. (a -> b) -> a -> b
$ TQueue Link -> Link -> STM ()
forall a. TQueue a -> a -> STM ()
writeTQueue TQueue Link
queue (LinkType -> URI -> Word -> Link
Link LinkType
A URI
setUri Word
0)
  LoggingT IO () -> IO ()
forall (m :: * -> *) a. MonadIO m => LoggingT m a -> m a
runStderrLoggingT (LoggingT IO () -> IO ()) -> LoggingT IO () -> IO ()
forall a b. (a -> b) -> a -> b
$
    (LogSource -> LogLevel -> Bool) -> LoggingT IO () -> LoggingT IO ()
forall (m :: * -> *) a.
(LogSource -> LogLevel -> Bool) -> LoggingT m a -> LoggingT m a
filterLogger (\LogSource
_ LogLevel
ll -> LogLevel
ll LogLevel -> LogLevel -> Bool
forall a. Ord a => a -> a -> Bool
>= LogLevel
setLogLevel) (LoggingT IO () -> LoggingT IO ())
-> LoggingT IO () -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ do
      LogSource -> LoggingT IO ()
forall (m :: * -> *). MonadLogger m => LogSource -> m ()
logDebugN (LogSource -> LoggingT IO ()) -> LogSource -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ String -> LogSource
T.pack (String -> LogSource) -> String -> LogSource
forall a b. (a -> b) -> a -> b
$ Settings -> String
forall a. Show a => a -> String
ppShow Settings
settings
      LogSource -> LoggingT IO ()
forall (m :: * -> *). MonadLogger m => LogSource -> m ()
logInfoN (LogSource -> LoggingT IO ()) -> LogSource -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ LogSource
"Running with " LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> String -> LogSource
T.pack (Int -> String
forall a. Show a => a -> String
show Int
fetchers) LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> LogSource
" fetchers"
      [Int] -> (Int -> LoggingT IO ()) -> LoggingT IO ()
forall (m :: * -> *) (f :: * -> *) a b.
(MonadUnliftIO m, Foldable f) =>
f a -> (a -> m b) -> m ()
forConcurrently_ [Int]
indexes ((Int -> LoggingT IO ()) -> LoggingT IO ())
-> (Int -> LoggingT IO ()) -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ \Int
ix ->
        Maybe Word
-> Manager
-> TQueue Link
-> TVar (Set URI)
-> TVar (Map Link Result)
-> TVar (IntMap Bool)
-> Int
-> LoggingT IO ()
worker Maybe Word
setMaxDepth Manager
man TQueue Link
queue TVar (Set URI)
seen TVar (Map Link Result)
results TVar (IntMap Bool)
fetcherStati Int
ix
  Map Link Result
resultsMap <- TVar (Map Link Result) -> IO (Map Link Result)
forall (m :: * -> *) a. MonadIO m => TVar a -> m a
readTVarIO TVar (Map Link Result)
results
  Chunk -> [ByteString] -> [ByteString]
bytestringMaker <- IO (Chunk -> [ByteString] -> [ByteString])
byteStringMakerFromEnvironment
  ([Chunk] -> IO ()) -> [[Chunk]] -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ ((ByteString -> IO ()) -> [ByteString] -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ ByteString -> IO ()
SB.putStr ([ByteString] -> IO ())
-> ([Chunk] -> [ByteString]) -> [Chunk] -> IO ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Chunk -> [ByteString] -> [ByteString]) -> [Chunk] -> [ByteString]
chunksToByteStrings Chunk -> [ByteString] -> [ByteString]
bytestringMaker) ([[Chunk]] -> IO ()) -> [[Chunk]] -> IO ()
forall a b. (a -> b) -> a -> b
$ SEOResult -> [[Chunk]]
renderSEOResult (SEOResult -> [[Chunk]]) -> SEOResult -> [[Chunk]]
forall a b. (a -> b) -> a -> b
$ SEOResult :: Map Link Result -> SEOResult
SEOResult {seoResultPageResults :: Map Link Result
seoResultPageResults = Map Link Result
resultsMap}
  Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when ((Result -> Bool) -> Map Link Result -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any Result -> Bool
resultBad Map Link Result
resultsMap) (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$
    ExitCode -> IO ()
forall a. ExitCode -> IO a
exitWith (ExitCode -> IO ()) -> ExitCode -> IO ()
forall a b. (a -> b) -> a -> b
$
      Int -> ExitCode
ExitFailure Int
1

newtype SEOResult = SEOResult
  { SEOResult -> Map Link Result
seoResultPageResults :: Map Link Result
  }
  deriving (Int -> SEOResult -> ShowS
[SEOResult] -> ShowS
SEOResult -> String
(Int -> SEOResult -> ShowS)
-> (SEOResult -> String)
-> ([SEOResult] -> ShowS)
-> Show SEOResult
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [SEOResult] -> ShowS
$cshowList :: [SEOResult] -> ShowS
show :: SEOResult -> String
$cshow :: SEOResult -> String
showsPrec :: Int -> SEOResult -> ShowS
$cshowsPrec :: Int -> SEOResult -> ShowS
Show, SEOResult -> SEOResult -> Bool
(SEOResult -> SEOResult -> Bool)
-> (SEOResult -> SEOResult -> Bool) -> Eq SEOResult
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: SEOResult -> SEOResult -> Bool
$c/= :: SEOResult -> SEOResult -> Bool
== :: SEOResult -> SEOResult -> Bool
$c== :: SEOResult -> SEOResult -> Bool
Eq)

renderSEOResult :: SEOResult -> [[Chunk]]
renderSEOResult :: SEOResult -> [[Chunk]]
renderSEOResult SEOResult {Map Link Result
seoResultPageResults :: Map Link Result
seoResultPageResults :: SEOResult -> Map Link Result
..} = [[[Chunk]]] -> [[Chunk]]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat ([[[Chunk]]] -> [[Chunk]]) -> [[[Chunk]]] -> [[Chunk]]
forall a b. (a -> b) -> a -> b
$ ((Link, Result) -> Maybe [[Chunk]])
-> [(Link, Result)] -> [[[Chunk]]]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe ((Link -> Result -> Maybe [[Chunk]])
-> (Link, Result) -> Maybe [[Chunk]]
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry Link -> Result -> Maybe [[Chunk]]
renderPageResult) (Map Link Result -> [(Link, Result)]
forall k a. Map k a -> [(k, a)]
M.toList Map Link Result
seoResultPageResults)

renderPageResult :: Link -> Result -> Maybe [[Chunk]]
renderPageResult :: Link -> Result -> Maybe [[Chunk]]
renderPageResult Link
link r :: Result
r@Result {Maybe DocResult
Status
resultDocResult :: Result -> Maybe DocResult
resultStatus :: Result -> Status
resultDocResult :: Maybe DocResult
resultStatus :: Status
..} =
  if Result -> Bool
resultBad Result
r
    then [[Chunk]] -> Maybe [[Chunk]]
forall a. a -> Maybe a
Just [[Chunk]]
go
    else Maybe [[Chunk]]
forall a. Maybe a
Nothing
  where
    go :: [[Chunk]]
    go :: [[Chunk]]
go =
      [Chunk] -> [[Chunk]] -> [[Chunk]]
forall a. a -> [a] -> [a]
intersperse [LogSource -> Chunk
chunk LogSource
"\n"] ([[Chunk]] -> [[Chunk]]) -> [[Chunk]] -> [[Chunk]]
forall a b. (a -> b) -> a -> b
$
        [ [Radiant -> Chunk -> Chunk
fore Radiant
blue (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk (LogSource -> Chunk) -> LogSource -> Chunk
forall a b. (a -> b) -> a -> b
$ String -> LogSource
T.pack (String -> LogSource) -> String -> LogSource
forall a b. (a -> b) -> a -> b
$ URI -> String
forall a. Show a => a -> String
show (Link -> URI
linkUri Link
link)],
          Status -> [Chunk]
renderStatusResult Status
resultStatus
        ]
          [[Chunk]] -> [[Chunk]] -> [[Chunk]]
forall a. [a] -> [a] -> [a]
++ [[Chunk]]
-> (DocResult -> [[Chunk]]) -> Maybe DocResult -> [[Chunk]]
forall b a. b -> (a -> b) -> Maybe a -> b
maybe [] DocResult -> [[Chunk]]
renderDocResult Maybe DocResult
resultDocResult

renderStatusResult :: HTTP.Status -> [Chunk]
renderStatusResult :: Status -> [Chunk]
renderStatusResult Status
s =
  [ LogSource -> Chunk
chunk LogSource
"Status: ",
    Radiant -> Chunk -> Chunk
fore Radiant
col (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk (LogSource -> Chunk) -> LogSource -> Chunk
forall a b. (a -> b) -> a -> b
$ String -> LogSource
T.pack (String -> LogSource) -> String -> LogSource
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show Int
sci
  ]
  where
    sci :: Int
sci = Status -> Int
HTTP.statusCode Status
s
    col :: Radiant
col = if Int
200 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
sci Bool -> Bool -> Bool
&& Int
sci Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
300 then Radiant
green else Radiant
red

renderDocResult :: DocResult -> [[Chunk]]
renderDocResult :: DocResult -> [[Chunk]]
renderDocResult DocResult {[Link]
Set LogSource
DescriptionResult
TitleResult
DocTypeResult
docResultImagesWithoutAlt :: DocResult -> Set LogSource
docResultDescription :: DocResult -> DescriptionResult
docResultTitle :: DocResult -> TitleResult
docResultDocType :: DocResult -> DocTypeResult
docResultLinks :: DocResult -> [Link]
docResultImagesWithoutAlt :: Set LogSource
docResultDescription :: DescriptionResult
docResultTitle :: TitleResult
docResultDocType :: DocTypeResult
docResultLinks :: [Link]
..} =
  [ [ LogSource -> Chunk
chunk LogSource
"Doctype: ",
      case DocTypeResult
docResultDocType of
        DocTypeResult
HtmlDocType -> Radiant -> Chunk -> Chunk
fore Radiant
green (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"html"
        DocTypeResult
UnknownDocType -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"Unknown doctype"
        DocTypeResult
NoDocType -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"No doctype"
    ],
    [ LogSource -> Chunk
chunk LogSource
"Title: ",
      case TitleResult
docResultTitle of
        TitleResult
NoTitleFound -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"No title"
        TitleResult
EmptyTitle -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"Empty title"
        NonStandardTitle Element
e -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk (LogSource -> Chunk) -> LogSource -> Chunk
forall a b. (a -> b) -> a -> b
$ String -> LogSource
T.pack (String -> LogSource) -> String -> LogSource
forall a b. (a -> b) -> a -> b
$ String
"Non-standard title: " String -> ShowS
forall a. Semigroup a => a -> a -> a
<> Element -> String
forall a. Show a => a -> String
show Element
e
        TitleFound LogSource
t -> Radiant -> Chunk -> Chunk
fore Radiant
green (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
t
    ],
    [ LogSource -> Chunk
chunk LogSource
"Description: ",
      case DescriptionResult
docResultDescription of
        Description LogSource
d -> Radiant -> Chunk -> Chunk
fore Radiant
green (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
d
        DescriptionResult
EmptyDescription -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"Empty description"
        DescriptionResult
NoDescription -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"No description"
        DescriptionResult
MultipleDescriptions -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"Multiple descriptions"
        NonStandardDescription Element
e -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk (LogSource -> Chunk) -> LogSource -> Chunk
forall a b. (a -> b) -> a -> b
$ String -> LogSource
T.pack (String -> LogSource) -> String -> LogSource
forall a b. (a -> b) -> a -> b
$ String
"Non-standard description: " String -> ShowS
forall a. Semigroup a => a -> a -> a
<> Element -> String
forall a. Show a => a -> String
show Element
e
    ],
    [ LogSource -> Chunk
chunk LogSource
"Images without Alt: ",
      case Set LogSource -> [LogSource]
forall a. Set a -> [a]
S.toList Set LogSource
docResultImagesWithoutAlt of
        [] -> Radiant -> Chunk -> Chunk
fore Radiant
green (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk LogSource
"None"
        [LogSource]
is -> Radiant -> Chunk -> Chunk
fore Radiant
red (Chunk -> Chunk) -> Chunk -> Chunk
forall a b. (a -> b) -> a -> b
$ LogSource -> Chunk
chunk (LogSource -> Chunk) -> LogSource -> Chunk
forall a b. (a -> b) -> a -> b
$ String -> LogSource
T.pack (String -> LogSource) -> String -> LogSource
forall a b. (a -> b) -> a -> b
$ [LogSource] -> String
forall a. Show a => a -> String
show [LogSource]
is
    ],
    [LogSource -> Chunk
chunk LogSource
"\n"] -- Empty line
  ]

worker ::
  Maybe Word ->
  HTTP.Manager ->
  TQueue Link ->
  TVar (Set URI) ->
  TVar (Map Link Result) ->
  TVar (IntMap Bool) ->
  Int ->
  LoggingT IO ()
worker :: Maybe Word
-> Manager
-> TQueue Link
-> TVar (Set URI)
-> TVar (Map Link Result)
-> TVar (IntMap Bool)
-> Int
-> LoggingT IO ()
worker Maybe Word
maxDepth Manager
man TQueue Link
queue TVar (Set URI)
seen TVar (Map Link Result)
results TVar (IntMap Bool)
stati Int
index = Bool -> LoggingT IO ()
go Bool
True
  where
    setStatus :: Bool -> m ()
setStatus Bool
b = STM () -> m ()
forall (m :: * -> *) a. MonadIO m => STM a -> m a
atomically (STM () -> m ()) -> STM () -> m ()
forall a b. (a -> b) -> a -> b
$ TVar (IntMap Bool) -> (IntMap Bool -> IntMap Bool) -> STM ()
forall a. TVar a -> (a -> a) -> STM ()
modifyTVar' TVar (IntMap Bool)
stati ((IntMap Bool -> IntMap Bool) -> STM ())
-> (IntMap Bool -> IntMap Bool) -> STM ()
forall a b. (a -> b) -> a -> b
$ Int -> Bool -> IntMap Bool -> IntMap Bool
forall a. Int -> a -> IntMap a -> IntMap a
IM.insert Int
index Bool
b
    setBusy :: LoggingT IO ()
setBusy = Bool -> LoggingT IO ()
forall (m :: * -> *). MonadIO m => Bool -> m ()
setStatus Bool
True
    setIdle :: LoggingT IO ()
setIdle = Bool -> LoggingT IO ()
forall (m :: * -> *). MonadIO m => Bool -> m ()
setStatus Bool
False
    allDone :: MonadIO m => m Bool
    allDone :: m Bool
allDone = (Bool -> Bool) -> IntMap Bool -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all Bool -> Bool
not (IntMap Bool -> Bool) -> m (IntMap Bool) -> m Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> TVar (IntMap Bool) -> m (IntMap Bool)
forall (m :: * -> *) a. MonadIO m => TVar a -> m a
readTVarIO TVar (IntMap Bool)
stati
    go :: Bool -> LoggingT IO ()
go Bool
busy = do
      Maybe Link
mv <- STM (Maybe Link) -> LoggingT IO (Maybe Link)
forall (m :: * -> *) a. MonadIO m => STM a -> m a
atomically (STM (Maybe Link) -> LoggingT IO (Maybe Link))
-> STM (Maybe Link) -> LoggingT IO (Maybe Link)
forall a b. (a -> b) -> a -> b
$ TQueue Link -> STM (Maybe Link)
forall a. TQueue a -> STM (Maybe a)
tryReadTQueue TQueue Link
queue
      -- Get an item off the queue
      case Maybe Link
mv of
        -- No items on the queue
        Maybe Link
Nothing -> do
          -- Set this worker as idle
          LogSource -> LoggingT IO ()
forall (m :: * -> *). MonadLogger m => LogSource -> m ()
logDebugN (LogSource -> LoggingT IO ()) -> LogSource -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ LogSource
"Worker is idle: " LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> String -> LogSource
T.pack (Int -> String
forall a. Show a => a -> String
show Int
index)
          Bool -> LoggingT IO () -> LoggingT IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
busy LoggingT IO ()
setIdle
          -- If all workers are idle, we are done.
          Bool
ad <- LoggingT IO Bool
forall (m :: * -> *). MonadIO m => m Bool
allDone
          Bool -> LoggingT IO () -> LoggingT IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless Bool
ad (LoggingT IO () -> LoggingT IO ())
-> LoggingT IO () -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ do
            IO () -> LoggingT IO ()
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO () -> LoggingT IO ()) -> IO () -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ Int -> IO ()
threadDelay Int
10000 -- 10 ms
            Bool -> LoggingT IO ()
go Bool
False
        -- An item on the queue
        Just Link
link -> do
          -- Set this worker as busy
          LogSource -> LoggingT IO ()
forall (m :: * -> *). MonadLogger m => LogSource -> m ()
logDebugN (LogSource -> LoggingT IO ()) -> LogSource -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ LogSource
"Worker is busy: " LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> String -> LogSource
T.pack (Int -> String
forall a. Show a => a -> String
show Int
index)
          Bool -> LoggingT IO () -> LoggingT IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless Bool
busy LoggingT IO ()
setBusy
          -- Check if the link has been seen already
          Bool
alreadySeen <- URI -> Set URI -> Bool
forall a. Ord a => a -> Set a -> Bool
S.member (Link -> URI
linkUri Link
link) (Set URI -> Bool) -> LoggingT IO (Set URI) -> LoggingT IO Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> TVar (Set URI) -> LoggingT IO (Set URI)
forall (m :: * -> *) a. MonadIO m => TVar a -> m a
readTVarIO TVar (Set URI)
seen
          if Bool
alreadySeen
            then do
              -- We've already seen it, don't do anything.
              LogSource -> LoggingT IO ()
forall (m :: * -> *). MonadLogger m => LogSource -> m ()
logDebugN (LogSource -> LoggingT IO ()) -> LogSource -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ LogSource
"Not fetching again: " LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> String -> LogSource
T.pack (URI -> String
forall a. Show a => a -> String
show (Link -> URI
linkUri Link
link))
              () -> LoggingT IO ()
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
            else do
              -- We haven't seen it yet. Mark it as seen.
              STM () -> LoggingT IO ()
forall (m :: * -> *) a. MonadIO m => STM a -> m a
atomically (STM () -> LoggingT IO ()) -> STM () -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ TVar (Set URI) -> (Set URI -> Set URI) -> STM ()
forall a. TVar a -> (a -> a) -> STM ()
modifyTVar' TVar (Set URI)
seen ((Set URI -> Set URI) -> STM ()) -> (Set URI -> Set URI) -> STM ()
forall a b. (a -> b) -> a -> b
$ URI -> Set URI -> Set URI
forall a. Ord a => a -> Set a -> Set a
S.insert (Link -> URI
linkUri Link
link)
              Maybe Result
mres <- Maybe Word -> Manager -> Link -> LoggingT IO (Maybe Result)
produceResult Maybe Word
maxDepth Manager
man Link
link
              Maybe Result -> (Result -> LoggingT IO ()) -> LoggingT IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ Maybe Result
mres ((Result -> LoggingT IO ()) -> LoggingT IO ())
-> (Result -> LoggingT IO ()) -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ \Result
res -> do
                STM () -> LoggingT IO ()
forall (m :: * -> *) a. MonadIO m => STM a -> m a
atomically (STM () -> LoggingT IO ()) -> STM () -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ TVar (Map Link Result)
-> (Map Link Result -> Map Link Result) -> STM ()
forall a. TVar a -> (a -> a) -> STM ()
modifyTVar' TVar (Map Link Result)
results ((Map Link Result -> Map Link Result) -> STM ())
-> (Map Link Result -> Map Link Result) -> STM ()
forall a b. (a -> b) -> a -> b
$ Link -> Result -> Map Link Result -> Map Link Result
forall k a. Ord k => k -> a -> Map k a -> Map k a
M.insert Link
link Result
res
                let recurse :: Bool
recurse = case Maybe Word
maxDepth of
                      Maybe Word
Nothing -> Bool
True
                      Just Word
md -> Link -> Word
linkDepth Link
link Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
< Word
md
                Bool -> LoggingT IO () -> LoggingT IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
recurse (LoggingT IO () -> LoggingT IO ())
-> LoggingT IO () -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$
                  Maybe [Link] -> ([Link] -> LoggingT IO ()) -> LoggingT IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ (DocResult -> [Link]
docResultLinks (DocResult -> [Link]) -> Maybe DocResult -> Maybe [Link]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Result -> Maybe DocResult
resultDocResult Result
res) (([Link] -> LoggingT IO ()) -> LoggingT IO ())
-> ([Link] -> LoggingT IO ()) -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ \[Link]
uris ->
                    STM () -> LoggingT IO ()
forall (m :: * -> *) a. MonadIO m => STM a -> m a
atomically (STM () -> LoggingT IO ()) -> STM () -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ (Link -> STM ()) -> [Link] -> STM ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (TQueue Link -> Link -> STM ()
forall a. TQueue a -> a -> STM ()
writeTQueue TQueue Link
queue) [Link]
uris
          -- Filter out the ones that are not on the same host.
          Bool -> LoggingT IO ()
go Bool
True

data Link = Link
  { Link -> LinkType
linkType :: !LinkType,
    Link -> URI
linkUri :: !URI,
    Link -> Word
linkDepth :: !Word
  }
  deriving (Int -> Link -> ShowS
[Link] -> ShowS
Link -> String
(Int -> Link -> ShowS)
-> (Link -> String) -> ([Link] -> ShowS) -> Show Link
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Link] -> ShowS
$cshowList :: [Link] -> ShowS
show :: Link -> String
$cshow :: Link -> String
showsPrec :: Int -> Link -> ShowS
$cshowsPrec :: Int -> Link -> ShowS
Show, Link -> Link -> Bool
(Link -> Link -> Bool) -> (Link -> Link -> Bool) -> Eq Link
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Link -> Link -> Bool
$c/= :: Link -> Link -> Bool
== :: Link -> Link -> Bool
$c== :: Link -> Link -> Bool
Eq, Eq Link
Eq Link
-> (Link -> Link -> Ordering)
-> (Link -> Link -> Bool)
-> (Link -> Link -> Bool)
-> (Link -> Link -> Bool)
-> (Link -> Link -> Bool)
-> (Link -> Link -> Link)
-> (Link -> Link -> Link)
-> Ord Link
Link -> Link -> Bool
Link -> Link -> Ordering
Link -> Link -> Link
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Link -> Link -> Link
$cmin :: Link -> Link -> Link
max :: Link -> Link -> Link
$cmax :: Link -> Link -> Link
>= :: Link -> Link -> Bool
$c>= :: Link -> Link -> Bool
> :: Link -> Link -> Bool
$c> :: Link -> Link -> Bool
<= :: Link -> Link -> Bool
$c<= :: Link -> Link -> Bool
< :: Link -> Link -> Bool
$c< :: Link -> Link -> Bool
compare :: Link -> Link -> Ordering
$ccompare :: Link -> Link -> Ordering
$cp1Ord :: Eq Link
Ord)

data LinkType
  = A
  | IMG
  | LINK
  deriving (Int -> LinkType -> ShowS
[LinkType] -> ShowS
LinkType -> String
(Int -> LinkType -> ShowS)
-> (LinkType -> String) -> ([LinkType] -> ShowS) -> Show LinkType
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [LinkType] -> ShowS
$cshowList :: [LinkType] -> ShowS
show :: LinkType -> String
$cshow :: LinkType -> String
showsPrec :: Int -> LinkType -> ShowS
$cshowsPrec :: Int -> LinkType -> ShowS
Show, LinkType -> LinkType -> Bool
(LinkType -> LinkType -> Bool)
-> (LinkType -> LinkType -> Bool) -> Eq LinkType
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: LinkType -> LinkType -> Bool
$c/= :: LinkType -> LinkType -> Bool
== :: LinkType -> LinkType -> Bool
$c== :: LinkType -> LinkType -> Bool
Eq, Eq LinkType
Eq LinkType
-> (LinkType -> LinkType -> Ordering)
-> (LinkType -> LinkType -> Bool)
-> (LinkType -> LinkType -> Bool)
-> (LinkType -> LinkType -> Bool)
-> (LinkType -> LinkType -> Bool)
-> (LinkType -> LinkType -> LinkType)
-> (LinkType -> LinkType -> LinkType)
-> Ord LinkType
LinkType -> LinkType -> Bool
LinkType -> LinkType -> Ordering
LinkType -> LinkType -> LinkType
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: LinkType -> LinkType -> LinkType
$cmin :: LinkType -> LinkType -> LinkType
max :: LinkType -> LinkType -> LinkType
$cmax :: LinkType -> LinkType -> LinkType
>= :: LinkType -> LinkType -> Bool
$c>= :: LinkType -> LinkType -> Bool
> :: LinkType -> LinkType -> Bool
$c> :: LinkType -> LinkType -> Bool
<= :: LinkType -> LinkType -> Bool
$c<= :: LinkType -> LinkType -> Bool
< :: LinkType -> LinkType -> Bool
$c< :: LinkType -> LinkType -> Bool
compare :: LinkType -> LinkType -> Ordering
$ccompare :: LinkType -> LinkType -> Ordering
$cp1Ord :: Eq LinkType
Ord)

data Result = Result
  { Result -> Status
resultStatus :: !HTTP.Status,
    Result -> Maybe DocResult
resultDocResult :: !(Maybe DocResult)
  }
  deriving (Int -> Result -> ShowS
[Result] -> ShowS
Result -> String
(Int -> Result -> ShowS)
-> (Result -> String) -> ([Result] -> ShowS) -> Show Result
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Result] -> ShowS
$cshowList :: [Result] -> ShowS
show :: Result -> String
$cshow :: Result -> String
showsPrec :: Int -> Result -> ShowS
$cshowsPrec :: Int -> Result -> ShowS
Show, Result -> Result -> Bool
(Result -> Result -> Bool)
-> (Result -> Result -> Bool) -> Eq Result
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Result -> Result -> Bool
$c/= :: Result -> Result -> Bool
== :: Result -> Result -> Bool
$c== :: Result -> Result -> Bool
Eq)

resultBad :: Result -> Bool
resultBad :: Result -> Bool
resultBad Result {Maybe DocResult
Status
resultDocResult :: Maybe DocResult
resultStatus :: Status
resultDocResult :: Result -> Maybe DocResult
resultStatus :: Result -> Status
..} =
  Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$
    Validation -> Bool
validationIsValid (Validation -> Bool) -> Validation -> Bool
forall a b. (a -> b) -> a -> b
$
      [Validation] -> Validation
forall a. Monoid a => [a] -> a
mconcat
        [ String -> Bool -> Validation
declare String
"The status code is in the 200 range" (Bool -> Validation) -> Bool -> Validation
forall a b. (a -> b) -> a -> b
$
            let sci :: Int
sci = Status -> Int
HTTP.statusCode Status
resultStatus
             in Int
200 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
sci Bool -> Bool -> Bool
&& Int
sci Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
300,
          String -> Validation -> Validation
decorate String
"Doc result" (Validation -> Validation) -> Validation -> Validation
forall a b. (a -> b) -> a -> b
$ Validation
-> (DocResult -> Validation) -> Maybe DocResult -> Validation
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Validation
valid DocResult -> Validation
docResultValidation Maybe DocResult
resultDocResult
        ]

produceResult :: Maybe Word -> HTTP.Manager -> Link -> LoggingT IO (Maybe Result)
produceResult :: Maybe Word -> Manager -> Link -> LoggingT IO (Maybe Result)
produceResult Maybe Word
maxDepth Manager
man link :: Link
link@Link {Word
URI
LinkType
linkDepth :: Word
linkUri :: URI
linkType :: LinkType
linkType :: Link -> LinkType
linkDepth :: Link -> Word
linkUri :: Link -> URI
..} =
  -- Create a request
  case URI -> Maybe Request
forall (m :: * -> *). MonadThrow m => URI -> m Request
requestFromURI URI
linkUri of
    Maybe Request
Nothing -> do
      LogSource -> LoggingT IO ()
forall (m :: * -> *). MonadLogger m => LogSource -> m ()
logErrorN (LogSource -> LoggingT IO ()) -> LogSource -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ LogSource
"Unable to construct a request from this uri: " LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> String -> LogSource
T.pack (URI -> String
forall a. Show a => a -> String
show URI
linkUri)
      Maybe Result -> LoggingT IO (Maybe Result)
forall (f :: * -> *) a. Applicative f => a -> f a
pure Maybe Result
forall a. Maybe a
Nothing
    Just Request
req -> do
      let fetchingLog :: [String]
fetchingLog = case Maybe Word
maxDepth of
            Maybe Word
Nothing -> [String
"Fetching: ", URI -> String
forall a. Show a => a -> String
show URI
linkUri]
            Just Word
md -> [String
"Depth ", Word -> String
forall a. Show a => a -> String
show Word
linkDepth, String
"/", Word -> String
forall a. Show a => a -> String
show Word
md, String
"; Fetching: ", URI -> String
forall a. Show a => a -> String
show URI
linkUri]
      LogSource -> LoggingT IO ()
forall (m :: * -> *). MonadLogger m => LogSource -> m ()
logInfoN (LogSource -> LoggingT IO ()) -> LogSource -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ String -> LogSource
T.pack (String -> LogSource) -> String -> LogSource
forall a b. (a -> b) -> a -> b
$ [String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
fetchingLog
      -- Do the actual fetch
      Response ByteString
resp <- IO (Response ByteString) -> LoggingT IO (Response ByteString)
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO (Response ByteString) -> LoggingT IO (Response ByteString))
-> IO (Response ByteString) -> LoggingT IO (Response ByteString)
forall a b. (a -> b) -> a -> b
$ Request -> Manager -> IO (Response ByteString)
httpLbs Request
req Manager
man
      let status :: Status
status = Response ByteString -> Status
forall body. Response body -> Status
responseStatus Response ByteString
resp
      let sci :: Int
sci = Status -> Int
HTTP.statusCode Status
status
      LogSource -> LoggingT IO ()
forall (m :: * -> *). MonadLogger m => LogSource -> m ()
logDebugN (LogSource -> LoggingT IO ()) -> LogSource -> LoggingT IO ()
forall a b. (a -> b) -> a -> b
$ LogSource
"Got response for " LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> String -> LogSource
T.pack (URI -> String
forall a. Show a => a -> String
show URI
linkUri) LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> LogSource
": " LogSource -> LogSource -> LogSource
forall a. Semigroup a => a -> a -> a
<> String -> LogSource
T.pack (Int -> String
forall a. Show a => a -> String
show Int
sci)
      -- If the status code is not in the 2XX range, add it to the results
      let body :: ByteString
body = Response ByteString -> ByteString
forall body. Response body -> body
responseBody Response ByteString
resp
      let headers :: ResponseHeaders
headers = Response ByteString -> ResponseHeaders
forall body. Response body -> ResponseHeaders
responseHeaders Response ByteString
resp
          contentType :: Maybe ByteString
contentType = HeaderName -> ResponseHeaders -> Maybe ByteString
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup HeaderName
hContentType ResponseHeaders
headers
      Maybe Result -> LoggingT IO (Maybe Result)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Maybe Result -> LoggingT IO (Maybe Result))
-> Maybe Result -> LoggingT IO (Maybe Result)
forall a b. (a -> b) -> a -> b
$
        Result -> Maybe Result
forall a. a -> Maybe a
Just (Result -> Maybe Result) -> Result -> Maybe Result
forall a b. (a -> b) -> a -> b
$
          Result :: Status -> Maybe DocResult -> Result
Result
            { resultStatus :: Status
resultStatus = Response ByteString -> Status
forall body. Response body -> Status
responseStatus Response ByteString
resp,
              resultDocResult :: Maybe DocResult
resultDocResult = case LinkType
linkType of
                LinkType
A -> do
                  ByteString
ct <- Maybe ByteString
contentType
                  if ByteString
"text/html" ByteString -> ByteString -> Bool
`SB.isInfixOf` ByteString
ct
                    then DocResult -> Maybe DocResult
forall a. a -> Maybe a
Just (DocResult -> Maybe DocResult) -> DocResult -> Maybe DocResult
forall a b. (a -> b) -> a -> b
$ Link -> Response ByteString -> Document -> DocResult
produceDocResult Link
link Response ByteString
resp (Document -> DocResult) -> Document -> DocResult
forall a b. (a -> b) -> a -> b
$ ByteString -> Document
HTML.parseLBS ByteString
body
                    else Maybe DocResult
forall a. Maybe a
Nothing
                LinkType
_ -> Maybe DocResult
forall a. Maybe a
Nothing
            }

data DocResult = DocResult
  { DocResult -> [Link]
docResultLinks :: ![Link],
    DocResult -> DocTypeResult
docResultDocType :: !DocTypeResult,
    DocResult -> TitleResult
docResultTitle :: !TitleResult,
    DocResult -> DescriptionResult
docResultDescription :: !DescriptionResult,
    DocResult -> Set LogSource
docResultImagesWithoutAlt :: !(Set Text) -- The 'src' tags of those images
  }
  deriving (Int -> DocResult -> ShowS
[DocResult] -> ShowS
DocResult -> String
(Int -> DocResult -> ShowS)
-> (DocResult -> String)
-> ([DocResult] -> ShowS)
-> Show DocResult
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [DocResult] -> ShowS
$cshowList :: [DocResult] -> ShowS
show :: DocResult -> String
$cshow :: DocResult -> String
showsPrec :: Int -> DocResult -> ShowS
$cshowsPrec :: Int -> DocResult -> ShowS
Show, DocResult -> DocResult -> Bool
(DocResult -> DocResult -> Bool)
-> (DocResult -> DocResult -> Bool) -> Eq DocResult
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: DocResult -> DocResult -> Bool
$c/= :: DocResult -> DocResult -> Bool
== :: DocResult -> DocResult -> Bool
$c== :: DocResult -> DocResult -> Bool
Eq)

docResultValidation :: DocResult -> Validation
docResultValidation :: DocResult -> Validation
docResultValidation DocResult {[Link]
Set LogSource
DescriptionResult
TitleResult
DocTypeResult
docResultImagesWithoutAlt :: Set LogSource
docResultDescription :: DescriptionResult
docResultTitle :: TitleResult
docResultDocType :: DocTypeResult
docResultLinks :: [Link]
docResultImagesWithoutAlt :: DocResult -> Set LogSource
docResultDescription :: DocResult -> DescriptionResult
docResultTitle :: DocResult -> TitleResult
docResultDocType :: DocResult -> DocTypeResult
docResultLinks :: DocResult -> [Link]
..} =
  [Validation] -> Validation
forall a. Monoid a => [a] -> a
mconcat
    [ String -> Bool -> Validation
declare String
"There was a doctype" (Bool -> Validation) -> Bool -> Validation
forall a b. (a -> b) -> a -> b
$ case DocTypeResult
docResultDocType of
        DocTypeResult
HtmlDocType -> Bool
True
        DocTypeResult
NoDocType -> Bool
False
        DocTypeResult
UnknownDocType -> Bool
False,
      String -> Bool -> Validation
declare String
"There was exactly one title" (Bool -> Validation) -> Bool -> Validation
forall a b. (a -> b) -> a -> b
$ case TitleResult
docResultTitle of
        TitleFound LogSource
_ -> Bool
True
        TitleResult
_ -> Bool
False,
      String -> Bool -> Validation
declare String
"There was exactly one description" (Bool -> Validation) -> Bool -> Validation
forall a b. (a -> b) -> a -> b
$
        case DescriptionResult
docResultDescription of
          Description LogSource
_ -> Bool
True
          DescriptionResult
_ -> Bool
False,
      String -> Bool -> Validation
declare String
"There are no pages without alt tags" (Bool -> Validation) -> Bool -> Validation
forall a b. (a -> b) -> a -> b
$ Set LogSource -> Bool
forall a. Set a -> Bool
S.null Set LogSource
docResultImagesWithoutAlt
    ]

produceDocResult :: Link -> Response LB.ByteString -> XML.Document -> DocResult
produceDocResult :: Link -> Response ByteString -> Document -> DocResult
produceDocResult Link
link Response ByteString
resp Document
d =
  DocResult :: [Link]
-> DocTypeResult
-> TitleResult
-> DescriptionResult
-> Set LogSource
-> DocResult
DocResult
    { docResultLinks :: [Link]
docResultLinks = Link -> Document -> [Link]
documentLinks Link
link Document
d,
      docResultDocType :: DocTypeResult
docResultDocType = Response ByteString -> DocTypeResult
documentDocType Response ByteString
resp,
      docResultTitle :: TitleResult
docResultTitle = Document -> TitleResult
documentTitle Document
d,
      docResultDescription :: DescriptionResult
docResultDescription = Document -> DescriptionResult
documentDescription Document
d,
      docResultImagesWithoutAlt :: Set LogSource
docResultImagesWithoutAlt = Document -> Set LogSource
documentImagesWithoutAlt Document
d
    }

documentLinks :: Link -> Document -> [Link]
documentLinks :: Link -> Document -> [Link]
documentLinks Link
link = Link -> Element -> [Link]
elementLinks Link
link (Element -> [Link]) -> (Document -> Element) -> Document -> [Link]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Document -> Element
documentRoot

elementLinks :: Link -> Element -> [Link]
elementLinks :: Link -> Element -> [Link]
elementLinks Link
link Element {[Node]
Map Name LogSource
Name
elementName :: Element -> Name
elementAttributes :: Element -> Map Name LogSource
elementNodes :: Element -> [Node]
elementNodes :: [Node]
elementAttributes :: Map Name LogSource
elementName :: Name
..} =
  ( case Link -> Name -> Map Name LogSource -> Maybe Link
singleElementLink Link
link Name
elementName Map Name LogSource
elementAttributes of
      Maybe Link
Nothing -> [Link] -> [Link]
forall a. a -> a
id
      Just Link
l -> (Link
l Link -> [Link] -> [Link]
forall a. a -> [a] -> [a]
:)
  )
    ([Link] -> [Link]) -> [Link] -> [Link]
forall a b. (a -> b) -> a -> b
$ (Node -> [Link]) -> [Node] -> [Link]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap (Link -> Node -> [Link]
nodeLinks Link
link) [Node]
elementNodes

singleElementLink :: Link -> Name -> Map Name Text -> Maybe Link
singleElementLink :: Link -> Name -> Map Name LogSource -> Maybe Link
singleElementLink Link
link Name
name Map Name LogSource
attrs = do
  (LinkType
typ, LogSource
t) <- case Name
name of
    Name
"a" -> (,) LinkType
A (LogSource -> (LinkType, LogSource))
-> Maybe LogSource -> Maybe (LinkType, LogSource)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Name -> Map Name LogSource -> Maybe LogSource
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Name
"href" Map Name LogSource
attrs
    Name
"link" -> (,) LinkType
LINK (LogSource -> (LinkType, LogSource))
-> Maybe LogSource -> Maybe (LinkType, LogSource)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Name -> Map Name LogSource -> Maybe LogSource
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Name
"href" Map Name LogSource
attrs
    Name
"img" -> (,) LinkType
IMG (LogSource -> (LinkType, LogSource))
-> Maybe LogSource -> Maybe (LinkType, LogSource)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Name -> Map Name LogSource -> Maybe LogSource
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Name
"src" Map Name LogSource
attrs
    Name
_ -> Maybe (LinkType, LogSource)
forall a. Maybe a
Nothing
  let root :: URI
root = Link -> URI
linkUri Link
link
  URI
uri <- URI -> String -> Maybe URI
parseURIRelativeTo URI
root (String -> Maybe URI) -> String -> Maybe URI
forall a b. (a -> b) -> a -> b
$ LogSource -> String
T.unpack LogSource
t
  -- We remove the fragment so that the same uri is not fetched twice.
  let uri' :: URI
uri' = URI
uri {uriFragment :: String
uriFragment = String
""}
  Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ URI -> Maybe URIAuth
uriAuthority URI
uri' Maybe URIAuth -> Maybe URIAuth -> Bool
forall a. Eq a => a -> a -> Bool
== URI -> Maybe URIAuth
uriAuthority URI
root
  Link -> Maybe Link
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Link -> Maybe Link) -> Link -> Maybe Link
forall a b. (a -> b) -> a -> b
$
    Link :: LinkType -> URI -> Word -> Link
Link
      { linkType :: LinkType
linkType = LinkType
typ,
        linkUri :: URI
linkUri = URI
uri',
        linkDepth :: Word
linkDepth = Word -> Word
forall a. Enum a => a -> a
succ (Link -> Word
linkDepth Link
link)
      }

nodeLinks :: Link -> Node -> [Link]
nodeLinks :: Link -> Node -> [Link]
nodeLinks Link
link = \case
  NodeElement Element
e -> Link -> Element -> [Link]
elementLinks Link
link Element
e
  NodeContent LogSource
_ -> []
  NodeComment LogSource
_ -> []
  NodeInstruction Instruction
_ -> []

parseURIRelativeTo :: URI -> String -> Maybe URI
parseURIRelativeTo :: URI -> String -> Maybe URI
parseURIRelativeTo URI
root String
s =
  [Maybe URI] -> Maybe URI
forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum
    [ (URI -> URI -> URI
`relativeTo` URI
root) (URI -> URI) -> Maybe URI -> Maybe URI
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String -> Maybe URI
parseRelativeReference String
s,
      String -> Maybe URI
parseAbsoluteURI String
s
    ]

data DocTypeResult = HtmlDocType | NoDocType | UnknownDocType
  deriving (Int -> DocTypeResult -> ShowS
[DocTypeResult] -> ShowS
DocTypeResult -> String
(Int -> DocTypeResult -> ShowS)
-> (DocTypeResult -> String)
-> ([DocTypeResult] -> ShowS)
-> Show DocTypeResult
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [DocTypeResult] -> ShowS
$cshowList :: [DocTypeResult] -> ShowS
show :: DocTypeResult -> String
$cshow :: DocTypeResult -> String
showsPrec :: Int -> DocTypeResult -> ShowS
$cshowsPrec :: Int -> DocTypeResult -> ShowS
Show, DocTypeResult -> DocTypeResult -> Bool
(DocTypeResult -> DocTypeResult -> Bool)
-> (DocTypeResult -> DocTypeResult -> Bool) -> Eq DocTypeResult
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: DocTypeResult -> DocTypeResult -> Bool
$c/= :: DocTypeResult -> DocTypeResult -> Bool
== :: DocTypeResult -> DocTypeResult -> Bool
$c== :: DocTypeResult -> DocTypeResult -> Bool
Eq)

documentDocType :: Response LB.ByteString -> DocTypeResult
documentDocType :: Response ByteString -> DocTypeResult
documentDocType Response ByteString
resp =
  if ByteString -> CI ByteString
forall s. FoldCase s => s -> CI s
CI.mk (Int64 -> ByteString -> ByteString
LB.take (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int64) -> Int -> Int64
forall a b. (a -> b) -> a -> b
$ ByteString -> Int
SB.length ByteString
"<!DOCTYPE ") (Response ByteString -> ByteString
forall body. Response body -> body
responseBody Response ByteString
resp)) CI ByteString -> CI ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString -> CI ByteString
forall s. FoldCase s => s -> CI s
CI.mk ByteString
"<!DOCTYPE "
    then
      if ByteString -> CI ByteString
forall s. FoldCase s => s -> CI s
CI.mk (Int64 -> ByteString -> ByteString
LB.take (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int64) -> Int -> Int64
forall a b. (a -> b) -> a -> b
$ ByteString -> Int
SB.length ByteString
"<!DOCTYPE html>") (Response ByteString -> ByteString
forall body. Response body -> body
responseBody Response ByteString
resp)) CI ByteString -> CI ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString -> CI ByteString
forall s. FoldCase s => s -> CI s
CI.mk ByteString
"<!DOCTYPE html>"
        then DocTypeResult
HtmlDocType
        else DocTypeResult
UnknownDocType
    else DocTypeResult
NoDocType

data TitleResult
  = NoTitleFound
  | EmptyTitle
  | TitleFound Text
  | NonStandardTitle Element
  deriving (Int -> TitleResult -> ShowS
[TitleResult] -> ShowS
TitleResult -> String
(Int -> TitleResult -> ShowS)
-> (TitleResult -> String)
-> ([TitleResult] -> ShowS)
-> Show TitleResult
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [TitleResult] -> ShowS
$cshowList :: [TitleResult] -> ShowS
show :: TitleResult -> String
$cshow :: TitleResult -> String
showsPrec :: Int -> TitleResult -> ShowS
$cshowsPrec :: Int -> TitleResult -> ShowS
Show, TitleResult -> TitleResult -> Bool
(TitleResult -> TitleResult -> Bool)
-> (TitleResult -> TitleResult -> Bool) -> Eq TitleResult
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TitleResult -> TitleResult -> Bool
$c/= :: TitleResult -> TitleResult -> Bool
== :: TitleResult -> TitleResult -> Bool
$c== :: TitleResult -> TitleResult -> Bool
Eq)

documentTitle :: Document -> TitleResult
documentTitle :: Document -> TitleResult
documentTitle Document
d = case (Name -> Bool) -> Document -> Maybe Element
findDocumentTag (Name -> Name -> Bool
forall a. Eq a => a -> a -> Bool
== Name
"head") Document
d Maybe Element -> (Element -> Maybe Element) -> Maybe Element
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Name -> Bool) -> Element -> Maybe Element
findElementTag (Name -> Name -> Bool
forall a. Eq a => a -> a -> Bool
== Name
"title") of
  Maybe Element
Nothing -> TitleResult
NoTitleFound
  Just e :: Element
e@Element {[Node]
Map Name LogSource
Name
elementNodes :: [Node]
elementAttributes :: Map Name LogSource
elementName :: Name
elementName :: Element -> Name
elementAttributes :: Element -> Map Name LogSource
elementNodes :: Element -> [Node]
..} -> case [Node]
elementNodes of
    [] -> TitleResult
EmptyTitle
    [NodeContent LogSource
t] -> LogSource -> TitleResult
TitleFound LogSource
t
    [Node]
_ -> Element -> TitleResult
NonStandardTitle Element
e

data DescriptionResult
  = NoDescription
  | EmptyDescription
  | MultipleDescriptions
  | Description Text
  | NonStandardDescription Element
  deriving (Int -> DescriptionResult -> ShowS
[DescriptionResult] -> ShowS
DescriptionResult -> String
(Int -> DescriptionResult -> ShowS)
-> (DescriptionResult -> String)
-> ([DescriptionResult] -> ShowS)
-> Show DescriptionResult
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [DescriptionResult] -> ShowS
$cshowList :: [DescriptionResult] -> ShowS
show :: DescriptionResult -> String
$cshow :: DescriptionResult -> String
showsPrec :: Int -> DescriptionResult -> ShowS
$cshowsPrec :: Int -> DescriptionResult -> ShowS
Show, DescriptionResult -> DescriptionResult -> Bool
(DescriptionResult -> DescriptionResult -> Bool)
-> (DescriptionResult -> DescriptionResult -> Bool)
-> Eq DescriptionResult
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: DescriptionResult -> DescriptionResult -> Bool
$c/= :: DescriptionResult -> DescriptionResult -> Bool
== :: DescriptionResult -> DescriptionResult -> Bool
$c== :: DescriptionResult -> DescriptionResult -> Bool
Eq)

documentDescription :: Document -> DescriptionResult
documentDescription :: Document -> DescriptionResult
documentDescription Document
d =
  case (Name -> Bool) -> Document -> Maybe Element
findDocumentTag (Name -> Name -> Bool
forall a. Eq a => a -> a -> Bool
== Name
"head") Document
d of
    Maybe Element
Nothing -> DescriptionResult
NoDescription
    Just Element
headTag ->
      let metaTags :: [Element]
metaTags = (Name -> Bool) -> Element -> [Element]
findElementTags (Name -> Name -> Bool
forall a. Eq a => a -> a -> Bool
== Name
"meta") Element
headTag
          isMetaDescription :: Element -> Bool
isMetaDescription Element
e = Name -> Map Name LogSource -> Maybe LogSource
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Name
"name" (Element -> Map Name LogSource
elementAttributes Element
e) Maybe LogSource -> Maybe LogSource -> Bool
forall a. Eq a => a -> a -> Bool
== LogSource -> Maybe LogSource
forall a. a -> Maybe a
Just LogSource
"description"
       in case (Element -> Bool) -> [Element] -> [Element]
forall a. (a -> Bool) -> [a] -> [a]
filter Element -> Bool
isMetaDescription [Element]
metaTags of
            [] -> DescriptionResult
NoDescription
            [Element
e] -> case Element -> [Node]
elementNodes Element
e of
              [] -> DescriptionResult
-> (LogSource -> DescriptionResult)
-> Maybe LogSource
-> DescriptionResult
forall b a. b -> (a -> b) -> Maybe a -> b
maybe DescriptionResult
EmptyDescription LogSource -> DescriptionResult
Description (Maybe LogSource -> DescriptionResult)
-> Maybe LogSource -> DescriptionResult
forall a b. (a -> b) -> a -> b
$ Name -> Map Name LogSource -> Maybe LogSource
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Name
"content" (Element -> Map Name LogSource
elementAttributes Element
e)
              [Node]
_ -> Element -> DescriptionResult
NonStandardDescription Element
e
            [Element]
_ -> DescriptionResult
MultipleDescriptions

findDocumentTag :: (Name -> Bool) -> Document -> Maybe Element
findDocumentTag :: (Name -> Bool) -> Document -> Maybe Element
findDocumentTag Name -> Bool
p = (Name -> Bool) -> Element -> Maybe Element
findElementTag Name -> Bool
p (Element -> Maybe Element)
-> (Document -> Element) -> Document -> Maybe Element
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Document -> Element
documentRoot

documentImagesWithoutAlt :: Document -> Set Text
documentImagesWithoutAlt :: Document -> Set LogSource
documentImagesWithoutAlt Document
d = [LogSource] -> Set LogSource
forall a. Ord a => [a] -> Set a
S.fromList ([LogSource] -> Set LogSource) -> [LogSource] -> Set LogSource
forall a b. (a -> b) -> a -> b
$
  ((Element -> Maybe LogSource) -> [Element] -> [LogSource])
-> [Element] -> (Element -> Maybe LogSource) -> [LogSource]
forall a b c. (a -> b -> c) -> b -> a -> c
flip (Element -> Maybe LogSource) -> [Element] -> [LogSource]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe ((Name -> Bool) -> Document -> [Element]
findDocumentTags (Name -> Name -> Bool
forall a. Eq a => a -> a -> Bool
== Name
"img") Document
d) ((Element -> Maybe LogSource) -> [LogSource])
-> (Element -> Maybe LogSource) -> [LogSource]
forall a b. (a -> b) -> a -> b
$
    \Element
e -> do
      LogSource
src <- Name -> Map Name LogSource -> Maybe LogSource
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Name
"src" (Element -> Map Name LogSource
elementAttributes Element
e) -- We skip the ones without a 'src' attribute because we cannot identify them.
      case Name -> Map Name LogSource -> Maybe LogSource
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Name
"alt" (Element -> Map Name LogSource
elementAttributes Element
e) of
        Maybe LogSource
Nothing -> LogSource -> Maybe LogSource
forall a. a -> Maybe a
Just LogSource
src
        Just LogSource
"" -> LogSource -> Maybe LogSource
forall a. a -> Maybe a
Just LogSource
src
        Just LogSource
a -> if LogSource -> Bool
T.null (LogSource -> LogSource
T.strip LogSource
a) then LogSource -> Maybe LogSource
forall a. a -> Maybe a
Just LogSource
src else Maybe LogSource
forall a. Maybe a
Nothing

findElementTag :: (Name -> Bool) -> Element -> Maybe Element
findElementTag :: (Name -> Bool) -> Element -> Maybe Element
findElementTag Name -> Bool
p e :: Element
e@Element {[Node]
Map Name LogSource
Name
elementNodes :: [Node]
elementAttributes :: Map Name LogSource
elementName :: Name
elementName :: Element -> Name
elementAttributes :: Element -> Map Name LogSource
elementNodes :: Element -> [Node]
..} =
  Maybe Element
go Maybe Element -> Maybe Element -> Maybe Element
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> [Maybe Element] -> Maybe Element
forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum ((Node -> Maybe Element) -> [Node] -> [Maybe Element]
forall a b. (a -> b) -> [a] -> [b]
map Node -> Maybe Element
goNode [Node]
elementNodes)
  where
    go :: Maybe Element
go = do
      Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Name -> Bool
p Name
elementName)
      Element -> Maybe Element
forall (f :: * -> *) a. Applicative f => a -> f a
pure Element
e
    goNode :: Node -> Maybe Element
    goNode :: Node -> Maybe Element
goNode = \case
      NodeElement Element
e' -> (Name -> Bool) -> Element -> Maybe Element
findElementTag Name -> Bool
p Element
e'
      Node
_ -> Maybe Element
forall a. Maybe a
Nothing

findDocumentTags :: (Name -> Bool) -> Document -> [Element]
findDocumentTags :: (Name -> Bool) -> Document -> [Element]
findDocumentTags Name -> Bool
p = (Name -> Bool) -> Element -> [Element]
findElementTags Name -> Bool
p (Element -> [Element])
-> (Document -> Element) -> Document -> [Element]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Document -> Element
documentRoot

findElementTags :: (Name -> Bool) -> Element -> [Element]
findElementTags :: (Name -> Bool) -> Element -> [Element]
findElementTags Name -> Bool
p e :: Element
e@Element {[Node]
Map Name LogSource
Name
elementNodes :: [Node]
elementAttributes :: Map Name LogSource
elementName :: Name
elementName :: Element -> Name
elementAttributes :: Element -> Map Name LogSource
elementNodes :: Element -> [Node]
..} =
  [Element] -> [Element]
go ((Node -> [Element]) -> [Node] -> [Element]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Node -> [Element]
goNode [Node]
elementNodes)
  where
    go :: [Element] -> [Element]
go = if Name -> Bool
p Name
elementName then (Element
e Element -> [Element] -> [Element]
forall a. a -> [a] -> [a]
:) else [Element] -> [Element]
forall a. a -> a
id
    goNode :: Node -> [Element]
    goNode :: Node -> [Element]
goNode = \case
      NodeElement Element
e' -> (Name -> Bool) -> Element -> [Element]
findElementTags Name -> Bool
p Element
e'
      Node
_ -> []