module Holumbus.Crawler.IndexerCore
( RawDoc
, RawContexts
, RawContext
, RawWords
, RawWord
, RawTitle
, IndexCrawlerConfig
, IndexContextConfig(..)
, IndexerState(..)
, emptyIndexerState
, indexCrawlerConfig
, stdIndexer
, unionIndexerStatesM
, insertRawDocM
)
where
import Control.DeepSeq
import Control.Monad ( foldM )
import Control.Monad.Trans
import Data.Binary ( Binary )
import qualified Data.Binary as B
import Data.Function.Selector
import Data.Maybe
import Holumbus.Crawler
import Holumbus.Index.Common hiding ( URI )
import Text.XML.HXT.Core
type RawDoc c = (RawContexts, RawTitle, Maybe c)
type RawContexts = [RawContext]
type RawContext = (Context, RawWords)
type RawWords = [RawWord]
type RawWord = (Word, Position)
type RawTitle = String
type IndexCrawlerConfig i d c = CrawlerConfig (RawDoc c) (IndexerState i d c)
type IndexCrawlerState i d c = CrawlerState (IndexerState i d c)
data IndexContextConfig = IndexContextConfig
{ ixc_name :: String
, ixc_collectText :: IOSArrow XmlTree String
, ixc_textToWords :: String -> [String]
, ixc_boringWord :: String -> Bool
}
data IndexerState i d c = IndexerState
{ ixs_index :: ! i
, ixs_documents :: ! (d c)
} deriving (Show)
instance (NFData i, NFData (d c)) => NFData (IndexerState i d c)
where
rnf IndexerState { ixs_index = i
, ixs_documents = d
} = rnf i `seq` rnf d
instance (Binary i, Binary (d c)) => Binary (IndexerState i d c)
where
put s = B.put (ixs_index s)
>>
B.put (ixs_documents s)
get = do
ix <- B.get
dm <- B.get
return $ IndexerState
{ ixs_index = ix
, ixs_documents = dm
}
instance (XmlPickler i, XmlPickler (d c)) => XmlPickler (IndexerState i d c)
where
xpickle = xpElem "index-state" $
xpWrap ( uncurry IndexerState
, \ ix -> (ixs_index ix, ixs_documents ix)
) $
xpPair xpickle xpickle
emptyIndexerState :: i -> d c -> IndexerState i d c
emptyIndexerState eix edm = IndexerState
{ ixs_index = eix
, ixs_documents = edm
}
stdIndexer :: ( Binary i
, Binary (d c)
, Binary c
, HolIndexM IO i
, HolDocuments d c
, NFData i
, NFData (d c)
, NFData c) =>
IndexCrawlerConfig i d c
-> Maybe String
-> [URI]
-> IndexerState i d c
-> IO (IndexCrawlerState i d c)
stdIndexer config resumeLoc startUris eis
= execCrawler action config (initCrawlerState eis)
where
action = do
noticeC "indexerCore" ["indexer started"]
res <- maybe (crawlDocs startUris) crawlerResume $ resumeLoc
noticeC "indexerCore" ["indexer finished"]
return res
indexCrawlerConfig :: ( HolIndexM IO i
, HolDocuments d c
, HolDocIndex d c i
, NFData i
, NFData c
, NFData (d c)
) =>
SysConfig
-> (URI -> Bool)
-> Maybe (IOSArrow XmlTree String)
-> Maybe (IOSArrow XmlTree XmlTree)
-> Maybe (IOSArrow XmlTree String)
-> Maybe (IOSArrow XmlTree c)
-> [IndexContextConfig]
-> IndexCrawlerConfig i d c
indexCrawlerConfig opts followRef getHrefF preDocF titleF0 customF0 contextCs
= addSysConfig (defaultOpts >>> opts)
>>>
( setS theFollowRef followRef )
>>>
( setS theProcessRefs $ fromMaybe getHtmlReferences getHrefF )
>>>
( setS thePreDocFilter $ fromMaybe checkDocumentStatus preDocF )
>>>
( setS theProcessDoc rawDocF )
>>>
enableRobotsTxt
>>>
addRobotsNoFollow
>>>
addRobotsNoIndex
$
defaultCrawlerConfig insertRawDocM unionIndexerStatesM
where
rawDocF = ( listA contextFs
&&&
titleF
&&&
customF
)
>>^ (\ (x3, (x2, x1)) -> (x3, x2, x1))
titleF = ( fromMaybe (constA "") titleF0 ) >. concat
customF = ( fromMaybe none customF0 ) >. listToMaybe
contextFs :: IOSArrow XmlTree RawContext
contextFs = catA . map contextF $ contextCs
contextF :: IndexContextConfig -> IOSArrow XmlTree RawContext
contextF ixc = constA (ixc_name ixc)
&&&
( ixc_collectText ixc >. processText )
where
processText :: [String] -> RawWords
processText = concat
>>>
ixc_textToWords ixc
>>>
flip zip [1..]
>>>
filter (fst >>> ixc_boringWord ixc >>> not)
defaultOpts = withRedirect yes
>>>
withAcceptedMimeTypes ["text/html", "text/xhtml"]
>>>
withInputEncoding isoLatin1
>>>
withEncodingErrors no
>>>
withValidate no
>>>
withParseHTML yes
>>>
withWarnings no
unionIndexerStatesM :: ( MonadIO m
, HolIndexM m i
, HolDocuments d c
, HolDocIndex d c i
) =>
IndexerState i d c
-> IndexerState i d c
-> m (IndexerState i d c)
unionIndexerStatesM ixs1 ixs2
= return
$! IndexerState { ixs_index = ix
, ixs_documents = dt
}
where
(! dt, ! ix) = unionDocIndex dt1 ix1 dt2 ix2
ix1 = ixs_index ixs1
ix2 = ixs_index ixs2
dt1 = ixs_documents ixs1
dt2 = ixs_documents ixs2
insertRawDocM :: ( MonadIO m
, HolIndexM m i
, HolDocuments d c
, NFData i
, NFData c
, NFData (d c)
) =>
(URI, RawDoc c)
-> IndexerState i d c
-> m (IndexerState i d c)
insertRawDocM (rawUri, (rawContexts, rawTitle, rawCustom)) ixs
| nullContexts = return ixs
| otherwise = do
newIx <- foldM (insertRawContextM did) (ixs_index ixs) $ rawContexts
newIxs <- return $
IndexerState { ixs_index = newIx
, ixs_documents = newDocs
}
rnf newIxs `seq`
return newIxs
where
nullContexts = and . map (null . snd) $ rawContexts
(did, newDocs) = insertDoc (ixs_documents ixs) doc
doc = Document
{ title = rawTitle
, uri = rawUri
, custom = rawCustom
}
insertRawContextM :: (Monad m, HolIndexM m i) =>
DocId -> i -> (Context, [(Word, Position)]) -> m i
insertRawContextM did ix (cx, ws)
= foldM (insWordM cx did) ix ws
insWordM :: (Monad m, HolIndexM m i) =>
Context -> DocId -> i -> (Word, Position) -> m i
insWordM cx' did' ix' (w', p') = insertPositionM cx' w' did' p' ix'