{-# OPTIONS -fno-warn-unused-imports #-} -- ------------------------------------------------------------ {- | Module : Text.XML.HXT.Arrow.XmlCache Copyright : Copyright (C) 2009 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : experimental Portability: portable Caching of XML document trees and other binary data -} -- ------------------------------------------------------------ module Text.XML.HXT.Arrow.XmlCache ( withCache , withoutCache , isInCache , lookupCache , readCache , writeCache , sha1HashValue , sha1HashString ) where import Control.DeepSeq import Control.Concurrent.ResourceTable import Control.Exception ( SomeException , try ) import Data.Binary import qualified Data.ByteString.Lazy as B import Data.Char import Data.Either import Data.Maybe import Data.Digest.Pure.SHA import System.FilePath import System.Directory import System.IO import System.Locale import System.Posix ( touchFile ) import System.Time import System.IO.Unsafe ( unsafePerformIO ) import Text.XML.HXT.Core import Text.XML.HXT.Arrow.XmlState.TypeDefs import Text.XML.HXT.Arrow.Binary -- ------------------------------------------------------------ -- | withCache enables reading documents with caching. -- -- When the cache is configured and enabled, every document read and parsed is serialized and stored in binary -- form in the cache. When reading the same document again, it is just deserialized, no parsing is performed. -- -- The cache is configured by a path pointing to a directory for storing the documents, -- by a maximal time span in second for valid documents. After that time span, the documents are read again -- and the cache is updated. -- The flag contols, whether documents returning 404 or other errors will be cached. -- If set, the cache is even activated for 404 (not found) responses, default is false. -- -- The serialized documents can be compressed, e.g. with bzip, to save disk space and IO time. -- The compression can be configured by 'Text.XML.HXT.Arrow.XmlState.withCompression' -- -- example: -- -- > import Text.XML.HXT.Core -- > import Text.XML.HXT.Cache -- > import Codec.Compression.BZip (compress, decompress) -- > ... -- > readDocument [ withCache "/tmp/cache" 3600 False -- > , withCompression (compress, decompress) -- > , .... -- > ] "http://www.haskell.org/" -- > -- -- In the example the document is read and stored in binary serialized form under \/tmp\/cache. -- The cached document remains valid for the next hour. -- It is compressed, before written to disk. withCache :: String -> Int -> Bool -> SysConfig withCache cachePath documentAge cache404 = setS (theWithCache .&&&. theCacheDir .&&&. theDocumentAge .&&&. theCache404Err .&&&. theCacheRead ) (True, (cachePath, (documentAge, (cache404, readDocCache)))) -- | Disable use of cache withoutCache :: SysConfig withoutCache = setS theWithCache False -- ------------------------------------------------------------ readDocCache :: String -> IOStateArrow s b XmlTree readDocCache src = localSysVar theWithCache $ configSysVar withoutCache >>> ( flip readDocCache' src $< getSysVar (theCacheDir .&&&. theDocumentAge .&&&. theCache404Err ) ) where readDocCache' config src' = applyA $ arrIO0 (lookupCache' config src') -- ------------------------------------------------------------ -- | Predicate arrow for checking if a document is in the cache. -- The arrow fails if document not there or is not longer valid, else the file name is returned. isInCache :: IOStateArrow s String String isInCache = uncurry isInC $< getSysVar (theDocumentAge .&&&. theCacheDir) where isInC age cdir = ( traceValue 2 (\ x -> "isInCache: file=" ++ show x ++ " age=" ++ show age ++ " cache dir=" ++ show cdir) >>> arrIO (isInCache' age cdir) >>> arrL ( \ x -> case x of Just Nothing -> [x] _ -> [] ) ) `guards` this isInCache' age cdir f = cacheHit age cf where cf = uncurry () $ cacheFile cdir f -- ------------------------------------------------------------ lookupCache' :: (FilePath, (Int, Bool)) -> String -> IO (IOStateArrow s a XmlTree) lookupCache' (dir, (age, e404)) src = do ch <- cacheHit age cf return $ case ch of Nothing -> readAndCacheDocument Just Nothing -> readDocumentFromCache Just (Just mt) -> readDocumentCond mt where cf = uncurry () $ cacheFile dir src is200 | e404 = hasAttrValue transferStatus (`elem` ["200", "404"]) | otherwise = hasAttrValue transferStatus (== "200") is304 = hasAttrValue transferStatus (== "304") readDocumentFromCache = traceMsg 1 ("cache hit for " ++ show src ++ " reading " ++ show cf) >>> ( readCache' cf >>> traceMsg 2 "cache read" ) `orElse` ( clearErrStatus >>> traceMsg 1 "cache file was corrupted, reading original" >>> readAndCacheDocument ) readAndCacheDocument = traceMsg 1 ("cache miss, reading original document " ++ show src) >>> readDocument [] src >>> perform ( choiceA [ is200 :-> ( writeCache src >>> none ) , this :-> traceMsg 1 "transfer status /= 200, page not cached" ] ) readDocumentCond mt = traceMsg 1 ("cache out of date, read original document if modified " ++ show src) >>> readDocument [withInputOption a_if_modified_since (fmtTime mt)] src >>> choiceA [ is304 :-> ( traceMsg 1 ("document not modified, using cache data from " ++ show cf) >>> perform (arrIO0 $ touchFile cf) >>> readDocumentFromCache ) , is200 :-> ( traceMsg 1 "document read, cache will be updated" >>> perform (writeCache src >>> traceMsg 2 "cache is updated" ) ) , this :-> ( traceMsg 1 "document read without caching" >>> perform ( arrIO0 $ remFile cf ) ) ] where fmtTime = formatCalendarTime defaultTimeLocale rfc822DateFormat . toUTCTime -- ------------------------------------------------------------ lookupCache :: (NFData b, Binary b) => String -> IOStateArrow s a b lookupCache f = uncurry lookupC $< getSysVar (theDocumentAge .&&&. theCacheDir) where lookupC age cdir = isIOA (const $ hit) `guards` readCache' cf where cf = uncurry () $ cacheFile cdir f hit = do ch <- cacheHit age cf return $ case ch of Just Nothing -> True _ -> False -- ------------------------------------------------------------ readCache :: (NFData c, Binary c) => String -> IOStateArrow s b c readCache f = readC $< getSysVar theCacheDir where readC cdir = readCache' $ uncurry () $ cacheFile cdir f readCache' :: (NFData c, Binary c) => String -> IOStateArrow s b c readCache' cf = rnfA $ withLock cf $ readBinaryValue cf writeCache :: (Binary b) => String -> IOStateArrow s b () writeCache f = writeC $< getSysVar theCacheDir where writeC cdir = traceMsg 1 ("writing cache file " ++ show cf ++ " for document " ++ show f) >>> perform (arrIO0 createDir) >>> withLock cf (writeBinaryValue cf) >>> perform (withLock ixf (arrIO0 $ writeIndex ixf f cf)) where cf = dir file ixf = cdir "index" (dir, file) = cacheFile cdir f createDir = createDirectoryIfMissing True dir -- ------------------------------------------------------------ remFile :: FilePath -> IO () remFile f = ( try' $ do ex <- doesFileExist f if ex then removeFile f else return () ) >> return () -- ------------------------------------------------------------ cacheFile :: FilePath -> String -> (FilePath, FilePath) cacheFile dir f = (dir fd, fn) where (fd, fn) = splitAt 2 . sha1HashString $ f -- ------------------------------------------------------------ -- result interpretation for cacheHit -- -- Nothing : cache miss: get document -- Just Nothing : cache hit, cache data valid: use cache data -- Just (Just t) : cache hit, but cache data out of date: get document conditionally with if-modified-since t cacheHit :: Int -> FilePath -> IO (Maybe (Maybe ClockTime)) cacheHit age cf = ( try' $ do e <- doesFileExist cf if not e then return Nothing else do mt <- getModificationTime cf ct <- getClockTime return . Just $ if (dt `addToClockTime` mt) >= ct then Nothing else Just mt ) >>= return . either (const Nothing) id where seconds = age `mod` 60 minutes = age `div` 60 dt = normalizeTimeDiff $ TimeDiff 0 0 0 0 minutes seconds 0 try' :: IO a -> IO (Either SomeException a) try' = try writeIndex :: String -> String -> FilePath -> IO () writeIndex ixf f cf = ( try' $ do h <- openBinaryFile ixf AppendMode hPutStrLn h $ show (cf, f) hClose h return () ) >> return () -- ------------------------------------------------------------ -- | Compute the SHA1 hash is hexadecimal format for an arbitray serializable value sha1HashValue :: (Arrow a, Binary b) => a b Integer sha1HashValue = arr $ integerDigest . sha1 . encode sha1HashString :: (Arrow a, Binary b) => a b String sha1HashString = arr $ showDigest . sha1 . encode -- ------------------------------------------------------------ -- | the internal table of file locks theLockedFiles :: ResourceTable String theLockedFiles = unsafePerformIO newResourceTable {-# NOINLINE theLockedFiles #-} lockFile, unlockFile :: String -> IO () lockFile = requestResource theLockedFiles unlockFile = releaseResource theLockedFiles withLock :: String -> IOStateArrow s b c -> IOStateArrow s b c withLock l a = ( perform (arrIO0 $ lockFile l) >>> listA a >>> perform (arrIO0 $ unlockFile l) ) >>> unlistA -----------------------------------------------------------------------------