-- | Module : Codec.Archive.Zim.Parser -- Description : API for parsing ZIM files -- Copyright : (c) Robbin C. -- License : GPLv3 -- Maintainer : Robbin C. -- Stability : unstable -- Portability : portable -- -- This is a library for parsing ZIM () files. ZIM files -- contain offline web content (eg, Wikipedia) which can be browsed locally -- without an Internet connection. -- -- The API is meant to be intuitive for normal use-cases. -- -- To get content for "A/index.htm" from ZIM file "file.zim": -- -- > > mimeContent <- "file.zim" `getContent` Url "A/index.htm" -- > > :t mimeContent -- > mimeContent :: Maybe (B8.ByteString, BL.ByteString) -- > > print mimeContent -- > Just ("text/html", "...") -- -- The above will open the file, parse the ZIM header, lookup the -- MIME type and content of the URL, close the file and return the MIME type and -- content as a pair. Note that content is a lazy bytestring. -- -- The above operation should suffice for a simple webserver serving a ZIM file. -- For finer control, it is possible to cache and reuse the file handle and the -- ZIM header. -- -- > > hdl <- openBinaryFile "file.zim" ReadMode -- > > hdr <- getHeader hdl -- > > :t hdr -- > hdr :: ZimHeader -- > > (hdl, hdr) `getContent` Url "A/index.htm" -- > Just ("text/html", "...") -- -- ZIM files of Wikimedia Foundation (Wikipedia, Wikibooks, etc) can be -- found at http://ftpmirror.your.org/pub/kiwix/zim. -- -- Below is a full example of a Scotty web server that serves a ZIM file -- (specified on command line) on localhost port 3000: -- -- > {-# LANGUAGE OverloadedStrings #-} -- > -- > import Control.Monad.IO.Class (liftIO) -- > import Data.Text.Lazy (toStrict, fromStrict) -- > import Data.Text.Encoding (decodeUtf8, encodeUtf8) -- > import System.Environment (getArgs) -- > import Network.HTTP.Types.Status (status404) -- > import Web.Scotty -- > import Codec.Archive.Zim.Parser (getMainPageUrl, getContent, Url(..)) -- > -- > main :: IO () -- > main = do -- > [fp] <- getArgs -- > scotty 3000 $ do -- > get "/" (redirectToZimMainPage fp) -- > get (regex "^/(./.*)$") (serveZimUrl fp) -- > notFound $ text "Invalid URL!" -- > -- > redirectToZimMainPage :: FilePath -> ActionM () -- > redirectToZimMainPage fp = do -- > res <- liftIO $ getMainPageUrl fp -- > case res of -- > Nothing -> do -- > status status404 -- > text "This ZIM file has no main page specified!" -- > Just (Url url) -> redirect . fromStrict $ decodeUtf8 url -- > -- > serveZimUrl :: FilePath -> ActionM () -- > serveZimUrl fp = do -- > url <- (encodeUtf8 . toStrict) <$> param "1" -- > res <- liftIO $ fp `getContent` Url url -- > case res of -- > Nothing -> do -- > liftIO . putStrLn $ "Invalid URL: " ++ show url -- > status status404 -- > text $ "Invalid URL!" -- > Just (mimeType, content) -> do -- > liftIO . putStrLn $ "Serving: " ++ show url -- > setHeader "Content-Type" (fromStrict $ decodeUtf8 mimeType) -- > raw content -- -- Feedback and contributions are welcome on . {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE FlexibleInstances #-} module Codec.Archive.Zim.Parser ( -- * Functions getHeader , getMimeList , getDE , getMainPageUrl , getCluster , getBlob , getContent , searchDE , MimeList , mkNsTitle , mkNsTitlePrefix , mkNsUrl , RunZim , ZimGetDE , ZimSearchDE , ZimGetContent -- * Exceptions , ZimException(..) -- * ZIM Header , ZimHeader(..) -- * ZIM Directory Entry , ZimDirEntType(..) , ZimDirEnt(..) , UrlIndex(..) , TitleIndex(..) , ClusterNumber(..) , BlobNumber(..) , Cluster(..) , Blob(..) , Url(..) , Title , TitlePrefix -- * ZIM file format -- | Following is a short summary of the ZIM file format. -- The authoritative reference is at http://www.openzim.org/wiki/ZIM_file_format. -- -- === 1. ZIM header -- This is an 80-byte header (see 'ZimHeader'). Among other things, it contains file offsets to the below. -- -- === 2. List of MIME types -- This is a sequence of null-terminated strings (eg. @text\/html@, @text\/javascript@). The last string is zero length, so -- the end always consists of 2 consecutive null bytes. -- -- === 3. List of URLs -- This is a sequence of 8-byte file offsets, each pointing to a directory entry. This list is sorted by the directory entries' URL. -- -- 'getZimDirEntByUrlIndex' looks up this table to return a directory entry. -- -- === 4. List of Titles -- This is a sequence of 4-byte indices, each pointing to a URL above (which in turn point to a directory entry). -- This list is sorted by the directory entries' Title. -- -- 'getZimDirEntByTitleIndex' uses this table to return a directory entry. -- -- === 5. Directory Entries -- This is a sequence of Directory Entries (see 'ZimDirEnt'). -- The first 2 bytes determine the type of this entry, which also determine the length. -- Contents include: -- -- ==== a. MIME type -- This 2-byte field means: -- -- [@0xffff@] This directory entry is a 'ZimRedirectEntry'. -- [@0xfffe@] This directory entry is a 'ZimLinkTarget'. -- [@0xfffd@] This directory entry is a 'ZimDeletedEntry'. -- [@any other value@] This directory entry is a 'ZimArticleEntry' and this index into the MIME list from above determines its MIME type. -- -- ==== b. Namespace -- This single character determines the directory entry's namespace. (eg. __A__ for articles, __I__ for images, etc.) -- The comprehensive list is at http://www.openzim.org/wiki/ZIM_file_format#Namespaces. -- -- ==== c. Cluster and Blob number -- Only for 'ZimArticleEntry', this is the directory entry's Cluster and Blob number. -- The Cluster number is a 4-byte index into the list of Clusters below. -- The Blob number refers to a block inside the (decompressed) cluster. -- Together, they provide the content of this directory entry. -- -- ==== d. URL and Title -- These 2 null-terminated strings represent the URL and Title of this directory entry respectively. -- If the Title is empty, it is taken to be the same as the URL. -- -- === 6. List of Clusters -- This is a list of 8-byte file offsets, each pointing to a cluster in the file. -- The end of a cluster is also the start of the next cluster. -- Therefore, the length of a cluster is the difference between the adjacent offsets. -- For the last cluster, the end is the Checksum file offset, as the Checksum is always -- the last 16 bytes of a ZIM file. -- -- ==== a. Compression Type -- The first byte of the cluster determines if it is uncompressed (eg. PNG image) or compressed with LZMA (eg. HTML). -- -- [@0 or 1@] No compression -- [@4@] Compressed with LZMA -- -- ==== b. List of Blobs -- This is a list of 4-byte offsets, each pointing inside this cluster. -- The end of a blob is also the start of the next blob. -- Therefore, the length of a blob is the difference between the adjacent offsets. -- The last offset points to the end of the data area so there is always one more offset than blobs. ) where import Control.Applicative ((<$>), (<*>)) import Control.Exception (Exception, throw) import Control.Monad (when) import Data.Char (chr) import Data.Maybe (fromJust) import Data.Typeable (Typeable) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as B8 import qualified Data.ByteString.Lazy as BL import System.IO (Handle, IOMode(ReadMode), withBinaryFile) import Data.Conduit (($$), (=$), await, Sink) import Control.Monad.Trans.Resource (runResourceT) import Data.Conduit.Binary (sourceHandleRange, sourceLbs, sinkLbs) import Data.Conduit.Serialization.Binary (sinkGet, conduitGet) import Data.Conduit.Lzma (decompress) import Data.Array.IArray ((!), listArray, Array) import Data.Binary.Get (Get, skip, getWord8, getWord16le, getWord32le, getWord64le, getByteString, getLazyByteStringNul, getRemainingLazyByteString) import Numeric (showHex) -- | Other than the below, ErrorCall can be thrown by LZMA library if there is a problem with decompression. data ZimException = ZimInvalidMagic -- ^ ZIM file has invalid magic number (anything other than 72173914). | ZimParseError String -- ^ There is an error in parsing. | ZimIncompleteInput -- ^ There is insufficient bytes required to parse. | ZimInvalidIndex Int -- ^ The given index (URL, title or cluster) is out of bounds for this ZIM file. deriving (Show, Typeable) instance Exception ZimException -- | See http://www.openzim.org/wiki/ZIM_file_format#Header for more details. data ZimHeader = ZimHeader { -- | Magic Number of file (somewhat superfluous as 'getZimHeader' will throw an exception if magic number is anything other than 72173914) zimMagicNumber :: Int -- | Version of ZIM header , zimVersion :: Int -- | UUID of file , zimUuid :: B.ByteString -- | Number of articles , zimArticleCount :: Int -- | Number of clusters , zimClusterCount :: Int -- | Position of sorted URL pointers , zimUrlPtrPos :: Integer -- | Position of sorted Title pointers , zimTitlePtrPos :: Integer -- | Position of Cluster pointers , zimClusterPtrPos :: Integer -- | Position of MIME list , zimMimeListPos :: Integer -- | Index of main page , zimMainPage :: Maybe Int -- | Index of layout page , zimLayoutPage :: Maybe Int -- | Position of MD5 checksum , zimChecksumPos :: Integer } deriving (Show, Eq) -- | There are 4 types of directory entries. Most content in a ZIM file are -- usually 'ZimArticleEntry' or 'ZimRedirectEntry'. data ZimDirEntType = ZimArticleEntry | ZimRedirectEntry | ZimLinkTarget | ZimDeletedEntry deriving (Eq, Show) -- | See http://www.openzim.org/wiki/ZIM_file_format#Directory_Entries for more details. data ZimDirEnt = ZimDirEnt { -- | Type of this Directory Entry zimDeType :: ZimDirEntType -- | Index into MIME list given by 'getZimMimeList' , zimDeMimeType :: Int -- | Parameter Length , zimDeParameterLen :: Int -- | Namespace , zimDeNamespace :: Char -- | Revision , zimDeRevision :: Int -- | Redirect Index (only applicable for 'ZimRedirectEntry') , zimDeRedirectIndex :: Maybe Int -- | Content is stored in this Cluster Number (only applicable for 'ZimArticleEntry') , zimDeClusterNumber :: Maybe Int -- | Content is stored in this Blob Number (only applicable for 'ZimArticleEntry') , zimDeBlobNumber :: Maybe Int -- | URL , zimDeUrl :: B8.ByteString -- | Title , zimDeTitle :: B8.ByteString -- , zimDeParameter :: BL.ByteString -- unused } deriving (Eq, Show) -- | List of Mime Types type MimeList = Array Int B8.ByteString -- | Wrapper for URL index newtype UrlIndex = UrlIndex Int deriving (Eq, Ord, Show) -- | Wrapper for Title index newtype TitleIndex = TitleIndex Int deriving (Eq, Ord, Show) -- | Wrapper for Cluster number newtype ClusterNumber = ClusterNumber Int deriving (Eq, Ord, Show) -- | Wrapper for Blob number newtype BlobNumber = BlobNumber Int deriving (Eq, Ord, Show) -- | Wrapper for Url newtype Url = Url B.ByteString deriving (Eq, Ord, Show) -- | Construct a Url with a Namespace. mkNsUrl :: Char -> B.ByteString -> Url mkNsUrl c s = Url $ c `B8.cons` '/' `B8.cons` s -- | Wrapper for Title newtype Title = Title B.ByteString deriving (Eq, Ord, Show) -- | Construct a Title with a Namespace. mkNsTitle :: Char -> B.ByteString -> Title mkNsTitle c s = Title $ c `B8.cons` '/' `B8.cons` s -- | Wrapper for Title Prefix newtype TitlePrefix = TitlePrefix B.ByteString deriving (Eq, Ord, Show) -- | Construct a TitlePrefix with a Namespace. mkNsTitlePrefix :: Char -> B.ByteString -> TitlePrefix mkNsTitlePrefix c s = TitlePrefix $ c `B8.cons` '/' `B8.cons` s -- | Wrapper for Cluster newtype Cluster = Cluster {unCluster :: BL.ByteString} -- | Wrapper for Blob newtype Blob = Blob {unBlob :: BL.ByteString} parseZimHeader :: Get ZimHeader parseZimHeader = do magicNumber <- fromIntegral <$> getWord32le when (magicNumber /= 72173914) $ throw ZimInvalidMagic version <- fromIntegral <$> getWord32le uuid <- getByteString 16 articleCount <- fromIntegral <$> getWord32le clusterCount <- fromIntegral <$> getWord32le urlPtrPos <- fromIntegral <$> getWord64le titlePtrPos <- fromIntegral <$> getWord64le clusterPtrPos <- fromIntegral <$> getWord64le mimeListPos <- fromIntegral <$> getWord64le mainPage <- fromIntegral <$> getWord32le layoutPage <- fromIntegral <$> getWord32le checksumPos <- fromIntegral <$> getWord64le return $ ZimHeader magicNumber version uuid articleCount clusterCount urlPtrPos titlePtrPos clusterPtrPos mimeListPos (if mainPage == 0xffffffff then Nothing else Just mainPage) (if layoutPage == 0xffffffff then Nothing else Just layoutPage) checksumPos -- | Instances of this class represent a Zim File and are able to perform ZIM operations (getMimeList, getContent, etc). Valid instances include a Handle to a ZIM file, a FilePath to a ZIM file, or a (Handle, ZimHeader) where ZimHeader is parsed previously (so it does not need to be reparsed). class RunZim h where runZim :: h -> (Handle -> ZimHeader -> IO a) -> IO a instance RunZim Handle where runZim hdl f = do hdr <- src $$ sinkGet parseZimHeader f hdl hdr where (pos, len) = (Just 0, Just 80) src = sourceHandleRange hdl pos len instance RunZim (Handle, ZimHeader) where runZim x f = uncurry f x instance RunZim FilePath where runZim fp f = withBinaryFile fp ReadMode $ \hdl -> runZim hdl f getHeader :: RunZim h => h -> IO ZimHeader getHeader h = runZim h $ \_ hdr -> return hdr -- Parses a list of null-terminated byte sequence. -- Last entry is zero length (end of block is always 2 null bytes). parseByteStringsNul :: Sink B8.ByteString IO [B8.ByteString] parseByteStringsNul = conduitGet getLazyByteStringNul =$ loop id where loop :: ([B8.ByteString] -> [B8.ByteString]) -> Sink BL.ByteString IO [B8.ByteString] loop front = await >>= maybe (return $ front []) (\x -> let bs = BL.toStrict x in if B8.null bs then return (front []) else loop (front . (bs:)) ) getMimeList :: RunZim h => h -> IO MimeList getMimeList h = runZim h $ \hdl hdr -> do let (pos, len) = (Just $ zimMimeListPos hdr, Nothing) src = sourceHandleRange hdl pos len mimeList <- src $$ parseByteStringsNul return $ listArray (0, length mimeList) mimeList parseZimDirEnt :: Get ZimDirEnt parseZimDirEnt = do mimeType <- fromIntegral <$> getWord16le :: Get Int parmLen <- fromIntegral <$> getWord8 namespace <- chr . fromIntegral <$> getWord8 revision <- fromIntegral <$> getWord32le -- parsing of next 3 fields depends on mimeType let deType = case mimeType of 0xffff -> ZimRedirectEntry 0xfffe -> ZimLinkTarget 0xfffd -> ZimDeletedEntry _ -> ZimArticleEntry (redirectIndex, clusterNumber, blobNumber ) <- case deType of ZimArticleEntry -> (\x y -> (Nothing, Just $ fromIntegral x, Just $ fromIntegral y)) <$> getWord32le <*> getWord32le ZimRedirectEntry -> (\x -> (Just $ fromIntegral x, Nothing, Nothing)) <$> getWord32le ZimLinkTarget -> skip 8 >> return (Nothing, Nothing, Nothing) ZimDeletedEntry -> skip 8 >> return (Nothing, Nothing, Nothing) :: Get (Maybe Int, Maybe Int, Maybe Int) url <- BL.toStrict <$> getLazyByteStringNul title <- BL.toStrict <$> getLazyByteStringNul return $ ZimDirEnt deType mimeType parmLen namespace revision redirectIndex clusterNumber blobNumber url -- specs: title is same as url if title is empty (if B.null title then url else title) class ZimGetDE k where getDE :: RunZim h => h -> k -> IO ZimDirEnt instance ZimGetDE UrlIndex where getDE h (UrlIndex i) = runZim h $ \hdl hdr -> do let urlPtrPos = Just $ zimUrlPtrPos hdr + 8 * fromIntegral i when (i < 0 || i >= zimArticleCount hdr) . throw $ ZimInvalidIndex i dePos <- sourceHandleRange hdl urlPtrPos Nothing $$ sinkGet getWord64le let srcDirEnt = sourceHandleRange hdl (Just $ fromIntegral dePos) Nothing srcDirEnt $$ sinkGet parseZimDirEnt instance ZimGetDE TitleIndex where getDE h (TitleIndex i) = runZim h $ \hdl hdr -> do let titlePtrPos = Just $ zimTitlePtrPos hdr + 4 * fromIntegral i srcTitle = sourceHandleRange hdl titlePtrPos Nothing when (i < 0 || i >= zimArticleCount hdr) . throw $ ZimInvalidIndex i urlIndex <- srcTitle $$ sinkGet getWord32le (hdl, hdr) `getDE` (UrlIndex $ fromIntegral urlIndex) getCluster :: RunZim h => h -> ClusterNumber -> IO Cluster getCluster h (ClusterNumber i) = runZim h $ \hdl hdr -> do let limit = zimClusterCount hdr - 1 when (i < 0 || i > limit) . throw $ ZimInvalidIndex i let clusterPos = Just $ zimClusterPtrPos hdr + 8 * fromIntegral i src = sourceHandleRange hdl clusterPos Nothing (pos0, pos1) <- src $$ sinkGet $ (,) <$> getWord64le <*> getWord64le -- length of last cluster is determined by checksum pos instead of next cluster pos let len = if i == limit then fromIntegral (zimChecksumPos hdr) - pos0 else pos1 - pos0 toI = Just . fromIntegral srcCluster = sourceHandleRange hdl (toI pos0) (toI len) bs <- srcCluster $$ sinkGet getRemainingLazyByteString case BL.uncons bs of Just (0, cluster) -> return $ Cluster cluster Just (1, cluster) -> return $ Cluster cluster Just (4, cluster) -> Cluster <$> (runResourceT $ sourceLbs cluster $$ decompress Nothing =$ sinkLbs) Just (x, _) -> throw . ZimParseError $ "Cluster " ++ show i ++ " (offset: " ++ showHex pos0 "" ++ ", length: " ++ show len ++ ") compressed with unsupported type: " ++ show x Nothing -> throw . ZimParseError $ "Insufficient bytes for cluster " ++ show i getBlob :: RunZim h => h -> (ClusterNumber, BlobNumber) -> IO Blob getBlob h (c, BlobNumber b) = do Cluster cluster <- h `getCluster` c let src = sourceLbs (BL.drop (4 * fromIntegral b) cluster) (pos0, pos1) <- src $$ sinkGet $ (,) <$> getWord32le <*> getWord32le let len = pos1 - pos0 return . Blob . BL.take (fromIntegral len) $ BL.drop (fromIntegral pos0) cluster -- | Returns URL of main page in ZIM. -- This URL can be used for redirecting to the actual page. getMainPageUrl :: RunZim h => h -> IO (Maybe Url) getMainPageUrl h = runZim h $ \hdl hdr -> case zimMainPage hdr of Nothing -> return Nothing Just i -> do de <- (hdl, hdr) `getDE` UrlIndex i return . Just $ mkNsUrl (zimDeNamespace de) (zimDeUrl de) class ZimGetContent k where -- | Get (MIME type, Content). Note that Content is lazy. getContent :: RunZim h => h -> k -> IO (Maybe (B.ByteString, BL.ByteString)) instance ZimGetContent (MimeList, ZimDirEnt) where getContent h (ml, de) = runZim h $ \hdl hdr -> do case zimDeType de of ZimRedirectEntry -> let u = UrlIndex . fromJust $ zimDeRedirectIndex de in (hdl, hdr) `getDE` u >>= ((hdl, hdr) `getContent`) ZimArticleEntry -> do let (Just c, Just b) = (zimDeClusterNumber de, zimDeBlobNumber de) content <- unBlob <$> (hdl, hdr) `getBlob` (ClusterNumber c, BlobNumber b) return $ Just (ml ! zimDeMimeType de, content) _ -> return $ Just (ml ! zimDeMimeType de, BL.empty) instance ZimGetContent ZimDirEnt where getContent h de = runZim h $ \hdl hdr -> do ml <- getMimeList (hdl, hdr) (hdl, hdr) `getContent` (ml, de) instance ZimGetContent (MimeList, Url) where getContent h (ml, url) = runZim h $ \hdl hdr -> do des <- (hdl, hdr) `searchDE` url case des of [] -> return Nothing ((_, de) : _) -> (hdl, hdr) `getContent` (ml, de) instance ZimGetContent Url where getContent h url = runZim h $ \hdl hdr -> do ml <- getMimeList (hdl, hdr) (hdl, hdr) `getContent` (ml, url) instance ZimGetContent (MimeList, Title) where getContent h (ml, title) = runZim h $ \hdl hdr -> do des <- (hdl, hdr) `searchDE` title case des of [] -> return Nothing ((_, de) : _) -> (hdl, hdr) `getContent` (ml, de) instance ZimGetContent Title where getContent h title = runZim h $ \hdl hdr -> do ml <- getMimeList (hdl, hdr) (hdl, hdr) `getContent` (ml, title) instance ZimGetContent (MimeList, UrlIndex) where getContent h (ml, u) = runZim h $ \hdl hdr -> do de <- (hdl, hdr) `getDE` u (hdl, hdr) `getContent` (ml, de) instance ZimGetContent UrlIndex where getContent h u = runZim h $ \hdl hdr -> do ml <- getMimeList (hdl, hdr) (hdl, hdr) `getContent` (ml, u) instance ZimGetContent (MimeList, TitleIndex) where getContent h (ml, t) = runZim h $ \hdl hdr -> do de <- (hdl, hdr) `getDE` t (hdl, hdr) `getContent` (ml, de) instance ZimGetContent TitleIndex where getContent h t = runZim h $ \hdl hdr -> do ml <- getMimeList (hdl, hdr) (hdl, hdr) `getContent` (ml, t) -- Binary Search implementation used for searching sorted URL and Title lists. binarySearch :: (Int -> IO (Ordering, a)) -> Int -> Int -> IO (Maybe a) binarySearch f low high = if high < low then return Nothing else do let mid = (low + high) `div` 2 (o, x) <- f mid case o of LT -> binarySearch f low (mid -1) GT -> binarySearch f (mid + 1) high EQ -> return $ Just x class ZimSearchDE k where -- | Search for a Directory Entry on a RunZim. -- When searching for a: -- -- [@Url@] Returns either 0 (not found) or 1 element. -- [@Title@] Returns either 0 (not found) or 1 element. -- [@TitlePrefix@] Returns either 0 (not found) or 2 elements corresponding to lower and upper bound of titles containing the prefix. -- searchDE :: RunZim h => h -> k -> IO [(Int, ZimDirEnt)] instance ZimSearchDE Url where searchDE h url = runZim h $ \hdl hdr -> do let f i = do de <- (hdl, hdr) `getDE` UrlIndex i let v = mkNsUrl (zimDeNamespace de) (zimDeUrl de) return (compare url v, (UrlIndex i, de)) res <- binarySearch f 0 (zimArticleCount hdr - 1) return $ maybe [] (\(UrlIndex i, r) -> [(i, r)]) res instance ZimSearchDE Title where searchDE h title = runZim h $ \hdl hdr -> do let f i = do de <- (hdl, hdr) `getDE` TitleIndex i let v = mkNsTitle (zimDeNamespace de) (zimDeTitle de) return (compare title v, (TitleIndex i, de)) res <- binarySearch f 0 (zimArticleCount hdr - 1) return $ maybe [] (\(TitleIndex i, r) -> [(i, r)]) res instance ZimSearchDE TitlePrefix where searchDE h (TitlePrefix pre) = runZim h $ \hdl hdr -> do let preLen = B8.length pre - 2 -- minus namespace prefix limit = zimArticleCount hdr - 1 -- extracts title to compare from Directory Entry mkT x = mkNsTitle (zimDeNamespace x) (B8.take preLen (zimDeTitle x)) g idx = (\x -> (x, mkT x)) <$> (hdl, hdr) `getDE` idx -- i has to be the entry just before prefix matches lowerBound i = do de <- (hdl, hdr) `getDE` TitleIndex i case compare (Title pre) (mkT de) of -- if prefix matches, we still return LT as we want to find the entry BEFORE. -- special case: if i = 0, then this is the lower bound. EQ -> if i == 0 then return (EQ, (TitleIndex i, de)) else return (LT, (TitleIndex i, de)) lgt -> do -- if succeeding entry has prefix, that is the lower bound. (de', Title v') <- g $ TitleIndex (i + 1) if pre `B8.isPrefixOf` v' then return (EQ, (TitleIndex $ i + 1, de')) else return (lgt, (TitleIndex i, de)) upperBound i = do de <- (hdl, hdr) `getDE` TitleIndex i case compare (Title pre) (mkT de) of EQ -> if i == limit then return (EQ, (TitleIndex i, de)) else return (GT, (TitleIndex i, de)) lgt -> do (de', Title v') <- g $ TitleIndex (i - 1) if pre `B8.isPrefixOf` v' then return (EQ, (TitleIndex $ i - 1, de')) else return (lgt, (TitleIndex i, de)) lb <- binarySearch lowerBound 0 limit case lb of Nothing -> return [] _ -> do ub <- binarySearch upperBound 0 limit let Just (TitleIndex lbi, lb') = lb Just (TitleIndex ubi, ub') = ub return [(lbi, lb'), (ubi, ub')]