src/Codec/Archive/Zim/Parser.hs

-- | Module      : Codec.Archive.Zim.Parser
-- Description : API for parsing ZIM files
-- Copyright   : (c) Robbin C.
-- License     : GPLv3
-- Maintainer  : Robbin C.
-- Stability   : unstable
-- Portability : portable
--
-- This is a library for parsing ZIM (<http://openzim.org>) files. ZIM files
-- contain offline web content (eg, Wikipedia) which can be browsed locally
-- without an Internet connection.
--
-- The high-level functions can be used if it is not a problem to re-open and close
-- the ZIM file on each invocation. For simple browsing on a local device, this
-- should suffice. This also works if the underlying ZIM file is changing.
--
-- The other functions can be used if the caller opts to have more control over
-- resource management.
--
-- Behind the scenes, conduit is used to read from files so memory usage should
-- be constant.
--
-- Below is a full example of a Scotty web server that serves a ZIM file
-- (specified on command line) on localhost port 3000:
--
-- > {-# LANGUAGE OverloadedStrings #-}
-- >
-- > import Control.Monad.IO.Class (liftIO)
-- > import Data.Text.Lazy (toStrict, fromStrict)
-- > import Data.Text.Encoding (decodeUtf8, encodeUtf8)
-- > import System.Environment (getArgs)
-- > import Network.HTTP.Types.Status (status404)
-- > import Web.Scotty
-- > import Codec.Archive.Zim.Parser (getZimMainPageUrl, getZimUrlContent)
-- >
-- > main :: IO ()
-- > main = do
-- >     [fp] <- getArgs
-- >     scotty 3000 $ do
-- >       get "/" (redirectToZimMainPage fp)
-- >       get (regex "^/(./.*)$") (serveZimUrl fp)
-- >       notFound $ text "Invalid URL!"
-- >
-- > redirectToZimMainPage :: FilePath -> ActionM ()
-- > redirectToZimMainPage fp = do
-- >     res <- liftIO $ getZimMainPageUrl fp
-- >     case res of
-- >       Nothing -> do
-- >         status status404
-- >         text "This ZIM file has no main page specified!"
-- >       Just url -> redirect . fromStrict $ decodeUtf8 url
-- >
-- > serveZimUrl :: FilePath -> ActionM ()
-- > serveZimUrl fp = do
-- >     url <- (encodeUtf8 . toStrict) <$> param "1"
-- >     res <- liftIO $ getZimUrlContent fp url
-- >     case res of
-- >       Nothing -> do
-- >         liftIO . putStrLn $ "Invalid URL: " ++ show url
-- >         status status404
-- >         text $ "Invalid URL!"
-- >       Just (mimeType, content) -> do
-- >         liftIO . putStrLn $ "Serving: " ++ show url
-- >         setHeader "Content-Type" (fromStrict $ decodeUtf8 mimeType)
-- >         raw content
--
-- Feedback and contributions are welcome on <http://github.com/robbinch/zim-parser>.


{-# LANGUAGE DeriveDataTypeable #-}
module Codec.Archive.Zim.Parser
       (
       -- * High-level Functions
       -- | The following high-level functions are sufficient to program a simple webserver that serves ZIM files (see example above).
         getZimMainPageUrl
       , getZimUrlContent
       -- * Searching
       , searchZimDirEntByUrl
       , searchZimDirEntByTitle
       , searchZimDirEntByTitlePrefix
       -- * Exceptions
       , ZimException(..)
       -- * ZIM Header
       , ZimHeader(..)
       , getZimHeader
       , getZimMimeList
       -- * ZIM Directory Entry
       , ZimDirEntType(..)
       , ZimDirEnt(..)
       , getZimDirEntByUrlIndex
       , getZimDirEntByTitleIndex
       -- * ZIM Content
       , getZimCluster
       , getZimBlob
       , getZimContentByUrlIndex
       -- * ZIM file format
       -- | Following is a short summary of the ZIM file format.
       -- The authoritative reference is at http://www.openzim.org/wiki/ZIM_file_format.
       --
       -- === 1. ZIM header
       -- This is an 80-byte header (see 'ZimHeader'). Among other things, it contains file offsets to the below.
       --
       -- === 2. List of MIME types
       -- This is a sequence of null-terminated strings (eg. @text\/html@, @text\/javascript@). The last string is zero length, so
       -- the end always consists of 2 consecutive null bytes.
       --
       -- === 3. List of URLs
       -- This is a sequence of 8-byte file offsets, each pointing to a directory entry. This list is sorted by the directory entries' URL.
       --
       -- 'getZimDirEntByUrlIndex' looks up this table to return a directory entry.
       --
       -- === 4. List of Titles
       -- This is a sequence of 4-byte indices, each pointing to a URL above (which in turn point to a directory entry).
       -- This list is sorted by the directory entries' Title.
       --
       -- 'getZimDirEntByTitleIndex' uses this table to return a directory entry.
       --
       -- === 5. Directory Entries
       -- This is a sequence of Directory Entries (see 'ZimDirEnt').
       -- The first 2 bytes determine the type of this entry, which in turn determines its length.
       -- Contents include:
       --
       -- ==== a. MIME type
       -- This 2-byte field means:
       --
       -- [@0xffff@] This directory entry is a 'ZimRedirectEntry'.
       -- [@0xfffe@] This directory entry is a 'ZimLinkTarget'.
       -- [@0xfffd@] This directory entry is a 'ZimDeletedEntry'.
       -- [@any other value@] This directory entry is a 'ZimArticleEntry' and this index into the MIME list from above determines its MIME type.
       --
       -- ==== b. Namespace
       -- This single character determines the directory entry's namespace. (eg. __A__ for articles, __I__ for images, etc.)
       -- The comprehensive list is at http://www.openzim.org/wiki/ZIM_file_format#Namespaces.
       --
       -- ==== c. Cluster and Blob number
       -- Only for 'ZimArticleEntry', this is the directory entry's Cluster and Blob number.
       -- The Cluster number is a 4-byte index into the list of Clusters below.
       -- The Blob number refers to a block inside the (decompressed) cluster.
       -- Together, they provide the content of this directory entry.
       --
       -- ==== d. URL and Title
       -- These 2 null-terminated strings represent the URL and Title of this directory entry respectively.
       -- If the Title is empty, it is taken to be the same as the URL.
       --
       -- === 6. List of Clusters
       -- This is a list of 8-byte file offsets, each pointing to a cluster in the file.
       -- The end of a cluster is also the start of the next cluster.
       -- Therefore, the length of a cluster is the difference between the adjacent offsets.
       -- For the last cluster, the end is the Checksum file offset, as the Checksum is always
       -- the last 16 bytes of a ZIM file.
       --
       -- ==== a. Compression Type
       -- The first byte of the cluster determines if it is uncompressed (eg. PNG image) or compressed with LZMA (eg. HTML).
       --
       -- [@0 or 1@] No compression
       -- [@4@] Compressed with LZMA
       --
       -- ==== b. List of Blobs
       -- This is a list of 4-byte offsets, each pointing inside this cluster.
       -- The end of a blob is also the start of the next blob.
       -- Therefore, the length of a blob is the difference between the adjacent offsets.
       -- The last offset points to the end of the data area so there is always one more offset than blobs.
       ) where

import Control.Applicative ((<$>), (<*>))
import Control.Exception (Exception, throw)
import Control.Monad (when)
import Data.Char (chr)
import Data.Maybe (fromJust)
import Data.Typeable (Typeable)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy as BL
import System.IO (Handle, IOMode(ReadMode), withBinaryFile)

import Data.Conduit (($$), (=$), await, Sink)
import Control.Monad.Trans.Resource (runResourceT)
import Data.Conduit.Binary (sourceHandleRange, sourceLbs, sinkLbs)
import Data.Conduit.Serialization.Binary (sinkGet, conduitGet)
import Data.Conduit.Lzma (decompress)

import Data.Array.IArray ((!), listArray, Array)
import Data.Binary.Get (Get, skip, getWord8, getWord16le, getWord32le, getWord64le, getByteString, getLazyByteStringNul, getRemainingLazyByteString)
import Numeric (showHex)

-- | Exceptions thrown by this module.
-- Other than the below, ErrorCall can be thrown by the LZMA library if there is a problem with decompression.
data ZimException = ZimInvalidMagic       -- ^ ZIM file has an invalid magic number (anything other than 72173914).
                  | ZimParseError String  -- ^ There is an error in parsing; the string describes the problem.
                  | ZimIncompleteInput    -- ^ There are insufficient bytes to parse.
                  | ZimInvalidIndex Int   -- ^ The given index (URL, title or cluster) is out of bounds for this ZIM file.
                  deriving (Show, Typeable)
instance Exception ZimException

-- | Parsed form of the 80-byte header found at the start of a ZIM file.
-- See http://www.openzim.org/wiki/ZIM_file_format#Header for more details.
data ZimHeader = ZimHeader
    {
    -- | Magic Number of file (somewhat superfluous as 'getZimHeader' will throw an exception if magic number is anything other than 72173914)
      zimMagicNumber   :: Int
    -- | Version of ZIM header
    , zimVersion       :: Int
    -- | UUID of file (16 raw bytes)
    , zimUuid          :: B.ByteString
    -- | Number of articles. Valid URL and Title indices are @0@ up to (but excluding) this value.
    , zimArticleCount  :: Int
    -- | Number of clusters. Valid Cluster numbers are @0@ up to (but excluding) this value.
    , zimClusterCount  :: Int
    -- | File offset of sorted URL pointers
    , zimUrlPtrPos     :: Integer
    -- | File offset of sorted Title pointers
    , zimTitlePtrPos   :: Integer
    -- | File offset of Cluster pointers
    , zimClusterPtrPos :: Integer
    -- | File offset of MIME list
    , zimMimeListPos   :: Integer
    -- | URL index of main page ('Nothing' if the header field is @0xffffffff@)
    , zimMainPage      :: Maybe Int
    -- | URL index of layout page ('Nothing' if the header field is @0xffffffff@)
    , zimLayoutPage    :: Maybe Int
    -- | File offset of MD5 checksum (also used as the end of the last cluster)
    , zimChecksumPos   :: Integer
    } deriving (Show, Eq)

-- | There are 4 types of directory entries. Most entries in a ZIM file are
-- usually 'ZimArticleEntry' or 'ZimRedirectEntry'.
-- The type is derived from the 2-byte MIME type field of the directory entry.
data ZimDirEntType = ZimArticleEntry   -- ^ MIME type field is any value other than the three below.
                   | ZimRedirectEntry  -- ^ MIME type field is @0xffff@.
                   | ZimLinkTarget     -- ^ MIME type field is @0xfffe@.
                   | ZimDeletedEntry   -- ^ MIME type field is @0xfffd@.
                   deriving (Eq, Show)

-- | A parsed Directory Entry.
-- See http://www.openzim.org/wiki/ZIM_file_format#Directory_Entries for more details.
data ZimDirEnt = ZimDirEnt
    {
    -- | Type of this Directory Entry
      zimDeType          :: ZimDirEntType
    -- | Raw value of the 2-byte MIME type field. For a 'ZimArticleEntry' this is an
    -- index into the MIME list given by 'getZimMimeList'; for the other entry types
    -- it is the marker value (@0xffff@, @0xfffe@ or @0xfffd@) and is not a valid index.
    , zimDeMimeType      :: Int
    -- | Parameter Length (the parameter data itself is not exposed, see the commented-out field below)
    , zimDeParameterLen  :: Int
    -- | Namespace
    , zimDeNamespace     :: Char
    -- | Revision
    , zimDeRevision      :: Int
    -- | Redirect Index (only applicable for 'ZimRedirectEntry', 'Nothing' otherwise)
    , zimDeRedirectIndex :: Maybe Int
    -- | Content is stored in this Cluster Number (only applicable for 'ZimArticleEntry', 'Nothing' otherwise)
    , zimDeClusterNumber :: Maybe Int
    -- | Content is stored in this Blob Number (only applicable for 'ZimArticleEntry', 'Nothing' otherwise)
    , zimDeBlobNumber    :: Maybe Int
    -- | URL (without the namespace prefix)
    , zimDeUrl           :: B8.ByteString
    -- | Title (equal to the URL if the stored title is empty)
    , zimDeTitle         :: B8.ByteString
    -- , zimDeParameter     :: BL.ByteString -- unused
    } deriving (Eq, Show)

-- | Reads and parses the 80-byte ZIM Header at the start of a file handle.
-- Most functions in this module take the resulting header as an argument.
-- The same header can be shared by several file handles opened on the same
-- ZIM file (eg. to call functions in parallel), but it must be parsed again
-- if the underlying ZIM file has changed.
--
-- Throws 'ZimInvalidMagic' if the magic number is not the expected value.
getZimHeader :: Handle        -- ^ Handle to ZIM file (eg. previously returned from 'withBinaryFile')
             -> IO ZimHeader  -- ^ Returns ZIM Header
getZimHeader hdl =
    sourceHandleRange hdl (Just 0) (Just 80) $$ sinkGet parseZimHeader

-- Binary parser for the ZIM header. All integers are little-endian.
-- Throws ZimInvalidMagic as soon as the magic number is found to be wrong.
parseZimHeader :: Get ZimHeader
parseZimHeader = do
    magic <- word32
    when (magic /= 72173914) $ throw ZimInvalidMagic
    ver        <- word32
    uuid       <- getByteString 16
    nArticles  <- word32
    nClusters  <- word32
    urlPos     <- word64
    titlePos   <- word64
    clusterPos <- word64
    mimePos    <- word64
    mainIdx    <- word32
    layoutIdx  <- word32
    sumPos     <- word64
    return ZimHeader
        { zimMagicNumber   = magic
        , zimVersion       = ver
        , zimUuid          = uuid
        , zimArticleCount  = nArticles
        , zimClusterCount  = nClusters
        , zimUrlPtrPos     = urlPos
        , zimTitlePtrPos   = titlePos
        , zimClusterPtrPos = clusterPos
        , zimMimeListPos   = mimePos
        , zimMainPage      = optionalIndex mainIdx
        , zimLayoutPage    = optionalIndex layoutIdx
        , zimChecksumPos   = sumPos
        }
  where
    word32 :: Get Int
    word32 = fromIntegral <$> getWord32le

    word64 :: Get Integer
    word64 = fromIntegral <$> getWord64le

    -- 0xffffffff marks an absent main/layout page
    optionalIndex :: Int -> Maybe Int
    optionalIndex x
      | x == 0xffffffff = Nothing
      | otherwise       = Just x

-- | Parses MIME List from a ZIM header and a file handle.
--
-- The list is read starting at 'zimMimeListPos' and ends at the first
-- zero-length string. The returned array is indexed from 0, so an array
-- holding @n@ MIME types has bounds @(0, n - 1)@ (and is empty if the
-- file contains no MIME types).
getZimMimeList :: ZimHeader                     -- ^ ZIM header
               -> Handle                        -- ^ Handle to ZIM file
               -> IO (Array Int B8.ByteString)  -- ^ Returns array of MIME types
getZimMimeList hdr hdl = do
    mimeList <- src $$ parseByteStringsNul
    -- Array bounds are inclusive: n elements occupy indices 0 .. n - 1.
    -- (An upper bound of n would leave an undefined trailing element.)
    return $ listArray (0, length mimeList - 1) mimeList
  where (pos, len) = (Just $ zimMimeListPos hdr, Nothing)
        src        = sourceHandleRange hdl pos len

-- Collects a sequence of null-terminated byte strings into a list.
-- Collection stops at the first zero-length string (the block always ends
-- with 2 null bytes) or when the input runs out, whichever comes first.
parseByteStringsNul :: Sink B8.ByteString IO [B8.ByteString]
parseByteStringsNul = conduitGet getLazyByteStringNul =$ collect []
  where -- entries are accumulated in reverse and flipped once at the end
        collect :: [B8.ByteString] -> Sink BL.ByteString IO [B8.ByteString]
        collect acc = do
            next <- await
            case BL.toStrict <$> next of
              Just bs | not (B8.null bs) -> collect (bs : acc)
              _                          -> return (reverse acc)

-- | Looks up the URL pointer list at the given index and parses the
-- Directory Entry it points to.
--
-- Throws 'ZimInvalidIndex' if the index is negative or not less than
-- 'zimArticleCount'.
getZimDirEntByUrlIndex :: ZimHeader     -- ^ ZIM header
                       -> Handle        -- ^ Handle to ZIM file
                       -> Int           -- ^ URL index
                       -> IO ZimDirEnt  -- ^ Returns a Directory Entry
getZimDirEntByUrlIndex hdr hdl i = do
    when (i < 0 || i >= zimArticleCount hdr) . throw $ ZimInvalidIndex i
    -- each URL pointer is an 8-byte file offset of a directory entry
    entryOffset <- ptrSrc $$ sinkGet getWord64le
    sourceHandleRange hdl (Just $ fromIntegral entryOffset) Nothing $$ sinkGet parseZimDirEnt
  where ptrSrc = sourceHandleRange hdl (Just $ zimUrlPtrPos hdr + 8 * fromIntegral i) Nothing

-- Binary parser for a single Directory Entry. All integers are little-endian.
parseZimDirEnt :: Get ZimDirEnt
parseZimDirEnt = do
    mime <- fromIntegral       <$> getWord16le
    plen <- fromIntegral       <$> getWord8
    ns   <- chr . fromIntegral <$> getWord8
    rev  <- fromIntegral       <$> getWord32le

    -- layout of the next fields depends on the entry type
    let kind = classify mime
    (redirect, cluster, blob) <- locator kind

    url      <- nulString
    rawTitle <- nulString
    return ZimDirEnt
        { zimDeType          = kind
        , zimDeMimeType      = mime
        , zimDeParameterLen  = plen
        , zimDeNamespace     = ns
        , zimDeRevision      = rev
        , zimDeRedirectIndex = redirect
        , zimDeClusterNumber = cluster
        , zimDeBlobNumber    = blob
        , zimDeUrl           = url
        -- specs: title is same as url if title is empty
        , zimDeTitle         = if B.null rawTitle then url else rawTitle
        }
  where
    -- the MIME type field doubles as the entry type marker
    classify :: Int -> ZimDirEntType
    classify 0xffff = ZimRedirectEntry
    classify 0xfffe = ZimLinkTarget
    classify 0xfffd = ZimDeletedEntry
    classify _      = ZimArticleEntry

    -- yields (redirect index, cluster number, blob number)
    locator :: ZimDirEntType -> Get (Maybe Int, Maybe Int, Maybe Int)
    locator ZimArticleEntry  = do
        c <- getWord32le
        b <- getWord32le
        return (Nothing, Just (fromIntegral c), Just (fromIntegral b))
    locator ZimRedirectEntry = do
        r <- getWord32le
        return (Just (fromIntegral r), Nothing, Nothing)
    locator ZimLinkTarget    = skip 8 >> return (Nothing, Nothing, Nothing)
    locator ZimDeletedEntry  = skip 8 >> return (Nothing, Nothing, Nothing)

    nulString :: Get B.ByteString
    nulString = BL.toStrict <$> getLazyByteStringNul

-- | Looks up the Title pointer list at the given index to obtain a URL
-- index, then returns the Directory Entry for that URL index.
--
-- Throws 'ZimInvalidIndex' if the Title index (or the URL index stored in
-- the file) is out of bounds.
getZimDirEntByTitleIndex :: ZimHeader     -- ^ ZIM header
                         -> Handle        -- ^ Handle to ZIM file
                         -> Int           -- ^ Title index
                         -> IO ZimDirEnt  -- ^ Returns a Directory Entry
getZimDirEntByTitleIndex hdr hdl i = do
    when (i < 0 || i >= zimArticleCount hdr) . throw $ ZimInvalidIndex i
    -- each Title pointer is a 4-byte URL index
    let offset = zimTitlePtrPos hdr + 4 * fromIntegral i
    urlIdx <- sourceHandleRange hdl (Just offset) Nothing $$ sinkGet getWord32le
    getZimDirEntByUrlIndex hdr hdl $ fromIntegral urlIdx

-- | Reads the Cluster with the given Cluster number and returns its
-- contents (decompressed if necessary, without the leading compression byte).
--
-- Throws 'ZimInvalidIndex' if the Cluster number is out of bounds and
-- 'ZimParseError' if the cluster is empty or uses an unsupported compression type.
-- This can also throw ErrorCall if there is an error during decompression.
getZimCluster :: ZimHeader         -- ^ ZIM header
              -> Handle            -- ^ Handle to ZIM file
              -> Int               -- ^ Cluster number
              -> IO BL.ByteString  -- ^ Returns a lazy bytestring containing cluster
getZimCluster hdr hdl i = do
    when (i < 0 || i > lastIdx) . throw $ ZimInvalidIndex i
    -- this cluster's offset, followed by the next cluster's offset
    (start, next) <- ptrSrc $$ sinkGet ((,) <$> getWord64le <*> getWord64le)
    -- the last cluster ends at the checksum rather than at the next cluster
    let size | i == lastIdx = fromIntegral (zimChecksumPos hdr) - start
             | otherwise    = next - start
        bodySrc = sourceHandleRange hdl (Just $ fromIntegral start) (Just $ fromIntegral size)
    raw <- bodySrc $$ sinkGet getRemainingLazyByteString

    -- first byte is the compression type
    case BL.uncons raw of
      Nothing -> throw . ZimParseError $
        "Insufficient bytes for cluster " ++ show i
      Just (tag, body)
        | tag == 0 || tag == 1 -> return body
        | tag == 4             -> runResourceT $ sourceLbs body $$ decompress Nothing =$ sinkLbs
        | otherwise            -> throw . ZimParseError $
            "Cluster " ++ show i ++
            " (offset: " ++ showHex start "" ++ ", length: " ++ show size ++
            ") compressed with unsupported type: " ++ show tag
  where lastIdx = zimClusterCount hdr - 1
        ptrSrc  = sourceHandleRange hdl (Just $ zimClusterPtrPos hdr + 8 * fromIntegral i) Nothing

-- | Extracts a single Blob from the (decompressed) Cluster with the given
-- Cluster number. The blob's extent is taken from two adjacent 4-byte
-- offsets at the start of the cluster.
getZimBlob :: ZimHeader         -- ^ ZIM header
           -> Handle            -- ^ Handle to ZIM file
           -> Int               -- ^ Cluster Number
           -> Int               -- ^ Blob Number
           -> IO BL.ByteString  -- ^ Returns a lazy bytestring containing blob
getZimBlob hdr hdl c b = do
    cluster <- getZimCluster hdr hdl c
    -- offset b is where the blob starts, offset b + 1 is where it ends
    let offsetTable = BL.drop (4 * fromIntegral b) cluster
    (start, end) <- sourceLbs offsetTable $$ sinkGet ((,) <$> getWord32le <*> getWord32le)
    let size = fromIntegral (end - start)
    return . BL.take size . BL.drop (fromIntegral start) $ cluster

-- | Returns content given URL index. Redirects are handled automatically.
--
-- * For a 'ZimArticleEntry', the content of its blob is returned.
-- * For a 'ZimRedirectEntry', the redirect is followed.
-- * For any other entry type, an empty bytestring is returned.
--
-- Throws 'ZimParseError' if the redirects form a loop, or if a directory
-- entry lacks the fields required by its type.
-- Throws 'ZimInvalidIndex' if an index is out of bounds.
getZimContentByUrlIndex :: ZimHeader         -- ^ ZIM header
                        -> Handle            -- ^ Handle to ZIM file
                        -> Int               -- ^ URL index
                        -> IO BL.ByteString  -- ^ Returns a lazy bytestring containing content
getZimContentByUrlIndex hdr hdl i = follow (zimArticleCount hdr) i
  where
    -- A redirect chain without a loop visits each entry at most once, so it
    -- can never need more hops than there are entries in the file.
    follow :: Int -> Int -> IO BL.ByteString
    follow hopsLeft idx = do
        de <- getZimDirEntByUrlIndex hdr hdl idx
        case (zimDeType de, zimDeRedirectIndex de, zimDeClusterNumber de, zimDeBlobNumber de) of
          (ZimRedirectEntry, Just r, _, _)
            | hopsLeft <= 0 -> throw . ZimParseError $
                "Redirect loop detected at URL index " ++ show idx
            | otherwise     -> follow (hopsLeft - 1) r
          (ZimRedirectEntry, Nothing, _, _) -> throw . ZimParseError $
            "Redirect entry at URL index " ++ show idx ++ " has no redirect index"
          (ZimArticleEntry, _, Just c, Just b) -> getZimBlob hdr hdl c b
          (ZimArticleEntry, _, _, _) -> throw . ZimParseError $
            "Article entry at URL index " ++ show idx ++ " has no cluster or blob number"
          _ -> return BL.empty

-- Binary search over the inclusive index range [low, high], used for
-- searching the sorted URL and Title lists.
-- The probe function is given an index and returns how the search target
-- compares to the element at that index (LT: target is before it,
-- GT: target is after it, EQ: found) together with a result value.
-- Returns Nothing if the range is exhausted without the probe returning EQ.
binarySearch :: (Int -> IO (Ordering, a)) -> Int -> Int -> IO (Maybe a)
binarySearch f low high
    | high < low = return Nothing
    | otherwise  = do
        (verdict, found) <- f mid
        case verdict of
          EQ -> return (Just found)
          LT -> binarySearch f low (mid - 1)
          GT -> binarySearch f (mid + 1) high
  where mid = (low + high) `div` 2

-- | Binary-searches the sorted URL list for a Directory Entry with the given URL.
-- URL must be prefixed with Namespace (eg. "A\/Blue.html" or "I\/favicon.png").
searchZimDirEntByUrl :: ZimHeader                    -- ^ ZIM header
                     -> Handle                       -- ^ Handle to ZIM file
                     -> B.ByteString                 -- ^ URL to search for
                     -> IO (Maybe (Int, ZimDirEnt))  -- ^ Returns (URL Index, Directory Entry) if found.
searchZimDirEntByUrl hdr hdl url = binarySearch probe 0 lastIndex
  where lastIndex = zimArticleCount hdr - 1
        probe idx = do
          entry <- getZimDirEntByUrlIndex hdr hdl idx
          -- stored URLs lack the namespace, so prepend it before comparing
          let key = B8.cons (zimDeNamespace entry) (B8.cons '/' (zimDeUrl entry))
          return (compare url key, (idx, entry))

-- | Binary-searches the sorted Title list for a Directory Entry with the
-- given Namespace and Title.
searchZimDirEntByTitle :: ZimHeader                   -- ^ ZIM header
                       -> Handle                      -- ^ Handle to ZIM file
                       -> Char                        -- ^ Namespace to search for
                       -> B.ByteString                -- ^ Title to search for
                       -> IO (Maybe (Int, ZimDirEnt)) -- ^ Returns (Title Index, Directory Entry) if found
searchZimDirEntByTitle hdr hdl n title = binarySearch probe 0 lastIndex
  where lastIndex = zimArticleCount hdr - 1
        -- both sides of the comparison are of the form "<namespace>/<title>"
        qualify ns t = B8.cons ns (B8.cons '/' t)
        wanted = qualify n title
        probe idx = do
          entry <- getZimDirEntByTitleIndex hdr hdl idx
          let key = qualify (zimDeNamespace entry) (zimDeTitle entry)
          return (compare wanted key, (idx, entry))

-- | Search for lower and upper bounds of Title indices that contains prefix in their title.
-- Eg, if title list comprises \[ \"A\", \"Ba\", \"Bb\", \"Bc\", \"C\" \] prefix
-- search for \"B\" will return bounds corresponding to (\"Ba\", \"Bc\").
--
-- Returns 'Nothing' if no title in the given namespace starts with the prefix
-- (including when the prefix sorts before the first or after the last entry).
searchZimDirEntByTitlePrefix :: ZimHeader     -- ^ ZIM header
                             -> Handle        -- ^ Handle to ZIM file
                             -> Char          -- ^ Namespace
                             -> B.ByteString  -- ^ Title Prefix
                             -> IO (Maybe ((Int, ZimDirEnt), (Int, ZimDirEnt)))  -- ^ Returns ((Lower Title Index, Lower Directory Entry), (Upper Title Index, Upper Directory Entry)) if found.
searchZimDirEntByTitlePrefix hdr hdl n pre = do
    lb <- binarySearch lowerBound 0 limit
    case lb of
      Nothing    -> return Nothing
      Just lower -> do
        ub <- binarySearch upperBound 0 limit
        -- pair up the bounds without assuming the second search succeeded
        return $ (\upper -> (lower, upper)) <$> ub
  where pre' = n `B8.cons` '/' `B8.cons` pre
        preLen = B8.length pre
        limit = zimArticleCount hdr - 1
        -- extracts title to compare from Directory Entry
        mkT x = zimDeNamespace x `B8.cons` '/' `B8.cons` B8.take preLen (zimDeTitle x)
        g idx = (\x -> (x, mkT x)) <$> getZimDirEntByTitleIndex hdr hdl idx
        -- i has to be the entry just before prefix matches
        lowerBound i = do
          de <- getZimDirEntByTitleIndex hdr hdl i
          case compare pre' (mkT de) of
            -- if prefix matches, we still return LT as we want to find the entry BEFORE.
            -- special case: if i = 0, then this is the lower bound.
            EQ  -> if i == 0
                      then return (EQ, (i, de))
                      else return (LT, (i, de))
            lgt
              -- there is no succeeding entry to inspect past the last index
              | i >= limit -> return (lgt, (i, de))
              | otherwise  -> do
                  -- if succeeding entry has prefix, that is the lower bound.
                  (de', v') <- g (i + 1)
                  if pre' `B8.isPrefixOf` v'
                     then return (EQ, (i + 1, de'))
                     else return (lgt, (i, de))
        -- mirror image of lowerBound: i has to be the entry just after prefix matches
        upperBound i = do
          de <- getZimDirEntByTitleIndex hdr hdl i
          case compare pre' (mkT de) of
            EQ  -> if i == limit
                      then return (EQ, (i, de))
                      else return (GT, (i, de))
            lgt
              -- there is no preceding entry to inspect before the first index
              | i <= 0    -> return (lgt, (i, de))
              | otherwise -> do
                  (de', v') <- g (i - 1)
                  if pre' `B8.isPrefixOf` v'
                     then return (EQ, (i - 1, de'))
                     else return (lgt, (i, de))

-- | Opens the ZIM file and returns the URL of its main page, prefixed
-- with its Namespace (eg. "A\/index.html").
-- This URL can be used for redirecting to the actual page.
-- Returns 'Nothing' if the ZIM header does not specify a main page.
getZimMainPageUrl :: FilePath                 -- ^ Path to ZIM file
                  -> IO (Maybe B.ByteString)  -- ^ Returns URL if found
getZimMainPageUrl fp =
    withBinaryFile fp ReadMode $ \hdl -> do
        hdr <- getZimHeader hdl
        maybe (return Nothing)
              (fmap (Just . qualifiedUrl) . getZimDirEntByUrlIndex hdr hdl)
              (zimMainPage hdr)
  where qualifiedUrl de = B8.cons (zimDeNamespace de) (B8.cons '/' (zimDeUrl de))

-- | Returns (MIME type, content) of URL, ready to be served via HTTP.
-- Note that MIME type is a strict bytestring while Content is lazy.
--
-- Redirects are followed, and both the MIME type and the content are taken
-- from the article that the URL finally resolves to.
-- Returns 'Nothing' if the URL is not found, or if it does not resolve to an
-- article (eg. it is a link target, a deleted entry or a redirect loop).
getZimUrlContent :: FilePath                                  -- ^ Path to ZIM file
                 -> B.ByteString                              -- ^ URL
                 -> IO (Maybe (B.ByteString, BL.ByteString))  -- ^ Returns (MIME type, content) if found
getZimUrlContent fp url =
    withBinaryFile fp ReadMode $ \hdl -> do
        hdr <- getZimHeader hdl
        res <- searchZimDirEntByUrl hdr hdl url
        case res of
            Nothing      -> return Nothing
            Just (_, de) -> do
              target <- resolve hdr hdl (zimArticleCount hdr) de
              case target of
                Nothing -> return Nothing
                Just (mimeIndex, c, b) -> do
                  mimeList <- getZimMimeList hdr hdl
                  content  <- getZimBlob hdr hdl c b
                  return $ Just (mimeList ! mimeIndex, content)
  where
    -- Follows redirects until an article is reached, yielding its
    -- (MIME index, cluster number, blob number). Only article entries carry
    -- a usable MIME index; the other entry types store a marker value there.
    -- The hop budget stops redirect loops: a loop-free chain cannot be longer
    -- than the number of entries in the file.
    resolve :: ZimHeader -> Handle -> Int -> ZimDirEnt -> IO (Maybe (Int, Int, Int))
    resolve hdr hdl hopsLeft de =
        case (zimDeType de, zimDeRedirectIndex de, zimDeClusterNumber de, zimDeBlobNumber de) of
          (ZimRedirectEntry, Just r, _, _)
            | hopsLeft > 0 ->
                getZimDirEntByUrlIndex hdr hdl r >>= resolve hdr hdl (hopsLeft - 1)
          (ZimArticleEntry, _, Just c, Just b) ->
            return $ Just (zimDeMimeType de, c, b)
          _ -> return Nothing