module Codec.Archive.Zim.Parser (
getHeader
, getMimeList
, getDE
, getMainPageUrl
, getCluster
, getBlob
, getContent
, searchDE
, MimeList
, mkNsTitle
, mkNsTitlePrefix
, mkNsUrl
, RunZim
, ZimGetDE
, ZimSearchDE
, ZimGetContent
, ZimException(..)
, ZimHeader(..)
, ZimDirEntType(..)
, ZimDirEnt(..)
, UrlIndex(..)
, TitleIndex(..)
, ClusterNumber(..)
, BlobNumber(..)
, Cluster(..)
, Blob(..)
, Url(..)
, Title
, TitlePrefix
) where
import Prelude ()
import Prelude.Compat
import Codec.Compression.Lzma (decompress)
import Control.Exception (Exception, throw)
import Control.Monad (when)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy as BL
import Data.Char (chr)
import Data.Maybe (fromJust)
import Data.Typeable (Typeable)
import System.IO (Handle, IOMode (ReadMode), withBinaryFile)
import Data.Conduit (Sink, await, ($$), (=$))
import Data.Conduit.Binary (sourceHandleRange, sourceLbs)
import Data.Conduit.Serialization.Binary (conduitGet, sinkGet)
import Data.Array.IArray (Array, listArray, (!))
import Data.Binary.Get
import Numeric (showHex)
data ZimException
= ZimInvalidMagic
| ZimParseError String
| ZimIncompleteInput
| ZimInvalidIndex Int
deriving (Show, Typeable)
instance Exception ZimException
data ZimHeader = ZimHeader {
zimMagicNumber :: Int
, zimVersion :: Int
, zimUuid :: B.ByteString
, zimArticleCount :: Int
, zimClusterCount :: Int
, zimUrlPtrPos :: Integer
, zimTitlePtrPos :: Integer
, zimClusterPtrPos :: Integer
, zimMimeListPos :: Integer
, zimMainPage :: Maybe Int
, zimLayoutPage :: Maybe Int
, zimChecksumPos :: Integer
} deriving (Show, Eq)
data ZimDirEntType
= ZimArticleEntry
| ZimRedirectEntry
| ZimLinkTarget
| ZimDeletedEntry
deriving (Eq, Show)
data ZimDirEnt = ZimDirEnt {
zimDeType :: ZimDirEntType
, zimDeMimeType :: Int
, zimDeParameterLen :: Int
, zimDeNamespace :: Char
, zimDeRevision :: Int
, zimDeRedirectIndex :: Maybe Int
, zimDeClusterNumber :: Maybe Int
, zimDeBlobNumber :: Maybe Int
, zimDeUrl :: B8.ByteString
, zimDeTitle :: B8.ByteString
} deriving (Eq, Show)
type MimeList = Array Int B8.ByteString
newtype UrlIndex = UrlIndex Int deriving (Eq, Ord, Show)
newtype TitleIndex = TitleIndex Int deriving (Eq, Ord, Show)
newtype ClusterNumber = ClusterNumber Int deriving (Eq, Ord, Show)
newtype BlobNumber = BlobNumber Int deriving (Eq, Ord, Show)
newtype Url = Url B.ByteString deriving (Eq, Ord, Show)
mkNsUrl :: Char -> B.ByteString -> Url
mkNsUrl c s = Url $ c `B8.cons` '/' `B8.cons` s
newtype Title = Title B.ByteString deriving (Eq, Ord, Show)
mkNsTitle :: Char -> B.ByteString -> Title
mkNsTitle c s = Title $ c `B8.cons` '/' `B8.cons` s
newtype TitlePrefix = TitlePrefix B.ByteString deriving (Eq, Ord, Show)
mkNsTitlePrefix :: Char -> B.ByteString -> TitlePrefix
mkNsTitlePrefix c s = TitlePrefix $ c `B8.cons` '/' `B8.cons` s
newtype Cluster = Cluster {unCluster :: BL.ByteString}
newtype Blob = Blob {unBlob :: BL.ByteString}
parseZimHeader :: Get ZimHeader
parseZimHeader = do
magicNumber <- fromIntegral <$> getWord32le
when (magicNumber /= 72173914) $ throw ZimInvalidMagic
version <- fromIntegral <$> getWord32le
uuid <- getByteString 16
articleCount <- fromIntegral <$> getWord32le
clusterCount <- fromIntegral <$> getWord32le
urlPtrPos <- fromIntegral <$> getWord64le
titlePtrPos <- fromIntegral <$> getWord64le
clusterPtrPos <- fromIntegral <$> getWord64le
mimeListPos <- fromIntegral <$> getWord64le
mainPage <- fromIntegral <$> getWord32le
layoutPage <- fromIntegral <$> getWord32le
checksumPos <- fromIntegral <$> getWord64le
return $ ZimHeader magicNumber version uuid articleCount clusterCount
urlPtrPos titlePtrPos clusterPtrPos mimeListPos
(maybePage mainPage) (maybePage layoutPage) checksumPos
where
maybePage page
| page == 0xffffffff = Nothing
| otherwise = Just page
class RunZim h where
runZim :: h -> (Handle -> ZimHeader -> IO a) -> IO a
instance RunZim Handle where
runZim hdl f = do
hdr <- src $$ sinkGet parseZimHeader
f hdl hdr
where
(pos, len) = (Just 0, Just 80)
src = sourceHandleRange hdl pos len
instance RunZim (Handle, ZimHeader) where
runZim x f = uncurry f x
instance RunZim FilePath where
runZim fp f = withBinaryFile fp ReadMode $ \hdl -> runZim hdl f
getHeader :: RunZim h => h -> IO ZimHeader
getHeader h = runZim h $ \_ hdr -> return hdr
parseByteStringsNul :: Sink B8.ByteString IO [B8.ByteString]
parseByteStringsNul = conduitGet getLazyByteStringNul =$ loop id
where
loop :: ([B8.ByteString] -> [B8.ByteString]) -> Sink BL.ByteString IO [B8.ByteString]
loop front = await >>= maybe
(return $ front [])
(\x -> let bs = BL.toStrict x
in if B8.null $ BL.toStrict x
then return (front [])
else loop (front . (bs:))
)
getMimeList :: RunZim h => h -> IO MimeList
getMimeList h = runZim h $ \hdl hdr -> do
let (pos, len) = (Just $ zimMimeListPos hdr, Nothing)
src = sourceHandleRange hdl pos len
mimeList <- src $$ parseByteStringsNul
return $ listArray (0, length mimeList) mimeList
parseZimDirEnt :: Get ZimDirEnt
parseZimDirEnt = do
mimeType <- fromIntegral <$> getWord16le
parmLen <- fromIntegral <$> getWord8
namespace <- chr . fromIntegral <$> getWord8
revision <- fromIntegral <$> getWord32le
let deType = getDEType mimeType
(redirectIndex, clusterNumber, blobNumber) <- parseRedirectClusterBlob deType
url <- BL.toStrict <$> getLazyByteStringNul
title <- BL.toStrict <$> getLazyByteStringNul
let title'
| B.null title = url
| otherwise = title
return $ ZimDirEnt deType mimeType parmLen namespace revision
redirectIndex clusterNumber blobNumber url title'
where
getDEType mimeType = case mimeType of
0xffff -> ZimRedirectEntry
0xfffe -> ZimLinkTarget
0xfffd -> ZimDeletedEntry
_ -> ZimArticleEntry
offset = Just . fromIntegral
parseRedirectClusterBlob deType = case deType of
ZimArticleEntry -> (\x y -> (Nothing, offset x, offset y)) <$> getWord32le <*> getWord32le
ZimRedirectEntry -> (\x -> (offset x, Nothing, Nothing )) <$> getWord32le
ZimLinkTarget -> skip 8 >> return (Nothing, Nothing, Nothing)
ZimDeletedEntry -> skip 8 >> return (Nothing, Nothing, Nothing)
class ZimGetDE k where
getDE :: RunZim h => h -> k -> IO ZimDirEnt
instance ZimGetDE UrlIndex where
getDE h (UrlIndex i) = runZim h $ \hdl hdr -> do
let urlPtrPos = zimUrlPtrPos hdr + 8 * toInteger i
srcStart pos = sourceHandleRange hdl (Just $ fromIntegral pos) Nothing
when (i < 0 || i >= zimArticleCount hdr) . throw $ ZimInvalidIndex i
dePos <- srcStart urlPtrPos $$ sinkGet getWord64le
srcStart dePos $$ sinkGet parseZimDirEnt
instance ZimGetDE TitleIndex where
getDE h (TitleIndex i) = runZim h $ \hdl hdr -> do
let titlePtrPos = Just $ zimTitlePtrPos hdr + 4 * fromIntegral i
srcTitle = sourceHandleRange hdl titlePtrPos Nothing
when (i < 0 || i >= zimArticleCount hdr) . throw $ ZimInvalidIndex i
urlIndex <- srcTitle $$ sinkGet getWord32le
(hdl, hdr) `getDE` (UrlIndex $ fromIntegral urlIndex)
getCluster :: RunZim h => h -> ClusterNumber -> IO Cluster
getCluster h (ClusterNumber i) = runZim h $ \hdl hdr -> do
let limit = zimClusterCount hdr 1
when (i < 0 || i > limit) . throw $ ZimInvalidIndex i
let clusterPos = Just $ zimClusterPtrPos hdr + 8 * fromIntegral i
src = sourceHandleRange hdl clusterPos Nothing
(pos0, pos1) <- src $$ sinkGet $ (,) <$> getWord64le <*> getWord64le
let
len
| i == limit = fromIntegral (zimChecksumPos hdr) pos0
| otherwise = pos1 pos0
toI = Just . fromIntegral
srcCluster = sourceHandleRange hdl (toI pos0) (toI len)
bs <- srcCluster $$ sinkGet getRemainingLazyByteString
case BL.uncons bs of
Just (0, cluster) -> return $ Cluster cluster
Just (1, cluster) -> return $ Cluster cluster
Just (4, cluster) -> return . Cluster $ decompress cluster
Just (x, _) -> throw . ZimParseError $
"Cluster " ++ show i ++ " (offset: " ++ showHex pos0 "" ++ ", length: " ++ show len ++ ") compressed with unsupported type: " ++ show x
Nothing -> throw . ZimParseError $
"Insufficient bytes for cluster " ++ show i
getBlob :: RunZim h => h -> (ClusterNumber, BlobNumber) -> IO Blob
getBlob h (c, BlobNumber b) = do
Cluster cluster <- h `getCluster` c
let src = sourceLbs (BL.drop (4 * fromIntegral b) cluster)
(pos0, pos1) <- src $$ sinkGet $ (,) <$> getWord32le <*> getWord32le
let len = pos1 pos0
return . Blob $ getRange pos0 len cluster
where
getRange p l = BL.take (fromIntegral l) . BL.drop (fromIntegral p)
getMainPageUrl :: RunZim h => h -> IO (Maybe Url)
getMainPageUrl h = runZim h $ \hdl hdr ->
case zimMainPage hdr of
Nothing -> return Nothing
Just i -> do
de <- (hdl, hdr) `getDE` UrlIndex i
return . Just $ mkNsUrl (zimDeNamespace de) (zimDeUrl de)
class ZimGetContent k where
getContent :: RunZim h => h -> k -> IO (Maybe (B.ByteString, BL.ByteString))
instance ZimGetContent (MimeList, ZimDirEnt) where
getContent h (ml, de) = runZim h $ \hdl hdr -> do
case zimDeType de of
ZimRedirectEntry -> do
let u = UrlIndex . fromJust $ zimDeRedirectIndex de
(hdl, hdr) `getDE` u >>= ((hdl, hdr) `getContent`)
ZimArticleEntry -> do
let (Just c, Just b) = (zimDeClusterNumber de, zimDeBlobNumber de)
content <- unBlob <$> (hdl, hdr) `getBlob` (ClusterNumber c, BlobNumber b)
return $ Just (ml ! zimDeMimeType de, content)
_ -> return $ Just (ml ! zimDeMimeType de, BL.empty)
instance ZimGetContent ZimDirEnt where
getContent h de = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, de)
instance ZimGetContent (MimeList, Url) where
getContent h (ml, url) = runZim h $ \hdl hdr -> do
des <- (hdl, hdr) `searchDE` url
case des of
[] -> return Nothing
((_, de) : _) -> (hdl, hdr) `getContent` (ml, de)
instance ZimGetContent Url where
getContent h url = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, url)
instance ZimGetContent (MimeList, Title) where
getContent h (ml, title) = runZim h $ \hdl hdr -> do
des <- (hdl, hdr) `searchDE` title
case des of
[] -> return Nothing
((_, de) : _) -> (hdl, hdr) `getContent` (ml, de)
instance ZimGetContent Title where
getContent h title = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, title)
instance ZimGetContent (MimeList, UrlIndex) where
getContent h (ml, u) = runZim h $ \hdl hdr -> do
de <- (hdl, hdr) `getDE` u
(hdl, hdr) `getContent` (ml, de)
instance ZimGetContent UrlIndex where
getContent h u = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, u)
instance ZimGetContent (MimeList, TitleIndex) where
getContent h (ml, t) = runZim h $ \hdl hdr -> do
de <- (hdl, hdr) `getDE` t
(hdl, hdr) `getContent` (ml, de)
instance ZimGetContent TitleIndex where
getContent h t = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, t)
binarySearch :: (Int -> IO (Ordering, a)) -> Int -> Int -> IO (Maybe a)
binarySearch f low high
| high < low = return Nothing
| otherwise = do
let mid = (low + high) `div` 2
(o, x) <- f mid
case o of
LT -> binarySearch f low (mid 1)
GT -> binarySearch f (mid + 1) high
EQ -> return $ Just x
class ZimSearchDE k where
searchDE :: RunZim h => h -> k -> IO [(Int, ZimDirEnt)]
instance ZimSearchDE Url where
searchDE h url = runZim h $ \hdl hdr -> do
let
f i = do
de <- (hdl, hdr) `getDE` UrlIndex i
let v = mkNsUrl (zimDeNamespace de) (zimDeUrl de)
return (compare url v, (UrlIndex i, de))
res <- binarySearch f 0 (zimArticleCount hdr 1)
return $ maybe [] (\(UrlIndex i, r) -> [(i, r)]) res
instance ZimSearchDE Title where
searchDE h title = runZim h $ \hdl hdr -> do
let
f i = do
de <- (hdl, hdr) `getDE` TitleIndex i
let v = mkNsTitle (zimDeNamespace de) (zimDeTitle de)
return (compare title v, (TitleIndex i, de))
res <- binarySearch f 0 (zimArticleCount hdr 1)
return $ maybe [] (\(TitleIndex i, r) -> [(i, r)]) res
instance ZimSearchDE TitlePrefix where
searchDE h (TitlePrefix pre) = runZim h $ \hdl hdr -> do
let
preLen = B8.length pre 2
limit = zimArticleCount hdr 1
mkT x = mkNsTitle (zimDeNamespace x) (B8.take preLen (zimDeTitle x))
g idx = (\x -> (x, mkT x)) <$> (hdl, hdr) `getDE` idx
lowerBound i = do
de <- (hdl, hdr) `getDE` TitleIndex i
case compare (Title pre) (mkT de) of
EQ | i == 0 -> return (EQ, (TitleIndex i, de))
| otherwise -> return (LT, (TitleIndex i, de))
lgt -> do
(de', Title v') <- g $ TitleIndex (i + 1)
if pre `B8.isPrefixOf` v'
then return (EQ, (TitleIndex $ i + 1, de'))
else return (lgt, (TitleIndex i, de))
upperBound i = do
de <- (hdl, hdr) `getDE` TitleIndex i
case compare (Title pre) (mkT de) of
EQ | i == limit -> return (EQ, (TitleIndex i, de))
| otherwise -> return (GT, (TitleIndex i, de))
lgt -> do
(de', Title v') <- g $ TitleIndex (i 1)
if pre `B8.isPrefixOf` v'
then return (EQ, (TitleIndex $ i 1, de'))
else return (lgt, (TitleIndex i, de))
lb <- binarySearch lowerBound 0 limit
case lb of
Nothing -> return []
_ -> do
ub <- binarySearch upperBound 0 limit
let Just (TitleIndex lbi, lb') = lb
Just (TitleIndex ubi, ub') = ub
return [(lbi, lb'), (ubi, ub')]