{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE GADTs #-}

-- | WARC (or Web ARCive) is a archival file format widely used to distribute
-- corpora of crawled web content (see, for instance the Common Crawl corpus). A
-- WARC file consists of a set of records, each of which describes a web request
-- or response.
--
-- This module provides a streaming parser and encoder for WARC archives for use
-- with the @pipes@ package.
--
-- Here is a simple example which walks throught the WARC file:
--
-- > {-# LANGUAGE RecordWildCards #-}
-- > {-# LANGUAGE OverloadedStrings #-}
-- > 
-- > module Main where
-- > 
-- > import Control.Lens
-- > import Control.Monad.IO.Class
-- > import qualified Data.ByteString as B
-- > import Data.Warc
-- > import qualified Pipes as P
-- > import Pipes.ByteString (fromHandle)
-- > import System.IO
-- > 
-- > iterFunc :: Record IO b -> IO b
-- > iterFunc Record {..} = do
-- >   case recHeader ^. recHeaders . at "Content-Type" of
-- >     Just ct -> liftIO $ putStrLn ("Content-Type: " ++ show ct)
-- >     Nothing -> return ()
-- >   r <-
-- >     liftIO $ P.runEffect $ P.for recContent $ \x -> do
-- >       liftIO $ putStrLn ("Got bytes: " ++ show (B.length x))
-- >       return ()
-- >   return r
-- > 
-- > main :: IO ()
-- > main = do
-- >   withFile "example.warc" ReadMode $ \h -> do
-- >     _ <- iterRecords iterFunc (parseWarc (fromHandle h))
-- >     return ()
--
module Data.Warc
    ( Warc(..)
    , Record(..)
      -- * Parsing
    , parseWarc
    , iterRecords
    , produceRecords
      -- * Encoding
    , encodeRecord
      -- * Headers
    , module Data.Warc.Header
    ) where

import Data.Char (ord)
import Pipes hiding (each)
import qualified Pipes.ByteString as PBS
import Control.Lens
import qualified Pipes.Attoparsec as PA
import qualified Data.ByteString as BS
import qualified Data.ByteString.Builder as BB
import Data.ByteString (ByteString)
import Control.Monad (join)
import Control.Monad.Trans.Free
import Control.Monad.Trans.State.Strict

import Data.Warc.Header


-- | A WARC record
--
-- This represents a single record of a WARC file, consisting of a set of
-- headers and a means of producing the record's body.
data Record m r = Record { Record m r -> RecordHeader
recHeader    :: RecordHeader
                           -- ^ the WARC headers
                         , Record m r -> Producer ByteString m r
recContent   :: Producer BS.ByteString m r
                           -- ^ the body of the record
                         }

instance Monad m => Functor (Record m) where
    fmap :: (a -> b) -> Record m a -> Record m b
fmap a -> b
f (Record RecordHeader
hdr Producer ByteString m a
r) = RecordHeader -> Producer ByteString m b -> Record m b
forall (m :: * -> *) r.
RecordHeader -> Producer ByteString m r -> Record m r
Record RecordHeader
hdr ((a -> b) -> Producer ByteString m a -> Producer ByteString m b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> b
f Producer ByteString m a
r)

-- | A WARC archive.
--
-- This represents a sequence of records followed by whatever data
-- was leftover from the parse.
type Warc m a = FreeT (Record m) m (Producer BS.ByteString m a)

-- | Parse a WARC archive.
--
-- Note that this function does not actually do any parsing itself;
-- it merely returns a 'Warc' value which can then be run to parse
-- individual records.
parseWarc :: (Functor m, Monad m)
          => Producer ByteString m a   -- ^ a producer of a stream of WARC content
          -> Warc m a                  -- ^ the parsed WARC archive
parseWarc :: Producer ByteString m a -> Warc m a
parseWarc = Producer ByteString m a -> Warc m a
forall (m :: * -> *) x.
Monad m =>
Producer ByteString m x
-> FreeT (Record m) m (Producer ByteString m x)
loop
  where
    loop :: Producer ByteString m x
-> FreeT (Record m) m (Producer ByteString m x)
loop Producer ByteString m x
upstream = m (FreeF
     (Record m)
     (Producer ByteString m x)
     (FreeT (Record m) m (Producer ByteString m x)))
-> FreeT (Record m) m (Producer ByteString m x)
forall (f :: * -> *) (m :: * -> *) a.
m (FreeF f a (FreeT f m a)) -> FreeT f m a
FreeT (m (FreeF
      (Record m)
      (Producer ByteString m x)
      (FreeT (Record m) m (Producer ByteString m x)))
 -> FreeT (Record m) m (Producer ByteString m x))
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
-> FreeT (Record m) m (Producer ByteString m x)
forall a b. (a -> b) -> a -> b
$ do
        (Maybe (Either ParsingError RecordHeader)
hdr, Producer ByteString m x
rest) <- StateT
  (Producer ByteString m x)
  m
  (Maybe (Either ParsingError RecordHeader))
-> Producer ByteString m x
-> m (Maybe (Either ParsingError RecordHeader),
      Producer ByteString m x)
forall s (m :: * -> *) a. StateT s m a -> s -> m (a, s)
runStateT (Parser ByteString RecordHeader
-> Parser ByteString m (Maybe (Either ParsingError RecordHeader))
forall (m :: * -> *) a b.
(Monad m, ParserInput a) =>
Parser a b -> Parser a m (Maybe (Either ParsingError b))
PA.parse Parser ByteString RecordHeader
header) Producer ByteString m x
upstream
        Maybe (Either ParsingError RecordHeader)
-> Producer ByteString m x
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
go Maybe (Either ParsingError RecordHeader)
hdr Producer ByteString m x
rest

    go :: Maybe (Either ParsingError RecordHeader)
-> Producer ByteString m x
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
go Maybe (Either ParsingError RecordHeader)
mhdr Producer ByteString m x
rest
      | Maybe (Either ParsingError RecordHeader)
Nothing <- Maybe (Either ParsingError RecordHeader)
mhdr             = FreeF
  (Record m)
  (Producer ByteString m x)
  (FreeT (Record m) m (Producer ByteString m x))
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
forall (m :: * -> *) a. Monad m => a -> m a
return (FreeF
   (Record m)
   (Producer ByteString m x)
   (FreeT (Record m) m (Producer ByteString m x))
 -> m (FreeF
         (Record m)
         (Producer ByteString m x)
         (FreeT (Record m) m (Producer ByteString m x))))
-> FreeF
     (Record m)
     (Producer ByteString m x)
     (FreeT (Record m) m (Producer ByteString m x))
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
forall a b. (a -> b) -> a -> b
$ Producer ByteString m x
-> FreeF
     (Record m)
     (Producer ByteString m x)
     (FreeT (Record m) m (Producer ByteString m x))
forall (f :: * -> *) a b. a -> FreeF f a b
Pure Producer ByteString m x
rest
      | Just (Left ParsingError
err) <- Maybe (Either ParsingError RecordHeader)
mhdr     = [Char]
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
forall a. HasCallStack => [Char] -> a
error ([Char]
 -> m (FreeF
         (Record m)
         (Producer ByteString m x)
         (FreeT (Record m) m (Producer ByteString m x))))
-> [Char]
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
forall a b. (a -> b) -> a -> b
$ ParsingError -> [Char]
forall a. Show a => a -> [Char]
show ParsingError
err
      | Just (Right RecordHeader
hdr) <- Maybe (Either ParsingError RecordHeader)
mhdr
      , Just (Right Integer
len) <- RecordHeader -> Field Integer -> Maybe (Either [Char] Integer)
forall a. RecordHeader -> Field a -> Maybe (Either [Char] a)
lookupField RecordHeader
hdr Field Integer
contentLength = do
            let produceBody :: Producer ByteString m x
-> Proxy X () () ByteString m (Producer ByteString m x)
produceBody = (Producer ByteString m x -> Producer ByteString m x)
-> Proxy X () () ByteString m (Producer ByteString m x)
-> Proxy X () () ByteString m (Producer ByteString m x)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Producer ByteString m x -> Producer ByteString m x
consumeWhitespace (Proxy X () () ByteString m (Producer ByteString m x)
 -> Proxy X () () ByteString m (Producer ByteString m x))
-> (Producer ByteString m x
    -> Proxy X () () ByteString m (Producer ByteString m x))
-> Producer ByteString m x
-> Proxy X () () ByteString m (Producer ByteString m x)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Getting
  (Proxy X () () ByteString m (Producer ByteString m x))
  (Producer ByteString m x)
  (Proxy X () () ByteString m (Producer ByteString m x))
-> Producer ByteString m x
-> Proxy X () () ByteString m (Producer ByteString m x)
forall s (m :: * -> *) a. MonadReader s m => Getting a s a -> m a
view (Integer
-> Lens'
     (Producer ByteString m x)
     (Proxy X () () ByteString m (Producer ByteString m x))
forall (m :: * -> *) n x.
(Monad m, Integral n) =>
n
-> Lens'
     (Producer ByteString m x)
     (Producer ByteString m (Producer ByteString m x))
PBS.splitAt Integer
len)
                consumeWhitespace :: Producer ByteString m x -> Producer ByteString m x
consumeWhitespace = (Word8 -> Bool)
-> Producer ByteString m x -> Producer ByteString m x
forall (m :: * -> *) r.
Monad m =>
(Word8 -> Bool)
-> Producer ByteString m r -> Producer ByteString m r
PBS.dropWhile Word8 -> Bool
isEOL
                isEOL :: Word8 -> Bool
isEOL Word8
c = Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Word8
ord8 Char
'\r' Bool -> Bool -> Bool
|| Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Word8
ord8 Char
'\n'
                ord8 :: Char -> Word8
ord8 = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> (Char -> Int) -> Char -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord
            FreeF
  (Record m)
  (Producer ByteString m x)
  (FreeT (Record m) m (Producer ByteString m x))
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
forall (m :: * -> *) a. Monad m => a -> m a
return (FreeF
   (Record m)
   (Producer ByteString m x)
   (FreeT (Record m) m (Producer ByteString m x))
 -> m (FreeF
         (Record m)
         (Producer ByteString m x)
         (FreeT (Record m) m (Producer ByteString m x))))
-> FreeF
     (Record m)
     (Producer ByteString m x)
     (FreeT (Record m) m (Producer ByteString m x))
-> m (FreeF
        (Record m)
        (Producer ByteString m x)
        (FreeT (Record m) m (Producer ByteString m x)))
forall a b. (a -> b) -> a -> b
$ Record m (FreeT (Record m) m (Producer ByteString m x))
-> FreeF
     (Record m)
     (Producer ByteString m x)
     (FreeT (Record m) m (Producer ByteString m x))
forall (f :: * -> *) a b. f b -> FreeF f a b
Free (Record m (FreeT (Record m) m (Producer ByteString m x))
 -> FreeF
      (Record m)
      (Producer ByteString m x)
      (FreeT (Record m) m (Producer ByteString m x)))
-> Record m (FreeT (Record m) m (Producer ByteString m x))
-> FreeF
     (Record m)
     (Producer ByteString m x)
     (FreeT (Record m) m (Producer ByteString m x))
forall a b. (a -> b) -> a -> b
$ RecordHeader
-> Producer
     ByteString m (FreeT (Record m) m (Producer ByteString m x))
-> Record m (FreeT (Record m) m (Producer ByteString m x))
forall (m :: * -> *) r.
RecordHeader -> Producer ByteString m r -> Record m r
Record RecordHeader
hdr (Producer
   ByteString m (FreeT (Record m) m (Producer ByteString m x))
 -> Record m (FreeT (Record m) m (Producer ByteString m x)))
-> Producer
     ByteString m (FreeT (Record m) m (Producer ByteString m x))
-> Record m (FreeT (Record m) m (Producer ByteString m x))
forall a b. (a -> b) -> a -> b
$ (Producer ByteString m x
 -> FreeT (Record m) m (Producer ByteString m x))
-> Proxy X () () ByteString m (Producer ByteString m x)
-> Producer
     ByteString m (FreeT (Record m) m (Producer ByteString m x))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Producer ByteString m x
-> FreeT (Record m) m (Producer ByteString m x)
loop (Proxy X () () ByteString m (Producer ByteString m x)
 -> Producer
      ByteString m (FreeT (Record m) m (Producer ByteString m x)))
-> Proxy X () () ByteString m (Producer ByteString m x)
-> Producer
     ByteString m (FreeT (Record m) m (Producer ByteString m x))
forall a b. (a -> b) -> a -> b
$ Producer ByteString m x
-> Proxy X () () ByteString m (Producer ByteString m x)
produceBody Producer ByteString m x
rest

-- | Iterate over the 'Record's in a WARC archive
iterRecords :: forall m a. Monad m
            => (forall b. Record m b -> m b)  -- ^ the action to run on each 'Record'
            -> Warc m a                       -- ^ the 'Warc' file
            -> m (Producer BS.ByteString m a) -- ^ returns any leftover data
iterRecords :: (forall b. Record m b -> m b)
-> Warc m a -> m (Producer ByteString m a)
iterRecords forall b. Record m b -> m b
f Warc m a
warc = (Record m (m (Producer ByteString m a))
 -> m (Producer ByteString m a))
-> Warc m a -> m (Producer ByteString m a)
forall (f :: * -> *) (m :: * -> *) a.
(Functor f, Monad m) =>
(f (m a) -> m a) -> FreeT f m a -> m a
iterT Record m (m (Producer ByteString m a))
-> m (Producer ByteString m a)
iter Warc m a
warc
  where
    iter :: Record m (m (Producer BS.ByteString m a))
         -> m (Producer BS.ByteString m a)
    iter :: Record m (m (Producer ByteString m a))
-> m (Producer ByteString m a)
iter Record m (m (Producer ByteString m a))
r = m (m (Producer ByteString m a)) -> m (Producer ByteString m a)
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (m (m (Producer ByteString m a)) -> m (Producer ByteString m a))
-> m (m (Producer ByteString m a)) -> m (Producer ByteString m a)
forall a b. (a -> b) -> a -> b
$ Record m (m (Producer ByteString m a))
-> m (m (Producer ByteString m a))
forall b. Record m b -> m b
f Record m (m (Producer ByteString m a))
r

produceRecords :: forall m o a. Monad m
               => (forall b. RecordHeader -> Producer BS.ByteString m b
                                          -> Producer o m b)
                  -- ^ consume the record producing some output
               -> Warc m a
                  -- ^ a WARC archive (see 'parseWarc')
               -> Producer o m (Producer BS.ByteString m a)
                  -- ^ returns any leftover data
produceRecords :: (forall b.
 RecordHeader -> Producer ByteString m b -> Producer o m b)
-> Warc m a -> Producer o m (Producer ByteString m a)
produceRecords forall b. RecordHeader -> Producer ByteString m b -> Producer o m b
f Warc m a
warc = (Record m (Producer o m (Producer ByteString m a))
 -> Producer o m (Producer ByteString m a))
-> Warc m a -> Producer o m (Producer ByteString m a)
forall (f :: * -> *) (m :: * -> *) (t :: (* -> *) -> * -> *) a.
(Functor f, Monad m, MonadTrans t, Monad (t m)) =>
(f (t m a) -> t m a) -> FreeT f m a -> t m a
iterTM Record m (Producer o m (Producer ByteString m a))
-> Producer o m (Producer ByteString m a)
iter Warc m a
warc
  where
    iter :: Record m (Producer o m (Producer BS.ByteString m a))
         -> Producer o m (Producer BS.ByteString m a)
    iter :: Record m (Producer o m (Producer ByteString m a))
-> Producer o m (Producer ByteString m a)
iter (Record RecordHeader
hdr Producer ByteString m (Producer o m (Producer ByteString m a))
body) = Proxy X () () o m (Producer o m (Producer ByteString m a))
-> Producer o m (Producer ByteString m a)
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (Proxy X () () o m (Producer o m (Producer ByteString m a))
 -> Producer o m (Producer ByteString m a))
-> Proxy X () () o m (Producer o m (Producer ByteString m a))
-> Producer o m (Producer ByteString m a)
forall a b. (a -> b) -> a -> b
$ RecordHeader
-> Producer ByteString m (Producer o m (Producer ByteString m a))
-> Proxy X () () o m (Producer o m (Producer ByteString m a))
forall b. RecordHeader -> Producer ByteString m b -> Producer o m b
f RecordHeader
hdr Producer ByteString m (Producer o m (Producer ByteString m a))
body

-- | Encode a 'Record' in WARC format.
encodeRecord :: Monad m => Record m a -> Producer BS.ByteString m a
encodeRecord :: Record m a -> Producer ByteString m a
encodeRecord (Record RecordHeader
hdr Producer ByteString m a
content) = do
    ByteString -> Producer' ByteString m ()
forall (m :: * -> *).
Monad m =>
ByteString -> Producer' ByteString m ()
PBS.fromLazy (ByteString -> Producer' ByteString m ())
-> ByteString -> Producer' ByteString m ()
forall a b. (a -> b) -> a -> b
$ Builder -> ByteString
BB.toLazyByteString (Builder -> ByteString) -> Builder -> ByteString
forall a b. (a -> b) -> a -> b
$ RecordHeader -> Builder
encodeHeader RecordHeader
hdr
    Producer ByteString m a
content