warc-1.0.5: A parser for the Web Archive (WARC) format
Safe HaskellNone
LanguageHaskell2010

Data.Warc

Description

WARC (or Web ARCive) is a archival file format widely used to distribute corpora of crawled web content (see, for instance the Common Crawl corpus). A WARC file consists of a set of records, each of which describes a web request or response.

This module provides a streaming parser and encoder for WARC archives for use with the pipes package.

Here is a simple example which walks throught the WARC file:

{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

module Main where

import Control.Lens
import Control.Monad.IO.Class
import qualified Data.ByteString as B
import Data.Warc
import qualified Pipes as P
import Pipes.ByteString (fromHandle)
import System.IO

iterFunc :: Record IO b -> IO b
iterFunc Record {..} = do
  case recHeader ^. recHeaders . at "Content-Type" of
    Just ct -> liftIO $ putStrLn ("Content-Type: " ++ show ct)
    Nothing -> return ()
  r <-
    liftIO $ P.runEffect $ P.for recContent $ \x -> do
      liftIO $ putStrLn ("Got bytes: " ++ show (B.length x))
      return ()
  return r

main :: IO ()
main = do
  withFile "example.warc" ReadMode $ \h -> do
    _ <- iterRecords iterFunc (parseWarc (fromHandle h))
    return ()
Synopsis

Documentation

type Warc m a = FreeT (Record m) m (Producer ByteString m a) Source #

A WARC archive.

This represents a sequence of records followed by whatever data was leftover from the parse.

data Record m r Source #

A WARC record

This represents a single record of a WARC file, consisting of a set of headers and a means of producing the record's body.

Constructors

Record 

Fields

Instances

Instances details
Monad m => Functor (Record m) Source # 
Instance details

Defined in Data.Warc

Methods

fmap :: (a -> b) -> Record m a -> Record m b #

(<$) :: a -> Record m b -> Record m a #

Parsing

parseWarc Source #

Arguments

:: (Functor m, Monad m) 
=> Producer ByteString m a

a producer of a stream of WARC content

-> Warc m a

the parsed WARC archive

Parse a WARC archive.

Note that this function does not actually do any parsing itself; it merely returns a Warc value which can then be run to parse individual records.

iterRecords Source #

Arguments

:: forall m a. Monad m 
=> (forall b. Record m b -> m b)

the action to run on each Record

-> Warc m a

the Warc file

-> m (Producer ByteString m a)

returns any leftover data

Iterate over the Records in a WARC archive

produceRecords Source #

Arguments

:: forall m o a. Monad m 
=> (forall b. RecordHeader -> Producer ByteString m b -> Producer o m b)

consume the record producing some output

-> Warc m a

a WARC archive (see parseWarc)

-> Producer o m (Producer ByteString m a)

returns any leftover data

Encoding

encodeRecord :: Monad m => Record m a -> Producer ByteString m a Source #

Encode a Record in WARC format.

Headers