-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A parser for the Web Archive (WARC) format -- -- A streaming parser for the Web Archive (WARC) format. @package warc @version 1.0.4 module Data.Warc.Header -- | A WARC header header :: Parser RecordHeader encodeHeader :: RecordHeader -> Builder data Version Version :: !Int -> Version [versionMajor, versionMinor] :: Version -> !Int warc0_16 :: Version data RecordHeader RecordHeader :: Version -> HashMap FieldName ByteString -> RecordHeader [_recWarcVersion] :: RecordHeader -> Version [_recHeaders] :: RecordHeader -> HashMap FieldName ByteString data WarcType WarcInfo :: WarcType Response :: WarcType Resource :: WarcType Request :: WarcType Metadata :: WarcType Revisit :: WarcType Conversion :: WarcType Continuation :: WarcType FutureType :: !Text -> WarcType newtype RecordId RecordId :: Uri -> RecordId data TruncationReason TruncLength :: TruncationReason TruncTime :: TruncationReason TruncDisconnect :: TruncationReason TruncUnspecified :: TruncationReason TruncOther :: !Text -> TruncationReason data Digest Digest :: !ByteString -> Digest [digestAlgorithm, digestHash] :: Digest -> !ByteString newtype Uri Uri :: ByteString -> Uri data Field a Field :: FieldName -> a -> Builder -> Parser a -> Field a [fieldName] :: Field a -> FieldName [encode] :: Field a -> a -> Builder [decode] :: Field a -> Parser a newtype FieldName FieldName :: Text -> FieldName [getFieldName] :: FieldName -> Text -- | A lens-y means of querying Fields. field :: Field a -> Traversal' RecordHeader a -- | Lookup the value of a field. Returns Nothing if the field is -- not present, Just (Left err) in the event of a parse error, -- and Just (Right v) on success. 
lookupField :: RecordHeader -> Field a -> Maybe (Either String a) addField :: Field a -> a -> RecordHeader -> RecordHeader mapField :: (a -> b) -> (b -> a) -> Field a -> Field b rawField :: FieldName -> Field ByteString warcRecordId :: Field RecordId contentLength :: Field Integer warcDate :: Field UTCTime warcType :: Field WarcType contentType :: Field ByteString warcConcurrentTo :: Field RecordId warcBlockDigest :: Field Digest warcPayloadDigest :: Field Digest warcIpAddress :: Field ByteString warcRefersTo :: Field Uri warcTargetUri :: Field Uri warcTruncated :: Field TruncationReason warcWarcinfoID :: Field RecordId warcFilename :: Field Text warcProfile :: Field Uri warcSegmentNumber :: Field Integer warcSegmentTotalLength :: Field Integer recWarcVersion :: Lens' RecordHeader Version recHeaders :: Lens' RecordHeader (HashMap FieldName ByteString) instance GHC.Show.Show Data.Warc.Header.RecordHeader instance GHC.Classes.Ord Data.Warc.Header.Digest instance GHC.Classes.Eq Data.Warc.Header.Digest instance GHC.Read.Read Data.Warc.Header.Digest instance GHC.Show.Show Data.Warc.Header.Digest instance GHC.Classes.Eq Data.Warc.Header.TruncationReason instance GHC.Classes.Ord Data.Warc.Header.TruncationReason instance GHC.Read.Read Data.Warc.Header.TruncationReason instance GHC.Show.Show Data.Warc.Header.TruncationReason instance GHC.Classes.Ord Data.Warc.Header.RecordId instance GHC.Classes.Eq Data.Warc.Header.RecordId instance GHC.Read.Read Data.Warc.Header.RecordId instance GHC.Show.Show Data.Warc.Header.RecordId instance GHC.Classes.Ord Data.Warc.Header.Uri instance GHC.Classes.Eq Data.Warc.Header.Uri instance GHC.Read.Read Data.Warc.Header.Uri instance GHC.Show.Show Data.Warc.Header.Uri instance GHC.Classes.Eq Data.Warc.Header.WarcType instance GHC.Classes.Ord Data.Warc.Header.WarcType instance GHC.Read.Read Data.Warc.Header.WarcType instance GHC.Show.Show Data.Warc.Header.WarcType instance Data.String.IsString Data.Warc.Header.FieldName instance GHC.Read.Read 
Data.Warc.Header.FieldName instance GHC.Show.Show Data.Warc.Header.FieldName instance GHC.Classes.Ord Data.Warc.Header.Version instance GHC.Classes.Eq Data.Warc.Header.Version instance GHC.Read.Read Data.Warc.Header.Version instance GHC.Show.Show Data.Warc.Header.Version instance Data.Hashable.Class.Hashable Data.Warc.Header.FieldName instance GHC.Classes.Eq Data.Warc.Header.FieldName instance GHC.Classes.Ord Data.Warc.Header.FieldName -- | WARC (or Web ARChive) is an archival file format widely used to -- distribute corpora of crawled web content (see, for instance, the -- Common Crawl corpus). A WARC file consists of a set of records, each -- of which describes a web request or response. -- -- This module provides a streaming parser and encoder for WARC archives -- for use with the pipes package. -- -- Here is a simple example which walks through the WARC file: -- --
-- {-# LANGUAGE RecordWildCards #-}
-- {-# LANGUAGE OverloadedStrings #-}
--
-- module Main where
--
-- import Control.Lens
-- import Control.Monad.IO.Class
-- import qualified Data.ByteString as B
-- import Data.Warc
-- import qualified Pipes as P
-- import Pipes.ByteString (fromHandle)
-- import System.IO
--
-- iterFunc :: Record IO b -> IO b
-- iterFunc Record {..} = do
--   case recHeader ^. recHeaders . at "Content-Type" of
--     Just ct -> liftIO $ putStrLn ("Content-Type: " ++ show ct)
--     Nothing -> return ()
--   r <-
--     liftIO $ P.runEffect $ P.for recContent $ \x -> do
--       liftIO $ putStrLn ("Got bytes: " ++ show (B.length x))
--       return ()
--   return r
--
-- main :: IO ()
-- main = do
--   withFile "example.warc" ReadMode $ \h -> do
--     _ <- iterRecords iterFunc (parseWarc (fromHandle h))
--     return ()
--
module Data.Warc
-- | A WARC archive.
--
-- This represents a sequence of records followed by whatever data was
-- left over from the parse.
type Warc m a = FreeT (Record m) m (Producer ByteString m a)
-- | A WARC record
--
-- This represents a single record of a WARC file, consisting of a set of
-- headers and a means of producing the record's body.
data Record m r
Record :: RecordHeader -> Producer ByteString m r -> Record m r
-- | the WARC headers
[recHeader] :: Record m r -> RecordHeader
-- | the body of the record
[recContent] :: Record m r -> Producer ByteString m r
-- | Parse a WARC archive.
--
-- Note that this function does not actually do any parsing itself; it
-- merely returns a Warc value which can then be run to parse
-- individual records.
parseWarc :: (Functor m, Monad m) => Producer ByteString m a -> Warc m a
-- | Iterate over the Records in a WARC archive
iterRecords :: forall m a. Monad m => (forall b. Record m b -> m b) -> Warc m a -> m (Producer ByteString m a)
produceRecords :: forall m o a. Monad m => (forall b. RecordHeader -> Producer ByteString m b -> Producer o m b) -> Warc m a -> Producer o m (Producer ByteString m a)
-- | Encode a Record in WARC format.
encodeRecord :: Monad m => Record m a -> Producer ByteString m a
instance GHC.Base.Monad m => GHC.Base.Functor (Data.Warc.Record m)