-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | A parser for the Web Archive (WARC) format
--
-- A streaming parser for the Web Archive (WARC) format.
@package warc
@version 1.0.3
module Data.Warc.Header
-- | A 'Parser' that yields the 'RecordHeader' of a WARC record.
header :: Parser RecordHeader
-- | Render a 'RecordHeader' as a 'Builder'; the encoding counterpart of
-- 'header'.
encodeHeader :: RecordHeader -> Builder
-- | A version number with a major and a minor component, as stored in the
-- version slot of a 'RecordHeader'.
data Version
-- | Build a 'Version' from its major and minor components, in that order.
-- The constructor takes one strict 'Int' per record field.
Version :: !Int -> !Int -> Version
-- | The major and minor components of the version, respectively.
[versionMajor, versionMinor] :: Version -> !Int
-- | A predefined 'Version' value; judging by the name, presumably WARC
-- version 0.16 (this listing does not show its components - confirm).
warc0_16 :: Version
-- | The header of a WARC record: a 'Version' together with a map from
-- field names to field values.
data RecordHeader
RecordHeader :: Version -> HashMap FieldName ByteString -> RecordHeader
-- | The 'Version' carried by the header.
[_recWarcVersion] :: RecordHeader -> Version
-- | The header fields, keyed by 'FieldName'. Values are stored as
-- 'ByteString's; use 'lookupField' or 'field' to obtain typed values.
[_recHeaders] :: RecordHeader -> HashMap FieldName ByteString
-- | The value type of the 'warcType' field.
data WarcType
WarcInfo :: WarcType
Response :: WarcType
Resource :: WarcType
Request :: WarcType
Metadata :: WarcType
Revisit :: WarcType
Conversion :: WarcType
Continuation :: WarcType
-- | A type carried as arbitrary 'Text'; presumably used for type names
-- not covered by the other constructors (confirm against the parser).
FutureType :: !Text -> WarcType
-- | A record identifier, represented as a 'Uri'. It is the value type of
-- 'warcRecordId', 'warcConcurrentTo' and 'warcWarcinfoID'.
newtype RecordId
RecordId :: Uri -> RecordId
-- | The value type of the 'warcTruncated' field.
data TruncationReason
TruncLength :: TruncationReason
TruncTime :: TruncationReason
TruncDisconnect :: TruncationReason
TruncUnspecified :: TruncationReason
-- | A reason carried as free-form 'Text'; presumably used for values that
-- match none of the other constructors (confirm against the parser).
TruncOther :: !Text -> TruncationReason
-- | A digest, consisting of an algorithm and a hash. It is the value type
-- of 'warcBlockDigest' and 'warcPayloadDigest'.
data Digest
-- | Build a 'Digest' from its algorithm and its hash, in that order.
-- The constructor takes one strict 'ByteString' per record field.
Digest :: !ByteString -> !ByteString -> Digest
-- | The algorithm and the hash of the digest, respectively.
[digestAlgorithm, digestHash] :: Digest -> !ByteString
-- | A URI, held as a 'ByteString'. This listing does not show whether the
-- contents are validated in any way.
newtype Uri
Uri :: ByteString -> Uri
-- | A description of a typed header field: its name, plus an encoder and
-- a parser for values of type @a@.
data Field a
Field :: FieldName -> (a -> Builder) -> Parser a -> Field a
-- | The name of the field.
[fieldName] :: Field a -> FieldName
-- | Render a value of the field as a 'Builder'.
[encode] :: Field a -> a -> Builder
-- | A 'Parser' for a value of the field.
[decode] :: Field a -> Parser a
-- | The name of a header field, wrapping 'Text'. It is the key type of the
-- map stored in a 'RecordHeader'.
newtype FieldName
FieldName :: Text -> FieldName
-- | Unwrap the underlying 'Text'.
[getFieldName] :: FieldName -> Text
-- | A lens-y means of querying 'Field's: given a 'Field', a @Traversal'@
-- from a 'RecordHeader' to values of that field's type.
field :: Field a -> Traversal' RecordHeader a
-- | Look up the value of a field in a 'RecordHeader'.
--
-- Returns @Nothing@ if the field is not present, @Just (Left err)@ in the
-- event of a parse error, and @Just (Right v)@ on success.
lookupField :: RecordHeader -> Field a -> Maybe (Either String a)
-- | Add a value for the given 'Field' to a 'RecordHeader', returning the
-- updated header. NOTE(review): whether an existing value for the same
-- field is replaced is not visible from this listing - confirm.
addField :: Field a -> a -> RecordHeader -> RecordHeader
-- | Turn a @Field a@ into a @Field b@, given conversion functions in both
-- directions between @a@ and @b@.
mapField :: (a -> b) -> (b -> a) -> Field a -> Field b
-- | A 'Field' for the given 'FieldName' whose value type is 'ByteString';
-- presumably the value is passed through without interpretation (confirm).
rawField :: FieldName -> Field ByteString
-- | Predefined 'Field' with a 'RecordId' value; presumably the
-- @WARC-Record-ID@ header (the header name is not shown here - confirm).
warcRecordId :: Field RecordId
-- | Predefined 'Field' with an 'Integer' value; presumably the
-- @Content-Length@ header (confirm).
contentLength :: Field Integer
-- | Predefined 'Field' with a 'UTCTime' value; presumably the @WARC-Date@
-- header (confirm).
warcDate :: Field UTCTime
-- | Predefined 'Field' with a 'WarcType' value; presumably the
-- @WARC-Type@ header (confirm).
warcType :: Field WarcType
-- | Predefined 'Field' with a 'ByteString' value; presumably the
-- @Content-Type@ header (confirm).
contentType :: Field ByteString
-- | Predefined 'Field' with a 'RecordId' value; presumably the
-- @WARC-Concurrent-To@ header (confirm).
warcConcurrentTo :: Field RecordId
-- | Predefined 'Field' with a 'Digest' value; presumably the
-- @WARC-Block-Digest@ header (confirm).
warcBlockDigest :: Field Digest
-- | Predefined 'Field' with a 'Digest' value; presumably the
-- @WARC-Payload-Digest@ header (confirm).
warcPayloadDigest :: Field Digest
-- | Predefined 'Field' with a 'ByteString' value; presumably the
-- @WARC-IP-Address@ header (confirm).
warcIpAddress :: Field ByteString
-- | Predefined 'Field' with a 'Uri' value; presumably the
-- @WARC-Refers-To@ header (confirm).
warcRefersTo :: Field Uri
-- | Predefined 'Field' with a 'Uri' value; presumably the
-- @WARC-Target-URI@ header (confirm).
warcTargetUri :: Field Uri
-- | Predefined 'Field' with a 'TruncationReason' value; presumably the
-- @WARC-Truncated@ header (confirm).
warcTruncated :: Field TruncationReason
-- | Predefined 'Field' with a 'RecordId' value; presumably the
-- @WARC-Warcinfo-ID@ header (confirm).
warcWarcinfoID :: Field RecordId
-- | Predefined 'Field' with a 'Text' value; presumably the
-- @WARC-Filename@ header (confirm).
warcFilename :: Field Text
-- | Predefined 'Field' with a 'Uri' value; presumably the @WARC-Profile@
-- header (confirm).
warcProfile :: Field Uri
-- | Predefined 'Field' with an 'Integer' value; presumably the
-- @WARC-Segment-Number@ header (confirm).
warcSegmentNumber :: Field Integer
-- | Predefined 'Field' with an 'Integer' value; presumably the
-- @WARC-Segment-Total-Length@ header (confirm).
warcSegmentTotalLength :: Field Integer
-- | A lens onto the 'Version' of a 'RecordHeader'; presumably the lens
-- counterpart of the '_recWarcVersion' selector.
recWarcVersion :: Lens' RecordHeader Version
-- | A lens onto the field map of a 'RecordHeader'; presumably the lens
-- counterpart of the '_recHeaders' selector.
recHeaders :: Lens' RecordHeader (HashMap FieldName ByteString)
instance GHC.Show.Show Data.Warc.Header.RecordHeader
instance GHC.Classes.Ord Data.Warc.Header.Digest
instance GHC.Classes.Eq Data.Warc.Header.Digest
instance GHC.Read.Read Data.Warc.Header.Digest
instance GHC.Show.Show Data.Warc.Header.Digest
instance GHC.Classes.Eq Data.Warc.Header.TruncationReason
instance GHC.Classes.Ord Data.Warc.Header.TruncationReason
instance GHC.Read.Read Data.Warc.Header.TruncationReason
instance GHC.Show.Show Data.Warc.Header.TruncationReason
instance GHC.Classes.Ord Data.Warc.Header.RecordId
instance GHC.Classes.Eq Data.Warc.Header.RecordId
instance GHC.Read.Read Data.Warc.Header.RecordId
instance GHC.Show.Show Data.Warc.Header.RecordId
instance GHC.Classes.Ord Data.Warc.Header.Uri
instance GHC.Classes.Eq Data.Warc.Header.Uri
instance GHC.Read.Read Data.Warc.Header.Uri
instance GHC.Show.Show Data.Warc.Header.Uri
instance GHC.Classes.Eq Data.Warc.Header.WarcType
instance GHC.Classes.Ord Data.Warc.Header.WarcType
instance GHC.Read.Read Data.Warc.Header.WarcType
instance GHC.Show.Show Data.Warc.Header.WarcType
instance Data.String.IsString Data.Warc.Header.FieldName
instance GHC.Read.Read Data.Warc.Header.FieldName
instance GHC.Show.Show Data.Warc.Header.FieldName
instance GHC.Classes.Ord Data.Warc.Header.Version
instance GHC.Classes.Eq Data.Warc.Header.Version
instance GHC.Read.Read Data.Warc.Header.Version
instance GHC.Show.Show Data.Warc.Header.Version
instance Data.Hashable.Class.Hashable Data.Warc.Header.FieldName
instance GHC.Classes.Eq Data.Warc.Header.FieldName
instance GHC.Classes.Ord Data.Warc.Header.FieldName
-- | WARC (or Web ARChive) is an archival file format widely used to
-- distribute corpora of crawled web content (see, for instance, the
-- Common Crawl corpus). A WARC file consists of a set of records, each
-- of which describes a web request or response.
--
-- This module provides a streaming parser and encoder for WARC archives
-- for use with the pipes package.
module Data.Warc
-- | A WARC archive.
--
-- This represents a sequence of records followed by whatever data was
-- left over from the parse: a 'FreeT' of 'Record' layers over @m@ whose
-- final result is a 'Producer' of 'ByteString's.
type Warc m a = FreeT (Record m) m (Producer ByteString m a)
-- | A WARC record.
--
-- This represents a single record of a WARC file, consisting of a set of
-- headers and a means of producing the record's body.
data Record m r
Record :: RecordHeader -> Producer ByteString m r -> Record m r
-- | The WARC headers of the record.
[recHeader] :: Record m r -> RecordHeader
-- | The body of the record, as a 'Producer' of 'ByteString' chunks that
-- returns an @r@ when it finishes.
[recContent] :: Record m r -> Producer ByteString m r
-- | Parse a WARC archive from a 'Producer' of 'ByteString's.
--
-- Note that this function does not actually do any parsing itself; it
-- merely returns a 'Warc' value which can then be run (for instance with
-- 'iterRecords' or 'produceRecords') to parse individual records.
parseWarc :: (Functor m, Monad m) => Producer ByteString m a -> Warc m a
-- | Iterate over the 'Record's in a WARC archive, running the given
-- action on each one.
--
-- The result is the 'Producer' at the end of the 'Warc', i.e. whatever
-- data was left over from the parse.
iterRecords :: forall m a. Monad m => (forall b. Record m b -> m b) -> Warc m a -> m (Producer ByteString m a)
-- | Turn a 'Warc' into a 'Producer' of @o@ values by applying the given
-- function to the header and body of each record.
--
-- The resulting 'Producer' returns the 'Producer' at the end of the
-- 'Warc', i.e. whatever data was left over from the parse.
-- NOTE(review): described from the type signature alone - confirm
-- against the implementation.
produceRecords :: forall m o a. Monad m => (forall b. RecordHeader -> Producer ByteString m b -> Producer o m b) -> Warc m a -> Producer o m (Producer ByteString m a)
-- | Encode a 'Record' in WARC format, as a 'Producer' of 'ByteString'
-- chunks that returns the record's own result value.
encodeRecord :: Monad m => Record m a -> Producer ByteString m a
instance GHC.Base.Monad m => GHC.Base.Functor (Data.Warc.Record m)