src/Subversion/Dump.hs

{-# LANGUAGE OverloadedStrings #-}

module Subversion.Dump
       ( RevDate
       , Revision(..)

       , OpKind(..)
       , OpAction(..)
       , Operation(..)

       , readSvnDump
       ) where

import           Control.Applicative hiding (many)
import           Data.ByteString as B hiding (map)
import qualified Data.ByteString.Char8 as BC hiding (map)
import qualified Data.ByteString.Lazy as BL hiding (map)
import qualified Data.List as L
import           Data.Maybe
import           Data.Text (Text)
import qualified Data.Text.Encoding as E
import           Data.Text.Encoding.Error (lenientDecode)

import           Subversion.Dump.Raw

import           Prelude hiding (getContents)

-- With OverloadedStrings enabled, a string literal whose type is otherwise
-- ambiguous is defaulted to ByteString.  Note that this declaration replaces
-- the standard default list for the whole module.
default (ByteString)

-- | A parser for Subversion dump files.  The objective is to convert a dump
--   file into a series of data structures representing that same information.
--   It uses 'Data.ByteString.Lazy' for reading the file, and 'Data.Text' to
--   represent text fields which may contain Unicode characters.

-- At the topmost level, a dump file is simply an in-order, linear list of
-- revisions, where each revision consists of a series of "operation nodes"
-- that represent the changes made by that revision to the repository.  The
-- author name and revision comment are decoded from UTF-8.

-- Revision timestamp: the @svn:date@ property converted to 'Text' by
-- 'parseDate'.  It is kept as text, not as a structured time value.
type RevDate = Text

-- | A single revision from the dump, together with the operations that
--   'readSvnDump' grouped under it.
data Revision = Revision
  { revNumber     :: Int
    -- ^ Value of the @Revision-number@ tag.
  , revDate       :: RevDate
    -- ^ The @svn:date@ property, as converted by 'parseDate'.
  , revAuthor     :: Maybe Text
    -- ^ The @svn:author@ property, or 'Nothing' when it is absent.
  , revComment    :: Maybe Text
    -- ^ The @svn:log@ property, or 'Nothing' when it is absent.
  , revOperations :: [Operation]
    -- ^ The entries that followed the revision record, in dump order.
  }
  deriving Show

-- Each node reflects the changes to a single file.  Note that branches don't
-- need to be considered separately, since in Subversion, all files are stored
-- within a single filesystem.  Branches are something the user applies "after
-- the fact" by using specially named paths, such as "foo/branches".  The
-- file's contents are not decoded, as we have no way of knowing what the
-- intended encoding should be -- or even if there is one, in the case of
-- binary files.
--
-- 'opContentLength' is provided as a separate member to avoid reading in the
-- full contents of the operation solely to determine its length.  This way,
-- you can inspect the length while deferring the content read if you don't
-- need it.

-- | What kind of node an operation touches: @"file"@ maps to 'File' and
--   @"dir"@ maps to 'Directory' (see 'readSvnDump').
data OpKind
  = File
  | Directory
  deriving (Show, Enum, Eq)
-- | What an operation does to its node.  The constructors correspond to the
--   @Node-action@ values @"add"@, @"change"@, @"replace"@ and @"delete"@
--   (see 'readSvnDump').
data OpAction
  = Add
  | Change
  | Replace
  | Delete
  deriving (Show, Enum, Eq)

-- | One change made by a revision to a single path.  The field descriptions
--   below state which part of the raw entry 'readSvnDump' takes each value
--   from.
data Operation = Operation
  { opKind          :: OpKind
    -- ^ From the @Node-kind@ tag.
  , opAction        :: OpAction
    -- ^ From the @Node-action@ tag.
  , opPathname      :: FilePath
    -- ^ The @Node-path@ tag, unpacked with 'BC.unpack'.
  , opContents      :: BL.ByteString
    -- ^ The entry body, left undecoded.
  , opContentLength :: Int
    -- ^ The @Text-content-length@ tag read as an 'Int'.
  , opChecksumMD5   :: Maybe Text
    -- ^ The @Text-content-md5@ tag, when present.
  , opChecksumSHA1  :: Maybe Text
    -- ^ The @Text-content-sha1@ tag, when present.
  , opCopyFromRev   :: Maybe Int
    -- ^ The @Node-copyfrom-rev@ tag, when present.
  , opCopyFromPath  :: Maybe FilePath
    -- ^ The @Node-copyfrom-path@ tag, when present.
  }
  deriving Show

-- A further note is needed on 'opCopyFromRev' and 'opCopyFromPath', since
-- these two represent the only real complexity in a dump file.  Basically
-- what they say is that there is no 'opContents' record for this 'Operation'.
-- Rather, the contents are to be taken from another file in a past revision.
-- Since this historical information would be expensive to maintain,
-- 'Operation' only provides the data given by the dump file, and it is left
-- as an analytical pass on this data to build the structures necessary to
-- figure out what those contents would have been.
--
-- So, with our structures defined, we're ready to read in the file.  Since we
-- don't know what each element will be yet (revisions are interspersed with
-- nodes), we read them first into the much more general Node structure.

-- | Converts the contents of a dump file, given as a lazy ByteString, into a
--   list of 'Revision' values.  This is a pure function; it is the "cooked"
--   parallel of 'readSvnDumpRaw'.
--
--   The raw entries are grouped so that every entry carrying a
--   @Revision-number@ tag starts a new group; the entries that follow it, up
--   to the next such entry, become that revision's operations.
--
--   The author, log message and checksum fields are decoded leniently: bytes
--   that are not valid UTF-8 become U+FFFD instead of raising an exception
--   from pure code.  The date is converted by 'parseDate'.
--
--   An unrecognised (or missing) @Node-kind@ or @Node-action@ calls 'error'
--   with a message naming the field and the offending value; the error is
--   raised when the corresponding 'Operation' field is evaluated.
readSvnDump :: BL.ByteString -> [Revision]
readSvnDump dump = map processRevs $ L.groupBy sameRev $ readSvnDumpRaw dump
  where
    -- An entry stays in the current group unless it carries a
    -- Revision-number tag, in which case it opens the next group.
    sameRev _ y = isNothing $ L.lookup "Revision-number" (entryTags y)

    -- Field lookup; the primed variant substitutes "" for a missing field.
    getField f n x  = L.lookup n (f x)
    getField' f n x = fromMaybe "" (getField f n x)
    tagM            = getField entryTags
    propM           = getField entryProps
    tag             = getField' entryTags
    prop            = getField' entryProps

    -- Total UTF-8 decoding: invalid input is replaced, never thrown on.
    decodeText :: ByteString -> Text
    decodeText = E.decodeUtf8With lenientDecode

    -- L.groupBy never produces an empty group, so the first equation is not
    -- expected to be reached.
    processRevs [] =
      error "readSvnDump: empty entry group (groupBy never yields one)"
    processRevs (rev:ops) =
      Revision {
          revNumber     = readInt $ tag "Revision-number" rev
        , revDate       = parseDate $ prop "svn:date" rev
        , revAuthor     = decodeText <$> propM "svn:author" rev
        , revComment    = decodeText <$> propM "svn:log" rev
        , revOperations = map processOp ops }

    processOp op =
      Operation {
          opKind          = getOpKind $ tag "Node-kind" op
        , opAction        = getOpAction $ tag "Node-action" op
        , opPathname      = BC.unpack $ tag "Node-path" op
        , opContents      = entryBody op
        , opContentLength = readInt $ tag "Text-content-length" op
        , opCopyFromRev   = readInt <$> tagM "Node-copyfrom-rev" op
        , opCopyFromPath  = BC.unpack <$> tagM "Node-copyfrom-path" op
        , opChecksumMD5   = decodeText <$> tagM "Text-content-md5" op
        , opChecksumSHA1  = decodeText <$> tagM "Text-content-sha1" op }

    -- A record without a Node-kind tag arrives here as "" and therefore
    -- falls through to the error branch.
    getOpKind :: ByteString -> OpKind
    getOpKind kind = case kind of
      "file" -> File
      "dir"  -> Directory
      _      -> error $ "readSvnDump: unexpected Node-kind: " ++ show kind

    -- Likewise, a record without a Node-action tag arrives here as "".
    getOpAction :: ByteString -> OpAction
    getOpAction action = case action of
      "add"     -> Add
      "delete"  -> Delete
      "change"  -> Change
      "replace" -> Replace
      _         -> error $
                     "readSvnDump: unexpected Node-action: " ++ show action

-- | Turns the raw bytes of a date property into a 'RevDate'.  The bytes are
--   only decoded as UTF-8; the timestamp itself is not parsed or validated.
--   Decoding is lenient: input that is not valid UTF-8 is replaced with
--   U+FFFD rather than raising an exception from pure code, as
--   'E.decodeUtf8' would.
parseDate :: ByteString -> RevDate
parseDate = E.decodeUtf8With lenientDecode

-- Dump.hs ends here