-- src/Subversion/Dump.hs

{-# LANGUAGE OverloadedStrings #-}

module Subversion.Dump
       ( RevDate
       , Revision(..)

       , OpKind(..)
       , OpAction(..)
       , Operation(..)

       , FieldMap
       , Entry(..)

       , readSvnDump
       , readSvnDumpRaw
       ) where

import           Control.Applicative hiding (many, (<|>))
import           Control.Monad
import qualified Data.ByteString.Lazy as B
--import qualified Data.ByteString.Lazy.Char8 as BC
import qualified Data.List as L
import           Data.Maybe
import           Data.Text.Encoding.Error (lenientDecode)
import           Data.Text.Lazy hiding (map, count)
import           Data.Text.Lazy.Encoding as E
import           System.FilePath
import           Text.Parsec
import           Text.Parsec.ByteString.Lazy as PB

import           Prelude hiding (getContents)

default (Data.Text.Lazy.Text)

-- | A parser for Subversion dump files.  The objective is to convert a dump
--   file into a series of data structures representing that same information.
--   It uses 'Data.ByteString.Lazy' for reading the file, and 'Data.Text.Lazy'
--   to represent text fields which may contain Unicode characters.

-- At the topmost level, a dump file is simply an in-order, linear list of
-- revisions, where each revision consists of a series of "operation nodes"
-- that represent the changes made by that revision to the repository.  The
-- author name and revision comment are decoded from UTF-8.

-- The date of a revision, kept as text: 'readSvnDump' fills it from the
-- "svn:date" property via 'parseDate', which currently returns its argument
-- unchanged.
type RevDate = Text

-- | One revision of the repository together with the node operations that
--   'readSvnDump' grouped under it.
data Revision = Revision
  { revNumber     :: Int          -- ^ value of the @Revision-number@ tag
  , revDate       :: RevDate      -- ^ taken from the @svn:date@ property
  , revAuthor     :: Maybe Text   -- ^ @svn:author@ property, if present
  , revComment    :: Maybe Text   -- ^ @svn:log@ property, if present
  , revOperations :: [Operation]  -- ^ entries that followed the revision record
  } deriving Show

-- Each node reflects the changes to a single file.  Note that branches don't
-- need to be considered separately, since in Subversion, all files are stored
-- within a single filesystem.  Branches are something the user applies "after
-- the fact" by using specially named paths, such as "foo/branches".  The
-- file's contents are not decoded, as we have no way of knowing what the
-- intended encoding should be -- or even if there is one, in the case of
-- binary files.
--
-- 'opContentLength' is provided as a separate member to avoid reading in the
-- full contents of the operation solely to determine its length.  This way,
-- you can inspect the length while deferring the content read if you don't
-- need it.

-- | The kind of node an 'Operation' applies to (@file@ or @dir@ in the dump).
data OpKind
  = File
  | Directory
  deriving (Show, Enum, Eq)
-- | What an 'Operation' does to its node (@add@, @change@, @replace@ or
--   @delete@ in the dump).
data OpAction
  = Add
  | Change
  | Replace
  | Delete
  deriving (Show, Enum, Eq)

-- | A single node record of a revision, i.e. one change to one path.  Each
--   field is filled by 'readSvnDump' from the tag named next to it.
data Operation = Operation
  { opKind          :: OpKind          -- ^ @Node-kind@
  , opAction        :: OpAction        -- ^ @Node-action@
  , opPathname      :: FilePath        -- ^ @Node-path@
  , opContents      :: B.ByteString    -- ^ raw, undecoded body of the entry
  , opContentLength :: Int             -- ^ @Text-content-length@
  , opChecksumMD5   :: Maybe String    -- ^ @Text-content-md5@, if present
  , opChecksumSHA1  :: Maybe String    -- ^ @Text-content-sha1@, if present
  , opCopyFromRev   :: Maybe Int       -- ^ @Node-copyfrom-rev@, if present
  , opCopyFromPath  :: Maybe FilePath  -- ^ @Node-copyfrom-path@, if present
  } deriving Show

-- A further note is needed on 'opCopyFromRev' and 'opCopyFromPath', since
-- these two represent the only real complexity in a dump file.  Basically
-- what they say is that there is no 'opContents' record for this 'Operation'.
-- Rather, the contents are to be taken from another file in a past revision.
-- Since this historical information would be expensive to maintain,
-- 'Operation' only provides the data given by the dump file, and it is left
-- as an analytical pass on this data to build the structures necessary to
-- figure out what those contents would have been.
--
-- So, with our structures defined, we're ready to read in the file.  Since we
-- don't know what each element will be yet (revisions are interspersed with
-- nodes), we read them first into the much more general 'Entry' structure.

-- | Reads a dump file from a ByteString in the IO monad into a list of
--   'Revision' values.  This is the "cooked" parallel of 'readSvnDumpRaw'.
--
--   The raw entries are split into runs: the first entry, and every later
--   entry that carries a @Revision-number@ tag, opens a new run; the entries
--   without that tag which follow it become the revision's operations.  A
--   parse failure from 'readSvnDumpRaw' is passed through unchanged as 'Left'.
--
--   Field conversion is lazy, so the calls to 'read' and 'error' below only
--   fire when the corresponding field is forced.
readSvnDump :: B.ByteString -> IO (Either ParseError [Revision])
readSvnDump io = do
  result <- readSvnDumpRaw io
  return $ map processRevs . L.groupBy sameRev <$> result

  where
    -- An entry stays in the current run unless it is itself a revision
    -- record, i.e. unless it has a "Revision-number" tag.
    sameRev _ y = isNothing $ L.lookup "Revision-number" (entryTags y)

    -- Field access: the M variants return Nothing for a missing field, the
    -- plain variants fall back to the empty string.
    getField f n x  = L.lookup n (f x)
    getField' f n x = fromMaybe "" (getField f n x)
    tagM            = getField entryTags
    propM           = getField entryProps
    tag             = getField' entryTags
    prop            = getField' entryProps

    -- 'L.groupBy' never yields an empty group, so the first clause is only a
    -- totality guard.
    processRevs [] = error "Unexpected"
    processRevs (rev:ops) =
      Revision
        { revNumber     = read $ tag "Revision-number" rev
        , revDate       = parseDate $ prop "svn:date" rev
        , revAuthor     = propM "svn:author" rev
        , revComment    = propM "svn:log" rev
        , revOperations = map processOp ops
        }

    processOp op =
      Operation
        { opKind          = getOpKind $ tag "Node-kind" op
        , opAction        = getOpAction $ tag "Node-action" op
        , opPathname      = tag "Node-path" op
        , opContents      = entryBody op
          -- 'parseEntry' accepts entries without a Text-content-length tag
          -- and gives them an empty body; report length 0 for those instead
          -- of evaluating @read ""@, which raises when the field is forced.
        , opContentLength = maybe 0 read (tagM "Text-content-length" op)
        , opCopyFromRev   = read <$> tagM "Node-copyfrom-rev" op
        , opCopyFromPath  = tagM "Node-copyfrom-path" op
        , opChecksumMD5   = tagM "Text-content-md5" op
        , opChecksumSHA1  = tagM "Text-content-sha1" op
        }

    -- NOTE(review): an entry with no Node-kind tag reaches the fallback
    -- clause (the tag defaults to ""), so forcing 'opKind' raises.  Dump
    -- files may omit Node-kind for some records (delete actions?) -- confirm
    -- against the dump format before relying on 'opKind' for every node.
    getOpKind "file" = File
    getOpKind "dir"  = Directory
    getOpKind _      = error "Unexpected"

    getOpAction "add"     = Add
    getOpAction "delete"  = Delete
    getOpAction "change"  = Change
    getOpAction "replace" = Replace
    getOpAction _         = error "Unexpected"

-- | An association list from field name to value, in the order the fields
--   were parsed.  It is searched with 'L.lookup', so the first occurrence of
--   a name wins.
type FieldMap a = [(String, a)]

-- | One raw record of the dump, exactly as 'parseEntry' read it; revision
--   records and node records share this shape.
data Entry = Entry
  { entryTags  :: FieldMap String  -- ^ the @Key: value@ header lines
  , entryProps :: FieldMap Text    -- ^ the property section (empty if absent)
  , entryBody  :: B.ByteString     -- ^ undecoded text content (empty if absent)
  } deriving Show

-- | Runs 'parseSvnDump' over the given input, using an empty source name, and
--   returns either the parse error or the raw entries.  No I/O is performed
--   here; the result is merely wrapped in 'IO'.
readSvnDumpRaw :: B.ByteString -> IO (Either ParseError [Entry])
readSvnDumpRaw = return . parse parseSvnDump ""

-- These are the Parsec parsers for the various parts of the input file.

-- | Parses one header line of the form @Key: value@ terminated by a newline.
--   The key starts with a letter and continues with letters, digits, @-@ or
--   @_@; the colon is followed by one whitespace character; the value is a
--   non-empty run of characters up to the end of the line.
parseTag :: PB.Parser (String, String)
parseTag = do
  key   <- keyName
  _     <- char ':'
  _     <- space
  value <- many1 (noneOf "\n")
  _     <- newline
  return (key, value)
  where
    keyName = (:) <$> letter <*> many keyRest
    keyRest = letter <|> digit <|> oneOf "-_"

-- | Parses the line that introduces a property key or value: the letter @K@
--   or @V@, one whitespace character, a decimal length and a newline.
--   Returns the letter together with the length.
parseIndicator :: PB.Parser (Char, Integer)
parseIndicator = do
  kind   <- oneOf "KV"
  _      <- space
  digits <- many1 digit
  _      <- newline
  return (kind, read digits)

-- | Removes the next @len@ bytes from the parser's remaining input and
--   returns them.  The bytes are cut straight out of the input stream rather
--   than consumed character by character; if fewer than @len@ bytes remain,
--   whatever is left is returned.
readTextRange :: Integer -> PB.Parser B.ByteString
readTextRange len = do
  remaining <- getInput
  setInput (B.drop size remaining)
  return (B.take size remaining)
  where
    size = fromIntegral len

--readTextRange' :: Integer -> PB.Parser B.ByteString
--readTextRange' len = BC.pack <$> count (fromIntegral len) anyChar

-- | Parses one length-prefixed property item: an indicator line (see
--   'parseIndicator'), then the number of bytes it announces, then a newline.
--
--   @expected@ is the indicator letter required at this position (@K@ for a
--   key, @V@ for a value); any other letter is reported via 'unexpected'.
--
--   The payload is decoded as UTF-8 leniently: malformed byte sequences are
--   replaced with U+FFFD.  Strict decoding would instead throw a pure
--   exception at whatever later point the resulting 'Text' is forced, long
--   after the parse itself had reported success.
parseSpecValue :: Char -> PB.Parser Text
parseSpecValue expected = do
  (kind, len) <- parseIndicator
  when (kind /= expected) $ unexpected "Unexpected spec value char"
  value <- readTextRange len
  _ <- newline
  return $ E.decodeUtf8With lenientDecode value

-- | Parses one property: a @K@ item holding the name, followed by a @V@ item
--   holding the value.  The name is converted to a 'String'.
parseProperty :: PB.Parser (String, Text)
parseProperty = do
  key   <- parseSpecValue 'K'
  value <- parseSpecValue 'V'
  return (unpack key, value)

-- | Parses one record of the dump: at least one header line and a blank
--   line, then the optional property section, the optional text content, and
--   finally any number of newlines separating it from the next record.
--
--   The property section is only read when a @Prop-content-length@ header is
--   present (its value is not used; the section ends at @PROPS-END@).  The
--   text content is only read when a @Text-content-length@ header is present,
--   and that header's value gives the number of bytes to take.
parseEntry :: PB.Parser Entry
parseEntry = do
  tags  <- many1 parseTag <* newline
  props <- maybe (return []) (const propertyBlock)
                 (L.lookup "Prop-content-length" tags)
  body  <- maybe (return B.empty) (readTextRange . read)
                 (L.lookup "Text-content-length" tags)
  _     <- many newline <?> "entry-terminating newline"
  return (Entry tags props body)
  where
    propertyBlock = many parseProperty <* string "PROPS-END\n"

-- | Consumes the fixed preamble of a dump: the format-version line (only
--   version 2 is accepted) and the @UUID:@ line, each followed by a blank
--   line.  Nothing from the preamble is kept.
parseHeader :: PB.Parser ()
parseHeader = versionLine *> uuidLine
  where
    versionLine =
      void (string "SVN-fs-dump-format-version: 2\n\n"
            <?> "Dump file starts without a recognizable tag")

    uuidLine = do
      _ <- string "UUID: "
      _ <- many1 (hexDigit <|> char '-')
      _ <- newline
      void newline

-- | Parses a whole dump: the header, then every entry, then end of input.
--
--   'many' stops quietly at the first position where no further entry can
--   start.  Requiring 'eof' afterwards turns unparsable trailing data into a
--   parse error, where it would otherwise be dropped and a truncated list
--   returned as a success.
parseSvnDump :: PB.Parser [Entry]
parseSvnDump = parseHeader *> many parseEntry <* eof

-- | Converts the text of a date property into a 'RevDate'.  The text is
--   currently returned unchanged.
parseDate :: Text -> RevDate
parseDate date = date

-- Dump.hs ends here