{-# LANGUAGE OverloadedStrings #-}

module Subversion.Dump
       ( RevDate
       , Revision(..)
       , OpKind(..)
       , OpAction(..)
       , Operation(..)
       , FieldMap
       , Entry(..)
       , readSvnDump
       , readSvnDumpRaw
       ) where

import           Control.Applicative hiding (many, (<|>))
import           Control.Monad
import qualified Data.ByteString.Lazy as B
--import qualified Data.ByteString.Lazy.Char8 as BC
import qualified Data.List as L
import           Data.Maybe
import           Data.Text.Lazy hiding (map, count)
import           Data.Text.Lazy.Encoding as E
import           System.FilePath
import           Text.Parsec
import           Text.Parsec.ByteString.Lazy as PB

import           Prelude hiding (getContents)

default (Data.Text.Lazy.Text)

-- | A parser for Subversion dump files.  The objective is to convert a dump
--   file into a series of data structures representing that same information.
--   It uses 'Data.ByteString.Lazy' to read the file, and 'Data.Text.Lazy' to
--   represent text fields which may contain Unicode characters.

-- At the topmost level, a dump file is simply an in-order, linear list of
-- revisions, where each revision consists of a series of "operation nodes"
-- that represent the changes made by that revision to the repository.  The
-- author name and revision comment are decoded from UTF8.

type RevDate = Text

data Revision = Revision
    { revNumber     :: Int          -- ^ Value of the @Revision-number@ tag.
    , revDate       :: RevDate      -- ^ The @svn:date@ property, kept as text.
    , revAuthor     :: Maybe Text   -- ^ The @svn:author@ property, if present.
    , revComment    :: Maybe Text   -- ^ The @svn:log@ property, if present.
    , revOperations :: [Operation]  -- ^ Nodes belonging to this revision, in
                                    --   dump-file order.
    }
    deriving Show

-- Each node reflects the changes to a single file.  Note that branches don't
-- need to be considered separately, since in Subversion, all files are stored
-- within a single filesystem.  Branches are something the user applies "after
-- the fact" by using specially named paths, such as "foo/branches".  The
-- file's contents are not decoded, as we have no way of knowing what the
-- intended encoding should be -- or even if there is one, in the case of
-- binary files.
--
-- 'opContentLength' is provided as a separate member to avoid reading in the
-- full contents of the operation solely to determine its length.  This way,
-- you can inspect the length while deferring the content read if you don't
-- need it.

data OpKind = File | Directory
            deriving (Show, Enum, Eq)

data OpAction = Add | Change | Replace | Delete
              deriving (Show, Enum, Eq)

data Operation = Operation
    { opKind          :: OpKind          -- ^ From the @Node-kind@ tag.
    , opAction        :: OpAction        -- ^ From the @Node-action@ tag.
    , opPathname      :: FilePath        -- ^ From the @Node-path@ tag.
    , opContents      :: B.ByteString    -- ^ Raw, undecoded node body.
    , opContentLength :: Int             -- ^ From the @Text-content-length@ tag.
    , opChecksumMD5   :: Maybe String    -- ^ From @Text-content-md5@, if present.
    , opChecksumSHA1  :: Maybe String    -- ^ From @Text-content-sha1@, if present.
    , opCopyFromRev   :: Maybe Int       -- ^ From @Node-copyfrom-rev@, if present.
    , opCopyFromPath  :: Maybe FilePath  -- ^ From @Node-copyfrom-path@, if present.
    }
    deriving Show

-- A further note is needed on 'opCopyFromRev' and 'opCopyFromPath', since
-- these two represent the only real complexity in a dump file.  Basically
-- what they say is that there is no 'opContents' record for this 'Operation'.
-- Rather, the contents are to be taken from another file in a past revision.
-- Since this historical information would be expensive to maintain,
-- 'Operation' only provides the data given by the dump file, and it is left
-- as an analytical pass on this data to build the structures necessary to
-- figure out what those contents would have been.
--
-- So, with our structures defined, we're ready to read in the file.  Since we
-- don't know what each element will be yet (revisions are interspersed with
-- nodes), we read them first into the much more general 'Entry' structure.

-- | Reads a dump file from a ByteString in the IO monad into a list of
--   Revision values.  This is the "cooked" parallel of 'readSvnDumpRaw'.
--
-- The raw entries are split into runs that each start at an entry carrying a
-- "Revision-number" tag; the head of every run becomes the 'Revision' and the
-- remaining entries become its 'Operation's.  Field conversion is lazy, so a
-- malformed or missing tag only raises an error when that field is forced.
readSvnDump :: B.ByteString -> IO (Either ParseError [Revision])
readSvnDump io = do
  result <- readSvnDumpRaw io
  return $ map processRevs . L.groupBy sameRev <$> result
  where
    -- 'L.groupBy' compares each candidate against the first element of the
    -- current group; ignoring that first element and accepting every entry
    -- WITHOUT a revision number makes each revision header open a new group.
    sameRev _ y = isNothing $ L.lookup "Revision-number" (entryTags y)

    -- Field accessors: the M-suffixed variants return 'Maybe', the plain
    -- ones fall back to the empty string when the key is missing.
    getField f n x  = L.lookup n (f x)
    getField' f n x = fromMaybe "" (getField f n x)

    tagM  = getField entryTags
    propM = getField entryProps
    tag   = getField' entryTags
    prop  = getField' entryProps

    -- 'L.groupBy' never yields an empty group, so the first equation is
    -- unreachable.  If the dump's first entry is not a revision header it is
    -- still treated as one here, and 'revNumber' fails when forced.
    processRevs []        = error "Unexpected"
    processRevs (rev:ops) = Revision
      { revNumber     = read $ tag "Revision-number" rev
      , revDate       = parseDate $ prop "svn:date" rev
      , revAuthor     = propM "svn:author" rev
      , revComment    = propM "svn:log" rev
      , revOperations = map processOp ops
      }

    processOp op = Operation
      { opKind          = getOpKind $ tag "Node-kind" op
      , opAction        = getOpAction $ tag "Node-action" op
      , opPathname      = tag "Node-path" op
      , opContents      = entryBody op
        -- Nodes without a "Text-content-length" tag have an empty body (see
        -- 'parseEntry'), so report length 0 for them instead of evaluating
        -- @read ""@, which throws.
      , opContentLength = maybe 0 read $ tagM "Text-content-length" op
      , opCopyFromRev   = read <$> tagM "Node-copyfrom-rev" op
      , opCopyFromPath  = tagM "Node-copyfrom-path" op
      , opChecksumMD5   = tagM "Text-content-md5" op
      , opChecksumSHA1  = tagM "Text-content-sha1" op
      }

    -- NOTE(review): a node with no "Node-kind" tag reaches the error branch
    -- when 'opKind' is forced; delete nodes may omit that tag -- confirm
    -- against the dump format documentation.
    getOpKind kind = case kind of
      "file" -> File
      "dir"  -> Directory
      _      -> error "Unexpected"

    getOpAction kind = case kind of
      "add"     -> Add
      "delete"  -> Delete
      "change"  -> Change
      "replace" -> Replace
      _         -> error "Unexpected"

-- | An association list keyed by field name.
type FieldMap a = [(String, a)]

-- | One raw record of the dump file: its header tags, its decoded
--   properties, and its undecoded body.
data Entry = Entry
    { entryTags  :: FieldMap String
    , entryProps :: FieldMap Text
    , entryBody  :: B.ByteString
    }
    deriving Show

-- | Runs the dump-file parser over the given input and returns the raw
--   entries without grouping them into revisions.
readSvnDumpRaw :: B.ByteString -> IO (Either ParseError [Entry])
readSvnDumpRaw dump = return $ parse parseSvnDump "" dump

-- These are the Parsec parsers for the various parts of the input file.
-- | Parses one @Key: value@ header line, including its terminating newline.
parseTag :: PB.Parser (String, String)
parseTag =
  (,) <$> fieldKey <* char ':' <* space
      <*> fieldValue <* newline
  where
    fieldKey   = (:) <$> letter <*> many fieldChar
    fieldChar  = letter <|> digit <|> oneOf "-_"
    fieldValue = many1 (noneOf "\n")

-- | Parses a property length indicator such as @K 10@ or @V 42@ and returns
--   the indicator character together with the length.
parseIndicator :: PB.Parser (Char, Integer)
parseIndicator =
  (,) <$> oneOf "KV" <* space
      <*> (read <$> many1 digit <* newline)  -- digits only, so 'read' is safe

-- | Takes the next @len@ bytes directly off the parser input.  The source
--   position is not advanced, and a short input yields a short result.
readTextRange :: Integer -> PB.Parser B.ByteString
readTextRange len = do
  input <- getInput
  let value = B.take (fromIntegral len) input
  setInput $ B.drop (fromIntegral len) input
  return value

--readTextRange' :: Integer -> PB.Parser B.ByteString
--readTextRange' len = BC.pack <$> count (fromIntegral len) anyChar

-- | Parses one length-prefixed property key or value whose indicator must be
--   the given character, and decodes the payload as UTF-8.
--
--   NOTE(review): 'E.decodeUtf8' is applied without a fallback; presumably
--   property payloads are always valid UTF-8 -- confirm.
parseSpecValue :: Char -> PB.Parser Text
parseSpecValue expected = do
  (kind, len) <- parseIndicator
  when (kind /= expected) $
    unexpected "Unexpected spec value char"
  value <- readTextRange len
  --trace ("Value: " ++ (show value)) $ return ()
  _ <- newline
  return $ E.decodeUtf8 value

-- | Parses a key/value property pair.
parseProperty :: PB.Parser (String, Text)
parseProperty =
  (,) <$> (unpack <$> parseSpecValue 'K')
      <*> parseSpecValue 'V'

-- | Parses one dump record: header tags, a blank line, then the property
--   section (only when a @Prop-content-length@ tag is present) and the body
--   (only when a @Text-content-length@ tag is present), followed by any
--   number of newlines.
--
--   A @Text-content-length@ that is not a non-negative integer is reported
--   as a parse failure rather than a runtime exception.
parseEntry :: PB.Parser Entry
parseEntry = do
  fields <- many1 parseTag <* newline
  props  <- case L.lookup "Prop-content-length" fields of
    Nothing -> return []
    Just _  -> many parseProperty <* string "PROPS-END\n"
  body   <- case L.lookup "Text-content-length" fields of
    Nothing  -> return B.empty
    Just len -> case parseLength len of
      Just n  -> readTextRange n
      Nothing -> parserFail ("Malformed Text-content-length: " ++ len)

  _ <- many newline <?> "entry-terminating newline"

  return Entry { entryTags  = fields
               , entryProps = props
               , entryBody  = body }
  where
    -- Accepts what 'read' accepted (an integer optionally followed by
    -- whitespace) but rejects negative lengths and trailing garbage.
    parseLength :: String -> Maybe Integer
    parseLength raw = case reads raw of
      [(n, rest)] | n >= 0 && L.all (`L.elem` trailingSpace) rest -> Just n
      _                                                           -> Nothing

    trailingSpace :: String
    trailingSpace = " \t\r"

-- | Consumes the fixed dump preamble: the format-version line (only version
--   2 is accepted) and the repository UUID line, each followed by a blank
--   line.
parseHeader :: PB.Parser ()
parseHeader = do
  _ <- string "SVN-fs-dump-format-version: 2\n\n"
       <?> "Dump file starts without a recognizable tag"
  _ <- string "UUID: " <* many1 (hexDigit <|> char '-') <* newline <* newline
  return ()

-- | Parses a whole dump file: the header followed by zero or more entries.
parseSvnDump :: PB.Parser [Entry]
parseSvnDump = parseHeader >> many parseEntry

-- | Converts the text of an @svn:date@ property into a 'RevDate'.  Currently
--   the text is passed through unchanged.
parseDate :: Text -> RevDate
parseDate = id

-- SvnDump.hs ends here