{-# LANGUAGE OverloadedStrings #-}

{-| This is a parser for Subversion dump files. The objective is to convert a
    dump file into a series of data structures representing that same
    information. It uses `Data.ByteString.Lazy` for reading the file, and
    `Data.Text` to represent text fields which may contain Unicode
    characters.
-}
module Subversion.Dump
  ( RevDate
  , Revision(..)
  , OpKind(..)
  , OpAction(..)
  , Operation(..)
  , FieldMap
  , Entry(..)
  , readSvnDumpRaw
  , readSvnDump
  ) where

--import Debug.Trace

import Control.Applicative hiding (many, (<|>))
import Control.Monad
import qualified Data.ByteString.Lazy as B
--import qualified Data.ByteString.Lazy.Char8 as BC
import qualified Data.List as L
--import qualified Data.Map as M
import Data.Maybe
import Data.Text.Lazy hiding (map, count)
import Data.Text.Lazy.Encoding as E
import System.FilePath
import Text.Parsec
import Text.Parsec.ByteString.Lazy as PB

import Prelude hiding (getContents)

-- String literals that would otherwise be ambiguous under OverloadedStrings
-- default to lazy Text.
default (Data.Text.Lazy.Text)

{- At the topmost level, a dump file is simply an in-order, linear list of
   revisions, where each revision consists of a series of "operation nodes"
   that represent the changes made by that revision to the repository.

   The author name and revision comment are decoded from UTF8. -}

-- | Revision timestamp, kept as the text found in the dump file.
type RevDate = Text

-- | One revision of the repository together with the operations it made.
data Revision = Revision
  { revNumber     :: Int          -- ^ revision number
  , revDate       :: RevDate      -- ^ revision date
  , revAuthor     :: Maybe Text   -- ^ author name, when the dump records one
  , revComment    :: Maybe Text   -- ^ revision comment, when the dump records one
  , revOperations :: [Operation]  -- ^ operation nodes belonging to this revision
  }
  deriving Show

{- Each node reflects the changes to a single file. Note that branches don't
   need to be considered separately, since in Subversion, all files are stored
   within a single filesystem. Branches are something the user applies "after
   the fact" by using specially named paths, such as "foo/branches".

   The file's contents are not decoded, as we have no way of knowing what the
   intended encoding should be -- or even if there is one, in the case of
   binary files.
`opContentLength` is provided as a separate member to avoid reading in the full contents of the operation solely to determine its length. This way, you can inspect the length while deferring the content read if you don't need it. -} data OpKind = File | Directory deriving (Show, Enum, Eq) data OpAction = Add | Change | Replace | Delete deriving (Show, Enum, Eq) data Operation = Operation { opKind :: OpKind , opAction :: OpAction , opPathname :: FilePath , opContents :: B.ByteString , opContentLength :: Int , opChecksumMD5 :: Maybe String , opChecksumSHA1 :: Maybe String , opCopyFromRev :: Maybe Int , opCopyFromPath :: Maybe FilePath } deriving Show {- A further note is needed on `opCopyFromRev` and `opCopyFromPath`, since these two represent the only real complexity in a dump file. Basically what they say is that there is no `opContents` record for this `Operation`. Rather, the contents to be taken from another file in a past revision. Since this historical information would be expensive to maintain, `Operation` only provides the data given by the dump file, and it is left as an analytical pass on this data to build the structures necessary to figure out what those contents would have been. So, with our structures defined, we're ready to read in the file. Since we don't know what each element will be yet (revisions are interspersed with nodes), we read them first into the much more general Node structure. -} {-| Reads a dump file from a ByteString in the IO monad into a list of Revision values. This is the "cooked" parallel of `readSvnDumpRaw`. 
-}
readSvnDump :: B.ByteString -> IO (Either ParseError [Revision])
readSvnDump io = do
  result <- readSvnDumpRaw io
  return $ map processRevs . L.groupBy sameRev <$> result
  where
    -- An entry stays in the current group unless it carries its own
    -- "Revision-number" tag, which starts the next revision. The first
    -- argument is ignored on purpose: only the candidate entry matters.
    sameRev _ y = isNothing $ L.lookup "Revision-number" (entryTags y)

    -- Optional lookups of a header tag / a property.
    tagM :: String -> Entry -> Maybe String
    tagM n = L.lookup n . entryTags

    propM :: String -> Entry -> Maybe Text
    propM n = L.lookup n . entryProps

    -- Lookups that fall back to the empty string when the key is absent.
    tag :: String -> Entry -> String
    tag n = fromMaybe "" . tagM n

    prop :: String -> Entry -> Text
    prop n = fromMaybe "" . propM n

    -- Decimal value of an optional tag. Accepts exactly what 'read' accepts
    -- (trailing whitespace included), but a malformed value is reported with
    -- the tag name instead of an anonymous "Prelude.read: no parse".
    intTag :: String -> Entry -> Maybe Int
    intTag n = fmap (intValue n) . tagM n

    intValue :: String -> String -> Int
    intValue n s =
      case [v | (v, rest) <- reads s, ([], []) <- lex rest] of
        [v] -> v
        _   -> error ("Subversion.Dump.readSvnDump: invalid " ++ n
                      ++ " value '" ++ s ++ "'")

    -- groupBy never yields an empty group, so the first equation is
    -- unreachable; it is kept only to make the match exhaustive.
    processRevs [] =
      error "Subversion.Dump.readSvnDump: impossible: empty revision group"
    processRevs (rev:ops) = Revision
      { revNumber     = fromMaybe
                          (error "Subversion.Dump.readSvnDump: entry group does not start with a Revision-number tag")
                          (intTag "Revision-number" rev)
      , revDate       = parseDate $ prop "svn:date" rev
      , revAuthor     = propM "svn:author" rev
      , revComment    = propM "svn:log" rev
      , revOperations = map processOp ops
      }

    processOp op = Operation
      { opKind          = getOpKind $ tag "Node-kind" op
      , opAction        = getOpAction $ tag "Node-action" op
      , opPathname      = tag "Node-path" op
      , opContents      = entryBody op
        -- Nodes without text content carry no "Text-content-length" tag;
        -- report a length of 0 for them instead of failing on `read ""`.
      , opContentLength = fromMaybe 0 $ intTag "Text-content-length" op
      , opCopyFromRev   = intTag "Node-copyfrom-rev" op
      , opCopyFromPath  = tagM "Node-copyfrom-path" op
      , opChecksumMD5   = tagM "Text-content-md5" op
      , opChecksumSHA1  = tagM "Text-content-sha1" op
      }

    -- NOTE(review): an absent "Node-kind" tag arrives here as "" and is
    -- rejected; 'opKind' has no way to express "unknown".
    getOpKind kind = case kind of
      "file" -> File
      "dir"  -> Directory
      _      -> error ("Subversion.Dump.readSvnDump: unexpected Node-kind '"
                       ++ kind ++ "'")

    getOpAction kind = case kind of
      "add"     -> Add
      "delete"  -> Delete
      "change"  -> Change
      "replace" -> Replace
      _         -> error ("Subversion.Dump.readSvnDump: unexpected Node-action '"
                          ++ kind ++ "'")

-- | Association list from field names to values, in file order.
type FieldMap a = [(String, a)]

-- | One raw record of the dump file: either a revision header or a node.
data Entry = Entry
  { entryTags  :: FieldMap String  -- ^ the "Key: value" header lines
  , entryProps :: FieldMap Text    -- ^ the property section, if any
  , entryBody  :: B.ByteString     -- ^ the text content, empty if none
  }
  deriving Show

-- | Parses a dump file into its raw entries without interpreting them.
-- No IO is actually performed; the result is merely wrapped in IO.
readSvnDumpRaw :: B.ByteString -> IO (Either ParseError [Entry])
readSvnDumpRaw dump = return $ parse parseSvnDump "" dump

{- These are the Parsec parsers for the various parts of the input file.
-}

-- | Parses one "Key: value" header line, including its newline.
parseTag :: PB.Parser (String, String)
parseTag = (,) <$> fieldKey <* char ':' <* space <*> fieldValue <* newline
  where
    fieldKey   = (:) <$> letter <*> many fieldChar
    fieldChar  = letter <|> digit <|> oneOf "-_"
    fieldValue = many1 (noneOf "\n")

-- | Parses a property length line such as "K 10" or "V 3", returning the
-- indicator letter and the length that follows it.
parseIndicator :: PB.Parser (Char, Integer)
parseIndicator =
  -- 'read' cannot fail here: 'many1 digit' yields a non-empty digit string.
  (,) <$> oneOf "KV" <* space <*> (read <$> many1 digit <* newline)

-- | Takes exactly @len@ bytes off the front of the remaining input.
--
-- The bytes are sliced directly out of the input rather than parsed one
-- character at a time. Fails if fewer than @len@ bytes remain, so a
-- truncated dump is reported instead of silently yielding a short block.
readTextRange :: Integer -> PB.Parser B.ByteString
readTextRange len = do
  input <- getInput
  let size          = fromIntegral len
      (value, rest) = B.splitAt size input
  when (B.length value /= size) $
    unexpected "end of input inside a length-prefixed block"
  setInput rest
  return value

--readTextRange' :: Integer -> PB.Parser B.ByteString
--readTextRange' len = BC.pack <$> count (fromIntegral len) anyChar

-- | Parses one length-prefixed property item whose indicator letter must be
-- @expected@, and decodes its bytes as UTF-8.
parseSpecValue :: Char -> PB.Parser Text
parseSpecValue expected = do
  (kind, len) <- parseIndicator
  when (kind /= expected) $ unexpected "Unexpected spec value char"
  value <- readTextRange len
  --trace ("Value: " ++ (show value)) $ return ()
  _ <- newline
  return $ E.decodeUtf8 value

-- | Parses a property: a 'K' item (the name) followed by a 'V' item (the
-- value).
parseProperty :: PB.Parser (String, Text)
parseProperty = (,) <$> (unpack <$> parseSpecValue 'K') <*> parseSpecValue 'V'

-- | Parses one entry: its header tags, then the property section and the
-- text body when the corresponding length tags are present.
parseEntry :: PB.Parser Entry
parseEntry = do
  fields <- many1 parseTag <* newline
  props <- case L.lookup "Prop-content-length" fields of
    Nothing -> return []
    Just _  -> many parseProperty <* string "PROPS-END\n"
  body <- case L.lookup "Text-content-length" fields of
    Nothing  -> return B.empty
    -- Same acceptance rule as 'read', but a malformed length becomes a
    -- parse error rather than an exception thrown from pure code.
    Just len -> case [n | (n, rest) <- reads len, ([], []) <- lex rest] of
      [n] -> readTextRange n
      _   -> unexpected ("Text-content-length value '" ++ len ++ "'")
  _ <- many newline <?> "entry-terminating newline"
  return Entry
    { entryTags  = fields
    , entryProps = props
    , entryBody  = body
    }

-- | Parses the two header records of a dump file: the format version (only
-- version 2 is accepted) and the repository UUID.
parseHeader :: PB.Parser ()
parseHeader = do
  _ <- string "SVN-fs-dump-format-version: 2\n\n"
         <?> "Dump file starts without a recognizable tag"
  _ <- string "UUID: " <* many1 (hexDigit <|> char '-') <* newline <* newline
  return ()

-- | Parses a whole dump file. The input must be consumed completely, so
-- data that cannot be parsed as an entry is an error rather than being
-- silently dropped from the result.
parseSvnDump :: PB.Parser [Entry]
parseSvnDump = parseHeader >> many parseEntry <* eof

-- | Dates are currently passed through unchanged.
parseDate :: Text -> RevDate
parseDate = id

-- SvnDump.hs ends here