{-# LANGUAGE OverloadedStrings #-}

-- | A parser for Subversion dump files.  The objective is to convert a dump
-- file into a series of data structures representing that same information.
-- It uses "Data.ByteString.Lazy" to read the file, and "Data.Text" to
-- represent text fields which may contain Unicode characters.
module Subversion.Dump
       ( RevDate
       , Revision(..)

       , OpKind(..)
       , OpAction(..)
       , Operation(..)

       , FieldMap
       , Entry(..)

       , readSvnDump
       , readSvnDumpRaw
       , parseHeader
       , parseEntry
       ) where

import           Control.Applicative hiding (many)
import           Control.Monad
import qualified Data.Attoparsec.Char8 as AC
import           Data.Attoparsec.Combinator
import           Data.Attoparsec.Lazy as AL
import           Data.ByteString as B hiding (map)
import qualified Data.ByteString.Char8 as BC hiding (map)
import qualified Data.ByteString.Lazy as BL hiding (map)
import qualified Data.List as L
import           Data.Text (Text)
import qualified Data.Text.Encoding as E
import           Data.Maybe
import           Data.Word (Word8)
import           System.FilePath

import           Prelude hiding (getContents)

-- String literals in this module default to strict 'ByteString'.
default (ByteString)

-- | Timestamp of a revision.  It is the @svn:date@ property decoded from
-- UTF-8; no further date parsing is performed.
type RevDate = Text

-- | At the topmost level, a dump file is simply an in-order, linear list of
-- revisions, where each revision consists of a series of "operation nodes"
-- that represent the changes made by that revision to the repository.  The
-- author name and revision comment are decoded from UTF8.
data Revision = Revision
  { revNumber     :: Int          -- ^ value of the @Revision-number@ header
  , revDate       :: RevDate      -- ^ the @svn:date@ property
  , revAuthor     :: Maybe Text   -- ^ the @svn:author@ property, if present
  , revComment    :: Maybe Text   -- ^ the @svn:log@ property, if present
  , revOperations :: [Operation]  -- ^ nodes that follow the revision record
  }
  deriving Show

-- Each node reflects the changes to a single file.  Note that branches don't
-- need to be considered separately, since in Subversion, all files are stored
-- within a single filesystem.  Branches are something the user applies "after
-- the fact" by using specially named paths, such as "foo/branches".  The
-- file's contents are not decoded, as we have no way of knowing what the
-- intended encoding should be -- or even if there is one, in the case of
-- binary files.
--
-- 'opContentLength' is provided as a separate member to avoid reading in the
-- full contents of the operation solely to determine its length.  This way,
-- you can inspect the length while deferring the content read if you don't
-- need it.

-- | Kind of node an operation applies to, taken from the @Node-kind@ header
-- (@file@ or @dir@).
data OpKind = File | Directory
            deriving (Show, Enum, Eq)

-- | What the operation does to the node, taken from the @Node-action@ header
-- (@add@, @change@, @replace@ or @delete@).
data OpAction = Add | Change | Replace | Delete
              deriving (Show, Enum, Eq)

-- | A single node record of a revision.
data Operation = Operation
  { opKind          :: OpKind          -- ^ from the @Node-kind@ header
  , opAction        :: OpAction        -- ^ from the @Node-action@ header
  , opPathname      :: FilePath        -- ^ from the @Node-path@ header
  , opContents      :: ByteString      -- ^ raw, undecoded text content
  , opContentLength :: Int             -- ^ @Text-content-length@ header;
                                       --   0 when the header is absent
  , opChecksumMD5   :: Maybe Text      -- ^ @Text-content-md5@, if present
  , opChecksumSHA1  :: Maybe Text      -- ^ @Text-content-sha1@, if present
  , opCopyFromRev   :: Maybe Int       -- ^ @Node-copyfrom-rev@, if present
  , opCopyFromPath  :: Maybe FilePath  -- ^ @Node-copyfrom-path@, if present
  }
  deriving Show

-- A further note is needed on 'opCopyFromRev' and 'opCopyFromPath', since
-- these two represent the only real complexity in a dump file.  Basically
-- what they say is that there is no 'opContents' record for this 'Operation'.
-- Rather, the contents are to be taken from another file in a past revision.
-- Since this historical information would be expensive to maintain,
-- 'Operation' only provides the data given by the dump file, and it is left
-- as an analytical pass on this data to build the structures necessary to
-- figure out what those contents would have been.
--
-- So, with our structures defined, we're ready to read in the file.  Since we
-- don't know what each element will be yet (revisions are interspersed with
-- nodes), we read them first into the much more general 'Entry' structure.

-- | Parses a dump file held in a lazy ByteString into a list of 'Revision'
-- values, or returns the parser's error message on failure.  This is the
-- "cooked" parallel of 'readSvnDumpRaw'.
readSvnDump :: BL.ByteString -> Either String [Revision]
readSvnDump io =
  case readSvnDumpRaw io of
    Fail _ _ y    -> Left y
    Done _ result -> Right $ map processRevs (L.groupBy sameRev result)
  where
    -- An entry without a "Revision-number" tag is a node record, so it stays
    -- in the group opened by the revision entry that precedes it.  The first
    -- argument (the group's leading entry) is deliberately ignored.
    sameRev _ y = isNothing $ L.lookup "Revision-number" (entryTags y)

    -- Field lookup; the primed variant yields an empty string when the
    -- field is missing.
    getField f n x  = L.lookup n (f x)
    getField' f n x = fromMaybe "" (getField f n x)

    tagM  = getField entryTags
    propM = getField entryProps
    tag   = getField' entryTags
    prop  = getField' entryProps

    -- The head of every group is the revision record; the rest are its nodes.
    -- 'L.groupBy' never produces an empty group, so the first equation is
    -- unreachable.
    processRevs [] =
      error "Subversion.Dump.readSvnDump: impossible, empty revision group"
    processRevs (rev:ops) =
      Revision { revNumber     = readInt $ tag "Revision-number" rev
               , revDate       = parseDate $ prop "svn:date" rev
               , revAuthor     = E.decodeUtf8 <$> propM "svn:author" rev
               , revComment    = E.decodeUtf8 <$> propM "svn:log" rev
               , revOperations = map processOp ops }

    processOp op =
      Operation { opKind          = getOpKind $ tag "Node-kind" op
                , opAction        = getOpAction $ tag "Node-action" op
                , opPathname      = BC.unpack $ tag "Node-path" op
                , opContents      = entryBody op
                  -- A missing length header reads as "" and therefore as 0.
                , opContentLength = readInt $ tag "Text-content-length" op
                , opCopyFromRev   = readInt <$> tagM "Node-copyfrom-rev" op
                , opCopyFromPath  = BC.unpack <$> tagM "Node-copyfrom-path" op
                , opChecksumMD5   = E.decodeUtf8 <$> tagM "Text-content-md5" op
                , opChecksumSHA1  = E.decodeUtf8 <$> tagM "Text-content-sha1" op }

    -- NOTE(review): a node without a "Node-kind" header reaches the fallback
    -- branch (as the empty string) once 'opKind' is forced.  Dumps may omit
    -- that header for deletions -- confirm against the dump format.
    getOpKind kind = case kind of
      "file" -> File
      "dir"  -> Directory
      other  -> error $ "Subversion.Dump.readSvnDump: unexpected Node-kind: "
                        ++ show other

    getOpAction kind = case kind of
      "add"     -> Add
      "delete"  -> Delete
      "change"  -> Change
      "replace" -> Replace
      other     -> error $ "Subversion.Dump.readSvnDump: unexpected Node-action: "
                           ++ show other

-- | Association list of raw header or property names and their values.
type FieldMap = [(ByteString, ByteString)]

-- | One raw record of the dump file: either a revision record or a node
-- record, not yet interpreted.
data Entry = Entry
  { entryTags  :: FieldMap    -- ^ the @Name: value@ header lines
  , entryProps :: FieldMap    -- ^ the property section, empty if absent
  , entryBody  :: ByteString  -- ^ the text content, empty if absent
  }
  deriving Show

-- | Runs the dump-file parser over a lazy ByteString and returns the raw
-- Attoparsec 'Result' holding the uninterpreted entries.
readSvnDumpRaw :: BL.ByteString -> Result [Entry]
readSvnDumpRaw dump = parse parseSvnDump dump

-- These are the Attoparsec parsers for the various parts of the input file.
-- | Matches a single ASCII space (byte 32).
space :: Parser Word8
space = satisfy (== 32)

-- | Matches a single line feed (byte 10).
newline :: Parser Word8
newline = satisfy (== 10)

-- | Parses one @Name: value@ header line, including its terminating
-- newline, and returns the name and the (non-empty) value.
parseTag :: Parser (ByteString, ByteString)
parseTag =
  (,) <$> takeWhile1 fieldChar <* string ": "
      <*> takeWhile1 (/= 10) <* newline
  where
    -- Bytes allowed in a header name.
    fieldChar w = (w >= 65 && w <= 90)    -- A-Z
               || (w >= 97 && w <= 122)   -- a-z (122 is 'z')
               || (w >= 48 && w <= 57)    -- 0-9
               || w == 45                 -- -
               || w == 95                 -- _

-- | Parses a property length line such as @K 10@ or @V 25@, returning the
-- indicator byte and the decimal length.
parseIndicator :: Parser (Word8, Int)
parseIndicator =
  (,) <$> satisfy (oneOf 75 86) <* space   -- K or V
      <*> AC.decimal <* newline
  where
    oneOf x y w = w == x || w == y

-- | Parses a length line followed by exactly that many bytes and a newline.
-- The indicator byte is not checked beyond being @K@ or @V@.
parseSpecValue :: Parser ByteString
parseSpecValue = do
  (_, len) <- parseIndicator
  AL.take len <* newline

-- | Parses one property: a key block followed by a value block.
parseProperty :: Parser (ByteString, ByteString)
parseProperty =
  (,) <$> parseSpecValue    -- K
      <*> parseSpecValue    -- V

-- | Interprets the bytes as an unsigned decimal number.  The input is not
-- validated: every byte is treated as a digit, and the empty string gives 0.
readInt :: ByteString -> Int
readInt bs = B.foldl' addup 0 bs
  where
    addup acc x = acc * 10 + (fromIntegral x - 48)   -- 48 is '0'

-- | Parses one record of the dump: its header lines, then a property section
-- if a @Prop-content-length@ header is present, then a body of
-- @Text-content-length@ bytes if that header is present, and finally any
-- newlines that separate it from the next record.
parseEntry :: Parser Entry
parseEntry = do
  fields <- many1 parseTag <* newline
  props  <- case L.lookup "Prop-content-length" fields of
    Nothing -> return []
    Just _  -> manyTill parseProperty (try (string "PROPS-END\n"))
  body   <- case L.lookup "Text-content-length" fields of
    Nothing  -> return B.empty
    Just len -> AL.take (readInt len)
  -- Skip the blank lines that follow the record.
  _ <- AL.takeWhile (== 10)
  return Entry { entryTags  = fields
               , entryProps = props
               , entryBody  = body }

-- | Parses the dump preamble (format version 2 only) and returns the
-- repository UUID.
parseHeader :: Parser ByteString
parseHeader = do
  _ <- string "SVN-fs-dump-format-version: 2\n\n"
       <?> "Dump file starts without a recognizable tag"
  string "UUID: " *> takeWhile1 uuidMember <* newline <* newline
  where
    -- Accept any lowercase hexadecimal character, or '-'
    uuidMember w = w == 45
                || (w >= 48 && w <= 57)
                || (w >= 97 && w <= 102)

-- | Parses a whole dump: the preamble followed by one or more records.
parseSvnDump :: Parser [Entry]
parseSvnDump = parseHeader >> many1 parseEntry

-- | Converts the raw @svn:date@ value to 'RevDate' by UTF-8 decoding it.
parseDate :: ByteString -> RevDate
parseDate = E.decodeUtf8

-- SvnDump.hs ends here