{-# LANGUAGE OverloadedStrings,MultiParamTypeClasses #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Database.Alteryx.Serialization ( dbFileId, getRecord, getValue, putRecord, putValue, headerPageSize, miniblockThreshold, numMetadataBytesActual, numMetadataBytesHeader, numBlockBytesActual, numBlockBytesHeader, parseRecordsUntil, recordsPerBlock, startOfBlocksByteIndex ) where import Database.Alteryx.Fields import Database.Alteryx.Types import Codec.Compression.LZF.ByteString (decompressByteStringFixed, compressByteStringFixed) import qualified Control.Newtype as NT import Control.Applicative import Control.Lens import Control.Monad as M import Control.Monad.Loops import Data.Array.IArray (listArray, bounds, elems) import Data.Binary import Data.Binary.C () import Data.Binary.Get import Data.Binary.Put import Data.Bits import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as BSL import Data.Conduit import Data.Conduit.List (sourceList) import Data.Conduit.Lazy (lazyConsume) import qualified Data.Map as Map import Data.Maybe (isJust, listToMaybe) import Data.Text as T import Data.Text.Encoding import qualified Data.Text.Lazy as TL import Data.Time.Clock.POSIX import System.IO.Unsafe (unsafePerformIO) import Text.XML hiding (renderText) import Text.XML.Cursor as XMLC ( Cursor, ($//), attribute, element, fromDocument ) import Text.XML.Stream.Render (renderText) import Text.XML.Unresolved (toEvents) -- | Number of records before each block is flushed and added to the block index recordsPerBlock :: Int recordsPerBlock = 0x10000 spatialIndexRecordBlockSize = 32 -- | Number of bytes taken by the fixed header headerPageSize :: Int headerPageSize = 512 -- | When writing miniblocks, how many bytes should each miniblock aim for? miniblockThreshold :: Int miniblockThreshold = 0x10000 -- | When decompressing miniblocks, how many bytes should be allocated for the output? bufferSize :: Int bufferSize = 0x40000 dbFileId :: DbType -> Word32 dbFileId WrigleyDb = 0x00440205 dbFileId WrigleyDb_NoSpatialIndex = 0x00440204 numBytes :: (Binary b, Num t) => b -> t numBytes x = fromIntegral $ BSL.length $ runPut $ put x numMetadataBytesHeader :: Header -> Int numMetadataBytesHeader header = fromIntegral $ 2 * (header ^. metaInfoLength) numMetadataBytesActual :: RecordInfo -> Int numMetadataBytesActual recordInfo = numBytes recordInfo numBlockBytesHeader :: Header -> Int numBlockBytesHeader header = let start = headerPageSize + (numMetadataBytesHeader header) end = (fromIntegral $ header ^. recordBlockIndexPos) in end - start numBlockBytesActual :: Block -> Int numBlockBytesActual block = numBytes block startOfBlocksByteIndex :: Header -> Int startOfBlocksByteIndex header = headerPageSize + (numMetadataBytesHeader header) parseRecordsUntil :: RecordInfo -> Get [Record] parseRecordsUntil recordInfo = do done <- isEmpty if done then return $ [] else (:) <$> getRecord recordInfo <*> parseRecordsUntil recordInfo -- | This binary instance is really slow because the YxdbFile type stores a list of records. Use the Conduit functions instead. instance Binary YxdbFile where put yxdbFile = do put $ yxdbFile ^. yxdbFileHeader put $ yxdbFile ^. yxdbFileMetadata mapM_ (putRecord $ yxdbFile ^. yxdbFileMetadata) $ yxdbFile ^. yxdbFileRecords put $ yxdbFile ^. yxdbFileBlockIndex get = do fHeader <- label "Header" $ isolate (fromIntegral headerPageSize) get fMetadata <- label "Metadata" $ isolate (numMetadataBytesHeader fHeader) $ get let numBlockBytes = numBlockBytesHeader $ fHeader fBlocks <- label ("Blocks of size " ++ show numBlockBytes) $ isolate numBlockBytes get :: Get Block fBlockIndex <- label "Block Index" get let fRecords = runGet (label "Records" $ parseRecordsUntil fMetadata) $ NT.unpack fBlocks return $ YxdbFile { _yxdbFileHeader = fHeader, _yxdbFileMetadata = fMetadata, _yxdbFileRecords = fRecords, _yxdbFileBlockIndex = fBlockIndex } documentToTextWithoutXMLHeader :: Document -> T.Text documentToTextWithoutXMLHeader document = let events = Prelude.tail $ toEvents $ toXMLDocument document in T.concat $ unsafePerformIO $ lazyConsume $ sourceList events $= renderText def instance Binary RecordInfo where put metadata = let fieldMap :: Field -> Map.Map Name Text fieldMap field = let requiredAttributes = [ ("name", field ^. fieldName), ("type", renderFieldType $ field ^. fieldType) ] sizeAttributes = case field ^. fieldSize of Nothing -> [ ] Just x -> [ ("size", T.pack $ show x) ] scaleAttributes = case field ^. fieldScale of Nothing -> [ ] Just x -> [ ("scale", T.pack $ show x) ] in Map.fromList $ Prelude.concat $ [ requiredAttributes, sizeAttributes, scaleAttributes ] transformField field = NodeElement $ Element "Field" (fieldMap field) [ ] transformRecordInfo recordInfo = NodeElement $ Element "RecordInfo" Map.empty $ Prelude.map transformField recordInfo transformMetaInfo (RecordInfo recordInfo) = Element "MetaInfo" Map.empty [ transformRecordInfo recordInfo] transformToDocument node = Document (Prologue [] Nothing []) node [] renderMetaInfo metadata = encodeUtf16LE $ flip T.snoc '\0' $ flip T.snoc '\n' $ documentToTextWithoutXMLHeader $ transformToDocument $ transformMetaInfo metadata in putByteString $ renderMetaInfo metadata get = do bs <- BS.concat . BSL.toChunks <$> getRemainingLazyByteString when (BS.length bs < 4) $ fail $ "No trailing newline and null: " ++ show bs let text = T.init $ T.init $ decodeUtf16LE bs let document = parseText_ def $ TL.fromStrict text let cursor = fromDocument document let recordInfos = parseXmlRecordInfo cursor case recordInfos of [] -> fail "No RecordInfo entries found" x:[] -> return x xs -> fail "Too many RecordInfo entries found" parseXmlField :: Cursor -> [Field] parseXmlField cursor = do let fieldCursors = cursor $// XMLC.element "Field" fieldCursor <- fieldCursors aName <- attribute "name" fieldCursor aType <- attribute "type" fieldCursor let aDesc = listToMaybe $ attribute "description" fieldCursor let aSize = listToMaybe $ attribute "size" fieldCursor let aScale = listToMaybe $ attribute "scale" fieldCursor return $ Field { _fieldName = aName, _fieldType = parseFieldType aType, _fieldSize = parseInt <$> aSize, _fieldScale = parseInt <$> aScale } parseXmlRecordInfo :: Cursor -> [RecordInfo] parseXmlRecordInfo cursor = do let recordInfoCursors = cursor $// XMLC.element "RecordInfo" recordInfoCursor <- recordInfoCursors let fields = parseXmlField recordInfoCursor return $ RecordInfo fields parseInt :: Text -> Int parseInt text = read $ T.unpack text :: Int -- | True if any fields have associated variable data in the variable data portion of the record. hasVariableData :: RecordInfo -> Bool hasVariableData (RecordInfo recordInfo) = let fieldHasVariableData field = case field ^. fieldType of FTVString -> True FTVWString -> True FTBlob -> True _ -> False in Prelude.any fieldHasVariableData recordInfo -- | Writes a record using the provided metadata. putRecord :: RecordInfo -> Record -> Put putRecord recordInfo@(RecordInfo fields) (Record fieldValues) = do zipWithM_ putValue fields fieldValues when (hasVariableData recordInfo) $ do error "putRecord: Variable data unimplemented" -- | Records consists of a fixed amount of data for each field, and also a possibly large amoutn of variable data at the end. getRecord :: RecordInfo -> Get Record getRecord recordInfo@(RecordInfo fields) = do record <- Record <$> mapM getValue fields when (hasVariableData recordInfo) $ do _ <- getAllVariableData return () return record instance Binary BlockIndex where get = do arraySize <- label "Index Array Size" $ fromIntegral <$> getWord32le let numBlockIndexBytes = arraySize * 8 blocks <- label ("Reading block of size " ++ show arraySize) $ isolate numBlockIndexBytes $ replicateM arraySize (fromIntegral <$> getWord64le) return $ BlockIndex $ listArray (0, arraySize-1) blocks put (BlockIndex blockIndex) = do let (_, iMax) = bounds blockIndex putWord32le $ fromIntegral $ iMax + 1 mapM_ (putWord64le . fromIntegral) $ elems blockIndex instance Binary Block where get = let tryGetOne = do done <- isEmpty if done then return Nothing else Just <$> get :: Get (Maybe Miniblock) in NT.pack <$> BSL.fromChunks <$> Prelude.map NT.unpack <$> unfoldM tryGetOne put (Block bs) = case BSL.toChunks bs of [] -> put $ Miniblock $ BS.empty xs -> mapM_ (put . Miniblock) xs instance Binary Miniblock where get = do writtenSize <- label "Block size" getWord32le let compressionBitIndex = 31 let isCompressed = not $ testBit writtenSize compressionBitIndex let size = fromIntegral $ clearBit writtenSize compressionBitIndex bs <- label ("Block of size " ++ show size) $ isolate size $ getByteString $ size let chunk = if isCompressed then case decompressByteStringFixed bufferSize bs of Nothing -> fail "Unable to decompress. Increase buffer size?" Just x -> return $ x else return bs Miniblock <$> chunk put (Miniblock bs) = do let compressionBitIndex = 31 let compressedBlock = compressByteStringFixed ((BS.length bs)-1) bs let blockToWrite = case compressedBlock of Nothing -> bs Just x -> x let size = BS.length blockToWrite let writtenSize = if isJust compressedBlock then size else setBit size compressionBitIndex putWord32le $ fromIntegral writtenSize putByteString blockToWrite instance Binary Header where put header = do let actualDescriptionBS = BS.take 64 $ encodeUtf8 $ header ^. description let numPaddingBytes = fromIntegral $ 64 - BS.length actualDescriptionBS let paddingDescriptionBS = BSL.toStrict $ BSL.take numPaddingBytes $ BSL.repeat 0 putByteString actualDescriptionBS putByteString paddingDescriptionBS putWord32le $ header ^. fileId putWord32le $ truncate $ utcTimeToPOSIXSeconds $ header ^. creationDate putWord32le $ header ^. flags1 putWord32le $ header ^. flags2 putWord32le $ header ^. metaInfoLength putWord32le $ header ^. mystery putWord64le $ header ^. spatialIndexPos putWord64le $ header ^. recordBlockIndexPos putWord64le $ header ^. numRecords putWord32le $ header ^. compressionVersion putByteString $ header ^. reservedSpace get = do fDescription <- label "Description" $ decodeUtf8 <$> getByteString 64 fFileId <- label "FileId" getWord32le fCreationDate <- label "Creation Date" getWord32le fFlags1 <- label "Flags 1" getWord32le fFlags2 <- label "Flags 2" getWord32le fMetaInfoLength <- label "Metadata Length" getWord32le fMystery <- label "Mystery Field" getWord32le fSpatialIndexPos <- label "Spatial Index" getWord64le fRecordBlockIndexPos <- label "Record Block" getWord64le fNumRecords <- label "Num Records" getWord64le fCompressionVersion <- label "Compression Version" getWord32le fReservedSpace <- label "Reserved Space" $ (BS.concat . BSL.toChunks <$> getRemainingLazyByteString) return $ Header { _description = fDescription, _fileId = fFileId, _creationDate = posixSecondsToUTCTime $ fromIntegral fCreationDate, _flags1 = fFlags1, _flags2 = fFlags2, _metaInfoLength = fMetaInfoLength, _mystery = fMystery, _spatialIndexPos = fSpatialIndexPos, _recordBlockIndexPos = fRecordBlockIndexPos, _numRecords = fNumRecords, _compressionVersion = fCompressionVersion, _reservedSpace = fReservedSpace }