// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ #define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ #include #include #include #include #include #include #include #include "rocksdb/advanced_options.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/listener.h" #include "rocksdb/universal_compaction.h" #include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" #ifdef max #undef max #endif namespace rocksdb { class Cache; class CompactionFilter; class CompactionFilterFactory; class Comparator; class Env; enum InfoLogLevel : unsigned char; class SstFileManager; class FilterPolicy; class Logger; class MergeOperator; class Snapshot; class MemTableRepFactory; class RateLimiter; class Slice; class Statistics; class InternalKeyComparator; class WalFilter; // DB contents are stored in a set of blocks, each of which holds a // sequence of key,value pairs. Each block may be compressed before // being stored in a file. The following enum describes which // compression method (if any) is used to compress a block. enum CompressionType : unsigned char { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. 
kNoCompression = 0x0, kSnappyCompression = 0x1, kZlibCompression = 0x2, kBZip2Compression = 0x3, kLZ4Compression = 0x4, kLZ4HCCompression = 0x5, kXpressCompression = 0x6, kZSTD = 0x7, // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than // 0.8.0 or consider a possibility of downgrading the service or copying // the database files to another service running with an older version of // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will // eventually remove the option from the public API. kZSTDNotFinalCompression = 0x40, // kDisableCompressionOption is used to disable some compression options. kDisableCompressionOption = 0xff, }; struct Options; struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // The function recovers options to a previous version. Only 4.6 or later // versions are supported. ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4, int rocksdb_minor_version = 6); // Some functions that make it easier to optimize RocksDB // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. ColumnFamilyOptions* OptimizeForSmallDb(); // Use this if you don't need to keep the data sorted, i.e. you'll never use // an iterator, only Put() and Get() API calls // // Not supported in ROCKSDB_LITE ColumnFamilyOptions* OptimizeForPointLookup( uint64_t block_cache_size_mb); // Default values for some parameters in ColumnFamilyOptions are not // optimized for heavy workloads and big datasets, which means you might // observe write stalls under some conditions. As a starting point for tuning // RocksDB options, use the following two functions: // * OptimizeLevelStyleCompaction -- optimizes level style compaction // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction // Universal style compaction is focused on reducing Write Amplification // Factor for big data sets, but increases Space Amplification. 
You can learn // more about the different styles here: // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide // Make sure to also call IncreaseParallelism(), which will provide the // biggest performance gains. // Note: we might use more memory than memtable_memory_budget during high // write rate period // // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE ColumnFamilyOptions* OptimizeLevelStyleCompaction( uint64_t memtable_memory_budget = 512 * 1024 * 1024); ColumnFamilyOptions* OptimizeUniversalStyleCompaction( uint64_t memtable_memory_budget = 512 * 1024 * 1024); // ------------------- // Parameters that affect behavior // Comparator used to define the order of keys in the table. // Default: a comparator that uses lexicographic byte-wise ordering // // REQUIRES: The client must ensure that the comparator supplied // here has the same name and orders keys *exactly* the same as the // comparator provided to previous open calls on the same DB. const Comparator* comparator = BytewiseComparator(); // REQUIRES: The client must provide a merge operator if Merge operation // needs to be accessed. Calling Merge on a DB without a merge operator // would result in Status::NotSupported. The client must ensure that the // merge operator supplied here has the same name and *exactly* the same // semantics as the merge operator provided to previous open calls on // the same DB. The only exception is reserved for upgrade, where a DB // previously without a merge operator is introduced to Merge operation // for the first time. It's necessary to specify a merge operator when // opening the DB in this case. // Default: nullptr std::shared_ptr merge_operator = nullptr; // A single CompactionFilter instance to call into during compaction. // Allows an application to modify/delete a key-value during background // compaction. 
// // If the client requires a new compaction filter to be used for different // compaction runs, it can specify compaction_filter_factory instead of this // option. The client should specify only one of the two. // compaction_filter takes precedence over compaction_filter_factory if // client specifies both. // // If multithreaded compaction is being used, the supplied CompactionFilter // instance may be used from different threads concurrently and so should be // thread-safe. // // Default: nullptr const CompactionFilter* compaction_filter = nullptr; // This is a factory that provides compaction filter objects which allow // an application to modify/delete a key-value during background compaction. // // A new filter will be created on each compaction run. If multithreaded // compaction is being used, each created CompactionFilter will only be used // from a single thread and so does not need to be thread-safe. // // Default: nullptr std::shared_ptr compaction_filter_factory = nullptr; // ------------------- // Parameters that affect performance // Amount of data to build up in memory (backed by an unsorted log // on disk) before converting to a sorted on-disk file. // // Larger values increase performance, especially during bulk loads. // Up to max_write_buffer_number write buffers may be held in memory // at the same time, // so you may wish to adjust this parameter to control memory usage. // Also, a larger write buffer will result in a longer recovery time // the next time the database is opened. // // Note that write_buffer_size is enforced per column family. // See db_write_buffer_size for sharing memory across column families. // // Default: 64MB // // Dynamically changeable through SetOptions() API size_t write_buffer_size = 64 << 20; // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // // Default: kSnappyCompression, if it's supported. 
If snappy is not linked // with the library, the default is kNoCompression. // // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: // ~200-500MB/s compression // ~400-800MB/s decompression // Note that these speeds are significantly faster than most // persistent storage speeds, and therefore it is typically never // worth switching to kNoCompression. Even if the input data is // incompressible, the kSnappyCompression implementation will // efficiently detect that and will switch to uncompressed mode. CompressionType compression; // Compression algorithm that will be used for the bottommost level that // contain files. If level-compaction is used, this option will only affect // levels after base level. // // Default: kDisableCompressionOption (Disabled) CompressionType bottommost_compression = kDisableCompressionOption; // different options for compression algorithms CompressionOptions compression_opts; // Number of files to trigger level-0 compaction. A value <0 means that // level-0 compaction will not be triggered by number of files at all. // // Default: 4 // // Dynamically changeable through SetOptions() API int level0_file_num_compaction_trigger = 4; // If non-nullptr, use the specified function to determine the // prefixes for keys. These prefixes will be placed in the filter. // Depending on the workload, this can reduce the number of read-IOP // cost for scans when a prefix is passed via ReadOptions to // db.NewIterator(). For prefix filtering to work properly, // "prefix_extractor" and "comparator" must be such that the following // properties hold: // // 1) key.starts_with(prefix(key)) // 2) Compare(prefix(key), key) <= 0. // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 // 4) prefix(prefix(key)) == prefix(key) // // Default: nullptr std::shared_ptr prefix_extractor = nullptr; // Control maximum total data size for a level. // max_bytes_for_level_base is the max total for level-1. 
// Maximum number of bytes for level L can be calculated as // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1)) // For example, if max_bytes_for_level_base is 200MB, and if // max_bytes_for_level_multiplier is 10, total data size for level-1 // will be 200MB, total file size for level-2 will be 2GB, // and total file size for level-3 will be 20GB. // // Default: 256MB. // // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; // Disable automatic compactions. Manual compactions can still // be issued on this column family // // Dynamically changeable through SetOptions() API bool disable_auto_compactions = false; // This is a factory that provides TableFactory objects. // Default: a block-based table factory that provides a default // implementation of TableBuilder and TableReader with default // BlockBasedTableOptions. std::shared_ptr table_factory; // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options explicit ColumnFamilyOptions(const Options& options); void Dump(Logger* log) const; }; enum class WALRecoveryMode : char { // Original levelDB recovery // We tolerate incomplete record in trailing data on all logs // Use case : This is legacy behavior (default) kTolerateCorruptedTailRecords = 0x00, // Recover from clean shutdown // We don't expect to find any corruption in the WAL // Use case : This is ideal for unit tests and rare applications that // can require high consistency guarantee kAbsoluteConsistency = 0x01, // Recover to point-in-time consistency // We stop the WAL playback on discovering WAL inconsistency // Use case : Ideal for systems that have disk controller cache like // hard disk, SSD without super capacitor that store related data kPointInTimeRecovery = 0x02, // Recovery after a disaster // We ignore any corruption in the WAL and try to salvage as much data as // possible // Use case : Ideal for last 
ditch effort to recover data or systems that // operate with low grade unrelated data kSkipAnyCorruptedRecords = 0x03, }; struct DbPath { std::string path; uint64_t target_size; // Target size of total files under the path, in byte. DbPath() : target_size(0) {} DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} }; struct DBOptions { // The function recovers options to the option as in version 4.6. DBOptions* OldDefaults(int rocksdb_major_version = 4, int rocksdb_minor_version = 6); // Some functions that make it easier to optimize RocksDB // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. DBOptions* OptimizeForSmallDb(); #ifndef ROCKSDB_LITE // By default, RocksDB uses only one background thread for flush and // compaction. Calling this function will set it up such that total of // `total_threads` is used. Good value for `total_threads` is the number of // cores. You almost definitely want to call this function if your system is // bottlenecked by RocksDB. DBOptions* IncreaseParallelism(int total_threads = 16); #endif // ROCKSDB_LITE // If true, the database will be created if it is missing. // Default: false bool create_if_missing = false; // If true, missing column families will be automatically created. // Default: false bool create_missing_column_families = false; // If true, an error is raised if the database already exists. // Default: false bool error_if_exists = false; // If true, RocksDB will aggressively check consistency of the data. // Also, if any of the writes to the database fails (Put, Delete, Merge, // Write), the database will switch to read-only mode and fail all other // Write operations. // In most cases you want this to be set to true. // Default: true bool paranoid_checks = true; // Use the specified object to interact with the environment, // e.g. to read/write files, schedule background work, etc. 
// Default: Env::Default() Env* env = Env::Default(); // Use to control write rate of flush and compaction. Flush has higher // priority than compaction. Rate limiting is disabled if nullptr. // If rate limiter is enabled, bytes_per_sync is set to 1MB by default. // Default: nullptr std::shared_ptr rate_limiter = nullptr; // Use to track SST files and control their file deletion rate. // // Features: // - Throttle the deletion rate of the SST files. // - Keep track of the total size of all SST files. // - Set a maximum allowed space limit for SST files that when reached // the DB won't do any further flushes or compactions and will set the // background error. // - Can be shared between multiple dbs. // Limitations: // - Only track and throttle deletes of SST files in // first db_path (db_name if db_paths is empty). // // Default: nullptr std::shared_ptr sst_file_manager = nullptr; // Any internal progress/error information generated by the db will // be written to info_log if it is non-nullptr, or to a file stored // in the same directory as the DB contents if info_log is nullptr. // Default: nullptr std::shared_ptr info_log = nullptr; #ifdef NDEBUG InfoLogLevel info_log_level = INFO_LEVEL; #else InfoLogLevel info_log_level = DEBUG_LEVEL; #endif // NDEBUG // Number of open files that can be used by the DB. You may need to // increase this if your database has a large working set. Value -1 means // files opened are always kept open. You can estimate number of files based // on target_file_size_base and target_file_size_multiplier for level-based // compaction. For universal-style compaction, you can usually set it to -1. // Default: -1 int max_open_files = -1; // If max_open_files is -1, DB will open all files on DB::Open(). You can // use this option to increase the number of threads used to open the files. 
// Default: 16 int max_file_opening_threads = 16; // Once write-ahead logs exceed this size, we will start forcing the flush of // column families whose memtables are backed by the oldest live WAL file // (i.e. the ones that are causing all the space amplification). If set to 0 // (default), we will dynamically choose the WAL size limit to be // [sum of all write_buffer_size * max_write_buffer_number] * 4 // Default: 0 uint64_t max_total_wal_size = 0; // If non-null, then we should collect metrics about database operations std::shared_ptr statistics = nullptr; // If true, then every store to stable storage will issue a fsync. // If false, then every store to stable storage will issue a fdatasync. // This parameter should be set to true while storing data to // filesystem like ext3 that can lose files after a reboot. // Default: false // Note: on many platforms fdatasync is defined as fsync, so this parameter // would make no difference. Refer to fdatasync definition in this code base. bool use_fsync = false; // A list of paths where SST files can be put, each with its target size. // Newer data is placed into paths specified earlier in the vector while // older data gradually moves to paths specified later in the vector. // // For example, you have a flash device with 10GB allocated for the DB, // as well as a hard drive of 2TB, you should config it to be: // [{"/flash_path", 10GB}, {"/hard_drive", 2TB}] // // The system will try to guarantee data under each path is close to but // not larger than the target size. But current and future file sizes used // in determining where to place a file are based on best-effort estimation, // which means there is a chance that the actual size under the directory // is slightly more than target size under some workloads. User should give // some buffer room for those cases. // // If none of the paths has sufficient room to place a file, the file will // be placed to the last path anyway, regardless of the target size. 
// // Placing newer data to earlier paths is also best-effort. User should // expect user files to be placed in higher levels in some extreme cases. // // If left empty, only one path will be used, which is db_name passed when // opening the DB. // Default: empty std::vector db_paths; // This specifies the info LOG dir. // If it is empty, the log files will be in the same dir as data. // If it is non empty, the log files will be in the specified dir, // and the db data dir's absolute path will be used as the log file // name's prefix. std::string db_log_dir = ""; // This specifies the absolute dir path for write-ahead logs (WAL). // If it is empty, the log files will be in the same dir as data, // dbname is used as the data dir by default // If it is non empty, the log files will be kept in the specified dir. // When destroying the db, // all log files in wal_dir and the dir itself are deleted std::string wal_dir = ""; // The period at which obsolete files get deleted. The default // value is 6 hours. The files that get out of scope by compaction // process will still get automatically deleted on every compaction, // regardless of this setting uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000; // Maximum number of concurrent background jobs (compactions and flushes). // Default: 2 int max_background_jobs = 2; // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the // value of max_background_jobs. This option is ignored. int base_background_compactions = -1; // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the // value of max_background_jobs. For backwards compatibility we will set // `max_background_jobs = max_background_compactions + max_background_flushes` // in the case where user sets at least one of `max_background_compactions` or // `max_background_flushes` (we replace -1 by 1 in case one option is unset). 
// // Maximum number of concurrent background compaction jobs, submitted to // the default LOW priority thread pool. // // If you're increasing this, also consider increasing number of threads in // LOW priority thread pool. For more information, see // Env::SetBackgroundThreads // Default: -1 int max_background_compactions = -1; // This value represents the maximum number of threads that will // concurrently perform a compaction job by breaking it into multiple, // smaller ones that are run simultaneously. // Default: 1 (i.e. no subcompactions) uint32_t max_subcompactions = 1; // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the // value of max_background_jobs. For backwards compatibility we will set // `max_background_jobs = max_background_compactions + max_background_flushes` // in the case where user sets at least one of `max_background_compactions` or // `max_background_flushes`. // // Maximum number of concurrent background memtable flush jobs, submitted by // default to the HIGH priority thread pool. If the HIGH priority thread pool // is configured to have zero threads, flush jobs will share the LOW priority // thread pool with compaction jobs. // // It is important to use both thread pools when the same Env is shared by // multiple db instances. Without a separate pool, long running compaction // jobs could potentially block memtable flush jobs of other db instances, // leading to unnecessary Put stalls. // // If you're increasing this, also consider increasing number of threads in // HIGH priority thread pool. For more information, see // Env::SetBackgroundThreads // Default: -1 int max_background_flushes = -1; // Specify the maximal size of the info log file. If the log file // is larger than `max_log_file_size`, a new info log file will // be created. // If max_log_file_size == 0, all logs will be written to one // log file. size_t max_log_file_size = 0; // Time for the info log file to roll (in seconds). 
// If specified with non-zero value, log file will be rolled // if it has been active longer than `log_file_time_to_roll`. // Default: 0 (disabled) // Not supported in ROCKSDB_LITE mode! size_t log_file_time_to_roll = 0; // Maximal info log files to be kept. // Default: 1000 size_t keep_log_file_num = 1000; // Recycle log files. // If non-zero, we will reuse previously written log files for new // logs, overwriting the old data. The value indicates how many // such files we will keep around at any point in time for later // use. This is more efficient because the blocks are already // allocated and fdatasync does not need to update the inode after // each write. // Default: 0 size_t recycle_log_file_num = 0; // The manifest file is rolled over on reaching this limit. // The older manifest file will be deleted. // The default value is the maximum returned by std::numeric_limits max() // so that roll-over does not take place. uint64_t max_manifest_file_size = std::numeric_limits::max(); // Number of shards used for table cache. int table_cache_numshardbits = 6; // NOT SUPPORTED ANYMORE // int table_cache_remove_scan_count_limit; // The following two fields affect how archived logs will be deleted. // 1. If both set to 0, logs will be deleted asap and will not get into // the archive. // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, // WAL files will be checked every 10 min and if total size is greater // than WAL_size_limit_MB, they will be deleted starting with the // earliest until size_limit is met. All empty files will be deleted. // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then // WAL files will be checked every WAL_ttl_seconds / 2 and those that // are older than WAL_ttl_seconds will be deleted. // 4. If both are not 0, WAL files will be checked every 10 min and both // checks will be performed with ttl being first. uint64_t WAL_ttl_seconds = 0; uint64_t WAL_size_limit_MB = 0; // Number of bytes to preallocate (via fallocate) the manifest // files. 
Default is 4mb, which is reasonable to reduce random IO // as well as prevent overallocation for mounts that preallocate // large amounts of data (such as xfs's allocsize option). size_t manifest_preallocation_size = 4 * 1024 * 1024; // Allow the OS to mmap file for reading sst tables. Default: false bool allow_mmap_reads = false; // Allow the OS to mmap file for writing. // DB::SyncWAL() only works if this is set to false. // Default: false bool allow_mmap_writes = false; // Enable direct I/O mode for read/write. // It may or may not improve performance depending on the use case. // // Files will be opened in "direct I/O" mode // which means that data r/w from the disk will not be cached or // buffered. The hardware buffer of the devices may however still // be used. Memory mapped files are not impacted by these parameters. // Use O_DIRECT for user reads // Default: false // Not supported in ROCKSDB_LITE mode! bool use_direct_reads = false; // Use O_DIRECT for both reads and writes in background flush and compactions. // When true, we also force new_table_reader_for_compaction_inputs to true. // Default: false // Not supported in ROCKSDB_LITE mode! bool use_direct_io_for_flush_and_compaction = false; // If false, fallocate() calls are bypassed. Default: true bool allow_fallocate = true; // Prevent child processes from inheriting open files. Default: true bool is_fd_close_on_exec = true; // NOT SUPPORTED ANYMORE -- this option is no longer used bool skip_log_error_on_recovery = false; // If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec // Default: 600 (10 min) unsigned int stats_dump_period_sec = 600; // If set to true, will hint the underlying file system that the file // access pattern is random, when an sst file is opened. // Default: true bool advise_random_on_open = true; // Amount of data to build up in memtables across all column // families before writing to disk. // // This is distinct from write_buffer_size, which enforces a limit // for a single memtable. 
// // This feature is disabled by default. Specify a non-zero value // to enable it. // // Default: 0 (disabled) size_t db_write_buffer_size = 0; // The memory usage of memtable will be reported to this object. The same object // can be passed into multiple DBs and it will track the sum of size of all // the DBs. If the total size of all live memtables of all the DBs exceeds // a limit, a flush will be triggered in the next DB to which the next write // is issued. // // If the object is only passed to one DB, the behavior is the same as // db_write_buffer_size. When write_buffer_manager is set, the value set will // override db_write_buffer_size. // // This feature is disabled by default. Specify a non-null object // to enable it. // // Default: null std::shared_ptr write_buffer_manager = nullptr; // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. // Default: NORMAL enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED }; AccessHint access_hint_on_compaction_start = NORMAL; // If true, always create a new file descriptor and new table reader // for compaction inputs. Turning this parameter on may introduce extra // memory usage in the table reader, if it allocates extra memory // for indexes. This will allow file descriptor prefetch options // to be set for compaction input files and not to impact file // descriptors for the same file used by user queries. // It is suggested to enable BlockBasedTableOptions.cache_index_and_filter_blocks // for this mode if using block-based table. // // Default: false bool new_table_reader_for_compaction_inputs = false; // If non-zero, we perform bigger reads when doing compaction. If you're // running RocksDB on spinning disks, you should set this to at least 2MB. // That way RocksDB's compaction is doing sequential instead of random reads. // // When non-zero, we also force new_table_reader_for_compaction_inputs to // true. 
// // Default: 0 size_t compaction_readahead_size = 0; // This is a maximum buffer size that is used by WinMmapReadableFile in // unbuffered disk I/O mode. We need to maintain an aligned buffer for // reads. We allow the buffer to grow until the specified value and then // for bigger requests allocate one shot buffers. In unbuffered mode we // always bypass read-ahead buffer at ReadaheadRandomAccessFile. // When read-ahead is required we then make use of compaction_readahead_size // value and always try to read ahead. With read-ahead we always // pre-allocate buffer to the size instead of growing it up to a limit. // // This option is currently honored only on Windows // // Default: 1 MB // // Special value: 0 - means do not maintain per instance buffer. Allocate // per request buffer and avoid locking. size_t random_access_max_buffer_size = 1024 * 1024; // This is the maximum buffer size that is used by WritableFileWriter. // On Windows, we need to maintain an aligned buffer for writes. // We allow the buffer to grow until its size hits the limit in buffered // IO and fix the buffer size when using direct IO to ensure alignment of // write requests if the logical sector size is unusual. // // Default: 1024 * 1024 (1 MB) size_t writable_file_max_buffer_size = 1024 * 1024; // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switches when the mutex is not // heavily contended. However, if the mutex is hot, we could end up // wasting spin time. // Default: false bool use_adaptive_mutex = false; // Create DBOptions with default values for all fields DBOptions(); // Create DBOptions from Options explicit DBOptions(const Options& options); void Dump(Logger* log) const; // Allows OS to incrementally sync files to disk while they are being // written, asynchronously, in the background. This operation can be used // to smooth out write I/Os over time. Users shouldn't rely on it for // persistency guarantee. 
// Issue one request for every bytes_per_sync written. 0 turns it off. // Default: 0 // // You may consider using rate_limiter to regulate write rate to device. // When rate limiter is enabled, it automatically sets bytes_per_sync // to 1MB. // // This option applies to table files uint64_t bytes_per_sync = 0; // Same as bytes_per_sync, but applies to WAL files // Default: 0, turned off uint64_t wal_bytes_per_sync = 0; // A vector of EventListeners whose callback functions will be called // when a specific RocksDB event happens. std::vector> listeners; // If true, then the status of the threads involved in this DB will // be tracked and available via GetThreadList() API. // // Default: false bool enable_thread_tracking = false; // The limited write rate to DB if soft_pending_compaction_bytes_limit or // level0_slowdown_writes_trigger is triggered, or we are writing to the // last mem table allowed and we allow more than 3 mem tables. It is // calculated using size of user write requests before compression. // RocksDB may decide to slow down more if the compaction still // gets behind further. // If the value is 0, we will infer a value from `rate_limiter` value // if it is not empty, or 16MB if `rate_limiter` is empty. Note that // if users change the rate in `rate_limiter` after DB is opened, // `delayed_write_rate` won't be adjusted. // // Unit: byte per second. // // Default: 0 uint64_t delayed_write_rate = 0; // By default, a single write thread queue is maintained. The thread that gets // to the head of the queue becomes the write batch group leader and is responsible // for writing to WAL and memtable for the batch group. // // If enable_pipelined_write is true, separate write thread queues are // maintained for WAL write and memtable write. A write thread first enters the WAL // writer queue and then the memtable writer queue. A pending thread on the WAL // writer queue thus only has to wait for previous writers to finish their // WAL writing but not the memtable writing. 
Enabling the feature may improve // write throughput and reduce latency of the prepare phase of two-phase // commit. // // Default: false bool enable_pipelined_write = false; // If true, allow multi-writers to update mem tables in parallel. // Only some memtable_factory-s support concurrent writes; currently it // is implemented only for SkipListFactory. Concurrent memtable writes // are not compatible with inplace_update_support or filter_deletes. // It is strongly recommended to set enable_write_thread_adaptive_yield // if you are going to use this feature. // // Default: true bool allow_concurrent_memtable_write = true; // If true, threads synchronizing with the write batch group leader will // wait for up to write_thread_max_yield_usec before blocking on a mutex. // This can substantially improve throughput for concurrent workloads, // regardless of whether allow_concurrent_memtable_write is enabled. // // Default: true bool enable_write_thread_adaptive_yield = true; // The maximum number of microseconds that a write operation will use // a yielding spin loop to coordinate with other write threads before // blocking on a mutex. (Assuming write_thread_slow_yield_usec is // set properly) increasing this value is likely to increase RocksDB // throughput at the expense of increased CPU usage. // // Default: 100 uint64_t write_thread_max_yield_usec = 100; // The latency in microseconds after which a std::this_thread::yield // call (sched_yield on Linux) is considered to be a signal that // other processes or threads would like to use the current core. // Increasing this makes writer threads more likely to take CPU // by spinning, which will show up as an increase in the number of // involuntary context switches. // // Default: 3 uint64_t write_thread_slow_yield_usec = 3; // If true, then DB::Open() will not update the statistics used to optimize // compaction decision by loading table properties from many files. 
// Turning off this feature will improve DBOpen time especially in // disk environment. // // Default: false bool skip_stats_update_on_db_open = false; // Recovery mode to control the consistency while replaying WAL // Default: kPointInTimeRecovery WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; // if set to false then recovery will fail when a prepared // transaction is encountered in the WAL bool allow_2pc = false; // A global cache for table-level rows. // Default: nullptr (disabled) // Not supported in ROCKSDB_LITE mode! std::shared_ptr row_cache = nullptr; #ifndef ROCKSDB_LITE // A filter object supplied to be invoked while processing write-ahead-logs // (WALs) during recovery. The filter provides a way to inspect log // records, ignoring a particular record or skipping replay. // The filter is invoked at startup and is invoked from a single-thread // currently. WalFilter* wal_filter = nullptr; #endif // ROCKSDB_LITE // If true, then DB::Open / CreateColumnFamily / DropColumnFamily // / SetOptions will fail if options file is not detected or properly // persisted. // // DEFAULT: false bool fail_if_options_file_error = false; // If true, then print malloc stats together with rocksdb.stats // when printing to LOG. // DEFAULT: false bool dump_malloc_stats = false; // By default RocksDB replay WAL logs and flush them on DB open, which may // create very small SST files. If this option is enabled, RocksDB will try // to avoid (but not guarantee not to) flush during recovery. Also, existing // WAL logs will be kept, so that if crash happened before flush, we still // have logs to recover from. // // DEFAULT: false bool avoid_flush_during_recovery = false; // By default RocksDB will flush all memtables on DB close if there are // unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup // DB close. Unpersisted data WILL BE LOST. // // DEFAULT: false // // Dynamically changeable through SetDBOptions() API. 
bool avoid_flush_during_shutdown = false; // Set this option to true during creation of database if you want // to be able to ingest behind (call IngestExternalFile() skipping keys // that already exist, rather than overwriting matching keys). // Setting this option to true will affect 2 things: // 1) Disable some internal optimizations around SST file compression // 2) Reserve bottom-most level for ingested files only. // 3) Note that num_levels should be >= 3 if this option is turned on. // // DEFAULT: false // Immutable. bool allow_ingest_behind = false; // If enabled it uses two queues for writes, one for the ones with // disable_memtable and one for the ones that also write to memtable. This // allows the memtable writes not to lag behind other writes. It can be used // to optimize MySQL 2PC in which only the commits, which are serial, write to // memtable. bool concurrent_prepare = false; // If true WAL is not flushed automatically after each write. Instead it // relies on manual invocation of FlushWAL to write the WAL buffer to its // file. bool manual_wal_flush = false; }; // Options to control the behavior of a database (passed to DB::Open) struct Options : public DBOptions, public ColumnFamilyOptions { // Create an Options object with default values for all fields. Options() : DBOptions(), ColumnFamilyOptions() {} Options(const DBOptions& db_options, const ColumnFamilyOptions& column_family_options) : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} // The function recovers options to the option as in version 4.6. Options* OldDefaults(int rocksdb_major_version = 4, int rocksdb_minor_version = 6); void Dump(Logger* log) const; void DumpCFOptions(Logger* log) const; // Some functions that make it easier to optimize RocksDB // Set appropriate parameters for bulk loading. // The reason that this is a function that returns "this" instead of a // constructor is to enable chaining of multiple similar calls in the future. 
// // All data will be in level 0 without any automatic compaction. // It's recommended to manually call CompactRange(NULL, NULL) before reading // from the database, because otherwise the read can be very slow. Options* PrepareForBulkLoad(); // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. Options* OptimizeForSmallDb(); }; // // An application can issue a read request (via Get/Iterators) and specify // if that read should process data that ALREADY resides on a specified cache // level. For example, if an application specifies kBlockCacheTier then the // Get call will process data that is already processed in the memtable or // the block cache. It will not page in data from the OS cache or data that // resides in storage. enum ReadTier { kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage kBlockCacheTier = 0x1, // data in memtable or block cache kPersistedTier = 0x2, // persisted data. When WAL is disabled, this option // will skip data in memtable. // Note that this ReadTier currently only supports // Get and MultiGet and does not support iterators. kMemtableTier = 0x3 // data in memtable. used for memtable-only iterators. }; // Options that control read operations struct ReadOptions { // If "snapshot" is non-nullptr, read as of the supplied snapshot // (which must belong to the DB that is being read and which must // not have been released). If "snapshot" is nullptr, use an implicit // snapshot of the state at the beginning of this read operation. // Default: nullptr const Snapshot* snapshot; // "iterate_upper_bound" defines the extent upto which the forward iterator // can returns entries. Once the bound is reached, Valid() will be false. // "iterate_upper_bound" is exclusive ie the bound value is // not a valid entry. If iterator_extractor is not null, the Seek target // and iterator_upper_bound need to have the same prefix. 
// This is because ordering is not guaranteed outside of prefix domain. // There is no lower bound on the iterator. If needed, that can be easily // implemented. // // Default: nullptr const Slice* iterate_upper_bound; // If non-zero, NewIterator will create a new table reader which // performs reads of the given size. Using a large size (> 2MB) can // improve the performance of forward iteration on spinning disks. // Default: 0 size_t readahead_size; // A threshold for the number of keys that can be skipped before failing an // iterator seek as incomplete. The default value of 0 should be used to // never fail a request as incomplete, even on skipping too many keys. // Default: 0 uint64_t max_skippable_internal_keys; // Specify if this read request should process data that ALREADY // resides on a particular cache. If the required data is not // found at the specified cache, then Status::Incomplete is returned. // Default: kReadAllTier ReadTier read_tier; // If true, all data read from underlying storage will be // verified against corresponding checksums. // Default: true bool verify_checksums; // Should the "data block"/"index block"/"filter block" read for this // iteration be cached in memory? // Callers may wish to set this field to false for bulk scans. // Default: true bool fill_cache; // Specify to create a tailing iterator -- a special iterator that has a // view of the complete database (i.e. it can also be used to read newly // added data) and is optimized for sequential reads. It will return records // that were inserted into the database after the creation of the iterator. // Default: false // Not supported in ROCKSDB_LITE mode! bool tailing; // Specify to create a managed iterator -- a special iterator that // uses less resources by having the ability to free its underlying // resources on request. // Default: false // Not supported in ROCKSDB_LITE mode! bool managed; // Enable a total order seek regardless of index format (e.g. 
hash index) // used in the table. Some table format (e.g. plain table) may not support // this option. // If true when calling Get(), we also skip prefix bloom when reading from // block based table. It provides a way to read existing data after // changing implementation of prefix extractor. bool total_order_seek; // Enforce that the iterator only iterates over the same prefix as the seek. // This option is effective only for prefix seeks, i.e. prefix_extractor is // non-null for the column family and total_order_seek is false. Unlike // iterate_upper_bound, prefix_same_as_start only works within a prefix // but in both directions. // Default: false bool prefix_same_as_start; // Keep the blocks loaded by the iterator pinned in memory as long as the // iterator is not deleted, If used when reading from tables created with // BlockBasedTableOptions::use_delta_encoding = false, // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to // return 1. // Default: false bool pin_data; // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we // schedule a background job in the flush job queue and delete obsolete files // in background. // Default: false bool background_purge_on_iterator_cleanup; // If true, keys deleted using the DeleteRange() API will be visible to // readers until they are naturally deleted during compaction. This improves // read performance in DBs with many range deletions. // Default: false bool ignore_range_deletions; ReadOptions(); ReadOptions(bool cksum, bool cache); }; // Options that control write operations struct WriteOptions { // If true, the write will be flushed from the operating system // buffer cache (by calling WritableFile::Sync()) before the write // is considered complete. If this flag is true, writes will be // slower. // // If this flag is false, and the machine crashes, some recent // writes may be lost. 
Note that if it is just the process that // crashes (i.e., the machine does not reboot), no writes will be // lost even if sync==false. // // In other words, a DB write with sync==false has similar // crash semantics as the "write()" system call. A DB write // with sync==true has similar crash semantics to a "write()" // system call followed by "fdatasync()". // // Default: false bool sync; // If true, writes will not first go to the write ahead log, // and the write may got lost after a crash. bool disableWAL; // If true and if user is trying to write to column families that don't exist // (they were dropped), ignore the write (don't return an error). If there // are multiple writes in a WriteBatch, other writes will succeed. // Default: false bool ignore_missing_column_families; // If true and we need to wait or sleep for the write request, fails // immediately with Status::Incomplete(). bool no_slowdown; // If true, this write request is of lower priority if compaction is // behind. In this case, no_slowdown = true, the request will be cancelled // immediately with Status::Incomplete() returned. Otherwise, it will be // slowed down. The slowdown value is determined by RocksDB to guarantee // it introduces minimum impacts to high priority writes. // // Default: false bool low_pri; WriteOptions() : sync(false), disableWAL(false), ignore_missing_column_families(false), no_slowdown(false), low_pri(false) {} }; // Options that control flush operations struct FlushOptions { // If true, the flush will wait until the flush is done. // Default: true bool wait; FlushOptions() : wait(true) {} }; // Create a Logger from provided DBOptions extern Status CreateLoggerFromOptions(const std::string& dbname, const DBOptions& options, std::shared_ptr* logger); // CompactionOptions are used in CompactFiles() call. 
struct CompactionOptions { // Compaction output compression type // Default: snappy CompressionType compression; // Compaction will create files of size `output_file_size_limit`. // Default: MAX, which means that compaction will create a single file uint64_t output_file_size_limit; CompactionOptions() : compression(kSnappyCompression), output_file_size_limit(std::numeric_limits::max()) {} }; // For level based compaction, we can configure if we want to skip/force // bottommost level compaction. enum class BottommostLevelCompaction { // Skip bottommost level compaction kSkip, // Only compact bottommost level if there is a compaction filter // This is the default option kIfHaveCompactionFilter, // Always compact bottommost level kForce, }; // CompactRangeOptions is used by CompactRange() call. struct CompactRangeOptions { // If true, no other compaction will run at the same time as this // manual compaction bool exclusive_manual_compaction = true; // If true, compacted files will be moved to the minimum level capable // of holding the data or given level (specified non-negative target_level). bool change_level = false; // If change_level is true and target_level have non-negative value, compacted // files will be moved to target_level. int target_level = -1; // Compaction outputs will be placed in options.db_paths[target_path_id]. // Behavior is undefined if target_path_id is out of range. uint32_t target_path_id = 0; // By default level based compaction will only compact the bottommost level // if there is a compaction filter BottommostLevelCompaction bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter; }; // IngestExternalFileOptions is used by IngestExternalFile() struct IngestExternalFileOptions { // Can be set to true to move the files instead of copying them. bool move_files = false; // If set to false, an ingested file keys could appear in existing snapshots // that where created before the file was ingested. 
bool snapshot_consistency = true; // If set to false, IngestExternalFile() will fail if the file key range // overlaps with existing keys or tombstones in the DB. bool allow_global_seqno = true; // If set to false and the file key range overlaps with the memtable key range // (memtable flush required), IngestExternalFile will fail. bool allow_blocking_flush = true; // Set to true if you would like duplicate keys in the file being ingested // to be skipped rather than overwriting existing data under that key. // Usecase: back-fill of some historical data in the database without // over-writing existing newer version of data. // This option could only be used if the DB has been running // with allow_ingest_behind=true since the dawn of time. // All files will be ingested at the bottommost level with seqno=0. bool ingest_behind = false; }; } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_