//===----------------------------------------------------------------------===// // DuckDB // // duckdb/execution/operator/persistent/csv_reader_options.hpp // // //===----------------------------------------------------------------------===// #pragma once #include "duckdb/execution/operator/persistent/csv_buffer.hpp" #include "duckdb/common/map.hpp" #include "duckdb/function/scalar/strftime_format.hpp" #include "duckdb/common/types/value.hpp" #include "duckdb/common/field_writer.hpp" #include "duckdb/common/case_insensitive_map.hpp" #include "duckdb/common/types.hpp" #include "duckdb/common/multi_file_reader_options.hpp" namespace duckdb { enum class NewLineIdentifier : uint8_t { SINGLE = 1, // Either \r or \n CARRY_ON = 2, // \r\n MIX = 3, // Hippie-Land, can't run it multithreaded NOT_SET = 4 }; enum class ParallelMode { AUTOMATIC = 0, PARALLEL = 1, SINGLE_THREADED = 2 }; struct BufferedCSVReaderOptions { //===--------------------------------------------------------------------===// // CommonCSVOptions //===--------------------------------------------------------------------===// //! Whether or not a delimiter was defined by the user bool has_delimiter = false; //! Delimiter to separate columns within each line string delimiter = ","; //! Whether or not a new_line was defined by the user bool has_newline = false; //! New Line separator NewLineIdentifier new_line = NewLineIdentifier::NOT_SET; //! Whether or not a quote was defined by the user bool has_quote = false; //! Quote used for columns that contain reserved characters, e.g., delimiter string quote = "\""; //! Whether or not an escape character was defined by the user bool has_escape = false; //! Escape character to escape quote character string escape; //! Whether or not a header information was given by the user bool has_header = false; //! Whether or not the file has a header line bool header = false; //! Whether or not we should ignore InvalidInput errors bool ignore_errors = false; //! Expected number of columns idx_t num_cols = 0; //! Number of samples to buffer idx_t buffer_sample_size = STANDARD_VECTOR_SIZE * 50; //! Specifies the string that represents a null value string null_str; //! Whether file is compressed or not, and if so which compression type //! AUTO_DETECT (default; infer from file extension) FileCompressionType compression = FileCompressionType::AUTO_DETECT; //! Option to convert quoted values to NULL values bool allow_quoted_nulls = true; //===--------------------------------------------------------------------===// // CSVAutoOptions //===--------------------------------------------------------------------===// //! SQL Type list mapping of name to SQL type index in sql_type_list case_insensitive_map_t sql_types_per_column; //! User-defined SQL type list vector sql_type_list; //! User-defined name list vector name_list; //! Types considered as candidates for auto detection ordered by descending specificity (~ from high to low) vector auto_type_candidates = {LogicalType::VARCHAR, LogicalType::TIMESTAMP, LogicalType::DATE, LogicalType::TIME, LogicalType::DOUBLE, LogicalType::BIGINT, LogicalType::BOOLEAN, LogicalType::SQLNULL}; //===--------------------------------------------------------------------===// // ReadCSVOptions //===--------------------------------------------------------------------===// //! How many leading rows to skip idx_t skip_rows = 0; //! Whether or not the skip_rows is set by the user bool skip_rows_set = false; //! Maximum CSV line size: specified because if we reach this amount, we likely have wrong delimiters (default: 2MB) //! note that this is the guaranteed line length that will succeed, longer lines may be accepted if slightly above idx_t maximum_line_size = 2097152; //! Whether or not header names shall be normalized bool normalize_names = false; //! True, if column with that index must skip null check vector force_not_null; //! Consider all columns to be of type varchar bool all_varchar = false; //! Size of sample chunk used for dialect and type detection idx_t sample_chunk_size = STANDARD_VECTOR_SIZE; //! Number of sample chunks used for type detection idx_t sample_chunks = 10; //! Whether or not to automatically detect dialect and datatypes bool auto_detect = false; //! The file path of the CSV file to read string file_path; //! Multi-file reader options MultiFileReaderOptions file_options; //! Buffer Size (Parallel Scan) idx_t buffer_size = CSVBuffer::INITIAL_BUFFER_SIZE_COLOSSAL; //! Decimal separator when reading as numeric string decimal_separator = "."; //! Whether or not to pad rows that do not have enough columns with NULL values bool null_padding = false; //! If we are running the parallel version of the CSV Reader. In general, the system should always auto-detect //! When it can't execute a parallel run before execution. However, there are (rather specific) situations where //! setting up this manually might be important ParallelMode parallel_mode; //===--------------------------------------------------------------------===// // WriteCSVOptions //===--------------------------------------------------------------------===// //! True, if column with that index must be quoted vector force_quote; //! Prefix/suffix/custom newline the entire file once (enables writing of files as JSON arrays) string prefix; string suffix; string write_newline; //! The date format to use (if any is specified) std::map date_format = {{LogicalTypeId::DATE, {}}, {LogicalTypeId::TIMESTAMP, {}}}; //! The date format to use for writing (if any is specified) std::map write_date_format = {{LogicalTypeId::DATE, {}}, {LogicalTypeId::TIMESTAMP, {}}}; //! Whether or not a type format is specified std::map has_format = {{LogicalTypeId::DATE, false}, {LogicalTypeId::TIMESTAMP, false}}; void Serialize(FieldWriter &writer) const; void Deserialize(FieldReader &reader); void SetCompression(const string &compression); void SetHeader(bool has_header); void SetEscape(const string &escape); void SetQuote(const string "e); void SetDelimiter(const string &delimiter); void SetNewline(const string &input); //! Set an option that is supported by both reading and writing functions, called by //! the SetReadOption and SetWriteOption methods bool SetBaseOption(const string &loption, const Value &value); //! loption - lowercase string //! set - argument(s) to the option //! expected_names - names expected if the option is "columns" void SetReadOption(const string &loption, const Value &value, vector &expected_names); void SetWriteOption(const string &loption, const Value &value); void SetDateFormat(LogicalTypeId type, const string &format, bool read_format); std::string ToString() const; }; } // namespace duckdb