//===----------------------------------------------------------------------===// // DuckDB // // duckdb/common/multi_file_reader.hpp // // //===----------------------------------------------------------------------===// #pragma once #include "duckdb/common/types.hpp" #include "duckdb/common/multi_file_reader_options.hpp" #include "duckdb/common/enums/file_glob_options.hpp" #include "duckdb/common/union_by_name.hpp" #include "duckdb/common/optional_ptr.hpp" #include "duckdb/common/types/value.hpp" namespace duckdb { class TableFunction; class TableFunctionSet; class TableFilterSet; class LogicalGet; class Expression; class ClientContext; class DataChunk; struct HivePartitioningIndex { HivePartitioningIndex(string value, idx_t index); string value; idx_t index; DUCKDB_API void Serialize(Serializer &serializer) const; DUCKDB_API static HivePartitioningIndex Deserialize(Deserializer &source); }; //! The bind data for the multi-file reader, obtained through MultiFileReader::BindReader struct MultiFileReaderBindData { //! The index of the filename column (if any) idx_t filename_idx = DConstants::INVALID_INDEX; //! The set of hive partitioning indexes (if any) vector hive_partitioning_indexes; DUCKDB_API void Serialize(Serializer &serializer) const; DUCKDB_API static MultiFileReaderBindData Deserialize(Deserializer &source); }; struct MultiFileFilterEntry { idx_t index = DConstants::INVALID_INDEX; bool is_constant = false; }; struct MultiFileConstantEntry { MultiFileConstantEntry(idx_t column_id, Value value_p) : column_id(column_id), value(std::move(value_p)) { } //! The column id to apply the constant value to idx_t column_id; //! The constant value Value value; }; struct MultiFileReaderData { //! The column ids to read from the file vector column_ids; //! The mapping of column id -> result column id //! The result chunk will be filled as follows: chunk.data[column_mapping[i]] = ReadColumn(column_ids[i]); vector column_mapping; //! Whether or not there are no columns to read. This can happen when a file only consists of constants bool empty_columns = false; //! Filters can point to either (1) local columns in the file, or (2) constant values in the `constant_map` //! This map specifies where the to-be-filtered value can be found vector filter_map; //! The set of table filters optional_ptr filters; //! The constants that should be applied at the various positions vector constant_map; //! Map of column_id -> cast, used when reading multiple files when files have diverging types //! for the same column unordered_map cast_map; }; struct MultiFileReader { //! Add the parameters for multi-file readers (e.g. union_by_name, filename) to a table function DUCKDB_API static void AddParameters(TableFunction &table_function); //! Performs any globbing for the multi-file reader and returns a list of files to be read DUCKDB_API static vector GetFileList(ClientContext &context, const Value &input, const string &name, FileGlobOptions options = FileGlobOptions::DISALLOW_EMPTY); //! Parse the named parameters of a multi-file reader DUCKDB_API static bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options); //! Perform complex filter pushdown into the multi-file reader, potentially filtering out files that should be read //! If "true" the first file has been eliminated DUCKDB_API static bool ComplexFilterPushdown(ClientContext &context, vector &files, const MultiFileReaderOptions &options, LogicalGet &get, vector> &filters); //! Bind the options of the multi-file reader, potentially emitting any extra columns that are required DUCKDB_API static MultiFileReaderBindData BindOptions(MultiFileReaderOptions &options, const vector &files, vector &return_types, vector &names); //! Finalize the bind phase of the multi-file reader after we know (1) the required (output) columns, and (2) the //! pushed down table filters DUCKDB_API static void FinalizeBind(const MultiFileReaderOptions &file_options, const MultiFileReaderBindData &options, const string &filename, const vector &local_names, const vector &global_types, const vector &global_names, const vector &global_column_ids, MultiFileReaderData &reader_data); //! Create all required mappings from the global types/names to the file-local types/names DUCKDB_API static void CreateMapping(const string &file_name, const vector &local_types, const vector &local_names, const vector &global_types, const vector &global_names, const vector &global_column_ids, optional_ptr filters, MultiFileReaderData &reader_data, const string &initial_file); //! Finalize the reading of a chunk - applying any constants that are required DUCKDB_API static void FinalizeChunk(const MultiFileReaderBindData &bind_data, const MultiFileReaderData &reader_data, DataChunk &chunk); //! Creates a table function set from a single reader function (including e.g. list parameters, etc) DUCKDB_API static TableFunctionSet CreateFunctionSet(TableFunction table_function); template static MultiFileReaderBindData BindUnionReader(ClientContext &context, vector &return_types, vector &names, RESULT_CLASS &result, OPTIONS_CLASS &options) { D_ASSERT(options.file_options.union_by_name); vector union_col_names; vector union_col_types; // obtain the set of union column names + types by unifying the types of all of the files // note that this requires opening readers for each file and reading the metadata of each file auto union_readers = UnionByName::UnionCols(context, result.files, union_col_types, union_col_names, options); std::move(union_readers.begin(), union_readers.end(), std::back_inserter(result.union_readers)); // perform the binding on the obtained set of names + types auto bind_data = MultiFileReader::BindOptions(options.file_options, result.files, union_col_types, union_col_names); names = union_col_names; return_types = union_col_types; result.Initialize(result.union_readers[0]); D_ASSERT(names.size() == return_types.size()); return bind_data; } template static MultiFileReaderBindData BindReader(ClientContext &context, vector &return_types, vector &names, RESULT_CLASS &result, OPTIONS_CLASS &options) { if (options.file_options.union_by_name) { return BindUnionReader(context, return_types, names, result, options); } else { shared_ptr reader; reader = make_shared(context, result.files[0], options); return_types = reader->return_types; names = reader->names; result.Initialize(std::move(reader)); return MultiFileReader::BindOptions(options.file_options, result.files, return_types, names); } } template static void InitializeReader(READER_CLASS &reader, const MultiFileReaderOptions &options, const MultiFileReaderBindData &bind_data, const vector &global_types, const vector &global_names, const vector &global_column_ids, optional_ptr table_filters, const string &initial_file) { FinalizeBind(options, bind_data, reader.GetFileName(), reader.GetNames(), global_types, global_names, global_column_ids, reader.reader_data); CreateMapping(reader.GetFileName(), reader.GetTypes(), reader.GetNames(), global_types, global_names, global_column_ids, table_filters, reader.reader_data, initial_file); reader.reader_data.filters = table_filters; } template static void PruneReaders(BIND_DATA &data) { unordered_set file_set; for (auto &file : data.files) { file_set.insert(file); } if (data.initial_reader) { // check if the initial reader should still be read auto entry = file_set.find(data.initial_reader->GetFileName()); if (entry == file_set.end()) { data.initial_reader.reset(); } } for (idx_t r = 0; r < data.union_readers.size(); r++) { // check if the union reader should still be read or not auto entry = file_set.find(data.union_readers[r]->GetFileName()); if (entry == file_set.end()) { data.union_readers.erase(data.union_readers.begin() + r); r--; continue; } } } private: static void CreateNameMapping(const string &file_name, const vector &local_types, const vector &local_names, const vector &global_types, const vector &global_names, const vector &global_column_ids, MultiFileReaderData &reader_data, const string &initial_file); }; } // namespace duckdb