On COPY TO/FROM check the format during binding. by pdet · Pull Request #17381 · duckdb/duckdb · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

On COPY TO/FROM check the format during binding. #17381

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion extension/json/json_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,13 @@ void JsonExtension::Load(DuckDB &db) {

// JSON copy function
auto copy_fun = JSONFunctions::GetJSONCopyFunction();
ExtensionUtil::RegisterFunction(db_instance, std::move(copy_fun));
ExtensionUtil::RegisterFunction(db_instance, copy_fun);
copy_fun.extension = "ndjson";
copy_fun.name = "ndjson";
ExtensionUtil::RegisterFunction(db_instance, copy_fun);
copy_fun.extension = "jsonl";
copy_fun.name = "jsonl";
ExtensionUtil::RegisterFunction(db_instance, copy_fun);

// JSON macros
for (idx_t index = 0; json_macros[index].name != nullptr; index++) {
Expand Down
10 changes: 7 additions & 3 deletions src/include/duckdb/parser/parsed_data/copy_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "duckdb/common/unordered_map.hpp"
#include "duckdb/common/types/value.hpp"
#include "duckdb/common/case_insensitive_map.hpp"
#include "duckdb/parser/query_node.hpp"

namespace duckdb {

Expand All @@ -23,7 +24,7 @@ struct CopyInfo : public ParseInfo {
static constexpr const ParseInfoType TYPE = ParseInfoType::COPY_INFO;

public:
CopyInfo() : ParseInfo(TYPE), catalog(INVALID_CATALOG), schema(DEFAULT_SCHEMA) {
CopyInfo() : ParseInfo(TYPE), catalog(INVALID_CATALOG), schema(DEFAULT_SCHEMA), is_format_auto_detected(true) {
}

//! The catalog name to copy to/from
Expand All @@ -38,15 +39,18 @@ struct CopyInfo : public ParseInfo {
bool is_from;
//! The file format of the external file
string format;
//! Whether the format was auto-detected by inspecting the file path (true) or set manually via the FORMAT parameter (false)
bool is_format_auto_detected;
//! The file path to copy to/from
string file_path;
//! Set of (key, value) options
case_insensitive_map_t<vector<Value>> options;
// The SQL statement used instead of a table when copying data out to a file
//! The SQL statement used instead of a table when copying data out to a file
unique_ptr<QueryNode> select_statement;

public:
static string CopyOptionsToString(const string &format, const case_insensitive_map_t<vector<Value>> &options);
static string CopyOptionsToString(const string &format, bool is_format_auto_detected,
const case_insensitive_map_t<vector<Value>> &options);

public:
unique_ptr<CopyInfo> Copy() const;
Expand Down
5 changes: 5 additions & 0 deletions src/include/duckdb/storage/serialization/parse_info.json
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,11 @@
"id": 208,
"name": "select_statement",
"type": "QueryNode*"
},
{
"id": 209,
"name": "is_format_auto_detected",
"type": "bool"
}
]
},
Expand Down
14 changes: 9 additions & 5 deletions src/parser/parsed_data/copy_info.cpp
8000
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,26 @@ unique_ptr<CopyInfo> CopyInfo::Copy() const {
result->file_path = file_path;
result->is_from = is_from;
result->format = format;
result->is_format_auto_detected = is_format_auto_detected;
result->options = options;
if (select_statement) {
result->select_statement = select_statement->Copy();
}
return result;
}

string CopyInfo::CopyOptionsToString(const string &format, const case_insensitive_map_t<vector<Value>> &options) {
if (format.empty() && options.empty()) {
string CopyInfo::CopyOptionsToString(const string &format, bool is_format_auto_detected,
const case_insensitive_map_t<vector<Value>> &options) {
// We only output the format if there is a format, and it was manually set.
const bool output_format = !format.empty() && !is_format_auto_detected;
if (!output_format && options.empty()) {
return string();
}
string result;

result += " (";
vector<string> stringified;
if (!format.empty()) {
if (!format.empty() && !is_format_auto_detected) {
stringified.push_back(StringUtil::Format(" FORMAT %s", format));
}
for (auto &opt : options) {
Expand Down Expand Up @@ -81,7 +85,7 @@ string CopyInfo::ToString() const {
result += TablePartToString();
result += " FROM";
result += StringUtil::Format(" %s", SQLString(file_path));
result += CopyOptionsToString(format, options);
result += CopyOptionsToString(format, is_format_auto_detected, options);
} else {
if (select_statement) {
// COPY (select-node) TO ...
Expand All @@ -91,7 +95,7 @@ string CopyInfo::ToString() const {
}
result += " TO ";
result += StringUtil::Format("%s", SQLString(file_path));
result += CopyOptionsToString(format, options);
result += CopyOptionsToString(format, is_format_auto_detected, options);
}
result += ";";
return result;
Expand Down
2 changes: 1 addition & 1 deletion src/parser/statement/export_statement.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ string ExportStatement::ToString() const {
auto &options = info->options;
auto &format = info->format;
result += StringUtil::Format(" '%s'", path);
result += CopyInfo::CopyOptionsToString(format, options);
result += CopyInfo::CopyOptionsToString(format, info->is_format_auto_detected, options);
result += ";";
return result;
}
Expand Down
27 changes: 20 additions & 7 deletions src/parser/transform/statement/transform_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ void Transformer::TransformCopyOptions(CopyInfo &info, optional_ptr<duckdb_libpg
throw ParserException("Unsupported parameter type for FORMAT: expected e.g. FORMAT 'csv', 'parquet'");
}
info.format = StringUtil::Lower(format_val->val.str);
info.is_format_auto_detected = false;
continue;
}

Expand All @@ -77,6 +78,24 @@ void Transformer::TransformCopyOptions(CopyInfo &info, optional_ptr<duckdb_libpg
}
}

//! Derives a copy format name from the extension of `file_path`.
//! The path is lower-cased, a trailing GZIP or ZSTD compression suffix is removed (so that a
//! compressed file is identified by the extension in front of the compression suffix), and the
//! text after the last '.' of the final path component is returned.
//! Returns an empty string when no extension is present, i.e. when there is no '.', when the '.'
//! is the last character, or when the last '.' belongs to a directory name rather than the file.
string ExtractFormat(const string &file_path) {
	auto format = StringUtil::Lower(file_path);
	// We first remove compression suffixes. The number of characters stripped is taken from the
	// suffix that actually matched instead of being hard-coded.
	const string gzip_suffix = CompressionExtensionFromType(FileCompressionType::GZIP);
	const string zstd_suffix = CompressionExtensionFromType(FileCompressionType::ZSTD);
	if (StringUtil::EndsWith(format, gzip_suffix)) {
		format.resize(format.size() - gzip_suffix.size());
	} else if (StringUtil::EndsWith(format, zstd_suffix)) {
		format.resize(format.size() - zstd_suffix.size());
	}
	// Now let's check for the last '.'
	const size_t dot_pos = format.rfind('.');
	if (dot_pos == std::string::npos || dot_pos + 1 == format.size()) {
		// No format found
		return "";
	}
	// A '.' that precedes the last path separator is part of a directory name
	// (e.g. "my.dir/output"), not a file extension.
	const size_t separator_pos = format.find_last_of("/\\");
	if (separator_pos != std::string::npos && separator_pos > dot_pos) {
		return "";
	}
	// We found something
	return format.substr(dot_pos + 1);
}

unique_ptr<CopyStatement> Transformer::TransformCopy(duckdb_libpgquery::PGCopyStmt &stmt) {
auto result = make_uniq<CopyStatement>();
auto &info = *result->info;
Expand All @@ -91,13 +110,7 @@ unique_ptr<CopyStatement> Transformer::TransformCopy(duckdb_libpgquery::PGCopySt
info.file_path = stmt.filename;
}

if (ReplacementScan::CanReplace(info.file_path, {"parquet"})) {
info.format = "parquet";
} else if (ReplacementScan::CanReplace(info.file_path, {"json", "jsonl", "ndjson"})) {
info.format = "json";
} else {
info.format = "csv";
}
info.format = ExtractFormat(info.file_path);

// get select_list
if (stmt.attlist) {
Expand Down
44 changes: 41 additions & 3 deletions src/planner/binder/statement/bind_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,42 @@

#include <algorithm>

#include "duckdb/main/extension_entries.hpp"

namespace duckdb {

//! Interprets the argument list of a copy option as a boolean flag.
//! An empty argument list counts as true; otherwise the first argument is cast to BOOLEAN
//! and its value is returned.
static bool GetBooleanArg(ClientContext &context, const vector<Value> &arg) {
	if (arg.empty()) {
		return true;
	}
	auto as_boolean = arg[0].CastAs(context, LogicalType::BOOLEAN);
	return as_boolean.GetValue<bool>();
}

//! Checks `format` against the entries of EXTENSION_FILE_POSTFIXES.
//! Throws a CatalogException naming the providing extension when `format` equals an entry's
//! name with its first character skipped; returns normally when nothing matches.
void IsFormatExtensionKnown(const string &format) {
	for (auto &postfix_entry : EXTENSION_FILE_POSTFIXES) {
		// The first character of the entry name is skipped before comparing
		// (presumably a leading '.' — confirm against extension_entries.hpp).
		const bool is_match = format == postfix_entry.name + 1;
		if (!is_match) {
			continue;
		}
		// The format is not registered, but an extension is known to provide it
		throw CatalogException(
		    "Copy Function with name \"%s\" is not in the catalog, but it exists in the %s extension.", format,
		    postfix_entry.extension);
	}
}

BoundStatement Binder::BindCopyTo(CopyStatement &stmt, CopyToType copy_to_type) {
// Let's first bind our format
auto on_entry_do =
stmt.info->is_format_auto_detected ? OnEntryNotFound::RETURN_NULL : OnEntryNotFound::THROW_EXCEPTION;
CatalogEntryRetriever entry_retriever {context};
auto &catalog = Catalog::GetSystemCatalog(context);
auto entry = catalog.GetEntry(entry_retriever, DEFAULT_SCHEMA,
{CatalogType::COPY_FUNCTION_ENTRY, stmt.info->format}, on_entry_do);

if (!entry) {
IsFormatExtensionKnown(stmt.info->format);
// If we did not find an entry, we default to a CSV
entry = catalog.GetEntry(entry_retriever, DEFAULT_SCHEMA, {CatalogType::COPY_FUNCTION_ENTRY, "csv"},
OnEntryNotFound::THROW_EXCEPTION);
}
// lookup the format in the catalog
auto &copy_function =
Catalog::GetEntry<CopyFunctionCatalogEntry>(context, INVALID_CATALOG, DEFAULT_SCHEMA, stmt.info->format);
auto &copy_function = entry->Cast<CopyFunctionCatalogEntry>();
if (copy_function.function.plan) {
// plan rewrite COPY TO
return copy_function.function.plan(*this, stmt);
Expand Down Expand Up @@ -322,7 +348,19 @@ BoundStatement Binder::BindCopyFrom(CopyStatement &stmt) {

// lookup the format in the catalog
auto &catalog = Catalog::GetSystemCatalog(context);
auto &copy_function = catalog.GetEntry<CopyFunctionCatalogEntry>(context, DEFAULT_SCHEMA, stmt.info->format);
auto on_entry_do =
stmt.info->is_format_auto_detected ? OnEntryNotFound::RETURN_NULL : OnEntryNotFound::THROW_EXCEPTION;
CatalogEntryRetriever entry_retriever {context};
auto entry = catalog.GetEntry(entry_retriever, DEFAULT_SCHEMA,
{CatalogType::COPY_FUNCTION_ENTRY, stmt.info->format}, on_entry_do);
if (!entry) {
IsFormatExtensionKnown(stmt.info->format);
// If we did not find an entry, we default to a CSV
entry = catalog.GetEntry(entry_retriever, DEFAULT_SCHEMA, {CatalogType::COPY_FUNCTION_ENTRY, "csv"},
OnEntryNotFound::THROW_EXCEPTION);
}
// lookup the format in the catalog
auto &copy_function = entry->Cast<CopyFunctionCatalogEntry>();
if (!copy_function.function.copy_from_bind) {
throw NotImplementedException("COPY FROM is not supported for FORMAT \"%s\"", stmt.info->format);
}
Expand Down
2 changes: 2 additions & 0 deletions src/storage/serialization/serialize_parse_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ void CopyInfo::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<string>(206, "file_path", file_path);
serializer.WritePropertyWithDefault<case_insensitive_map_t<vector<Value>>>(207, "options", options);
serializer.WritePropertyWithDefault<unique_ptr<QueryNode>>(208, "select_statement", select_statement);
serializer.WritePropertyWithDefault<bool>(209, "is_format_auto_detected", is_format_auto_detected);
}

unique_ptr<ParseInfo> CopyInfo::Deserialize(Deserializer &deserializer) {
Expand All @@ -351,6 +352,7 @@ unique_ptr<ParseInfo> CopyInfo::Deserialize(Deserializer &deserializer) {
deserializer.ReadPropertyWithDefault<string>(206, "file_path", result->file_path);
deserializer.ReadPropertyWithDefault<case_insensitive_map_t<vector<Value>>>(207, "options", result->options);
deserializer.ReadPropertyWithDefault<unique_ptr<QueryNode>>(208, "select_statement", result->select_statement);
deserializer.ReadPropertyWithDefault<bool>(209, "is_format_auto_detected", result->is_format_auto_detected);
return std::move(result);
}

Expand Down
Loading
0