[Dev] Fix Arrow fixed size binary reading by Tishj · Pull Request #17573 · duckdb/duckdb · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

[Dev] Fix Arrow fixed size binary reading #17573

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions src/function/table/arrow_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,26 +349,29 @@ static void ArrowToDuckDBBlob(Vector &vector, ArrowArray &array, const ArrowScan
//! Have to check validity mask before setting this up
idx_t offset = GetEffectiveOffset(array, parent_offset, scan_state, nested_offset) * fixed_size;
auto cdata = ArrowBufferData<char>(array, 1);
auto blob_len = fixed_size;
auto result = FlatVector::GetData<string_t>(vector);
for (idx_t row_idx = 0; row_idx < size; row_idx++) {
if (FlatVector::IsNull(vector, row_idx)) {
offset += blob_len;
continue;
}
auto bptr = cdata + offset;
auto blob_len = fixed_size;
FlatVector::GetData<string_t>(vector)[row_idx] = StringVector::AddStringOrBlob(vector, bptr, blob_len);
result[row_idx] = StringVector::AddStringOrBlob(vector, bptr, blob_len);
offset += blob_len;
}
} else if (size_type == ArrowVariableSizeType::NORMAL) {
auto offsets =
ArrowBufferData<uint32_t>(array, 1) + GetEffectiveOffset(array, parent_offset, scan_state, nested_offset);
auto cdata = ArrowBufferData<char>(array, 2);
auto result = FlatVector::GetData<string_t>(vector);
for (idx_t row_idx = 0; row_idx < size; row_idx++) {
if (FlatVector::IsNull(vector, row_idx)) {
continue;
}
auto bptr = cdata + offsets[row_idx];
auto blob_len = offsets[row_idx + 1] - offsets[row_idx];
FlatVector::GetData<string_t>(vector)[row_idx] = StringVector::AddStringOrBlob(vector, bptr, blob_len);
result[row_idx] = StringVector::AddStringOrBlob(vector, bptr, blob_len);
}
} else {
//! Check if last offset is higher than max uint32
Expand All @@ -378,13 +381,14 @@ static void ArrowToDuckDBBlob(Vector &vector, ArrowArray &array, const ArrowScan
auto offsets =
ArrowBufferData<uint64_t>(array, 1) + GetEffectiveOffset(array, parent_offset, scan_state, nested_offset);
auto cdata = ArrowBufferData<char>(array, 2);
auto result = FlatVector::GetData<string_t>(vector);
for (idx_t row_idx = 0; row_idx < size; row_idx++) {
if (FlatVector::IsNull(vector, row_idx)) {
continue;
}
auto bptr = cdata + offsets[row_idx];
auto blob_len = offsets[row_idx + 1] - offsets[row_idx];
FlatVector::GetData<string_t>(vector)[row_idx] = StringVector::AddStringOrBlob(vector, bptr, blob_len);
result[row_idx] = StringVector::AddStringOrBlob(vector, bptr, blob_len);
}
}
}
Expand Down
21 changes: 21 additions & 0 deletions tools/pythonpkg/tests/fast/arrow/test_arrow_fixed_binary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pytest

pa = pytest.importorskip("pyarrow")


class TestArrowFixedBinary(object):
    """Checks that a fixed-size binary Arrow column with a NULL entry is read back correctly."""

    def test_arrow_fixed_binary(self, duckdb_cursor):
        """Query a 12-byte fixed-size binary column and compare the hex rendering of each row."""
        # One NULL followed by two 12-byte values; the NULL comes first so that
        # the non-NULL rows are not at the start of the column.
        raw_values = [
            None,
            b'\x66\x4d\xf4\xae\xb1\x5c\xb0\x4a\xdd\x5d\x1d\x54',
            b'\x66\x4d\xf4\xf0\xa3\xfc\xec\x5b\x26\x81\x4e\x1d',
        ]
        expected_rows = [(None,), ('664df4aeb15cb04add5d1d54',), ('664df4f0a3fcec5b26814e1d',)]

        # NOTE(review): the query below names `arrow_table`; presumably DuckDB resolves
        # it from this local variable, so keep the name and keep it in this function.
        arrow_table = pa.Table.from_arrays([pa.array(raw_values, type=pa.binary(12))], names=["id"])

        query = "\nSELECT lower(hex(id)) as id FROM arrow_table\n"
        fetched_rows = duckdb_cursor.sql(query).fetchall()
        assert fetched_rows == expected_rows
Loading
0