[Dev] Fix wrong result reported by Roaring Compression FinalAnalyze #15677

Merged: 2 commits, Jan 13, 2025
4 changes: 3 additions & 1 deletion src/include/duckdb/storage/compression/roaring/roaring.hpp
@@ -244,7 +244,9 @@ struct RoaringAnalyzeState : public AnalyzeState {
 	//! Flushed analyze data

 	//! The space used by the current segment
-	idx_t space_used = 0;
+	idx_t data_size = 0;
+	idx_t metadata_size = 0;

 	//! The total amount of segments to write
 	idx_t segment_count = 0;
 	//! The amount of values in the current segment;
12 changes: 7 additions & 5 deletions src/storage/compression/roaring/analyze.cpp
@@ -108,6 +108,7 @@ void RoaringAnalyzeState::Flush(RoaringAnalyzeState &state) {
 }

 bool RoaringAnalyzeState::HasEnoughSpaceInSegment(idx_t required_space) {
+	auto space_used = data_size + metadata_size;
 	D_ASSERT(space_used <= info.GetBlockSize());
 	idx_t remaining_space = info.GetBlockSize() - space_used;
 	if (required_space > remaining_space) {
@@ -117,13 +118,15 @@ bool RoaringAnalyzeState::HasEnoughSpaceInSegment(idx_t required_space) {
 }

 void RoaringAnalyzeState::FlushSegment() {
+	auto space_used = data_size + metadata_size;
 	if (!current_count) {
 		D_ASSERT(!space_used);
 		return;
 	}
 	metadata_collection.FlushSegment();
 	total_size += space_used;
-	space_used = 0;
+	data_size = 0;
+	metadata_size = 0;
 	current_count = 0;
 	segment_count++;
 }
@@ -146,15 +149,14 @@ void RoaringAnalyzeState::FlushContainer() {
 		arrays_count++;
 	}

-	idx_t required_space = metadata_collection.GetMetadataSize(runs_count + arrays_count, runs_count, arrays_count);
+	metadata_size = metadata_collection.GetMetadataSize(runs_count + arrays_count, runs_count, arrays_count);

-	required_space += metadata.GetDataSizeInBytes(count);
-	if (!HasEnoughSpaceInSegment(required_space)) {
+	data_size += metadata.GetDataSizeInBytes(count);
+	if (!HasEnoughSpaceInSegment(metadata_size + data_size)) {
 		FlushSegment();
 	}
 	container_metadata.push_back(metadata);
 	metadata_collection.AddMetadata(metadata);
-	space_used += required_space;
 	current_count += count;

 	// Reset the container analyze state
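As I read this diff, metadata_collection.GetMetadataSize(...) returns the cumulative metadata size of all containers flushed into the current segment so far (runs_count and arrays_count are running totals), so the old `space_used += required_space` re-counted every earlier container's metadata on each FlushContainer call, inflating the size FinalAnalyze reported. A minimal standalone sketch of the two accounting schemes, with invented per-container sizes:

// Sketch (not DuckDB code) of the accounting fix, using made-up numbers:
// assume each container contributes 2 bytes of metadata and 90 bytes of data,
// and that GetMetadataSize() returns the cumulative metadata size so far.
#include <cstdio>

int main() {
	const int containers = 3;
	const int meta_per_container = 2;
	const int data_per_container = 90;

	// Old accounting: the cumulative metadata size was folded into
	// required_space and then accumulated, counting old metadata again.
	int space_used = 0;
	for (int i = 1; i <= containers; i++) {
		int cumulative_metadata = i * meta_per_container;
		int required_space = cumulative_metadata + data_per_container;
		space_used += required_space;
	}

	// New accounting: metadata_size holds only the latest cumulative value,
	// and only the data size is accumulated.
	int data_size = 0;
	int metadata_size = 0;
	for (int i = 1; i <= containers; i++) {
		metadata_size = i * meta_per_container;
		data_size += data_per_container;
	}

	// Prints: old=282 new=276 (the old total counts metadata 1+2+3 times)
	printf("old=%d new=%d\n", space_used, metadata_size + data_size);
	return 0;
}

Under these invented numbers the old scheme reports 282 bytes where only 276 are used, and the overcount grows quadratically with the number of containers per segment.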
12 changes: 9 additions & 3 deletions src/storage/table/column_data_checkpointer.cpp
@@ -5,6 +5,7 @@
 #include "duckdb/storage/data_table.hpp"
 #include "duckdb/parser/column_definition.hpp"
 #include "duckdb/storage/table/scan_state.hpp"
+#include "duckdb/main/database.hpp"

 namespace duckdb {

@@ -240,14 +241,19 @@ vector<CheckpointAnalyzeResult> ColumnDataCheckpointer::DetectBestCompressionMethod
 		}
 	}

+	auto &checkpoint_state = checkpoint_states[i];
+	auto &col_data = checkpoint_state.get().column_data;
 	if (!chosen_state) {
-		auto &checkpoint_state = checkpoint_states[i];
-		auto &col_data = checkpoint_state.get().column_data;
 		throw FatalException("No suitable compression/storage method found to store column of type %s",
 		                     col_data.type.ToString());
 	}
 	D_ASSERT(compression_idx != DConstants::INVALID_INDEX);
-	result[i] = CheckpointAnalyzeResult(std::move(chosen_state), *functions[compression_idx]);
+
+	auto &best_function = *functions[compression_idx];
+	Logger::Info(db, "FinalAnalyze(%s) result for %s.%s.%d(%s): %d", EnumUtil::ToString(best_function.type),
+	             col_data.info.GetSchemaName(), col_data.info.GetTableName(), col_data.column_index,
+	             col_data.type.ToString(), best_score);
+	result[i] = CheckpointAnalyzeResult(std::move(chosen_state), best_function);
 	}
 	return result;
 }
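For reference, a hypothetical, self-contained sketch of the message shape the new Logger::Info call emits; DuckDB's Logger and EnumUtil plumbing is elided, and the concrete values are copied from the roaring tests below:

// Illustration only: reproduces the log-line format from the diff above
// so the duckdb_logs queries in the tests are easier to follow.
#include <cstdio>

int main() {
	const char *compression = "COMPRESSION_ROARING"; // EnumUtil::ToString(best_function.type)
	const char *schema = "main";                     // col_data.info.GetSchemaName()
	const char *table = "test_roaring";              // col_data.info.GetTableName()
	unsigned long long column_index = 0;             // col_data.column_index (assumed 0 here)
	const char *type = "VALIDITY";                   // col_data.type.ToString()
	unsigned long long best_score = 10944;           // FinalAnalyze result in bytes
	printf("FinalAnalyze(%s) result for %s.%s.%llu(%s): %llu\n",
	       compression, schema, table, column_index, type, best_score);
	// Prints: FinalAnalyze(COMPRESSION_ROARING) result for main.test_roaring.0(VALIDITY): 10944
	return 0;
}

This is why the tests extract the byte count with message.split(': ')[2]: the only ': ' in the message separates the description from the reported size.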
83 changes: 83 additions & 0 deletions test/sql/storage/compression/roaring/roaring_analyze_array.test
@@ -0,0 +1,83 @@
# name: test/sql/storage/compression/roaring/roaring_analyze_array.test
# description: Check the produced (final_)analyze result
# group: [roaring]

require block_size 262144

require noforcestorage

load __TEST_DIR__/test_roaring.db

statement ok
set logging_level='info';

# 1 rowgroup
statement ok
set variable dataset_size = 122880;

statement ok
PRAGMA force_compression='uncompressed'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_uncompressed AS SELECT
case
when i%25=0
then 1337
else null
end
FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
set enable_logging=false;

query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
message.starts_with('FinalAnalyze') and
message.contains('test_uncompressed') and
message.contains('VALIDITY') and
message.contains('COMPRESSION_UNCOMPRESSED');
----
15360

statement ok
PRAGMA force_compression='roaring'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_roaring AS select * from test_uncompressed;

statement ok
checkpoint

statement ok
set enable_logging=false;

# For single row group
# 60 vectors with 82 non-null values per vector
# Total compressed bytes:
# 2 bits (is_inverted + is_run) + 8 bits (cardinality) = 10 bits per Vector
# 10 * 60 = 600 bits == 75 bytes of metadata per RowGroup
#
# 8 (compressed overhead) + (82 * sizeof(uint8_t)) = 90 bytes per Vector
# 90 * 60 = 5400 bytes of data per RowGroup
# 5475 bytes

# The computed size is multiplied by 2, to pay for the slower decompression speed
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
message.starts_with('FinalAnalyze') and
message.contains('test_roaring') and
message.contains('VALIDITY') and
message.contains('COMPRESSION_ROARING');
----
10944
83 changes: 83 additions & 0 deletions test/sql/storage/compression/roaring/roaring_analyze_bitset.test
@@ -0,0 +1,83 @@
# name: test/sql/storage/compression/roaring/roaring_analyze_bitset.test
# description: Check the produced (final_)analyze result
# group: [roaring]

require block_size 262144

require noforcestorage

load __TEST_DIR__/test_roaring.db

statement ok
set logging_level='info';

# 1 rowgroup
statement ok
set variable dataset_size = 122880;

statement ok
PRAGMA force_compression='uncompressed'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_uncompressed AS SELECT
case
when i%3=0
then 1337
else null
end
FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
set enable_logging=false;

query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
message.starts_with('FinalAnalyze') and
message.contains('test_uncompressed') and
message.contains('VALIDITY') and
message.contains('COMPRESSION_UNCOMPRESSED');
----
15360

statement ok
PRAGMA force_compression='roaring'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_roaring AS select * from test_uncompressed;

statement ok
checkpoint

statement ok
set enable_logging=false;

# For single row group
# 60 vectors, each stored as a full bitset container (roughly one in three values is non-null)
# Total compressed bytes:
# 2 bits (is_inverted + is_run) = 2 bits per Vector
# 2 * 60 = 120 bits == 15 bytes of metadata per RowGroup
#
# 256 bytes per Vector
# 256 * 60 = 15360 bytes of data per RowGroup
# 15375 bytes

# The computed size is multiplied by 2, to pay for the slower decompression speed
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
message.starts_with('FinalAnalyze') and
message.contains('test_roaring') and
message.contains('VALIDITY') and
message.contains('COMPRESSION_ROARING');
----
30872
83 changes: 83 additions & 0 deletions test/sql/storage/compression/roaring/roaring_analyze_run.test
@@ -0,0 +1,83 @@
# name: test/sql/storage/compression/roaring/roaring_analyze_run.test
# description: Check the produced (final_)analyze result
# group: [roaring]

require block_size 262144

require noforcestorage

load __TEST_DIR__/test_roaring.db

statement ok
set logging_level='info';

# 1 rowgroup
statement ok
set variable dataset_size = 122880;

statement ok
PRAGMA force_compression='uncompressed'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_uncompressed AS SELECT
case
when i = 0 or (i % 512 != 0 and (i % 512) < 350 or (i % 512) > 450)
then null
else 1337
end
FROM range(getvariable('dataset_size')) tbl(i);

statement ok
checkpoint

statement ok
set enable_logging=false;

query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
message.starts_with('FinalAnalyze') and
message.contains('test_uncompressed') and
message.contains('VALIDITY') and
message.contains('COMPRESSION_UNCOMPRESSED');
----
15360

statement ok
PRAGMA force_compression='roaring'

statement ok
set enable_logging=true;

statement ok
CREATE TABLE test_roaring AS select * from test_uncompressed;

statement ok
checkpoint

statement ok
set enable_logging=false;

# For single row group
# 60 vectors with 7 or 8 runs of nulls per vector
# Total compressed bytes:
# 2 bits (is_inverted + is_run) + 7 bits (run_size) = 9 bits per Vector
# 9 * 60 = 540 bits == 67 bytes of metadata per RowGroup
#
# 8 (compressed overhead) + (8 * sizeof(uint16_t)) = 24 bytes per Vector
# 24 * 60 = 1440 bytes of data per RowGroup
# 1507 bytes

# The computed size is multiplied by 2, to pay for the slower decompression speed
query I
SELECT message.split(': ')[2]::INTEGER FROM duckdb_logs
where
message.starts_with('FinalAnalyze') and
message.contains('test_roaring') and
message.contains('VALIDITY') and
message.contains('COMPRESSION_ROARING');
----
3024
19 changes: 19 additions & 0 deletions test/sql/storage/compression/zstd/zstd_compression_ratio.test_slow
@@ -6,6 +6,12 @@ require block_size 262144

 load __TEST_DIR__/test_zstd_compression_ratio.db

+statement ok
+set enable_logging=true;
+
+statement ok
+set logging_level='info';
+
 statement ok
 PRAGMA force_compression='zstd'

@@ -30,6 +36,12 @@ create table test_compressed as (
 statement ok
 checkpoint

+# FinalAnalyze reports the ZSTD-compressed data to be 230.8 MB
+query I
+SELECT message FROM duckdb_logs where message.starts_with('FinalAnalyze') and message.contains('test_compressed') and message.contains('VARCHAR') order by timestamp;
+----
+FinalAnalyze(COMPRESSION_ZSTD) result for main.test_compressed.0(VARCHAR): 230801376
+
 statement ok
 PRAGMA force_compression='uncompressed'

@@ -50,6 +62,12 @@ CREATE TABLE test_uncompressed as (
 statement ok
 checkpoint

+# The uncompressed data would have been 462.4 MB
+query I
+SELECT message FROM duckdb_logs where message.starts_with('FinalAnalyze') and message.contains('test_uncompressed') and message.contains('VARCHAR') order by timestamp;
+----
+FinalAnalyze(COMPRESSION_UNCOMPRESSED) result for main.test_uncompressed.0(VARCHAR): 462400000
+
 query I
 SELECT compression FROM pragma_storage_info('test_compressed') WHERE segment_type != 'VALIDITY' AND compression != 'ZSTD';
 ----

@@ -70,6 +88,7 @@ CREATE TYPE test_result AS UNION (
 	)
 );

+# This roughly aligns with the ratio of the uncompressed and ZSTD FinalAnalyze results: 462400000 / 230801376 ≈ 2.0
 statement ok
 set variable min_ratio = 1.8;
 set variable max_ratio = 2.5;
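A quick sanity check (my addition, using only the byte counts from the two test expectations above) that the FinalAnalyze ratio indeed falls inside the [1.8, 2.5] window enforced via min_ratio/max_ratio:

// Verifies the ~2x FinalAnalyze ratio claimed in the test comment above.
#include <cstdio>

int main() {
	const double uncompressed = 462400000.0; // FinalAnalyze(COMPRESSION_UNCOMPRESSED)
	const double zstd = 230801376.0;         // FinalAnalyze(COMPRESSION_ZSTD)
	double ratio = uncompressed / zstd;      // ~2.0035
	printf("ratio = %.4f, in bounds: %s\n", ratio,
	       (ratio >= 1.8 && ratio <= 2.5) ? "yes" : "no");
	return 0;
}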