feat(meta cache): stats meta cache unhit in prometheus by soundOfDestiny · Pull Request #9078 · risingwavelabs/risingwave · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

feat(meta cache): stats meta cache unhit in prometheus #9078

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-user-dashboard.json

Large diffs are not rendered by default.

26 changes: 24 additions & 2 deletions grafana/risingwave-dev-dashboard.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -1338,7 +1338,7 @@ def section_frontend(outer_panels):


def section_hummock(panels):
mete_miss_filter = "type='meta_miss'"
meta_miss_filter = "type='meta_miss'"
meta_total_filter = "type='meta_total'"
data_miss_filter = "type='data_miss'"
data_total_filter = "type='data_total'"
Expand Down Expand Up @@ -1568,7 +1568,7 @@ def section_hummock(panels):
"bloom filter miss rate - {{table_id}} - {{type}} @ {{job}} @ {{instance}}",
),
panels.target(
f"(sum(rate({metric('state_store_sst_store_block_request_counts', mete_miss_filter)}[$__rate_interval])) by (job,instance,table_id)) / (sum(rate({metric('state_store_sst_store_block_request_counts', meta_total_filter)}[$__rate_interval])) by (job,instance,table_id))",
f"(sum(rate({metric('state_store_sst_store_block_request_counts', meta_miss_filter)}[$__rate_interval])) by (job,instance,table_id)) / (sum(rate({metric('state_store_sst_store_block_request_counts', meta_total_filter)}[$__rate_interval])) by (job,instance,table_id))",
"meta cache miss rate - {{table_id}} @ {{job}} @ {{instance}}",
),
panels.target(
Expand Down Expand Up @@ -1750,6 +1750,28 @@ def section_hummock(panels):
),
],
),

panels.timeseries_count(
"Fetch Meta Unhits",
"",
[
panels.target(
f"{metric('state_store_iter_fetch_meta_cache_unhits')}",
"",
),
],
),

panels.timeseries_count(
"Slow Fetch Meta Unhits",
"",
[
panels.target(
f"{metric('state_store_iter_slow_fetch_meta_cache_unhits')}",
"",
),
],
),
]


Expand Down
2 changes: 1 addition & 1 deletion grafana/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion grafana/risingwave-user-dashboard.json

Large diffs are not rendered by default.

25 changes: 19 additions & 6 deletions src/storage/src/hummock/sstable_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use await_tree::InstrumentAwait;
use bytes::{Buf, BufMut, Bytes};
use fail::fail_point;
use itertools::Itertools;
use risingwave_common::cache::{CachePriority, LruCacheEventListener};
use risingwave_common::cache::{CachePriority, LookupResponse, LruCacheEventListener};
use risingwave_hummock_sdk::{HummockSstableObjectId, OBJECT_SUFFIX};
use risingwave_object_store::object::{
BlockLocation, MonitoredStreamingReader, ObjectError, ObjectMetadata, ObjectStoreRef,
Expand Down Expand Up @@ -354,14 +354,17 @@ impl SstableStore {
self.meta_cache.clear();
}

/// Returns `table_holder`, `local_cache_meta_block_miss` (1 if cache miss) and
/// `local_cache_meta_block_unhit` (1 if not cache hit).
pub async fn sstable_syncable(
&self,
sst: &SstableInfo,
stats: &StoreLocalStatistic,
) -> HummockResult<(TableHolder, u64)> {
) -> HummockResult<(TableHolder, u64, u64)> {
let mut local_cache_meta_block_miss = 0;
let mut local_cache_meta_block_unhit = 0;
let object_id = sst.get_object_id();
let result = self
let lookup_response = self
.meta_cache
.lookup_with_request_dedup::<_, HummockError, _>(
object_id,
Expand Down Expand Up @@ -390,10 +393,20 @@ impl SstableStore {
Ok((Box::new(sst), charge))
}
},
)
);
if !matches!(lookup_response, LookupResponse::Cached(..)) {
local_cache_meta_block_unhit += 1;
}
let result = lookup_response
.verbose_instrument_await("meta_cache_lookup")
.await;
result.map(|table_holder| (table_holder, local_cache_meta_block_miss))
result.map(|table_holder| {
(
table_holder,
local_cache_meta_block_miss,
local_cache_meta_block_unhit,
)
})
}

pub async fn sstable(
Expand All @@ -402,7 +415,7 @@ impl SstableStore {
stats: &mut StoreLocalStatistic,
) -> HummockResult<TableHolder> {
self.sstable_syncable(sst, stats).await.map(
|(table_holder, local_cache_meta_block_miss)| {
|(table_holder, local_cache_meta_block_miss, ..)| {
stats.apply_meta_fetch(local_cache_meta_block_miss);
table_holder
},
Expand Down
19 changes: 16 additions & 3 deletions src/storage/src/hummock/store/version.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,8 @@ impl HummockVersionReader {
}
}

/// Threshold, in seconds, for the time spent fetching meta while creating an iterator.
/// When the measured fetch duration exceeds this value, a warning is logged and the
/// `iter_slow_fetch_meta_cache_unhits` gauge is set to the number of meta cache unhits
/// observed during that fetch.
const SLOW_ITER_FETCH_META_DURATION_SECOND: f64 = 5.0;

impl HummockVersionReader {
pub async fn get(
&self,
Expand Down Expand Up @@ -653,12 +655,23 @@ impl HummockVersionReader {
.iter_fetch_meta_duration
.with_label_values(&[table_id_label])
.start_timer();
let mut local_cache_meta_block_unhit = 0;
let mut flatten_resps = vec![None; req_count];
for flatten_req in flatten_reqs {
let (req_index, resp) = flatten_req.await?;
local_cache_meta_block_unhit += resp.2;
flatten_resps[req_count - req_index - 1] = Some(resp);
}
timer.observe_duration();
let fetch_meta_duration_sec = timer.stop_and_record();
self.state_store_metrics
.iter_fetch_meta_cache_unhits
.set(local_cache_meta_block_unhit as i64);
if fetch_meta_duration_sec > SLOW_ITER_FETCH_META_DURATION_SECOND {
tracing::warn!("Fetching meta while creating an iter to read table_id {:?} at epoch {:?} is slow: duration = {:?}s, cache unhits = {:?}.", table_id_string, epoch, fetch_meta_duration_sec, local_cache_meta_block_unhit);
self.state_store_metrics
.iter_slow_fetch_meta_cache_unhits
.set(local_cache_meta_block_unhit as i64);
}

let mut sst_read_options = SstableIteratorReadOptions::from(&read_options);
if read_options.prefetch_options.exhaust_iter {
Expand All @@ -671,7 +684,7 @@ impl HummockVersionReader {
if level_type == LevelType::Nonoverlapping as i32 {
let mut sstables = vec![];
for sstable_info in fetch_meta_req {
let (sstable, local_cache_meta_block_miss) =
let (sstable, local_cache_meta_block_miss, ..) =
flatten_resps.pop().unwrap().unwrap();
assert_eq!(sstable_info.get_object_id(), sstable.value().id);
local_stats.apply_meta_fetch(local_cache_meta_block_miss);
Expand All @@ -697,7 +710,7 @@ impl HummockVersionReader {
} else {
let mut iters = Vec::new();
for sstable_info in fetch_meta_req {
let (sstable, local_cache_meta_block_miss) =
let (sstable, local_cache_meta_block_miss, ..) =
flatten_resps.pop().unwrap().unwrap();
assert_eq!(sstable_info.get_object_id(), sstable.value().id);
local_stats.apply_meta_fetch(local_cache_meta_block_miss);
Expand Down
21 changes: 20 additions & 1 deletion src/storage/src/monitor/hummock_state_store_metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ use std::sync::Arc;
use prometheus::core::{AtomicU64, Collector, Desc, GenericCounterVec};
use prometheus::{
exponential_buckets, histogram_opts, proto, register_histogram_vec_with_registry,
register_int_counter_vec_with_registry, HistogramVec, IntGauge, Opts, Registry,
register_int_counter_vec_with_registry, register_int_gauge_with_registry, HistogramVec,
IntGauge, Opts, Registry,
};

/// [`HummockStateStoreMetrics`] stores the performance and IO metrics of `XXXStore` such as
Expand All @@ -35,6 +36,8 @@ pub struct HummockStateStoreMetrics {
pub get_shared_buffer_hit_counts: GenericCounterVec<AtomicU64>,
pub remote_read_time: HistogramVec,
pub iter_fetch_meta_duration: HistogramVec,
pub iter_fetch_meta_cache_unhits: IntGauge,
pub iter_slow_fetch_meta_cache_unhits: IntGauge,

pub read_req_bloom_filter_positive_counts: GenericCounterVec<AtomicU64>,
pub read_req_positive_but_non_exist_counts: GenericCounterVec<AtomicU64>,
Expand Down Expand Up @@ -113,6 +116,20 @@ impl HummockStateStoreMetrics {
let iter_fetch_meta_duration =
register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap();

let iter_fetch_meta_cache_unhits = register_int_gauge_with_registry!(
"state_store_iter_fetch_meta_cache_unhits",
"Number of SST meta cache unhit during one iterator meta fetch",
registry
)
.unwrap();

let iter_slow_fetch_meta_cache_unhits = register_int_gauge_with_registry!(
"state_store_iter_slow_fetch_meta_cache_unhits",
"Number of SST meta cache unhit during a iterator meta fetch which is slow (costs >5 seconds)",
registry
)
.unwrap();

// ----- write_batch -----
let write_batch_tuple_counts = register_int_counter_vec_with_registry!(
"state_store_write_batch_tuple_counts",
Expand Down Expand Up @@ -171,6 +188,8 @@ impl HummockStateStoreMetrics {
get_shared_buffer_hit_counts,
remote_read_time,
iter_fetch_meta_duration,
iter_fetch_meta_cache_unhits,
iter_slow_fetch_meta_cache_unhits,
read_req_bloom_filter_positive_counts,
read_req_positive_but_non_exist_counts,
read_req_check_bloom_filter_counts,
Expand Down