SDSTOR-22799 feat: chunk ownership tracking, remove GC shard header/footer I/O, blob commit validation#446
Conversation
f8e31f5 to
1237535
Compare
|
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## dev/refactor_create_seal_shard #446 +/- ##
=================================================================
Coverage ? 52.63%
=================================================================
Files ? 36
Lines ? 5348
Branches ? 672
=================================================================
Hits ? 2815
Misses ? 2236
Partials ? 297 ☔ View full report in Codecov by Harness. 🚀 New features to boost your workflow:
|
1237535 to
ea3c34b
Compare
…ooter I/O, blob commit validation - ChunkSelector tracks which shard owns each chunk; select/release enforce ownership and reject wrong-owner calls - GC no longer writes physical shard header/footer blocks - Blob commit validates chunk state matches shard metadata
ea3c34b to
664db00
Compare
| @@ -292,9 +292,6 @@ class GCManager { | |||
| pdev_gc_metrics& metrics() { return metrics_; } | |||
|
|
|||
| private: | |||
| // 2 update the chunk state of move_from_chunk, now it is a reserved chunk | ||
| move_from_vchunk->m_pg_id = std::nullopt; | ||
| move_from_vchunk->m_v_chunk_id = std::nullopt; | ||
| move_from_vchunk->m_owner_shard_id = std::nullopt; |
There was a problem hiding this comment.
this line will revert the change in line 361
There was a problem hiding this comment.
this line(371) is to clear move_from_vchunk.
line 361 is to set move_to_vchunk.
| } | ||
| bool res = chunk_selector_->recover_pg_chunks_states(pair.first, excluding_chunks); | ||
| // After restart, each chunk retains at most one shard log, because every | ||
| // seal-shard commit forces a cp flush. |
There was a problem hiding this comment.
no, we only flush dc_lsn, not trigger a cp flush as of now
| // check the blob's pchunk id and shard's pchunk id for this blob | ||
| if (blob_info.pbas.chunk_num() != sb_pchunk_id) { | ||
| BLOGE(tid, blob_info.shard_id, blob_info.blob_id, | ||
| "commit-time pchunk id mismatch: blob pchunk={} shard pchunk={}", blob_info.pbas.chunk_num(), | ||
| sb_pchunk_id); | ||
| RELEASE_ASSERT(false, "traceID={}, commit-time pchunk id mismatch: blob pchunk={} shard pchunk={}", tid, | ||
| blob_info.pbas.chunk_num(), sb_pchunk_id); | ||
| } | ||
| // check shard meta consistency | ||
| const auto ext_chunk = chunk_selector()->get_pg_vchunk(pg_id, sb_vchunk_id); | ||
| if (ext_chunk == nullptr) { | ||
| BLOGE(tid, blob_info.shard_id, blob_info.blob_id, | ||
| "chunk not found at commit time: pg={} v_chunk={} shard=0x{:x} blob={}", pg_id, sb_vchunk_id, | ||
| blob_info.shard_id, blob_info.blob_id); | ||
| RELEASE_ASSERT(false, "traceID={}, chunk not found at commit time: pg={} v_chunk={} shard=0x{:x} blob={}", | ||
| tid, pg_id, sb_vchunk_id, blob_info.shard_id, blob_info.blob_id); | ||
| } else if (!ext_chunk->valid() || ext_chunk->m_state != ChunkState::INUSE || | ||
| ext_chunk->m_owner_shard_id != blob_info.shard_id || ext_chunk->get_chunk_id() != sb_pchunk_id) { | ||
| BLOGE(tid, blob_info.shard_id, blob_info.blob_id, | ||
| "commit-time chunk state invalid: pg={} v_chunk={} blob={} state={} owner=0x{:x} pchunk={} " | ||
| "sb_pchunk={}", | ||
| pg_id, sb_vchunk_id, blob_info.blob_id, ext_chunk->m_state, ext_chunk->m_owner_shard_id.value_or(0), | ||
| ext_chunk->get_chunk_id(), sb_pchunk_id); | ||
| RELEASE_ASSERT( | ||
| false, | ||
| "traceID={}, commit-time chunk state invalid: pg={} v_chunk={} blob={} state={} owner=0x{:x} " | ||
| "pchunk={} sb_pchunk={}", | ||
| tid, pg_id, sb_vchunk_id, blob_info.blob_id, ext_chunk->m_state, | ||
| ext_chunk->m_owner_shard_id.value_or(0), ext_chunk->get_chunk_id(), sb_pchunk_id); | ||
| } |
There was a problem hiding this comment.
better consolidate BLOGE and RELEASE_ASSERT into only one RELEASE_ASSERT for all the logs here
…ted. keep the chunk state clean(initialized) - enhance GC chunk ownership handling, distinguish between normal and recovery - CMakeList add properties to avoid running test in parallel
| --override_config hs_backend_config.gc_scan_interval_sec=5 | ||
| --gtest_filter=HomeObjectFixture.StalePChunkRouteAfterGC:HomeObjectFixture.StaleBlobRouteAfterSealAndGC) | ||
|
|
||
| # All tests share disk, port, and shared-memory resources, so run them serially to avoid parallel conflicts. |
There was a problem hiding this comment.
any issue without this change?
| LOGDEBUGMOD(gcmgr, "EGC recovery: set move_from_chunk={} owner to shard=0x{:x} for replay", | ||
| move_from_chunk, open_shard.value()); | ||
| } else { | ||
| LOGWARNMOD(gcmgr, "EGC recovery: no open shard found in move_from_chunk={} or move_to_chunk={}", |
There was a problem hiding this comment.
should we assert here? this is an unexpected case

ChunkSelector tracks which shard owns each chunk; select/release enforce ownership and reject wrong-owner calls
GC no longer writes physical shard header/footer blocks
Blob commit validates chunk state matches shard metadata