Skip to content

Commit 4ee3c5c

Browse files
obdevfootka
authored andcommitted
[CP] [vector index] hardening fix ddl hung issue
Co-authored-by: footka <672528926@qq.com>
1 parent f664154 commit 4ee3c5c

13 files changed

Lines changed: 180 additions & 26 deletions

deps/oblib/src/lib/utility/ob_tracepoint_def.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,9 @@ GLOBAL_ERRSIM_POINT_DEF(1201, EN_DISABLE_NEWSORT_FIXED_KEY_OPT, "Used to control
580580
// vector index 1300 - 1399
581581
GLOBAL_ERRSIM_POINT_DEF(1300, EN_VEC_INDEX_IVF_ROWKEY_CID_BUILD_ERR, "");
582582
GLOBAL_ERRSIM_POINT_DEF(1301, EN_VEC_INDEX_IVF_PQ_BUILD_ERR, "");
583+
GLOBAL_ERRSIM_POINT_DEF(1302, EN_VEC_INDEX_HNSW_SNAPSHOT_TABLE_BUILD_ERR, "");
584+
GLOBAL_ERRSIM_POINT_DEF(1303, EN_VEC_INDEX_HNSW_REBUILD_CREATE_ERR, "");
585+
GLOBAL_ERRSIM_POINT_DEF(1304, EN_VEC_INDEX_HNSW_RELEASE_SNAPSHOT_ERR, "");
583586

584587
// Transaction // 2001 - 2100
585588
// Transaction free route

src/rootserver/ddl_task/ob_ddl_task.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ enum ObDDLUpdateParentTaskIDType
171171
UPDATE_CREATE_INDEX_ID = 0,
172172
UPDATE_DROP_INDEX_TASK_ID,
173173
UPDATE_VEC_REBUILD_CREATE_INDEX_TASK_ID,
174-
UPDATE_VEC_REBUILD_DROP_INDEX_TASK_ID,
174+
UPDATE_VEC_REBUILD_DROP_INDEX_TASK_ID
175175
};
176176

177177
struct ObVecIndexDDLChildTaskInfo final

src/rootserver/ddl_task/ob_drop_vec_index_task.cpp

Lines changed: 86 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ int ObDropVecIndexTask::obtain_snapshot(const share::ObDDLTaskStatus next_task_s
179179
{
180180
int ret = OB_SUCCESS;
181181
bool state_finished = false;
182+
bool is_snapshot_table_exist = true;
182183
ObDDLTaskStatus old_status = task_status_;
183184
if (OB_UNLIKELY(!is_inited_)) {
184185
ret = OB_NOT_INIT;
@@ -195,13 +196,34 @@ int ObDropVecIndexTask::obtain_snapshot(const share::ObDDLTaskStatus next_task_s
195196
if (OB_FAIL(switch_status(next_task_status, true, ret))) {
196197
LOG_WARN("fail to switch task status", K(ret), K(next_task_status));
197198
}
198-
} else if (OB_FAIL(ObDDLUtil::obtain_snapshot(next_task_status, vec_index_snapshot_data_.table_id_,
199-
vec_index_snapshot_data_.table_id_, snapshot_version_,
200-
this))) {
201-
LOG_WARN("fail to obtain_snapshot", K(ret), K(snapshot_version_));
202-
} else {
203-
state_finished = true;
199+
} else if (OB_FAIL(check_snapshot_table_exist(is_snapshot_table_exist))) {
200+
LOG_WARN("fail to check snapshot table exist", K(ret));
201+
}
202+
203+
// skip and success,switch to DROP_AUX_INDEX_TABLE
204+
if (OB_SUCC(ret) && !state_finished) {
205+
if (!is_snapshot_table_exist) { // snapshot table not exist, skip obtain snapshot
206+
if (OB_FAIL(switch_status(ObDDLTaskStatus::DROP_AUX_INDEX_TABLE, true, ret))) {
207+
LOG_WARN("fail to switch task status when skip obtain snapshot", K(ret));
208+
} else {
209+
state_finished = true;
210+
}
211+
} else if (OB_FAIL(ObDDLUtil::obtain_snapshot(next_task_status,
212+
vec_index_snapshot_data_.table_id_,
213+
vec_index_snapshot_data_.table_id_,
214+
snapshot_version_,
215+
this))) {
216+
LOG_WARN("fail to obtain_snapshot", K(ret), K(snapshot_version_));
217+
} else if (snapshot_version_ <= 0) {
218+
ret = OB_ERR_UNEXPECTED;
219+
LOG_WARN("snapshot version is invalid", K(ret), K(snapshot_version_));
220+
} else if (OB_FAIL(update_task_message())) {
221+
LOG_WARN("fail to snapshot_version_ to __all_ddl_task_status", K(ret));
222+
} else {
223+
state_finished = true;
224+
}
204225
}
226+
205227
#ifdef ERRSIM
206228
if (OB_SUCC(ret)) {
207229
ret = OB_E(common::EventTable::EN_VEC_INDEX_OBTAIN_SNAPSHOT_ERR) OB_SUCCESS;
@@ -210,6 +232,7 @@ int ObDropVecIndexTask::obtain_snapshot(const share::ObDDLTaskStatus next_task_s
210232
}
211233
}
212234
#endif
235+
213236
if (state_finished && OB_SUCC(ret)) {
214237
LOG_INFO("success to obtain_snapshot", K(ret));
215238
} else if (next_task_status == task_status_) { // resume old task status and retry
@@ -225,17 +248,23 @@ int ObDropVecIndexTask::obtain_snapshot(const share::ObDDLTaskStatus next_task_s
225248
int ObDropVecIndexTask::drop_lob_meta_row(const ObDDLTaskStatus next_task_status)
226249
{
227250
int ret = OB_SUCCESS;
228-
bool is_build_replica_end = false;
251+
bool is_build_replica_end = false;
252+
bool is_exist = true;
253+
DEBUG_SYNC(DROP_VECTOR_INDEX_BEFORE_DELETE_LOB_META);
229254
if (OB_UNLIKELY(!is_inited_)) {
230255
ret = OB_NOT_INIT;
231256
LOG_WARN("ObDropVecIndexTask is not inited", K(ret));
232257
} else if (ObDDLTaskStatus::DROP_LOB_META_ROW != task_status_) {
233258
ret = OB_TASK_EXPIRED;
234259
LOG_WARN("task status not match", K(ret), K(task_status_));
235260
} else if (OB_UNLIKELY(snapshot_version_ <= 0)) {
236-
is_build_replica_end = true; // switch to fail.
237-
ret = OB_ERR_UNEXPECTED;
238-
LOG_WARN("unexpected snapshot", K(ret), KPC(this));
261+
is_build_replica_end = true;
262+
LOG_INFO("finish drop lob meta and release snapshot", K(ret));
263+
} else if (vec_index_snapshot_data_.is_valid() && !del_lob_meta_row_task_submitted_ && OB_FAIL(check_snapshot_table_exist(is_exist))) {
264+
LOG_WARN("fail to check snapshot table exist", K(ret));
265+
} else if (!is_exist) {
266+
is_build_replica_end = true;
267+
LOG_INFO("snapshot table not exist, skip drop lob meta row", K(ret));
239268
} else if (vec_index_snapshot_data_.is_valid() && !del_lob_meta_row_task_submitted_ && OB_FAIL(send_build_single_replica_request())) {
240269
LOG_WARN("fail to send build single replica request", K(ret));
241270
} else if (vec_index_snapshot_data_.is_valid() && del_lob_meta_row_task_submitted_ && OB_FAIL(check_build_single_replica(is_build_replica_end))) {
@@ -244,7 +273,12 @@ int ObDropVecIndexTask::drop_lob_meta_row(const ObDDLTaskStatus next_task_status
244273
is_build_replica_end = true;
245274
}
246275
if (is_build_replica_end) {
247-
ret = OB_SUCC(ret) ? delte_lob_meta_job_ret_code_ : ret;
276+
DEBUG_SYNC(DROP_VECTOR_INDEX_AFTER_DELETE_LOB_META);
277+
// Only consume async job return code when the delete-lob-meta job was actually submitted.
278+
// For skip paths (e.g. snapshot table not exist), keep current ret to allow state transition.
279+
if (OB_SUCC(ret) && del_lob_meta_row_task_submitted_) {
280+
ret = delte_lob_meta_job_ret_code_;
281+
}
248282
#ifdef ERRSIM
249283
if (OB_SUCC(ret)) {
250284
ret = OB_E(common::EventTable::EN_VEC_INDEX_DROP_LOB_META_ROW_ERR) OB_SUCCESS;
@@ -256,7 +290,7 @@ int ObDropVecIndexTask::drop_lob_meta_row(const ObDDLTaskStatus next_task_status
256290
if (OB_FAIL(ret)) {
257291
LOG_WARN("fail in delete lob meta row", K(ret));
258292
} else if (OB_FAIL(finish())) {
259-
LOG_WARN("fail in release snapshot", K(ret));
293+
LOG_WARN("fail to release snapshot", K(ret));
260294
} else if (OB_FAIL(switch_status(next_task_status, true/*enable_flt*/, ret))) {
261295
LOG_WARN("fail to switch task status", K(ret), K(next_task_status));
262296
} else {
@@ -805,8 +839,14 @@ int ObDropVecIndexTask::create_drop_index_task(
805839
} else if (OB_FAIL(guard.get_table_schema(tenant_id_, index_schema->get_data_table_id(), data_table_schema))) {
806840
LOG_WARN("fail to get data table schema", K(ret), K(index_schema->get_data_table_id()));
807841
} else if (OB_UNLIKELY(nullptr == database_schema || nullptr == data_table_schema)) {
808-
ret = OB_ERR_UNEXPECTED;
809-
LOG_WARN("unexpected error, schema is nullptr", K(ret), KP(database_schema), KP(data_table_schema));
842+
if (OB_ISNULL(data_table_schema) && drop_index_arg_.is_hidden_) {
843+
task_id = -1;
844+
LOG_INFO("hidden data_table maybe removed when offline ddl is failed, skip drop",
845+
K(ret), K(index_tid), K(index_name));
846+
} else {
847+
ret = OB_ERR_UNEXPECTED;
848+
LOG_WARN("unexpected error, schema is nullptr", K(ret), KP(database_schema), KP(data_table_schema));
849+
}
810850
} else if (is_domain_index && OB_FAIL(drop_index_sql.assign(drop_index_arg_.ddl_stmt_str_))) {
811851
LOG_WARN("assign user drop index sql failed", K(ret));
812852
} else {
@@ -912,6 +952,14 @@ int ObDropVecIndexTask::finish()
912952
} else if (snapshot_version_ > 0 && OB_FAIL(release_snapshot(snapshot_version_))) {
913953
LOG_WARN("release snapshot failed", K(ret));
914954
}
955+
#ifdef ERRSIM
956+
if (OB_SUCC(ret)) {
957+
ret = OB_E(common::EventTable::EN_VEC_INDEX_HNSW_RELEASE_SNAPSHOT_ERR) OB_SUCCESS;
958+
if (OB_FAIL(ret)) {
959+
LOG_WARN("[ERRSIM] fail to finish after release snapshot", K(ret), K(snapshot_version_));
960+
}
961+
}
962+
#endif
915963
return ret;
916964
}
917965

@@ -928,7 +976,7 @@ int ObDropVecIndexTask::exit_all_dags_and_clean()
928976
if (REACH_COUNT_INTERVAL(1000L)) {
929977
LOG_INFO("wait all delete lob meta row data dag exit", K(dst_tenant_id_), K(task_id_));
930978
}
931-
} else if (OB_FAIL(finish())) {
979+
} else if (OB_FAIL(finish())) { // try release hold snapshot
932980
LOG_WARN("finish tans failed", K(ret));
933981
} else if (OB_FAIL(cleanup())) {
934982
LOG_WARN("cleanup failed", K(ret));
@@ -946,6 +994,8 @@ int ObDropVecIndexTask::cleanup_impl()
946994
} else if (OB_ISNULL(GCTX.sql_proxy_)) {
947995
ret = OB_INVALID_ARGUMENT;
948996
LOG_WARN("invalid argument", KR(ret), KP(GCTX.sql_proxy_));
997+
} else if (OB_FAIL(finish())) { // try release hold snapshot
998+
LOG_WARN("finish tans failed", K(ret));
949999
} else if (OB_FAIL(report_error_code(unused_str))) {
9501000
LOG_WARN("report error code failed", K(ret));
9511001
} else if (OB_FAIL(ObDDLTaskRecordOperator::delete_record(*GCTX.sql_proxy_, tenant_id_, task_id_))) {
@@ -1030,6 +1080,27 @@ int ObDropVecIndexTask::check_build_single_replica(bool &is_end)
10301080
return ret;
10311081
}
10321082

1083+
// check whether all leaders have completed the task
1084+
int ObDropVecIndexTask::check_snapshot_table_exist(bool &is_exist)
1085+
{
1086+
int ret = OB_SUCCESS;
1087+
const ObTableSchema *snapshot_table_schema = nullptr;
1088+
const int64_t table_id = vec_index_snapshot_data_.table_id_;
1089+
ObSchemaGetterGuard schema_guard;
1090+
if (OB_ISNULL(GCTX.schema_service_)) {
1091+
ret = OB_INVALID_ARGUMENT;
1092+
LOG_WARN("invalid argument", KR(ret), KP(GCTX.schema_service_));
1093+
} else if (OB_FAIL(GCTX.schema_service_->get_tenant_schema_guard(tenant_id_, schema_guard))) {
1094+
LOG_WARN("fail to get tenant schema guard", K(ret), K(tenant_id_));
1095+
} else if (OB_FAIL(schema_guard.get_table_schema(tenant_id_, table_id, snapshot_table_schema))) {
1096+
LOG_WARN("get table schema failed", K(ret), K(table_id));
1097+
} else if (OB_ISNULL(snapshot_table_schema)) {
1098+
is_exist = false;
1099+
LOG_WARN("snapshot table is not exist", K(table_id));
1100+
}
1101+
return ret;
1102+
}
1103+
10331104
// update sstable complement status for all leaders
10341105
int ObDropVecIndexTask::update_drop_lob_meta_row_job_status(const common::ObTabletID &tablet_id,
10351106
const ObAddr &addr,

src/rootserver/ddl_task/ob_drop_vec_index_task.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ class ObDropVecIndexTask : public ObDDLTask
104104
int succ();
105105
int send_build_single_replica_request();
106106
int check_build_single_replica(bool &is_end);
107+
int check_snapshot_table_exist(bool &is_exist);
107108
virtual int cleanup_impl() override;
108109
virtual bool is_error_need_retry(const int ret_code) override
109110
{

src/rootserver/ddl_task/ob_drop_vec_ivf_index_task.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -710,8 +710,14 @@ int ObDropVecIVFIndexTask::create_drop_index_task(
710710
} else if (OB_FAIL(guard.get_table_schema(tenant_id_, index_schema->get_data_table_id(), data_table_schema))) {
711711
LOG_WARN("fail to get data table schema", K(ret), K(index_schema->get_data_table_id()));
712712
} else if (OB_UNLIKELY(nullptr == database_schema || nullptr == data_table_schema)) {
713-
ret = OB_ERR_UNEXPECTED;
714-
LOG_WARN("unexpected error, schema is nullptr", K(ret), KP(database_schema), KP(data_table_schema));
713+
if (OB_ISNULL(data_table_schema) && drop_index_arg_.is_hidden_) {
714+
task_id = -1;
715+
LOG_INFO("hidden data_table maybe removed when offline ddl is failed, skip drop",
716+
K(ret), K(index_tid), K(index_name));
717+
} else {
718+
ret = OB_ERR_UNEXPECTED;
719+
LOG_WARN("unexpected error, schema is nullptr", K(ret), KP(database_schema), KP(data_table_schema));
720+
}
715721
} else if (is_domain_index && OB_FAIL(drop_index_sql.assign(drop_index_arg_.ddl_stmt_str_))) {
716722
LOG_WARN("assign user drop index sql failed", K(ret));
717723
} else {

src/rootserver/ddl_task/ob_index_build_task.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,14 +132,18 @@ int ObIndexSSTableBuildTask::process()
132132
LOG_WARN("error unexpected, index schema must not be nullptr", K(ret), K(tenant_id_), K(dest_table_id_));
133133
} else {
134134
#ifdef ERRSIM
135-
if (OB_SUCC(ret)
136-
&& (index_schema->is_vec_ivfflat_rowkey_cid_index()
135+
if ((index_schema->is_vec_ivfflat_rowkey_cid_index()
137136
|| index_schema->is_vec_ivfpq_rowkey_cid_index()
138137
|| index_schema->is_vec_ivfsq8_rowkey_cid_index())) {
139138
ret = OB_E(EventTable::EN_VEC_INDEX_IVF_ROWKEY_CID_BUILD_ERR) OB_SUCCESS;
140139
if (OB_FAIL(ret)) {
141140
LOG_WARN("errsim ddl execute building the subtask of vector index rowkey cid table failed", KR(ret));
142141
}
142+
} else if (index_schema->is_vec_index_snapshot_data_type()) {
143+
ret = OB_E(EventTable::EN_VEC_INDEX_HNSW_SNAPSHOT_TABLE_BUILD_ERR) OB_SUCCESS;
144+
if (OB_FAIL(ret)) {
145+
LOG_WARN("errsim ddl execute building the subtask of vector index snapshot table failed", KR(ret));
146+
}
143147
}
144148
#endif
145149
if (OB_FAIL(ret)) {

src/rootserver/ddl_task/ob_rebuild_index_task.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,16 @@ int ObRebuildIndexTask::create_and_wait_rebuild_task_finish(const ObDDLTaskStatu
444444
}
445445
LOG_WARN("check ddl task finish failed", K(ret), K(index_build_task_id_));
446446
}
447+
448+
#ifdef ERRSIM
449+
if (OB_SUCC(ret)) {
450+
ret = OB_E(EventTable::EN_VEC_INDEX_HNSW_REBUILD_CREATE_ERR) OB_SUCCESS;
451+
if (OB_FAIL(ret)) {
452+
LOG_WARN("errsim ddl execute building the subtask of rebuild index failed", KR(ret));
453+
}
454+
}
455+
#endif
456+
447457
if (state_finished || OB_FAIL(ret)) {
448458
DEBUG_SYNC(REBUILD_INDEX_WAIT_CREATE_TASK_FINISH);
449459
(void)switch_status(new_status, true, ret);

src/rootserver/ddl_task/ob_vec_index_build_task.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1866,6 +1866,8 @@ int ObVecIndexBuildTask::submit_drop_vec_index_task()
18661866
const ObDatabaseSchema *database_schema = nullptr;
18671867
const ObTableSchema *data_table_schema = nullptr;
18681868

1869+
DEBUG_SYNC(BUILD_VECTOR_INDEX_SUBMIT_DROP_TASK);
1870+
18691871
obrpc::ObDropIndexArg drop_index_arg;
18701872
obrpc::ObDropIndexRes drop_index_res;
18711873
ObString index_name;

src/rootserver/ob_ddl_service.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7855,6 +7855,8 @@ int ObDDLService::alter_table_index(obrpc::ObAlterTableArg &alter_table_arg,
78557855
const ObString &data_table_name = origin_table_schema.get_table_name_str();
78567856
ret = OB_ERR_KEY_DOES_NOT_EXISTS;
78577857
LOG_USER_ERROR(OB_ERR_KEY_DOES_NOT_EXISTS, ori_index_name.length(), ori_index_name.ptr(), data_table_name.length(), data_table_name.ptr());
7858+
} else if (OB_FAIL(ObVectorIndexUtil::check_rename_rebuild_confilt(schema_guard, trans, *this, origin_table_schema, ori_index_name))) {
7859+
LOG_WARN("fail to check vector rename and rebuild confilt", K(ret), K(ori_index_name));
78587860
} else {
78597861
SMART_VAR(ObTableSchema, new_index_schema) {
78607862
if (OB_FAIL(ddl_operator.alter_table_rename_index(
@@ -27692,7 +27694,7 @@ int ObDDLService::drop_table(const ObDropTableArg &drop_table_arg, const obrpc::
2769227694
int ObDDLService::rebuild_vec_index(const ObRebuildIndexArg &arg, obrpc::ObAlterTableRes &res)
2769327695
{
2769427696
int ret = OB_SUCCESS;
27695-
LOG_DEBUG("RS start to rebuild vec index", K(arg));
27697+
LOG_INFO("RS start to rebuild vec index", K(arg));
2769627698

2769727699
if (OB_FAIL(check_inner_stat())) {
2769827700
ret = OB_INNER_STAT_ERROR;
@@ -27738,12 +27740,15 @@ int ObDDLService::rebuild_vec_index(const ObRebuildIndexArg &arg, obrpc::ObAlter
2773827740
const ObTableSchema *index_table_schema = NULL;
2773927741
ObIndexBuilder index_builder(*this);
2774027742
uint64_t tenant_data_version = 0;
27741-
if (OB_FAIL(schema_guard.get_table_schema(tenant_id, arg.index_table_id_, index_table_schema))) {
27743+
if (OB_FAIL(schema_guard.get_table_schema(tenant_id, arg.database_name_, arg.index_name_, true/*index*/, index_table_schema))) {
2774227744
LOG_WARN("fail to get table schema", K(ret), K(tenant_id), K(index_table_schema));
2774327745
} else if (OB_ISNULL(index_table_schema)) {
2774427746
ret = OB_ERR_CANT_DROP_FIELD_OR_KEY;
2774527747
LOG_WARN("index table schema should not be null", K(ret), K(arg.index_name_));
2774627748
LOG_USER_ERROR(OB_ERR_CANT_DROP_FIELD_OR_KEY, arg.index_name_.length(), arg.index_name_.ptr());
27749+
} else if (arg.is_need_check_based_schema_objects() &&
27750+
OB_FAIL(check_parallel_ddl_conflict(schema_guard, arg))) {
27751+
LOG_WARN("index table schema changed since rebuild request, parallel ddl conflict", KR(ret), K(arg));
2774727752
} else if (OB_FAIL(GET_MIN_DATA_VERSION(tenant_id, tenant_data_version))) {
2774827753
LOG_WARN("get min data version failed", K(ret), K(tenant_id));
2774927754
} else if (index_table_schema->is_in_recyclebin()) {

src/share/ob_debug_sync_point.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -640,10 +640,13 @@ class ObString;
640640
ACT(REBUILD_VEC_INDEX_WAIT_DROP_OLD_INDEX,)\
641641
ACT(REBUILD_VEC_INDEX_SWITCH_INDEX_NAME,)\
642642
ACT(BUILD_VECTOR_INDEX_PREPARE_STATUS,)\
643+
ACT(BUILD_VECTOR_INDEX_SUBMIT_DROP_TASK,)\
643644
ACT(BUILD_VECTOR_INDEX_PREPARE_ROWKEY_VID,)\
644645
ACT(BUILD_VECTOR_INDEX_PREPARE_AUX_INDEX,)\
645646
ACT(BUILD_VECTOR_INDEX_PREPARE_VID_ROWKEY,)\
646647
ACT(DROP_VECTOR_INDEX_PREPARE_STATUS,)\
648+
ACT(DROP_VECTOR_INDEX_BEFORE_DELETE_LOB_META,)\
649+
ACT(DROP_VECTOR_INDEX_AFTER_DELETE_LOB_META,)\
647650
ACT(HANDLE_VECTOR_INDEX_ASYNC_TASK,)\
648651
ACT(CS_ASYNC_VECTOR_INDEX_BEFORE_APPLY,)\
649652
ACT(CS_FETCHER_AFTER_DDL_MDS_LOG,)\

0 commit comments

Comments
 (0)