A deadlock can occur when the DDL command and the apply worker acquire
catalog locks in different orders while dropping replication origins.
The issue is rare in PG16 and higher branches because, in most cases, the
tablesync worker performs the origin drop in those branches, and its
locking sequence does not conflict with DDL operations.
This patch ensures consistent lock acquisition to prevent such deadlocks.
As per buildfarm.
Reported-by: Alexander Lakhin <exclusion@gmail.com>
Author: Ajin Cherian <itsajin@gmail.com>
Reviewed-by: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Reviewed-by: vignesh C <vignesh21@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Backpatch-through: 14, where it was introduced
Discussion: https://postgr.es/m/
bab95e12-6cc5-4ebb-80a8-
3e41956aa297@gmail.com
* Update the state of a subscription table.
*/
void
-UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
- XLogRecPtr sublsn)
+UpdateSubscriptionRelStateEx(Oid subid, Oid relid, char state,
+ XLogRecPtr sublsn, bool already_locked)
{
Relation rel;
HeapTuple tup;
Datum values[Natts_pg_subscription_rel];
bool replaces[Natts_pg_subscription_rel];
- LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+ if (already_locked)
+ {
+#ifdef USE_ASSERT_CHECKING
+ LOCKTAG tag;
- rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+ Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId,
+ RowExclusiveLock, true));
+ SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0);
+ Assert(LockHeldByMe(&tag, AccessShareLock));
+#endif
+
+ rel = table_open(SubscriptionRelRelationId, NoLock);
+ }
+ else
+ {
+ LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+ rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+ }
/* Try finding existing mapping. */
tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP,
table_close(rel, NoLock);
}
+/*
+ * Update the state of a subscription table.
+ */
+void
+UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
+ XLogRecPtr sublsn)
+{
+ UpdateSubscriptionRelStateEx(subid, relid, state, sublsn, false);
+}
+
/*
* Get state of subscription table.
*
ListCell *lc;
bool started_tx = false;
bool should_exit = false;
+ Relation rel = NULL;
Assert(!IsTransactionState());
* worker to remove the origin tracking as if there is any
* error while dropping we won't restart it to drop the
* origin. So passing missing_ok = true.
+ *
+ * Lock the subscription and origin in the same order as we
+ * are doing during DDL commands to avoid deadlocks. See
+ * AlterSubscription_refresh.
*/
+ LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid,
+ 0, AccessShareLock);
+ if (!rel)
+ rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+
ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid,
rstate->relid,
originname,
/*
* Update the state to READY only after the origin cleanup.
*/
- UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
- rstate->relid, rstate->state,
- rstate->lsn);
+ UpdateSubscriptionRelStateEx(MyLogicalRepWorker->subid,
+ rstate->relid, rstate->state,
+ rstate->lsn, true);
}
}
else
* This is required to avoid any undetected deadlocks
* due to any existing lock as deadlock detector won't
* be able to detect the waits on the latch.
+ *
+ * Also close any tables prior to the commit.
*/
+ if (rel)
+ {
+ table_close(rel, NoLock);
+ rel = NULL;
+ }
CommitTransactionCommand();
pgstat_report_stat(false);
}
}
}
+ /* Close table if opened */
+ if (rel)
+ table_close(rel, NoLock);
+
if (started_tx)
{
/*
XLogRecPtr sublsn);
extern void UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
XLogRecPtr sublsn);
+extern void UpdateSubscriptionRelStateEx(Oid subid, Oid relid, char state,
+ XLogRecPtr sublsn, bool already_locked);
extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn);
extern void RemoveSubscriptionRel(Oid subid, Oid relid);