Fix a deadlock during ALTER SUBSCRIPTION ... DROP PUBLICATION.
authorAmit Kapila <akapila@postgresql.org>
Fri, 1 Aug 2025 06:40:06 +0000 (06:40 +0000)
committerAmit Kapila <akapila@postgresql.org>
Fri, 1 Aug 2025 06:40:06 +0000 (06:40 +0000)
A deadlock can occur when the DDL command and the apply worker acquire
catalog locks in different orders while dropping replication origins.

The issue is rare in PG16 and higher branches because, in most cases, the
tablesync worker performs the origin drop in those branches, and its
locking sequence does not conflict with DDL operations.

This patch ensures consistent lock acquisition to prevent such deadlocks.

As per buildfarm.

Reported-by: Alexander Lakhin <exclusion@gmail.com>
Author: Ajin Cherian <itsajin@gmail.com>
Reviewed-by: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Reviewed-by: vignesh C <vignesh21@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Backpatch-through: 14, where it was introduced
Discussion: https://postgr.es/m/bab95e12-6cc5-4ebb-80a8-3e41956aa297@gmail.com

src/backend/catalog/pg_subscription.c
src/backend/replication/logical/tablesync.c
src/include/catalog/pg_subscription_rel.h

index d07f88ce2807a3bc5e8f08324872e2f58e95475a..7a1df9509ed2adbea1df32ca5e5f5b6fc8391cf6 100644 (file)
@@ -273,8 +273,8 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state,
  * Update the state of a subscription table.
  */
 void
-UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
-                          XLogRecPtr sublsn)
+UpdateSubscriptionRelStateEx(Oid subid, Oid relid, char state,
+                            XLogRecPtr sublsn, bool already_locked)
 {
    Relation    rel;
    HeapTuple   tup;
@@ -282,9 +282,24 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
    Datum       values[Natts_pg_subscription_rel];
    bool        replaces[Natts_pg_subscription_rel];
 
-   LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+   if (already_locked)
+   {
+#ifdef USE_ASSERT_CHECKING
+       LOCKTAG     tag;
 
-   rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+       Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId,
+                                         RowExclusiveLock, true));
+       SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0);
+       Assert(LockHeldByMe(&tag, AccessShareLock));
+#endif
+
+       rel = table_open(SubscriptionRelRelationId, NoLock);
+   }
+   else
+   {
+       LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
+       rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+   }
 
    /* Try finding existing mapping. */
    tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP,
@@ -318,6 +333,16 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
    table_close(rel, NoLock);
 }
 
+/*
+ * Update the state of a subscription table.
+ */
+void
+UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
+                          XLogRecPtr sublsn)
+{
+   UpdateSubscriptionRelStateEx(subid, relid, state, sublsn, false);
+}
+
 /*
  * Get state of subscription table.
  *
index ca88133ed1493c0eedf6246a1fd052a3698d9a1a..55af49b3230a7e553e044d57bd1331027a8ba3f2 100644 (file)
@@ -425,6 +425,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
    ListCell   *lc;
    bool        started_tx = false;
    bool        should_exit = false;
+   Relation    rel = NULL;
 
    Assert(!IsTransactionState());
 
@@ -492,7 +493,16 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                 * worker to remove the origin tracking as if there is any
                 * error while dropping we won't restart it to drop the
                 * origin. So passing missing_ok = true.
+                *
+                * Lock the subscription and origin in the same order as we
+                * are doing during DDL commands to avoid deadlocks. See
+                * AlterSubscription_refresh.
                 */
+               LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid,
+                                0, AccessShareLock);
+               if (!rel)
+                   rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
+
                ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid,
                                                   rstate->relid,
                                                   originname,
@@ -502,9 +512,9 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                /*
                 * Update the state to READY only after the origin cleanup.
                 */
-               UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
-                                          rstate->relid, rstate->state,
-                                          rstate->lsn);
+               UpdateSubscriptionRelStateEx(MyLogicalRepWorker->subid,
+                                            rstate->relid, rstate->state,
+                                            rstate->lsn, true);
            }
        }
        else
@@ -555,7 +565,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                         * This is required to avoid any undetected deadlocks
                         * due to any existing lock as deadlock detector won't
                         * be able to detect the waits on the latch.
+                        *
+                        * Also close any tables prior to the commit.
                         */
+                       if (rel)
+                       {
+                           table_close(rel, NoLock);
+                           rel = NULL;
+                       }
                        CommitTransactionCommand();
                        pgstat_report_stat(false);
                    }
@@ -621,6 +638,10 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
        }
    }
 
+   /* Close table if opened */
+   if (rel)
+       table_close(rel, NoLock);
+
    if (started_tx)
    {
        /*
index 60a2bcca2382fe057f28057412c9840cf6b5861e..9bced6697e9b78e9a7fbc83f63fffcad79607f20 100644 (file)
@@ -84,6 +84,8 @@ extern void AddSubscriptionRelState(Oid subid, Oid relid, char state,
                                    XLogRecPtr sublsn);
 extern void UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
                                       XLogRecPtr sublsn);
+extern void UpdateSubscriptionRelStateEx(Oid subid, Oid relid, char state,
+                                        XLogRecPtr sublsn, bool already_locked);
 extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn);
 extern void RemoveSubscriptionRel(Oid subid, Oid relid);