amcheck: Distinguish interrupted page deletion from corruption.
authorNoah Misch <noah@leadboat.com>
Mon, 30 Oct 2023 21:46:05 +0000 (14:46 -0700)
committerNoah Misch <noah@leadboat.com>
Mon, 30 Oct 2023 21:46:09 +0000 (14:46 -0700)
This prevents false-positive reports about "the first child of leftmost
target page is not leftmost of its level", "block %u is not leftmost"
and "left link/right link pair".  They appeared if amcheck ran before
VACUUM cleaned things, after a cluster exited recovery between the
first-stage and second-stage WAL records of a deletion.  Back-patch to
v11 (all supported versions).

Reviewed by Peter Geoghegan.

Discussion: https://postgr.es/m/20231005025232.c7.nmisch@google.com

contrib/amcheck/verify_nbtree.c

index 5b75c891efcd68ed52f914d99bc474ac25946498..ef2972d52392a0ffdce5741e83cc7d1899d0a23a 100644 (file)
@@ -127,6 +127,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
                     bool readonly, bool heapallindexed);
 static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
                             BtreeLevel level);
+static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
+                                          BlockNumber start,
+                                          BTPageOpaque start_opaque);
 static void bt_target_page_check(BtreeCheckState *state);
 static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
 static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
@@ -716,7 +719,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
             */
            if (state->readonly)
            {
-               if (!P_LEFTMOST(opaque))
+               if (!bt_leftmost_ignoring_half_dead(state, current, opaque))
                    ereport(ERROR,
                            (errcode(ERRCODE_INDEX_CORRUPTED),
                             errmsg("block %u is not leftmost in index \"%s\"",
@@ -769,10 +772,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
        }
 
        /*
-        * readonly mode can only ever land on live pages and half-dead pages,
-        * so sibling pointers should always be in mutual agreement
+        * Sibling links should be in mutual agreement.  There arises
+        * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
+        * of the parent's low-key downlink is half-dead.  (A half-dead page
+        * has no downlink from its parent.)  Under heavyweight locking, the
+        * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
         */
-       if (state->readonly && opaque->btpo_prev != leftcurrent)
+       if (state->readonly &&
+           opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("left link/right link pair in index \"%s\" not in agreement",
@@ -822,6 +829,67 @@ nextpage:
    return nextleveldown;
 }
 
+/*
+ * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of
+ * half-dead, sibling-linked pages to the left.  If a half-dead page appears
+ * under state->readonly, the database exited recovery between the first-stage
+ * and second-stage WAL records of a deletion.
+ */
+static bool
+bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
+                              BlockNumber start,
+                              BTPageOpaque start_opaque)
+{
+   BlockNumber reached = start_opaque->btpo_prev,
+               reached_from = start;
+   bool        all_half_dead = true;
+
+   /*
+    * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
+    * potentially observe nbtree/README "Page deletion and backwards scans".
+    */
+   Assert(state->readonly);
+
+   while (reached != P_NONE && all_half_dead)
+   {
+       Page        page = palloc_btree_page(state, reached);
+       BTPageOpaque reached_opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+       CHECK_FOR_INTERRUPTS();
+
+       /*
+        * Try to detect btpo_prev circular links.  _bt_unlink_halfdead_page()
+        * writes that side-links will continue to point to the siblings.
+        * Check btpo_next for that property.
+        */
+       all_half_dead = P_ISHALFDEAD(reached_opaque) &&
+           reached != start &&
+           reached != reached_from &&
+           reached_opaque->btpo_next == reached_from;
+       if (all_half_dead)
+       {
+           XLogRecPtr  pagelsn = PageGetLSN(page);
+
+           /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
+           ereport(DEBUG1,
+                   (errcode(ERRCODE_NO_DATA),
+                    errmsg_internal("harmless interrupted page deletion detected in index \"%s\"",
+                                    RelationGetRelationName(state->rel)),
+                    errdetail_internal("Block=%u right block=%u page lsn=%X/%X.",
+                                       reached, reached_from,
+                                       (uint32) (pagelsn >> 32),
+                                       (uint32) pagelsn)));
+
+           reached_from = reached;
+           reached = reached_opaque->btpo_prev;
+       }
+
+       pfree(page);
+   }
+
+   return all_half_dead;
+}
+
 /*
  * Function performs the following checks on target page, or pages ancillary to
  * target page: