Skip to content

Commit 2abf469

Browse files
authored
draid: fix cksum errors after rebuild with degraded disks
Currently, when more than nparity disks get faulted during the rebuild, only the first nparity disks would go to the faulted state, and all the remaining disks would go to the degraded state. When a hot spare is attached to such a degraded disk for rebuild, creating the spare mirror, only that hot spare gets rebuilt, but not the degraded device. So when, later during scrub, some other attached draid spare happens to map to that spare, it will end up with a cksum error. Moreover, if the user clears the degraded disk from errors, the data won't be resilvered to it, the hot spare will be detached almost immediately, and the data that was resilvered only to it will be lost. Solution: write to all mirrored devices during rebuild, similar to traditional/healing resilvering, but only if we can verify the integrity of the data, or when it's a draid spare we are writing to, in which case we are writing to reserved spare space and there is no danger of overwriting any good data. The argument that writing only to the rebuilding draid spare vdev is faster than writing to a normal device doesn't hold since, at a specific offset being rebuilt, the draid spare will be mapped to a normal device anyway. The redundancy_draid_degraded2 automation test is also added to cover this scenario. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Andriy Tkachuk <atkachuk@wasabi.com> Closes #18414
1 parent 6692b6e commit 2abf469

8 files changed

Lines changed: 235 additions & 19 deletions

File tree

include/sys/vdev_raidz_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ typedef struct raidz_col {
119119
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
120120
uint8_t rc_force_repair:1; /* Write good data to this column */
121121
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
122+
uint8_t rc_tgt_is_dspare:1; /* The target is draid spare vdev */
122123
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
123124
int rc_shadow_devidx; /* for double write during expansion */
124125
int rc_shadow_error; /* for double write during expansion */

module/zfs/vdev_draid.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
2525
* Copyright (c) 2025, Klara, Inc.
2626
* Copyright (c) 2026, Seagate Technology, LLC.
27+
* Copyright (c) 2026, Wasabi Technologies, Inc.
2728
*/
2829

2930
#include <sys/zfs_context.h>
@@ -1414,8 +1415,7 @@ vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
14141415
if (vd == NULL)
14151416
return (B_TRUE);
14161417

1417-
return (vdev_draid_missing(vd, physical_offset,
1418-
txg, size));
1418+
return (vdev_draid_missing(vd, physical_offset, txg, size));
14191419
}
14201420

14211421
return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
@@ -2103,12 +2103,34 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
21032103
}
21042104

21052105
if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
2106+
vdev_t *svd;
2107+
21062108
if (c >= rr->rr_firstdatacol)
21072109
rr->rr_missingdata++;
21082110
else
21092111
rr->rr_missingparity++;
21102112
rc->rc_error = SET_ERROR(ESTALE);
21112113
rc->rc_skipped = 1;
2114+
2115+
/*
2116+
* If this child has draid spare attached, and that
2117+
* spare by rc_offset maps to another spare, the repair
2118+
* would go to that spare, and we want all mirrored
2119+
* children on it to be updated with the repaired data,
2120+
* even when we cannot vouch for it during rebuilds
2121+
* (which don't have checksums). Otherwise, we will have
2122+
* a lot of checksum errors on those spares during scrub.
2123+
* The worst thing that can happen in this case is that
2124+
* we will update the reserved spare column on some
2125+
* device with unverified data, which is harmless.
2126+
*/
2127+
if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
2128+
svd = vdev_draid_spare_get_child(svd,
2129+
rc->rc_offset);
2130+
if (svd && (svd->vdev_ops == &vdev_spare_ops ||
2131+
svd->vdev_ops == &vdev_replacing_ops))
2132+
rc->rc_tgt_is_dspare = 1;
2133+
}
21122134
continue;
21132135
}
21142136

module/zfs/vdev_mirror.c

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -669,23 +669,19 @@ vdev_mirror_io_start(zio_t *zio)
669669
}
670670

671671
while (children--) {
672-
mc = &mm->mm_child[c];
673-
c++;
672+
mc = &mm->mm_child[c++];
674673

675674
/*
676-
* When sequentially resilvering only issue write repair
677-
* IOs to the vdev which is being rebuilt for two reasons:
678-
* 1. The repair IO data calculated from parity has no checksum
679-
* to validate and could be incorrect. Existing data must
680-
* never be overwritten with unconfirmed data to ensure we
681-
* never lock in unrecoverable damage to the pool.
682-
* 2. Performance is limited by the slowest child device. We
683-
* don't want a slower device to limit the rebuild rate for
684-
* faster replacement devices such as distributed spares.
675+
* When sequentially resilvering and the integrity of the data
676+
* is speculative (ZIO_FLAG_SPECULATIVE), issue write repair IOs
677+
* only to the vdev which is being rebuilt. Existing data on
678+
* other children must never be overwritten with unconfirmed
679+
* data to avoid unrecoverable damage to the pool.
685680
*/
686681
if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
687682
(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
688683
!(zio->io_flags & ZIO_FLAG_SCRUB) &&
684+
(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
689685
mm->mm_rebuilding && !mc->mc_rebuilding) {
690686
continue;
691687
}

module/zfs/vdev_raidz.c

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
2626
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
2727
* Copyright (c) 2025, Klara, Inc.
28+
* Copyright (c) 2026, Wasabi Technologies, Inc.
2829
*/
2930

3031
#include <sys/zfs_context.h>
@@ -3104,6 +3105,7 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
31043105
int parity_errors = 0;
31053106
int parity_untried = 0;
31063107
int data_errors = 0;
3108+
zio_flag_t add_flags = 0;
31073109

31083110
ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
31093111

@@ -3134,10 +3136,30 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
31343136
* Note that we also regenerate parity when resilvering so we
31353137
* can write it out to failed devices later.
31363138
*/
3137-
if (parity_errors + parity_untried <
3138-
rr->rr_firstdatacol - data_errors ||
3139-
(zio->io_flags & ZIO_FLAG_RESILVER)) {
3139+
boolean_t parity_verify = (parity_errors + parity_untried) <
3140+
(rr->rr_firstdatacol - data_errors);
3141+
if (parity_verify || (zio->io_flags & ZIO_FLAG_RESILVER)) {
31403142
int n = raidz_parity_verify(zio, rr);
3143+
/*
3144+
* In Reed-Solomon encoding, if we have ndata+1 columns and
3145+
* the parity doesn't match, it means the data integrity is
3146+
* compromised. We shouldn't try to repair anything in this
3147+
* case.
3148+
*/
3149+
if (parity_verify && n > 0 &&
3150+
zio->io_priority == ZIO_PRIORITY_REBUILD)
3151+
return;
3152+
/*
3153+
* If we have only ndata columns, the data integrity will
3154+
* be checked by the checksums normally, but not in case
3155+
* of rebuild when we don't have checksums. In this case,
3156+
* we add ZIO_FLAG_SPECULATIVE and try not to spread
3157+
* unverified data. For example, when the target vdev happens
3158+
* to be the mirroring spare vdev, we would repair only that
3159+
* child in it which is being rebuilt.
3160+
*/
3161+
if (!parity_verify && zio->io_priority == ZIO_PRIORITY_REBUILD)
3162+
add_flags |= ZIO_FLAG_SPECULATIVE;
31413163
unexpected_errors += n;
31423164
}
31433165

@@ -3163,13 +3185,27 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
31633185
*/
31643186
ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
31653187

3188+
/*
3189+
* When the target vdev is draid spare, we should clear
3190+
* ZIO_FLAG_SPECULATIVE. First, if that draid spare maps
3191+
* to another spare having an online/degraded disk, that
3192+
* disk must be repaired also. Otherwise, the scrub will
3193+
* detect a lot of cksum errors later. Second, since it
3194+
* is draid spare, there is no harm in updating its
3195+
* content on any vdev it maps to because the space is
3196+
* reserved as a spare anyway.
3197+
*/
3198+
zio_flag_t aflags = add_flags;
3199+
if (rc->rc_tgt_is_dspare)
3200+
aflags &= ~ZIO_FLAG_SPECULATIVE;
3201+
31663202
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
31673203
rc->rc_offset, rc->rc_abd, rc->rc_size,
31683204
ZIO_TYPE_WRITE,
31693205
zio->io_priority == ZIO_PRIORITY_REBUILD ?
31703206
ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
31713207
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
3172-
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
3208+
ZIO_FLAG_SELF_HEAL : 0) | aflags, NULL, NULL));
31733209
}
31743210
}
31753211

module/zfs/zio.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1662,9 +1662,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
16621662

16631663
/*
16641664
* If we've decided to do a repair, the write is not speculative --
1665-
* even if the original read was.
1665+
* even if the original read was. Rebuild is an exception since we
1666+
* cannot always ensure its data integrity.
16661667
*/
1667-
if (flags & ZIO_FLAG_IO_REPAIR)
1668+
if ((flags & ZIO_FLAG_IO_REPAIR) &&
1669+
pio->io_priority != ZIO_PRIORITY_REBUILD)
16681670
flags &= ~ZIO_FLAG_SPECULATIVE;
16691671

16701672
/*

tests/runfiles/common.run

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,7 @@ timeout = 1200
919919
tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
920920
'redundancy_draid3', 'redundancy_draid_width', 'redundancy_draid_damaged1',
921921
'redundancy_draid_damaged2', 'redundancy_draid_degraded1',
922+
'redundancy_draid_degraded2',
922923
'redundancy_draid_spare1', 'redundancy_draid_spare2',
923924
'redundancy_draid_spare3', 'redundancy_draid_spare4', 'redundancy_mirror',
924925
'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1912,6 +1912,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
19121912
functional/redundancy/redundancy_draid_damaged1.ksh \
19131913
functional/redundancy/redundancy_draid_damaged2.ksh \
19141914
functional/redundancy/redundancy_draid_degraded1.ksh \
1915+
functional/redundancy/redundancy_draid_degraded2.ksh \
19151916
functional/redundancy/redundancy_draid.ksh \
19161917
functional/redundancy/redundancy_draid_spare1.ksh \
19171918
functional/redundancy/redundancy_draid_spare2.ksh \
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
#!/bin/ksh -p
2+
# SPDX-License-Identifier: CDDL-1.0
3+
#
4+
# CDDL HEADER START
5+
#
6+
# The contents of this file are subject to the terms of the
7+
# Common Development and Distribution License (the "License").
8+
# You may not use this file except in compliance with the License.
9+
#
10+
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11+
# or https://opensource.org/licenses/CDDL-1.0.
12+
# See the License for the specific language governing permissions
13+
# and limitations under the License.
14+
#
15+
# When distributing Covered Code, include this CDDL HEADER in each
16+
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17+
# If applicable, add the following below this CDDL HEADER, with the
18+
# fields enclosed by brackets "[]" replaced with your own identifying
19+
# information: Portions Copyright [yyyy] [name of copyright owner]
20+
#
21+
# CDDL HEADER END
22+
#
23+
24+
#
25+
# Copyright (c) 2026 by Lawrence Livermore National Security, LLC.
26+
# Copyright (c) 2026 by Wasabi Technologies, Inc.
27+
#
28+
29+
. $STF_SUITE/include/libtest.shlib
30+
. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
31+
32+
#
33+
# DESCRIPTION:
34+
# When sequentially resilvering a dRAID pool with multiple vdevs
35+
# and N faulted vdevs, where N=parity, ensure that when another leaf
36+
# is marked degraded the pool can still be sequentially resilvered
37+
# without introducing new checksum errors. Note we've exhausted
38+
# the available redundancy so no silent correction can be tolerated.
39+
#
40+
# This test is very similar to redundancy_draid_degraded1 and is
41+
# based on it. The difference is that 1) we always have some faulted
42+
# vdev which is already resilvered, and 2) we resilver the most
43+
# recently faulted, but marked degraded due to redundancy exhaustion,
44+
# vdev also.
45+
#
46+
# STRATEGY:
47+
# 1. Create block device files for the test draid pool
48+
# 2. For each parity value [1..3]
49+
# - create draid pool
50+
# - fill it with some directories/files
51+
# - fault one vdev and resilver it
52+
# - fault N=parity vdevs eliminating any redundancy
53+
# - force fault an additional vdev causing it to be degraded
54+
# - replace faulted vdevs using a sequential resilver.
55+
# The minimum pool redundancy requirements are met so
56+
# reconstruction is possible when reading from all online vdevs.
57+
# - verify that the draid spare was correctly reconstructed and
58+
# no checksum errors were introduced.
59+
# - destroy the draid pool
60+
#
61+
62+
typeset -r devs=13
63+
typeset -r dev_size_mb=512
64+
65+
typeset -a disks
66+
67+
prefetch_disable=$(get_tunable PREFETCH_DISABLE)
68+
rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)
69+
scan_suspend_progress=$(get_tunable SCAN_SUSPEND_PROGRESS)
70+
71+
function cleanup
72+
{
73+
poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
74+
75+
for i in {0..$devs}; do
76+
rm -f "$TEST_BASE_DIR/dev-$i"
77+
done
78+
79+
set_tunable32 PREFETCH_DISABLE $prefetch_disable
80+
set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
81+
set_tunable32 SCAN_SUSPEND_PROGRESS $scan_suspend_progress
82+
}
83+
84+
function test_sequential_resilver # <pool> <parity> <dir>
85+
{
86+
typeset pool=$1
87+
typeset nparity=$2
88+
typeset dir=$3
89+
90+
# Fault N=parity devices
91+
for (( i=0; i<$nparity; i++ )); do
92+
log_must zpool offline -f $pool $dir/dev-$i
93+
done
94+
95+
# Parity is exhausted, faulting another device marks it degraded
96+
log_must zpool offline -f $pool $dir/dev-$nparity
97+
98+
# Replace all faulted vdevs with distributed spares
99+
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
100+
for (( i=0; i<$((nparity+1)); i++ )); do
101+
spare=draid${nparity}-0-$i
102+
log_must zpool replace -fs $pool $dir/dev-$i $spare
103+
done
104+
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
105+
106+
log_must zpool wait -t resilver $pool
107+
108+
log_must zpool scrub -w $pool
109+
log_must zpool status $pool
110+
111+
log_must check_pool_status $pool "scan" "repaired 0B"
112+
log_must check_pool_status $pool "errors" "No known data errors"
113+
log_must check_pool_status $pool "scan" "with 0 errors"
114+
}
115+
116+
log_onexit cleanup
117+
118+
log_must set_tunable32 PREFETCH_DISABLE 1
119+
log_must set_tunable32 REBUILD_SCRUB_ENABLED 0
120+
121+
# Disk files which will be used by pool
122+
for i in {0..$(($devs - 1))}; do
123+
device=$TEST_BASE_DIR/dev-$i
124+
log_must truncate -s ${dev_size_mb}M $device
125+
disks[${#disks[*]}+1]=$device
126+
done
127+
128+
# Disk file which will be attached
129+
log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs
130+
131+
for nparity in 3; do
132+
raid=draid${nparity}:$((nparity+2))s
133+
dir=$TEST_BASE_DIR
134+
135+
log_must zpool create -O compression=off -f -o cachefile=none $TESTPOOL $raid ${disks[@]}
136+
log_must zfs set primarycache=metadata $TESTPOOL
137+
138+
log_must zfs create $TESTPOOL/fs
139+
log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R
140+
141+
log_must zfs create -o compress=on $TESTPOOL/fs2
142+
log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R
143+
144+
log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
145+
log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R
146+
147+
log_must zpool export $TESTPOOL
148+
log_must zpool import -o cachefile=none -d $dir $TESTPOOL
149+
150+
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
151+
152+
test_sequential_resilver $TESTPOOL $nparity $dir
153+
154+
log_must zpool destroy "$TESTPOOL"
155+
done
156+
157+
log_pass "draid degraded device(s) test succeeded."

0 commit comments

Comments
 (0)