Skip to content

Commit 2abf469

Browse files
authored
draid: fix cksum errors after rebuild with degraded disks
Currently, when more than nparity disks get faulted during the rebuild, only the first nparity disks would go to the faulted state, and all the remaining disks would go to the degraded state. When a hot spare is attached to such a degraded disk for rebuild, creating the spare mirror, only that hot spare gets rebuilt, but not the degraded device. So when, later during scrub, some other attached draid spare happens to map to that spare, it will end up with a cksum error. Moreover, if the user clears the degraded disk from errors, the data won't be resilvered to it, the hot spare will be detached almost immediately, and the data that was resilvered only to it will be lost. Solution: write to all mirrored devices during rebuild, similar to traditional/healing resilvering, but only if we can verify the integrity of the data, or when it's a draid spare we are writing to, in which case we are writing to reserved spare space and there is no danger of overwriting any good data. The argument that writing only to the rebuilding draid spare vdev is faster than writing to a normal device doesn't hold since, at a specific offset being rebuilt, the draid spare will be mapped to a normal device anyway. The redundancy_draid_degraded2 automation test is also added to cover this scenario. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Andriy Tkachuk <atkachuk@wasabi.com> Closes #18414
1 parent 6692b6e commit 2abf469

8 files changed

Lines changed: 235 additions & 19 deletions

File tree

include/sys/vdev_raidz_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ typedef struct raidz_col {
119119
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
120120
uint8_t rc_force_repair:1; /* Write good data to this column */
121121
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
122+
uint8_t rc_tgt_is_dspare:1; /* The target is draid spare vdev */
122123
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
123124
int rc_shadow_devidx; /* for double write during expansion */
124125
int rc_shadow_error; /* for double write during expansion */

module/zfs/vdev_draid.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
2525
* Copyright (c) 2025, Klara, Inc.
2626
* Copyright (c) 2026, Seagate Technology, LLC.
27+
* Copyright (c) 2026, Wasabi Technologies, Inc.
2728
*/
2829

2930
#include <sys/zfs_context.h>
@@ -1414,8 +1415,7 @@ vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
14141415
if (vd == NULL)
14151416
return (B_TRUE);
14161417

1417-
return (vdev_draid_missing(vd, physical_offset,
1418-
txg, size));
1418+
return (vdev_draid_missing(vd, physical_offset, txg, size));
14191419
}
14201420

14211421
return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
@@ -2103,12 +2103,34 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
21032103
}
21042104

21052105
if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
2106+
vdev_t *svd;
2107+
21062108
if (c >= rr->rr_firstdatacol)
21072109
rr->rr_missingdata++;
21082110
else
21092111
rr->rr_missingparity++;
21102112
rc->rc_error = SET_ERROR(ESTALE);
21112113
rc->rc_skipped = 1;
2114+
2115+
/*
2116+
* If this child has draid spare attached, and that
2117+
* spare by rc_offset maps to another spare, the repair
2118+
* would go to that spare, and we want all mirrored
2119+
* children on it to be updated with the repaired data,
2120+
* even when we cannot vouch for it during rebuilds
2121+
* (which don't have checksums). Otherwise, we will have
2122+
* a lot of checksum errors on those spares during scrub.
2123+
* The worst thing that can happen in this case is that
2124+
* we will update the reserved spare column on some
2125+
* device with unverified data, which is harmless.
2126+
*/
2127+
if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
2128+
svd = vdev_draid_spare_get_child(svd,
2129+
rc->rc_offset);
2130+
if (svd && (svd->vdev_ops == &vdev_spare_ops ||
2131+
svd->vdev_ops == &vdev_replacing_ops))
2132+
rc->rc_tgt_is_dspare = 1;
2133+
}
21122134
continue;
21132135
}
21142136

module/zfs/vdev_mirror.c

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -669,23 +669,19 @@ vdev_mirror_io_start(zio_t *zio)
669669
}
670670

671671
while (children--) {
672-
mc = &mm->mm_child[c];
673-
c++;
672+
mc = &mm->mm_child[c++];
674673

675674
/*
676-
* When sequentially resilvering only issue write repair
677-
* IOs to the vdev which is being rebuilt for two reasons:
678-
* 1. The repair IO data calculated from parity has no checksum
679-
* to validate and could be incorrect. Existing data must
680-
* never be overwritten with unconfirmed data to ensure we
681-
* never lock in unrecoverable damage to the pool.
682-
* 2. Performance is limited by the slowest child device. We
683-
* don't want a slower device to limit the rebuild rate for
684-
* faster replacement devices such as distributed spares.
675+
* When sequentially resilvering and the integrity of the data
676+
* is speculative (ZIO_FLAG_SPECULATIVE), issue write repair IOs
677+
* only to the vdev which is being rebuilt. Existing data on
678+
* other children must never be overwritten with unconfirmed
679+
* data to avoid unrecoverable damage to the pool.
685680
*/
686681
if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
687682
(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
688683
!(zio->io_flags & ZIO_FLAG_SCRUB) &&
684+
(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
689685
mm->mm_rebuilding && !mc->mc_rebuilding) {
690686
continue;
691687
}

module/zfs/vdev_raidz.c

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
2626
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
2727
* Copyright (c) 2025, Klara, Inc.
28+
* Copyright (c) 2026, Wasabi Technologies, Inc.
2829
*/
2930

3031
#include <sys/zfs_context.h>
@@ -3104,6 +3105,7 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
31043105
int parity_errors = 0;
31053106
int parity_untried = 0;
31063107
int data_errors = 0;
3108+
zio_flag_t add_flags = 0;
31073109

31083110
ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
31093111

@@ -3134,10 +3136,30 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
31343136
* Note that we also regenerate parity when resilvering so we
31353137
* can write it out to failed devices later.
31363138
*/
3137-
if (parity_errors + parity_untried <
3138-
rr->rr_firstdatacol - data_errors ||
3139-
(zio->io_flags & ZIO_FLAG_RESILVER)) {
3139+
boolean_t parity_verify = (parity_errors + parity_untried) <
3140+
(rr->rr_firstdatacol - data_errors);
3141+
if (parity_verify || (zio->io_flags & ZIO_FLAG_RESILVER)) {
31403142
int n = raidz_parity_verify(zio, rr);
3143+
/*
3144+
* In Reed-Solomon encoding, if we have ndata+1 columns and
3145+
* the parity doesn't match, it means the data integrity is
3146+
* compromised. We shouldn't try to repair anything in this
3147+
* case.
3148+
*/
3149+
if (parity_verify && n > 0 &&
3150+
zio->io_priority == ZIO_PRIORITY_REBUILD)
3151+
return;
3152+
/*
3153+
* If we have only ndata columns, the data integrity will
3154+
* be checked by the checksums normally, but not in case
3155+
* of rebuild when we don't have checksums. In this case,
3156+
* we add ZIO_FLAG_SPECULATIVE and try not to spread
3157+
* unverified data. For example, when the target vdev happens
3158+
* to be the mirroring spare vdev, we would repair only that
3159+
* child in it which is being rebuilt.
3160+
*/
3161+
if (!parity_verify && zio->io_priority == ZIO_PRIORITY_REBUILD)
3162+
add_flags |= ZIO_FLAG_SPECULATIVE;
31413163
unexpected_errors += n;
31423164
}
31433165

@@ -3163,13 +3185,27 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
31633185
*/
31643186
ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
31653187

3188+
/*
3189+
* When the target vdev is draid spare, we should clear
3190+
* ZIO_FLAG_SPECULATIVE. First, if that draid spare maps
3191+
* to another spare having an online/degraded disk, that
3192+
* disk must be repaired also. Otherwise, the scrub will
3193+
* detect a lot of cksum errors later. Second, since it
3194+
* is draid spare, there is no harm in updating its
3195+
* content on any vdev it maps to because the space is
3196+
* reserved as a spare anyway.
3197+
*/
3198+
zio_flag_t aflags = add_flags;
3199+
if (rc->rc_tgt_is_dspare)
3200+
aflags &= ~ZIO_FLAG_SPECULATIVE;
3201+
31663202
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
31673203
rc->rc_offset, rc->rc_abd, rc->rc_size,
31683204
ZIO_TYPE_WRITE,
31693205
zio->io_priority == ZIO_PRIORITY_REBUILD ?
31703206
ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
31713207
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
3172-
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
3208+
ZIO_FLAG_SELF_HEAL : 0) | aflags, NULL, NULL));
31733209
}
31743210
}
31753211

module/zfs/zio.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1662,9 +1662,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
16621662

16631663
/*
16641664
* If we've decided to do a repair, the write is not speculative --
1665-
* even if the original read was.
1665+
* even if the original read was. Rebuild is an exception since we
1666+
* cannot always ensure its data integrity.
16661667
*/
1667-
if (flags & ZIO_FLAG_IO_REPAIR)
1668+
if ((flags & ZIO_FLAG_IO_REPAIR) &&
1669+
pio->io_priority != ZIO_PRIORITY_REBUILD)
16681670
flags &= ~ZIO_FLAG_SPECULATIVE;
16691671

16701672
/*

tests/runfiles/common.run

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,7 @@ timeout = 1200
919919
tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
920920
'redundancy_draid3', 'redundancy_draid_width', 'redundancy_draid_damaged1',
921921
'redundancy_draid_damaged2', 'redundancy_draid_degraded1',
922+
'redundancy_draid_degraded2',
922923
'redundancy_draid_spare1', 'redundancy_draid_spare2',
923924
'redundancy_draid_spare3', 'redundancy_draid_spare4', 'redundancy_mirror',
924925
'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1912,6 +1912,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
19121912
functional/redundancy/redundancy_draid_damaged1.ksh \
19131913
functional/redundancy/redundancy_draid_damaged2.ksh \
19141914
functional/redundancy/redundancy_draid_degraded1.ksh \
1915+
functional/redundancy/redundancy_draid_degraded2.ksh \
19151916
functional/redundancy/redundancy_draid.ksh \
19161917
functional/redundancy/redundancy_draid_spare1.ksh \
19171918
functional/redundancy/redundancy_draid_spare2.ksh \
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
#!/bin/ksh -p
2+
# SPDX-License-Identifier: CDDL-1.0
3+
#
4+
# CDDL HEADER START
5+
#
6+
# The contents of this file are subject to the terms of the
7+
# Common Development and Distribution License (the "License").
8+
# You may not use this file except in compliance with the License.
9+
#
10+
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11+
# or https://opensource.org/licenses/CDDL-1.0.
12+
# See the License for the specific language governing permissions
13+
# and limitations under the License.
14+
#
15+
# When distributing Covered Code, include this CDDL HEADER in each
16+
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17+
# If applicable, add the following below this CDDL HEADER, with the
18+
# fields enclosed by brackets "[]" replaced with your own identifying
19+
# information: Portions Copyright [yyyy] [name of copyright owner]
20+
#
21+
# CDDL HEADER END
22+
#
23+
24+
#
25+
# Copyright (c) 2026 by Lawrence Livermore National Security, LLC.
26+
# Copyright (c) 2026 by Wasabi Technologies, Inc.
27+
#
28+
29+
. $STF_SUITE/include/libtest.shlib
30+
. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
31+
32+
#
33+
# DESCRIPTION:
34+
# When sequentially resilvering a dRAID pool with multiple vdevs
35+
# and N faulted vdevs, where N=parity, ensure that when another leaf
36+
# is marked degraded the pool can still be sequentially resilvered
37+
# without introducing new checksum errors. Note we've exhausted
38+
# the available redundancy so no silent correction can be tolerated.
39+
#
40+
# This test is very similar to redundancy_draid_degraded1 and is
41+
# based on it. The difference is that 1) we always have some faulted
42+
# vdev which is already resilvered, and 2) we resilver the most
43+
# recently faulted, but marked degraded due to redundancy exhaustion,
44+
# vdev also.
45+
#
46+
# STRATEGY:
47+
# 1. Create block device files for the test draid pool
48+
# 2. For each parity value [1..3]
49+
# - create draid pool
50+
# - fill it with some directories/files
51+
# - fault one vdev and resilver it
52+
# - fault N=parity vdevs eliminating any redundancy
53+
# - force fault an additional vdev causing it to be degraded
54+
# - replace faulted vdevs using a sequential resilver.
55+
# The minimum pool redundancy requirements are met so
56+
# reconstruction is possible when reading from all online vdevs.
57+
# - verify that the draid spare was correctly reconstructed and
58+
# no checksum errors were introduced.
59+
# - destroy the draid pool
60+
#
61+
62+
typeset -r devs=13
63+
typeset -r dev_size_mb=512
64+
65+
typeset -a disks
66+
67+
prefetch_disable=$(get_tunable PREFETCH_DISABLE)
68+
rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)
69+
scan_suspend_progress=$(get_tunable SCAN_SUSPEND_PROGRESS)
70+
71+
function cleanup
72+
{
73+
poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
74+
75+
for i in {0..$devs}; do
76+
rm -f "$TEST_BASE_DIR/dev-$i"
77+
done
78+
79+
set_tunable32 PREFETCH_DISABLE $prefetch_disable
80+
set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
81+
set_tunable32 SCAN_SUSPEND_PROGRESS $scan_suspend_progress
82+
}
83+
84+
function test_sequential_resilver # <pool> <parity> <dir>
85+
{
86+
typeset pool=$1
87+
typeset nparity=$2
88+
typeset dir=$3
89+
90+
# Fault N=parity devices
91+
for (( i=0; i<$nparity; i++ )); do
92+
log_must zpool offline -f $pool $dir/dev-$i
93+
done
94+
95+
# Parity is exhausted, faulting another device marks it degraded
96+
log_must zpool offline -f $pool $dir/dev-$nparity
97+
98+
# Replace all faulted vdevs with distributed spares
99+
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
100+
for (( i=0; i<$((nparity+1)); i++ )); do
101+
spare=draid${nparity}-0-$i
102+
log_must zpool replace -fs $pool $dir/dev-$i $spare
103+
done
104+
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
105+
106+
log_must zpool wait -t resilver $pool
107+
108+
log_must zpool scrub -w $pool
109+
log_must zpool status $pool
110+
111+
log_must check_pool_status $pool "scan" "repaired 0B"
112+
log_must check_pool_status $pool "errors" "No known data errors"
113+
log_must check_pool_status $pool "scan" "with 0 errors"
114+
}
115+
116+
log_onexit cleanup
117+
118+
log_must set_tunable32 PREFETCH_DISABLE 1
119+
log_must set_tunable32 REBUILD_SCRUB_ENABLED 0
120+
121+
# Disk files which will be used by pool
122+
for i in {0..$(($devs - 1))}; do
123+
device=$TEST_BASE_DIR/dev-$i
124+
log_must truncate -s ${dev_size_mb}M $device
125+
disks[${#disks[*]}+1]=$device
126+
done
127+
128+
# Disk file which will be attached
129+
log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs
130+
131+
for nparity in 3; do
132+
raid=draid${nparity}:$((nparity+2))s
133+
dir=$TEST_BASE_DIR
134+
135+
log_must zpool create -O compression=off -f -o cachefile=none $TESTPOOL $raid ${disks[@]}
136+
log_must zfs set primarycache=metadata $TESTPOOL
137+
138+
log_must zfs create $TESTPOOL/fs
139+
log_must fill_fs /$TESTPOOL/fs 1 512 102400 1 R
140+
141+
log_must zfs create -o compress=on $TESTPOOL/fs2
142+
log_must fill_fs /$TESTPOOL/fs2 1 512 102400 1 R
143+
144+
log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
145+
log_must fill_fs /$TESTPOOL/fs3 1 512 102400 1 R
146+
147+
log_must zpool export $TESTPOOL
148+
log_must zpool import -o cachefile=none -d $dir $TESTPOOL
149+
150+
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
151+
152+
test_sequential_resilver $TESTPOOL $nparity $dir
153+
154+
log_must zpool destroy "$TESTPOOL"
155+
done
156+
157+
log_pass "draid degraded device(s) test succeeded."

0 commit comments

Comments
 (0)