Skip to content

Commit fcd4592

Browse files
committed
ZTS: add targeted redundancy_draid_spare exception
When sequentially resilvering a dRAID pool, it's possible that a few correctable checksum errors will be reported. This is a known issue which is occasionally observed in the CI. Until it's resolved, we want the test case to tolerate a few checksum errors in this scenario to prevent false positives in the CI. This change also has the additional side effect of standardizing in one location how the pool integrity is verified. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue openzfs#18307 Issue openzfs#18319
1 parent 9be5431 commit fcd4592

10 files changed

Lines changed: 100 additions & 80 deletions

tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ function cleanup
5252
#
5353
function cksum_pool
5454
{
55-
typeset -i cksum=$(zpool status $1 | awk '
55+
typeset -i cksum=$(zpool status -p $1 | awk '
5656
!NF { isvdev = 0 }
5757
isvdev { errors += $NF }
5858
/CKSUM$/ { isvdev = 1 }
@@ -408,3 +408,78 @@ function recover_bad_missing_devs
408408

409409
return 0
410410
}
411+
412+
#
413+
# Given a dRAID pool, issue a scrub and verify the current pool status
414+
# aligns with the expected status based on the 'replace_mode' passed.
415+
# Valid modes are:
416+
#
417+
# 1. healing - The pool is perfectly intact. No checksum errors have
418+
# been reported and the scrub didn't make any repairs. This is the
419+
# expected state after a healing resilver of a healthy pool.
420+
#
421+
# 2. sequential - The pool is fully intact. There should never be a
422+
# checksum error, but the occasional checksum error does occur in
423+
# practice. Until the root cause is identified and resolved, tolerate
424+
# a checksum error when scrubbing after a sequential resilver.
425+
#
426+
# https://github.com/openzfs/zfs/issues/18307
427+
# https://github.com/openzfs/zfs/issues/18319
428+
#
429+
# 3. damaged - The pool was intentionally silently damaged. Checksum
430+
# errors are expected to be reported as the damaged blocks are
431+
# detected and repaired.
432+
#
433+
# In all of these cases a scrub must be able to successfully repair the
434+
# pool and result in no data loss.
435+
#
436+
function verify_draid_pool
437+
{
438+
typeset pool=${1:-$TESTPOOL}
439+
typeset replace_mode=${2:-healing}
440+
441+
log_note "verify_draid_pool $pool $replace_mode"
442+
log_must zpool scrub $pool
443+
log_must wait_scrubbed $pool
444+
445+
typeset -i cksum=$(cksum_pool $pool)
446+
447+
if [[ "$replace_mode" = "healing" ]]; then
448+
if [[ $cksum -gt 0 ]]; then
449+
log_must zpool status -v $pool
450+
log_fail "Unexpected CKSUM errors found for $pool ($cksum)"
451+
fi
452+
453+
if ! check_pool_status $pool "scan" "repaired 0B"; then
454+
log_must zpool status -v $pool
455+
log_fail "Unexpected repair IO found for $pool ($cksum)"
456+
fi
457+
elif [[ "$replace_mode" = "sequential" ]]; then
458+
if [[ $cksum -gt 1 ]]; then
459+
log_must zpool status -v $pool
460+
log_fail "Unexpected CKSUM errors found for $pool ($cksum)"
461+
fi
462+
elif [[ "$replace_mode" = "damaged" ]]; then
463+
if [[ $cksum -lt 1 ]]; then
464+
log_must zpool status -v $pool
465+
log_fail "Expected CKSUM errors missing for $pool ($cksum)"
466+
fi
467+
468+
if check_pool_status $pool "scan" "repaired 0B"; then
469+
log_must zpool status -v $pool
470+
log_fail "Expected repair IO missing for $pool ($cksum)"
471+
fi
472+
else
473+
log_fail "Invalid replace_mode=$replace_mode"
474+
fi
475+
476+
if ! check_pool_status $pool "scan" "with 0 errors"; then
477+
log_must zpool status -v $pool
478+
log_fail "Unexpected repair errors found for $pool"
479+
fi
480+
481+
if ! check_pool_status $pool "errors" "No known data errors"; then
482+
log_must zpool status -v $pool
483+
log_fail "Unexpected data errors found for $pool"
484+
fi
485+
}

tests/zfs-tests/tests/functional/redundancy/redundancy_draid.ksh

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,7 @@ function test_selfheal # <pool> <parity> <dir>
8686
# from the files which were read. Before overwriting additional
8787
# devices we need to repair all of the blocks in the pool.
8888
#
89-
log_must zpool scrub -w $pool
90-
log_must check_pool_status $pool "errors" "No known data errors"
89+
log_must verify_draid_pool $pool "damaged"
9190

9291
log_must zpool clear $pool
9392

@@ -104,8 +103,7 @@ function test_selfheal # <pool> <parity> <dir>
104103
log_must eval "find $mntpnt -type f -exec cksum {} + >> /dev/null 2>&1"
105104
log_must check_pool_status $pool "errors" "No known data errors"
106105

107-
log_must zpool scrub -w $pool
108-
log_must check_pool_status $pool "errors" "No known data errors"
106+
log_must verify_draid_pool $pool "damaged"
109107

110108
log_must zpool clear $pool
111109
}
@@ -182,8 +180,7 @@ function test_scrub # <pool> <parity> <dir>
182180

183181
log_must zpool import -o cachefile=none -d $dir $pool
184182

185-
log_must zpool scrub -w $pool
186-
log_must check_pool_status $pool "errors" "No known data errors"
183+
log_must verify_draid_pool $pool "damaged"
187184

188185
log_must zpool clear $pool
189186

@@ -196,8 +193,7 @@ function test_scrub # <pool> <parity> <dir>
196193

197194
log_must zpool import -o cachefile=none -d $dir $pool
198195

199-
log_must zpool scrub -w $pool
200-
log_must check_pool_status $pool "errors" "No known data errors"
196+
log_must verify_draid_pool $pool "damaged"
201197

202198
log_must zpool clear $pool
203199
}

tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
8989
log_must zpool replace -fsw $pool $dir/dev-$i $spare
9090
done
9191

92-
log_must zpool scrub -w $pool
93-
log_must zpool status $pool
94-
95-
log_mustnot check_pool_status $pool "scan" "repaired 0B"
96-
log_must check_pool_status $pool "errors" "No known data errors"
97-
log_must check_pool_status $pool "scan" "with 0 errors"
92+
log_must verify_draid_pool $pool "damaged"
9893
}
9994

10095
log_onexit cleanup

tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -121,12 +121,7 @@ for nparity in 1 2 3; do
121121

122122
# Scrub the pool after the sequential resilver and verify
123123
# that the silent damage was repaired by the scrub.
124-
log_must zpool scrub -w $TESTPOOL
125-
log_must zpool status $TESTPOOL
126-
log_must check_pool_status $TESTPOOL "errors" \
127-
"No known data errors"
128-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
129-
log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B"
124+
log_must verify_draid_pool $TESTPOOL "damaged"
130125
done
131126

132127
for nspare in 0 1 2; do
@@ -145,12 +140,7 @@ for nparity in 1 2 3; do
145140
done
146141

147142
log_must zpool clear $TESTPOOL
148-
log_must zpool scrub -w $TESTPOOL
149-
log_must zpool status $TESTPOOL
150-
151-
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
152-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
153-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
143+
log_must verify_draid_pool $TESTPOOL "healing"
154144

155145
log_must zpool destroy "$TESTPOOL"
156146
done

tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded1.ksh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
8989
spare=draid${nparity}-0-0
9090
log_must zpool replace -fsw $pool $dir/dev-$nparity $spare
9191

92-
log_must zpool scrub -w $pool
93-
log_must zpool status $pool
94-
95-
log_must check_pool_status $pool "scan" "repaired 0B"
96-
log_must check_pool_status $pool "errors" "No known data errors"
97-
log_must check_pool_status $pool "scan" "with 0 errors"
92+
log_must verify_draid_pool $pool "sequential"
9893
}
9994

10095
log_onexit cleanup

tests/zfs-tests/tests/functional/redundancy/redundancy_draid_degraded2.ksh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,12 +105,7 @@ function test_sequential_resilver # <pool> <parity> <dir>
105105

106106
log_must zpool wait -t resilver $pool
107107

108-
log_must zpool scrub -w $pool
109-
log_must zpool status $pool
110-
111-
log_must check_pool_status $pool "scan" "repaired 0B"
112-
log_must check_pool_status $pool "errors" "No known data errors"
113-
log_must check_pool_status $pool "scan" "with 0 errors"
108+
log_must verify_draid_pool $pool "sequential"
114109
}
115110

116111
log_onexit cleanup

tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,7 @@ for replace_mode in "healing" "sequential"; do
8585
log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE"
8686
# Preserve the 1st faulted vdev for the next test.
8787
[[ $i -eq 0 ]] || log_must zpool detach $TESTPOOL $fault_vdev
88-
log_must verify_pool $TESTPOOL
89-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
90-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
88+
log_must verify_draid_pool $TESTPOOL $replace_mode
9189

9290
(( i += 1 ))
9391
done
@@ -98,9 +96,7 @@ for replace_mode in "healing" "sequential"; do
9896
# Verify that after clearing the 1st faulted vdev, all is healed.
9997
log_must zpool clear $TESTPOOL "$BASEDIR/vdev0"
10098
log_must wait_resilvered $TESTPOOL
101-
log_must verify_pool $TESTPOOL
102-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
103-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
99+
log_must verify_draid_pool $TESTPOOL "healing"
104100

105101
cleanup
106102
done

tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,9 @@ log_must zpool offline -f $TESTPOOL $BASEDIR/vdev9
6060
log_must zpool replace -w $TESTPOOL $BASEDIR/vdev9 draid1-0-2
6161

6262
# Verify, refill and verify the pool contents.
63-
verify_pool $TESTPOOL
63+
log_must verify_draid_pool $TESTPOOL "healing"
6464
refill_test_env $TESTPOOL
65-
verify_pool $TESTPOOL
65+
log_must verify_draid_pool $TESTPOOL "healing"
6666

6767
# Bring everything back online and check for errors.
6868
log_must zpool clear $TESTPOOL
@@ -72,9 +72,7 @@ log_must wait_hotspare_state $TESTPOOL draid1-0-0 "AVAIL"
7272
log_must wait_hotspare_state $TESTPOOL draid1-0-1 "AVAIL"
7373
log_must wait_hotspare_state $TESTPOOL draid1-0-2 "AVAIL"
7474

75-
log_must zpool scrub -w $TESTPOOL
76-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
77-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
75+
log_must verify_draid_pool $TESTPOOL "healing"
7876

7977
log_must is_data_valid $TESTPOOL
8078

tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,7 @@ for replace_mode in "healing" "sequential"; do
111111
log_must zpool detach $TESTPOOL $BASEDIR/vdev7
112112
log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE"
113113
log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE"
114-
log_must verify_pool $TESTPOOL
115-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
116-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
114+
log_must verify_draid_pool $TESTPOOL $replace_mode
117115

118116
# Distributed spare in mirror with original device faulted
119117
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8
@@ -122,19 +120,15 @@ for replace_mode in "healing" "sequential"; do
122120
log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED"
123121
log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE"
124122
log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE"
125-
log_must verify_pool $TESTPOOL
126-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
127-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
123+
log_must verify_draid_pool $TESTPOOL $replace_mode
128124

129125
# Distributed spare in mirror with original device still online
130126
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE"
131127
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev9 draid1-0-2
132128
log_must check_vdev_state $TESTPOOL spare-9 "ONLINE"
133129
log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE"
134130
log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE"
135-
log_must verify_pool $TESTPOOL
136-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
137-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
131+
log_must verify_draid_pool $TESTPOOL $replace_mode
138132

139133
# Normal faulted device replacement
140134
new_vdev0="$BASEDIR/new_vdev0"
@@ -143,9 +137,7 @@ for replace_mode in "healing" "sequential"; do
143137
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED"
144138
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0
145139
log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE"
146-
log_must verify_pool $TESTPOOL
147-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
148-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
140+
log_must verify_draid_pool $TESTPOOL $replace_mode
149141

150142
# Distributed spare faulted device replacement
151143
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2
@@ -154,19 +146,15 @@ for replace_mode in "healing" "sequential"; do
154146
log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED"
155147
log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE"
156148
log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE"
157-
log_must verify_pool $TESTPOOL
158-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
159-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
149+
log_must verify_draid_pool $TESTPOOL $replace_mode
160150

161151
# Normal online device replacement
162152
new_vdev1="$BASEDIR/new_vdev1"
163153
log_must truncate -s $MINVDEVSIZE $new_vdev1
164154
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE"
165155
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1
166156
log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE"
167-
log_must verify_pool $TESTPOOL
168-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
169-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
157+
log_must verify_draid_pool $TESTPOOL $replace_mode
170158

171159
# Distributed spare online device replacement (then fault)
172160
log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4
@@ -176,9 +164,7 @@ for replace_mode in "healing" "sequential"; do
176164
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3
177165
log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED"
178166
log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED"
179-
log_must verify_pool $TESTPOOL
180-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
181-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
167+
log_must verify_draid_pool $TESTPOOL $replace_mode
182168

183169
# Verify the original data is valid
184170
log_must is_data_valid $TESTPOOL

tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare4.ksh

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,7 @@ for replace_mode in "healing" "sequential"; do
102102
log_must zpool detach $TESTPOOL $fault_vdev
103103
done
104104

105-
log_must verify_pool $TESTPOOL
106-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
107-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
105+
log_must verify_draid_pool $TESTPOOL $replace_mode
108106
done
109107

110108
# Fail remaining drives as long as parity permits.
@@ -120,9 +118,7 @@ for replace_mode in "healing" "sequential"; do
120118
log_must zpool offline -f $TESTPOOL $fault_vdev
121119
log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED"
122120

123-
log_must verify_pool $TESTPOOL
124-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
125-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
121+
log_must verify_draid_pool $TESTPOOL "healing"
126122
(( faults_left > 0 && faults_left-- ))
127123
done
128124
done
@@ -138,9 +134,7 @@ for replace_mode in "healing" "sequential"; do
138134
break
139135
fi
140136

141-
log_must verify_pool $TESTPOOL
142-
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
143-
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
137+
log_must verify_draid_pool $TESTPOOL "healing"
144138
done
145139

146140
log_must is_data_valid $TESTPOOL

0 commit comments

Comments
 (0)