Skip to content

Commit 6562851

Browse files
authored
Handle raidz errors <= nparity rather than ignoring
This PR adds a check in the mirror and raidz code for the case where there are errors <= nparity. In that case, ZFS sets a new flag on the zio that will be checked in zio_done. If that flag is set, when the write IO completes, we issue a read IO for the same blkptr. That will allow ZFS's auto-healing mechanisms and other errors recovery tools to detect the effectively-corrupt data, and handle it accordingly. Note that because draid raidz's IO done function, it also benefits from this functionality. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com> Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #18387
1 parent f798b40 commit 6562851

11 files changed

Lines changed: 188 additions & 9 deletions

File tree

cmd/zinject/zinject.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ static const struct errstr errstrtable[] = {
229229
{ ECHILD, "dtl" },
230230
{ EILSEQ, "corrupt" },
231231
{ ENOSYS, "noop" },
232+
{ EFAULT, "io-prefail" },
232233
{ 0, NULL },
233234
};
234235

@@ -308,7 +309,8 @@ usage(void)
308309
"\t\tlabel. Label injection can either be 'nvlist', 'uber',\n "
309310
"\t\t'pad1', or 'pad2'.\n"
310311
"\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n"
311-
"\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n"
312+
"\t\t'corrupt' (bit flip), 'io-prefail' (unsuccessfully do\n"
313+
"\t\tnothing) or 'noop' (successfully do nothing).\n"
312314
"\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
313315
"\t\tdevice error injection to a percentage of the IOs.\n"
314316
"\n"
@@ -1026,7 +1028,8 @@ main(int argc, char **argv)
10261028
if (error < 0) {
10271029
(void) fprintf(stderr, "invalid error type "
10281030
"'%s': must be one of: io decompress "
1029-
"decrypt nxio dtl corrupt noop\n",
1031+
"decrypt nxio dtl corrupt noop "
1032+
"io-prefail\n",
10301033
optarg);
10311034
usage();
10321035
libzfs_fini(g_zfs);

include/sys/zio.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ typedef uint64_t zio_flag_t;
243243
#define ZIO_FLAG_REEXECUTED (1ULL << 30)
244244
#define ZIO_FLAG_DELEGATED (1ULL << 31)
245245
#define ZIO_FLAG_PREALLOCATED (1ULL << 32)
246+
#define ZIO_FLAG_POSTREAD (1ULL << 33)
246247

247248
#define ZIO_ALLOCATOR_NONE (-1)
248249
#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)

man/man4/zfs.4

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2388,6 +2388,13 @@ and may need to load new metaslabs to satisfy these allocations.
23882388
.It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint
23892389
Rewrite new block pointers starting in this pass.
23902390
.
2391+
.It Sy zfs_scrub_partial_writes Ns = Ns Sy 1 Ns | Ns 0 Pq int
2392+
If a write to a multi-disk vdev fails, but the data is recoverable, the data is
2393+
persisted on disk but may not be as redundant as the vdev usually ensures.
2394+
If this tunable is set, we issue a read after such a write error to detect the
2395+
full extent of the problem and attempt to recover from it.
2396+
Note: This currently only works with RAID-Z and dRAID.
2397+
.
23912398
.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
23922399
Maximum size of TRIM command.
23932400
Larger ranges will be split into chunks no larger than this value before

man/man8/zinject.8

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,10 @@ for an ECHILD error,
237237
for an EIO error where reopening the device will succeed,
238238
.It Sy nxio
239239
for an ENXIO error where reopening the device will fail, or
240+
.It Sy io-prefail
241+
to drop the IO without executing it and return failure, or
240242
.It Sy noop
241-
to drop the IO without executing it, and return success.
243+
to drop the IO without executing it and return success.
242244
.El
243245
.Pp
244246
For EIO and ENXIO, the "failed" reads or writes still occur.

module/zfs/vdev_raidz.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,16 @@ static unsigned long raidz_io_aggregate_rows = 4;
406406
*/
407407
static int zfs_scrub_after_expand = 1;
408408

409+
/*
410+
* If there are errors when writing, but few enough that the data is
411+
* recoverable, then ZFS used to silently move on, leaving the data not 100%
412+
* redundant. If this tunable is set, we issue a read after that case occurs,
413+
* allowing the normal error recovery process to handle it.
414+
*
415+
* NOTE: Currently applies only to raidz and draid.
416+
*/
417+
static int zfs_scrub_partial_writes = 1;
418+
409419
static void
410420
vdev_raidz_row_free(raidz_row_t *rr)
411421
{
@@ -3641,6 +3651,7 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
36413651
{
36423652
int normal_errors = 0;
36433653
int shadow_errors = 0;
3654+
int retryable_errors = 0;
36443655

36453656
ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
36463657
ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
@@ -3657,6 +3668,11 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
36573668
ASSERT(rc->rc_shadow_error != ECKSUM);
36583669
shadow_errors++;
36593670
}
3671+
if (rc->rc_error || rc->rc_shadow_error) {
3672+
vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3673+
if (!(vdev_is_dead(cvd) || cvd->vdev_cant_write))
3674+
retryable_errors++;
3675+
}
36603676
}
36613677

36623678
/*
@@ -3676,6 +3692,8 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
36763692
shadow_errors > rr->rr_firstdatacol) {
36773693
zio->io_error = zio_worst_error(zio->io_error,
36783694
vdev_raidz_worst_error(rr));
3695+
} else if (retryable_errors && zfs_scrub_partial_writes) {
3696+
zio->io_flags |= ZIO_FLAG_POSTREAD;
36793697
}
36803698
}
36813699

@@ -5528,6 +5546,9 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
55285546
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
55295547
"For expanded RAIDZ, automatically start a pool scrub when expansion "
55305548
"completes");
5549+
ZFS_MODULE_PARAM(zfs, zfs_, scrub_partial_writes, INT, ZMOD_RW,
5550+
"Issue reads after writes with recoverable failures to ensure "
5551+
"integrity");
55315552
ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
55325553
"Raidz/draid slow disk sit out time period in seconds");
55335554
ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64,

module/zfs/zio.c

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1676,10 +1676,10 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
16761676
* have already processed the original allocating I/O.
16771677
*/
16781678
if (flags & ZIO_FLAG_ALLOC_THROTTLED &&
1679-
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
1679+
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY)) &&
1680+
type == ZIO_TYPE_WRITE) {
16801681
ASSERT(pio->io_metaslab_class != NULL);
16811682
ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
1682-
ASSERT(type == ZIO_TYPE_WRITE);
16831683
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
16841684
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
16851685
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
@@ -4779,11 +4779,17 @@ zio_vdev_io_start(zio_t *zio)
47794779
}
47804780
zio->io_delay = gethrtime();
47814781

4782-
if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) {
4782+
int error = zio_handle_device_injections(vd, zio, ENOSYS,
4783+
EFAULT);
4784+
if (error == ENOSYS || (error == EFAULT &&
4785+
!(zio->io_flags & ZIO_FLAG_IO_REPAIR))) {
47834786
/*
47844787
* "no-op" injections return success, but do no actual
4785-
* work. Just return it.
4788+
* work. Just return it. "io-prefail" injections are
4789+
* similar, but don't return success.
47864790
*/
4791+
if (error == EFAULT)
4792+
zio->io_error = EIO;
47874793
zio_delay_interrupt(zio);
47884794
return (NULL);
47894795
}
@@ -5513,6 +5519,12 @@ zio_dva_throttle_done(zio_t *zio)
55135519
}
55145520
}
55155521

5522+
static void
5523+
zio_done_postread_done(zio_t *zio)
5524+
{
5525+
abd_free(zio->io_abd);
5526+
}
5527+
55165528
static zio_t *
55175529
zio_done(zio_t *zio)
55185530
{
@@ -5843,6 +5855,24 @@ zio_done(zio_t *zio)
58435855
zfs_ereport_free_checksum(zcr);
58445856
}
58455857

5858+
if (zio->io_flags & ZIO_FLAG_POSTREAD) {
5859+
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
5860+
zl = NULL;
5861+
zio_t *pio = zio_walk_parents(zio, &zl);
5862+
blkptr_t *bp = zio->io_bp;
5863+
abd_t *abd = abd_alloc_for_io(BP_GET_PSIZE(bp), B_FALSE);
5864+
zio_priority_t prio = zio->io_priority ==
5865+
ZIO_PRIORITY_SYNC_WRITE ? ZIO_PRIORITY_SYNC_READ :
5866+
ZIO_PRIORITY_SCRUB;
5867+
zio_t *cio = zio_vdev_child_io(pio, zio->io_bp, zio->io_vd,
5868+
zio->io_offset, abd, zio->io_size, ZIO_TYPE_READ, prio,
5869+
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
5870+
ZIO_FLAG_RESILVER | ZIO_FLAG_DONT_PROPAGATE,
5871+
zio_done_postread_done, NULL);
5872+
cio->io_flags &= ~ZIO_FLAG_ALLOC_THROTTLED;
5873+
zio_nowait(cio);
5874+
}
5875+
58465876
/*
58475877
* It is the responsibility of the done callback to ensure that this
58485878
* particular zio is no longer discoverable for adoption, and as

tests/runfiles/common.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -912,7 +912,7 @@ tags = ['functional', 'redacted_send']
912912
tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos',
913913
'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos',
914914
'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg',
915-
'raidz_expand_007_neg']
915+
'raidz_expand_007_neg', 'raidz_zinject']
916916
tags = ['functional', 'raidz']
917917
timeout = 1200
918918

tests/zfs-tests/include/libtest.shlib

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2019,6 +2019,26 @@ function wait_sit_out #pool vdev timeout
20192019
return 1
20202020
}
20212021

2022+
#
2023+
# Check the output of 'zpool status -v <pool>',
2024+
# and to see if the counts of <device> contain the <regex> specified.
2025+
#
2026+
# Return 0 is contain, 1 otherwise
2027+
#
2028+
function check_pool_device # pool device regex <verbose>
2029+
{
2030+
typeset pool=$1
2031+
typeset device=$2
2032+
typeset regex=$3
2033+
typeset verbose=${4:-false}
2034+
2035+
scan=$(zpool status -v "$pool" 2>/dev/null | grep $device)
2036+
if [[ $verbose == true ]]; then
2037+
log_note $scan
2038+
fi
2039+
echo $scan | grep -qi "$regex"
2040+
}
2041+
20222042
#
20232043
# Check the output of 'zpool status -v <pool>',
20242044
# and to see if the content of <token> contain the <keyword> specified.

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1884,6 +1884,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
18841884
functional/raidz/raidz_expand_005_pos.ksh \
18851885
functional/raidz/raidz_expand_006_neg.ksh \
18861886
functional/raidz/raidz_expand_007_neg.ksh \
1887+
functional/raidz/raidz_zinject.ksh \
18871888
functional/raidz/setup.ksh \
18881889
functional/redacted_send/cleanup.ksh \
18891890
functional/redacted_send/redacted_compressed.ksh \

tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ function cleanup
4848

4949
function test_device_fault
5050
{
51-
typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop")
51+
typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop" "io-prefail")
5252
for e in ${errno[@]}; do
5353
log_must eval \
5454
"zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL"

0 commit comments

Comments
 (0)