Skip to content

Commit 1644e2f

Browse files
authored
Fix read corruption after block clone after truncate
When copy_file_range overwrites a recent truncation, subsequent reads can incorrectly determine that it is read hole instead of reading the cloned blocks. This can happen when the following conditions are met: - Truncate adds blkid to dn_free_ranges - A new TXG is created - copy_file_range calls dmu_brt_clone which override the block pointer and set DB_NOFILL - Subsequent read, given DB_NOFILL, hits dbuf_read_impl and dbuf_read_hole - dbuf_read_hole calls dnode_block_freed, which returns TRUE because the truncated blkids are still in dn_free_ranges This will not happen if the clone and truncate are in the same TXG, because the block clone would update the current TXG's dn_free_ranges, which is why this bug only triggers under high IO load (such as compilation). Fix this by skipping the dnode_block_freed call if the block is overridden. The fix shouldn't cause an issue when the cloned block is subsequently freed in later TXGs, as dbuf_undirty would remove the override. This requires a dedicated test program as it is much harder to trigger with scripts (this needs to generate a lot of I/O in short period of time for the bug to trigger reliably). Assisted-by: Gemini:gemini-3.1-pro Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Gary Guo <gary@kernel.org> Closes #18412 Closes #18421
1 parent 4b4ae48 commit 1644e2f

9 files changed

Lines changed: 161 additions & 2 deletions

File tree

module/zfs/dbuf.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1480,8 +1480,12 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
14801480
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
14811481
* processes the delete record and clears the bp while we are waiting
14821482
* for the dn_mtx (resulting in a "no" from block_freed).
1483+
*
1484+
* If bp != db->db_blkptr, it means that it was overridden (by a block
1485+
* clone or direct I/O write). We cannot rely on dnode_block_freed as
1486+
* the range can be freed in an earlier TXG but overridden in later.
14831487
*/
1484-
if (!is_hole && db->db_level == 0)
1488+
if (!is_hole && db->db_level == 0 && bp == db->db_blkptr)
14851489
is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
14861490

14871491
if (is_hole) {

tests/runfiles/common.run

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,8 @@ tests = ['block_cloning_clone_mmap_cached',
8484
'block_cloning_replay', 'block_cloning_replay_encrypted',
8585
'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write',
8686
'block_cloning_rlimit_fsize', 'block_cloning_large_offset',
87-
'block_cloning_after_device_removal']
87+
'block_cloning_after_device_removal',
88+
'block_cloning_after_trunc']
8889
tags = ['functional', 'block_cloning']
8990

9091
[tests/functional/bootfs]

tests/test-runner/bin/zts-report.py.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ elif sys.platform.startswith('linux'):
320320
'bclone/bclone_samefs_data': ['SKIP', cfr_reason],
321321
'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason],
322322
'bclone/bclone_samefs_hole': ['SKIP', cfr_reason],
323+
'block_cloning/block_cloning_after_trunc': ['SKIP', cfr_reason],
323324
'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason],
324325
'block_cloning/block_cloning_clone_mmap_write':
325326
['SKIP', cfr_reason],

tests/zfs-tests/cmd/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/btree_test
33
/chg_usr_exec
44
/clonefile
5+
/clone_after_trunc
56
/clone_mmap_cached
67
/clone_mmap_write
78
/crypto_test

tests/zfs-tests/cmd/Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/crypto_test
3434
%C%_crypto_test_SOURCES = %D%/crypto_test.c
3535
%C%_crypto_test_LDADD = libzpool.la
3636

37+
scripts_zfs_tests_bin_PROGRAMS += %D%/clone_after_trunc
38+
%C%_clone_after_trunc_LDADD = -lpthread
3739

3840
if WANT_DEVNAME2DEVID
3941
scripts_zfs_tests_bin_PROGRAMS += %D%/devname2devid
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
// SPDX-License-Identifier: CDDL-1.0
2+
3+
#include <errno.h>
4+
#include <fcntl.h>
5+
#include <limits.h>
6+
#include <pthread.h>
7+
#include <string.h>
8+
#include <stdio.h>
9+
#include <stdlib.h>
10+
#include <unistd.h>
11+
#include <sys/stat.h>
12+
13+
#if defined(_GNU_SOURCE) && defined(__linux__)
14+
_Static_assert(sizeof (loff_t) == sizeof (off_t),
15+
"loff_t and off_t must be the same size");
16+
#endif
17+
18+
ssize_t
19+
copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int)
20+
__attribute__((weak));
21+
22+
#define FILE_SIZE (1024 * 1024)
23+
#define RECORD_SIZE (128 * 1024)
24+
#define NUM_THREADS 64
25+
26+
const char *dir;
27+
volatile int failed;
28+
29+
static void *
30+
run_test(void *arg)
31+
{
32+
int thread_id = (int)(long)arg;
33+
34+
char src_path[PATH_MAX], dst_path[PATH_MAX];
35+
snprintf(src_path, PATH_MAX, "%s/src-%d", dir, thread_id);
36+
snprintf(dst_path, PATH_MAX, "%s/dst-%d", dir, thread_id);
37+
38+
unsigned char *write_buf = malloc(FILE_SIZE);
39+
unsigned char *read_buf = malloc(FILE_SIZE);
40+
41+
// Write out expected data.
42+
memset(write_buf, 0xAA, FILE_SIZE);
43+
int src = open(src_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
44+
if (write(src, write_buf, FILE_SIZE) != FILE_SIZE)
45+
perror("write");
46+
close(src);
47+
48+
// Create destination file so we exercise O_TRUNC.
49+
int dst = open(dst_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
50+
if (write(dst, write_buf, FILE_SIZE) != FILE_SIZE)
51+
perror("write");
52+
fsync(dst);
53+
close(dst);
54+
55+
// Open file with O_TRUNC and perform copy.
56+
src = open(src_path, O_RDONLY);
57+
dst = open(dst_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
58+
59+
off_t off_in = 0, off_out = 0;
60+
ssize_t ret =
61+
copy_file_range(src, &off_in, dst, &off_out, FILE_SIZE, 0);
62+
if (ret != FILE_SIZE)
63+
perror("copy_file_range");
64+
close(src);
65+
close(dst);
66+
67+
// Read back
68+
dst = open(dst_path, O_RDONLY);
69+
if (read(dst, read_buf, FILE_SIZE) != FILE_SIZE)
70+
perror("read");
71+
close(dst);
72+
73+
// Bug check
74+
if (memcmp(write_buf, read_buf, FILE_SIZE) != 0) {
75+
failed = 1;
76+
fprintf(stderr, "[%d]: FAIL\n", thread_id);
77+
78+
int all_zeros = 1;
79+
for (int i = 0; i < RECORD_SIZE; i++) {
80+
if (read_buf[i] != 0) {
81+
all_zeros = 0;
82+
break;
83+
}
84+
}
85+
86+
if (all_zeros) {
87+
fprintf(stderr, "[%d]: ALL ZERO\n", thread_id);
88+
}
89+
}
90+
91+
unlink(src_path);
92+
unlink(dst_path);
93+
free(write_buf);
94+
free(read_buf);
95+
return (NULL);
96+
}
97+
98+
int
99+
main(int argc, const char **argv)
100+
{
101+
if (argc < 2) {
102+
fprintf(stderr, "usage: %s <dir>\n", argv[0]);
103+
return (1);
104+
}
105+
dir = argv[1];
106+
107+
pthread_t threads[NUM_THREADS];
108+
109+
for (int i = 0; i < NUM_THREADS; i++) {
110+
pthread_create(&threads[i], NULL, run_test, (void *)(long)i);
111+
}
112+
for (int i = 0; i < NUM_THREADS; i++) {
113+
pthread_join(threads[i], NULL);
114+
}
115+
116+
return (failed);
117+
}

tests/zfs-tests/include/commands.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ export ZFSTEST_FILES_COMMON='badsend
187187
btree_test
188188
chg_usr_exec
189189
clonefile
190+
clone_after_trunc
190191
clone_mmap_cached
191192
clone_mmap_write
192193
crypto_test

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
495495
functional/block_cloning/block_cloning_rlimit_fsize.ksh \
496496
functional/block_cloning/block_cloning_large_offset.ksh \
497497
functional/block_cloning/block_cloning_after_device_removal.ksh \
498+
functional/block_cloning/block_cloning_after_trunc.ksh \
498499
functional/bootfs/bootfs_001_pos.ksh \
499500
functional/bootfs/bootfs_002_neg.ksh \
500501
functional/bootfs/bootfs_003_pos.ksh \
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/ksh -p
2+
# SPDX-License-Identifier: CDDL-1.0
3+
4+
. $STF_SUITE/include/libtest.shlib
5+
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
6+
7+
#
8+
# DESCRIPTION:
9+
# When a block is truncated and then cloned to, a read data corruption can occur.
10+
# This is a regression test for #18412.
11+
#
12+
13+
verify_runnable "global"
14+
15+
claim="No read data corruption when cloning blocks after a truncate"
16+
17+
function cleanup
18+
{
19+
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
20+
}
21+
22+
log_onexit cleanup
23+
24+
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
25+
26+
# Run for a few times to increase the likelihood of bug triggering.
27+
for i in {0..50}; do
28+
log_must clone_after_trunc /$TESTPOOL/
29+
done
30+
31+
log_pass $claim

0 commit comments

Comments
 (0)