Skip to content

Commit aee9f52

Browse files
robnlundman
authored andcommitted
Linux: zfs_putpage: handle page writeback errors
Page writeback is considered completed when the associated itx callback completes. A syncing writeback will receive the error in its callback directly, but an in-flight async writeback that was promoted to sync by the ZIL may also receive an error. Writeback errors, even syncing writeback errors, are not especially serious on their own, because the error will ultimately be returned to the zil_commit() caller, either zfs_fsync() for an explicit sync op (eg msync()) or to zfs_putpage() itself for a syncing (WB_SYNC_ALL) writeback (kernel housekeeping or sync_file_range(SYNC_FILE_RANGE_WAIT_AFTER). The only thing we need to do when a page writeback fails is to re-mark the page dirty, since we don't know if it made it to disk yet. This will ensure that it gets written out again in the future, either some scheduled async writeback or another explicit syncing call. On the other side, we need to make sure that if a syncing op arrives, any changes on dirty pages are written back to the DMU and/or the ZIL first. We do this by starting an _async_ (WB_SYNC_NONE) writeback on the file mapping at the start of the sync op (fsync(), msync(), etc). An async op will get an async itx created and logged, ready for the followup zfs_fsync()->zil_commit() to find, while avoiding a zil_commit() call for every page in the range. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Closes openzfs#17398
1 parent b1ddf80 commit aee9f52

2 files changed

Lines changed: 95 additions & 25 deletions

File tree

module/os/linux/zfs/zfs_vnops_os.c

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3694,16 +3694,41 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
36943694
return (error);
36953695
}
36963696

3697-
static void
3698-
zfs_putpage_commit_cb(void *arg)
3697+
/* Finish page writeback. */
3698+
static inline void
3699+
zfs_page_writeback_done(struct page *pp, int err)
36993700
{
3700-
(void) err;
3701-
struct page *pp = arg;
3701+
if (err != 0) {
3702+
/*
3703+
* Writeback failed. Re-dirty the page. It was undirtied before
3704+
* the IO was issued (in zfs_putpage() or write_cache_pages()).
3705+
* The kernel only considers writeback for dirty pages; if we
3706+
* don't do this, it is eligible for eviction without being
3707+
* written out, which we definitely don't want.
3708+
*/
3709+
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3710+
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3711+
#else
3712+
__set_page_dirty_nobuffers(pp);
3713+
#endif
3714+
}
37023715

37033716
ClearPageError(pp);
37043717
end_page_writeback(pp);
37053718
}
37063719

3720+
/*
3721+
* ZIL callback for page writeback. Passes to zfs_log_write() in zfs_putpage()
3722+
* for syncing writes. Called when the ZIL itx has been written to the log or
3723+
* the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
3724+
* is passed as `err`.
3725+
*/
3726+
static void
3727+
zfs_putpage_commit_cb(void *arg, int err)
3728+
{
3729+
zfs_page_writeback_done(arg, err);
3730+
}
3731+
37073732
/*
37083733
* Push a page out to disk, once the page is on stable storage the
37093734
* registered commit callback will be run as notification of completion.
@@ -3857,16 +3882,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
38573882
err = dmu_tx_assign(tx, DMU_TX_WAIT);
38583883
if (err != 0) {
38593884
dmu_tx_abort(tx);
3860-
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3861-
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3862-
#else
3863-
__set_page_dirty_nobuffers(pp);
3864-
#endif
3865-
ClearPageError(pp);
3866-
end_page_writeback(pp);
3885+
zfs_page_writeback_done(pp, err);
38673886
zfs_rangelock_exit(lr);
38683887
zfs_exit(zfsvfs, FTAG);
3869-
return (err);
3888+
3889+
/*
3890+
* Don't return error for an async writeback; we've re-dirtied
3891+
* the page so it will be tried again some other time.
3892+
*/
3893+
return (for_sync ? err : 0);
38703894
}
38713895

38723896
va = kmap(pp);
@@ -3920,7 +3944,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
39203944
* ALL, zfs_putpage should do it.
39213945
*
39223946
* Summary:
3923-
* for_sync: 0=unlock immediately; 1 unlock once on disk
3947+
* for_sync: 0=unlock immediately; 1=unlock once on disk
39243948
* sync_mode: NONE=caller will commit; ALL=we will commit
39253949
*/
39263950
boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
@@ -3935,16 +3959,19 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
39353959
B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
39363960

39373961
if (!for_sync) {
3938-
ClearPageError(pp);
3939-
end_page_writeback(pp);
3962+
/*
3963+
* Async writeback is logged and written to the DMU, so page
3964+
* can now be unlocked.
3965+
*/
3966+
zfs_page_writeback_done(pp, 0);
39403967
}
39413968

39423969
dmu_tx_commit(tx);
39433970

39443971
zfs_rangelock_exit(lr);
39453972

39463973
if (need_commit) {
3947-
err = zil_commit(zfsvfs->z_log, zp->z_id);
3974+
err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
39483975
if (err != 0) {
39493976
zfs_exit(zfsvfs, FTAG);
39503977
return (err);

module/os/linux/zfs/zpl_file.c

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
107107
return (error);
108108
}
109109

110+
static inline int
111+
zpl_write_cache_pages(struct address_space *mapping,
112+
struct writeback_control *wbc, void *data);
113+
110114
static int
111115
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
112116
{
@@ -116,9 +120,38 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
116120
int error;
117121
fstrans_cookie_t cookie;
118122

119-
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
120-
if (error)
121-
return (error);
123+
/*
124+
* Force dirty pages in the range out to the DMU and the log, ready
125+
* for zil_commit() to write down.
126+
*
127+
* We call write_cache_pages() directly to ensure that zpl_putpage() is
128+
* called with the flags we need. We need WB_SYNC_NONE to avoid a call
129+
* to zil_commit() (since we're doing this as a kind of pre-sync); but
130+
* we do need for_sync so that the pages remain in writeback until
131+
* they're on disk, and so that we get an error if the DMU write fails.
132+
*/
133+
if (filemap_range_has_page(inode->i_mapping, start, end)) {
134+
int for_sync = 1;
135+
struct writeback_control wbc = {
136+
.sync_mode = WB_SYNC_NONE,
137+
.nr_to_write = LONG_MAX,
138+
.range_start = start,
139+
.range_end = end,
140+
};
141+
error =
142+
zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
143+
if (error != 0) {
144+
/*
145+
* Unclear what state things are in. zfs_putpage() will
146+
* ensure the pages remain dirty if they haven't been
147+
* written down to the DMU, but because there may be
148+
* nothing logged, we can't assume that zfs_sync() ->
149+
* zil_commit() will give us a useful error. It's
150+
* safest if we just error out here.
151+
*/
152+
return (error);
153+
}
154+
}
122155

123156
crhold(cr);
124157
cookie = spl_fstrans_mark();
@@ -495,15 +528,25 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
495528
if (sync_mode != wbc->sync_mode) {
496529
if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
497530
return (result);
498-
if (zfsvfs->z_log != NULL)
499-
result = -zil_commit(zfsvfs->z_log, zp->z_id);
531+
532+
if (zfsvfs->z_log != NULL) {
533+
/*
534+
* We don't want to block here if the pool suspends,
535+
* because this is not a syncing op by itself, but
536+
* might be part of one that the caller will
537+
* coordinate.
538+
*/
539+
result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
540+
ZIL_COMMIT_NOW);
541+
}
542+
500543
zpl_exit(zfsvfs, FTAG);
501544

502545
/*
503-
* If zil_commit() failed, it's unclear what state things
504-
* are currently in. putpage() has written back out what
505-
* it can to the DMU, but it may not be on disk. We have
506-
* little choice but to escape.
546+
* If zil_commit_flags() failed, it's unclear what state things
547+
* are currently in. putpage() has written back out what it can
548+
* to the DMU, but it may not be on disk. We have little choice
549+
* but to escape.
507550
*/
508551
if (result != 0)
509552
return (result);

0 commit comments

Comments
 (0)