diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 13217fd1c348..376a9f336747 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -58,6 +58,8 @@ #include #include #include +#include +#include #include #include #include @@ -112,6 +114,7 @@ enum { ARG_ALLOCATED = 256, ARG_BLOCK_BIN_MODE, ARG_BLOCK_CLASSES, + ARG_ANYRAID_MAP, }; static const char cmdname[] = "zdb"; @@ -745,9 +748,10 @@ usage(void) "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " - "\n\n", + "\n" + "\t%s --anyraid-map [ ...]\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, - cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); + cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); @@ -1093,6 +1097,64 @@ dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) (void) os, (void) object, (void) data, (void) size; } +static void +dump_uint32(objset_t *os, uint64_t object, void *data, size_t size) +{ + uint32_t *arr; + uint64_t oursize; + if (dump_opt['d'] < 6) + return; + + if (data == NULL) { + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, object, &doi)); + size = doi.doi_max_offset; + /* + * We cap the size at 1 mebibyte here to prevent + * allocation failures and nigh-infinite printing if the + * object is extremely large. + */ + oursize = MIN(size, 1 << 20); + arr = kmem_alloc(oursize, KM_SLEEP); + + int err = dmu_read(os, object, 0, oursize, arr, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(arr, oursize); + return; + } + } else { + /* + * Even though the allocation is already done in this code path, + * we still cap the size to prevent excessive printing. 
+ */ + oursize = MIN(size, 1 << 20); + arr = data; + } + + if (size == 0) { + if (data == NULL) + kmem_free(arr, oursize); + (void) printf("\t\t[]\n"); + return; + } + + (void) printf("\t\t[%0x", arr[0]); + for (size_t i = 1; i * sizeof (uint32_t) < oursize; i++) { + if (i % 4 != 0) + (void) printf(", %0x", arr[i]); + else + (void) printf(",\n\t\t%0x", (arr[i])); + } + if (oursize != size) + (void) printf(", ... "); + (void) printf("]\n"); + + if (data == NULL) + kmem_free(arr, oursize); +} + static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { @@ -3925,6 +3987,14 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_unknown, /* Unknown type, must be last */ }; +static object_viewer_t * +get_objview(dmu_object_type_t ot) +{ + if (ot == DMU_OTN_UINT32_DATA || ot == DMU_OTN_UINT32_METADATA) + return (dump_uint32); + return (object_viewer[ZDB_OT_TYPE(ot)]); +} + static boolean_t match_object_type(dmu_object_type_t obj_type, uint64_t flags) { @@ -4099,7 +4169,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, (longlong_t)dn->dn_phys->dn_maxblkid); if (!dnode_held) { - object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, + get_objview(doi.doi_bonus_type)(os, object, bonus, bsize); } else { (void) printf("\t\t(bonus encrypted)\n"); @@ -4107,7 +4177,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, if (key_loaded || (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) { - object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, + get_objview(doi.doi_type)(os, object, NULL, 0); } else { (void) printf("\t\t(object encrypted)\n"); @@ -8492,6 +8562,8 @@ dump_mos_leaks(spa_t *spa) mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } + if (spa->spa_anyraid_relocate) + mos_obj_refd(spa->spa_anyraid_relocate->var_object); deleted_livelists_dump_mos(spa); if (dp->dp_origin_snap != NULL) { @@ -9338,7 +9410,8 @@ zdb_read_block(char *thing, spa_t *spa) if 
((zio_checksum_table[ck].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) || - ck == ZIO_CHECKSUM_NOPARITY) { + ck == ZIO_CHECKSUM_NOPARITY || + ck == ZIO_CHECKSUM_ANYRAID_MAP) { continue; } BP_SET_CHECKSUM(bp, ck); @@ -9459,10 +9532,501 @@ dummy_get_file_info(dmu_object_type_t bonustype, const void *data, abort(); } +static int +numlen(uint64_t v) { + char buf[32]; + snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)v); + return (strlen(buf)); +} + +static void +print_separator_line(int cols, int colwidth, boolean_t *print, boolean_t *final) +{ + char buf[64]; + ASSERT3U(colwidth * strlen("─"), <, sizeof (buf) - 2); + int len = 0, off = 0; + // Create a buffer with the cell separator to make later code simpler. + while (len < colwidth) { + len++; + int n = snprintf(buf + off, sizeof (buf) - off, "─"); + ASSERT(n > 0 && n < sizeof (buf) - off); + off += n; + } + + for (int i = 0; i < cols; i++) { + /* + * Skip cells that we don't need to print. If the previous cell] + * also wasn't printed, add an extra space for the separator + * column. + */ + if (!print[i]) { + int extra_width = 0; + if (i == 0 || !print[i - 1]) + extra_width++; + (void) printf("%*s", colwidth + extra_width, ""); + continue; + } + + // Calculate the right shape for the corner of the cells. + const char *left_c, *right_c; + if (i == 0 || !print[i - 1]) { + left_c = (final[i] && (i == 0 || final[i - 1])) ? + "└" : "├"; + } else { + left_c = ""; + } + if (i == cols - 1 || !print[i + 1]) { + right_c = + (final[i] && (i == cols - 1 || final[i + 1])) ? + "┘" : "┤"; + } else { + right_c = + (final[i] && (i == cols - 1 || final[i + 1])) ? + "┴" : "┼"; + } + (void) printf("%s%s%s", left_c, buf, right_c); + } + (void) printf("\n"); +} + +static void +zdb_print_anyraid_tile_layout(vdev_t *vd) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + int cols = vd->vdev_children; + int textwidth = MAX(8, numlen(avl_numnodes(&va->vd_tile_map)) + + va->vd_nparity > 0 ? 
numlen(va->vd_width) + 1 : 0); + int colwidth = textwidth + 2; + + // Create and populate table with all the values we need to print. + char ***table = malloc(sizeof (*table) * cols); + for (int i = 0; i < cols; i++) { + uint_t cap = va->vd_children[i]->van_capacity; + if (cap == 0) { + ASSERT3S(va->vd_contracting_leaf, ==, i); + cap = (vd->vdev_child[i]->vdev_asize - + VDEV_ANYRAID_TOTAL_MAP_SIZE(vd->vdev_ashift)) / + va->vd_tile_size; + } + table[i] = calloc(cap, + sizeof (**table)); + } + + anyraid_tile_t *cur = avl_first(&va->vd_tile_map); + while (cur) { + int p = 0; + for (anyraid_tile_node_t *node = list_head(&cur->at_list); + node; node = list_next(&cur->at_list, node)) { + ASSERT3U(p, <=, va->vd_nparity + 1); + char **next = + &(table[node->atn_disk][node->atn_tile_idx]); + *next = malloc(textwidth + 1); + int len = snprintf(*next, textwidth, "%d", + cur->at_tile_id); + if (va->vd_nparity > 0) { + (void) snprintf((*next) + len, textwidth - len, + "-%d", p); + } + p++; + } + ASSERT3U(p, ==, va->vd_nparity + va->vd_ndata); + cur = AVL_NEXT(&va->vd_tile_map, cur); + } + + // These are needed to generate the separator lines + boolean_t *printed = malloc(sizeof (*printed) * cols); + boolean_t *final = malloc(sizeof (*final) * cols); + // Print the header row + for (int i = 0; i < cols; i++) { + if (i == 0) + (void) printf("│"); + (void) printf(" %*d ", textwidth, i); + (void) printf("│"); + printed[i] = B_TRUE; + final[i] = B_FALSE; + } + (void) printf("\n"); + print_separator_line(cols, colwidth, printed, final); + + // Print out the actual tile map, one row at a time. 
+ for (int i = 0; ; i++) { + int last_printed = INT_MAX; + for (int v = 0; v < cols; v++) { + uint_t cap = va->vd_children[v]->van_capacity; + if (cap == 0) { + ASSERT3S(va->vd_contracting_leaf, ==, v); + cap = (vd->vdev_child[v]->vdev_asize - + VDEV_ANYRAID_TOTAL_MAP_SIZE( + vd->vdev_ashift)) / va->vd_tile_size; + } + if (final[v]) { + ASSERT3U(i, >=, cap); + int extra_width = 0; + if (v == 0 || !printed[v - 1]) + extra_width++; + (void) printf("%*s", + colwidth + extra_width, ""); + printed[v] = B_FALSE; + continue; + } + if (i + 1 == cap) + final[v] = B_TRUE; + if (v - 1 != last_printed) + (void) printf("│"); + char *value = table[v][i]; + (void) printf(" %*s │", textwidth, value ? value : + ""); + last_printed = v; + } + + if (last_printed == INT_MAX) + break; + (void) printf("\n"); + print_separator_line(cols, colwidth, printed, final); + } + (void) printf("\n"); + for (int i = 0; i < cols; i++) { + for (int j = 0; j < va->vd_children[i]->van_capacity; j++) + if (table[i][j]) + free(table[i][j]); + free(table[i]); + } + free(table); +} + +static void +free_header(anyraid_header_t *header, uint64_t header_size) { + fnvlist_free(header->ah_nvl); + abd_return_buf(header->ah_abd, header->ah_buf, header_size); + abd_free(header->ah_abd); +} + +/* + * Print one of the anyraid maps from the given vdev child. This prints the + * mapping entries themselves, rather than the kernel's interpretation of them, + * which can be useful for debugging. 
+ */ +static void +print_anyraid_mapping(vdev_t *vd, int child, int mapping, int verbosity, + anyraid_header_t *header) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + vdev_t *cvd = vd->vdev_child[child]; + uint64_t ashift = cvd->vdev_ashift; + spa_t *spa = vd->vdev_spa; + int error = 0; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + + uint64_t header_offset = VDEV_LABEL_START_SIZE + + mapping * VDEV_ANYRAID_SINGLE_MAP_SIZE(ashift); + uint64_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(ashift); + uint64_t map_offset = header_offset + header_size; + + + nvlist_t *hnvl = header->ah_nvl; + + if (verbosity > 4) + nvlist_print(stdout, hnvl); + // Look up and print map metadata. + uint16_t version; + if (nvlist_lookup_uint16(hnvl, VDEV_ANYRAID_HEADER_VERSION, + &version) != 0) { + (void) printf("No version\n"); + free_header(header, header_size); + return; + } + + uint64_t tile_size; + if (nvlist_lookup_uint64(hnvl, VDEV_ANYRAID_HEADER_TILE_SIZE, + &tile_size) != 0) { + (void) printf("No tile size\n"); + free_header(header, header_size); + return; + } + + uint32_t map_length; + if (nvlist_lookup_uint32(hnvl, VDEV_ANYRAID_HEADER_LENGTH, + &map_length) != 0) { + (void) printf("No map length\n"); + free_header(header, header_size); + return; + } + + uint64_t written_txg = 0; + if (nvlist_lookup_uint64(hnvl, VDEV_ANYRAID_HEADER_TXG, + &written_txg) != 0) + (void) printf("No valid TXG\n"); + + uint8_t disk_id = 0; + if (nvlist_lookup_uint8(hnvl, VDEV_ANYRAID_HEADER_DISK, + &disk_id) != 0) + (void) printf("No valid disk ID\n"); + + (void) printf("version: %6d\ttile size: %#8lx\ttxg: %lu\n", + version, tile_size, written_txg); + (void) printf("map length: %6u\tdisk id: %3u\n", map_length, disk_id); + + // Read in and print the actual mapping data + zio_t *rio = zio_root(spa, NULL, NULL, flags); + abd_t *map_abds[VDEV_ANYRAID_MAP_COPIES] = {0}; + int i; + for (i = 0; i <= (map_length / SPA_MAXBLOCKSIZE); i++) { + zio_eck_t *cksum = (zio_eck_t *) + 
&header->ah_buf[VDEV_ANYRAID_NVL_BYTES(ashift) + + i * sizeof (*cksum)]; + map_abds[i] = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_TRUE); + zio_nowait(zio_read_phys(rio, cvd, map_offset + + i * SPA_MAXBLOCKSIZE, SPA_MAXBLOCKSIZE, map_abds[i], + ZIO_CHECKSUM_ANYRAID_MAP, NULL, cksum, + ZIO_PRIORITY_SYNC_READ, flags, B_FALSE)); + } + i--; + if ((error = zio_wait(rio))) { + (void) printf("Could not read map: %s\n", strerror(error)); + for (; i >= 0; i--) + abd_free(map_abds[i]); + free_header(header, header_size); + return; + } + free_header(header, header_size); + + uint32_t map = -1, cur_tile = 0; + /* + * For now, all entries are the size of a uint32_t. If that + * ever changes, we need better logic here. + */ + uint32_t size = sizeof (uint32_t); + uint8_t *map_buf = NULL; + uint8_t par_cnt = 0; + for (uint32_t off = 0; off < map_length; off += size) { + int next_map = off / SPA_MAXBLOCKSIZE; + if (map != next_map) { + // switch maps + if (map != -1) { + abd_return_buf(map_abds[map], map_buf, + SPA_MAXBLOCKSIZE); + } + map_buf = abd_borrow_buf(map_abds[next_map], + SPA_MAXBLOCKSIZE); + map = next_map; + } + uint32_t mo = off % SPA_MAXBLOCKSIZE; + anyraid_map_entry_t *entry = + (anyraid_map_entry_t *)(map_buf + mo); + uint8_t type = ame_get_type(entry); + uint8_t *buf; + boolean_t allocated = B_FALSE; + if (size > SPA_MAXBLOCKSIZE - mo) { + buf = kmem_alloc(size, KM_SLEEP); + uint8_t rem = SPA_MAXBLOCKSIZE - mo; + allocated = B_TRUE; + memcpy(buf, map_buf + mo, rem); + // switch maps + if (map != -1) { + abd_return_buf(map_abds[map], map_buf, + SPA_MAXBLOCKSIZE); + } + map_buf = abd_borrow_buf(map_abds[next_map], + SPA_MAXBLOCKSIZE); + map = next_map; + memcpy(buf + rem, map_buf, size - rem); + } else { + buf = map_buf + mo; + } + entry = (anyraid_map_entry_t *)buf; + switch (type) { + case AMET_SKIP: { + anyraid_map_skip_entry_t *amse = + &entry->ame_u.ame_amse; + ASSERT0(par_cnt); + cur_tile += amse_get_skip_count(amse); + (void) printf("skip %u\n", + 
amse_get_skip_count(amse)); + break; + } + case AMET_LOC: { + anyraid_map_loc_entry_t *amle = + &entry->ame_u.ame_amle; + if (par_cnt == 0) { + (void) printf("loc %u:", cur_tile); + cur_tile++; + } + (void) printf("\td%u o%u,", amle_get_disk(amle), + amle_get_offset(amle)); + par_cnt = (par_cnt + 1) % (va->vd_nparity + 1); + if (par_cnt == 0) + (void) printf("\n"); + break; + } + default: + (void) printf("Invalid entry type %d, " + "aborting\n", type); + break; + } + if (allocated) + kmem_free(buf, size); + } + if (map_buf) + abd_return_buf(map_abds[map], map_buf, SPA_MAXBLOCKSIZE); + + va->vd_tile_size = tile_size; + + for (; i >= 0; i--) + abd_free(map_abds[i]); + + return; + +} + +/* + * Print the anyraid maps on disk. With verbosity == 2, we use the normal + * mapping-selection logic that we use during import; with higher verbosity, we + * print them all. + */ +static void +zdb_print_anyraid_ondisk_maps(vdev_t *vd, int verbosity) +{ + int child = 0; + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + if (verbosity == 2) { + anyraid_header_t header; + int mapping; + uint64_t txg; + int error = vdev_anyraid_pick_best_mapping( + vd->vdev_child[child], &txg, &header, &mapping); + if (error != 0) { + (void) printf("Could not print mapping: %s\n", + strerror(error)); + spa_config_exit(spa, SCL_ZIO, FTAG); + return; + } + (void) printf("anyraid map %d:\n", mapping); + print_anyraid_mapping(vd, child, mapping, verbosity, &header); + } else if (verbosity == 3) { + for (int i = 0; i < VDEV_ANYRAID_MAP_COPIES; i++) { + (void) printf("anyraid map %d:\n", i); + anyraid_header_t header; + int error = vdev_anyraid_open_header( + vd->vdev_child[child], i, &header); + if (error != 0) { + (void) printf("Could not print mapping: %s\n", + strerror(error)); + spa_config_exit(spa, SCL_ZIO, FTAG); + return; + } + print_anyraid_mapping(vd, child, i, verbosity, &header); + } + } else { + for (; child < vd->vdev_children; child++) { + for (int i = 0; i < VDEV_ANYRAID_MAP_COPIES; i++) { 
+ (void) printf("anyraid map %d %d:\n", child, i); + anyraid_header_t header; + int error = vdev_anyraid_open_header( + vd->vdev_child[child], i, &header); + if (error != 0) { + (void) printf("Could not print " + "mapping: %s\n", strerror(error)); + continue; + } + print_anyraid_mapping(vd, child, i, verbosity, + &header); + } + } + + } + spa_config_exit(spa, SCL_ZIO, FTAG); +} + +/* + * Print the loaded version of the map for the provided anyraid vdev. + */ +static void +zdb_dump_anyraid_map_vdev(vdev_t *vd, int verbosity) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + + (void) printf("\t%-5s%11llu %s %#16llx\n", + "vdev", (u_longlong_t)vd->vdev_id, + "tile_size", (u_longlong_t)va->vd_tile_size); + (void) printf("\t%-8s%8llu", "tiles", + (u_longlong_t)avl_numnodes(&va->vd_tile_map)); + if (va->vd_checkpoint_tile != UINT32_MAX) { + (void) printf(". %-12s %10u\n", "checkpoint tile", + va->vd_checkpoint_tile); + } else { + (void) printf("\n"); + } + + (void) printf("\t%16s %12s %13s\n", "----------------", + "------------", "-------------"); + + anyraid_tile_t *cur = avl_first(&va->vd_tile_map); + anyraid_tile_node_t *curn = cur != NULL ? + list_head(&cur->at_list) : NULL; + while (cur) { + (void) printf("\t%-8s%8llu %-8s%04llx %-11s%02llx\n", + "tile", (u_longlong_t)cur->at_tile_id, + "offset", (u_longlong_t)curn->atn_tile_idx, + "disk", (u_longlong_t)curn->atn_disk); + curn = list_next(&cur->at_list, curn); + if (curn == NULL) { + cur = AVL_NEXT(&va->vd_tile_map, cur); + curn = cur != NULL ? list_head(&cur->at_list) : NULL; + } + } + + (void) printf("\n"); + if (verbosity > 0) + zdb_print_anyraid_tile_layout(vd); + + if (verbosity > 1) + zdb_print_anyraid_ondisk_maps(vd, verbosity); +} + +static int +zdb_dump_anyraid_map(char *vdev_str, spa_t *spa, int verbosity) +{ + vdev_t *rvd, *vd; + + /* A specific vdev. 
*/ + if (vdev_str != NULL) { + vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev_str); + if (vd == NULL) { + (void) printf("Invalid vdev: %s\n", vdev_str); + return (EINVAL); + } + if (!vdev_is_anyraid(vd)) { + vd = vd->vdev_parent ? vd->vdev_parent : vd; + if (!vdev_is_anyraid(vd)) { + (void) printf("Not an anyraid vdev: %s\n", + vdev_str); + return (EINVAL); + } + } + + (void) printf("\nAnyRAID tiles:\n"); + zdb_dump_anyraid_map_vdev(vd, verbosity); + return (0); + } + + (void) printf("\nAnyRAID tiles:\n"); + /* All anyraid vdevs. */ + rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vd = rvd->vdev_child[c]; + if (vd->vdev_ops == &vdev_anymirror_ops || + vd->vdev_ops == &vdev_anyraidz_ops) + zdb_dump_anyraid_map_vdev(vd, verbosity); + } + return (0); +} + int main(int argc, char **argv) { - int c; + int c, long_index; + boolean_t opt_anyraid_map = B_FALSE; int dump_all = 1; int verbose = 0; int error = 0; @@ -9566,12 +10130,14 @@ main(int argc, char **argv) ARG_BLOCK_BIN_MODE}, {"class", required_argument, NULL, ARG_BLOCK_CLASSES}, + {"anyraid-map", no_argument, NULL, + ARG_ANYRAID_MAP}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ", - long_options, NULL)) != -1) { + long_options, &long_index)) != -1) { switch (c) { case 'b': case 'B': @@ -9732,6 +10298,10 @@ main(int argc, char **argv) free(buf); break; } + case ARG_ANYRAID_MAP: + opt_anyraid_map = B_TRUE; + dump_all = 0; + break; default: usage(); break; @@ -10146,6 +10716,16 @@ main(int argc, char **argv) argc--; if (dump_opt['r']) { error = zdb_copy_object(os, object, argv[1]); + } else if (opt_anyraid_map) { + if (argc == 0) + error = zdb_dump_anyraid_map(NULL, spa, verbose); + else + for (int i = 0; i < argc; i++) { + error = zdb_dump_anyraid_map(argv[i], spa, + verbose); + if (error != 0) + break; + } } else if (!dump_opt['R']) { flagbits['d'] = ZOR_FLAG_DIRECTORY; flagbits['f'] = ZOR_FLAG_PLAIN_FILE; diff --git 
a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 265d7488dd8a..74a9f9694992 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -134,6 +134,9 @@ static int zpool_do_wait(int, char **); static int zpool_do_ddt_prune(int, char **); +static int zpool_do_rebalance(int, char **); +static int zpool_do_contract(int, char **); + static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( @@ -202,6 +205,8 @@ typedef enum { HELP_REGUID, HELP_REOPEN, HELP_VERSION, + HELP_REBALANCE, + HELP_CONTRACT, HELP_WAIT } zpool_help_t; @@ -433,6 +438,8 @@ static zpool_command_t command_table[] = { { "wait", zpool_do_wait, HELP_WAIT }, { NULL }, { "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE }, + { "rebalance", zpool_do_rebalance, HELP_REBALANCE }, + { "contract", zpool_do_contract, HELP_CONTRACT }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) @@ -554,6 +561,11 @@ get_usage(zpool_help_t idx) " [interval]\n")); case HELP_DDT_PRUNE: return (gettext("\tddtprune -d|-p \n")); + case HELP_REBALANCE: + return (gettext("\trebalance [vdev]\n")); + case HELP_CONTRACT: + return (gettext("\tcontract " + "\n")); default: __builtin_unreachable(); } @@ -10340,6 +10352,104 @@ print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) } free(vname); } + +/* + * Print out detailed anyraid rebalance status. + */ +static void +print_anyraid_rebalance_status(zpool_handle_t *zhp, + pool_anyraid_relocate_stat_t *pars) +{ + char copied_buf[7]; + + if (pars == NULL || pars->pars_state == ARS_NONE) + return; + + /* + * Determine name of vdev. 
+ */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + nvlist_t **child; + uint_t children; + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(pars->pars_relocating_vdev < children); + + printf(" "); + printf_color(ANSI_BOLD, gettext("rebalance:")); + printf(" "); + + time_t start = pars->pars_start_time; + time_t end = pars->pars_end_time; + char *vname = + zpool_vdev_name(g_zfs, zhp, child[pars->pars_relocating_vdev], 0); + zfs_nicenum(pars->pars_moved, copied_buf, sizeof (copied_buf)); + + /* + * Expansion is finished or canceled. + */ + if (pars->pars_state == ARS_FINISHED) { + char time_buf[32]; + secs_to_dhms(end - start, time_buf); + + (void) printf(gettext("rebalanced %s-%u moved %s in %s, " + "on %s"), vname, (int)pars->pars_relocating_vdev, + copied_buf, time_buf, ctime((time_t *)&end)); + } else { + char examined_buf[7], total_buf[7], rate_buf[7]; + uint64_t copied, total, elapsed, rate, secs_left; + double fraction_done; + + /* + * Expansion is in progress. + */ + (void) printf(gettext( + "rebalance of %s-%u in progress since %s"), + vname, (int)pars->pars_relocating_vdev, ctime(&start)); + + copied = pars->pars_moved > 0 ? pars->pars_moved : 1; + total = pars->pars_to_move; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - pars->pars_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? 
rate : 1; + secs_left = (total - copied) / rate; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext("\t%s / %s copied at %s/s, %.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (pars->pars_state == ARS_SCRUBBING) { + (void) printf(gettext(", waiting for scrub to " + "complete\n")); + } else if (pars->pars_state == ARS_CONTRACTING) { + (void) printf(gettext(", removing vdev\n")); + } else if (pars->pars_waiting_for_resilver) { + (void) printf(gettext(", paused for resilver or " + "clear\n")); + } else if (secs_left < (30 * 24 * 3600)) { + char time_buf[32]; + secs_to_dhms(secs_left, time_buf); + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } + free(vname); +} + static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { @@ -11087,6 +11197,12 @@ status_callback(zpool_handle_t *zhp, void *data) ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); print_raidz_expand_status(zhp, pres); + pool_anyraid_relocate_stat_t *pars = NULL; + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_ANYRAID_RELOCATE_STATS, (uint64_t **)&pars, + &c); + print_anyraid_rebalance_status(zhp, pars); + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) @@ -13313,8 +13429,10 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; pool_raidz_expand_stat_t *pres = NULL; + pool_anyraid_relocate_stat_t *pars = NULL; const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", - "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; + "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", 
"RAIDZ_EXPAND", + "ANYRAID_REBALANCE"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ @@ -13383,6 +13501,13 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; } + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_ANYRAID_RELOCATE_STATS, (uint64_t **)&pars, &c); + if (pars != NULL && pars->pars_state == ARS_SCANNING) { + int64_t rem = pars->pars_to_move - pars->pars_moved; + bytes_rem[ZPOOL_WAIT_ANYRAID_RELOCATE] = rem; + } + bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = @@ -13521,7 +13646,7 @@ zpool_do_wait(int argc, char **argv) static const char *const col_opts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim", - "raidz_expand" }; + "raidz_expand", "anyraid_relocate"}; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { @@ -13713,6 +13838,88 @@ zpool_do_ddt_prune(int argc, char **argv) return (error); } +/* + * zpool rebalance [vdev] + * + * Rebalance anyraid tiles on the specific vdev, or all anyraid vdevs. + */ +int +zpool_do_rebalance(int argc, char **argv) +{ + zpool_handle_t *zhp; + int c; + + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (argc == 0) { + (void) fprintf(stderr, gettext("no pool provided\n")); + usage(B_FALSE); + } + char *poolname = argv[0]; + argc--; + argv++; + + zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + + int error = zpool_rebalance(zhp, argv, argc); + + zpool_close(zhp); + + return (error); +} + +/* + * zpool contract + * + * Contract anyraid vdev by removing a specific leaf vdev. 
+ */ +int +zpool_do_contract(int argc, char **argv) +{ + zpool_handle_t *zhp; + int c; + + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (argc != 3) { + (void) fprintf(stderr, gettext("incorrect arguments\n")); + usage(B_FALSE); + } + char *poolname = argv[0]; + char *anyraid_vdev = argv[1]; + char *leaf_vdev = argv[2]; + + zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + + int error = zpool_contract(zhp, anyraid_vdev, leaf_vdev); + + zpool_close(zhp); + + return (error); +} + static int find_command_idx(const char *command, int *idx) { diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index d1e9ef76dc10..ef30ae86b578 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -78,6 +78,7 @@ #include "zpool_util.h" #include #include +#include /* * For any given vdev specification, we can have multiple errors. The @@ -431,7 +432,8 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b, { if ((strcmp(a->zprl_type, "raidz") == 0 || strcmp(a->zprl_type, "draid") == 0) && - strcmp(b->zprl_type, "mirror") == 0) { + (strcmp(b->zprl_type, "mirror") == 0 || + strcmp(b->zprl_type, "anymirror") == 0)) { *raidz = a; *mirror = b; return (B_TRUE); @@ -527,11 +529,12 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_children = 0; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || - strcmp(type, VDEV_TYPE_DRAID) == 0) { + strcmp(type, VDEV_TYPE_DRAID) == 0 || + strcmp(type, VDEV_TYPE_ANYMIRROR) == 0 || + strcmp(type, VDEV_TYPE_ANYRAIDZ) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); - assert(rep.zprl_parity != 0); } else { rep.zprl_parity = 0; } @@ -541,6 +544,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) * already reported an error for this spec, so don't * bother doing it again. 
*/ + const char *orig_type = type; type = NULL; dontreport = 0; vdev_size = -1LL; @@ -643,10 +647,12 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) * they differ by a significant amount * (~16MB) then report an error. */ - if (!dontreport && - (vdev_size != -1LL && + if (!dontreport && (vdev_size != -1LL && (llabs(size - vdev_size) > - ZPOOL_FUZZ))) { + ZPOOL_FUZZ)) && (strcmp(orig_type, + VDEV_TYPE_ANYMIRROR) != 0 && + strcmp(orig_type, VDEV_TYPE_ANYRAIDZ) != + 0)) { if (ret != NULL) free(ret); ret = NULL; @@ -726,19 +732,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) else return (NULL); } - } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != - 0) { - if (ret != NULL) - free(ret); - ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication level: " - "both %s and %s vdevs are " - "present\n"), - lastrep.zprl_type, rep.zprl_type); - else - return (NULL); } else if (lastrep.zprl_parity != rep.zprl_parity) { if (ret) free(ret); @@ -754,7 +747,10 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type); else return (NULL); - } else if (lastrep.zprl_children != rep.zprl_children) { + } else if (lastrep.zprl_children != + rep.zprl_children && (strcmp(rep.zprl_type, + VDEV_TYPE_ANYMIRROR) != 0 && strcmp(rep.zprl_type, + VDEV_TYPE_ANYRAIDZ) != 0)) { if (ret) free(ret); ret = NULL; @@ -941,6 +937,12 @@ lines_to_stderr(char *lines[], int lines_cnt) } } +static boolean_t +strstarts(const char *str, const char *prefix) +{ + return (strncmp(str, prefix, strlen(prefix)) == 0); +} + /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. 
When constructing the vdev spec, we were unable to open this @@ -1040,8 +1042,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) char **lines = NULL; int lines_cnt = 0; - ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); - if (ret == 0) { + if (strstarts(udevpath, UDISK_ROOT)) { ret = lstat64(udevpath, &statbuf); if (ret == 0 && S_ISLNK(statbuf.st_mode)) (void) unlink(udevpath); @@ -1200,7 +1201,7 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, } /* - * Returns the parity level extracted from a raidz or draid type. + * Returns the parity level extracted from a raidz, anyraid, or draid type. * If the parity cannot be determined zero is returned. */ static int @@ -1209,7 +1210,7 @@ get_parity(const char *type) long parity = 0; const char *p; - if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { + if (strstarts(type, VDEV_TYPE_RAIDZ)) { p = type + strlen(VDEV_TYPE_RAIDZ); if (*p == '\0') { @@ -1228,8 +1229,35 @@ get_parity(const char *type) return (0); } } - } else if (strncmp(type, VDEV_TYPE_DRAID, - strlen(VDEV_TYPE_DRAID)) == 0) { + } else if (strstarts(type, VDEV_TYPE_ANYMIRROR)) { + p = type + strlen(VDEV_TYPE_ANYMIRROR); + + if (*p == '\0') { + /* when unspecified default to 1-parity mirror */ + return (1); + } else { + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || *end != '\0' || parity < 0) + return (-1); + } + } else if (strstarts(type, VDEV_TYPE_ANYRAIDZ)) { + p = type + strlen(VDEV_TYPE_ANYRAIDZ); + + if (*p == '\0') { + /* when unspecified default to 1-parity mirror */ + return (1); + } else { + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || *end != ':' || + parity < 0 || parity > VDEV_RAIDZ_MAXPARITY) { + return (-1); + } + } + } else if (strstarts(type, VDEV_TYPE_DRAID)) { p = type + strlen(VDEV_TYPE_DRAID); if (*p == '\0' || *p == ':') { @@ -1264,8 +1292,8 @@ is_grouping(const char *type, int *mindev, int *maxdev) { int nparity; - 
if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || - strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { + if (strstarts(type, VDEV_TYPE_RAIDZ)|| + strstarts(type, VDEV_TYPE_DRAID)) { nparity = get_parity(type); if (nparity == 0) return (NULL); @@ -1274,8 +1302,7 @@ is_grouping(const char *type, int *mindev, int *maxdev) if (maxdev != NULL) *maxdev = 255; - if (strncmp(type, VDEV_TYPE_RAIDZ, - strlen(VDEV_TYPE_RAIDZ)) == 0) { + if (strstarts(type, VDEV_TYPE_RAIDZ)) { return (VDEV_TYPE_RAIDZ); } else { return (VDEV_TYPE_DRAID); @@ -1285,6 +1312,28 @@ is_grouping(const char *type, int *mindev, int *maxdev) if (maxdev != NULL) *maxdev = INT_MAX; + if (strstarts(type, VDEV_TYPE_ANYMIRROR)) { + nparity = get_parity(type); + if (nparity < 0) + return (NULL); + if (mindev != NULL) + *mindev = nparity + 1; + if (maxdev != NULL) + *maxdev = 255; + return (VDEV_TYPE_ANYMIRROR); + } + + if (strstarts(type, VDEV_TYPE_ANYRAIDZ)) { + nparity = get_parity(type); + if (nparity < 0) + return (NULL); + if (mindev != NULL) + *mindev = nparity + 1; + if (maxdev != NULL) + *maxdev = 255; + return (VDEV_TYPE_ANYRAIDZ); + } + if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; @@ -1319,6 +1368,90 @@ is_grouping(const char *type, int *mindev, int *maxdev) return (NULL); } +static int +anyraidz_config_by_type(nvlist_t *nv, const char *type) +{ + uint64_t nparity; + uint64_t ndata = UINT64_MAX; + + if (!strstarts(type, VDEV_TYPE_ANYRAIDZ)) + return (EINVAL); + + nparity = (uint64_t)get_parity(type); + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) { + fprintf(stderr, + gettext("invalid anyraid parity level %llu; must be " + "between 1 and %d\n"), (u_longlong_t)nparity, + VDEV_RAIDZ_MAXPARITY); + return (EINVAL); + } + + char *p = (char *)type; + if ((p = strchr(p, ':')) == NULL) { + fprintf(stderr, gettext("no anyraid data count detected\n")); + return (EINVAL); + } + char *end; + + p = p + 1; + errno = 0; + + if (!isdigit(p[0])) { + (void) 
fprintf(stderr, gettext("invalid anyraidz " + "syntax; expected : not '%s'\n"), + type); + return (EINVAL); + } + + /* Expected non-zero value with c/d suffix */ + ndata = strtol(p, &end, 10); + if (errno != 0) { + (void) fprintf(stderr, gettext("invalid anyraidz " + "syntax; expected : not '%s'\n"), + type); + return (EINVAL); + } + + if (ndata + nparity > VDEV_ANYRAID_MAX_DISKS) { + fprintf(stderr, gettext("too many devices in anyraid group: " + "%"PRIu64" data and %"PRIu64" parity"), ndata, nparity); + return (EINVAL); + } + + if (ndata == 0 || nparity == 0) { + fprintf(stderr, gettext("invalid %s: must not be zero"), + ndata == 0 ? "ndata" : "nparity"); + return (EINVAL); + } + + /* Store the basic anyraidz configuration. */ + fnvlist_add_uint8(nv, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, VAP_RAIDZ); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); + fnvlist_add_uint8(nv, ZPOOL_CONFIG_ANYRAID_NDATA, (uint8_t)ndata); + + return (0); +} + +static int +anyraid_config_by_type(nvlist_t *nv, const char *type) +{ + uint64_t nparity = 0; + + if (!(strstarts(type, VDEV_TYPE_ANYMIRROR) || + strstarts(type, VDEV_TYPE_ANYRAIDZ))) + return (EINVAL); + + nparity = (uint64_t)get_parity(type); + + if (strstarts(type, VDEV_TYPE_ANYRAIDZ)) + return (anyraidz_config_by_type(nv, type)); + + fnvlist_add_uint8(nv, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, VAP_MIRROR); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); + + return (0); +} + /* * Extract the configuration parameters encoded in the dRAID type and * use them to generate a dRAID configuration. 
The expected format is: @@ -1350,7 +1483,7 @@ draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) uint64_t ngroups = 1; long value; - if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) + if (!strstarts(type, VDEV_TYPE_DRAID)) return (EINVAL); nparity = (uint64_t)get_parity(type); @@ -1527,9 +1660,9 @@ construct_spec(nvlist_t *props, int argc, char **argv) nv = NULL; /* - * If it's a mirror, raidz, or draid the subsequent arguments - * are its leaves -- until we encounter the next mirror, - * raidz or draid. + * If it's a mirror, raidz, anyraid, or draid the subsequent + * arguments are its leaves -- until we encounter the next + * mirror, raidz, anyraid, or draid. */ if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; @@ -1596,7 +1729,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) } if (is_log) { - if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + if (strcmp(type, VDEV_TYPE_MIRROR) != 0 && + strcmp(type, VDEV_TYPE_ANYMIRROR) != 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: unsupported 'log' " @@ -1690,6 +1824,16 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } + if (strcmp(type, VDEV_TYPE_ANYMIRROR) == 0 || + strcmp(type, VDEV_TYPE_ANYRAIDZ) == 0) { + if (anyraid_config_by_type(nv, fulltype) + != 0) { + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + } if (strcmp(type, VDEV_TYPE_DRAID) == 0) { if (draid_config_by_type(nv, fulltype, children) != 0) { diff --git a/cmd/ztest.c b/cmd/ztest.c index bab7e32db414..36bb5f57090c 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -106,6 +106,7 @@ #include #include #include +#include #include #include #include @@ -172,6 +173,24 @@ typedef enum { RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */ } raidz_expand_test_state_t; +/* Dedicated Anyraid rebalance test states */ +typedef enum { + ANYRAID_REBAL_NONE, /* Default is none, 
must opt-in */ + ANYRAID_REBAL_REQUESTED, /* The '-X' option was used */ + ANYRAID_REBAL_STARTED, /* Testing has commenced */ + ANYRAID_REBAL_KILLED, /* Reached the process kill */ + ANYRAID_REBAL_CHECKED, /* Pool scrub verification done */ +} anyraid_rebalance_test_state_t; + +/* Dedicated Anyraid contraction test states */ +typedef enum { + ANYRAID_CONTRACT_NONE, /* Default is none, must opt-in */ + ANYRAID_CONTRACT_REQUESTED, /* The '-X' option was used */ + ANYRAID_CONTRACT_STARTED, /* Testing has commenced */ + ANYRAID_CONTRACT_KILLED, /* Reached the process kill */ + ANYRAID_CONTRACT_CHECKED, /* Pool scrub verification done */ +} anyraid_contraction_test_state_t; + #define ZO_GVARS_MAX_ARGLEN ((size_t)64) #define ZO_GVARS_MAX_COUNT ((size_t)10) @@ -189,7 +208,7 @@ typedef struct ztest_shared_opts { int zo_raid_do_expand; int zo_raid_children; int zo_raid_parity; - char zo_raid_type[8]; + char zo_raid_type[16]; int zo_draid_data; int zo_draid_spares; int zo_datasets; @@ -202,6 +221,8 @@ typedef struct ztest_shared_opts { uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; raidz_expand_test_state_t zo_raidz_expand_test; + anyraid_rebalance_test_state_t zo_anyraid_rebal_test; + anyraid_contraction_test_state_t zo_anyraid_contract_test; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; @@ -264,6 +285,8 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, .zo_raidz_expand_test = RAIDZ_EXPAND_NONE, + .zo_anyraid_rebal_test = ANYRAID_REBAL_NONE, + .zo_anyraid_contract_test = ANYRAID_CONTRACT_NONE, }; extern uint64_t metaslab_force_ganging; @@ -276,9 +299,11 @@ extern uint_t dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zfs_reconstruct_indirect_damage_fraction; extern uint64_t raidz_expand_max_reflow_bytes; +extern uint64_t anyraid_relocate_max_bytes_pause; extern uint_t raidz_expand_pause_point; extern boolean_t 
ddt_prune_artificial_age; extern boolean_t ddt_dump_prune_histogram; +extern uint64_t zfs_anyraid_min_tile_size; static ztest_shared_opts_t *ztest_shared_opts; @@ -674,10 +699,12 @@ fatal(int do_perror, const char *message, ...) fatal_msg = buf; /* to ease debugging */ out: - if (ztest_dump_core) + if (ztest_dump_core) { abort(); - else + } else { + // NOTE: Not safe if we've called kernel_fini already dump_debug_buffer(); + } exit(3); } @@ -770,7 +797,7 @@ static ztest_option_t option_table[] = { DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, - { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", + { 'K', "raid-kind", "raidz|eraidz|draid|anymirror|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, @@ -820,6 +847,10 @@ static ztest_option_t option_table[] = { NO_DEFAULT, NULL}, { 'h', "help", NULL, "Show this help", NO_DEFAULT, NULL}, + { 'b', "anyraid-rebalance", NULL, + "Perform a dedicated anyraid rebalance test", NO_DEFAULT, NULL}, + { 'c', "anyraid-contraction", NULL, + "Perform a dedicated anyraid contraction test", NO_DEFAULT, NULL}, {0, 0, 0, 0, 0, 0} }; @@ -1054,6 +1085,19 @@ process_options(int argc, char **argv) break; case 'X': zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; + zo->zo_anyraid_contract_test = ANYRAID_CONTRACT_NONE; + zo->zo_anyraid_rebal_test = ANYRAID_REBAL_NONE; + break; + case 'c': + zo->zo_anyraid_contract_test = + ANYRAID_CONTRACT_REQUESTED; + zo->zo_raidz_expand_test = RAIDZ_EXPAND_NONE; + zo->zo_anyraid_rebal_test = ANYRAID_REBAL_NONE; + break; + case 'b': + zo->zo_anyraid_rebal_test = ANYRAID_REBAL_REQUESTED; + zo->zo_raidz_expand_test = RAIDZ_EXPAND_NONE; + zo->zo_anyraid_contract_test = ANYRAID_CONTRACT_NONE; break; case 'E': zo->zo_init = 0; @@ -1114,10 +1158,18 @@ process_options(int argc, char **argv) zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; zo->zo_raid_do_expand = 
B_FALSE; raid_kind = "raidz"; + } else if (zo->zo_anyraid_contract_test == ANYRAID_CONTRACT_REQUESTED) { + zo->zo_mmp_test = 0; + zo->zo_mirrors = 0; + raid_kind = "anymirror"; + } else if (zo->zo_anyraid_rebal_test == ANYRAID_REBAL_REQUESTED) { + zo->zo_mmp_test = 0; + zo->zo_mirrors = 0; + raid_kind = "anymirror"; } if (strcmp(raid_kind, "random") == 0) { - switch (ztest_random(3)) { + switch (ztest_random(4)) { case 0: raid_kind = "raidz"; break; @@ -1127,6 +1179,9 @@ process_options(int argc, char **argv) case 2: raid_kind = "draid"; break; + case 3: + raid_kind = "anymirror"; + break; } if (ztest_opts.zo_verbose >= 3) @@ -1178,11 +1233,39 @@ process_options(int argc, char **argv) zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); - } else /* using raidz */ { - ASSERT0(strcmp(raid_kind, "raidz")); + } else if (strcmp(raid_kind, "raidz") == 0) { + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } else if (strcmp(raid_kind, "anymirror") == 0) { + uint64_t min_devsize; + + /* With fewer disks use 1G, otherwise 512M is OK */ + min_devsize = (ztest_opts.zo_raid_children < 16) ? + (1ULL << 30) : (512ULL << 20); + if (zo->zo_vdev_size < min_devsize) + zo->zo_vdev_size = min_devsize; + + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + + (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_ANYMIRROR, + sizeof (zo->zo_raid_type)); + } else if (strcmp(raid_kind, "anyraidz") == 0) { + uint64_t min_devsize; + + /* With fewer disks use 1G, otherwise 512M is OK */ + min_devsize = (ztest_opts.zo_raid_children < 16) ? 
+ (1ULL << 30) : (512ULL << 20); + if (zo->zo_vdev_size < min_devsize) + zo->zo_vdev_size = min_devsize; zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); + + (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_ANYRAIDZ, + sizeof (zo->zo_raid_type)); + } else { + fatal(B_FALSE, "invalid raid kind %s", raid_kind); } zo->zo_vdevtime = @@ -1373,6 +1456,16 @@ make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + } else if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_ANYMIRROR) == 0) { + enum vdev_anyraid_parity_type type = VAP_MIRROR; + fnvlist_add_uint8(raid, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, + type); + } else if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_ANYRAIDZ) == 0) { + enum vdev_anyraid_parity_type type = VAP_RAIDZ; + uint64_t ndata = ztest_opts.zo_draid_data; + fnvlist_add_uint64(raid, ZPOOL_CONFIG_ANYRAID_NDATA, ndata); + fnvlist_add_uint8(raid, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, + type); } for (c = 0; c < r; c++) @@ -3164,7 +3257,8 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) return; /* dRAID added after feature flags, skip upgrade test. 
*/ - if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0 || + strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_ANYMIRROR) == 0) return; mutex_enter(&ztest_vdev_lock); @@ -3788,28 +3882,47 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (ztest_opts.zo_raid_children > 1) { if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); + else if (strcmp(oldvd->vdev_ops->vdev_op_type, "anymirror") == + 0) + ASSERT3P(oldvd->vdev_ops, ==, &vdev_anymirror_ops); + else if (strcmp(oldvd->vdev_ops->vdev_op_type, "anyraidz") == 0) + ASSERT3P(oldvd->vdev_ops, ==, &vdev_anyraidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); oldvd = oldvd->vdev_child[leaf % raidz_children]; } + boolean_t anyraid = vdev_is_anyraid(oldvd->vdev_parent); + + if (!replacing && anyraid) { + oldvd = oldvd->vdev_parent; + } + /* * If we're already doing an attach or replace, oldvd may be a - * mirror vdev -- in which case, pick a random child. + * mirror vdev -- in which case, pick a random child. For anyraid vdevs, + * attachment occurs at the parent level. 
*/ - while (oldvd->vdev_children != 0) { + while (oldvd->vdev_children != 0 && !anyraid) { oldvd_has_siblings = B_TRUE; ASSERT3U(oldvd->vdev_children, >=, 2); oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; - oldsize = vdev_get_min_asize(oldvd); + oldsize = vdev_get_min_attach_size(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; oldvd_is_special = oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; - (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); + if (oldvd->vdev_path == NULL) { + ASSERT(vdev_is_anyraid(oldvd)); + snprintf(oldpath, MAXPATHLEN, "%s-%llu", + oldvd->vdev_ops->vdev_op_type, + (u_longlong_t)oldvd->vdev_id); + } else { + (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); + } pvd = oldvd->vdev_parent; pguid = pvd->vdev_guid; @@ -3818,7 +3931,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * to the detach the pool is scrubbed in order to prevent creating * unrepairable blocks as a result of the data corruption injection. */ - if (oldvd_has_siblings && ztest_random(2) == 0) { + if (oldvd_has_siblings && !anyraid && ztest_random(2) == 0) { spa_config_exit(spa, SCL_ALL, FTAG); error = ztest_scrub_impl(spa); @@ -3882,7 +3995,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * If newvd is a distributed spare and it's being attached to a * dRAID which is not its parent it should fail with ENOTSUP. */ - if (pvd->vdev_ops != &vdev_mirror_ops && + if (anyraid) + expected_error = 0; + else if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || pvd->vdev_ops == &vdev_replacing_ops || pvd->vdev_ops == &vdev_spare_ops)) @@ -3894,7 +4009,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) expected_error = replacing ? 
0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; - else if (!newvd_is_dspare && newsize < oldsize) + else if (newsize < oldsize && !(newvd_is_dspare || + (vdev_is_anyraid(pvd) && + newsize < pvd->vdev_ops->vdev_op_min_asize(pvd, oldvd)))) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; @@ -3915,8 +4032,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * When supported select either a healing or sequential resilver. */ boolean_t rebuilding = B_FALSE; - if (pvd->vdev_ops == &vdev_mirror_ops || - pvd->vdev_ops == &vdev_root_ops) { + if (oldvd->vdev_ops != &vdev_anyraidz_ops && + (pvd->vdev_ops == &vdev_mirror_ops || + pvd->vdev_ops == &vdev_root_ops)) { rebuilding = !!ztest_random(2); } @@ -7650,6 +7768,7 @@ typedef struct ztest_raidz_expand_io { uint64_t rzx_bufsize; const void *rzx_buffer; uint64_t rzx_alloc_max; + boolean_t rzx_removes; spa_t *rzx_spa; } ztest_expand_io_t; @@ -7703,11 +7822,12 @@ ztest_rzx_thread(void *arg) } } - /* Remove a few objects to leave some holes in allocation space */ - mutex_enter(&zd->zd_dirobj_lock); - (void) ztest_remove(zd, od, 2); - mutex_exit(&zd->zd_dirobj_lock); - + if (info->rzx_removes) { + // Remove a few objects to leave some holes in allocation space + mutex_enter(&zd->zd_dirobj_lock); + (void) ztest_remove(zd, od, 2); + mutex_exit(&zd->zd_dirobj_lock); + } umem_free(od, od_size); thread_exit(); @@ -8180,6 +8300,7 @@ ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) thread_args[t].rzx_buffer = buffer; thread_args[t].rzx_alloc_max = alloc_goal; thread_args[t].rzx_spa = spa; + thread_args[t].rzx_removes = B_TRUE; run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); @@ -8312,6 +8433,423 @@ ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) ztest_kill(zs); } +/* + * After the rebalance was killed, check that the pool is healthy + */ +static void 
+ztest_anyraid_rebal_check(spa_t *spa) +{ + ASSERT3U(ztest_opts.zo_anyraid_rebal_test, ==, ANYRAID_REBAL_KILLED); + /* + * Set pool check done flag, main program will run a zdb check + * of the pool when we exit. + */ + ztest_shared_opts->zo_anyraid_rebal_test = ANYRAID_REBAL_CHECKED; + + /* Wait for reflow to finish */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("\nwaiting for rebalance to finish ...\n"); + } + pool_anyraid_relocate_stat_t arr_stats; + pool_anyraid_relocate_stat_t *pars = &arr_stats; + do { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 500); /* wait 1/2 second */ + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } while (pars->pars_state < DSS_FINISHED && + pars->pars_moved < pars->pars_to_move); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("verifying an interrupted anyraid " + "rebalance using a pool scrub ...\n"); + } + + /* Will fail here if there is non-recoverable corruption detected */ + VERIFY0(spa_approx_errlog_size(spa)); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("anyraid rebalance scrub check complete\n"); + } +} + +static void +ztest_write_some_data(ztest_shared_t *zs, spa_t *spa, int run) +{ + int threads = ztest_opts.zo_threads; + kthread_t **run_threads; + ztest_expand_io_t *thread_args; + + /* Setup a 1 MiB buffer of random data */ + uint64_t bufsize = 1024 * 1024; + void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); + random_get_pseudo_bytes((uint8_t *)buffer, bufsize); + + /* + * Put some data in the pool and then attach a vdev to initiate + * reflow. + */ + run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); + thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), + UMEM_NOFAIL); + // TODO group instead of class? Force writes here somehow? 
+ uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - + metaslab_class_get_alloc(spa_normal_class(spa)); + uint_t target = ztest_random(8); + uint64_t alloc_goal = (free_space * target) / 10; + if (ztest_opts.zo_verbose >= 1) { + (void) printf("adding data to pool '%s', goal %llu/%llu " + "bytes\n", ztest_opts.zo_pool, (u_longlong_t)alloc_goal, + (u_longlong_t)free_space); + } + + if (alloc_goal == 0) + goto out; + + /* + * Kick off all the I/O generators that run in parallel. + */ + for (int t = 0; t < threads; t++) { + if (t < ztest_opts.zo_datasets && + ztest_dataset_open((run * threads + t) % + ztest_opts.zo_datasets) != 0) { + umem_free(run_threads, threads * sizeof (kthread_t *)); + umem_free(buffer, bufsize); + return; + } + thread_args[t].rzx_id = run * threads + t; + thread_args[t].rzx_amount = alloc_goal / threads; + thread_args[t].rzx_bufsize = bufsize; + thread_args[t].rzx_buffer = buffer; + thread_args[t].rzx_alloc_max = alloc_goal; + thread_args[t].rzx_spa = spa; + thread_args[t].rzx_removes = B_FALSE; + run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, + &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait for all of the writers to complete. + */ + for (int t = 0; t < threads; t++) + VERIFY0(thread_join(run_threads[t])); + + /* + * Close all datasets. This must be done after all the threads + * are joined so we can be sure none of the datasets are in-use + * by any of the threads. 
+ */ + for (int t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); + } + +out: + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(buffer, bufsize); + umem_free(run_threads, threads * sizeof (kthread_t *)); + umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); +} + +static void +ztest_anyraid_rebal_run(ztest_shared_t *zs, spa_t *spa) +{ + nvlist_t *root; + pool_anyraid_relocate_stat_t arr_stats; + pool_anyraid_relocate_stat_t *pars = &arr_stats; + vdev_t *cvd, *arvd = spa->spa_root_vdev->vdev_child[0]; + uint64_t csize; + int error; + + ASSERT3U(ztest_opts.zo_anyraid_rebal_test, !=, ANYRAID_REBAL_NONE); + ASSERT(vdev_is_anyraid(arvd)); + ztest_opts.zo_anyraid_rebal_test = ANYRAID_REBAL_STARTED; + + ztest_write_some_data(zs, spa, 0); + + /* Set our reflow target to 10%, 20% or 30% of allocated size */ + uint_t multiple = ztest_random(3) + 1; + uint64_t rebal_max = (arvd->vdev_stat.vs_alloc * multiple) / 10; + anyraid_relocate_max_bytes_pause = rebal_max; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("running anyraid_rebalance test, killing when " + "rebalance reaches %llu bytes (%u/10 of allocated space)\n", + (u_longlong_t)rebal_max, multiple); + } + + /* XXX - do we want some I/O load during the rebalance? */ + + cvd = arvd->vdev_child[0]; + csize = vdev_get_min_asize(cvd); + uint_t new_mult = ztest_random(5) + 1; + csize = (csize * new_mult) / 2; + /* + * Path to vdev to be attached + */ + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, arvd->vdev_children); + /* + * Build the nvlist describing newpath. 
+ */ + root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), + NULL, 0, 0, 1); + /* + * Expand the anyraid vdev by attaching the new disk + */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("rebalancing anyraid: %d wide to %d wide with " + "'%s'\n", (int)arvd->vdev_children, + (int)arvd->vdev_children + 1, newpath); + } + error = spa_vdev_attach(spa, arvd->vdev_guid, root, B_FALSE, B_FALSE); + nvlist_free(root); + if (error != 0) { + fatal(0, "anyraid rebalance: attach (%s %llu) returned %d", + newpath, (long long)csize, error); + } + + /* + * Add some more data to the pool to make rebalance more interesting. + */ + ztest_write_some_data(zs, spa, 1); // TODO tune value second time + + VERIFY0(spa_rebalance_vdevs(spa, &arvd->vdev_guid, 1)); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + while (pars->pars_state < DSS_SCANNING) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + (void) poll(NULL, 0, 1000); /* wait 1 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + if (pars->pars_state != DSS_SCANNING) + return; + ASSERT3U(pars->pars_to_move, !=, 0); + /* + * Set so when we are killed we go to anyraid checking rather than + * restarting test. 
+ */ + ztest_shared_opts->zo_anyraid_rebal_test = ANYRAID_REBAL_KILLED; + if (ztest_opts.zo_verbose >= 1) { + (void) printf("anyraid rebalance movement started, waiting for " + "%llu bytes to be copied\n", (u_longlong_t)rebal_max); + } + + /* + * Wait for rebal maximum to be reached and then kill the test + */ + while (pars->pars_moved < rebal_max) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* Reset the rebalance pause before killing */ + anyraid_relocate_max_bytes_pause = 0; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("killing anyraid rebalance test after move " + "reached %llu bytes\n", (u_longlong_t)pars->pars_moved); + } + + /* + * Kill ourself to simulate a panic during a rebalance. Our parent will + * restart the test and the changed flag value will drive the test + * through the scrub/check code to verify the pool is not corrupted. + */ + ztest_kill(zs); +} + +static void +ztest_anyraid_contract_check(spa_t *spa) +{ + ASSERT3U(ztest_opts.zo_anyraid_contract_test, ==, + ANYRAID_CONTRACT_KILLED); + /* + * Set pool check done flag, main program will run a zdb check + * of the pool when we exit. 
+ */ + ztest_shared_opts->zo_anyraid_contract_test = ANYRAID_CONTRACT_CHECKED; + + /* Wait for reflow to finish */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("\nwaiting for contraction to finish ...\n"); + } + pool_anyraid_relocate_stat_t arr_stats; + pool_anyraid_relocate_stat_t *pars = &arr_stats; + do { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 500); /* wait 1/2 second */ + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } while (pars->pars_state != ARS_FINISHED && + pars->pars_moved < pars->pars_to_move); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("verifying an interrupted anyraid " + "contraction using a pool scrub ...\n"); + } + + /* Will fail here if there is non-recoverable corruption detected */ + VERIFY0(spa_approx_errlog_size(spa)); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("anyraid contraction scrub check complete\n"); + } +} + +static void +ztest_anyraid_contract_run(ztest_shared_t *zs, spa_t *spa) +{ + nvlist_t *root; + pool_anyraid_relocate_stat_t arr_stats; + pool_anyraid_relocate_stat_t *pars = &arr_stats; + vdev_t *cvd, *arvd = spa->spa_root_vdev->vdev_child[0]; + uint64_t csize; + int error; + + ASSERT3U(ztest_opts.zo_anyraid_contract_test, !=, + ANYRAID_CONTRACT_NONE); + ASSERT(vdev_is_anyraid(arvd)); + ztest_opts.zo_anyraid_contract_test = ANYRAID_CONTRACT_STARTED; + + ztest_write_some_data(zs, spa, 0); + + /* Set our reflow target to 10%, 20% or 30% of allocated size */ + uint_t multiple = ztest_random(3) + 1; + uint64_t contract_max = (arvd->vdev_stat.vs_alloc * multiple) / 10; + anyraid_relocate_max_bytes_pause = contract_max; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("running anyraid_contraction test, killing when " + "contraction reaches %llu bytes (%u/10 of allocated space)" + "\n", (u_longlong_t)contract_max, multiple); + } + + uint_t child = 
ztest_random(arvd->vdev_children); + + /* XXX - do we want some I/O load during the contraction? */ + + cvd = arvd->vdev_child[child]; + csize = vdev_get_min_asize(cvd); + /* + * Path to vdev to be attached + */ + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + int i = 0; + do { + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, i); + i++; + } while (vdev_lookup_by_path(spa->spa_root_vdev, newpath) != NULL); + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), + NULL, 0, 0, 1); + /* + * Expand the anyraid vdev by attaching the new disk + */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("expanding anyraid: %d wide to %d wide with " + "'%s'\n", (int)arvd->vdev_children, + (int)arvd->vdev_children + 1, newpath); + } + error = spa_vdev_attach(spa, arvd->vdev_guid, root, B_FALSE, B_FALSE); + nvlist_free(root); + if (error != 0) { + fatal(0, "anyraid contraction: attach (%s %llu) returned %d", + newpath, (long long)csize, error); + } + if (ztest_opts.zo_verbose >= 1) { + (void) printf("contracting anyraid: %d wide to %d wide with " + "%u\n", (int)arvd->vdev_children, + (int)arvd->vdev_children - 1, child); + } + int err = spa_contract_vdev(spa, arvd->vdev_guid, + arvd->vdev_child[child]->vdev_guid); + if (err == ENOSPC) + return; + VERIFY0(err); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + while (pars->pars_state < ARS_SCANNING) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + (void) poll(NULL, 0, 1000); /* wait 1 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + 
spa_config_exit(spa, SCL_CONFIG, FTAG); + + if (pars->pars_state != ARS_SCANNING) + return; + ASSERT3U(pars->pars_to_move, !=, 0); + /* + * Set so when we are killed we go to anyraid checking rather than + * restarting test. + */ + ztest_shared_opts->zo_anyraid_contract_test = ANYRAID_CONTRACT_KILLED; + if (ztest_opts.zo_verbose >= 1) { + (void) printf("anyraid contraction movement started, waiting " + "for %llu bytes to be copied\n", + (u_longlong_t)contract_max); + } + + /* + * Wait for contract maximum to be reached and then kill the test + */ + while (pars->pars_moved < contract_max) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_anyraid_relocate_get_stats(spa, pars); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* Reset the contraction pause before killing */ + anyraid_relocate_max_bytes_pause = 0; + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("killing anyraid contraction test after move " + "reached %llu bytes\n", (u_longlong_t)pars->pars_moved); + dump_debug_buffer(); + } + + /* + * Kill ourself to simulate a panic during a contraction. Our parent + * will restart the test and the changed flag value will drive the test + * through the scrub/check code to verify the pool is not corrupted. 
+ */ + ztest_kill(zs); +} + static void ztest_generic_run(ztest_shared_t *zs, spa_t *spa) { @@ -8518,8 +9056,17 @@ ztest_run(ztest_shared_t *zs) if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) ztest_raidz_expand_run(zs, spa); + else if (ztest_opts.zo_anyraid_rebal_test == ANYRAID_REBAL_REQUESTED) + ztest_anyraid_rebal_run(zs, spa); + else if (ztest_opts.zo_anyraid_contract_test == + ANYRAID_CONTRACT_REQUESTED) + ztest_anyraid_contract_run(zs, spa); else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) ztest_raidz_expand_check(spa); + else if (ztest_opts.zo_anyraid_rebal_test == ANYRAID_REBAL_KILLED) + ztest_anyraid_rebal_check(spa); + else if (ztest_opts.zo_anyraid_contract_test == ANYRAID_CONTRACT_KILLED) + ztest_anyraid_contract_check(spa); else ztest_generic_run(zs, spa); @@ -9000,6 +9547,9 @@ main(int argc, char **argv) metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; + zfs_anyraid_min_tile_size = MIN(zfs_anyraid_min_tile_size, + ztest_opts.zo_vdev_size / 12); + if (zs->zs_do_init) ztest_run_init(); else @@ -9131,7 +9681,11 @@ main(int argc, char **argv) if (!ztest_opts.zo_mmp_test) ztest_run_zdb(zs->zs_guid); if (ztest_shared_opts->zo_raidz_expand_test == - RAIDZ_EXPAND_CHECKED) + RAIDZ_EXPAND_CHECKED || + ztest_shared_opts->zo_anyraid_rebal_test == + ANYRAID_REBAL_CHECKED || + ztest_shared_opts->zo_anyraid_contract_test == + ANYRAID_CONTRACT_CHECKED) break; /* raidz expand test complete */ } diff --git a/include/Makefile.am b/include/Makefile.am index ccca7cb594e6..25da4ceec743 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -101,6 +101,7 @@ COMMON_H = \ sys/unique.h \ sys/uuid.h \ sys/vdev.h \ + sys/vdev_anyraid.h \ sys/vdev_disk.h \ sys/vdev_draid.h \ sys/vdev_file.h \ @@ -108,6 +109,7 @@ COMMON_H = \ sys/vdev_indirect_births.h \ sys/vdev_indirect_mapping.h \ sys/vdev_initialize.h \ + sys/vdev_mirror.h \ sys/vdev_raidz.h \ sys/vdev_raidz_impl.h \ sys/vdev_rebuild.h \ diff --git a/include/libzfs.h 
b/include/libzfs.h index 0ff3948e117b..5703673031e6 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -160,6 +160,9 @@ typedef enum zfs_error { EZFS_SHAREFAILED, /* filesystem share failed */ EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ EZFS_ASHIFT_MISMATCH, /* can't add vdevs with different ashifts */ + /* an anyraid vdev is already relocating */ + EZFS_ANYRAID_RELOCATE_IN_PROGRESS, + EZFS_CONTRACT_BELOW_WIDTH, /* contraction reducing disk count too far */ EZFS_UNKNOWN } zfs_error_t; @@ -322,6 +325,10 @@ _LIBZFS_H int zpool_trim_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t, uint64_t); +_LIBZFS_H int zpool_rebalance(zpool_handle_t *zhp, char **vdev_names, + int count); +_LIBZFS_H int zpool_contract(zpool_handle_t *zhp, const char *anyraid_vdev_name, + const char *leaf_vdev_name); _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 231beaa69290..1f1d7555220e 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -165,6 +165,10 @@ _LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t, uint64_t); +_LIBZFS_CORE_H int lzc_pool_rebalance(const char *, const uint64_t *, int); +_LIBZFS_CORE_H int lzc_pool_contract(const char *zpool, uint64_t avd_guid, + uint64_t lvd_guid); + #ifdef __cplusplus } #endif diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h index e49ada399694..ac320869cdc2 100644 --- a/include/os/linux/kernel/linux/mod_compat.h +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -38,6 +38,7 @@ typedef const struct kernel_param zfs_kernel_param_t; enum scope_prefix_types { zfs, + zfs_anyraid, zfs_arc, zfs_brt, zfs_condense, diff --git a/include/sys/avl.h b/include/sys/avl.h index 98436569954b..c85e2f2ca47b 100644 --- 
a/include/sys/avl.h +++ b/include/sys/avl.h @@ -173,7 +173,8 @@ _AVL_H void avl_create(avl_tree_t *tree, * node - node that has the value being looked for * where - position for use with avl_nearest() or avl_insert(), may be NULL */ -_AVL_H void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); +_AVL_H void *avl_find(const avl_tree_t *tree, const void *node, + avl_index_t *where); /* * Insert a node into the tree. @@ -279,12 +280,12 @@ _AVL_H void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); /* * Return the number of nodes in the tree */ -_AVL_H ulong_t avl_numnodes(avl_tree_t *tree); +_AVL_H ulong_t avl_numnodes(const avl_tree_t *tree); /* * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise. */ -_AVL_H boolean_t avl_is_empty(avl_tree_t *tree); +_AVL_H boolean_t avl_is_empty(const avl_tree_t *tree); /* * Used to destroy any remaining nodes in a tree. The cookie argument should diff --git a/include/sys/dmu.h b/include/sys/dmu.h index bb623e404955..31974c7096f3 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -417,6 +417,7 @@ typedef struct dmu_buf { #define DMU_POOL_TXG_LOG_TIME_MINUTES "com.klarasystems:txg_log_time:minutes" #define DMU_POOL_TXG_LOG_TIME_DAYS "com.klarasystems:txg_log_time:days" #define DMU_POOL_TXG_LOG_TIME_MONTHS "com.klarasystems:txg_log_time:months" +#define DMU_POOL_RELOCATE_OBJ "com.klarasystems:relocate_obj" /* * Allocate an object from this objset. 
The range of object numbers diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index bcb98af40067..70b1da0c189d 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -95,6 +95,8 @@ typedef struct dsl_errorscrub_phys { #define ERRORSCRUB_PHYS_NUMINTS (sizeof (dsl_errorscrub_phys_t) \ / sizeof (uint64_t)) +typedef void dsl_scan_done_func_t(spa_t *, dmu_tx_t *, void *); + /* * Every pool will have one dsl_scan_t and this structure will contain * in-memory information about the scan and a pointer to the on-disk @@ -178,12 +180,17 @@ typedef struct dsl_scan { uint64_t scn_queues_pending; /* outstanding data to issue */ /* members needed for syncing error scrub status to disk */ dsl_errorscrub_phys_t errorscrub_phys; + /* Members to enable scan donefuncs */ + dsl_scan_done_func_t *scn_done; + void *scn_done_arg; } dsl_scan_t; typedef struct { pool_scan_func_t func; uint64_t txgstart; uint64_t txgend; + dsl_scan_done_func_t *done; + void *done_arg; } setup_sync_arg_t; typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; @@ -193,6 +200,7 @@ void scan_fini(void); int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); int dsl_scan_setup_check(void *, dmu_tx_t *); void dsl_scan_setup_sync(void *, dmu_tx_t *); +void dsl_scan_set_done_func(struct dsl_pool *, dsl_scan_done_func_t *, void *); void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); int dsl_scan_cancel(struct dsl_pool *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index de2149641d21..c884427deb0f 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -392,6 +392,9 @@ typedef enum { VDEV_PROP_AUTOSIT, VDEV_PROP_SLOW_IO_EVENTS, VDEV_PROP_SCHEDULER, + VDEV_PROP_ANYRAID_CAP_TILES, + VDEV_PROP_ANYRAID_NUM_TILES, + VDEV_PROP_ANYRAID_TILE_SIZE, VDEV_NUM_PROPS } vdev_prop_t; @@ -774,6 +777,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS 
"vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ +/* not on disk */ +#define ZPOOL_CONFIG_ANYRAID_RELOCATE_STATS "anyraid_rebalance_stats" /* container nvlist of extended stats */ #define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" @@ -927,10 +932,16 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" #define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" +/* ANYRAID configuration */ +#define ZPOOL_CONFIG_ANYRAID_PARITY_TYPE "anyraid_parity_type" +#define ZPOOL_CONFIG_ANYRAID_NDATA "anyraid_ndata" + #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_ANYMIRROR "anymirror" +#define VDEV_TYPE_ANYRAIDZ "anyraidz" #define VDEV_TYPE_DRAID "draid" #define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" @@ -1250,6 +1261,25 @@ typedef enum dsl_scan_state { DSS_NUM_STATES } dsl_scan_state_t; +typedef struct pool_anyraid_relocate_stat { + uint64_t pars_state; /* anyraid_relocate_state_t */ + uint64_t pars_relocating_vdev; + uint64_t pars_start_time; + uint64_t pars_end_time; + uint64_t pars_to_move; /* bytes that need to be moved */ + uint64_t pars_moved; /* bytes moved so far */ + uint64_t pars_waiting_for_resilver; +} pool_anyraid_relocate_stat_t; + +typedef enum anyraid_relocate_state { + ARS_NONE, + ARS_SCANNING, + ARS_SCRUBBING, + ARS_CONTRACTING, + ARS_FINISHED, + ARS_NUM_STATES +} anyraid_relocate_state_t; + typedef struct vdev_rebuild_stat { uint64_t vrs_state; /* vdev_rebuild_state_t */ uint64_t vrs_start_time; /* time_t */ @@ -1587,6 +1617,8 @@ typedef enum zfs_ioc { ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ + ZFS_IOC_POOL_REBALANCE, /* 0x5a5a */ + ZFS_IOC_POOL_CONTRACT, /* 0x5a5b */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
@@ -1700,6 +1732,7 @@ typedef enum { ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, ZFS_ERR_STREAM_LARGE_MICROZAP, + ZFS_ERR_ANYRAID_REBALANCE_IN_PROGRESS, ZFS_ERR_TOO_MANY_SITOUTS, } zfs_errno_t; @@ -1726,6 +1759,7 @@ typedef enum { ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, ZPOOL_WAIT_RAIDZ_EXPAND, + ZPOOL_WAIT_ANYRAID_RELOCATE, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 0f711fe6fe61..d5de46c8e380 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -139,6 +139,7 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t, const void *); void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); +void metaslab_disable_nowait(metaslab_t *); void metaslab_enable(metaslab_t *, boolean_t, boolean_t); void metaslab_set_selected_txg(metaslab_t *, uint64_t); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index faeb96fe965e..e4a51a68d73a 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -82,6 +82,8 @@ typedef enum trace_alloc_type { (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ METASLAB_WEIGHT_CLAIM) +#define METASLAB_MAX_WEIGHT (METASLAB_WEIGHT_TYPE - 1) + /* * The metaslab weight is used to encode the amount of free space in a * metaslab, such that the "best" metaslab appears first when sorting the diff --git a/include/sys/spa.h b/include/sys/spa.h index 1a84844c522a..3a9f2b4347c0 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -815,6 +815,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_REBUILD_DONE 0x2000 #define SPA_ASYNC_DETACH_SPARE 0x4000 #define SPA_ASYNC_REMOVE_BY_USER 0x8000 +#define SPA_ASYNC_CONTRACTION_DONE 0x10000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); @@ -833,6 +834,9 @@ extern int spa_vdev_setpath(spa_t *spa, uint64_t 
guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); +int spa_rebalance_vdevs(spa_t *spa, const uint64_t *guids, uint_t count); +int spa_rebalance_all(spa_t *spa); +int spa_contract_vdev(spa_t *spa, uint64_t anyraid_vdev, uint64_t leaf_vdev); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); @@ -1084,9 +1088,12 @@ extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); +extern uint64_t spa_load_max_txg(spa_t *spa); +extern uint64_t spa_current_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); +extern uint64_t spa_load_txg(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max); @@ -1161,7 +1168,9 @@ extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); +extern uint64_t spa_checkpoint_txg(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); +extern boolean_t spa_importing_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t spa_min_claim_txg(spa_t *spa); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 62cf196eeaa4..f3929ee094bc 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -332,7 +333,7 @@ struct spa { 
kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ - uint16_t spa_async_tasks; /* async task mask */ + uint32_t spa_async_tasks; /* async task mask */ uint64_t spa_missing_tvds; /* unopenable tvds on load */ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ @@ -347,6 +348,9 @@ struct spa { vdev_raidz_expand_t *spa_raidz_expand; zthr_t *spa_raidz_expand_zthr; + vdev_anyraid_relocate_t *spa_anyraid_relocate; + zthr_t *spa_anyraid_relocate_zthr; + uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 131cfc9cd16b..798edbba23e2 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -100,7 +100,7 @@ extern boolean_t vdev_replace_in_progress(vdev_t *vdev); extern void vdev_hold(vdev_t *); extern void vdev_rele(vdev_t *); -void vdev_update_nonallocating_space(vdev_t *vd, boolean_t add); +void vdev_update_nonallocating_space(vdev_t *vd, uint64_t bytes, boolean_t add); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); @@ -179,6 +179,7 @@ extern boolean_t vdev_writeable(vdev_t *vd); extern boolean_t vdev_allocatable(vdev_t *vd); extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); +extern boolean_t vdev_is_anyraid(vdev_t *vd); extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); @@ -191,9 +192,17 @@ extern uint64_t vdev_queue_last_offset(vdev_t *vd); extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern boolean_t vdev_queue_pool_busy(spa_t *spa); +typedef enum vdev_config_sync_status { + VDEV_CONFIG_KEEP_CHECKPOINT, + VDEV_CONFIG_CREATING_CHECKPOINT, + 
VDEV_CONFIG_NO_CHECKPOINT, + VDEV_CONFIG_REWINDING_CHECKPOINT +} vdev_config_sync_status_t; + extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, + vdev_config_sync_status_t status); extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); diff --git a/include/sys/vdev_anyraid.h b/include/sys/vdev_anyraid.h new file mode 100644 index 000000000000..15b55ddd51e4 --- /dev/null +++ b/include/sys/vdev_anyraid.h @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Klara Inc. + */ + +#ifndef _SYS_VDEV_ANYRAID_H +#define _SYS_VDEV_ANYRAID_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_anyraid_node vdev_anyraid_node_t; + +typedef enum vdev_anyraid_parity_type { + VAP_MIRROR, // includes raid0, i.e. 
a 0-parity mirror + VAP_RAIDZ, + VAP_TYPES, +} vdev_anyraid_parity_type_t; + +typedef struct vdev_anyraid_relocate_task { + list_node_t vart_node; + uint8_t vart_source_disk; + uint8_t vart_dest_disk; + uint16_t vart_source_idx; + uint16_t vart_dest_idx; + uint32_t vart_tile; + uint32_t vart_task; + uint32_t vart_dis_ms; // Only used during resume +} vdev_anyraid_relocate_task_t; + +typedef struct vdev_anyraid_relocate { + list_t var_list; + list_t var_done_list; + uint64_t var_offset; + uint64_t var_task; + uint64_t var_synced_offset; + uint64_t var_synced_task; + uint64_t var_vd; + + anyraid_relocate_state_t var_state; + uint64_t var_start_time; + uint64_t var_end_time; + uint64_t var_bytes_copied; + uint64_t var_outstanding_bytes; + + uint64_t var_failed_offset; + uint64_t var_failed_task; + boolean_t var_waiting_for_resilver; + uint64_t var_offset_pertxg[TXG_SIZE]; + uint64_t var_task_pertxg[TXG_SIZE]; + uint64_t var_bytes_copied_pertxg[TXG_SIZE]; + + kmutex_t var_lock; + kcondvar_t var_cv; + uint64_t var_nonalloc; + uint64_t var_object; +} vdev_anyraid_relocate_t; + +typedef struct vdev_anyraid { + vdev_anyraid_parity_type_t vd_parity_type; + /* + * The parity of the mismatched vdev; 0 for raid0, or the number of + * mirrors. 
+ */ + uint_t vd_nparity; + uint8_t vd_ndata; + uint8_t vd_width; + uint64_t vd_tile_size; + + krwlock_t vd_lock; + avl_tree_t vd_tile_map; + avl_tree_t vd_children_tree; + uint32_t vd_checkpoint_tile; + vdev_anyraid_node_t **vd_children; + vdev_anyraid_relocate_t vd_relocate; + int32_t vd_contracting_leaf; + zfs_rangelock_t vd_rangelock; +} vdev_anyraid_t; + +#define VDEV_ANYRAID_MAX_DISKS (1 << 8) + +/* + * ========================================================================== + * Externally-accessed function definitions + * ========================================================================== + */ +extern void vdev_anyraid_write_map_sync(vdev_t *vd, zio_t *pio, uint64_t txg, + uint64_t *good_writes, int flags, vdev_config_sync_status_t status); + +extern void vdev_anyraid_expand(vdev_t *tvd, vdev_t *newvd); +extern boolean_t vdev_anyraid_mapped(vdev_t *vd, uint64_t offset, uint64_t txg); +uint64_t vdev_anyraid_child_num_tiles(vdev_t *vd, vdev_t *cvd); +uint64_t vdev_anyraid_child_capacity(vdev_t *vd, vdev_t *cvd); +int spa_anyraid_relocate_get_stats(spa_t *spa, + pool_anyraid_relocate_stat_t *pars); +int vdev_anyraid_check_contract(vdev_t *tvd, vdev_t *lvd, dmu_tx_t *tx); +void vdev_anyraid_setup_contract(vdev_t *tvd, dmu_tx_t *tx); +void vdev_anyraid_compact_children(vdev_t *vd); +int vdev_anyraid_load(vdev_t *vd); +void anyraid_dtl_reassessed(vdev_t *vd); + +vdev_anyraid_relocate_t *vdev_anyraid_relocate_status(vdev_t *vd); +void vdev_anyraid_setup_rebalance(vdev_t *vd, dmu_tx_t *tx); +void spa_start_anyraid_relocate_thread(spa_t *spa); +dsl_scan_done_func_t *anyraid_setup_scan_done(spa_t *spa, uint64_t vd_id, + void **arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_ANYRAID_H */ diff --git a/include/sys/vdev_anyraid_impl.h b/include/sys/vdev_anyraid_impl.h new file mode 100644 index 000000000000..9ea76857de69 --- /dev/null +++ b/include/sys/vdev_anyraid_impl.h @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER 
START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Klara Inc. + */ + +#ifndef _SYS_VDEV_ANYRAID_IMPL_H +#define _SYS_VDEV_ANYRAID_IMPL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * ========================================================================== + * Internal structures & definitions + * ========================================================================== + */ +typedef struct anyraid_free_node { + avl_node_t afn_node; + uint16_t afn_tile; +} anyraid_free_node_t; + +typedef struct anyraid_freelist { + avl_tree_t af_list; + uint16_t af_next_off; +} anyraid_freelist_t; + +void anyraid_freelist_create(anyraid_freelist_t *, uint16_t); +void anyraid_freelist_destroy(anyraid_freelist_t *); +void anyraid_freelist_add(anyraid_freelist_t *, uint16_t); +void anyraid_freelist_remove(anyraid_freelist_t *, uint16_t); +uint16_t anyraid_freelist_pop(anyraid_freelist_t *); +uint16_t anyraid_freelist_alloc(const anyraid_freelist_t *); +boolean_t anyraid_freelist_isfree(const anyraid_freelist_t *af, uint16_t off); + +typedef struct vdev_anyraid_node { + avl_node_t van_node; + uint8_t van_id; + anyraid_freelist_t van_freelist; + uint32_t 
van_capacity; +} vdev_anyraid_node_t; + +typedef struct anyraid_tile_node { + list_node_t atn_node; + uint8_t atn_disk; + uint16_t atn_tile_idx; +} anyraid_tile_node_t; + +typedef struct anyraid_tile { + avl_node_t at_node; + uint32_t at_tile_id; + list_t at_list; + uint64_t at_synced; +} anyraid_tile_t; + +typedef struct anyraid_move_arg { + vdev_anyraid_relocate_t *ama_var; + zio_t *ama_zio; + zfs_locked_range_t *ama_lr; + uint64_t ama_txg; + uint64_t ama_size; + uint32_t ama_tid; +} anyraid_move_arg_t; + +typedef struct relocate_phys { + uint64_t rp_done; + uint64_t rp_total; +} relocate_phys_t; + +typedef struct relocate_task_phys { + uint32_t rtp_source_disk; + uint32_t rtp_dest_disk; + uint32_t rtp_source_idx; + uint32_t rtp_dest_idx; + uint32_t rtp_tile; + uint32_t rtp_task; + uint64_t rtp_pad2; +} relocate_task_phys_t; + +_Static_assert(sizeof (relocate_task_phys_t) == 32, + "relocate_task_phys_t size wrong"); + +/* + * The ondisk structure of the anyraid tile map is VDEV_ANYRAID_MAP_COPIES + * copies of the following layout. We store the tile map on every disk, and + * each TXG we update a different copy (txg % VDEV_ANYRAID_MAP_COPIES). + * + * First, we start with a MAX(8KiB, 1 << ashift) region that stores a packed + * nvlist containing the header. The header contains a version number, a disk + * ID, a TXG, the tile size (in bytes), the stripe width/parity of the + * tiles, the length of the mapping (in bytes), the pool guid, and the + * checksum of the mapping. This 8KiB region has an embedded checksum so that + * uses the normal ZIO_CHECKSUM_LABEL algorithm. + * + * Then, there is a tile of size VDEV_ANYRAID_MAP_SIZE. This stores the actual + * mapping. It is a series of entries. Right now, there are two entry types: + * + * 0: Skip entries represent a gap in logical tile IDs. From the current + * tile ID, add the value stored in the lower 24 bits of the skip entry. + * + * 1: Location entries represent a mapped tile. 
Each one represents a single + * physical tile backing the current logical tile. There can be multiple + * physical tiles for one logical tile; that number is the stripe width/ + * parity from the header. These entries contain a 8 bit disk ID and a 16 bit + * offset on that disk. + * + * Here is an example of what the mapping looks like on disk. This is for a + * 1-parity mirror anyraid device: + * + * +----------+----------+----------+----------+----------+----------+ + * | Tile 0 | Tile 0 | Tile 1 | Tile 1 | Tile 2 | Tile 2 | + * | Parity 0 | Parity 1 | Parity 0 | Parity 1 | Parity 0 | Parity 1 | + * | Disk 0 | Disk 1 | Disk 0 | Disk 2 | Disk 0 | Disk 1 | + * | Offset 0 | Offset 0 | Offset 1 | Offset 0 | Offset 2 | Offset 1 | + * +----------+----------+----------+----------+----------+----------+ + * + * Note that each of these entries acutally only contains the "disk" and + * "offset" fields on-disk; the "tile" and "parity" information is derived from + * context (since the entries are stored in tile/offset order, with no gaps + * unless a skip entry is present). + * + * New entry types will be added eventually to store information like parity + * changes. + * + * Because the mapping can be larger than the SPA_MAXBLOCKSIZE, it has to be + * written in multiple IOs; each IO-sized region has their own checksum, which + * is stored in the header block (using the ZIO_CHECKSUM_ANYRAID_MAP algorithm). 
+ */ + +/* + * ========================================================================== + * Header-related definitions + * ========================================================================== + */ +#define VDEV_ANYRAID_HEADER_VERSION "version" +#define VDEV_ANYRAID_HEADER_DISK "disk" +#define VDEV_ANYRAID_HEADER_TXG "txg" +#define VDEV_ANYRAID_HEADER_TILE_SIZE "tile_size" +#define VDEV_ANYRAID_HEADER_LENGTH "length" +#define VDEV_ANYRAID_HEADER_CHECKPOINT "checkpoint_txg" +#define VDEV_ANYRAID_HEADER_DISK_SIZES "sizes" +#define VDEV_ANYRAID_HEADER_RELOC_STATE "state" +#define VDEV_ANYRAID_HEADER_CUR_TASK "cur_task" +#define VDEV_ANYRAID_HEADER_CONTRACTING_LEAF "contracting_leaf" + +#define VART_TILE "tile" +#define VART_SOURCE_DISK "source_disk" +#define VART_SOURCE_OFF "source_off" +#define VART_DEST_DISK "dest_disk" +#define VART_DEST_OFF "dest_off" +#define VART_OFFSET "offset" +#define VART_TASK "task" +/* + * We store the pool guid to prevent disks being reused from an old pool from + * causing any issues. 
+ */ +#define VDEV_ANYRAID_HEADER_GUID "guid" + +#define VDEV_ANYRAID_MAP_HEADER_SIZE(ashift) MAX(8 * 1024, 1ULL << (ashift)) + +#define VDEV_ANYRAID_NVL_BYTES(ashift) \ + (VDEV_ANYRAID_MAP_HEADER_SIZE(ashift) - \ + (VDEV_ANYRAID_MAP_COPIES + 1) * sizeof (zio_eck_t)) + +/* + * ========================================================================== + * Mapping-related definitions + * ========================================================================== + */ +typedef enum anyraid_map_entry_type { + AMET_SKIP = 0, + AMET_LOC = 1, + AMET_TYPES +} anyraid_map_entry_type_t; + +#define AME_TYPE_BITS 8 + +/* + * ========================================================================== + * Skip entry definitions and functions + * ========================================================================== + */ +typedef uint32_t anyraid_map_skip_entry_t; + +#define AMSE_TILE_BITS 24 + +static inline void +amse_set_type(anyraid_map_skip_entry_t *amse) +{ + BF32_SET(*amse, 0, AME_TYPE_BITS, AMET_SKIP); +} + +static inline void +amse_set_skip_count(anyraid_map_skip_entry_t *amse, uint32_t skip_count) +{ + BF32_SET(*amse, AME_TYPE_BITS, AMSE_TILE_BITS, skip_count); +} + +static inline uint32_t +amse_get_skip_count(anyraid_map_skip_entry_t *amse) +{ + return (BF32_GET(*amse, AME_TYPE_BITS, AMSE_TILE_BITS)); +} + +/* + * ========================================================================== + * Location entry definitions and functions + * ========================================================================== + */ +typedef uint32_t anyraid_map_loc_entry_t; + +#define AMLE_DISK_BITS 8 +#define AMLE_OFFSET_BITS 16 + +static inline void +amle_set_type(anyraid_map_loc_entry_t *amle) +{ + BF32_SET(*amle, 0, AME_TYPE_BITS, AMET_LOC); +} + +static inline void +amle_set_disk(anyraid_map_loc_entry_t *amle, uint8_t disk) +{ + BF32_SET(*amle, AME_TYPE_BITS, AMLE_DISK_BITS, disk); +} + +static inline uint32_t +amle_get_disk(anyraid_map_loc_entry_t *amle) +{ + return 
(BF32_GET(*amle, AME_TYPE_BITS, AMLE_DISK_BITS)); +} + +static inline void +amle_set_offset(anyraid_map_loc_entry_t *amle, uint8_t offset) +{ + BF32_SET(*amle, (AME_TYPE_BITS + AMLE_DISK_BITS), AMLE_OFFSET_BITS, + offset); +} + +static inline uint32_t +amle_get_offset(anyraid_map_loc_entry_t *amle) +{ + return (BF32_GET(*amle, (AME_TYPE_BITS + AMLE_DISK_BITS), + AMLE_OFFSET_BITS)); +} + +/* + * ========================================================================== + * Overall mapping definitions + * ========================================================================== + */ + +typedef struct anyraid_map_entry { + union { + anyraid_map_skip_entry_t ame_amse; + anyraid_map_loc_entry_t ame_amle; + } ame_u; +} anyraid_map_entry_t; + +static inline anyraid_map_entry_type_t +ame_get_type(anyraid_map_entry_t *ame) +{ + return (BF32_GET(ame->ame_u.ame_amle, 0, AME_TYPE_BITS)); +} + +#define VDEV_ANYRAID_MAX_TPD (1 << 16) +#define VDEV_ANYRAID_MAX_TILES (VDEV_ANYRAID_MAX_DISKS * VDEV_ANYRAID_MAX_TPD) +/* + * The worst case scenario here is that we have a loc entry for every single + * tile (0 skips). At that point, we're using 4 bytes per tile. + * That gives us 2^24 * 4 bytes = 64 MB to store the entire map. 
+ */ +#define VDEV_ANYRAID_MAP_SIZE (sizeof (anyraid_map_loc_entry_t) * \ + VDEV_ANYRAID_MAX_TILES) +#define VDEV_ANYRAID_SINGLE_MAP_SIZE(ashift) \ + ((VDEV_ANYRAID_MAP_HEADER_SIZE(ashift) + VDEV_ANYRAID_MAP_SIZE)) +#define VDEV_ANYRAID_MAP_COPIES 4 +#define VDEV_ANYRAID_START_COPIES (VDEV_ANYRAID_MAP_COPIES / 2) +#define VDEV_ANYRAID_TOTAL_MAP_SIZE(ashift) (VDEV_ANYRAID_MAP_COPIES * \ + VDEV_ANYRAID_SINGLE_MAP_SIZE(ashift)) +#define VDEV_ANYRAID_START_OFFSET(ashift) VDEV_ANYRAID_START_COPIES * \ + VDEV_ANYRAID_SINGLE_MAP_SIZE(ashift) + +_Static_assert(VDEV_ANYRAID_TOTAL_MAP_SIZE(9) % SPA_MINBLOCKSIZE == 0, ""); +_Static_assert(VDEV_ANYRAID_TOTAL_MAP_SIZE(12) % SPA_MINBLOCKSIZE == 0, ""); +_Static_assert(VDEV_ANYRAID_MAP_SIZE % SPA_MAXBLOCKSIZE == 0, ""); + +/* + * These functions are exposed for ZDB. + */ + +typedef struct anyraid_header { + abd_t *ah_abd; + char *ah_buf; + nvlist_t *ah_nvl; +} anyraid_header_t; + +int vdev_anyraid_pick_best_mapping(vdev_t *cvd, + uint64_t *out_txg, anyraid_header_t *out_header, int *out_mapping); +int vdev_anyraid_open_header(vdev_t *cvd, int header, + anyraid_header_t *out_header); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_ANYRAID_IMPL_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 634594aca124..7a1bedadec1f 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -70,7 +70,8 @@ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); -typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); +typedef uint64_t vdev_min_asize_func_t(vdev_t *pvd, vdev_t *cvd); +typedef uint64_t vdev_min_attach_size_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); @@ -94,6 +95,7 @@ typedef uint64_t 
vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, uint64_t size, uint64_t max_segment); typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, uint64_t *sizep); +typedef void vdev_metaslab_size_func_t(vdev_t *vd, uint64_t *shiftp); typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); typedef uint64_t vdev_nparity_func_t(vdev_t *vd); typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); @@ -106,6 +108,7 @@ typedef const struct vdev_ops { vdev_asize_func_t *vdev_op_psize_to_asize; vdev_asize_func_t *vdev_op_asize_to_psize; vdev_min_asize_func_t *vdev_op_min_asize; + vdev_min_attach_size_func_t *vdev_op_min_attach_size; vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; @@ -121,6 +124,7 @@ typedef const struct vdev_ops { vdev_nparity_func_t *vdev_op_nparity; vdev_ndisks_func_t *vdev_op_ndisks; vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post; + vdev_metaslab_size_func_t *vdev_op_metaslab_size; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -426,6 +430,7 @@ struct vdev { boolean_t vdev_kobj_flag; /* kobj event record */ boolean_t vdev_attaching; /* vdev attach ashift handling */ boolean_t vdev_is_blkdev; /* vdev is backed by block device */ + boolean_t vdev_shrinking; /* vdev is currently shrinking */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ @@ -619,6 +624,10 @@ extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; +extern vdev_ops_t vdev_anymirror_ops; +extern vdev_ops_t vdev_anyraidz_ops; + +extern zio_vsd_ops_t vdev_mirror_vsd_ops; /* * Common size functions @@ -627,8 +636,10 @@ extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); extern uint64_t 
vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); -extern uint64_t vdev_default_min_asize(vdev_t *vd); +extern uint64_t vdev_default_min_asize(vdev_t *pvd, vdev_t *cvd); +extern uint64_t vdev_default_min_attach_size(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); +extern uint64_t vdev_get_min_attach_size(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); extern uint64_t vdev_get_nparity(vdev_t *vd); extern uint64_t vdev_get_ndisks(vdev_t *vd); diff --git a/include/sys/vdev_mirror.h b/include/sys/vdev_mirror.h new file mode 100644 index 000000000000..f48cc333e8e0 --- /dev/null +++ b/include/sys/vdev_mirror.h @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara Inc. 
+ */ + +#ifndef _SYS_VDEV_MIRROR_H +#define _SYS_VDEV_MIRROR_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Virtual device vector for mirroring. + */ +typedef struct mirror_child { + vdev_t *mc_vd; + abd_t *mc_abd; + uint64_t mc_offset; + int mc_error; + int mc_load; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; + uint8_t mc_rebuilding; +} mirror_child_t; + +typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; + int mm_children; + boolean_t mm_resilvering; + boolean_t mm_rebuilding; + boolean_t mm_root; + mirror_child_t mm_child[]; +} mirror_map_t; + +mirror_map_t *vdev_mirror_map_alloc(int children, boolean_t resilvering, + boolean_t root); +void vdev_mirror_io_start_impl(zio_t *zio, mirror_map_t *mm); +void vdev_mirror_io_done(zio_t *zio); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_MIRROR_H */ diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 8c8dcfb077f6..327a35860327 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -406,6 +406,9 @@ gf_exp4(gf_log_t exp) return ((gf_t)vdev_raidz_pow2[(2 * exp) % 255]); } +void vdev_raidz_io_start_impl(zio_t *zio, raidz_map_t *rm, + uint64_t logical_width, uint64_t physical_width); + #ifdef __cplusplus } #endif diff --git a/include/sys/zio.h b/include/sys/zio.h index acb0a03a36b2..3c37daa1a8de 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -107,6 +107,7 @@ enum zio_checksum { ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, ZIO_CHECKSUM_BLAKE3, + ZIO_CHECKSUM_ANYRAID_MAP, ZIO_CHECKSUM_FUNCTIONS }; @@ -213,6 +214,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_ALLOC_THROTTLED (1ULL << 14) +#define ZIO_FLAG_ZILWRITE (1ULL << 15) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -220,29 +222,29 @@ typedef uint64_t 
zio_flag_t; /* * Flags inherited by vdev children. */ -#define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */ -#define ZIO_FLAG_PROBE (1ULL << 16) -#define ZIO_FLAG_TRYHARD (1ULL << 17) -#define ZIO_FLAG_OPTIONAL (1ULL << 18) -#define ZIO_FLAG_DIO_READ (1ULL << 19) +#define ZIO_FLAG_IO_RETRY (1ULL << 16) /* must be first for INHERIT */ +#define ZIO_FLAG_PROBE (1ULL << 17) +#define ZIO_FLAG_TRYHARD (1ULL << 18) +#define ZIO_FLAG_OPTIONAL (1ULL << 19) +#define ZIO_FLAG_DIO_READ (1ULL << 20) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ -#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */ -#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21) -#define ZIO_FLAG_IO_BYPASS (1ULL << 22) -#define ZIO_FLAG_IO_REWRITE (1ULL << 23) -#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24) -#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25) -#define ZIO_FLAG_GANG_CHILD (1ULL << 26) -#define ZIO_FLAG_DDT_CHILD (1ULL << 27) -#define ZIO_FLAG_GODFATHER (1ULL << 28) -#define ZIO_FLAG_NOPWRITE (1ULL << 29) -#define ZIO_FLAG_REEXECUTED (1ULL << 30) -#define ZIO_FLAG_DELEGATED (1ULL << 31) -#define ZIO_FLAG_PREALLOCATED (1ULL << 32) +#define ZIO_FLAG_DONT_QUEUE (1ULL << 21) /* must be first for INHERIT */ +#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 22) +#define ZIO_FLAG_IO_BYPASS (1ULL << 23) +#define ZIO_FLAG_IO_REWRITE (1ULL << 24) +#define ZIO_FLAG_RAW_COMPRESS (1ULL << 25) +#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 26) +#define ZIO_FLAG_GANG_CHILD (1ULL << 27) +#define ZIO_FLAG_DDT_CHILD (1ULL << 28) +#define ZIO_FLAG_GODFATHER (1ULL << 29) +#define ZIO_FLAG_NOPWRITE (1ULL << 30) +#define ZIO_FLAG_REEXECUTED (1ULL << 31) +#define ZIO_FLAG_DELEGATED (1ULL << 32) +#define ZIO_FLAG_PREALLOCATED (1ULL << 33) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) @@ -513,6 +515,7 @@ struct zio { vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; + void *io_aux_vsd; // 
Used by anyraid metaslab_class_t *io_metaslab_class; /* dva throttle class */ enum zio_qstate io_queue_state; /* vdev queue state */ diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index f07ad2605e31..b68c712943c4 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -140,8 +140,8 @@ extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *, enum zio_checksum, struct abd *, uint64_t); -extern int zio_checksum_error_impl(spa_t *, const blkptr_t *, enum zio_checksum, - struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); +extern int zio_checksum_error_impl(zio_t *, enum zio_checksum, struct abd *, + uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 56382ca85b55..c44671673fbd 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -90,6 +90,7 @@ typedef enum spa_feature { SPA_FEATURE_DYNAMIC_GANG_HEADER, SPA_FEATURE_BLOCK_CLONING_ENDIAN, SPA_FEATURE_PHYSICAL_REWRITE, + SPA_FEATURE_ANYRAID, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 876433c0ba58..2f034ca98fba 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -690,7 +690,7 @@ - + @@ -6258,7 +6258,10 @@ - + + + + @@ -6542,7 +6545,8 @@ - + + @@ -9909,8 +9913,8 @@ - - + + @@ -9971,7 +9975,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index e12308b01ab1..ca310e62968f 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1225,7 +1226,8 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) strncmp(pool, "raidz", 5) 
== 0 || strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || - strcmp(pool, "log") == 0)) { + strcmp(pool, "log") == 0 || + strncmp(pool, "anymirror", 9) == 0)) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); @@ -1616,6 +1618,18 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, "minimum size (%s)"), buf); } return (zfs_error(hdl, EZFS_BADDEV, errbuf)); + case ENOLCK: + /* + * This occurs when one of the devices is an anyraid + * device that can't hold a single tile. + * Unfortunately, we can't detect which device was the + * problem device since there's no reliable way to + * determine device size from userland. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more anyraid devices cannot store " + "any tiles (see 'zfs_anyraid_min_tile_size')")); + return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ENOSPC: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -1855,7 +1869,18 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift) } (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; - + case ENOLCK: + /* + * This occurs when one of the devices is an anyraid + * device that can't hold a single tile. + * Unfortunately, we can't detect which device was the + * problem device since there's no reliable way to + * determine device size from userland. 
+ */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more anyraid devices cannot store " + "any tiles (see 'zfs_anyraid_min_tile_size')")); + return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to add these vdevs")); @@ -3211,7 +3236,11 @@ zpool_vdev_is_interior(const char *name) strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_ROOT, strlen(VDEV_TYPE_ROOT)) == 0 || - strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) + strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0 || + strncmp(name, + VDEV_TYPE_ANYMIRROR, strlen(VDEV_TYPE_ANYMIRROR)) == 0 || + strncmp(name, + VDEV_TYPE_ANYRAIDZ, strlen(VDEV_TYPE_ANYRAIDZ)) == 0) return (B_TRUE); if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 && @@ -3691,6 +3720,22 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, free(newname); return (zfs_error(hdl, EZFS_BADTARGET, errbuf)); } + uint64_t min_tile_size = 0; + if (strncmp(fnvlist_lookup_string(tgt, ZPOOL_CONFIG_TYPE), "any", + 3) == 0) { + char mts[32]; + VERIFY0(zpool_get_vdev_prop(zhp, old_disk, + VDEV_PROP_ANYRAID_TILE_SIZE, NULL, mts, 32, NULL, B_TRUE)); + VERIFY3S(sscanf(mts, "%llu", (u_longlong_t *)&min_tile_size), + ==, 1); + /* + * Unfortunately it's difficult to get the definitions that + * would allow us to do this cleanly into userland. We need + * space for a tile (above) plus the mapping (256MiB) plus the + * labels (4.5MiB). + */ + min_tile_size += 261 * 1024 * 1024; + } free(newname); @@ -3788,6 +3833,19 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; + case ENOLCK: { + /* + * This occurs when one of the devices is an anyraid + * device that can't hold a single tile. 
+ */ + char buf[32]; + ASSERT(min_tile_size != 0); + zfs_nicenum(min_tile_size, buf, 32); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "new device cannot store any tiles (min size %s)"), buf); + return (zfs_error(hdl, EZFS_BADDEV, errbuf)); + } + case ENAMETOOLONG: /* * The resulting top-level vdev spec won't fit in the label. @@ -4571,12 +4629,26 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, path = type; /* - * If it's a raidz device, we need to stick in the parity level. + * If it's a raidz or anyraid device, we need to stick in the + * parity level. */ - if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(path, VDEV_TYPE_RAIDZ) == 0 || + strcmp(path, VDEV_TYPE_ANYMIRROR) == 0 || + strcmp(path, VDEV_TYPE_ANYRAIDZ) == 0) { value = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); - (void) snprintf(buf, sizeof (buf), "%s%llu", path, - (u_longlong_t)value); + uint8_t type; + if (nvlist_lookup_uint8(nv, + ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, &type) == 0 && + type == VAP_RAIDZ) { + uint8_t ndata = fnvlist_lookup_uint8(nv, + ZPOOL_CONFIG_ANYRAID_NDATA); + (void) snprintf(buf, sizeof (buf), + "%s%llu:%u", path, + (u_longlong_t)value, ndata); + } else { + (void) snprintf(buf, sizeof (buf), "%s%llu", + path, (u_longlong_t)value); + } path = buf; } @@ -5460,6 +5532,10 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); intval = fnvlist_lookup_uint64(nv, ZPROP_VALUE); + } else if (prop == VDEV_PROP_ANYRAID_CAP_TILES || + prop == VDEV_PROP_ANYRAID_NUM_TILES || + prop == VDEV_PROP_ANYRAID_TILE_SIZE) { + return (ENOENT); } else { src = ZPROP_SRC_DEFAULT; intval = vdev_prop_default_numeric(prop); @@ -5490,6 +5566,7 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, case VDEV_PROP_BYTES_FREE: case VDEV_PROP_BYTES_CLAIM: case VDEV_PROP_BYTES_TRIM: + case VDEV_PROP_ANYRAID_TILE_SIZE: if 
(literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); @@ -5779,3 +5856,150 @@ zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit, return (0); } + +static boolean_t +strstarts(const char *str, const char *prefix) +{ + return (strncmp(str, prefix, strlen(prefix)) == 0); +} + +// TODO can't do multiple at once +int +zpool_rebalance(zpool_handle_t *zhp, char **vdev_names, int count) +{ + int ret = 0; + uint64_t *guids = NULL; + if (count != 0) { + guids = umem_alloc(sizeof (*guids) * count, UMEM_DEFAULT); + if (guids == NULL) + return (no_memory(zhp->zpool_hdl)); + } + char errbuf[ERRBUFLEN]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot rebalance vdev(s) on '%s'"), zhp->zpool_name); + libzfs_handle_t *hdl = zhp->zpool_hdl; + for (int i = 0; i < count; i++) { + if (!(strstarts(vdev_names[i], VDEV_TYPE_ANYMIRROR) || + strstarts(vdev_names[i], VDEV_TYPE_ANYRAIDZ))) { + zfs_error_fmt(hdl, EZFS_BADDEV, dgettext(TEXT_DOMAIN, + "non-anyraid device specified")); + } + if ((ret = zpool_vdev_guid(zhp, vdev_names[i], &guids[i])) != 0) + break; + } + + if (ret != 0) { + if (guids) + umem_free(guids, sizeof (*guids) * count); + return (ret); + } + ret = lzc_pool_rebalance(zpool_get_name(zhp), guids, count); + if (guids) + umem_free(guids, sizeof (*guids) * count); + switch (ret) { + case ENOENT: + zfs_error_fmt(hdl, EZFS_NOENT, + dgettext(TEXT_DOMAIN, "no anyraid vdevs found")); + break; + case EINVAL: + zfs_error_fmt(hdl, EZFS_BADDEV, + dgettext(TEXT_DOMAIN, + "non-anyraid device specified")); + break; + case EALREADY: + zfs_error_fmt(hdl, EZFS_BUSY, + dgettext(TEXT_DOMAIN, "specified device already " + "rebalancing")); + break; + case 0: + break; + default: + { + libzfs_handle_t *hdl = zhp->zpool_hdl; + (void) zpool_standard_error(hdl, errno, errbuf); + } + } + return (ret); +} + +int +zpool_contract(zpool_handle_t *zhp, const char *anyraid_vdev_name, + const char *leaf_vdev_name) +{ + int ret = 0; + uint64_t avd_guid, 
lvd_guid; + char errbuf[ERRBUFLEN]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot perform contraction for vdev(s) on '%s'"), zhp->zpool_name); + libzfs_handle_t *hdl = zhp->zpool_hdl; + if (!(strstarts(anyraid_vdev_name, VDEV_TYPE_ANYMIRROR) || + strstarts(anyraid_vdev_name, VDEV_TYPE_ANYRAIDZ))) { + zfs_error_fmt(hdl, EZFS_BADDEV, dgettext(TEXT_DOMAIN, + "non-anyraid device specified")); + } + + if ((ret = zpool_vdev_guid(zhp, anyraid_vdev_name, &avd_guid)) != 0) + return (ret); + + if ((ret = zpool_vdev_guid(zhp, leaf_vdev_name, &lvd_guid)) != 0) + return (ret); + + ret = lzc_pool_contract(zpool_get_name(zhp), avd_guid, lvd_guid); + + switch (ret) { + case ENOENT: + zfs_error_fmt(hdl, EZFS_NOENT, + dgettext(TEXT_DOMAIN, "no anyraid vdev found")); + break; + case EINVAL: + zfs_error_fmt(hdl, EZFS_BADDEV, + dgettext(TEXT_DOMAIN, + "non-anyraid device specified")); + break; + case ENXIO: + zfs_error_fmt(hdl, EZFS_INVALCONFIG, + dgettext(TEXT_DOMAIN, + "%s is not a child of %s"), leaf_vdev_name, + anyraid_vdev_name); + break; + case ENOSPC: + zfs_error_fmt(hdl, EZFS_NOSPC, + dgettext(TEXT_DOMAIN, "insufficient free tiles to " + "remove %s from %s"), leaf_vdev_name, + anyraid_vdev_name); + break; + case EXFULL: + zfs_error_fmt(hdl, EZFS_NOSPC, + dgettext(TEXT_DOMAIN, "could not find valid " + "relocation target for all tiles when " + "removing %s from %s"), leaf_vdev_name, + anyraid_vdev_name); + break; + case EBUSY: + zfs_error_fmt(hdl, EZFS_CHECKPOINT_EXISTS, + dgettext(TEXT_DOMAIN, "cannot perform contraction " + "while a checkpoint exists")); + break; + case EALREADY: + zfs_error_fmt(hdl, EZFS_ANYRAID_RELOCATE_IN_PROGRESS, + dgettext(TEXT_DOMAIN, "another anyraid relocate " + "operation is already in progress")); + break; + case ENODEV: + zfs_error_fmt(hdl, EZFS_CONTRACT_BELOW_WIDTH, + dgettext(TEXT_DOMAIN, "cannot contract %s because " + "its child count is equal to its logical width"), + anyraid_vdev_name); + break; + case 0: + 
break; + default: + { + libzfs_handle_t *hdl = zhp->zpool_hdl; + (void) zpool_standard_error(hdl, errno, errbuf); + } + } + return (ret); +} \ No newline at end of file diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 9347aa7c6a28..56e584de5e2d 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1994,3 +1994,41 @@ lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount) return (error); } + +int +lzc_pool_rebalance(const char *zpool, const uint64_t *vdevs, int count) +{ + int error; + + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + if (count != 0) + fnvlist_add_uint64_array(args, "vdevs", vdevs, count); + + error = lzc_ioctl(ZFS_IOC_POOL_REBALANCE, zpool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + +int +lzc_pool_contract(const char *zpool, uint64_t avd_guid, uint64_t lvd_guid) +{ + int error; + + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_uint64(args, "anyraid_vdev", avd_guid); + fnvlist_add_uint64(args, "leaf_vdev", lvd_guid); + + error = lzc_ioctl(ZFS_IOC_POOL_CONTRACT, zpool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} \ No newline at end of file diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index f8f1282683bb..3a92c0db38b5 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -148,6 +148,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/vdev_label.c \ module/zfs/vdev_mirror.c \ module/zfs/vdev_missing.c \ + module/zfs/vdev_anyraid.c \ module/zfs/vdev_queue.c \ module/zfs/vdev_raidz.c \ module/zfs/vdev_raidz_math.c \ diff --git a/man/Makefile.am b/man/Makefile.am index ae7b707152ec..20fe300631e9 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -71,6 +71,7 @@ dist_man_MANS = \ %D%/man8/zpool-attach.8 \ %D%/man8/zpool-checkpoint.8 \ %D%/man8/zpool-clear.8 \ + %D%/man8/zpool-contract.8 \ 
%D%/man8/zpool-create.8 \ %D%/man8/zpool-destroy.8 \ %D%/man8/zpool-detach.8 \ @@ -87,6 +88,7 @@ dist_man_MANS = \ %D%/man8/zpool-offline.8 \ %D%/man8/zpool-online.8 \ %D%/man8/zpool-prefetch.8 \ + %D%/man8/zpool-rebalance.8 \ %D%/man8/zpool-reguid.8 \ %D%/man8/zpool-remove.8 \ %D%/man8/zpool-reopen.8 \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 97c0ac6ab3e3..4ed34acf5156 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -665,6 +665,17 @@ Logical ashift for file-based devices. .It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Physical ashift for file-based devices. . +.It Sy zfs_anyraid_min_tile_size Ns = Ns Sy 16 GiB Pq u64 +Minimum size of the tiles that anyraid will use to do its mapping. +Smaller tile sizes let data be spread more evenly across devices, and make +smaller devices use more of their capacity. +Larger tile sizes allow for larger disks to be used in the future, since a given +device can only store 16384 tiles. +The minimum valid tile size is 16MiB, since a metaslab always needs to be able +to fit in a single tile. +Note that any disk that is smaller than the tile size plus the size of the +anyraid map (256MiB) cannot be added to an anyraid vdev. +. .It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int If set, when we start iterating over a ZAP object, prefetch the entire object (all leaf blocks). diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 3b65a52ae630..37fcdff06d0d 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -211,6 +211,21 @@ Never adds I/O requests to the vdev queue. This is not recommended for vdevs backed by spinning disks as it could result in starvation. .El +.It anyraid_tile_capacity +Only valid for +.Sy AnyRAID +vdevs and their leaf vdevs. +The number of physical tiles that the vdev can hold. +.It anyraid_tile_count +Only valid for +.Sy AnyRAID +vdevs and their leaf vdevs. +The number of physical tiles that are currently allocated on the vdev.
+.It anyraid_tile_size +Only valid for +.Sy AnyRAID +vdevs and their leaf vdevs. +The size of the tiles in use on this vdev. .El .Ss User Properties In addition to the standard native properties, ZFS supports arbitrary user diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 07b78dda2396..d5966244b1bc 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -165,6 +165,49 @@ An error is returned when the provided number of children differs. The number of distributed hot spares. Defaults to zero. .El +.It Sy anymirror , anymirror0 , anymirror1 , anymirror2 , ... anyMirrorN +A new device type that allows for mirror-parity redundancy while using devices +of different sizes. +An AnyRAID vdev works by dividing each of the underlying disks that make it up +into +.Sy tiles \. +The tiles are then each mirrored at the desired parity level. +This allows for full redundancy, since tiles are allocated from independent +disks, while enabling maximum space usage by allocating more tiles from the +disks with the most free space. +In addition, the device can be expanded by attaching new disks, and new tiles +will be allocated from those disks. +The vdev class as a whole is referred to as AnyRAID; anymirror vdevs +specifically use mirror-style parity. +.Pp +The desired parity value (N) is specified in the +.Sy anymirrorN +string you pass when creating/adding the vdev group. +The parity value cannot exceed the maximum number of vdevs in the group minus +one. +.Sy anymirror +is a synonym for +.Sy anymirror1 +, which is the 2-way mirror parity version (1 parity tile). +.Sy anymirror2 +is a 3-way mirror (2 parity tiles). +.Sy anymirror3 +is a 4-way mirror (3 parity tiles). +The pattern continues in the expected way. +.Sy anymirror0 +is striped (no parity tiles), and is primarily intended for testing. +.It Sy anyraidz1:N , anyraidz2:N, anyraidz3:N +A new device type that allows for raidz-parity redundancy while using devices +of different sizes. 
+These devices work in a similar way to anymirror devices, but use wider +stripes to take advantage of raidz-style parity, rather than mirror parity. +.Pp +The desired parity value (X) and number of data disks (Y) are specified in the +.Sy anyraidzX:Y +string you pass when creating/adding the vdev group. +This will store information in the same basic way, and with the same +efficiency tradeoffs, as a raidzX device with X + Y disks. +The total width (X + Y) cannot exceed the maximum number of vdevs in the group. .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 2d1742811fb8..498ac7164d00 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -100,6 +100,9 @@ .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache .Ar poolname +.Nm +.Fl -anyraid-map +.Ar poolname Op Ar vdev Ns … . .Sh DESCRIPTION The @@ -451,6 +454,8 @@ Display histograms of per-vdev BRT refcounts. Dump the contents of the block reference tables. .It Fl u , -uberblock Display the current uberblock. +.It Fl -anyraid-map +Display information about the mappings of one or all anyraid vdevs in the pool. .El .Pp Other options: diff --git a/man/man8/zpool-contract.8 b/man/man8/zpool-contract.8 new file mode 100644 index 000000000000..e9fea09dfa74 --- /dev/null +++ b/man/man8/zpool-contract.8 @@ -0,0 +1,50 @@ +.\" SPDX-License-Identifier: CDDL-1.0 +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or https://opensource.org/licenses/CDDL-1.0. +.\" See the License for the specific language governing permissions +.\" and limitations under the License.
+.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2026 Klara, Inc. +.\" +.Dd April 2, 2026 +.Dt ZPOOL-CONTRACT 8 +.Os +. +.Sh NAME +.Nm zpool-contract +.Nd contract an AnyRAID vdev in a ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm contract +.Ar pool Ar anyraid-vdev Ar leaf-vdev +. +.Sh DESCRIPTION +Starts a contraction operation on an AnyRAID vdev. +If a relocate operation is already in progress, this command will fail. +This operation will proceed in the background, moving tiles from the specified +leaf vdev to other vdevs in the anyraid vdev. +When the relocate finishes, the pool will be scrubbed to verify the integrity +of all the moved data. +Once that process is complete, the specified leaf vdev will be removed from the +pool, and the available space will be reduced. +Note that if for whatever reason valid homes cannot be found for all of the +tiles that need to be moved, the command will fail immediately. +. +.Sh SEE ALSO +.Xr zpoolconcepts 7 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-rebalance.8 b/man/man8/zpool-rebalance.8 new file mode 100644 index 000000000000..08e578258bf0 --- /dev/null +++ b/man/man8/zpool-rebalance.8 @@ -0,0 +1,47 @@ +.\" SPDX-License-Identifier: CDDL-1.0 +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or https://opensource.org/licenses/CDDL-1.0. 
+.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2026 Klara, Inc. +.\" +.Dd April 2, 2026 +.Dt ZPOOL-REBALANCE 8 +.Os +. +.Sh NAME +.Nm zpool-rebalance +.Nd rebalance an AnyRAID vdev in a ZFS storage pool +.Sh SYNOPSIS +.Nm zpool +.Cm rebalance +.Ar pool Ar anyraid-vdev +. +.Sh DESCRIPTION +Starts a rebalance operation on an AnyRAID vdev. +If a relocate operation is already in progress, this command will fail. +This operation will proceed in the background, moving tiles from more full +leaf vdevs to less full ones in order to optimize potential space usage and +balance read workload across devices. +When the relocate finishes, the pool will be scrubbed to verify the integrity +of all the moved data. +.
+.Sh SEE ALSO +.Xr zpoolconcepts 7 , +.Xr zpool-status 8 diff --git a/module/Kbuild.in b/module/Kbuild.in index de093df3d5c7..5130f08d5a63 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -397,6 +397,7 @@ ZFS_OBJS := \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ + vdev_anyraid.o \ vdev_queue.o \ vdev_raidz.o \ vdev_raidz_math.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 7a1bef6466db..96bdfc2faa30 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -328,6 +328,7 @@ SRCS+= abd.c \ vdev_label.c \ vdev_mirror.c \ vdev_missing.c \ + vdev_anyraid.c \ vdev_queue.c \ vdev_raidz.c \ vdev_raidz_math_avx2.c \ diff --git a/module/avl/avl.c b/module/avl/avl.c index 67cbcd3adeec..793c7a5f9d3e 100644 --- a/module/avl/avl.c +++ b/module/avl/avl.c @@ -180,8 +180,10 @@ avl_first(avl_tree_t *tree) avl_node_t *prev = NULL; size_t off = tree->avl_offset; - for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) + for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) { + ASSERT(node == NULL || (uintptr_t)node > 64); prev = node; + } if (prev != NULL) return (AVL_NODE2DATA(prev, off)); @@ -246,7 +248,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) * "void *" of the found tree node */ void * -avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) +avl_find(const avl_tree_t *tree, const void *value, avl_index_t *where) { avl_node_t *node; avl_node_t *prev = NULL; @@ -906,14 +908,14 @@ avl_destroy(avl_tree_t *tree) * Return the number of nodes in an AVL tree. 
*/ ulong_t -avl_numnodes(avl_tree_t *tree) +avl_numnodes(const avl_tree_t *tree) { ASSERT(tree); return (tree->avl_numnodes); } boolean_t -avl_is_empty(avl_tree_t *tree) +avl_is_empty(const avl_tree_t *tree) { ASSERT(tree); return (tree->avl_numnodes == 0); diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index a06e00d73738..d0a0844cdbe4 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -93,6 +93,8 @@ #include SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, anyraid, CTLFLAG_RW, 0, + "ZFS AnyRAID VDEV"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0, diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index be30c58cf72b..a4dfc95375f4 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1291,6 +1291,7 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_geom_io_start, .vdev_op_io_done = vdev_geom_io_done, diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 66e10584ab5e..c4b9dea468e6 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1297,6 +1297,7 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 6ba9892eeb64..e3a96fad32ee 100644 --- a/module/zcommon/zfeature_common.c +++ 
b/module/zcommon/zfeature_common.c @@ -810,6 +810,10 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, physical_rewrite_deps, sfeatures); } + zfeature_register(SPA_FEATURE_ANYRAID, + "com.klarasystems:anyraid", "anyraid", "Support for anyraid VDEV", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index deb0547c1084..70167c65124c 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -445,7 +445,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0 || - strcmp(pool, "draid") == 0) { + strcmp(pool, "draid") == 0 || + strcmp(pool, "anymirror") == 0 || + strcmp(pool, "anyraid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 2c6515e93676..3f34de491286 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -449,6 +449,15 @@ vdev_prop_init(void) zprop_register_number(VDEV_PROP_BYTES_TRIM, "trim_bytes", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "TRIMBYTE", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_ANYRAID_CAP_TILES, + "anyraid_tile_capacity", 0, PROP_READONLY, ZFS_TYPE_VDEV, + "", "TILECAP", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_ANYRAID_NUM_TILES, + "anyraid_tile_count", 0, PROP_READONLY, ZFS_TYPE_VDEV, + "", "NUMTILES", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_ANYRAID_TILE_SIZE, + "anyraid_tile_size", 0, PROP_READONLY, ZFS_TYPE_VDEV, + "", "TILESIZE", B_FALSE, sfeatures); /* default numeric properties */ zprop_register_number(VDEV_PROP_CHECKSUM_N, "checksum_n", UINT64_MAX, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5db549bcbba6..0d8d5da6145a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1514,9 +1514,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) * 
generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ - return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, - zio->io_offset, NULL) == 0); + return (zio_checksum_error_impl(zio, BP_GET_CHECKSUM(zio->io_bp), + zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 5cb02831a251..d6f95d1e2e69 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2052,7 +2052,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, - dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_ZILWRITE, + zb)); return (0); } @@ -2220,8 +2221,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL, - dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, - &zb)); + dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_ZILWRITE, &zb)); return (0); } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index ae36161dd1b6..e6f96dc46dab 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -650,6 +650,14 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) spa_scan_stat_init(spa); vdev_scan_stat_init(spa->spa_root_vdev); + if (spa->spa_anyraid_relocate != NULL && + spa->spa_anyraid_relocate->var_state == ARS_SCRUBBING) { + void *arg; + scn->scn_done = anyraid_setup_scan_done(spa, + spa->spa_anyraid_relocate->var_vd, &arg); + scn->scn_done_arg = arg; + } + return (0); } @@ -864,6 +872,15 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) return 
(0); } +void +dsl_scan_set_done_func(dsl_pool_t *dp, dsl_scan_done_func_t *done, + void *done_arg) +{ + dsl_scan_t *scn = dp->dp_scan; + scn->scn_done = done; + scn->scn_done_arg = done_arg; +} + void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { @@ -893,6 +910,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) } else { scn->scn_phys.scn_max_txg = setup_sync_arg->txgend; } + scn->scn_done = setup_sync_arg->done; + scn->scn_done_arg = setup_sync_arg->done_arg; scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; @@ -1158,6 +1177,9 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) } } + if (scn->scn_done) + scn->scn_done(spa, tx, scn->scn_done_arg); + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_active = B_FALSE; @@ -4398,6 +4420,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) .func = POOL_SCAN_SCRUB, .txgstart = 0, .txgend = 0, + .done = NULL, + .done_arg = NULL, }; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 0c359928a8d4..17cd90f62c68 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -3271,7 +3272,8 @@ metaslab_space_weight(metaslab_t *msp) * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. 
*/ - if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { + if ((!vd->vdev_nonrot && metaslab_lba_weighting_enabled) || + vdev_is_anyraid(vd)) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } @@ -3431,6 +3433,22 @@ metaslab_segment_weight(metaslab_t *msp) weight = metaslab_weight_from_spacemap(msp); } + /* + * Anyraid vdevs strongly prefer allocations from earlier regions, in + * order to prevent premature region placement. While this optimization + * is not usually good for segment-based weighting, we enable it for + * that case specifically. + */ + vdev_t *vd = mg->mg_vd; + if ((vdev_is_anyraid(vd) || metaslab_lba_weighting_enabled) && + WEIGHT_GET_INDEX(weight) > SPA_MAXBLOCKSHIFT) { + uint64_t id = msp->ms_id; + uint64_t count = vd->vdev_ms_count; + WEIGHT_SET_INDEX(weight, WEIGHT_GET_INDEX(weight) + 3 - + ((id * 4) / count)); + weight = MIN(weight, METASLAB_MAX_WEIGHT); + } + /* * If the metaslab was active the last time we calculated its weight * then keep it active. We want to consume the entire region that @@ -3451,7 +3469,8 @@ metaslab_segment_weight(metaslab_t *msp) * weights we rely on the entire weight (excluding the weight-type bit). */ static boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) +metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard, + uint64_t txg, boolean_t mapped) { /* * This case will usually but not always get caught by the checks below; @@ -3462,6 +3481,17 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) if (unlikely(msp->ms_new)) return (B_FALSE); + /* + * This I/O needs to be written to a stable location and be retreivable + * before the next TXG syncs. This is the case for ZIL writes. In that + * case, if we're using an anyraid vdev, we can't use a tile that isn't + * mapped yet. 
+ */ + if (mapped && vdev_is_anyraid(msp->ms_group->mg_vd)) { + return (vdev_anyraid_mapped(msp->ms_group->mg_vd, + msp->ms_start, txg)); + } + /* * If the metaslab is loaded, ms_max_size is definitive and we can use * the fast check. If it's not, the ms_max_size is a lower bound (once @@ -4911,8 +4941,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, uint64_t asize, int allocator, - boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, - boolean_t *was_active) + boolean_t try_hard, uint64_t txg, boolean_t mapped, zio_alloc_list_t *zal, + metaslab_t *search, boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -4930,7 +4960,8 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, } tries++; - if (!metaslab_should_allocate(msp, asize, try_hard)) { + if (!metaslab_should_allocate(msp, asize, try_hard, txg, + mapped)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; @@ -5011,7 +5042,7 @@ metaslab_active_mask_verify(metaslab_t *msp) static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t max_asize, uint64_t txg, - dva_t *dva, int d, int allocator, boolean_t try_hard, + dva_t *dva, int d, int allocator, boolean_t try_hard, boolean_t mapped, uint64_t *actual_asize) { metaslab_t *msp = NULL; @@ -5087,8 +5118,8 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - asize, allocator, try_hard, zal, search, - &was_active); + asize, allocator, try_hard, txg, mapped, zal, + search, &was_active); } mutex_exit(&mg->mg_lock); @@ -5193,7 +5224,8 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, * can accurately determine if the allocation attempt should * 
proceed. */ - if (!metaslab_should_allocate(msp, asize, try_hard)) { + if (!metaslab_should_allocate(msp, asize, try_hard, txg, + mapped)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -5287,7 +5319,8 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); + ASSERT(!metaslab_should_allocate(msp, asize, try_hard, txg, + mapped)); mutex_exit(&msp->ms_lock); } @@ -5442,8 +5475,12 @@ metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize, txg); ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift)); + boolean_t mapped = B_FALSE; + if (flags & METASLAB_ZIL) + mapped = B_TRUE; + uint64_t offset = metaslab_group_alloc(mg, zal, asize, - max_asize, txg, dva, d, allocator, try_hard, + max_asize, txg, dva, d, allocator, try_hard, mapped, &asize); if (offset != -1ULL) { @@ -6239,16 +6276,17 @@ metaslab_group_disable_wait(metaslab_group_t *mg) } static void -metaslab_group_disabled_increment(metaslab_group_t *mg) +metaslab_group_disabled_increment(metaslab_group_t *mg, boolean_t wait) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); ASSERT(mg->mg_disabled_updating); - while (mg->mg_ms_disabled >= max_disabled_ms) { + while (wait && mg->mg_ms_disabled >= max_disabled_ms) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } mg->mg_ms_disabled++; - ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); + if (wait) + ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); } /* @@ -6257,8 +6295,8 @@ metaslab_group_disabled_increment(metaslab_group_t *mg) * metaslab group and limit them to prevent allocation failures from * occurring because all metaslabs are disabled. 
*/ -void -metaslab_disable(metaslab_t *msp) +static void +metaslab_disable_impl(metaslab_t *msp, boolean_t wait) { ASSERT(!MUTEX_HELD(&msp->ms_lock)); metaslab_group_t *mg = msp->ms_group; @@ -6277,7 +6315,7 @@ metaslab_disable(metaslab_t *msp) metaslab_group_disable_wait(mg); mg->mg_disabled_updating = B_TRUE; if (msp->ms_disabled == 0) { - metaslab_group_disabled_increment(mg); + metaslab_group_disabled_increment(mg, wait); } mutex_enter(&msp->ms_lock); msp->ms_disabled++; @@ -6288,6 +6326,18 @@ metaslab_disable(metaslab_t *msp) mutex_exit(&mg->mg_ms_disabled_lock); } +void +metaslab_disable(metaslab_t *msp) +{ + metaslab_disable_impl(msp, B_TRUE); +} + +void +metaslab_disable_nowait(metaslab_t *msp) +{ + metaslab_disable_impl(msp, B_FALSE); +} + void metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) { diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 916889c3130c..6a918c12c862 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -730,6 +730,7 @@ zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size) ASSERT3U(start + size, >, start); while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) { + ASSERT(zfs_range_tree_space(rs) != 0); uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start); uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size); zfs_range_tree_remove(rt, free_start, free_end - free_start); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 843b1b9d66bb..82cc0d795393 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -2143,6 +2144,10 @@ spa_destroy_aux_threads(spa_t *spa) zthr_destroy(spa->spa_raidz_expand_zthr); spa->spa_raidz_expand_zthr = NULL; } + if (spa->spa_anyraid_relocate_zthr != NULL) { + zthr_destroy(spa->spa_anyraid_relocate_zthr); + spa->spa_anyraid_relocate_zthr = NULL; + } } static void @@ -2422,6 +2427,7 @@ spa_unload(spa_t *spa) } spa->spa_raidz_expand = 
NULL; + spa->spa_anyraid_relocate = NULL; spa->spa_checkpoint_txg = 0; spa_config_exit(spa, SCL_ALL, spa); @@ -3562,6 +3568,7 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(spa_writeable(spa)); spa_start_raidz_expansion_thread(spa); + spa_start_anyraid_relocate_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -5902,7 +5909,8 @@ spa_ld_checkpoint_rewind(spa_t *spa) if (svdcount == SPA_SYNC_MIN_VDEVS) break; } - error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); + error = vdev_config_sync(svd, svdcount, spa->spa_first_txg, + VDEV_CONFIG_REWINDING_CHECKPOINT); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); @@ -7303,6 +7311,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, for (int i = 0; i < ndraid; i++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + for (uint64_t i = 0; i < rvd->vdev_children; i++) + if (vdev_is_anyraid(rvd->vdev_child[i])) + spa_feature_incr(spa, SPA_FEATURE_ANYRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -7899,13 +7911,26 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); } +/* + * This is called as a synctask to increment the anyraid feature flag + */ +static void +spa_anyraid_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + uint64_t nanyraid = (uint64_t)(uintptr_t)arg; + + for (int i = 0; i < nanyraid; i++) + spa_feature_incr(spa, SPA_FEATURE_ANYRAID, tx); +} + /* * Add a device to a storage pool. 
*/ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { - uint64_t txg, ndraid = 0; + uint64_t txg, ndraid = 0, nanyraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -8039,6 +8064,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) dmu_tx_commit(tx); } + for (uint64_t i = 0; i < vd->vdev_children; i++) + if (vdev_is_anyraid(vd->vdev_child[i])) + nanyraid++; + if (nanyraid > 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, + spa_anyraid_feature_incr, + (void *)(uintptr_t)nanyraid, tx); + dmu_tx_commit(tx); + } + /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we @@ -8205,6 +8243,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, NULL, txg, ENODEV)); boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; + boolean_t anyraid = vdev_is_anyraid(oldvd); if (raidz) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) @@ -8217,11 +8256,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); } - } else if (!oldvd->vdev_ops->vdev_op_leaf) { + } else if (!anyraid && !oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); } - if (raidz) + if (raidz || anyraid) pvd = oldvd; else pvd = oldvd->vdev_parent; @@ -8269,6 +8308,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (tvd->vdev_ops != &vdev_mirror_ops && tvd->vdev_ops != &vdev_root_ops && + tvd->vdev_ops != &vdev_anymirror_ops && tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } @@ -8282,10 +8322,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != 
&vdev_root_ops && - !raidz) + !raidz && !anyraid) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - pvops = &vdev_mirror_ops; + if (anyraid) + pvops = pvd->vdev_ops; + else + pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot @@ -8327,9 +8370,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. */ - vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; - if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) - return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); + if (newvd->vdev_asize < vdev_get_min_attach_size(oldvd)) + return (spa_vdev_exit(spa, newrootvd, txg, anyraid ? ENOLCK : + EOVERFLOW)); /* * The new device cannot have a higher alignment requirement @@ -8375,6 +8418,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); oldvdpath = spa_strdup(tmp); kmem_strfree(tmp); + } else if (anyraid) { + char *tmp = kmem_asprintf(VDEV_TYPE_ANYMIRROR "%u-%u", + (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); } else { oldvdpath = spa_strdup(oldvd->vdev_path); } @@ -8402,7 +8450,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. 
*/ - if (!raidz && pvd->vdev_ops != pvops) { + if (!raidz && !anyraid && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); @@ -8460,6 +8508,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, newvd, tx); dmu_tx_commit(tx); + } else if (anyraid) { + vdev_anyraid_expand(tvd, newvd); + vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_expanding = B_TRUE; + vdev_reopen(tvd); + spa->spa_ccw_fail_time = 0; + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } else { vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); @@ -9385,6 +9440,105 @@ spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, return (error); } +static void +spa_vdev_contraction_done(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *avd = NULL; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + if (!vdev_is_anyraid(tvd)) + continue; + vdev_anyraid_t *va = tvd->vdev_tsd; + if (va->vd_contracting_leaf == -1) + continue; + avd = tvd; + break; + } + ASSERT(avd); + vdev_anyraid_t *va = avd->vdev_tsd; + uint64_t avd_guid = avd->vdev_guid; + vdev_t *lvd = avd->vdev_child[va->vd_contracting_leaf]; + + uint64_t txg = spa_vdev_detach_enter(spa, lvd->vdev_guid); + + /* + * Erase the disk labels so the disk can be used for other things. + * This must be done after all other error cases are handled, + * but before we disembowel vd (so we can still do I/O to it). + * But if we can't do it, don't treat the error as fatal -- + * it may be that the unwritability of the disk is the reason + * it's being detached! + */ + (void) vdev_label_init(lvd, 0, VDEV_LABEL_REMOVE); + + rw_enter(&va->vd_lock, RW_WRITER); + + /* + * Remove vd from its parent and compact the parent's children. 
+ */ + vdev_remove_child(avd, lvd); + vdev_compact_children(avd); + + ASSERT3S(va->vd_contracting_leaf, ==, lvd->vdev_id); + vdev_anyraid_compact_children(avd); + va->vd_contracting_leaf = -1; + spa->spa_anyraid_relocate = NULL; + va->vd_relocate.var_state = ARS_FINISHED; + rw_exit(&va->vd_lock); + + /* + * Remember one of the remaining children so we can get tvd below. + */ + vdev_t *cvd = avd->vdev_child[avd->vdev_children - 1]; + + ASSERT3P(avd->vdev_parent, ==, spa->spa_root_vdev); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(cvd); + + vdev_config_dirty(avd); + + /* + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. + */ + char *vdpath = spa_strdup(lvd->vdev_path ? lvd->vdev_path : "none"); + for (int t = 0; t < TXG_SIZE; t++) + (void) txg_list_remove_this(&avd->vdev_dtl_list, lvd, t); + lvd->vdev_detached = B_TRUE; + vdev_dirty(avd, VDD_DTL, lvd, txg); + vdev_config_dirty(avd); + + spa_event_notify(spa, lvd, NULL, ESC_ZFS_VDEV_REMOVE); + spa_notify_waiters(spa); + + /* hang on to the spa before we release the lock */ + spa_open_ref(spa, FTAG); + + avd->vdev_shrinking = B_TRUE; + vdev_reopen(avd); + vdev_metaslab_init(avd, txg); + avd->vdev_shrinking = B_FALSE; + VERIFY0(spa_vdev_exit(spa, lvd, txg, 0)); // TODO + + spa_history_log_internal(spa, "detach", NULL, + "vdev=%s", vdpath); + spa_strfree(vdpath); + + txg_wait_synced(spa->spa_dsl_pool, txg); + avd = spa_lookup_by_guid(spa, avd_guid, B_FALSE); + va = avd->vdev_tsd; + /* all done with the spa; OK to release */ + spa_namespace_enter(FTAG); + spa_close(spa, FTAG); + spa_namespace_exit(FTAG); +} + /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. 
@@ -9716,6 +9870,8 @@ spa_async_thread(void *arg) spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); + if (tasks & SPA_ASYNC_CONTRACTION_DONE) + spa_vdev_contraction_done(spa); /* * See if the config needs to be updated. */ @@ -9883,6 +10039,10 @@ spa_async_suspend(spa_t *spa) if (raidz_expand_thread != NULL) zthr_cancel(raidz_expand_thread); + zthr_t *anyraid_relocate_thread = spa->spa_anyraid_relocate_zthr; + if (anyraid_relocate_thread != NULL) + zthr_cancel(anyraid_relocate_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); @@ -9913,6 +10073,10 @@ spa_async_resume(spa_t *spa) if (raidz_expand_thread != NULL) zthr_resume(raidz_expand_thread); + zthr_t *anyraid_relocate_thread = spa->spa_anyraid_relocate_zthr; + if (anyraid_relocate_thread != NULL) + zthr_resume(anyraid_relocate_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); @@ -10744,6 +10908,13 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; + vdev_config_sync_status_t status; + if (dmu_tx_get_txg(tx) == spa->spa_checkpoint_txg + 1) + status = VDEV_CONFIG_CREATING_CHECKPOINT; + else if (spa->spa_checkpoint_txg == 0) + status = VDEV_CONFIG_NO_CHECKPOINT; + else + status = VDEV_CONFIG_KEEP_CHECKPOINT; for (;;) { int error = 0; @@ -10777,10 +10948,10 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) if (svdcount == SPA_SYNC_MIN_VDEVS) break; } - error = vdev_config_sync(svd, svdcount, txg); + error = vdev_config_sync(svd, svdcount, txg, status); } else { error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg); + rvd->vdev_children, txg, status); } if (error == 0) @@ -11503,6 +11674,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); break; } + case 
ZPOOL_WAIT_ANYRAID_RELOCATE: + { + vdev_anyraid_relocate_t *var = spa->spa_anyraid_relocate; + *in_progress = (var != NULL && var->var_state == ARS_SCANNING); + break; + } default: panic("unrecognized value for activity %d", activity); } @@ -11638,6 +11815,139 @@ spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } +static int +spa_check_start_rebalance(void *arg, dmu_tx_t *tx) { + vdev_t *vd = (vdev_t *)arg; + spa_t *spa = vd->vdev_spa; + if (!vdev_is_anyraid(vd)) + return (SET_ERROR(EINVAL)); + if (vdev_anyraid_relocate_status(vd)->var_state == ARS_SCANNING) + return (SET_ERROR(EALREADY)); + if (vd->vdev_spa->spa_anyraid_relocate != NULL) + return (SET_ERROR(EEXIST)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + int error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (SET_ERROR(error)); + } + + (void) tx; + return (0); +} + +static void +spa_sync_start_rebalance(void *arg, dmu_tx_t *tx) { + vdev_t *vd = (vdev_t *)arg; + ASSERT(vdev_is_anyraid(vd)); + vdev_anyraid_setup_rebalance(vd, tx); +} + +int +spa_rebalance_vdevs(spa_t *spa, const uint64_t *guids, uint_t count) +{ + if (count == 0) + return (ENOENT); + ASSERT(guids); + uint_t lasterror = 0; + for (uint_t c = 0; c < count; c++) { + vdev_t *vd = spa_lookup_by_guid(spa, guids[c], B_FALSE); + if (vd == NULL) { + lasterror = SET_ERROR(ENOENT); + break; + } + + lasterror = dsl_sync_task(spa->spa_name, + spa_check_start_rebalance, spa_sync_start_rebalance, + vd, 6, ZFS_SPACE_CHECK_NORMAL); + if (lasterror) + break; + } + return (lasterror); +} + +int +spa_rebalance_all(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint_t count = 0; + uint_t lasterror = 0; + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + if (!vdev_is_anyraid(cvd)) + continue; + count++; + /* + * Theoretically, if every single tile was getting 
moved, we + * could need vd->vdev_asize / va->vd_tile_size * + * sizeof (relocate_task_phys_t) bytes to store all the tasks, + * plus the dnode and indirect blocks. + */ + vdev_anyraid_t *va = cvd->vdev_tsd; + uint_t blocks = ((cvd->vdev_asize / va->vd_tile_size * 32) >> + SPA_OLD_MAXBLOCKSHIFT) + 6; + lasterror = dsl_sync_task(spa->spa_name, + spa_check_start_rebalance, spa_sync_start_rebalance, + cvd, blocks, ZFS_SPACE_CHECK_NORMAL); + if (lasterror) + break; + } + if (count == 0) + return (ENOENT); + return (lasterror); +} + +static int +spa_check_start_contract(void *arg, dmu_tx_t *tx) { + vdev_t *lvd = (vdev_t *)arg; + vdev_t *tvd = lvd->vdev_top; + spa_t *spa = tvd->vdev_spa; + if (!vdev_is_anyraid(tvd)) + return (SET_ERROR(EINVAL)); + if (vdev_anyraid_relocate_status(tvd)->var_state == ARS_SCANNING || + vdev_anyraid_relocate_status(tvd)->var_state == ARS_SCRUBBING) + return (SET_ERROR(EALREADY)); + if (spa->spa_anyraid_relocate != NULL) + return (SET_ERROR(EEXIST)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + int error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (SET_ERROR(error)); + } + + return (vdev_anyraid_check_contract(tvd, lvd, tx)); +} + +static void +spa_sync_start_contract(void *arg, dmu_tx_t *tx) { + vdev_t *lvd = (vdev_t *)arg; + vdev_t *tvd = lvd->vdev_top; + ASSERT(vdev_is_anyraid(tvd)); + vdev_anyraid_setup_contract(tvd, tx); +} + +int +spa_contract_vdev(spa_t *spa, uint64_t anyraid_vdev, uint64_t leaf_vdev) +{ + vdev_t *avd = spa_lookup_by_guid(spa, anyraid_vdev, B_FALSE); + vdev_t *lvd = spa_lookup_by_guid(spa, leaf_vdev, B_FALSE); + if (avd == NULL || lvd == NULL) + return (SET_ERROR(ENOENT)); + if (!vdev_is_anyraid(avd)) + return (SET_ERROR(EINVAL)); + if (lvd->vdev_top != avd) + return (SET_ERROR(ENXIO)); + + /* + * We need one relocate task per tile on the leaf vdev. + * We also need additional blocks to store metadata. 
+ */ + uint_t blocks = ((vdev_anyraid_child_num_tiles(avd, lvd) * 32) >> + SPA_OLD_MAXBLOCKSHIFT) + 6; + return (dsl_sync_task(spa->spa_name, spa_check_start_contract, + spa_sync_start_contract, lvd, blocks, ZFS_SPACE_CHECK_NORMAL)); +} + /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index a42aa62e6599..88080d0aff42 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -469,6 +469,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) if (spa->spa_raidz_expand != NULL) return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + if (spa->spa_anyraid_relocate != NULL) + return (SET_ERROR(ZFS_ERR_ANYRAID_REBALANCE_IN_PROGRESS)); + if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 31216e9a7ccc..5a22d140d119 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -508,7 +508,10 @@ spa_config_update(spa_t *spa, int what) * metaslab size nor call vdev_expand() on them. */ if (!vdev_is_concrete(tvd) || - (tvd->vdev_islog && tvd->vdev_removing)) + (tvd->vdev_islog && tvd->vdev_removing) || + (vdev_is_anyraid(tvd) && + ((vdev_anyraid_t *)tvd->vdev_tsd) + ->vd_contracting_leaf != -1)) continue; if (tvd->vdev_ms_array == 0) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index eaaa429eda33..412897dbdc04 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1891,6 +1891,18 @@ spa_syncing_txg(spa_t *spa) return (spa->spa_syncing_txg); } +uint64_t +spa_load_max_txg(spa_t *spa) +{ + return (spa->spa_load_max_txg); +} + +uint64_t +spa_current_txg(spa_t *spa) +{ + return (spa->spa_uberblock.ub_txg); +} + /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. 
@@ -2332,6 +2344,12 @@ spa_dirty_data(spa_t *spa) return (spa->spa_dsl_pool->dp_dirty_total); } +uint64_t +spa_load_txg(spa_t *spa) +{ + return (spa->spa_load_txg); +} + /* * ========================================================================== * SPA Import Progress Routines @@ -3048,6 +3066,12 @@ spa_has_checkpoint(spa_t *spa) return (spa->spa_checkpoint_txg != 0); } +uint64_t +spa_checkpoint_txg(spa_t *spa) +{ + return (spa->spa_checkpoint_txg); +} + boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { @@ -3055,6 +3079,13 @@ spa_importing_readonly_checkpoint(spa_t *spa) spa->spa_mode == SPA_MODE_READ); } +boolean_t +spa_importing_checkpoint(spa_t *spa) +{ + return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && + spa->spa_uberblock.ub_checkpoint_txg != 0); +} + uint64_t spa_min_claim_txg(spa_t *spa) { @@ -3158,6 +3189,7 @@ EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); +EXPORT_SYMBOL(spa_load_txg); EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); @@ -3200,8 +3232,10 @@ EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); EXPORT_SYMBOL(spa_importing_readonly_checkpoint); +EXPORT_SYMBOL(spa_importing_checkpoint); EXPORT_SYMBOL(spa_min_claim_txg); EXPORT_SYMBOL(spa_suspend_async_destroy); +EXPORT_SYMBOL(spa_checkpoint_txg); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3480b884ea96..019b60e07a1e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -55,11 +55,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include "zfs_prop.h" @@ -279,6 +279,8 @@ static vdev_ops_t *const vdev_ops_table[] = { &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, + &vdev_anymirror_ops, + &vdev_anyraidz_ops, NULL }; 
@@ -345,6 +347,21 @@ vdev_derive_alloc_bias(const char *bias) return (alloc_bias); } +uint64_t +vdev_default_min_attach_size(vdev_t *vd) +{ + return (vdev_get_min_asize(vd)); +} + +uint64_t +vdev_get_min_attach_size(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + if (vd == vd->vdev_top) + pvd = vd; + return (pvd->vdev_ops->vdev_op_min_attach_size(pvd)); +} + uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg) { @@ -377,9 +394,10 @@ vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) } uint64_t -vdev_default_min_asize(vdev_t *vd) +vdev_default_min_asize(vdev_t *pvd, vdev_t *cvd) { - return (vd->vdev_min_asize); + (void) cvd; + return (pvd->vdev_min_asize); } /* @@ -404,11 +422,32 @@ vdev_get_min_asize(vdev_t *vd) * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ - if (vd == vd->vdev_top) + if (vd == vd->vdev_top) { + if (vd->vdev_shrinking || (vdev_is_anyraid(vd) && + ((vdev_anyraid_t *)vd->vdev_tsd)->vd_contracting_leaf != + -1 && vd->vdev_ms_count != 0)) { + /* + * Find the last metaslab with anything in it, and + * declare the end of that metaslab to be the smallest + * size the disk can take on. + */ + for (uint64_t m = vd->vdev_ms_count - 1; m > 0; m--) { + metaslab_t *ms = vd->vdev_ms[m]; + if (metaslab_allocated_space(ms) != 0) { + return ((m + 1) << vd->vdev_ms_shift); + } + } + /* + * If the vdev is totally empty, we still probably + * don't want to shrink it to size 0. 
+ */ + return (1ULL << vd->vdev_ms_shift); + } return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, uint64_t)); + } - return (pvd->vdev_ops->vdev_op_min_asize(pvd)); + return (pvd->vdev_ops->vdev_op_min_asize(pvd, vd)); } void @@ -905,6 +944,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } + + /* spa_vdev_add() expects feature to be enabled */ + if ((ops == &vdev_anymirror_ops || ops == &vdev_anyraidz_ops) && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_ANYRAID)) { + return (SET_ERROR(ENOTSUP)); + } } /* @@ -1595,16 +1641,16 @@ vdev_metaslab_group_create(vdev_t *vd) } void -vdev_update_nonallocating_space(vdev_t *vd, boolean_t add) +vdev_update_nonallocating_space(vdev_t *vd, uint64_t bytes, boolean_t add) { spa_t *spa = vd->vdev_spa; - if (vd->vdev_mg->mg_class != spa_normal_class(spa)) + if (vd->vdev_mg->mg_class != spa_normal_class(spa) || bytes == 0) return; uint64_t raw_space = metaslab_group_get_space(vd->vdev_mg); - uint64_t dspace = spa_deflate(spa) ? - vdev_deflated_space(vd, raw_space) : raw_space; + uint64_t dspace = bytes != -1ULL ? bytes : (spa_deflate(spa) ? 
+ vdev_deflated_space(vd, raw_space) : raw_space); if (add) { spa->spa_nonallocating_dspace += dspace; } else { @@ -1622,6 +1668,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); + boolean_t shrinking = vd->vdev_shrinking; ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); @@ -1633,12 +1680,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) ASSERT(!vd->vdev_ishole); - ASSERT(oldc <= newc); + ASSERT(shrinking || oldc <= newc); + ASSERT(newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); + for (uint64_t m = newc; m < oldc; m++) { + ASSERT(shrinking); + metaslab_t *msp = vd->vdev_ms[m]; + ASSERT(msp->ms_disabled); + metaslab_fini(msp); + } + if (expanding) { - memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); + memcpy(mspp, vd->vdev_ms, MIN(oldc, newc) * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } @@ -1650,7 +1705,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) * vdev. In order to ensure that all weights are correct at all times, * we need to recalculate here. */ - for (uint64_t m = 0; m < oldc; m++) { + for (uint64_t m = 0; m < MIN(oldc, newc); m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); metaslab_recalculate_weight_and_sort(msp); @@ -1737,7 +1792,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ - vdev_update_nonallocating_space(vd, B_TRUE); + vdev_update_nonallocating_space(vd, -1ULL, B_TRUE); } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) @@ -2295,7 +2350,6 @@ vdev_open(vdev_t *vd) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; - /* * Make sure the allocatable size hasn't shrunk too much. 
*/ @@ -2317,7 +2371,10 @@ vdev_open(vdev_t *vd) vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); - if (vd->vdev_asize == 0) { + if (vd->vdev_shrinking) { + vd->vdev_asize = asize; + vd->vdev_max_asize = max_asize; + } else if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. @@ -2869,7 +2926,8 @@ vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT3U(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER), ==, + SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; @@ -3015,6 +3073,8 @@ vdev_metaslab_set_size(vdev_t *vd) if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } + if (vd->vdev_ops->vdev_op_metaslab_size) + vd->vdev_ops->vdev_op_metaslab_size(vd, &ms_shift); vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); @@ -3452,6 +3512,8 @@ vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { raidz_dtl_reassessed(vd); + } else if (vdev_is_anyraid(vd)) { + anyraid_dtl_reassessed(vd); } } @@ -4005,6 +4067,11 @@ vdev_load(vdev_t *vd) VDEV_AUX_CORRUPT_DATA); return (error); } + if (vdev_is_anyraid(vd)) { + error = vdev_anyraid_load(vd); + if (error != 0) + return (error); + } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); @@ -6728,6 +6795,55 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) break; } break; + case VDEV_PROP_ANYRAID_CAP_TILES: + { + vdev_t *pvd = vd->vdev_parent; + uint64_t total = 0; + if (vdev_is_anyraid(vd)) { + total = vdev_anyraid_child_capacity(vd, + NULL); + } else if (pvd && vdev_is_anyraid(pvd)) { + total = vdev_anyraid_child_capacity(pvd, + vd); + } else { + continue; + } + 
vdev_prop_add_list(outnvl, propname, + NULL, total, ZPROP_SRC_NONE); + continue; + } + case VDEV_PROP_ANYRAID_NUM_TILES: + { + vdev_t *pvd = vd->vdev_parent; + uint64_t total = 0; + if (vdev_is_anyraid(vd)) { + total = vdev_anyraid_child_num_tiles( + vd, NULL); + } else if (pvd && vdev_is_anyraid(pvd)) { + total = vdev_anyraid_child_num_tiles( + pvd, vd); + } else { + continue; + } + vdev_prop_add_list(outnvl, propname, + NULL, total, ZPROP_SRC_NONE); + continue; + } + case VDEV_PROP_ANYRAID_TILE_SIZE: + { + vdev_t *pvd = vd->vdev_parent; + vdev_anyraid_t *va = NULL; + if (vdev_is_anyraid(vd)) { + va = vd->vdev_tsd; + } else if (pvd && vdev_is_anyraid(pvd)) { + va = pvd->vdev_tsd; + } else { + continue; + } + vdev_prop_add_list(outnvl, propname, + NULL, va->vd_tile_size, ZPROP_SRC_NONE); + continue; + } default: err = ENOENT; break; @@ -6785,6 +6901,13 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) return (0); } +boolean_t +vdev_is_anyraid(vdev_t *vd) +{ + return (vd->vdev_ops == &vdev_anymirror_ops || + vd->vdev_ops == &vdev_anyraidz_ops); +} + EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); diff --git a/module/zfs/vdev_anyraid.c b/module/zfs/vdev_anyraid.c new file mode 100644 index 000000000000..48931d44d957 --- /dev/null +++ b/module/zfs/vdev_anyraid.c @@ -0,0 +1,3674 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Klara Inc. + */ + +/* + * Anyraid vdevs are a way to get the benefits of mirror (and, in the future, + * raidz) vdevs while using disks with mismatched sizes. The primary goal of + * this feature is maximizing the available space of the provided devices. + * Performance is secondary to that goal; nice to have, but not required. This + * feature is also designed to work on modern hard drives: while the feature + * will work on drives smaller than 1TB, the default tuning values are + * optimized for drives of at least that size. + * + * Anyraid works by splitting the vdev into "tiles". Each tile is the same + * size; by default, 1/64th of the size of the smallest disk in the vdev, or + * 16GiB, whichever is larger. A tile represents an area of + * logical-to-physical mapping: bytes within that logical tile are stored + * physically together. Subsequent tiles may be stored in different locations + * on the same disk, or different disks altogether. A mapping is stored on each + * disk to enable the vdev to be read normally. + * + * When parity is not considered, this provides some small benefits (device + * removal within the vdev is not yet implemented, but is very feasible, as is + * rebalancing data onto new disks), but is not generally recommended. However, + * if parity is considered, it is more useful. With mirror parity P, each + * tile is allocated onto P separate disks, providing the reliability and + * performance characteristics of a mirror vdev. In addition, because each tile + * can be allocated separately, smaller drives can work together to mirror + * larger ones dynamically and seamlessly. + * + * The mapping for these tiles is stored in a special area at the start of + * each device. 
Each disk has 4 full copies of the tile map, which rotate + * per txg in a similar manner to uberblocks. The tile map itself is 64MiB, + * plus a small header (~8KiB) before it. + * + * The exact space that is allocatable in an anyraid vdev is not easy to + * calculate in the general case. It's a variant of the bin-packing problem, so + * an optimal solution is complex. However, this case seems to be a sub-problem + * where greedy algorithms give optimal solutions, so that is what we do here. + * Each tile is allocated from the P disks that have the most available + * capacity. This does mean that calculating the size of a disk requires + * running the allocation algorithm until completion, but for the relatively + * small number of tiles we are working with, an O(n * log n) runtime is + * acceptable. + * + * Currently, there is a limit of 2^24 tiles in an anyraid vdev: 2^8 disks, + * and 2^16 tiles per disk. This means that by default, the largest device + * that can be fully utilized by an anyraid vdev is 1024 times the size of the + * smallest device that was present during device creation. This is not a + * fundamental limit, and could be expanded in the future. However, this does + * affect the size of the tile map. Currently, the tile map can always + * store all tiles without running out of space; 2^24 4-byte entries is 2^26 + * bytes = 64MiB. Expanding the maximum number of tiles per disk or disks per + * vdev would necessarily involve either expanding the tile map or adding + * handling for the tile map running out of space. + * + * When it comes to performance, there is a tradeoff. While the per-disk I/O + * rates are equivalent to using mirrors (because only a small amount of extra + * logic is used on top of the mirror code), the overall vdev throughput may + * not be. This is because the actively used tiles may be allocated to the + * same devices, leaving other devices idle for writes. 
This is especially true + * as the variation in drive sizes increases. To some extent, this problem is + * fundamental: writes fill up disks. If we want to fill all the disks, smaller + * disks will not be able to satisfy as many writes. Rewrite- and read-heavy + * workloads will encounter this problem to a lesser extent. The performance + * downsides can be mitigated with smaller tile sizes, larger metaslabs, + * and more active metaslab allocators. + * + * Checkpoints are currently supported by storing the maximum allocated tile + * at the time of the checkpoint, and then discarding all tiles after that + * when a checkpoint is rolled back. Because device addition is forbidden while + * a checkpoint is outstanding, no more complex logic is required. + * + * Currently, anyraid vdevs only work with mirror-type parity. However, plans + * for future work include: + * Raidz-type parity + * Anyraid vdev shrinking via device removal + * Rebalancing after device addition + * + * Possible future work also includes: + * Enabling rebalancing with an outstanding checkpoint + * Trim and initialize beyond the end of the allocated tiles + * Store device asizes so we can make better allocation decisions while a + * device is faulted + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The smallest allowable tile size. Shrinking this is mostly useful for + * testing. Increasing it may be useful if you plan to add much larger disks to + * an array in the future, and want to be sure their full capacity will be + * usable. + */ +uint64_t zfs_anyraid_min_tile_size = (16ULL << 30); +/* + * This controls how many tiles we have per disk (based on the smallest disk + * present at creation time) + */ +int anyraid_disk_shift = 6; + +/* + * Maximum amount of copy io's outstanding at once. 
+ */ +#ifdef _ILP32 +static unsigned long anyraid_relocate_max_move_bytes = SPA_MAXBLOCKSIZE; +#else +static unsigned long anyraid_relocate_max_move_bytes = SPA_MAXBLOCKSIZE; +#endif + +/* + * Automatically start a pool scrub when an anyraid relocation completes in + * order to verify the checksums of all blocks which have been copied + * during the relocation. Automatic scrubbing is enabled by default and + * is strongly recommended. + */ +static int zfs_scrub_after_relocate = 1; + +/* + * For testing only: pause the anyraid relocate operations after relocating this + * amount (accessed by ZTS and ztest). + */ +#ifdef _KERNEL +static +#endif /* _KERNEL */ +unsigned long anyraid_relocate_max_bytes_pause = 0; + +static int tasklist_read(vdev_t *vd); +static void anyraid_scrub_done(spa_t *spa, dmu_tx_t *tx, void *arg); + +struct anyraid_done_arg { + vdev_t *vd; +}; + +static int +af_compar(const void *p1, const void *p2) +{ + const anyraid_free_node_t *af1 = p1, *af2 = p2; + + return (TREE_CMP(af2->afn_tile, af1->afn_tile)); +} + +void +anyraid_freelist_create(anyraid_freelist_t *af, uint16_t off) +{ + avl_create(&af->af_list, af_compar, + sizeof (anyraid_free_node_t), + offsetof(anyraid_free_node_t, afn_node)); + af->af_next_off = off; +} + +void +anyraid_freelist_destroy(anyraid_freelist_t *af) +{ + void *cookie = NULL; + anyraid_free_node_t *node; + while ((node = avl_destroy_nodes(&af->af_list, &cookie)) != NULL) + kmem_free(node, sizeof (*node)); + avl_destroy(&af->af_list); +} + +void +anyraid_freelist_add(anyraid_freelist_t *af, uint16_t off) +{ + avl_tree_t *t = &af->af_list; + ASSERT3U(off, <, af->af_next_off); + if (off != af->af_next_off - 1) { + anyraid_free_node_t *new = kmem_alloc(sizeof (*new), KM_SLEEP); + new->afn_tile = off; + avl_add(t, new); + return; + } + af->af_next_off--; + for (anyraid_free_node_t *tail = avl_last(t); + tail != NULL && tail->afn_tile == af->af_next_off - 1; + tail = avl_last(t)) { + af->af_next_off--; + avl_remove(t, tail); + 
kmem_free(tail, sizeof (*tail)); + } +} + +void +anyraid_freelist_remove(anyraid_freelist_t *af, uint16_t off) +{ + avl_tree_t *t = &af->af_list; + anyraid_free_node_t search; + search.afn_tile = off; + avl_index_t where; + anyraid_free_node_t *node = avl_find(t, &search, &where); + if (node) { + avl_remove(t, node); + kmem_free(node, sizeof (*node)); + return; + } + ASSERT3U(off, >=, af->af_next_off); + while (off > af->af_next_off) { + node = kmem_alloc(sizeof (*node), KM_SLEEP); + node->afn_tile = af->af_next_off++; + avl_add(t, node); + } + af->af_next_off++; + return; + +} + +uint16_t +anyraid_freelist_pop(anyraid_freelist_t *af) +{ + avl_tree_t *t = &af->af_list; + if (avl_numnodes(t) == 0) { + return (af->af_next_off++); + } + + anyraid_free_node_t *head = avl_first(t); + avl_remove(t, head); + uint16_t ret = head->afn_tile; + kmem_free(head, sizeof (*head)); + return (ret); +} + +uint16_t +anyraid_freelist_alloc(const anyraid_freelist_t *af) +{ + return (af->af_next_off - avl_numnodes(&af->af_list)); +} + +boolean_t +anyraid_freelist_isfree(const anyraid_freelist_t *af, uint16_t off) +{ + if (off >= af->af_next_off) + return (B_TRUE); + anyraid_free_node_t search; + search.afn_tile = off; + avl_index_t where; + anyraid_free_node_t *node = avl_find(&af->af_list, &search, &where); + return (node != NULL); +} + +static inline uint64_t +vdev_anyraid_header_offset(vdev_t *vd, int id) +{ + uint64_t full_size = VDEV_ANYRAID_SINGLE_MAP_SIZE(vd->vdev_ashift); + if (id < VDEV_ANYRAID_START_COPIES) + return (VDEV_LABEL_START_SIZE + id * full_size); + else + return (vd->vdev_psize - VDEV_LABEL_END_SIZE - + (VDEV_ANYRAID_MAP_COPIES - id) * full_size); +} + +static inline int +anyraid_tile_compare(const void *p1, const void *p2) +{ + const anyraid_tile_t *r1 = p1, *r2 = p2; + + return (TREE_CMP(r1->at_tile_id, r2->at_tile_id)); +} + +static inline int +anyraid_child_compare(const void *p1, const void *p2) +{ + const vdev_anyraid_node_t *van1 = p1, *van2 = p2; + + int cmp 
= TREE_CMP( + (int64_t)van2->van_capacity - + anyraid_freelist_alloc(&van2->van_freelist), + (int64_t)van1->van_capacity - + anyraid_freelist_alloc(&van1->van_freelist)); + if (cmp != 0) + return (cmp); + + return (TREE_CMP(van1->van_id, van2->van_id)); +} + +/* + * Initialize private VDEV specific fields from the nvlist. + */ +static int +vdev_anyraid_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + uint_t children; + nvlist_t **child; + int error = nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &children); + if (error != 0 || children > VDEV_ANYRAID_MAX_DISKS) + return (SET_ERROR(EINVAL)); + + uint64_t nparity; + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) != 0) + return (SET_ERROR(EINVAL)); + + vdev_anyraid_parity_type_t parity_type = VAP_TYPES; + if (nvlist_lookup_uint8(nv, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, + (uint8_t *)&parity_type) != 0) + return (SET_ERROR(EINVAL)); + uint8_t ndata = 1; + if (nvlist_lookup_uint8(nv, ZPOOL_CONFIG_ANYRAID_NDATA, + &ndata) != 0 && parity_type == VAP_RAIDZ) { + return (SET_ERROR(EINVAL)); + } + + if (ndata + nparity > children) { + zfs_dbgmsg("width too high when creating anyraid vdev"); + return (SET_ERROR(EINVAL)); + } + + vdev_anyraid_t *va = kmem_zalloc(sizeof (*va), KM_SLEEP); + va->vd_parity_type = parity_type; + va->vd_ndata = ndata; + va->vd_nparity = nparity; + va->vd_contracting_leaf = -1; + switch (parity_type) { + case VAP_MIRROR: + va->vd_width = ndata; + break; + case VAP_RAIDZ: + va->vd_width = ndata + nparity; + break; + default: + PANIC("Invalid parity type %d", parity_type); + } + rw_init(&va->vd_lock, NULL, RW_DEFAULT, NULL); + avl_create(&va->vd_tile_map, anyraid_tile_compare, + sizeof (anyraid_tile_t), offsetof(anyraid_tile_t, at_node)); + avl_create(&va->vd_children_tree, anyraid_child_compare, + sizeof (vdev_anyraid_node_t), + offsetof(vdev_anyraid_node_t, van_node)); + zfs_rangelock_init(&va->vd_rangelock, NULL, NULL); + vdev_anyraid_relocate_t *var = 
&va->vd_relocate; + var->var_offset = var->var_failed_offset = UINT64_MAX; + list_create(&var->var_list, + sizeof (vdev_anyraid_relocate_task_t), + offsetof(vdev_anyraid_relocate_task_t, vart_node)); + list_create(&var->var_done_list, + sizeof (vdev_anyraid_relocate_task_t), + offsetof(vdev_anyraid_relocate_task_t, vart_node)); + mutex_init(&var->var_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&var->var_cv, NULL, CV_DEFAULT, NULL); + + va->vd_children = kmem_zalloc(sizeof (*va->vd_children) * children, + KM_SLEEP); + for (int c = 0; c < children; c++) { + vdev_anyraid_node_t *van = kmem_zalloc(sizeof (*van), KM_SLEEP); + van->van_id = c; + anyraid_freelist_create(&van->van_freelist, 0); + avl_add(&va->vd_children_tree, van); + va->vd_children[c] = van; + } + + *tsd = va; + return (0); +} + +static void +vdev_anyraid_fini(vdev_t *vd) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + + if (vd->vdev_spa->spa_anyraid_relocate == &va->vd_relocate) + vd->vdev_spa->spa_anyraid_relocate = NULL; + avl_destroy(&va->vd_tile_map); + + vdev_anyraid_node_t *node; + void *cookie = NULL; + while ((node = avl_destroy_nodes(&va->vd_children_tree, &cookie))) { + anyraid_freelist_destroy(&node->van_freelist); + kmem_free(node, sizeof (*node)); + } + avl_destroy(&va->vd_children_tree); + zfs_rangelock_fini(&va->vd_rangelock); + vdev_anyraid_relocate_t *var = &va->vd_relocate; + mutex_destroy(&var->var_lock); + cv_destroy(&var->var_cv); + list_destroy(&var->var_list); + list_destroy(&var->var_done_list); + + rw_destroy(&va->vd_lock); + kmem_free(va->vd_children, + sizeof (*va->vd_children) * vd->vdev_children); + kmem_free(va, sizeof (*va)); +} + +/* + * Add ANYRAID specific fields to the config nvlist. 
+ */ +static void +vdev_anyraid_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT(vdev_is_anyraid(vd)); + vdev_anyraid_t *va = vd->vdev_tsd; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, va->vd_nparity); + fnvlist_add_uint8(nv, ZPOOL_CONFIG_ANYRAID_PARITY_TYPE, + (uint8_t)va->vd_parity_type); + fnvlist_add_uint8(nv, ZPOOL_CONFIG_ANYRAID_NDATA, + (uint8_t)va->vd_ndata); +} + +/* + * Import/open related functions. + */ + +/* + * Add an entry to the tile map for the provided tile. + */ +static void +create_tile_entry(spa_t *spa, vdev_anyraid_t *va, + anyraid_map_loc_entry_t *amle, uint8_t *pat_cnt, anyraid_tile_t **out_at, + uint32_t *cur_tile) +{ + uint8_t disk = amle_get_disk(amle); + uint16_t offset = amle_get_offset(amle); + anyraid_tile_t *at = *out_at; + + if (*pat_cnt == 0) { + at = kmem_alloc(sizeof (*at), KM_SLEEP); + at->at_tile_id = *cur_tile; + at->at_synced = spa_current_txg(spa); + avl_add(&va->vd_tile_map, at); + list_create(&at->at_list, + sizeof (anyraid_tile_node_t), + offsetof(anyraid_tile_node_t, atn_node)); + + (*cur_tile)++; + } + + anyraid_tile_node_t *atn = kmem_alloc(sizeof (*atn), KM_SLEEP); + atn->atn_disk = disk; + atn->atn_tile_idx = offset; + list_insert_tail(&at->at_list, atn); + *pat_cnt = (*pat_cnt + 1) % (va->vd_nparity + va->vd_ndata); + + vdev_anyraid_node_t *van = va->vd_children[disk]; + avl_remove(&va->vd_children_tree, van); + + anyraid_freelist_remove(&van->van_freelist, offset); + avl_add(&va->vd_children_tree, van); + *out_at = at; +} + +static void +child_read_done(zio_t *zio) +{ + zio_t *pio = zio_unique_parent(zio); + abd_t **cbp = pio->io_private; + + if (zio->io_error == 0) { + mutex_enter(&pio->io_lock); + if (*cbp == NULL) + *cbp = zio->io_abd; + else + abd_free(zio->io_abd); + mutex_exit(&pio->io_lock); + } else { + abd_free(zio->io_abd); + } +} + +static void +child_read(zio_t *zio, vdev_t *vd, uint64_t offset, uint64_t size, + int checksum, void *private, int flags) +{ + for (int c = 0; c < vd->vdev_children; 
c++) { + child_read(zio, vd->vdev_child[c], offset, size, checksum, + private, flags); + } + + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + zio_nowait(zio_read_phys(zio, vd, offset, size, + abd_alloc_linear(size, B_TRUE), checksum, + child_read_done, private, ZIO_PRIORITY_SYNC_READ, flags, + B_FALSE)); + } +} + +/* + * This function is non-static for ZDB, and shouldn't be used for anything else. + * Utility function that issues the read for the header and parses out the + * nvlist. + */ +int +vdev_anyraid_open_header(vdev_t *cvd, int header, anyraid_header_t *out_header) +{ + spa_t *spa = cvd->vdev_spa; + uint64_t ashift = cvd->vdev_ashift; + uint64_t header_offset = vdev_anyraid_header_offset(cvd, header); + uint64_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(ashift); + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + + abd_t *header_abd = NULL; + zio_t *rio = zio_root(spa, NULL, &header_abd, flags); + child_read(rio, cvd, header_offset, header_size, ZIO_CHECKSUM_LABEL, + NULL, flags); + + int error; + if ((error = zio_wait(rio)) != 0) { + zfs_dbgmsg("Error %d reading anyraid header %d on vdev %s", + error, header, cvd->vdev_path); + abd_free(header_abd); + return (error); + } + + char *header_buf = abd_borrow_buf(header_abd, header_size); + nvlist_t *header_nvl; + error = nvlist_unpack(header_buf, header_size, &header_nvl, + KM_SLEEP); + if (error != 0) { + zfs_dbgmsg("Error %d unpacking anyraid header %d on vdev %s", + error, header, cvd->vdev_path); + abd_return_buf(header_abd, header_buf, header_size); + abd_free(header_abd); + return (error); + } + out_header->ah_abd = header_abd; + out_header->ah_buf = header_buf; + out_header->ah_nvl = header_nvl; + + return (0); +} + +static void +free_header(anyraid_header_t *header, uint64_t header_size) { + fnvlist_free(header->ah_nvl); + abd_return_buf(header->ah_abd, header->ah_buf, header_size); + abd_free(header->ah_abd); +} + +/* + * This function is non-static for ZDB, 
and shouldn't be used for anything else. + * + * Iterate over all the copies of the map for the given child vdev and select + * the best one. + */ +int +vdev_anyraid_pick_best_mapping(vdev_t *cvd, uint64_t *out_txg, + anyraid_header_t *out_header, int *out_mapping) +{ + spa_t *spa = cvd->vdev_spa; + uint64_t ashift = cvd->vdev_ashift; + int error = 0; + uint64_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(ashift); + + int best_mapping = -1; + uint64_t best_txg = 0; + anyraid_header_t best_header = {0}; + boolean_t checkpoint_rb = spa_importing_checkpoint(spa); + + for (int i = 0; i < VDEV_ANYRAID_MAP_COPIES; i++) { + anyraid_header_t header; + error = vdev_anyraid_open_header(cvd, i, &header); + + if (error) + continue; + + nvlist_t *hnvl = header.ah_nvl; + uint16_t version; + if ((error = nvlist_lookup_uint16(hnvl, + VDEV_ANYRAID_HEADER_VERSION, &version)) != 0) { + free_header(&header, header_size); + zfs_dbgmsg("Anyraid header %d on vdev %s: missing " + "version", i, cvd->vdev_path); + continue; + } + if (version != 0) { + free_header(&header, header_size); + error = SET_ERROR(ENOTSUP); + zfs_dbgmsg("Anyraid header %d on vdev %s: invalid " + "version", i, cvd->vdev_path); + continue; + } + + uint64_t pool_guid = 0; + if (nvlist_lookup_uint64(hnvl, VDEV_ANYRAID_HEADER_GUID, + &pool_guid) != 0 || pool_guid != spa_guid(spa)) { + free_header(&header, header_size); + error = SET_ERROR(EINVAL); + zfs_dbgmsg("Anyraid header %d on vdev %s: guid " + "mismatch: %llu %llu", i, cvd->vdev_path, + (u_longlong_t)pool_guid, + (u_longlong_t)spa_guid(spa)); + continue; + } + + uint64_t written_txg; + if (nvlist_lookup_uint64(hnvl, VDEV_ANYRAID_HEADER_TXG, + &written_txg) != 0) { + free_header(&header, header_size); + error = SET_ERROR(EINVAL); + zfs_dbgmsg("Anyraid header %d on vdev %s: no txg", + i, cvd->vdev_path); + continue; + } + /* + * If we're reopening, the current txg hasn't been synced out + * yet; look for one txg earlier. 
+ */ + uint64_t min_txg = spa_current_txg(spa) - + (cvd->vdev_parent->vdev_reopening ? 1 : 0); + if ((written_txg < min_txg && !checkpoint_rb) || + written_txg > spa_load_max_txg(spa)) { + free_header(&header, header_size); + error = SET_ERROR(EINVAL); + zfs_dbgmsg("Anyraid header %d on vdev %s: txg %llu out " + "of bounds (%llu, %llu)", i, cvd->vdev_path, + (u_longlong_t)written_txg, + (u_longlong_t)min_txg, + (u_longlong_t)spa_load_max_txg(spa)); + continue; + } + if (written_txg > best_txg) { + best_txg = written_txg; + best_mapping = i; + if (best_header.ah_nvl) + free_header(&best_header, header_size); + + best_header = header; + } else { + free_header(&header, header_size); + } + } + + if (best_txg != 0) { + *out_txg = best_txg; + *out_mapping = best_mapping; + *out_header = best_header; + return (0); + } + ASSERT(error); + return (error); +} + +static int +anyraid_open_existing(vdev_t *vd, uint64_t child, uint32_t **child_capacities) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + vdev_t *cvd = vd->vdev_child[child]; + uint64_t ashift = cvd->vdev_ashift; + spa_t *spa = vd->vdev_spa; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + uint64_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(ashift); + boolean_t checkpoint_rb = spa_importing_checkpoint(spa); + + anyraid_header_t header; + int mapping; + uint64_t txg; + int error = vdev_anyraid_pick_best_mapping(cvd, &txg, &header, + &mapping); + if (error) + return (error); + + uint8_t disk_id; + if (nvlist_lookup_uint8(header.ah_nvl, VDEV_ANYRAID_HEADER_DISK, + &disk_id) != 0) { + zfs_dbgmsg("Error opening anyraid vdev %llu: No disk ID", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + + uint64_t tile_size; + if (nvlist_lookup_uint64(header.ah_nvl, VDEV_ANYRAID_HEADER_TILE_SIZE, + &tile_size) != 0) { + zfs_dbgmsg("Error opening anyraid vdev %llu: No tile size", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return 
(SET_ERROR(EINVAL)); + } + + uint32_t map_length; + if (nvlist_lookup_uint32(header.ah_nvl, VDEV_ANYRAID_HEADER_LENGTH, + &map_length) != 0) { + zfs_dbgmsg("Error opening anyraid vdev %llu: No map length", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + + uint16_t *caps = NULL; + uint_t count; + if (nvlist_lookup_uint16_array(header.ah_nvl, + VDEV_ANYRAID_HEADER_DISK_SIZES, &caps, &count) != 0) { + zfs_dbgmsg("Error opening anyraid vdev %llu: No child sizes", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + if (count != vd->vdev_children) { + zfs_dbgmsg("Error opening anyraid vdev %llu: Incorrect child " + "count %u vs %u", (u_longlong_t)vd->vdev_id, count, + (uint_t)vd->vdev_children); + free_header(&header, header_size); + return (SET_ERROR(EINVAL)); + } + + *child_capacities = kmem_alloc(sizeof (**child_capacities) * count, + KM_SLEEP); + for (int i = 0; i < count; i++) + (*child_capacities)[i] = caps[i] + 1; + if (vd->vdev_reopening) { + if (va->vd_contracting_leaf != -1) { + (*child_capacities)[va->vd_contracting_leaf] = 0; + } + free_header(&header, header_size); + return (0); + } + + uint32_t state = ARS_NONE; + (void) nvlist_lookup_uint32(header.ah_nvl, + VDEV_ANYRAID_HEADER_RELOC_STATE, &state); + if (state != ARS_NONE) { + vdev_anyraid_relocate_t *var = &va->vd_relocate; + var->var_state = state; + var->var_vd = vd->vdev_id; + if (spa->spa_anyraid_relocate != NULL) { + zfs_dbgmsg("Error opening anyraid vdev %llu: Relocate " + "active when another relocate is in progress", + (u_longlong_t)vd->vdev_id); + free_header(&header, header_size); + return (EINVAL); + } + spa->spa_anyraid_relocate = var; + } + if (state == ARS_CONTRACTING) + spa_async_request(spa, SPA_ASYNC_CONTRACTION_DONE); + + nvlist_t *cur_task; + error = nvlist_lookup_nvlist(header.ah_nvl, + VDEV_ANYRAID_HEADER_CUR_TASK, &cur_task); + if (error != 0 && error != ENOENT) { + 
zfs_dbgmsg("Error opening anyraid vdev %llu: Error opening " + "relocate info %d", (u_longlong_t)vd->vdev_id, error); + free_header(&header, header_size); + return (error); + } + + if (nvlist_lookup_uint32(header.ah_nvl, + VDEV_ANYRAID_HEADER_CONTRACTING_LEAF, + (uint32_t *)&va->vd_contracting_leaf) != 0) + va->vd_contracting_leaf = -1; + + if (error == 0) { + vdev_anyraid_relocate_t *var = &va->vd_relocate; + + ASSERT3U(var->var_state, ==, ARS_SCANNING); + var->var_failed_offset = UINT64_MAX; + var->var_failed_task = UINT64_MAX; + + var->var_offset = var->var_synced_offset = + fnvlist_lookup_uint64(cur_task, VART_OFFSET); + var->var_task = var->var_synced_task = + fnvlist_lookup_uint32(cur_task, VART_TASK); + vdev_anyraid_relocate_task_t *vart = + kmem_alloc(sizeof (*vart), KM_SLEEP); + vart->vart_source_disk = fnvlist_lookup_uint8(cur_task, + VART_SOURCE_DISK); + vart->vart_source_idx = fnvlist_lookup_uint16(cur_task, + VART_SOURCE_OFF); + vart->vart_dest_disk = fnvlist_lookup_uint8(cur_task, + VART_DEST_DISK); + vart->vart_dest_idx = fnvlist_lookup_uint16(cur_task, + VART_DEST_OFF); + vart->vart_tile = fnvlist_lookup_uint32(cur_task, + VART_TILE); + vart->vart_task = var->var_task; + list_insert_head(&var->var_list, vart); + (*child_capacities)[va->vd_contracting_leaf] = 0; + spa->spa_anyraid_relocate = var; + } + + va->vd_checkpoint_tile = UINT32_MAX; + (void) nvlist_lookup_uint32(header.ah_nvl, + VDEV_ANYRAID_HEADER_CHECKPOINT, &va->vd_checkpoint_tile); + + /* + * Because the tile map is 64 MiB and the maximum IO size is 16MiB, + * we may need to issue up to 4 reads to read in the whole thing. + * Similarly, when processing the mapping, we need to iterate across + * the 4 separate buffers. 
+ */ + zio_t *rio = zio_root(spa, NULL, NULL, flags); + abd_t *map_abds[VDEV_ANYRAID_MAP_COPIES] = {0}; + uint64_t header_offset = vdev_anyraid_header_offset(cvd, mapping); + uint64_t map_offset = header_offset + header_size; + int i; + for (i = 0; i <= (map_length / SPA_MAXBLOCKSIZE); i++) { + zio_eck_t *cksum = (zio_eck_t *) + &header.ah_buf[VDEV_ANYRAID_NVL_BYTES(ashift) + + i * sizeof (*cksum)]; + zio_t *nio = zio_null(rio, spa, cvd, NULL, &map_abds[i], flags); + child_read(nio, cvd, map_offset + i * SPA_MAXBLOCKSIZE, + SPA_MAXBLOCKSIZE, ZIO_CHECKSUM_ANYRAID_MAP, cksum, flags); + zio_nowait(nio); + } + i--; + + if ((error = zio_wait(rio))) { + for (; i >= 0; i--) + abd_free(map_abds[i]); + free_header(&header, header_size); + zfs_dbgmsg("Error opening anyraid vdev %llu: map read error %d", + (u_longlong_t)vd->vdev_id, error); + return (error); + } + free_header(&header, header_size); + + uint32_t map = -1, cur_tile = 0; + /* + * For now, all entries are the size of a uint32_t. If that + * ever changes, the logic here needs to be altered to work for + * adaptive sizes, including entries split across 16MiB boundaries. 
+ */ + uint32_t size = sizeof (anyraid_map_loc_entry_t); + uint8_t *map_buf = NULL; + uint8_t pat_cnt = 0; + anyraid_tile_t *at = NULL; + for (uint32_t off = 0; off < map_length; off += size) { + if (checkpoint_rb && cur_tile > va->vd_checkpoint_tile && + pat_cnt == 0) + break; + + int next_map = off / SPA_MAXBLOCKSIZE; + if (map != next_map) { + // switch maps + if (map != -1) { + abd_return_buf(map_abds[map], map_buf, + SPA_MAXBLOCKSIZE); + } + map_buf = abd_borrow_buf(map_abds[next_map], + SPA_MAXBLOCKSIZE); + map = next_map; + +#ifdef _ZFS_BIG_ENDIAN + uint32_t length = map_length - + next_map * SPA_MAXBLOCKSIZE; + byteswap_uint32_array(map_buf, MIN(length, + SPA_MAXBLOCKSIZE)); +#endif + } + anyraid_map_entry_t *entry = + (anyraid_map_entry_t *)(map_buf + (off % SPA_MAXBLOCKSIZE)); + uint8_t type = ame_get_type(entry); + switch (type) { + case AMET_SKIP: { + anyraid_map_skip_entry_t *amse = + &entry->ame_u.ame_amse; + ASSERT0(pat_cnt); + cur_tile += amse_get_skip_count(amse); + break; + } + case AMET_LOC: { + anyraid_map_loc_entry_t *amle = + &entry->ame_u.ame_amle; + create_tile_entry(vd->vdev_spa, va, amle, + &pat_cnt, &at, &cur_tile); + break; + } + default: + PANIC("Invalid entry type %d", type); + } + } + if (map_buf) + abd_return_buf(map_abds[map], map_buf, SPA_MAXBLOCKSIZE); + + va->vd_tile_size = tile_size; + + for (; i >= 0; i--) + abd_free(map_abds[i]); + + /* + * Now that we have the tile map read in, we have to reopen the + * children to properly set and handle the min_asize + */ + for (i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + vdev_reopen(cvd); + } + + int lasterror = 0; + int numerrors = 0; + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + } + + if (numerrors > va->vd_nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +/* + * When
creating a new anyraid vdev, this function calculates the tile size + * to use. We take (by default) 1/64th of the size of the smallest disk or 16 + * GiB, whichever is larger. + */ +static int +anyraid_calculate_size(vdev_t *vd) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + + uint64_t smallest_disk_size = UINT64_MAX; + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + smallest_disk_size = MIN(smallest_disk_size, cvd->vdev_asize - + VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift)); + } + + uint64_t disk_shift = anyraid_disk_shift; + uint64_t min_size = zfs_anyraid_min_tile_size; + if (smallest_disk_size < 1 << disk_shift || + smallest_disk_size < min_size) { + return (SET_ERROR(ENOLCK)); + } + + + ASSERT3U(smallest_disk_size, !=, UINT64_MAX); + uint64_t tile_size = smallest_disk_size >> disk_shift; + tile_size = MAX(tile_size, min_size); + va->vd_tile_size = 1ULL << (highbit64(tile_size - 1)); + + /* + * Later, we're going to cap the metaslab size at the tile + * size, so we need a tile to hold at least enough to store a + * max-size block, or we'll assert in that code. + */ + if (va->vd_tile_size * va->vd_ndata < SPA_MAXBLOCKSIZE) + return (SET_ERROR(ENOSPC)); + return (0); +} + +struct tile_count { + avl_node_t node; + int disk; + int remaining; +}; + +static int +rc_compar(const void *a, const void *b) +{ + const struct tile_count *ra = a; + const struct tile_count *rb = b; + + int cmp = TREE_CMP(rb->remaining, ra->remaining); + if (cmp != 0) + return (cmp); + return (TREE_CMP(rb->disk, ra->disk)); +} + +/* + * I think the only way to calculate the asize for anyraid devices is to + * actually run the allocation algorithm and see what we end up with. It's a + * variant of the bin-packing problem, which is NP-hard. Thankfully + * a first-fit descending algorithm seems to give optimal results for this + * variant. 
+ */ +static uint64_t +calculate_asize(vdev_t *vd, uint64_t *num_tiles) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + + if (va->vd_nparity == 0) { + uint64_t count = 0; + for (int c = 0; c < vd->vdev_children; c++) { + count += num_tiles[c]; + } + return (count * va->vd_tile_size); + } + + /* + * Sort the disks by the number of additional tiles they can store. + */ + avl_tree_t t; + avl_create(&t, rc_compar, sizeof (struct tile_count), + offsetof(struct tile_count, node)); + for (int c = 0; c < vd->vdev_children; c++) { + if (num_tiles[c] == 0) { + ASSERTF(vd->vdev_child[c]->vdev_open_error || + va->vd_contracting_leaf == c, "%d %d", + va->vd_contracting_leaf, c); + continue; + } + struct tile_count *rc = kmem_alloc(sizeof (*rc), KM_SLEEP); + rc->disk = c; + rc->remaining = num_tiles[c] - + anyraid_freelist_alloc(&va->vd_children[c]->van_freelist); + avl_add(&t, rc); + } + + uint32_t map_width = va->vd_nparity + va->vd_ndata; + uint64_t count = avl_numnodes(&va->vd_tile_map); + struct tile_count **cur = kmem_alloc(sizeof (*cur) * map_width, + KM_SLEEP); + for (;;) { + /* Grab the nparity + 1 children with the most free capacity */ + for (int c = 0; c < map_width; c++) { + struct tile_count *rc = avl_first(&t); + ASSERT(rc); + cur[c] = rc; + avl_remove(&t, rc); + } + struct tile_count *rc = cur[map_width - 1]; + struct tile_count *next = avl_first(&t); + uint64_t next_rem = next == NULL ? 0 : next->remaining; + /* If one of the top N + 1 has no capacity left, we're done */ + if (rc->remaining == 0) + break; + + /* + * This is a performance optimization; if the child with the + * lowest free capacity of the ones we've selected has N more + * capacity than the next child, the next N iterations would + * all select the same children. So to save time, we add N + * tiles right now and reduce our iteration count. 
+ */ + uint64_t this_iter = MAX(1, rc->remaining - next_rem); + count += this_iter; + + /* Re-add the selected children with their reduced capacity */ + for (int c = 0; c < map_width; c++) { + ASSERT3U(cur[c]->remaining, >=, this_iter); + cur[c]->remaining -= this_iter; + avl_add(&t, cur[c]); + } + } + for (int c = 0; c < map_width; c++) + kmem_free(cur[c], sizeof (*cur[c])); + kmem_free(cur, sizeof (*cur) * map_width); + void *cookie = NULL; + struct tile_count *node; + + while ((node = avl_destroy_nodes(&t, &cookie)) != NULL) + kmem_free(node, sizeof (*node)); + avl_destroy(&t); + return (count * va->vd_width * va->vd_tile_size); +} + +static int +vdev_anyraid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + int lasterror = 0; + int numerrors = 0; + + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + } + + /* + * If we have more faulted disks than parity, we can't open the device. 
+ */ + if (numerrors > va->vd_nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + uint32_t *child_capacities = NULL; + if (vd->vdev_reopening) { + child_capacities = kmem_alloc(sizeof (*child_capacities) * + vd->vdev_children, KM_SLEEP); + for (uint64_t c = 0; c < vd->vdev_children; c++) { + child_capacities[c] = va->vd_children[c]->van_capacity; + } + if (va->vd_contracting_leaf != -1) + child_capacities[va->vd_contracting_leaf] = 0; + } else if (spa_load_state(vd->vdev_spa) != SPA_LOAD_CREATE && + spa_load_state(vd->vdev_spa) != SPA_LOAD_ERROR && + spa_load_state(vd->vdev_spa) != SPA_LOAD_NONE) { + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if (cvd->vdev_open_error != 0) + continue; + if ((lasterror = anyraid_open_existing(vd, c, + &child_capacities)) == 0) + break; + } + if (lasterror) + return (lasterror); + } else if ((lasterror = anyraid_calculate_size(vd))) { + return (lasterror); + } + + uint64_t max_size = VDEV_ANYRAID_MAX_TPD * va->vd_tile_size; + + /* + * Calculate the number of tiles each child could fit, then use that + * to calculate the asize and min_asize. 
+ */ + uint64_t *num_tiles = kmem_zalloc(vd->vdev_children * + sizeof (*num_tiles), KM_SLEEP); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + uint64_t casize; + if (cvd->vdev_open_error == 0) { + vdev_set_min_asize(cvd); + casize = MIN(max_size, cvd->vdev_asize - + VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift)); + } else { + ASSERT(child_capacities); + casize = child_capacities[c] * va->vd_tile_size; + } + + num_tiles[c] = casize / va->vd_tile_size; + avl_remove(&va->vd_children_tree, va->vd_children[c]); + if (va->vd_contracting_leaf == c) + va->vd_children[c]->van_capacity = 0; + else + va->vd_children[c]->van_capacity = num_tiles[c]; + avl_add(&va->vd_children_tree, va->vd_children[c]); + } + *asize = calculate_asize(vd, num_tiles); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + uint64_t cmasize; + if (cvd->vdev_open_error == 0) { + cmasize = MIN(max_size, cvd->vdev_max_asize - + VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift)); + } else { + cmasize = child_capacities[c] * va->vd_tile_size; + } + + num_tiles[c] = cmasize / va->vd_tile_size; + } + *max_asize = calculate_asize(vd, num_tiles); + + if (child_capacities) { + kmem_free(child_capacities, sizeof (*child_capacities) * + vd->vdev_children); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); + } + kmem_free(num_tiles, vd->vdev_children * sizeof (*num_tiles)); + return (0); +} + +int +vdev_anyraid_load(vdev_t *vd) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + + if (va->vd_relocate.var_state == ARS_NONE || + va->vd_relocate.var_state == ARS_FINISHED) + return (0); + + return (tasklist_read(vd)); +} + +/* + * We cap the metaslab size at the tile size. 
This prevents us from having to
+ * split IOs across multiple tiles, which would be complex extra logic for
+ * little gain.
+ */
+static void
+vdev_anyraid_metaslab_size(vdev_t *vd, uint64_t *shiftp)
+{
+	vdev_anyraid_t *va = vd->vdev_tsd;
+	/* Clamp the metaslab shift so a metaslab never spans two tiles. */
+	*shiftp = MIN(*shiftp, highbit64(va->vd_tile_size) - 1);
+}
+
+/*
+ * Close all child vdevs and, unless this is a reopen, tear down the
+ * in-memory tile map and any pending relocation task lists.
+ */
+static void
+vdev_anyraid_close(vdev_t *vd)
+{
+	vdev_anyraid_t *va = vd->vdev_tsd;
+	for (int c = 0; c < vd->vdev_children; c++) {
+		if (vd->vdev_child[c] != NULL)
+			vdev_close(vd->vdev_child[c]);
+	}
+	/* On reopen the tile map stays valid; only children were cycled. */
+	if (vd->vdev_reopening)
+		return;
+	anyraid_tile_t *tile = NULL;
+	void *cookie = NULL;
+	/*
+	 * NOTE(review): the at_list nodes are freed only when vd_nparity
+	 * != 0, but vdev_anyraid_io_start populates at_list for every new
+	 * tile regardless of parity -- confirm the nparity == 0 path does
+	 * not leak anyraid_tile_node_t entries here.
+	 */
+	while ((tile = avl_destroy_nodes(&va->vd_tile_map, &cookie))) {
+		if (va->vd_nparity != 0) {
+			anyraid_tile_node_t *atn = NULL;
+			while ((atn = list_remove_head(&tile->at_list))) {
+				kmem_free(atn, sizeof (*atn));
+			}
+			list_destroy(&tile->at_list);
+		}
+		kmem_free(tile, sizeof (*tile));
+	}
+
+	/* Drop the SPA's reference to our relocation state, if it is ours. */
+	if (vd->vdev_spa->spa_anyraid_relocate == &va->vd_relocate)
+		vd->vdev_spa->spa_anyraid_relocate = NULL;
+	vdev_anyraid_relocate_t *var = &va->vd_relocate;
+	vdev_anyraid_relocate_task_t *vart;
+	while ((vart = list_remove_head(&var->var_list)))
+		kmem_free(vart, sizeof (*vart));
+	while ((vart = list_remove_head(&var->var_done_list)))
+		kmem_free(vart, sizeof (*vart));
+	// remaining relocate state is presumably freed in _fini -- confirm
+}
+
+/*
+ * Configure the mirror_map and then hand the write off to the normal mirror
+ * logic.
+ */
+static void
+vdev_anyraid_mirror_start(zio_t *zio, anyraid_tile_t *tile,
+    vdev_anyraid_relocate_task_t *task, zfs_locked_range_t *lr)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_anyraid_t *va = vd->vdev_tsd;
+	mirror_map_t *mm = vdev_mirror_map_alloc(va->vd_nparity + 1, B_FALSE,
+	    B_FALSE);
+	uint64_t tsize = va->vd_tile_size;
+
+	anyraid_tile_node_t *atn = list_head(&tile->at_list);
+	for (int c = 0; c < mm->mm_children; c++) {
+		/*
+		 * The tile must carry one sub-tile node per mirror child;
+		 * check this *before* atn is dereferenced (the original
+		 * asserted only after reading atn->atn_disk).
+		 */
+		ASSERT(atn);
+		uint8_t disk;
+		uint16_t offset;
+		if (task && task->vart_source_disk == atn->atn_disk) {
+			/*
+			 * This copy is being relocated; redirect the I/O to
+			 * the destination disk.
+			 * NOTE(review): this pairs vart_dest_disk with
+			 * vart_source_idx although vart_dest_idx exists --
+			 * confirm the destination really keeps the source's
+			 * tile index.
+			 */
+			disk = task->vart_dest_disk;
+			offset = task->vart_source_idx;
+		} else {
+			disk = atn->atn_disk;
+			offset = atn->atn_tile_idx;
+		}
+		mirror_child_t *mc = &mm->mm_child[c];
+		mc->mc_vd = vd->vdev_child[disk];
+		mc->mc_offset = VDEV_ANYRAID_START_OFFSET(vd->vdev_ashift) +
+		    offset * tsize + zio->io_offset % tsize;
+		ASSERT3U(mc->mc_offset, <, mc->mc_vd->vdev_psize -
+		    VDEV_LABEL_END_SIZE);
+		mm->mm_rebuilding = mc->mc_rebuilding = B_FALSE;
+		atn = list_next(&tile->at_list, atn);
+	}
+	/* Every sub-tile node must have been consumed by the loop above. */
+	ASSERT(atn == NULL);
+
+	/* The rangelock is released in vdev_anyraid_io_done(). */
+	zio->io_aux_vsd = lr;
+	zio->io_vsd = mm;
+	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
+
+	vdev_mirror_io_start_impl(zio, mm);
+}
+
+/*
+ * Translate the allocated and configured raidz map to use the proper disks
+ * based on the anyraid tile mapping.
+ */ +static void +vdev_anyraid_raidz_map_translate(vdev_t *vd, raidz_map_t *rm, + anyraid_tile_t *tile, vdev_anyraid_relocate_task_t *task) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + anyraid_tile_node_t **mapping = kmem_zalloc(sizeof (*mapping) * + va->vd_width, KM_SLEEP); + ASSERT(tile); + anyraid_tile_node_t *atn = list_head(&tile->at_list); + for (int i = 0; i < va->vd_width; i++) { + ASSERT(atn); + mapping[i] = atn; + atn = list_next(&tile->at_list, atn); + } + ASSERT3U(rr->rr_scols, <=, va->vd_width); + for (uint64_t c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + atn = mapping[rc->rc_devidx]; + uint8_t disk; + uint16_t offset; + if (task && task->vart_source_disk == atn->atn_disk) { + disk = task->vart_dest_disk; + offset = task->vart_source_idx; + } else { + disk = atn->atn_disk; + offset = atn->atn_tile_idx; + } + uint64_t tile_off = rc->rc_offset % va->vd_tile_size; + uint64_t disk_off = tile_off + + offset * va->vd_tile_size; + rc->rc_offset = VDEV_ANYRAID_TOTAL_MAP_SIZE(vd->vdev_ashift) + + disk_off; + rc->rc_devidx = disk; + } + kmem_free(mapping, sizeof (*mapping) * va->vd_width); +} + +/* + * Configure the raidz_map and then hand the write off to the normal raidz + * logic. 
+ */ +static void +vdev_anyraid_raidz_start(zio_t *zio, anyraid_tile_t *tile, + vdev_anyraid_relocate_task_t *task, zfs_locked_range_t *lr) +{ + vdev_t *vd = zio->io_vd; + vdev_anyraid_t *va = vd->vdev_tsd; + raidz_map_t *rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, + va->vd_width, va->vd_nparity); + vdev_anyraid_raidz_map_translate(vd, rm, tile, task); + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + zio->io_aux_vsd = lr; + vdev_raidz_io_start_impl(zio, rm, va->vd_width, va->vd_width); +} + +typedef struct anyraid_map { + abd_t *am_abd; +} anyraid_map_t; + +static void +vdev_anyraid_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +static void +vdev_anyraid_map_free_vsd(zio_t *zio) +{ + anyraid_map_t *am = zio->io_vsd; + abd_free(am->am_abd); + am->am_abd = NULL; + kmem_free(am, sizeof (*am)); +} + +const zio_vsd_ops_t vdev_anyraid_vsd_ops = { + .vsd_free = vdev_anyraid_map_free_vsd, +}; + +static void +vdev_anyraid_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_anyraid_t *va = vd->vdev_tsd; + uint64_t tsize = va->vd_tile_size * va->vd_width; + + uint64_t start_tile_id = zio->io_offset / tsize; + anyraid_tile_t search; + search.at_tile_id = start_tile_id; + avl_index_t where; + rw_enter(&va->vd_lock, RW_READER); + anyraid_tile_t *tile = avl_find(&va->vd_tile_map, &search, + &where); + + /* + * If we're doing an I/O somewhere that hasn't been allocated yet, we + * may need to allocate a new tile. Upgrade to a write lock so we can + * safely modify the data structure, and then check if someone else + * beat us to it. 
+ */ + if (tile == NULL) { + rw_exit(&va->vd_lock); + rw_enter(&va->vd_lock, RW_WRITER); + tile = avl_find(&va->vd_tile_map, &search, &where); + } + if (tile == NULL) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + zfs_dbgmsg("Allocating tile %llu for zio %px", + (u_longlong_t)start_tile_id, zio); + tile = kmem_alloc(sizeof (*tile), KM_SLEEP); + tile->at_tile_id = start_tile_id; + list_create(&tile->at_list, sizeof (anyraid_tile_node_t), + offsetof(anyraid_tile_node_t, atn_node)); + + uint_t width = va->vd_nparity + va->vd_ndata; + vdev_anyraid_node_t **vans = kmem_alloc(sizeof (*vans) * width, + KM_SLEEP); + for (int i = 0; i < width; i++) { + vans[i] = avl_first(&va->vd_children_tree); + avl_remove(&va->vd_children_tree, vans[i]); + + ASSERT3U(vans[i]->van_id, !=, va->vd_contracting_leaf); + anyraid_tile_node_t *atn = + kmem_alloc(sizeof (*atn), KM_SLEEP); + atn->atn_disk = vans[i]->van_id; + atn->atn_tile_idx = + anyraid_freelist_pop(&vans[i]->van_freelist); + list_insert_tail(&tile->at_list, atn); + } + for (int i = 0; i < width; i++) + avl_add(&va->vd_children_tree, vans[i]); + + kmem_free(vans, sizeof (*vans) * width); + avl_insert(&va->vd_tile_map, tile, where); + } + + zfs_locked_range_t *lr = zfs_rangelock_enter(&va->vd_rangelock, + zio->io_offset, zio->io_size, RL_READER); + + vdev_anyraid_relocate_task_t *task = NULL; + if (va->vd_relocate.var_state == ARS_SCANNING) { + vdev_anyraid_relocate_t *var = &va->vd_relocate; + mutex_enter(&var->var_lock); + vdev_anyraid_relocate_task_t *vart = list_head(&var->var_list); + if (vart && vart->vart_tile == tile->at_tile_id) { + ASSERTF(var->var_offset <= zio->io_offset || + var->var_offset >= zio->io_offset + zio->io_size, + "var_offset %llx is in the middle of IO %llx/%llx " + "%d %llx", (u_longlong_t)var->var_offset, + (u_longlong_t)zio->io_offset, + (u_longlong_t)zio->io_size, zio->io_type, + (u_longlong_t)zio->io_flags); + if (var->var_offset >= zio->io_offset + zio->io_size) { + task = kmem_zalloc(sizeof 
(*vart), KM_SLEEP); + *task = *vart; + } + } + mutex_exit(&var->var_lock); + } + rw_exit(&va->vd_lock); + + switch (va->vd_parity_type) { + case VAP_MIRROR: + if (va->vd_nparity > 0) { + vdev_anyraid_mirror_start(zio, tile, task, lr); + zio_execute(zio); + if (task) + kmem_free(task, sizeof (*task)); + return; + } + break; + case VAP_RAIDZ: + vdev_anyraid_raidz_start(zio, tile, task, lr); + zio_execute(zio); + if (task) + kmem_free(task, sizeof (*task)); + return; + default: + ASSERT0(1); + PANIC("Invalid parity type: %d", va->vd_parity_type); + } + + + anyraid_tile_node_t *atn = list_head(&tile->at_list); + vdev_t *cvd = vd->vdev_child[atn->atn_disk]; + uint64_t child_offset = atn->atn_tile_idx * tsize + + zio->io_offset % tsize; + child_offset += VDEV_ANYRAID_START_OFFSET(vd->vdev_ashift); + + anyraid_map_t *mm = kmem_alloc(sizeof (*mm), KM_SLEEP); + mm->am_abd = abd_get_offset(zio->io_abd, 0); + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_anyraid_vsd_ops; + zio->io_aux_vsd = lr; + + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, child_offset, + mm->am_abd, zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_anyraid_child_done, zio); + zio_nowait(cio); + + zio_execute(zio); + if (task) + kmem_free(task, sizeof (*task)); +} + +static void +vdev_anyraid_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_anyraid_t *va = vd->vdev_tsd; + + switch (va->vd_parity_type) { + case VAP_MIRROR: + if (va->vd_nparity > 0) { + vdev_mirror_io_done(zio); + break; + } + break; + case VAP_RAIDZ: + vdev_raidz_io_done(zio); + break; + default: + panic("Invalid parity type: %d", va->vd_parity_type); + } + if (zio->io_stage != ZIO_STAGE_VDEV_IO_DONE) + return; + zfs_locked_range_t *lr = zio->io_aux_vsd; + ASSERT(lr); + zfs_rangelock_exit(lr); + zio->io_aux_vsd = NULL; +} + +static void +vdev_anyraid_state_change(vdev_t *vd, int faulted, int degraded) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + if (faulted > va->vd_nparity) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + 
VDEV_AUX_NO_REPLICAS); + } else if (degraded + faulted != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + } else { + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } +} + +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. The function + * assumes that at least one DTL is dirty which implies that full stripe + * width blocks must be resilvered. + */ +static boolean_t +vdev_anyraid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + (void) psize; + vdev_anyraid_t *va = vd->vdev_tsd; + // TODO should we always resilver if we're rebalancing/contracting? + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + uint64_t tsize = va->vd_tile_size * va->vd_width; + uint64_t start_tile_id = DVA_GET_OFFSET(dva) / tsize; + anyraid_tile_t search; + search.at_tile_id = start_tile_id; + avl_index_t where; + rw_enter(&va->vd_lock, RW_READER); + anyraid_tile_t *tile = avl_find(&va->vd_tile_map, &search, + &where); + rw_exit(&va->vd_lock); + ASSERT(tile); + + for (anyraid_tile_node_t *atn = list_head(&tile->at_list); + atn != NULL; atn = list_next(&tile->at_list, atn)) { + vdev_t *cvd = vd->vdev_child[atn->atn_disk]; + + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Right now, we don't translate anything beyond the end of the allocated + * ranges for the target leaf vdev. This means that trim and initialize won't + * affect those areas on anyraid devices. Given the target use case, this is + * not a significant concern, but a rework of the xlate logic could enable this + * in the future. 
+ */ +static void +vdev_anyraid_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) +{ + vdev_t *anyraidvd = cvd->vdev_parent; + ASSERT(vdev_is_anyraid(anyraidvd)); + vdev_anyraid_t *va = anyraidvd->vdev_tsd; + uint64_t ptsize = va->vd_tile_size; + uint64_t ltsize = ptsize * va->vd_width; + // TODO should we always fail if we're rebalancing/contracting? + + uint64_t start_tile_id = logical_rs->rs_start / ltsize; + ASSERT3U(start_tile_id, ==, (logical_rs->rs_end - 1) / ltsize); + anyraid_tile_t search; + search.at_tile_id = start_tile_id; + avl_index_t where; + rw_enter(&va->vd_lock, RW_READER); + anyraid_tile_t *tile = avl_find(&va->vd_tile_map, &search, + &where); + rw_exit(&va->vd_lock); + // This tile doesn't exist yet + if (tile == NULL) { + physical_rs->rs_start = physical_rs->rs_end = 0; + return; + } + uint64_t idx = 0; + anyraid_tile_node_t *atn = list_head(&tile->at_list); + for (; atn != NULL; atn = list_next(&tile->at_list, atn), idx++) + if (anyraidvd->vdev_child[atn->atn_disk] == cvd) + break; + // The tile exists, but isn't stored on this child + if (atn == NULL) { + physical_rs->rs_start = physical_rs->rs_end = 0; + return; + } + + switch (va->vd_parity_type) { + case VAP_MIRROR: + { + uint64_t child_offset = atn->atn_tile_idx * ptsize + + logical_rs->rs_start % ptsize; + child_offset += + VDEV_ANYRAID_START_OFFSET(anyraidvd->vdev_ashift); + uint64_t size = logical_rs->rs_end - + logical_rs->rs_start; + + physical_rs->rs_start = child_offset; + physical_rs->rs_end = child_offset + size; + break; + } + case VAP_RAIDZ: + { + uint64_t width = va->vd_width; + uint64_t tgt_col = idx; + uint64_t ashift = anyraidvd->vdev_ashift; + uint64_t tile_start = VDEV_ANYRAID_TOTAL_MAP_SIZE( + anyraidvd->vdev_ashift) + atn->atn_tile_idx * + ptsize; + + uint64_t b_start = + (logical_rs->rs_start % ltsize) >> ashift; + uint64_t b_end = + (logical_rs->rs_end % ltsize) >> ashift; + + uint64_t start_row = 
0; + /* avoid underflow */ + if (b_start > tgt_col) { + start_row = ((b_start - tgt_col - 1) / width) + + 1; + } + + uint64_t end_row = 0; + if (b_end > tgt_col) + end_row = ((b_end - tgt_col - 1) / width) + 1; + + physical_rs->rs_start = + tile_start + (start_row << ashift); + physical_rs->rs_end = + tile_start + (end_row << ashift); + break; + } + default: + panic("Invalid parity type: %d", va->vd_parity_type); + } + remain_rs->rs_start = 0; + remain_rs->rs_end = 0; +} + +static uint64_t +vdev_anyraid_nparity(vdev_t *vd) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + return (va->vd_nparity); +} + +static uint64_t +vdev_anyraid_ndisks(vdev_t *vd) +{ + return (vd->vdev_children); +} + +/* + * Functions related to syncing out the tile map each TXG. + */ +static boolean_t +map_write_loc_entry(anyraid_tile_node_t *atn, void *buf, uint32_t *offset) +{ + anyraid_map_loc_entry_t *entry = (void *)((char *)buf + *offset); + amle_set_type(entry); + amle_set_disk(entry, atn->atn_disk); + amle_set_offset(entry, atn->atn_tile_idx); + *offset += sizeof (*entry); + return (*offset == SPA_MAXBLOCKSIZE); +} + +static boolean_t +map_write_skip_entry(uint32_t tile, void *buf, uint32_t *offset, + uint32_t prev_id) +{ + anyraid_map_skip_entry_t *entry = (void *)((char *)buf + *offset); + amse_set_type(entry); + amse_set_skip_count(entry, tile - prev_id - 1); + *offset += sizeof (*entry); + return (*offset == SPA_MAXBLOCKSIZE); +} + +static void +anyraid_map_write_done(zio_t *zio) +{ + abd_free(zio->io_abd); +} + +static void +map_write_issue(zio_t *zio, vdev_t *vd, uint64_t base_offset, + uint8_t idx, uint32_t length, abd_t *abd, zio_eck_t *cksum_out, + int flags) +{ +#ifdef _ZFS_BIG_ENDIAN + void *buf = abd_borrow_buf(abd, SPA_MAXBLOCKSIZE); + byteswap_uint32_array(buf, length); + abd_return_buf(abd, buf, SPA_MAXBLOCKSIZE); +#else + (void) length; +#endif + + zio_nowait(zio_write_phys(zio, vd, base_offset + + idx * VDEV_ANYRAID_MAP_SIZE + + VDEV_ANYRAID_MAP_HEADER_SIZE(vd->vdev_ashift), 
SPA_MAXBLOCKSIZE, + abd, ZIO_CHECKSUM_ANYRAID_MAP, anyraid_map_write_done, cksum_out, + ZIO_PRIORITY_SYNC_WRITE, flags, B_FALSE)); +} + +static void +vdev_anyraid_write_map_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0 && good_writes != NULL) + atomic_inc_64(good_writes); +} + +void +vdev_anyraid_write_map_sync(vdev_t *vd, zio_t *pio, uint64_t txg, + uint64_t *good_writes, int flags, vdev_config_sync_status_t status) +{ + vdev_t *anyraidvd = vd->vdev_parent; + ASSERT(vdev_is_anyraid(anyraidvd)); + spa_t *spa = vd->vdev_spa; + vdev_anyraid_t *va = anyraidvd->vdev_tsd; + uint32_t header_size = VDEV_ANYRAID_MAP_HEADER_SIZE(vd->vdev_ashift); + uint32_t nvl_bytes = VDEV_ANYRAID_NVL_BYTES(vd->vdev_ashift); + uint8_t update_target = txg % VDEV_ANYRAID_MAP_COPIES; + uint64_t base_offset = vdev_anyraid_header_offset(vd, update_target); + + abd_t *header_abd = + abd_alloc_linear(header_size, B_TRUE); + abd_zero(header_abd, header_size); + void *header_buf = abd_borrow_buf(header_abd, header_size); + zio_eck_t *cksums = (zio_eck_t *)&((char *)header_buf)[nvl_bytes]; + + abd_t *map_abd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_TRUE); + uint8_t written = 0; + void *buf = abd_borrow_buf(map_abd, SPA_MAXBLOCKSIZE); + + rw_enter(&va->vd_lock, RW_READER); + anyraid_tile_t *cur = avl_first(&va->vd_tile_map); + anyraid_tile_node_t *curn = cur != NULL ? + list_head(&cur->at_list) : NULL; + uint32_t buf_offset = 0, prev_id = UINT32_MAX; + zio_t *zio = zio_root(spa, NULL, NULL, flags); + /* Write out each sub-tile in turn */ + while (cur) { + if (status == VDEV_CONFIG_REWINDING_CHECKPOINT && + cur->at_tile_id > va->vd_checkpoint_tile) + break; + + anyraid_tile_t *next = AVL_NEXT(&va->vd_tile_map, cur); + IMPLY(prev_id != UINT32_MAX, cur->at_tile_id >= prev_id); + /* + * Determine if we need to write a skip entry before the + * current one. 
+ */ + boolean_t skip = + (prev_id == UINT32_MAX && cur->at_tile_id != 0) || + (prev_id != UINT32_MAX && cur->at_tile_id > prev_id + 1); + if ((skip && map_write_skip_entry(cur->at_tile_id, buf, + &buf_offset, prev_id)) || + (!skip && map_write_loc_entry(curn, buf, &buf_offset))) { + // Let the final write handle it + if (next == NULL) + break; + abd_return_buf_copy(map_abd, buf, SPA_MAXBLOCKSIZE); + map_write_issue(zio, vd, base_offset, written, + buf_offset, map_abd, &cksums[written], flags); + + map_abd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_TRUE); + written++; + ASSERT3U(written, <, + VDEV_ANYRAID_MAP_SIZE / SPA_MAXBLOCKSIZE); + buf = abd_borrow_buf(map_abd, SPA_MAXBLOCKSIZE); + buf_offset = 0; + } + prev_id = cur->at_tile_id; + /* + * Advance the current sub-tile; if it moves us past the end + * of the current list of sub-tiles, start the next tile. + */ + if (!skip) { + curn = list_next(&cur->at_list, curn); + if (curn == NULL) { + cur = next; + curn = cur != NULL ? + list_head(&cur->at_list) : NULL; + } + } + } + + if (status == VDEV_CONFIG_NO_CHECKPOINT || + status == VDEV_CONFIG_REWINDING_CHECKPOINT) { + va->vd_checkpoint_tile = UINT32_MAX; + } else if (status == VDEV_CONFIG_CREATING_CHECKPOINT) { + anyraid_tile_t *at = avl_last(&va->vd_tile_map); + ASSERT(at); + va->vd_checkpoint_tile = at->at_tile_id; + } + rw_exit(&va->vd_lock); + + abd_return_buf_copy(map_abd, buf, SPA_MAXBLOCKSIZE); + map_write_issue(zio, vd, base_offset, written, buf_offset, map_abd, + &cksums[written], flags); + + if (zio_wait(zio)) + return; + + // Populate the header + uint16_t *sizes = kmem_zalloc(sizeof (*sizes) * + anyraidvd->vdev_children, KM_SLEEP); + uint64_t disk_id = 0; + for (uint64_t i = 0; i < anyraidvd->vdev_children; i++) { + if (anyraidvd->vdev_child[i] == vd) + disk_id = i; + sizes[i] = va->vd_children[i]->van_capacity - 1; + } + ASSERT3U(disk_id, <, anyraidvd->vdev_children); + nvlist_t *header = fnvlist_alloc(); + fnvlist_add_uint16(header, 
VDEV_ANYRAID_HEADER_VERSION, 0); + fnvlist_add_uint8(header, VDEV_ANYRAID_HEADER_DISK, disk_id); + fnvlist_add_uint64(header, VDEV_ANYRAID_HEADER_TXG, txg); + fnvlist_add_uint64(header, VDEV_ANYRAID_HEADER_GUID, spa_guid(spa)); + fnvlist_add_uint64(header, VDEV_ANYRAID_HEADER_TILE_SIZE, + va->vd_tile_size); + fnvlist_add_uint32(header, VDEV_ANYRAID_HEADER_LENGTH, + written * SPA_MAXBLOCKSIZE + buf_offset); + fnvlist_add_uint16_array(header, VDEV_ANYRAID_HEADER_DISK_SIZES, sizes, + anyraidvd->vdev_children); + kmem_free(sizes, sizeof (*sizes) * anyraidvd->vdev_children); + + if (va->vd_checkpoint_tile != UINT32_MAX) { + fnvlist_add_uint32(header, VDEV_ANYRAID_HEADER_CHECKPOINT, + va->vd_checkpoint_tile); + } + vdev_anyraid_relocate_t *var = &va->vd_relocate; + if (var->var_state != ARS_NONE && var->var_state != ARS_FINISHED) + fnvlist_add_uint32(header, VDEV_ANYRAID_HEADER_RELOC_STATE, + (uint32_t)var->var_state); + if (var->var_state == ARS_SCANNING) { + mutex_enter(&va->vd_relocate.var_lock); + uint64_t task = va->vd_relocate.var_synced_task; + list_t *l = &va->vd_relocate.var_done_list; + vdev_anyraid_relocate_task_t *vart = list_head(l); + for (;;) { + if (vart == NULL) { + l = &va->vd_relocate.var_list; + vart = list_head(l); + } + if (vart->vart_task == task) + break; + vart = list_next(l, vart); + } + nvlist_t *rebal_task = fnvlist_alloc(); + fnvlist_add_uint32(rebal_task, VART_TILE, + vart->vart_tile); + fnvlist_add_uint8(rebal_task, VART_SOURCE_DISK, + vart->vart_source_disk); + fnvlist_add_uint8(rebal_task, VART_DEST_DISK, + vart->vart_dest_disk); + fnvlist_add_uint16(rebal_task, VART_SOURCE_OFF, + vart->vart_source_idx); + fnvlist_add_uint16(rebal_task, VART_DEST_OFF, + vart->vart_dest_idx); + fnvlist_add_uint64(rebal_task, VART_OFFSET, + va->vd_relocate.var_synced_offset); + fnvlist_add_uint32(rebal_task, VART_TASK, task); + fnvlist_add_nvlist(header, + VDEV_ANYRAID_HEADER_CUR_TASK, rebal_task); + fnvlist_free(rebal_task); + 
mutex_exit(&va->vd_relocate.var_lock); + } + if (va->vd_contracting_leaf != -1) { + fnvlist_add_uint32(header, + VDEV_ANYRAID_HEADER_CONTRACTING_LEAF, + va->vd_contracting_leaf); + } + size_t packed_size; + char *packed = NULL; + VERIFY0(nvlist_pack(header, &packed, &packed_size, NV_ENCODE_XDR, + KM_SLEEP)); + fnvlist_free(header); + ASSERT3U(packed_size, <, nvl_bytes); + memcpy(header_buf, packed, packed_size); + fnvlist_pack_free(packed, packed_size); + abd_return_buf_copy(header_abd, header_buf, header_size); + + // Write out the header + zio_t *header_zio = zio_write_phys(pio, vd, base_offset, header_size, + header_abd, ZIO_CHECKSUM_LABEL, vdev_anyraid_write_map_done, + good_writes, ZIO_PRIORITY_SYNC_WRITE, flags, B_FALSE); + zio_nowait(header_zio); + abd_free(header_abd); +} + +static uint64_t +vdev_anyraid_min_attach_size(vdev_t *vd) +{ + ASSERT(vdev_is_anyraid(vd)); + ASSERT3U(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER), !=, 0); + vdev_anyraid_t *va = vd->vdev_tsd; + ASSERT(va->vd_tile_size); + return (VDEV_ANYRAID_TOTAL_MAP_SIZE(vd->vdev_ashift) + + va->vd_tile_size); +} + +static uint64_t +vdev_anyraid_min_asize(vdev_t *pvd, vdev_t *cvd) +{ + ASSERT(vdev_is_anyraid(pvd)); + vdev_anyraid_t *va = pvd->vdev_tsd; + if (va->vd_tile_size == 0) + return (VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift)); + + rw_enter(&va->vd_lock, RW_READER); + uint64_t size = VDEV_ANYRAID_TOTAL_MAP_SIZE(cvd->vdev_ashift) + + va->vd_children[cvd->vdev_id]->van_capacity * + va->vd_tile_size; + rw_exit(&va->vd_lock); + return (size); +} + +void +vdev_anyraid_expand(vdev_t *tvd, vdev_t *newvd) +{ + vdev_anyraid_t *va = tvd->vdev_tsd; + uint64_t old_children = tvd->vdev_children - 1; + + ASSERT3U(spa_config_held(tvd->vdev_spa, SCL_ALL, RW_WRITER), ==, + SCL_ALL); + vdev_anyraid_node_t **nc = kmem_alloc(tvd->vdev_children * sizeof (*nc), + KM_SLEEP); + vdev_anyraid_node_t *newchild = kmem_alloc(sizeof (*newchild), + KM_SLEEP); + newchild->van_id = newvd->vdev_id; + 
anyraid_freelist_create(&newchild->van_freelist, 0);
+	uint64_t max_size = VDEV_ANYRAID_MAX_TPD * va->vd_tile_size;
+	/* Usable capacity in tiles, after reserving room for the tile map. */
+	newchild->van_capacity = (MIN(max_size, (newvd->vdev_asize -
+	    VDEV_ANYRAID_TOTAL_MAP_SIZE(newvd->vdev_ashift))) /
+	    va->vd_tile_size);
+	rw_enter(&va->vd_lock, RW_WRITER);
+	/* Swap in the grown children array: copy old entries, then free. */
+	memcpy(nc, va->vd_children, old_children * sizeof (*nc));
+	kmem_free(va->vd_children, old_children * sizeof (*nc));
+	va->vd_children = nc;
+	va->vd_children[old_children] = newchild;
+	avl_add(&va->vd_children_tree, newchild);
+	rw_exit(&va->vd_lock);
+}
+
+/*
+ * Report whether the tile containing 'offset' is mapped as of 'txg': the
+ * tile must exist and have been synced at least VDEV_ANYRAID_MAP_COPIES
+ * txgs ago (i.e. every rotating map copy has seen it).
+ * NOTE(review): the tile id is computed as offset / vd_tile_size here,
+ * whereas io_start and need_resilver divide by tile_size * vd_width --
+ * confirm 'offset' is in the per-disk rather than logical address space.
+ */
+boolean_t
+vdev_anyraid_mapped(vdev_t *vd, uint64_t offset, uint64_t txg)
+{
+	vdev_anyraid_t *va = vd->vdev_tsd;
+	anyraid_tile_t search;
+	search.at_tile_id = offset / va->vd_tile_size;
+
+	rw_enter(&va->vd_lock, RW_READER);
+	anyraid_tile_t *tile = avl_find(&va->vd_tile_map, &search, NULL);
+	boolean_t result = tile != NULL && tile->at_synced +
+	    VDEV_ANYRAID_MAP_COPIES <= txg;
+	rw_exit(&va->vd_lock);
+
+	return (result);
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range
+ * given the following constraints.
An anyraid chunk may not:
+ *
+ * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
+ * - Span anyraid tiles
+ */
+static uint64_t
+vdev_anyraid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+    uint64_t max_segment)
+{
+	vdev_anyraid_t *va = vd->vdev_tsd;
+	ASSERT(vdev_is_anyraid(vd));
+
+	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
+	    SPA_MAXBLOCKSIZE);
+
+	/*
+	 * Clamp the segment so it does not cross a tile boundary. The last
+	 * byte of the segment is at start + psize - 1; testing with
+	 * start + psize would misclassify a segment that merely ends on a
+	 * boundary, and the old clamp P2ROUNDUP(start, tile) - start
+	 * produced a zero-length segment whenever start was tile-aligned.
+	 */
+	if (start / va->vd_tile_size !=
+	    (start + psize - 1) / va->vd_tile_size) {
+		psize = va->vd_tile_size - (start % va->vd_tile_size);
+	}
+
+	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
+}
+
+/*
+ * Allocated size for a psize-byte block: mirrors defer to the default
+ * logic; raidz-style layouts add parity sectors per data row and round
+ * up to a multiple of nparity + 1 sectors.
+ */
+static uint64_t
+vdev_anyraid_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
+{
+	vdev_anyraid_t *va = vd->vdev_tsd;
+	ASSERT(vdev_is_anyraid(vd));
+	if (va->vd_parity_type == VAP_MIRROR)
+		return (vdev_default_asize(vd, psize, txg));
+
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
+	uint64_t nparity = va->vd_nparity;
+	uint64_t cols = va->vd_width;
+
+	uint64_t asize = ((psize - 1) >> ashift) + 1;
+	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
+	asize = roundup(asize, nparity + 1) << ashift;
+
+#ifdef ZFS_DEBUG
+	/*
+	 * NOTE(review): asize_new recomputes asize with identical inputs,
+	 * so this VERIFY is vacuous -- possibly leftover from a
+	 * variable-width experiment; confirm and simplify.
+	 */
+	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
+	uint64_t ncols_new = cols;
+	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
+	    (ncols_new - nparity));
+	asize_new = roundup(asize_new, nparity + 1) << ashift;
+	VERIFY3U(asize_new, <=, asize);
+#endif
+
+	return (asize);
+}
+
+/* Inverse of vdev_anyraid_asize for raidz layouts (mirror not supported). */
+static uint64_t
+vdev_anyraid_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	(void) txg;
+	vdev_anyraid_t *va = vd->vdev_tsd;
+	ASSERT(vdev_is_anyraid(vd));
+	ASSERT3U(va->vd_parity_type, ==, VAP_RAIDZ);
+
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
+	uint64_t nparity = va->vd_nparity;
+	uint64_t cols = va->vd_width;
+
+	ASSERT0(asize % (1 << ashift));
+
+	uint64_t psize = (asize >> ashift);
+	/*
+	 * If the roundup to nparity + 1 caused us to spill into a new row, we
+	 * need to ignore that row entirely (since it can't store data or
+	 *
parity). + */ + uint64_t rows = psize / cols; + psize = psize - (rows * cols) <= nparity ? rows * cols : psize; + /* Subtract out parity sectors for each row storing data. */ + psize -= nparity * DIV_ROUND_UP(psize, cols); + psize <<= ashift; + + return (psize); +} + +uint64_t +vdev_anyraid_child_num_tiles(vdev_t *vd, vdev_t *cvd) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + ASSERT(vdev_is_anyraid(vd)); + + uint64_t total = 0; + rw_enter(&va->vd_lock, RW_READER); + if (cvd != NULL) { + vdev_anyraid_node_t *n = va->vd_children[cvd->vdev_id]; + total = anyraid_freelist_alloc(&n->van_freelist); + } else { + for (int i = 0; i < vd->vdev_children; i++) { + vdev_anyraid_node_t *n = va->vd_children[i]; + total += anyraid_freelist_alloc(&n->van_freelist); + } + } + rw_exit(&va->vd_lock); + return (total); +} + +uint64_t +vdev_anyraid_child_capacity(vdev_t *vd, vdev_t *cvd) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + ASSERT(vdev_is_anyraid(vd)); + + uint64_t total = 0; + rw_enter(&va->vd_lock, RW_READER); + if (cvd != NULL) { + vdev_anyraid_node_t *n = va->vd_children[cvd->vdev_id]; + total = n->van_capacity; + } else { + for (int i = 0; i < vd->vdev_children; i++) { + vdev_anyraid_node_t *n = va->vd_children[i]; + total += n->van_capacity; + } + } + rw_exit(&va->vd_lock); + return (total); +} + +vdev_ops_t vdev_anymirror_ops = { + .vdev_op_init = vdev_anyraid_init, + .vdev_op_fini = vdev_anyraid_fini, + .vdev_op_open = vdev_anyraid_open, + .vdev_op_close = vdev_anyraid_close, + .vdev_op_psize_to_asize = vdev_anyraid_asize, + .vdev_op_asize_to_psize = vdev_default_psize, + .vdev_op_min_asize = vdev_anyraid_min_asize, + .vdev_op_min_attach_size = vdev_anyraid_min_attach_size, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_anyraid_io_start, + .vdev_op_io_done = vdev_anyraid_io_done, + .vdev_op_state_change = vdev_anyraid_state_change, + .vdev_op_need_resilver = vdev_anyraid_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + 
.vdev_op_xlate = vdev_anyraid_xlate, + .vdev_op_rebuild_asize = vdev_anyraid_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_anyraid_config_generate, + .vdev_op_nparity = vdev_anyraid_nparity, + .vdev_op_ndisks = vdev_anyraid_ndisks, + .vdev_op_metaslab_size = vdev_anyraid_metaslab_size, + .vdev_op_type = VDEV_TYPE_ANYMIRROR, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ +}; + +vdev_ops_t vdev_anyraidz_ops = { + .vdev_op_init = vdev_anyraid_init, + .vdev_op_fini = vdev_anyraid_fini, + .vdev_op_open = vdev_anyraid_open, + .vdev_op_close = vdev_anyraid_close, + .vdev_op_psize_to_asize = vdev_anyraid_asize, + .vdev_op_asize_to_psize = vdev_anyraid_psize, + .vdev_op_min_asize = vdev_anyraid_min_asize, + .vdev_op_min_attach_size = vdev_anyraid_min_attach_size, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_anyraid_io_start, + .vdev_op_io_done = vdev_anyraid_io_done, + .vdev_op_state_change = vdev_anyraid_state_change, + .vdev_op_need_resilver = vdev_anyraid_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_anyraid_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_anyraid_config_generate, + .vdev_op_nparity = vdev_anyraid_nparity, + .vdev_op_ndisks = vdev_anyraid_ndisks, + .vdev_op_metaslab_size = vdev_anyraid_metaslab_size, + .vdev_op_type = VDEV_TYPE_ANYRAIDZ, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ +}; + + +/* + * ========================================================================== + * TILE MOTION & REBALANCE LOGIC + * ========================================================================== + */ + +vdev_anyraid_relocate_t * +vdev_anyraid_relocate_status(vdev_t *vd) +{ + ASSERT(vdev_is_anyraid(vd)); + vdev_anyraid_t *va = vd->vdev_tsd; + return (&va->vd_relocate); +} + +static void +tasklist_write(spa_t *spa, vdev_anyraid_relocate_t 
*var, dmu_tx_t *tx) +{ + uint64_t obj = var->var_object; + objset_t *mos = spa->spa_meta_objset; + ASSERT(MUTEX_HELD(&var->var_lock)); + + size_t total_count = 0, done_count = 0; + for (vdev_anyraid_relocate_task_t *t = list_head(&var->var_done_list); + t; t = list_next(&var->var_done_list, t)) { + done_count++; + total_count++; + } + for (vdev_anyraid_relocate_task_t *t = list_head(&var->var_list); t; + t = list_next(&var->var_list, t)) + total_count++; + size_t buflen = MIN(SPA_OLD_MAXBLOCKSIZE, + total_count * sizeof (relocate_task_phys_t)); + relocate_task_phys_t *buf = kmem_alloc(buflen, KM_SLEEP); + + size_t count = 0; + size_t written = 0; + list_t *ls[2]; + ls[0] = &var->var_done_list; + ls[1] = &var->var_list; + for (int i = 0; i < 2; i++) { + for (vdev_anyraid_relocate_task_t *t = list_head(ls[i]); t; + t = list_next(ls[i], t)) { + if (count == SPA_OLD_MAXBLOCKSIZE / sizeof (*buf)) { + ASSERT3U(buflen, ==, SPA_OLD_MAXBLOCKSIZE); + dmu_write(mos, obj, written * + SPA_OLD_MAXBLOCKSIZE, buflen, buf, tx, + DMU_READ_NO_PREFETCH); + + size_t next_buflen = MIN(SPA_OLD_MAXBLOCKSIZE, + (total_count - count) * sizeof (*buf)); + if (next_buflen != buflen) { + kmem_free(buf, buflen); + buf = kmem_alloc(next_buflen, KM_SLEEP); + buflen = next_buflen; + } + count = 0; + } + + ASSERT3U(count * sizeof (*buf), <, buflen); + relocate_task_phys_t *rtp = buf + count++; + rtp->rtp_source_disk = t->vart_source_disk; + rtp->rtp_dest_disk = t->vart_dest_disk; + rtp->rtp_source_idx = t->vart_source_idx; + rtp->rtp_dest_idx = t->vart_dest_idx; + rtp->rtp_tile = t->vart_tile; + rtp->rtp_task = t->vart_task; + rtp->rtp_pad2 = 0; + } + } + dmu_write(mos, obj, written * SPA_OLD_MAXBLOCKSIZE, buflen, buf, tx, + DMU_READ_NO_PREFETCH); + kmem_free(buf, buflen); + + dmu_buf_t *dbp; + VERIFY0(dmu_bonus_hold(mos, obj, FTAG, &dbp)); + ASSERT3U(dbp->db_size, >=, sizeof (relocate_phys_t)); + relocate_phys_t *rp = dbp->db_data; + dmu_buf_will_dirty(dbp, tx); + rp->rp_total = total_count; + 
rp->rp_done = done_count; + dmu_buf_rele(dbp, FTAG); +} + +static int +tasklist_read(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + vdev_anyraid_t *va = vd->vdev_tsd; + vdev_anyraid_relocate_t *var = &va->vd_relocate; + ASSERT3P(spa->spa_anyraid_relocate, ==, var); + + uint64_t object; + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_RELOCATE_OBJ, sizeof (uint64_t), 1, &object); + if (error == ENOENT) { + if (var->var_state != ARS_CONTRACTING) + return (ENOENT); + goto disable_tail; + } + if (error != 0) + return (error); + + dmu_buf_t *dbp; + if ((error = dmu_bonus_hold(mos, object, FTAG, &dbp)) != 0) + return (error); + + relocate_phys_t *rpp = dbp->db_data; + size_t done = rpp->rp_done; + size_t total = rpp->rp_total; + dmu_buf_rele(dbp, FTAG); + + mutex_enter(&var->var_lock); + ASSERT0(var->var_object); + var->var_object = object; + mutex_exit(&var->var_lock); + size_t buflen = MIN(SPA_OLD_MAXBLOCKSIZE, + total * sizeof (relocate_task_phys_t)); + relocate_task_phys_t *buf = kmem_alloc(buflen, KM_SLEEP); + list_t *l = &var->var_list; + size_t i; + for (i = 0; i < total; i++) { + size_t idx = i % (SPA_OLD_MAXBLOCKSIZE / sizeof (*buf)); + if (idx == 0) { + size_t next_buflen = MIN(SPA_OLD_MAXBLOCKSIZE, + (total - i) * sizeof (relocate_task_phys_t)); + if (next_buflen != buflen) { + kmem_free(buf, buflen); + buflen = next_buflen; + buf = kmem_alloc(buflen, KM_SLEEP); + } + error = dmu_read(mos, var->var_object, + i * sizeof (*buf), buflen, buf, DMU_READ_PREFETCH); + if (error) { + // The task lists will be freed when we fini vd + kmem_free(buf, buflen); + return (error); + } + } + if (i == done && list_head(&var->var_list)) { + l = &var->var_list; + vdev_anyraid_relocate_task_t *vart = + list_remove_head(l); + ASSERT(vart); + kmem_free(vart, sizeof (*vart)); + } + vdev_anyraid_relocate_task_t *vart = + kmem_alloc(sizeof (*vart), KM_SLEEP); + relocate_task_phys_t *rtp = buf + idx; + vart->vart_source_disk = 
rtp->rtp_source_disk; + vart->vart_dest_disk = rtp->rtp_dest_disk; + vart->vart_source_idx = rtp->rtp_source_idx; + vart->vart_dest_idx = rtp->rtp_dest_idx; + vart->vart_tile = rtp->rtp_tile; + vart->vart_task = rtp->rtp_task; + + /* + * We need to disable metaslabs here; any metaslabs that are + * after the first done task but before or containing the + * resume offset. + */ + if (i >= done && vart->vart_task <= var->var_task) { + uint64_t ms_per_tile = va->vd_tile_size >> + vd->vdev_ms_shift; + uint64_t start = vart->vart_tile * ms_per_tile; + uint64_t end = start + ms_per_tile; + for (uint64_t m = start; m < end; m++) { + ASSERT(vd->vdev_ms); + metaslab_t *ms = vd->vdev_ms[m]; + if (vart->vart_task == var->var_task) { + zfs_range_seg64_t log, phys, rem; + log.rs_start = ms->ms_start; + log.rs_end = ms->ms_start + ms->ms_size; + vdev_xlate(vd->vdev_child[ + vart->vart_source_disk], &log, + &phys, &rem); + if (phys.rs_start == phys.rs_end || + phys.rs_start > var->var_offset) + continue; + } + metaslab_disable_nowait(ms); + vart->vart_dis_ms++; + } + } + + rw_enter(&va->vd_lock, RW_WRITER); + anyraid_freelist_t *af = + &va->vd_children[vart->vart_source_disk]->van_freelist; + boolean_t sourcefree = anyraid_freelist_isfree(af, + vart->vart_source_idx); + if (sourcefree) + anyraid_freelist_remove(af, vart->vart_source_idx); + + af = &va->vd_children[vart->vart_dest_disk]->van_freelist; + boolean_t destfree = anyraid_freelist_isfree(af, + vart->vart_dest_idx); + if (destfree) + anyraid_freelist_remove(af, vart->vart_dest_idx); + + // Either one or the other should be in the mapping already. 
+ ASSERT3U(sourcefree, !=, destfree); + rw_exit(&va->vd_lock); + + list_insert_tail(l, vart); + } + if (i == done) { + vdev_anyraid_relocate_task_t *vart = + list_remove_head(&var->var_list); + ASSERT(vart); + kmem_free(vart, sizeof (*vart)); + } + kmem_free(buf, buflen); + +disable_tail: + uint64_t *num_tiles = kmem_zalloc(sizeof (*num_tiles) * + vd->vdev_children, KM_SLEEP); + rw_enter(&va->vd_lock, RW_READER); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_anyraid_node_t *van = va->vd_children[c]; + if (va->vd_contracting_leaf == c) { + num_tiles[c] = 0; + continue; + } + num_tiles[c] = van->van_capacity; + } + uint64_t updated_asize = calculate_asize(vd, num_tiles); + rw_exit(&va->vd_lock); + kmem_free(num_tiles, vd->vdev_children * sizeof (*num_tiles)); + var->var_nonalloc = vd->vdev_asize - updated_asize; + vdev_update_nonallocating_space(vd, var->var_nonalloc, B_TRUE); + if (va->vd_contracting_leaf != -1) { + uint64_t start = MIN(vd->vdev_ms_count, + updated_asize >> vd->vdev_ms_shift); + uint64_t end = vd->vdev_ms_count; + for (uint64_t m = start; m < end; m++) { + metaslab_t *ms = vd->vdev_ms[m]; + metaslab_disable_nowait(ms); + } + } + return (0); +} + +static void +anyraid_relocate_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vdev_anyraid_relocate_t *var = spa->spa_anyraid_relocate; + + /* + * Ensure there are no i/os to the range that is being committed. + */ + uint64_t old_offset = var->var_synced_offset; + uint64_t old_task = var->var_synced_task; + + ASSERT3U(var->var_task_pertxg[txgoff], >=, old_task); + ASSERT(var->var_task_pertxg[txgoff] > old_task || + var->var_offset_pertxg[txgoff] >= old_offset); + + mutex_enter(&var->var_lock); + uint64_t new_offset = + MIN(var->var_offset_pertxg[txgoff], var->var_failed_offset); + uint64_t new_task = + MIN(var->var_task_pertxg[txgoff], var->var_failed_task); + /* + * We should not have committed anything that failed. 
+ */ + mutex_exit(&var->var_lock); + + vdev_t *vd = vdev_lookup_top(spa, var->var_vd); + vdev_anyraid_t *va = vd->vdev_tsd; + + zfs_locked_range_t *lr = zfs_rangelock_enter(&va->vd_rangelock, + old_offset, new_offset - old_offset, + RL_WRITER); + + var->var_synced_offset = new_offset; + var->var_synced_task = new_task; + var->var_offset_pertxg[txgoff] = 0; + var->var_task_pertxg[txgoff] = 0; + zfs_rangelock_exit(lr); + + mutex_enter(&var->var_lock); + var->var_bytes_copied += var->var_bytes_copied_pertxg[txgoff]; + var->var_bytes_copied_pertxg[txgoff] = 0; + + tasklist_write(spa, var, tx); + mutex_exit(&var->var_lock); +} + +static void +anyraid_scrub_done(spa_t *spa, dmu_tx_t *tx, void *arg) +{ + struct anyraid_done_arg *ada = arg; + vdev_anyraid_t *va = ada->vd->vdev_tsd; + vdev_anyraid_relocate_t *var = &va->vd_relocate; + rw_enter(&va->vd_lock, RW_WRITER); + boolean_t noop = (list_head(&var->var_done_list) == NULL); + for (vdev_anyraid_relocate_task_t *task = + list_head(&var->var_done_list); task; + task = list_head(&var->var_done_list)) { + anyraid_freelist_add( + &va->vd_children[task->vart_source_disk]->van_freelist, + task->vart_source_idx); + list_remove(&var->var_done_list, task); + kmem_free(task, sizeof (*task)); + } + /* + * Usually there aren't any tasks left in the list, but this can happen + * if we finish our relocate in just the right way, and then export the + * pool and reimport during the scrub. 
+ */ + for (vdev_anyraid_relocate_task_t *task = + list_head(&var->var_list); task; + task = list_head(&var->var_list)) { + anyraid_freelist_add( + &va->vd_children[task->vart_source_disk]->van_freelist, + task->vart_source_idx); + list_remove(&var->var_list, task); + kmem_free(task, sizeof (*task)); + } + + objset_t *mos = spa->spa_meta_objset; + + uint64_t object; + int res = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_RELOCATE_OBJ, sizeof (uint64_t), 1, &object); + if (res == 0) { + ASSERT3U(object, ==, var->var_object); + VERIFY0(dmu_object_free(mos, var->var_object, tx)); + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_RELOCATE_OBJ, tx)); + } else { + ASSERT(noop); + } + + boolean_t contracting = va->vd_contracting_leaf != -1; + if (!contracting) { + vdev_update_nonallocating_space(ada->vd, var->var_nonalloc, + B_FALSE); + } else { + spa_async_request(spa, SPA_ASYNC_CONTRACTION_DONE); + } + + va->vd_relocate.var_state = ARS_CONTRACTING; + rw_exit(&va->vd_lock); + + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER); + if (!contracting) { + ada->vd->vdev_expanding = B_TRUE; + vdev_reopen(ada->vd); + } + spa->spa_ccw_fail_time = 0; + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + vdev_config_dirty(ada->vd); + kmem_free(ada, sizeof (*ada)); +} + +static void +anyraid_relocate_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + vdev_anyraid_relocate_t *var = spa->spa_anyraid_relocate; + vdev_t *vd = vdev_lookup_top(spa, var->var_vd); + vdev_anyraid_t *va = vd->vdev_tsd; + + for (int i = 0; i < TXG_SIZE; i++) { + VERIFY0(var->var_offset_pertxg[i]); + } + + rw_enter(&va->vd_lock, RW_WRITER); + /* + * will get written (based on vd_expand_txgs). 
TODO + */ + vdev_config_dirty(vd); + + var->var_end_time = gethrestime_sec(); + + spa_history_log_internal(spa, "anyraid relocate completed", tx, + "%s vdev %llu", spa_name(spa), + (unsigned long long)vd->vdev_id); + + rw_exit(&va->vd_lock); + + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + spa_notify_waiters(spa); + + var->var_state = ARS_SCRUBBING; + /* + * While we're in syncing context take the opportunity to + * setup a scrub. All the data has been successfully copied + * but we have not validated any checksums. + */ + struct anyraid_done_arg *ada = kmem_alloc(sizeof (*ada), KM_SLEEP); + ada->vd = vd; + setup_sync_arg_t setup_sync_arg = { + .func = POOL_SCAN_SCRUB, + .txgstart = 0, + .txgend = 0, + .done = anyraid_scrub_done, + .done_arg = ada, + }; + if (zfs_scrub_after_relocate && + dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0 && + list_head(&var->var_done_list) != NULL) { + dsl_scan_setup_sync(&setup_sync_arg, tx); + } else { + anyraid_scrub_done(spa, tx, ada); + } +} + +dsl_scan_done_func_t * +anyraid_setup_scan_done(spa_t *spa, uint64_t vd_id, void **arg) +{ + struct anyraid_done_arg *ada = kmem_alloc(sizeof (*ada), KM_SLEEP); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + ada->vd = vdev_lookup_top(spa, vd_id); + spa_config_exit(spa, SCL_STATE, FTAG); + *arg = ada; + return (anyraid_scrub_done); +} + + +struct rebal_node { + avl_node_t node1; + avl_node_t node2; + int cvd; + int free; // number of free tiles + int alloc; // number of allocated tiles + int64_t *arr; +}; + +static int +rebal_cmp_free(const void *a, const void *b) +{ + const struct rebal_node *ra = a; + const struct rebal_node *rb = b; + int cmp = TREE_CMP(ra->free, rb->free); + if (likely(cmp != 0)) + return (cmp); + return (TREE_CMP(rb->cvd, ra->cvd)); +} + +static int +rebal_cmp_alloc(const void *a, const void *b) +{ + const struct rebal_node *ra = a; + const 
struct rebal_node *rb = b; + int cmp = TREE_CMP(ra->alloc, rb->alloc); + if (likely(cmp != 0)) + return (cmp); + return (TREE_CMP(rb->cvd, ra->cvd)); +} + +static void +populate_child_array(vdev_anyraid_t *va, int child, int64_t *arr, uint32_t cap) +{ + for (anyraid_tile_t *tile = avl_first(&va->vd_tile_map); + tile; tile = AVL_NEXT(&va->vd_tile_map, tile)) { + for (anyraid_tile_node_t *atn = list_head(&tile->at_list); + atn; atn = list_next(&tile->at_list, atn)) { + if (atn->atn_disk == child) { + ASSERT3U(atn->atn_tile_idx, <, cap); + arr[atn->atn_tile_idx] = tile->at_tile_id; + } + } + } +} + +static void +create_reloc_task(vdev_anyraid_t *va, struct rebal_node *donor, uint16_t offset, + struct rebal_node *receiver, uint32_t *tid) +{ + vdev_anyraid_node_t *rvan = va->vd_children[receiver->cvd]; + vdev_anyraid_relocate_task_t *task = + kmem_zalloc(sizeof (*task), KM_SLEEP); + task->vart_source_disk = (uint8_t)donor->cvd; + task->vart_dest_disk = (uint8_t)receiver->cvd; + task->vart_source_idx = offset; + ASSERT(rvan->van_capacity - + anyraid_freelist_alloc(&rvan->van_freelist)); + task->vart_dest_idx = anyraid_freelist_pop( + &rvan->van_freelist); + task->vart_tile = donor->arr[offset]; + task->vart_task = (*tid)++; + list_insert_tail(&va->vd_relocate.var_list, task); + receiver->arr[task->vart_dest_idx] = donor->arr[offset]; + donor->arr[offset] = -1LL; +} + +static boolean_t +reloc_try_move_one(vdev_anyraid_t *va, struct rebal_node *donor, + uint16_t offset, struct rebal_node *receiver, uint32_t *tid) +{ + vdev_anyraid_node_t *rvan = va->vd_children[receiver->cvd]; + + boolean_t found = B_FALSE; + for (int j = 0; j < rvan->van_freelist.af_next_off; + j++) { + /* + * cause the total number of allocatable tiles to drop; + * if so, we have to skip it. 
+ */ + if (donor->arr[offset] == receiver->arr[j]) { + found = B_TRUE; + break; + } + } + if (found) + return (B_FALSE); + + create_reloc_task(va, donor, offset, receiver, tid); + return (B_TRUE); +} + +static boolean_t +rebal_try_move(vdev_anyraid_t *va, struct rebal_node *donor, + struct rebal_node *receiver, uint32_t *tid) +{ + vdev_anyraid_node_t *dvan = va->vd_children[donor->cvd]; + + for (int i = 0; i < dvan->van_freelist.af_next_off; i++) { + ASSERT3U(dvan->van_freelist.af_next_off, <=, + dvan->van_capacity); + if (donor->arr[i] == -1LL) + continue; + if (reloc_try_move_one(va, donor, i, receiver, tid)) + return (B_TRUE); + } + return (B_FALSE); +} + +void +vdev_anyraid_setup_rebalance(vdev_t *vd, dmu_tx_t *tx) +{ + (void) tx; + ASSERT(vdev_is_anyraid(vd)); + vdev_anyraid_t *va = vd->vdev_tsd; + + vdev_config_dirty(vd); + + vdev_anyraid_relocate_t *var = &va->vd_relocate; + var->var_start_time = gethrestime_sec(); + var->var_state = ARS_SCANNING; + var->var_vd = vd->vdev_id; + var->var_failed_offset = var->var_failed_task = UINT64_MAX; + var->var_offset = 0; + + mutex_enter(&var->var_lock); + vd->vdev_spa->spa_anyraid_relocate = var; + + rw_enter(&va->vd_lock, RW_WRITER); + avl_tree_t ft; + avl_create(&ft, rebal_cmp_free, sizeof (struct rebal_node), + offsetof(struct rebal_node, node1)); + avl_tree_t at; + avl_create(&at, rebal_cmp_alloc, sizeof (struct rebal_node), + offsetof(struct rebal_node, node2)); + + uint64_t *num_tiles = kmem_zalloc(vd->vdev_children * + sizeof (*num_tiles), KM_SLEEP); + for (int c = 0; c < vd->vdev_children; c++) + num_tiles[c] = va->vd_children[c]->van_capacity; + + for (int i = 0; i < vd->vdev_children; i++) { + struct rebal_node *rn = kmem_zalloc(sizeof (*rn), KM_SLEEP); + rn->cvd = i; + vdev_anyraid_node_t *n = va->vd_children[i]; + uint32_t cap = n->van_capacity; + rn->alloc = anyraid_freelist_alloc(&n->van_freelist); + rn->free = cap - rn->alloc; + rn->arr = kmem_alloc(sizeof (*rn->arr) * cap, KM_SLEEP); + memset(rn->arr, 
-1, sizeof (*rn->arr) * cap); + populate_child_array(va, i, rn->arr, cap); + avl_add(&ft, rn); + avl_add(&at, rn); + } + uint32_t tid = 0; + for (;;) { + struct rebal_node *donor = avl_last(&at); + boolean_t moved = B_FALSE; + while (donor && donor->alloc > 0) { + struct rebal_node *prev_donor = AVL_PREV(&at, donor); + struct rebal_node *receiver = avl_last(&ft); + while (receiver && receiver->free > 0) { + struct rebal_node *prev_rec = + AVL_PREV(&ft, receiver); + if (receiver->free <= donor->free + 1) + break; + moved = rebal_try_move(va, + donor, receiver, &tid); + if (!moved) { + receiver = prev_rec; + continue; + } + avl_remove(&ft, receiver); + avl_remove(&at, receiver); + receiver->free--; + receiver->alloc++; + avl_add(&ft, receiver); + avl_add(&at, receiver); + num_tiles[receiver->cvd]--; + break; + } + if (moved) + break; + donor = prev_donor; + } + if (donor == NULL || donor->alloc == 0) + break; + } + + /* + * It's already balanced; clean up the state and report success + * immediately. 
+ */ + if (tid == 0) { + rw_exit(&va->vd_lock); + kmem_free(num_tiles, vd->vdev_children * sizeof (*num_tiles)); + + struct rebal_node *node; + void *cookie = NULL; + while ((node = avl_destroy_nodes(&ft, &cookie)) != NULL) + ; + avl_destroy(&ft); + cookie = NULL; + while ((node = avl_destroy_nodes(&at, &cookie)) != NULL) { + kmem_free(node->arr, sizeof (*node->arr) * + (node->free + node->alloc)); + kmem_free(node, sizeof (*node)); + } + avl_destroy(&at); + + var->var_nonalloc = 0; + var->var_state = ARS_FINISHED; + mutex_exit(&var->var_lock); + anyraid_relocate_complete_sync(vd->vdev_spa, tx); + return; + } + + uint64_t updated_asize = calculate_asize(vd, num_tiles); + rw_exit(&va->vd_lock); + kmem_free(num_tiles, vd->vdev_children * sizeof (*num_tiles)); + ASSERT3U(vd->vdev_asize, >=, updated_asize); + var->var_nonalloc = vd->vdev_asize - updated_asize; + vdev_update_nonallocating_space(vd, var->var_nonalloc, B_TRUE); + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + var->var_object = dmu_object_alloc(mos, DMU_OTN_UINT32_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (relocate_phys_t), tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_RELOCATE_OBJ, + sizeof (uint64_t), 1, &var->var_object, tx)); + + tasklist_write(vd->vdev_spa, var, tx); + mutex_exit(&var->var_lock); + + struct rebal_node *node; + void *cookie = NULL; + while ((node = avl_destroy_nodes(&ft, &cookie)) != NULL) + ; + avl_destroy(&ft); + cookie = NULL; + while ((node = avl_destroy_nodes(&at, &cookie)) != NULL) { + kmem_free(node->arr, sizeof (*node->arr) * + (node->free + node->alloc)); + kmem_free(node, sizeof (*node)); + } + avl_destroy(&at); + zthr_wakeup(vd->vdev_spa->spa_anyraid_relocate_zthr); +} + +static boolean_t +spa_anyraid_relocate_thread_check(void *arg, zthr_t *zthr) +{ + (void) zthr; + spa_t *spa = arg; + vdev_anyraid_relocate_t *var = spa->spa_anyraid_relocate; + + return (var != NULL && var->var_state != ARS_SCRUBBING && + 
!var->var_waiting_for_resilver); +} + +/* + * Write of the new location on one child is done. Once all of them are done + * we can unlock and free everything. + */ +static void +anyraid_relocate_write_done(zio_t *zio) +{ + anyraid_move_arg_t *ama = zio->io_private; + vdev_anyraid_relocate_t *var = ama->ama_var; + + abd_free(zio->io_abd); + + mutex_enter(&var->var_lock); + if (zio->io_error != 0) { + /* Force a relocate pause on errors */ + var->var_failed_offset = + MIN(var->var_failed_offset, ama->ama_lr->lr_offset); + var->var_failed_task = MIN(var->var_failed_task, ama->ama_tid); + } + ASSERT3U(var->var_outstanding_bytes, >=, zio->io_size); + var->var_outstanding_bytes -= zio->io_size; + if (ama->ama_lr->lr_offset + ama->ama_lr->lr_length < + var->var_failed_offset) { + var->var_bytes_copied_pertxg[ama->ama_txg & TXG_MASK] += + zio->io_size; + } + cv_signal(&var->var_cv); + mutex_exit(&var->var_lock); + + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); + zfs_rangelock_exit(ama->ama_lr); + kmem_free(ama, sizeof (*ama)); +} + +/* + * Read of the old location on one child is done. Once all of them are done + * writes should have all the data and we can issue them. + */ +static void +anyraid_relocate_read_done(zio_t *zio) +{ + anyraid_move_arg_t *ama = zio->io_private; + vdev_anyraid_relocate_t *var = ama->ama_var; + + /* + * If the read failed, or if it was done on a vdev that is not fully + * healthy (e.g. a child that has a resilver in progress), we may not + * have the correct data. Note that it's OK if the write proceeds. + * It may write garbage but the location is otherwise unused and we + * will retry later due to var_failed_offset. 
+ */ + if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { + zfs_dbgmsg("relocate read failed off=%llu size=%llu txg=%llu " + "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", + (long long)ama->ama_lr->lr_offset, + (long long)ama->ama_lr->lr_length, + (long long)ama->ama_txg, + zio->io_error, + vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), + vdev_dtl_empty(zio->io_vd, DTL_MISSING)); + mutex_enter(&var->var_lock); + /* Force a relocate pause on errors */ + var->var_failed_offset = + MIN(var->var_failed_offset, ama->ama_lr->lr_offset); + mutex_exit(&var->var_lock); + } + zio_nowait(ama->ama_zio); +} + +static void +anyraid_relocate_record_progress(vdev_anyraid_relocate_t *var, + uint64_t offset, uint64_t task, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (offset == 0) + return; + + mutex_enter(&var->var_lock); + var->var_offset = offset; + mutex_exit(&var->var_lock); + + if (var->var_offset_pertxg[txgoff] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), anyraid_relocate_sync, + spa, tx); + } + var->var_offset_pertxg[txgoff] = offset; + var->var_task_pertxg[txgoff] = task; +} + +static boolean_t +anyraid_relocate_impl(vdev_t *vd, vdev_anyraid_relocate_t *var, + zfs_range_tree_t *rt, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + uint_t ashift = vd->vdev_top->vdev_ashift; + vdev_anyraid_relocate_task_t *vart = list_head(&var->var_list); + vdev_anyraid_t *va = vd->vdev_tsd; + + zfs_range_seg_t *rs = zfs_range_tree_first(rt); + ASSERT(rs); + uint64_t offset = zfs_rs_get_start(rs, rt); + ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + uint64_t size = zfs_rs_get_end(rs, rt) - offset; + ASSERT3U(size, >=, 1 << ashift); + ASSERT(IS_P2ALIGNED(size, 1 << ashift)); + + size = MIN(size, anyraid_relocate_max_move_bytes); + size = MAX(size, 1 << ashift); + + zfs_range_tree_remove(rt, offset, size); + + anyraid_move_arg_t *ama = kmem_zalloc(sizeof (*ama), KM_SLEEP); + ama->ama_var = var; + ama->ama_lr = 
zfs_rangelock_enter(&va->vd_rangelock, + offset, size, RL_WRITER); + ama->ama_txg = dmu_tx_get_txg(tx); + ama->ama_size = size; + ama->ama_tid = vart->vart_task; + + anyraid_relocate_record_progress(var, offset + size, vart->vart_task, + tx); + + /* + * SCL_STATE will be released when the read and write are done, + * by anyraid_relocate_write_done(). + */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + mutex_enter(&var->var_lock); + var->var_outstanding_bytes += size; + mutex_exit(&var->var_lock); + + /* Allocate ABD and ZIO for each child we write. */ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(size, B_FALSE); + vdev_t *source_vd = vd->vdev_child[vart->vart_source_disk]; + vdev_t *dest_vd = vd->vdev_child[vart->vart_dest_disk]; + uint64_t source_header = + VDEV_ANYRAID_START_OFFSET(source_vd->vdev_ashift); + uint64_t dest_header = + VDEV_ANYRAID_START_OFFSET(dest_vd->vdev_ashift); + uint64_t dest_off = dest_header + + vart->vart_dest_idx * va->vd_tile_size + + ((offset - source_header) % va->vd_tile_size); + ama->ama_zio = zio_vdev_child_io(pio, NULL, + dest_vd, dest_off, abd, size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, anyraid_relocate_write_done, ama); + + zio_nowait(zio_vdev_child_io(pio, NULL, + vd->vdev_child[vart->vart_source_disk], + offset, abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, anyraid_relocate_read_done, ama)); + return (zfs_range_tree_numsegs(rt) == 0); +} + +struct physify_arg { + zfs_range_tree_t *rt; + vdev_t *vd; +}; + +static void +anyraid_rt_physify(void *arg, uint64_t start, uint64_t size) +{ + struct physify_arg *pa = (struct physify_arg *)arg; + zfs_range_tree_t *rt = pa->rt; + vdev_t *vd = pa->vd; + ASSERT3U(size, >, 0); + + zfs_range_seg64_t logical, physical, remain; + logical.rs_start = start; + logical.rs_end = start + size; + vdev_xlate(vd, &logical, &physical, &remain); + ASSERT3U(remain.rs_end, ==, 
remain.rs_start); + /* + * This can happen if the tile has actually already been moved, + * but the synced state hasn't caught up. + */ + if (physical.rs_end == physical.rs_start) + return; + ASSERT(physical.rs_end - physical.rs_start); + zfs_range_tree_add(rt, physical.rs_start, + physical.rs_end - physical.rs_start); +} + +static vdev_t * +process_one_metaslab(spa_t *spa, metaslab_t *msp, vdev_t *pvd, + vdev_anyraid_t *va, vdev_anyraid_relocate_task_t *vart, zthr_t *zthr) +{ + vdev_anyraid_relocate_t *var = spa->spa_anyraid_relocate; + vdev_t *source_vd = pvd->vdev_child[vart->vart_source_disk]; + metaslab_disable_nowait(msp); + mutex_enter(&msp->ms_lock); + + /* + * The metaslab may be newly created (for the expanded space), in which + * case its trees won't exist yet, so we need to bail out early. + */ + if (msp->ms_new) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + if (vart->vart_dis_ms > 0) { + vart->vart_dis_ms--; + metaslab_enable(msp, B_FALSE, B_FALSE); + } + return (pvd); + } + + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) space. + * Note that there may be a little bit more free space (e.g. in + * ms_defer), and it's fine to copy that too. + */ + uint64_t shift, start; + zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(pvd, + msp, &start, &shift); + zfs_range_tree_t *rt = zfs_range_tree_create_flags(NULL, type, NULL, + start, shift, ZFS_RT_F_DYN_NAME, metaslab_rt_name(msp->ms_group, + msp, "spa_anyraid_relocate_thread:rt")); + zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); + zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /* + * Now we need to convert the logical offsets of the metaslab into the + * physical offsets on disk. We also skip any extents that don't map to + * to the source tile. 
+ */ + zfs_range_tree_t *phys = zfs_range_tree_create_flags(NULL, + ZFS_RANGE_SEG64, NULL, 0, pvd->vdev_ashift, ZFS_RT_F_DYN_NAME, + metaslab_rt_name(msp->ms_group, msp, + "spa_anyraid_relocate_thread2:rt")); + struct physify_arg pa; + pa.rt = phys; + pa.vd = source_vd; + zfs_range_tree_walk(rt, anyraid_rt_physify, &pa); + zfs_range_tree_vacate(rt, NULL, NULL); + zfs_range_tree_destroy(rt); + + /* + * When we are resuming from a paused relocate (i.e. + * when importing a pool with a relocate in progress), + * discard any state that we have already processed. + */ + if (vart->vart_task <= var->var_task) { + uint64_t end = vart->vart_task == var->var_task ? + var->var_offset : P2ALIGN_TYPED(UINT64_MAX, + (1 << pvd->vdev_ashift), uint64_t); + zfs_range_tree_clear(phys, 0, end); + } + + while (!zthr_iscancelled(zthr) && !zfs_range_tree_is_empty(phys) && + var->var_failed_offset == UINT64_MAX) { + /* + * We need to periodically drop the config lock so that writers + * can get in. Additionally, we can't wait for a txg to sync + * while holding a config lock (since a waiting writer could + * cause a 3-way deadlock with the sync thread, which also gets + * a config lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + rw_exit(&va->vd_lock); + + /* + * If requested, pause the reflow when the amount specified by + * anyraid_relocate_max_bytes_pause is reached. + * + * This pause is only used during testing or debugging. 
+ */ + while (anyraid_relocate_max_bytes_pause != 0 && + anyraid_relocate_max_bytes_pause <= var->var_bytes_copied && + !zthr_iscancelled(zthr)) { + delay(hz); + } + + mutex_enter(&var->var_lock); + while (var->var_outstanding_bytes > + anyraid_relocate_max_move_bytes) { + cv_wait(&var->var_cv, &var->var_lock); + } + mutex_exit(&var->var_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. Theoretically, the vdev_t + * that we're working on may have changed. + */ + rw_enter(&va->vd_lock, RW_READER); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + pvd = vdev_lookup_top(spa, var->var_vd); + + boolean_t needsync = anyraid_relocate_impl(pvd, var, phys, tx); + + dmu_tx_commit(tx); + + if (needsync) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + rw_exit(&va->vd_lock); + txg_wait_synced(spa->spa_dsl_pool, txg); + rw_enter(&va->vd_lock, RW_READER); + spa_config_enter(spa, SCL_CONFIG, FTAG, + RW_READER); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + metaslab_enable(msp, B_FALSE, B_FALSE); + if (vart->vart_dis_ms > 0) { + vart->vart_dis_ms--; + metaslab_enable(msp, B_FALSE, B_FALSE); + } + zfs_range_tree_vacate(phys, NULL, NULL); + zfs_range_tree_destroy(phys); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + return (vdev_lookup_top(spa, var->var_vd)); +} + +/* + * AnyRAID relocate background thread + */ +static void +spa_anyraid_relocate_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_anyraid_relocate_t *var = spa->spa_anyraid_relocate; + ASSERT(var); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *pvd = vdev_lookup_top(spa, var->var_vd); + vdev_anyraid_t *va = pvd->vdev_tsd; + + mutex_enter(&var->var_lock); + /* Iterate over all the tasks */ + for (vdev_anyraid_relocate_task_t *vart = + list_head(&var->var_list); + vart != NULL && !zthr_iscancelled(zthr); + 
vart = list_head(&var->var_list)) { + mutex_exit(&var->var_lock); + rw_enter(&va->vd_lock, RW_READER); + uint16_t ms_shift = pvd->vdev_ms_shift; + uint64_t start = (va->vd_width * vart->vart_tile * + va->vd_tile_size) >> ms_shift; + uint64_t starting_offset = var->var_offset; + uint64_t end = start + ((va->vd_width * va->vd_tile_size) >> + ms_shift); + for (uint64_t i = start; i < end && !zthr_iscancelled(zthr); + i++) { + pvd = process_one_metaslab(spa, pvd->vdev_ms[i], pvd, + va, vart, zthr); + } + + if (zthr_iscancelled(zthr) || + var->var_failed_offset != UINT64_MAX) { + rw_exit(&va->vd_lock); + mutex_enter(&var->var_lock); + break; + } + rw_exit(&va->vd_lock); + rw_enter(&va->vd_lock, RW_WRITER); + + anyraid_tile_t search; + search.at_tile_id = vart->vart_tile; + anyraid_tile_t *tile = avl_find(&va->vd_tile_map, &search, + NULL); + boolean_t found = B_FALSE; + int count = 0; + for (anyraid_tile_node_t *atn = list_head(&tile->at_list); atn; + atn = list_next(&tile->at_list, atn)) { + ASSERT(atn); + if (atn->atn_disk != vart->vart_source_disk) { + count++; + continue; + } + ASSERT3U(atn->atn_tile_idx, ==, vart->vart_source_idx); + atn->atn_disk = vart->vart_dest_disk; + atn->atn_tile_idx = vart->vart_dest_idx; + found = B_TRUE; + break; + } + IMPLY(!found, starting_offset >= end); + mutex_enter(&var->var_lock); + list_remove(&var->var_list, vart); + list_insert_tail(&var->var_done_list, vart); + rw_exit(&va->vd_lock); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&var->var_lock); + + /* + * The txg_wait_synced() here ensures that all relocate zio's have + * completed, and var_failed_offset has been set if necessary. It + * also ensures that the progress of the last anyraid_relocate_sync() + * is written to disk before anyraid_relocate_complete_sync() changes + * the in-memory var_state. vdev_anyraid_io_start() uses var_state to + * determine if a relocate is in progress, in which case we may need to + * write to both old and new locations. 
Therefore we can only change + * var_state once this is not necessary, which is once the on-disk + * progress (in spa_ubsync) has been set past any possible writes (to + * the end of the last metaslab). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!zthr_iscancelled(zthr) && list_head(&var->var_list) == NULL) { + /* + * We are not being canceled or paused, so the reflow must be + * complete. In that case also mark it as completed on disk. + */ + ASSERT3U(var->var_failed_offset, ==, UINT64_MAX); + ASSERT(spa->spa_anyraid_relocate); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + anyraid_relocate_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + } else { + /* + * Wait for all copy zio's to complete and for all the + * raidz_reflow_sync() synctasks to be run. + */ + spa_history_log_internal(spa, "relocate pause", + NULL, "offset=%llu failed_offset=%lld/%lld", + (long long)var->var_offset, + (long long)var->var_failed_task, + (long long)var->var_failed_offset); + if (var->var_failed_offset != UINT64_MAX) { + /* + * Reset progress so that we will retry everything + * after the point that something failed. 
+ */ + var->var_offset = var->var_failed_offset; + var->var_task = var->var_failed_task; + var->var_failed_offset = UINT64_MAX; + var->var_failed_task = UINT64_MAX; + var->var_waiting_for_resilver = B_TRUE; + } + } +} + +void +spa_start_anyraid_relocate_thread(spa_t *spa) +{ + ASSERT0P(spa->spa_anyraid_relocate_zthr); + spa->spa_anyraid_relocate_zthr = zthr_create("anyraid_relocate", + spa_anyraid_relocate_thread_check, spa_anyraid_relocate_thread, + spa, defclsyspri); +} + +static boolean_t +vdev_anyraid_expand_child_replacing(vdev_t *anyraid_vd) +{ + for (int i = 0; i < anyraid_vd->vdev_children; i++) { + /* Quick check if a child is being replaced */ + if (!anyraid_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) + return (B_TRUE); + } + return (B_FALSE); +} + +void +anyraid_dtl_reassessed(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + if (spa->spa_anyraid_relocate != NULL) { + vdev_anyraid_relocate_t *var = spa->spa_anyraid_relocate; + /* + * we get called often from vdev_dtl_reassess() so make + * sure it's our vdev and any replacing is complete + */ + if (vd->vdev_top->vdev_id == var->var_vd && + !vdev_anyraid_expand_child_replacing(vd->vdev_top)) { + mutex_enter(&var->var_lock); + if (var->var_waiting_for_resilver) { + vdev_dbgmsg(vd, "DTL reassessed, " + "continuing anyraid relocate"); + var->var_waiting_for_resilver = B_FALSE; + zthr_wakeup(spa->spa_anyraid_relocate_zthr); + } + mutex_exit(&var->var_lock); + } + } +} + +int +spa_anyraid_relocate_get_stats(spa_t *spa, pool_anyraid_relocate_stat_t *pars) +{ + vdev_anyraid_relocate_t *var = spa->spa_anyraid_relocate; + + if (var == NULL) { + /* no removal in progress; find most recent completed */ + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + if (vdev_is_anyraid(vd)) { + vdev_anyraid_t *va = vd->vdev_tsd; + + if (va->vd_relocate.var_end_time != 0 && + (var == NULL || + va->vd_relocate.var_end_time > + var->var_end_time)) { + var = 
&va->vd_relocate; + } + } + } + } + + if (var == NULL) + return (SET_ERROR(ENOENT)); + + pars->pars_state = var->var_state; + pars->pars_relocating_vdev = var->var_vd; + + vdev_t *vd = vdev_lookup_top(spa, var->var_vd); + pars->pars_to_move = vd->vdev_stat.vs_alloc; + + mutex_enter(&var->var_lock); + pars->pars_moved = var->var_bytes_copied; + for (int i = 0; i < TXG_SIZE; i++) + pars->pars_moved += var->var_bytes_copied_pertxg[i]; + mutex_exit(&var->var_lock); + + pars->pars_start_time = var->var_start_time; + pars->pars_end_time = var->var_end_time; + pars->pars_waiting_for_resilver = var->var_waiting_for_resilver; + + return (0); +} + +/* + * ========================================================================== + * CONTRACTION-SPECIFIC LOGIC + * ========================================================================== + */ + +static int +vdev_anyraid_check_contract_fast(vdev_t *tvd, vdev_t *lvd) +{ + vdev_anyraid_t *va = tvd->vdev_tsd; + rw_enter(&va->vd_lock, RW_READER); + const anyraid_freelist_t *af = + &va->vd_children[lvd->vdev_id]->van_freelist; + uint16_t alloced = anyraid_freelist_alloc(af); + uint32_t free = 0; + for (int i = 0; i < tvd->vdev_children; i++) { + if (i == lvd->vdev_id) + continue; + vdev_anyraid_node_t *van = va->vd_children[i]; + free += van->van_capacity - + anyraid_freelist_alloc(&van->van_freelist); + } + rw_exit(&va->vd_lock); + return (free >= alloced ? 
0 : ENOSPC); +} + +int +vdev_anyraid_check_contract(vdev_t *tvd, vdev_t *lvd, dmu_tx_t *tx) +{ + vdev_anyraid_t *va = tvd->vdev_tsd; + int error = 0; + spa_t *spa = tvd->vdev_spa; + if (spa_has_checkpoint(spa)) + return (SET_ERROR(EBUSY)); + if (spa->spa_anyraid_relocate != NULL) + return (SET_ERROR(EALREADY)); + if (tvd->vdev_children == va->vd_width) + return (SET_ERROR(ENODEV)); + + if (!dmu_tx_is_syncing(tx)) + return (vdev_anyraid_check_contract_fast(tvd, lvd)); + + vdev_anyraid_relocate_t *var = &va->vd_relocate; + var->var_start_time = gethrestime_sec(); + var->var_state = ARS_SCANNING; + var->var_vd = tvd->vdev_id; + var->var_failed_offset = var->var_failed_task = UINT64_MAX; + ASSERT3S(va->vd_contracting_leaf, ==, -1); + va->vd_contracting_leaf = lvd->vdev_id; + var->var_offset = 0; + + /* + * This is unlocked in the setup function, since we need the state to + * remain consistent between the two. + */ + mutex_enter(&var->var_lock); + tvd->vdev_spa->spa_anyraid_relocate = var; + + rw_enter(&va->vd_lock, RW_WRITER); + + /* + * Step 1: Calculate a movement plan that would empty the selected leaf + * vdev of tiles + */ + avl_tree_t ft; + avl_create(&ft, rebal_cmp_free, sizeof (struct rebal_node), + offsetof(struct rebal_node, node1)); + + uint64_t *num_tiles = kmem_zalloc(tvd->vdev_children * + sizeof (*num_tiles), KM_SLEEP); + for (int c = 0; c < tvd->vdev_children; c++) + num_tiles[c] = (va->vd_children[c]->van_capacity); + + num_tiles[lvd->vdev_id] = 0; + + struct rebal_node *donor = NULL; + for (int i = 0; i < tvd->vdev_children; i++) { + struct rebal_node *rn = kmem_zalloc(sizeof (*rn), KM_SLEEP); + rn->cvd = i; + vdev_anyraid_node_t *n = va->vd_children[i]; + uint32_t cap = n->van_capacity; + rn->alloc = anyraid_freelist_alloc(&n->van_freelist); + rn->free = cap - rn->alloc; + rn->arr = kmem_alloc(sizeof (*rn->arr) * cap, KM_SLEEP); + memset(rn->arr, -1, sizeof (*rn->arr) * cap); + populate_child_array(va, i, rn->arr, cap); + avl_add(&ft, rn); + if (i 
== lvd->vdev_id) + donor = rn; + } + anyraid_freelist_t *af = &va->vd_children[lvd->vdev_id]->van_freelist; + uint32_t tid = 0; + for (uint16_t o = 0; o < af->af_next_off; o++) { + if (anyraid_freelist_isfree(af, o)) + continue; + boolean_t moved = B_FALSE; + struct rebal_node *receiver = avl_last(&ft); + while (receiver && receiver->free > 0) { + struct rebal_node *prev_rec = + AVL_PREV(&ft, receiver); + moved = reloc_try_move_one(va, + donor, o, receiver, &tid); + if (!moved) { + receiver = prev_rec; + continue; + } + avl_remove(&ft, receiver); + receiver->free--; + receiver->alloc++; + avl_add(&ft, receiver); + break; + } + if (!moved) { + /* + * We couldn't find anywhere to put this tile, we can't + * do contraction right now. It's possible that by + * redoing the plan generation we could make different + * choices earlier that would work; that feature is + * left for future implementation. + */ + error = SET_ERROR(EXFULL); + goto out; + } + } + + /* + * Step 2: Calculate the new asize of the proposed movement plan + */ + uint64_t updated_asize = calculate_asize(tvd, num_tiles); + + /* + * Step 3: Verify that all the current data can fit in the proposed + * movement plan + */ + anyraid_tile_t *at = avl_last(&va->vd_tile_map); + uint32_t highest_tile = at->at_tile_id; + if (updated_asize / va->vd_tile_size <= highest_tile) { + /* + * In this case we do have room to generate a full movement + * plan, but we end up with not enough tiles to actually back + * the whole space we would need to reach the highest-offset + * currently allocated block without having a hole in the vdev. + * + * This mostly should not happen, since we strongly prefer + * earlier metaslabs to ensure that tiles are allocated in + * ascending logical order. But we should have logic to handle + * it, just in case. 
+ */ + error = SET_ERROR(EDOM); + goto out; + } + + /* + * Step 4: Disable all the metaslabs that will become unusable + */ + for (uint64_t m = ((highest_tile + 1) * va->vd_tile_size) >> + tvd->vdev_ms_shift; m < tvd->vdev_ms_count; m++) { + metaslab_disable_nowait(tvd->vdev_ms[m]); + } + + va->vd_children[lvd->vdev_id]->van_capacity = 0; + /* + * At this point, the relocation plan has been generated and everything + * else involved in setup is fail-proof. We leave the rest of the + * process to happen in the _sync function, aside from some cleanup. + */ +out: + if (error != 0) { + vdev_anyraid_relocate_task_t *vart; + while ((vart = list_remove_head(&var->var_list))) { + vdev_anyraid_node_t *van = + va->vd_children[vart->vart_dest_disk]; + anyraid_freelist_add(&van->van_freelist, + vart->vart_dest_idx); + kmem_free(vart, sizeof (*vart)); + } + var->var_state = ARS_FINISHED; + tvd->vdev_spa->spa_anyraid_relocate = NULL; + va->vd_contracting_leaf = -1; + mutex_exit(&var->var_lock); + } + rw_exit(&va->vd_lock); + + kmem_free(num_tiles, tvd->vdev_children * sizeof (*num_tiles)); + + struct rebal_node *node; + void *cookie = NULL; + while ((node = avl_destroy_nodes(&ft, &cookie)) != NULL) { + kmem_free(node->arr, sizeof (*node->arr) * + (node->free + node->alloc)); + kmem_free(node, sizeof (*node)); + } + avl_destroy(&ft); + return (error); +} + +void +vdev_anyraid_setup_contract(vdev_t *tvd, dmu_tx_t *tx) +{ + vdev_anyraid_t *va = tvd->vdev_tsd; + vdev_anyraid_relocate_t *var = &va->vd_relocate; + ASSERT(MUTEX_HELD(&var->var_lock)); + spa_t *spa = tvd->vdev_spa; + if (list_head(&var->var_list) == NULL) { + mutex_exit(&var->var_lock); + anyraid_relocate_complete_sync(spa, tx); + return; + } + + objset_t *mos = spa->spa_meta_objset; + + var->var_object = dmu_object_alloc(mos, DMU_OTN_UINT32_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (relocate_phys_t), tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_RELOCATE_OBJ, + sizeof 
(uint64_t), 1, &var->var_object, tx)); + + tasklist_write(spa, var, tx); + mutex_exit(&var->var_lock); + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER); + vdev_reopen(tvd); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + zthr_wakeup(spa->spa_anyraid_relocate_zthr); +} + +void +vdev_anyraid_compact_children(vdev_t *vd) +{ + vdev_anyraid_t *va = vd->vdev_tsd; + vdev_anyraid_node_t **new_children = kmem_alloc( + sizeof (*new_children) * vd->vdev_children, KM_SLEEP); + int idx = 0; + for (int c = 0; c <= vd->vdev_children; c++) { + if (c == va->vd_contracting_leaf) + continue; + new_children[idx++] = va->vd_children[c]; + } + kmem_free(va->vd_children, sizeof (*va->vd_children) * + (vd->vdev_children + 1)); + va->vd_children = new_children; + + for (anyraid_tile_t *at = avl_first(&va->vd_tile_map); at; + at = AVL_NEXT(&va->vd_tile_map, at)) { + int count = 0; + for (anyraid_tile_node_t *atn = list_head(&at->at_list); + atn; atn = list_next(&at->at_list, atn)) { + ASSERT3U(atn->atn_disk, !=, va->vd_contracting_leaf); + if (atn->atn_disk > va->vd_contracting_leaf) + atn->atn_disk--; + count++; + } + } +} + +ZFS_MODULE_PARAM(zfs_anyraid, zfs_anyraid_, min_tile_size, U64, ZMOD_RW, + "Minimum tile size for anyraid"); + +ZFS_MODULE_PARAM(zfs_vdev, anyraid_, relocate_max_bytes_pause, ULONG, ZMOD_RW, + "For testing, pause AnyRAID relocate after moving this many bytes"); \ No newline at end of file diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 8588cfee3f7d..857d76413166 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1164,14 +1164,22 @@ vdev_draid_get_astart(vdev_t *vd, const uint64_t start) * 1 / (children - nspares) of its asize. 
*/ static uint64_t -vdev_draid_min_asize(vdev_t *vd) +vdev_draid_min_asize(vdev_t *pvd, vdev_t *cvd) { - vdev_draid_config_t *vdc = vd->vdev_tsd; + (void) cvd; + vdev_draid_config_t *vdc = pvd->vdev_tsd; - ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3P(pvd->vdev_ops, ==, &vdev_draid_ops); return (VDEV_DRAID_REFLOW_RESERVE + - (vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); + (pvd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); +} + +static uint64_t +vdev_draid_min_attach_size(vdev_t *vd) +{ + ASSERT3U(vd->vdev_top, ==, vd); + return (vdev_draid_min_asize(vd, vd->vdev_child[0])); } /* @@ -2343,6 +2351,7 @@ vdev_ops_t vdev_draid_ops = { .vdev_op_psize_to_asize = vdev_draid_psize_to_asize, .vdev_op_asize_to_psize = vdev_draid_asize_to_psize, .vdev_op_min_asize = vdev_draid_min_asize, + .vdev_op_min_attach_size = vdev_draid_min_attach_size, .vdev_op_min_alloc = vdev_draid_min_alloc, .vdev_op_io_start = vdev_draid_io_start, .vdev_op_io_done = vdev_draid_io_done, @@ -2835,6 +2844,7 @@ vdev_ops_t vdev_draid_spare_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_draid_spare_io_start, .vdev_op_io_done = vdev_draid_spare_io_done, diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index da8fc363762d..576bd3144cbc 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -320,6 +320,7 @@ vdev_ops_t vdev_file_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 7538f471e63c..5f2b55047149 
100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1870,6 +1870,7 @@ vdev_ops_t vdev_indirect_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 16ba09c6f295..0cf7a71ef56c 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -145,6 +145,7 @@ #include #include #include +#include #include #include #include @@ -436,6 +437,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, sizeof (pres) / sizeof (uint64_t)); } + + pool_anyraid_relocate_stat_t pars; + if (spa_anyraid_relocate_get_stats(spa, &pars) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_ANYRAID_RELOCATE_STATS, (uint64_t *)&pars, + sizeof (pars) / sizeof (uint64_t)); + } } static void @@ -1853,6 +1861,75 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) return (good_writes >= 1 ? 0 : EIO); } +/* + * Write the extra data of the specified vdev. 
+ */ +static void +vdev_extra_sync(zio_t *zio, uint64_t *good_writes, vdev_t *vd, int flags, + uint64_t txg, vdev_config_sync_status_t status) +{ + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_extra_sync(zio, good_writes, vd->vdev_child[c], flags, txg, + status); + } + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + if (vdev_is_anyraid(vd->vdev_parent)) { + vdev_anyraid_write_map_sync(vd, zio, txg, good_writes, flags, + status); + } +} + +/* Sync the extra data of all vdevs in svd[] */ +static int +vdev_extra_sync_list(vdev_t **svd, int svdcount, int flags, uint64_t txg, + vdev_config_sync_status_t status) +{ + spa_t *spa = svd[0]->vdev_spa; + zio_t *zio; + uint64_t good_writes = 0; + + boolean_t have_extra = B_FALSE; + + for (int i = 0; i < svdcount; i++) { + if (vdev_is_anyraid(svd[i])) { + have_extra = B_TRUE; + break; + } + } + if (!have_extra) + return (0); + + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) + vdev_extra_sync(zio, &good_writes, svd[v], flags, txg, status); + + (void) zio_wait(zio); + + /* + * Flush the extra data to disk. This ensures that the odd labels + * are no longer needed (because the new uberblocks and the even + * labels are safely on disk), so it is safe to overwrite them. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) { + if (vdev_writeable(svd[v])) { + zio_flush(zio, svd[v]); + } + } + + (void) zio_wait(zio); + + return (good_writes >= 1 ? 0 : EIO); +} + /* * On success, increment the count of good writes for our top-level vdev. */ @@ -2036,7 +2113,8 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) * at any time, you can just call it again, and it will resume its work. 
*/ int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, + vdev_config_sync_status_t status) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; @@ -2114,6 +2192,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) goto retry; } + if ((error = vdev_extra_sync_list(svd, svdcount, flags, txg, status) != + 0)) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_extra_sync_list() returned error %d " + "for pool '%s' when syncing out the extra data " + "of dirty vdevs", error, spa_name(spa)); + } + goto retry; + } + /* * Sync the uberblocks to all vdevs in svd[]. * If the system dies in the middle of this step, there are two cases diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 18efdaac006f..8aeff63e1dbe 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * Vdev mirror kstats @@ -99,31 +100,6 @@ vdev_mirror_stat_fini(void) } } -/* - * Virtual device vector for mirroring. 
- */ -typedef struct mirror_child { - vdev_t *mc_vd; - abd_t *mc_abd; - uint64_t mc_offset; - int mc_error; - int mc_load; - uint8_t mc_tried; - uint8_t mc_skipped; - uint8_t mc_speculative; - uint8_t mc_rebuilding; -} mirror_child_t; - -typedef struct mirror_map { - int *mm_preferred; - int mm_preferred_cnt; - int mm_children; - boolean_t mm_resilvering; - boolean_t mm_rebuilding; - boolean_t mm_root; - mirror_child_t mm_child[]; -} mirror_map_t; - static const int vdev_mirror_shift = 21; /* @@ -152,7 +128,7 @@ vdev_mirror_map_size(int children) sizeof (int) * children); } -static inline mirror_map_t * +mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) { mirror_map_t *mm; @@ -175,7 +151,7 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } -static const zio_vsd_ops_t vdev_mirror_vsd_ops = { +zio_vsd_ops_t vdev_mirror_vsd_ops = { .vsd_free = vdev_mirror_map_free, }; @@ -601,24 +577,12 @@ vdev_mirror_child_select(zio_t *zio) return (-1); } -static void -vdev_mirror_io_start(zio_t *zio) +void +vdev_mirror_io_start_impl(zio_t *zio, mirror_map_t *mm) { - mirror_map_t *mm; mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_init(zio); - zio->io_vsd = mm; - zio->io_vsd_ops = &vdev_mirror_vsd_ops; - - if (mm == NULL) { - ASSERT(!spa_trust_config(zio->io_spa)); - ASSERT(zio->io_type == ZIO_TYPE_READ); - zio_execute(zio); - return; - } - if (zio->io_type == ZIO_TYPE_READ) { if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { /* @@ -650,7 +614,6 @@ vdev_mirror_io_start(zio_t *zio) vdev_mirror_child_done, mc)); first = B_FALSE; } - zio_execute(zio); return; } /* @@ -690,6 +653,25 @@ vdev_mirror_io_start(zio_t *zio) zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); } +} + +static void +vdev_mirror_io_start(zio_t *zio) +{ + mirror_map_t *mm; + + mm = vdev_mirror_map_init(zio); + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + + if (mm == NULL) { + 
ASSERT(!spa_trust_config(zio->io_spa)); + ASSERT(zio->io_type == ZIO_TYPE_READ); + zio_execute(zio); + return; + } + + vdev_mirror_io_start_impl(zio, mm); zio_execute(zio); } @@ -708,7 +690,7 @@ vdev_mirror_worst_error(mirror_map_t *mm) return (error[0] ? error[0] : error[1]); } -static void +void vdev_mirror_io_done(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; @@ -975,6 +957,7 @@ vdev_ops_t vdev_mirror_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, @@ -1001,6 +984,7 @@ vdev_ops_t vdev_replacing_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, @@ -1027,6 +1011,7 @@ vdev_ops_t vdev_spare_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index c62faef2d05c..ac6866bdcec0 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -88,6 +88,7 @@ vdev_ops_t vdev_missing_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, @@ -114,6 +115,7 @@ vdev_ops_t 
vdev_hole_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 520ddd692bda..84b68b81ba10 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2332,10 +2332,18 @@ vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) * so each child must provide at least 1/Nth of its asize. */ static uint64_t -vdev_raidz_min_asize(vdev_t *vd) +vdev_raidz_min_asize(vdev_t *pvd, vdev_t *cvd) { - return ((vd->vdev_min_asize + vd->vdev_children - 1) / - vd->vdev_children); + (void) cvd; + return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / + pvd->vdev_children); +} + +static uint64_t +vdev_raidz_min_attach_size(vdev_t *vd) +{ + ASSERT3U(vd->vdev_top, ==, vd); + return (vdev_raidz_min_asize(vd, vd->vdev_child[0])); } /* @@ -2400,8 +2408,8 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size, - BP_GET_PHYSICAL_BIRTH(zio->io_bp)); + vdev_psize_to_asize_txg(zio->io_vd, + rr->rr_size, BP_GET_PHYSICAL_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; @@ -2638,6 +2646,22 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) } } +void +vdev_raidz_io_start_impl(zio_t *zio, raidz_map_t *rm, uint64_t logical_width, + uint64_t physical_width) +{ + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) + vdev_raidz_io_start_write(zio, rm->rm_row[i]); + + if (logical_width == physical_width) + raidz_start_skip_writes(zio); + } else { + 
ASSERT(zio->io_type == ZIO_TYPE_READ); + vdev_raidz_io_start_read(zio, rm); + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -2717,18 +2741,8 @@ vdev_raidz_io_start(zio_t *zio) zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; - if (zio->io_type == ZIO_TYPE_WRITE) { - for (int i = 0; i < rm->rm_nrows; i++) { - vdev_raidz_io_start_write(zio, rm->rm_row[i]); - } - - if (logical_width == vdrz->vd_physical_width) { - raidz_start_skip_writes(zio); - } - } else { - ASSERT(zio->io_type == ZIO_TYPE_READ); - vdev_raidz_io_start_read(zio, rm); - } + vdev_raidz_io_start_impl(zio, rm, logical_width, + vdrz->vd_physical_width); zio_execute(zio); } @@ -4171,6 +4185,8 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) .func = POOL_SCAN_SCRUB, .txgstart = 0, .txgend = 0, + .done = NULL, + .done_arg = NULL, }; if (zfs_scrub_after_expand && dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) { @@ -5450,6 +5466,7 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_psize_to_asize = vdev_raidz_psize_to_asize, .vdev_op_asize_to_psize = vdev_raidz_asize_to_psize, .vdev_op_min_asize = vdev_raidz_min_asize, + .vdev_op_min_attach_size = vdev_raidz_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 0e14d29d7fb2..44b36fe7da48 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -349,6 +349,8 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) .func = POOL_SCAN_SCRUB, .txgstart = 0, .txgend = 0, + .done = NULL, + .done_arg = NULL, }; if (dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0 && zfs_rebuild_scrub_enabled) { @@ -524,6 +526,7 @@ vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, { ASSERT(vd->vdev_ops == &vdev_draid_ops || vd->vdev_ops == &vdev_mirror_ops || + vdev_is_anyraid(vd) || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); diff --git 
a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 89911e55bda8..7eb6fe29e841 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -215,7 +215,7 @@ vdev_activate(vdev_t *vd) metaslab_group_activate(mg); metaslab_group_activate(vd->vdev_log_mg); - vdev_update_nonallocating_space(vd, B_FALSE); + vdev_update_nonallocating_space(vd, -1ULL, B_FALSE); vd->vdev_noalloc = B_FALSE; } @@ -287,7 +287,7 @@ vdev_passivate(vdev_t *vd, uint64_t *txg) return (error); } - vdev_update_nonallocating_space(vd, B_TRUE); + vdev_update_nonallocating_space(vd, -1ULL, B_TRUE); vd->vdev_noalloc = B_TRUE; return (0); @@ -1411,7 +1411,7 @@ vdev_remove_complete(spa_t *spa) (u_longlong_t)vd->vdev_id, (u_longlong_t)txg); /* the vdev is no longer part of the dspace */ - vdev_update_nonallocating_space(vd, B_FALSE); + vdev_update_nonallocating_space(vd, -1ULL, B_FALSE); /* * Discard allocation state. diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 21a81d6d25b9..55e059e1d8b0 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -150,6 +150,7 @@ vdev_ops_t vdev_root_ops = { .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_attach_size = vdev_default_min_attach_size, .vdev_op_min_alloc = NULL, .vdev_op_io_start = NULL, /* not applicable to the root */ .vdev_op_io_done = NULL, /* not applicable to the root */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 3bbc9107ae2e..a043b5c78334 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7346,6 +7346,74 @@ zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) return (ret); } +/* + * Rebalance tiles on the provided anyraid vdevs, or all anyraid vdevs if + * none are specified. 
+ * + * innvl: { + * "vdevs" (optional) -> raw uint64_t array of vdev guids + * } + * + * outnvl is unused + */ +static const zfs_ioc_key_t zfs_keys_pool_rebalance[] = { + {"vdevs", DATA_TYPE_UINT64_ARRAY, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_pool_rebalance(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) +{ + (void) outnvl; + int err; + spa_t *spa; + + if ((err = spa_open(pool, &spa, FTAG)) != 0) + return (err); + + uint64_t *vdevs; + uint_t count; + if (nvlist_lookup_uint64_array(innvl, "vdevs", &vdevs, &count) == 0) + err = spa_rebalance_vdevs(spa, vdevs, count); + else + err = spa_rebalance_all(spa); + spa_close(spa, FTAG); + + return (err); +} + +/* + * Contract the specified anyraid vdev by removing the specified leaf vdev, + * moving that leaf's tiles to other children. + * + * innvl: { + * "anyraid_vdev" -> guid of the anyraid vdev + * "leaf_vdev" -> guid of the leaf vdev + * } + * + * outnvl is unused + */ +static const zfs_ioc_key_t zfs_keys_pool_contract[] = { + {"anyraid_vdev", DATA_TYPE_UINT64, 0}, + {"leaf_vdev", DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_pool_contract(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) +{ + (void) outnvl; + spa_t *spa; + int err; + if ((err = spa_open(pool, &spa, FTAG)) != 0) + return (err); + + uint64_t anyraid_vdev = fnvlist_lookup_uint64(innvl, "anyraid_vdev"); + uint64_t leaf_vdev = fnvlist_lookup_uint64(innvl, "leaf_vdev"); + err = spa_contract_vdev(spa, anyraid_vdev, leaf_vdev); + spa_close(spa, FTAG); + + return (err); +} + static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void @@ -7652,6 +7720,16 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune)); + zfs_ioctl_register("zpool_rebalance", ZFS_IOC_POOL_REBALANCE, + zfs_ioc_pool_rebalance, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_rebalance, 
ARRAY_SIZE(zfs_keys_pool_rebalance)); + + zfs_ioctl_register("zpool_contract", ZFS_IOC_POOL_CONTRACT, + zfs_ioc_pool_contract, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_contract, ARRAY_SIZE(zfs_keys_pool_contract)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a48854563044..bd6ad84f413f 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4319,6 +4319,8 @@ zio_dva_allocate(zio_t *zio) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; + if (zio->io_flags & ZIO_FLAG_ZILWRITE) + flags |= METASLAB_ZIL; /* * If not already chosen, choose an appropriate allocation class. @@ -4917,6 +4919,12 @@ zio_vdev_io_assess(zio_t *zio) zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } + /* + * The only VDEV types that use this should have handled their aux data + * by now. 
+ */ + ASSERT3PF(NULL, ==, zio->io_aux_vsd, "%d %x", zio->io_error, + zio->io_pipeline_trace); /* * If a Direct I/O operation has a checksum verify error then this I/O @@ -5196,7 +5204,9 @@ zio_checksum_generate(zio_t *zio) if (checksum == ZIO_CHECKSUM_OFF) return (zio); - ASSERT(checksum == ZIO_CHECKSUM_LABEL); + ASSERTF(checksum == ZIO_CHECKSUM_LABEL || + checksum == ZIO_CHECKSUM_ANYRAID_MAP, + "checksum not label: %px %d", zio, checksum); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); @@ -5228,7 +5238,10 @@ zio_checksum_verify(zio_t *zio) if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); - ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); + ASSERTF(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL || + zio->io_prop.zp_checksum == ZIO_CHECKSUM_ANYRAID_MAP, + "checksum not label: %px %d", zio, + zio->io_prop.zp_checksum); } ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 1d0646a61185..1585744651af 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -206,6 +206,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}, + {{abd_checksum_sha256, abd_checksum_sha256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA, "anyraid_map"}, }; /* @@ -408,6 +410,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, abd_copy_from_buf_off(abd, &cksum, eck_offset + offsetof(zio_eck_t, zec_cksum), sizeof (zio_cksum_t)); + } else if (checksum == ZIO_CHECKSUM_ANYRAID_MAP) { + zio_eck_t *eck = (zio_eck_t *)(zio->io_private); + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], + &cksum); + eck->zec_cksum = cksum; + memcpy(&eck->zec_magic, &zec_magic, sizeof (zec_magic)); } else { saved = bp->blk_cksum; ci->ci_func[0](abd, 
size, spa->spa_cksum_tmpls[checksum], @@ -419,13 +427,14 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, } int -zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, - enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset, - zio_bad_cksum_t *info) +zio_checksum_error_impl(zio_t *zio, enum zio_checksum checksum, abd_t *abd, + uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum; zio_eck_t eck; + spa_t *spa = zio->io_spa; + const blkptr_t *bp = zio->io_bp; int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) @@ -433,8 +442,8 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, zio_checksum_template_init(checksum, spa); - IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED); - IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL); + IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL || + checksum == ZIO_CHECKSUM_ANYRAID_MAP); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_cksum_t verifier; @@ -498,6 +507,12 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } + } else if (checksum == ZIO_CHECKSUM_ANYRAID_MAP) { + eck = *(zio_eck_t *)(zio->io_private); + byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC)); + expected_cksum = eck.zec_cksum; + ci->ci_func[byteswap](abd, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; @@ -548,24 +563,24 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint64_t size = bp ? 
BP_GET_PSIZE(bp) : zio->io_size; uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; - spa_t *spa = zio->io_spa; if (bp && BP_IS_GANG(bp)) { - if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + if (spa_feature_is_active(zio->io_spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)) size = zio->io_size; else size = SPA_OLD_GANGBLOCKSIZE; } - error = zio_checksum_error_impl(spa, bp, checksum, data, size, - offset, info); + error = zio_checksum_error_impl(zio, checksum, data, size, offset, + info); if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) { /* * It's possible that this is an old gang block. Rerun * the checksum with the old size; if that passes, then * update the gangblocksize appropriately. */ - error = zio_checksum_error_impl(spa, bp, checksum, data, + error = zio_checksum_error_impl(zio, checksum, data, SPA_OLD_GANGBLOCKSIZE, offset, info); if (error == 0) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 8394bc4bcda0..b1b48721a707 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -40,6 +40,30 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_013_pos', 'alloc_class_016_pos'] tags = ['functional', 'alloc_class'] +[tests/functional/anyraid] +tests = [ 'anyraid_clean_mirror_001_pos', 'anyraid_clean_mirror_002_pos', + 'anyraid_clean_mirror_003_pos', 'anyraid_clean_mirror_004_pos', + 'anyraid_clean_raidz_001_pos', + 'anyraid_clean_raidz_002_pos', 'anyraid_clean_raidz_003_pos', + 'anyraid_offline_anymirror_001_pos', 'anyraid_offline_anymirror_002_neg', + 'anyraid_offline_anyraidz_001_pos', 'anyraid_offline_anyraidz_002_neg', + 'anyraid_tile_layout', 'anyraid_checkpoint', + 'anyraid_faildisk_write_replace_resilver', + 'anyraid_offline_write_online_resilver', + 'anyraid_special_vdev_001_pos', 'anyraid_special_vdev_002_pos', + 'anyraid_rebalance_001', 'anyraid_rebalance_002', 
'anyraid_rebalance_003', + 'anyraid_rebalance_004', 'anyraid_rebalance_005', 'anyraid_rebalance_006', + 'anyraid_rebalance_007', 'anyraid_rebalance_008', 'anyraid_rebalance_009', + 'anyraid_contract_002_pos', 'anyraid_contract_raidz2_pos', + 'anyraid_write_verify_001_pos', 'anyraid_write_verify_002_pos', + 'anyraid_write_verify_003_pos', 'anyraid_write_verify_004_pos', + 'anyraid_write_verify_005_pos', 'anyraid_write_verify_006_pos', + 'anyraid_write_verify_007_neg', 'anyraid_write_verify_008_pos', + 'anyraid_encryption_001_pos', 'anyraid_snapshot_001_pos', + 'anyraid_send_recv_001_pos', 'anyraid_scrub_001_pos', + 'anyraid_properties_001_pos', 'anyraid_gang_blocks_001_pos'] +tags = ['functional', 'anyraid'] + [tests/functional/append] tests = ['file_append', 'threadsappend_001_pos'] tags = ['functional', 'append'] @@ -399,11 +423,14 @@ tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_warn_create', 'zpool_add_warn_degraded', 'zpool_add_warn_removal', 'add-o_ashift', - 'add_prop_ashift', 'zpool_add_dryrun_output'] + 'add_prop_ashift', 'zpool_add_dryrun_output', + 'zpool_add_anyraid_001_pos', 'zpool_add_anyraid_002_pos', + 'zpool_add_anyraid_003_pos', 'zpool_add_anyraid_004_pos'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] -tests = ['zpool_attach_001_neg', 'attach-o_ashift'] +tests = ['zpool_attach_001_neg', 'zpool_attach_002_pos', 'zpool_attach_003_pos', + 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] @@ -420,7 +447,13 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', - 'zpool_create_encrypted', 'zpool_create_crypt_combos', + 
'zpool_create_anyraid_001_pos', 'zpool_create_anyraid_002_pos', + 'zpool_create_anyraid_003_pos', 'zpool_create_anyraid_004_pos', + 'zpool_create_anyraid_005_neg', 'zpool_create_anyraid_006_neg', + 'zpool_create_anyraid_007_neg', 'zpool_create_anyraid_008_neg', + 'zpool_create_anyraid_009_neg', 'zpool_create_anyraid_010_neg', + 'zpool_create_anyraid_011_neg', 'zpool_create_encrypted', + 'zpool_create_crypt_combos', 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', @@ -451,6 +484,7 @@ tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos', + 'zpool_export_anyraid_001_pos', 'zpool_export_parallel_pos', 'zpool_export_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_export'] @@ -506,6 +540,7 @@ tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_attach_detach_add_remove', + 'zpool_initialize_anyraid_attach', 'zpool_initialize_fault_export_import_online', 'zpool_initialize_import_export', 'zpool_initialize_multiple_pools', @@ -519,7 +554,6 @@ tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] -pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index dad51d2e99be..8a85d1b164f3 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -40,6 +40,17 @@ tests = ['alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_011_neg'] tags = ['functional', 'alloc_class'] +[tests/functional/anyraid] +tests = ['anyraid_offline_anymirror_001_pos', + 
'anyraid_offline_anymirror_002_neg', 'anyraid_offline_anyraidz_001_pos', + 'anyraid_offline_anyraidz_002_neg', 'anyraid_write_verify_001_pos', + 'anyraid_write_verify_002_pos', 'anyraid_write_verify_003_pos', + 'anyraid_write_verify_004_pos', 'anyraid_encryption_001_pos', + 'anyraid_snapshot_001_pos', 'anyraid_send_recv_001_pos', + 'anyraid_scrub_001_pos', 'anyraid_properties_001_pos', + 'anyraid_gang_blocks_001_pos'] +tags = ['functional', 'anyraid'] + [tests/functional/arc] tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'arcstats_runtime_tuning'] tags = ['functional', 'arc'] @@ -252,11 +263,13 @@ tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', - 'zpool_add_008_neg', 'zpool_add_009_neg'] + 'zpool_add_008_neg', 'zpool_add_009_neg', + 'zpool_add_anyraid_001_pos', 'zpool_add_anyraid_002_pos', + 'zpool_add_anyraid_003_pos', 'zpool_add_anyraid_004_pos'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] -tests = ['zpool_attach_001_neg'] +tests = ['zpool_attach_001_neg', 'zpool_attach_002_pos'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] @@ -270,10 +283,14 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', - 'zpool_create_encrypted', - 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', - 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', - 'zpool_create_features_005_pos'] + 'zpool_create_anyraid_001_pos', 'zpool_create_anyraid_003_pos', + 'zpool_create_anyraid_004_pos', 'zpool_create_anyraid_005_neg', + 'zpool_create_anyraid_006_neg','zpool_create_anyraid_007_neg', + 
'zpool_create_anyraid_008_neg', 'zpool_create_anyraid_009_neg', + 'zpool_create_anyraid_010_neg', 'zpool_create_anyraid_011_neg', + 'zpool_create_encrypted', 'zpool_create_features_001_pos', + 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', + 'zpool_create_features_004_neg', 'zpool_create_features_005_pos'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] @@ -319,7 +336,6 @@ tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_online_offline'] -pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] diff --git a/tests/zfs-tests/include/default.cfg.in b/tests/zfs-tests/include/default.cfg.in index 4e009acaff91..5b0bb04fd229 100644 --- a/tests/zfs-tests/include/default.cfg.in +++ b/tests/zfs-tests/include/default.cfg.in @@ -140,6 +140,10 @@ export MAX_FINDDISKSNUM=6 # Default minimum size for file based vdevs in the test suite export MINVDEVSIZE=$((256 * 1024 * 1024)) +# AnyRAID has higher requirements by design, +# it depends on the minimal region size +export MINVDEVSIZE2=$((24 * 1024 * 1024 * 1024)) + # Minimum vdev size possible as defined in the OS export SPA_MINDEVSIZE=$((64 * 1024 * 1024)) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index d979d6874f9b..e323ba5599e2 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -790,6 +790,23 @@ function assert (($@)) || log_fail "$@" } +function get_file_size +{ + typeset filename="$1" + + if is_linux; then + if [ -b "$filename" ] ; then + filesize=$(blockdev --getsize64 $filename) + else + filesize=$(stat -c %s $filename) + fi + else + filesize=$(stat -s $filename | awk '{print $8}' | grep -o '[0-9]\+') + fi + + echo $filesize +} + # # Function to format partition size of a disk # Given a disk cxtxdx reduces all partitions @@ -1600,6 +1617,15 @@ 
function create_pool #pool devs_list if is_global_zone ; then [[ -d /$pool ]] && rm -rf /$pool + + for internal_vd in "$@" ; do + if [[ "$internal_vd" =~ "loop" ]] ; then + # If the device is a loopback, remove previously + # allocated data. + punch_hole 0 $(get_file_size /dev/$internal_vd) \ + /dev/$internal_vd + fi + done log_must zpool create -f $pool $@ fi @@ -1857,7 +1883,7 @@ function verify_pool function get_disklist # pool { echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \ - grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") + grep -vEe '^-----' -e "^(mirror|raidz[1-3]|anymirror|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") } # @@ -2219,6 +2245,30 @@ BEGIN { FS="."; } echo $unused } +function create_sparse_files +{ + typeset prefix=$1 + typeset -i count=$2 + typeset size=$3 + + log_must mkdir -p $TESTDIR/sparse_files + + typeset sfiles="" + for (( i=0; i { typeset group=$1 diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 5e6959a54712..17ec7ba87f13 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -18,6 +18,8 @@ UNAME=$(uname) cat <<%%%% | ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount +ANYRAID_MIN_TILE_SIZE anyraid.min_tile_size zfs_anyraid_min_tile_size +ANYRAID_RELOCATE_MAX_BYTES_PAUSE anyraid.rebalance_max_bytes_pause anyraid_relocate_max_bytes_pause ARC_MAX arc.max zfs_arc_max ARC_MIN arc.min zfs_arc_min ASYNC_BLOCK_MAX_BLOCKS async_block_max_blocks zfs_async_block_max_blocks diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index e3fcce9840d9..ba393cf3562f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -90,6 +90,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/acl/acl_common.kshlib \ functional/alloc_class/alloc_class.cfg \ 
functional/alloc_class/alloc_class.kshlib \ + functional/anyraid/anyraid_common.kshlib \ + functional/anyraid/default.cfg \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ functional/bclone/bclone.cfg \ @@ -435,6 +437,51 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/alloc_class/alloc_class_016_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ + functional/anyraid/anyraid_checkpoint.ksh \ + functional/anyraid/anyraid_clean_mirror_001_pos.ksh \ + functional/anyraid/anyraid_clean_mirror_002_pos.ksh \ + functional/anyraid/anyraid_clean_mirror_003_pos.ksh \ + functional/anyraid/anyraid_clean_mirror_004_pos.ksh \ + functional/anyraid/anyraid_clean_raidz_001_pos.ksh \ + functional/anyraid/anyraid_clean_raidz_002_pos.ksh \ + functional/anyraid/anyraid_clean_raidz_003_pos.ksh \ + functional/anyraid/anyraid_contract_002_pos.ksh \ + functional/anyraid/anyraid_contract_raidz2_pos.ksh \ + functional/anyraid/anyraid_encryption_001_pos.ksh \ + functional/anyraid/anyraid_faildisk_write_replace_resilver.ksh \ + functional/anyraid/anyraid_gang_blocks_001_pos.ksh \ + functional/anyraid/anyraid_offline_anymirror_001_pos.ksh \ + functional/anyraid/anyraid_offline_anymirror_002_neg.ksh \ + functional/anyraid/anyraid_offline_anyraidz_001_pos.ksh \ + functional/anyraid/anyraid_offline_anyraidz_002_neg.ksh \ + functional/anyraid/anyraid_offline_write_online_resilver.ksh \ + functional/anyraid/anyraid_tile_layout.ksh \ + functional/anyraid/anyraid_rebalance_001.ksh \ + functional/anyraid/anyraid_rebalance_002.ksh \ + functional/anyraid/anyraid_rebalance_003.ksh \ + functional/anyraid/anyraid_rebalance_004.ksh \ + functional/anyraid/anyraid_rebalance_005.ksh \ + functional/anyraid/anyraid_rebalance_006.ksh \ + functional/anyraid/anyraid_rebalance_007.ksh \ + functional/anyraid/anyraid_rebalance_008.ksh \ + functional/anyraid/anyraid_rebalance_009.ksh \ + functional/anyraid/anyraid_scrub_001_pos.ksh \ + 
functional/anyraid/anyraid_snapshot_001_pos.ksh \ + functional/anyraid/anyraid_special_vdev_001_pos.ksh \ + functional/anyraid/anyraid_special_vdev_002_pos.ksh \ + functional/anyraid/anyraid_rebalance_001.ksh \ + functional/anyraid/anyraid_rebalance_002.ksh \ + functional/anyraid/anyraid_tile_layout.ksh \ + functional/anyraid/anyraid_write_verify_001_pos.ksh \ + functional/anyraid/anyraid_write_verify_002_pos.ksh \ + functional/anyraid/anyraid_write_verify_003_pos.ksh \ + functional/anyraid/anyraid_write_verify_004_pos.ksh \ + functional/anyraid/anyraid_write_verify_005_pos.ksh \ + functional/anyraid/anyraid_write_verify_006_pos.ksh \ + functional/anyraid/anyraid_write_verify_007_neg.ksh \ + functional/anyraid/anyraid_write_verify_008_pos.ksh \ + functional/anyraid/cleanup.ksh \ + functional/anyraid/setup.ksh \ functional/append/file_append.ksh \ functional/append/threadsappend_001_pos.ksh \ functional/append/cleanup.ksh \ @@ -1034,6 +1081,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ + functional/cli_root/zpool_add/zpool_add_anyraid_001_pos.ksh \ + functional/cli_root/zpool_add/zpool_add_anyraid_002_pos.ksh \ + functional/cli_root/zpool_add/zpool_add_anyraid_003_pos.ksh \ + functional/cli_root/zpool_add/zpool_add_anyraid_004_pos.ksh \ functional/cli_root/zpool_add/zpool_add_warn_create.ksh \ functional/cli_root/zpool_add/zpool_add_warn_degraded.ksh \ functional/cli_root/zpool_add/zpool_add_warn_removal.ksh \ @@ -1042,6 +1093,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_attach/cleanup.ksh \ functional/cli_root/zpool_attach/setup.ksh \ functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \ + functional/cli_root/zpool_attach/zpool_attach_002_pos.ksh \ + functional/cli_root/zpool_attach/zpool_attach_003_pos.ksh \ functional/cli_root/zpool/cleanup.ksh \ 
functional/cli_root/zpool_clear/cleanup.ksh \ functional/cli_root/zpool_clear/setup.ksh \ @@ -1075,6 +1128,17 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_create/zpool_create_022_pos.ksh \ functional/cli_root/zpool_create/zpool_create_023_neg.ksh \ functional/cli_root/zpool_create/zpool_create_024_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_001_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_002_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_003_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_004_pos.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_005_neg.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_006_neg.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_007_neg.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_008_neg.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_009_neg.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_010_neg.ksh \ + functional/cli_root/zpool_create/zpool_create_anyraid_011_neg.ksh \ functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ @@ -1121,6 +1185,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ + functional/cli_root/zpool_export/zpool_export_anyraid_001_pos.ksh \ functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \ functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ @@ -1192,7 +1257,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_iostat/zpool_iostat_interval_all.ksh \ 
functional/cli_root/zpool_iostat/zpool_iostat_interval_some.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ + functional/cli_root/zpool_initialize/setup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ + functional/cli_root/zpool_initialize/zpool_initialize_anyraid_attach.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh \ diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_checkpoint.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_checkpoint.ksh new file mode 100755 index 000000000000..009866e77839 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_checkpoint.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Anyraid works correctly with checkpoints +# +# STRATEGY: +# 1. Create an anyraid vdev +# 2. Take a checkpoint +# 3. Allocate more space +# 4. 
Roll back to the checkpoint +# 5. Verify that the tile map looks like what it did originally +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL +} + +log_onexit cleanup + +log_assert "Anyraid works correctly with checkpoints" +for vdev in "anymirror1" "anyraidz1:2"; do + log_must create_pool $TESTPOOL $vdev $DISKS + log_must zdb --anyraid-map $TESTPOOL + + map=$(zdb --anyraid-map $TESTPOOL) + log_must zpool checkpoint $TESTPOOL + + log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c 2048 -d R + + log_must zpool export $TESTPOOL + log_must zpool import --rewind-to-checkpoint $TESTPOOL + map2=$(zdb --anyraid-map $TESTPOOL) + log_must test "$map" == "$map2" +done +log_pass "Anyraid works correctly with checkpoints" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_001_pos.ksh new file mode 100755 index 000000000000..e5e172f84242 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_001_pos.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. 
$STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID 1-parity device can survive having 1 failed disk. +# +# STRATEGY: +# 1. Write several files to the ZFS filesystem mirror. +# 2. Override one of the disks of the mirror with zeroes. +# 3. Verify that all the file contents are unchanged on the file system. +# + +verify_runnable "global" + +log_assert "AnyRAID 1-parity can survive having 1 failed disk" + +log_must create_sparse_files "disk" 4 $DEVSIZE + +clean_mirror_spec_cases "anymirror1 $disk0 $disk1" \ + "$disk0" \ + "$disk1" + +clean_mirror_spec_cases "anymirror1 $disk0 $disk1 $disk2" \ + "$disk0" \ + "$disk1" \ + "$disk2" + +clean_mirror_spec_cases "anyraidz1:2 $disk0 $disk1 $disk2" \ + "$disk0" \ + "$disk1" \ + "$disk2" + +clean_mirror_spec_cases "anyraidz1:2 $disk0 $disk1 $disk2 $disk3" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "$disk3" + +log_pass "AnyRAID 1-parity can survive having 1 failed disk" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_002_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_002_pos.ksh new file mode 100755 index 000000000000..29428fff3c19 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_002_pos.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID 2-parity can survive having 1-2 failed disks. +# +# STRATEGY: +# 1. Write several files to the ZFS filesystem mirror. +# 2. Override the selected disks of the mirror with zeroes. +# 3. Verify that all the file contents are unchanged on the file system. +# + +verify_runnable "global" + +log_assert "AnyRAID 2-parity can survive having 1-2 failed disks" + +log_must create_sparse_files "disk" 5 $DEVSIZE + +clean_mirror_spec_cases "anymirror2 $disk0 $disk1 $disk2" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "\"$disk0 $disk1\"" \ + "\"$disk0 $disk2\"" \ + "\"$disk1 $disk2\"" + +clean_mirror_spec_cases "anymirror2 $disk0 $disk1 $disk2 $disk3" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "$disk3" \ + "\"$disk0 $disk1\"" \ + "\"$disk0 $disk2\"" \ + "\"$disk0 $disk3\"" \ + "\"$disk1 $disk2\"" \ + "\"$disk1 $disk3\"" \ + "\"$disk2 $disk3\"" + +clean_mirror_spec_cases "anyraidz2:2 $disk0 $disk1 $disk2 $disk3" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "$disk3" \ + "\"$disk0 $disk1\"" \ + "\"$disk0 $disk2\"" \ + "\"$disk0 $disk3\"" \ + "\"$disk1 $disk2\"" \ + "\"$disk1 $disk3\"" \ + "\"$disk2 $disk3\"" + +clean_mirror_spec_cases "anyraidz2:2 $disk0 $disk1 $disk2 $disk3 $disk4" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "$disk3" \ + "$disk4" \ + "\"$disk0 $disk1\"" \ + "\"$disk0 $disk2\"" \ + "\"$disk0 $disk3\"" \ + "\"$disk0 $disk4\"" \ + "\"$disk1 $disk2\"" \ + "\"$disk1 $disk3\"" \ + "\"$disk1 $disk4\"" \ + "\"$disk2 $disk3\"" \ + "\"$disk2 $disk4\"" \ + "\"$disk3 $disk4\"" + +log_pass "AnyRAID 2-parity can survive having 1-2 failed disks" diff --git 
a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_003_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_003_pos.ksh new file mode 100755 index 000000000000..05d6606db03c --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_003_pos.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID mirror3 can survive having 1-3 failed disks. +# +# STRATEGY: +# 1. Write several files to the ZFS filesystem mirror. +# 2. Override the selected disks of the mirror with zeroes. +# 3. Verify that all the file contents are unchanged on the file system. 
+# + +verify_runnable "global" + +log_assert "AnyRAID mirror3 can survive having 1-3 failed disks" + +log_must create_sparse_files "disk" 4 $DEVSIZE + +clean_mirror_spec_cases "anymirror3 $disk0 $disk1 $disk2 $disk3" \ + "$disk0" \ + "$disk1" \ + "$disk2" \ + "$disk3" \ + "\"$disk0 $disk1\"" \ + "\"$disk0 $disk2\"" \ + "\"$disk0 $disk3\"" \ + "\"$disk1 $disk2\"" \ + "\"$disk1 $disk3\"" \ + "\"$disk2 $disk3\"" \ + "\"$disk0 $disk1 $disk2\"" \ + "\"$disk0 $disk1 $disk3\"" \ + "\"$disk0 $disk2 $disk3\"" \ + "\"$disk1 $disk2 $disk3\"" + +log_pass "AnyRAID mirror3 can survive having 1-3 failed disks" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_004_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_004_pos.ksh new file mode 100755 index 000000000000..dca5fb2d2db4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_mirror_004_pos.ksh @@ -0,0 +1,56 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID mirror4 can survive having 1-4 failed disks. 
+# +# STRATEGY: +# 1. Write several files to the ZFS filesystem mirror. +# 2. Override the selected disks of the mirror with zeroes. +# 3. Verify that all the file contents are unchanged on the file system. +# + +verify_runnable "global" + +log_assert "AnyRAID mirror4 can survive having 1-4 failed disks" + +log_must create_sparse_files "disk" 5 $DEVSIZE + +clean_mirror_spec_cases "anymirror4 $disk0 $disk1 $disk2 $disk3 $disk4" \ + "$disk0" \ + "$disk4" \ + "\"$disk0 $disk1\"" \ + "\"$disk1 $disk4\"" \ + "\"$disk0 $disk1 $disk3\"" \ + "\"$disk2 $disk3 $disk4\"" \ + "\"$disk0 $disk1 $disk2 $disk3\"" \ + "\"$disk0 $disk2 $disk3 $disk4\"" + +log_pass "AnyRAID mirror4 can survive having 1-4 failed disks" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_001_pos.ksh new file mode 100755 index 000000000000..522e859eae55 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_001_pos.ksh @@ -0,0 +1,128 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. 
$STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID raidz1:2 can survive having 1 failed disk. This is the raidz +# equivalent of the clean_mirror tests. With parity=1, the pool should +# tolerate any single disk failure without data loss. +# +# STRATEGY: +# 1. Create an anyraidz1:2 pool with 4 disks. +# 2. Write files and record xxh128 checksums. +# 3. Punch holes in 1 disk at a time. +# 4. Export/import, verify all checksums match. +# 5. Scrub and verify no errors. +# 6. Repeat for each disk individually. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_must delete_sparse_files +} + +log_onexit cleanup + +log_assert "AnyRAID raidz1:2 can survive having 1 failed disk" + +log_must create_sparse_files "disk" 4 $DEVSIZE + +typeset poolspec="anyraidz1:2 $disk0 $disk1 $disk2 $disk3" +typeset diskdir=$(dirname $disk0) + +# +# Test each single-disk failure case individually. +# +for failed_disk in $disk0 $disk1 $disk2 $disk3; do + log_note "Testing single-disk failure: $failed_disk" + + log_must zpool create -f $TESTPOOL $poolspec + + # + # Write files and record checksums. + # + typeset -i atfile=0 + set -A files + set -A cksums + typeset newcksum + + while (( atfile < FILE_COUNT )); do + files[$atfile]=/$TESTPOOL/file.$atfile + log_must file_write -o create -f ${files[$atfile]} \ + -b $FILE_SIZE -c 1 + cksums[$atfile]=$(xxh128digest ${files[$atfile]}) + (( atfile = atfile + 1 )) + done + + # + # Punch a hole in the target disk to simulate failure. + # + log_must punch_hole $((DD_BLOCK * 8)) \ + $((DD_BLOCK * (DD_COUNT - 128))) $failed_disk + + # + # Flush out the cache by exporting and re-importing. + # + log_must zpool export $TESTPOOL + log_must zpool import -d $diskdir $TESTPOOL + + # + # Verify all file checksums match. 
+ # + atfile=0 + typeset -i failedcount=0 + while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + fi + (( atfile = atfile + 1 )) + done + + if (( failedcount > 0 )); then + log_fail "$failedcount of $FILE_COUNT files had wrong" \ + "checksums after failing disk $failed_disk" + fi + + # + # Run scrub and verify no errors. + # + log_must zpool scrub $TESTPOOL + log_must wait_scrubbed $TESTPOOL + + # + # Destroy pool for the next iteration. + # + log_must destroy_pool $TESTPOOL +done + +log_pass "AnyRAID raidz1:2 can survive having 1 failed disk" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_002_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_002_pos.ksh new file mode 100755 index 000000000000..4fafc496966d --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_002_pos.ksh @@ -0,0 +1,162 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. 
$STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID raidz2:2 can survive having 1-2 failed disks. This is the raidz +# equivalent of clean_mirror_002. With parity=2, the pool should tolerate +# up to 2 simultaneous disk failures without data loss. +# +# STRATEGY: +# 1. Create an anyraidz2:2 pool with 5 disks. +# 2. Write files and record xxh128 checksums. +# 3. Punch holes in 1 or 2 disks at a time. +# 4. Export/import, verify all checksums match. +# 5. Scrub and verify no errors. +# 6. Repeat for each disk failure combination. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_must delete_sparse_files +} + +log_onexit cleanup + +log_assert "AnyRAID raidz2:2 can survive having 1-2 failed disks" + +log_must create_sparse_files "disk" 5 $DEVSIZE + +typeset poolspec="anyraidz2:2 $disk0 $disk1 $disk2 $disk3 $disk4" +typeset diskdir=$(dirname $disk0) + +# +# Build the list of failure cases: all single-disk and all 2-disk +# combinations from 5 disks. +# +set -A all_disks $disk0 $disk1 $disk2 $disk3 $disk4 +set -A fail_cases +typeset -i case_idx=0 + +# Single-disk failures (5 cases). +typeset -i i=0 +while (( i < 5 )); do + fail_cases[$case_idx]="${all_disks[$i]}" + (( case_idx = case_idx + 1 )) + (( i = i + 1 )) +done + +# 2-disk failure combinations (10 cases). +i=0 +while (( i < 4 )); do + typeset -i j + (( j = i + 1 )) + while (( j < 5 )); do + fail_cases[$case_idx]="${all_disks[$i]} ${all_disks[$j]}" + (( case_idx = case_idx + 1 )) + (( j = j + 1 )) + done + (( i = i + 1 )) +done + +log_note "Total failure cases to test: $case_idx" + +typeset -i case_num=0 +while (( case_num < case_idx )); do + typeset tcase="${fail_cases[$case_num]}" + log_note "Test case $(( case_num + 1 ))/$case_idx: failing disks: $tcase" + + log_must zpool create -f $TESTPOOL $poolspec + + # + # Write files and record checksums. 
+ # + typeset -i atfile=0 + set -A files + set -A cksums + typeset newcksum + + while (( atfile < FILE_COUNT )); do + files[$atfile]=/$TESTPOOL/file.$atfile + log_must file_write -o create -f ${files[$atfile]} \ + -b $FILE_SIZE -c 1 + cksums[$atfile]=$(xxh128digest ${files[$atfile]}) + (( atfile = atfile + 1 )) + done + + # + # Punch holes in the target disk(s) to simulate failure. + # + for failed_disk in $tcase; do + log_must punch_hole $((DD_BLOCK * 8)) \ + $((DD_BLOCK * (DD_COUNT - 128))) $failed_disk + done + + # + # Flush out the cache by exporting and re-importing. + # + log_must zpool export $TESTPOOL + log_must zpool import -d $diskdir $TESTPOOL + + # + # Verify all file checksums match. + # + atfile=0 + typeset -i failedcount=0 + while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + fi + (( atfile = atfile + 1 )) + done + + if (( failedcount > 0 )); then + log_fail "$failedcount of $FILE_COUNT files had wrong" \ + "checksums after failing disks: $tcase" + fi + + # + # Run scrub and verify no errors. + # + log_must zpool scrub $TESTPOOL + log_must wait_scrubbed $TESTPOOL + + # + # Destroy pool for the next iteration. + # + log_must destroy_pool $TESTPOOL + + (( case_num = case_num + 1 )) +done + +log_pass "AnyRAID raidz2:2 can survive having 1-2 failed disks" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_003_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_003_pos.ksh new file mode 100755 index 000000000000..d78a15a0e2c2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_clean_raidz_003_pos.ksh @@ -0,0 +1,163 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID raidz3:3 can survive having 1-3 failed disks. This is the raidz +# equivalent of clean_mirror_003. With parity=3, the pool should tolerate +# up to 3 simultaneous disk failures without data loss. +# +# STRATEGY: +# 1. Create an anyraidz3:3 pool with 7 disks. +# 2. Write files and record xxh128 checksums. +# 3. Punch holes in 1, 2, or 3 disks at a time. +# 4. Export/import, verify all checksums match. +# 5. Scrub and verify no errors. +# 6. Repeat for representative disk failure combinations. +# +# Note: With 7 disks, all combinations would be 63 cases. +# We test representative cases covering first/middle/last positions, +# adjacent/spread patterns for each failure count. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_must delete_sparse_files +} + +log_onexit cleanup + +log_assert "AnyRAID raidz3:3 can survive having 1-3 failed disks" + +log_must create_sparse_files "disk" 7 $DEVSIZE + +typeset poolspec="anyraidz3:3 $disk0 $disk1 $disk2 $disk3 $disk4 $disk5 $disk6" +typeset diskdir=$(dirname $disk0) + +# +# Build representative failure cases. +# Single-disk failures: first, middle, last. 
+# 2-disk failures: adjacent pair, spread pair, end pair. +# 3-disk failures: first three, spread three, last three. +# +set -A fail_cases +typeset -i case_idx=0 + +# Single-disk failures (3 representative cases). +fail_cases[$case_idx]="$disk0"; (( case_idx = case_idx + 1 )) +fail_cases[$case_idx]="$disk3"; (( case_idx = case_idx + 1 )) +fail_cases[$case_idx]="$disk6"; (( case_idx = case_idx + 1 )) + +# 2-disk failures (4 representative cases). +fail_cases[$case_idx]="$disk0 $disk1"; (( case_idx = case_idx + 1 )) +fail_cases[$case_idx]="$disk0 $disk6"; (( case_idx = case_idx + 1 )) +fail_cases[$case_idx]="$disk2 $disk5"; (( case_idx = case_idx + 1 )) +fail_cases[$case_idx]="$disk5 $disk6"; (( case_idx = case_idx + 1 )) + +# 3-disk failures (4 representative cases). +fail_cases[$case_idx]="$disk0 $disk1 $disk2"; (( case_idx = case_idx + 1 )) +fail_cases[$case_idx]="$disk0 $disk3 $disk6"; (( case_idx = case_idx + 1 )) +fail_cases[$case_idx]="$disk1 $disk3 $disk5"; (( case_idx = case_idx + 1 )) +fail_cases[$case_idx]="$disk4 $disk5 $disk6"; (( case_idx = case_idx + 1 )) + +log_note "Total failure cases to test: $case_idx" + +typeset -i case_num=0 +while (( case_num < case_idx )); do + typeset tcase="${fail_cases[$case_num]}" + log_note "Test case $(( case_num + 1 ))/$case_idx: failing disks: $tcase" + + log_must zpool create -f $TESTPOOL $poolspec + + # + # Write files and record checksums. + # + typeset -i atfile=0 + set -A files + set -A cksums + typeset newcksum + + while (( atfile < FILE_COUNT )); do + files[$atfile]=/$TESTPOOL/file.$atfile + log_must file_write -o create -f ${files[$atfile]} \ + -b $FILE_SIZE -c 1 + cksums[$atfile]=$(xxh128digest ${files[$atfile]}) + (( atfile = atfile + 1 )) + done + + # + # Punch holes in the target disk(s) to simulate failure. + # + for failed_disk in $tcase; do + log_must punch_hole $((DD_BLOCK * 8)) \ + $((DD_BLOCK * (DD_COUNT - 128))) $failed_disk + done + + # + # Flush out the cache by exporting and re-importing. 
+ # + log_must zpool export $TESTPOOL + log_must zpool import -d $diskdir $TESTPOOL + + # + # Verify all file checksums match. + # + atfile=0 + typeset -i failedcount=0 + while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + fi + (( atfile = atfile + 1 )) + done + + if (( failedcount > 0 )); then + log_fail "$failedcount of $FILE_COUNT files had wrong" \ + "checksums after failing disks: $tcase" + fi + + # + # Run scrub and verify no errors. + # + log_must zpool scrub $TESTPOOL + log_must wait_scrubbed $TESTPOOL + + # + # Destroy pool for the next iteration. + # + log_must destroy_pool $TESTPOOL + + (( case_num = case_num + 1 )) +done + +log_pass "AnyRAID raidz3:3 can survive having 1-3 failed disks" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_common.kshlib b/tests/zfs-tests/tests/functional/anyraid/anyraid_common.kshlib new file mode 100644 index 000000000000..1b4f7d15451c --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_common.kshlib @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/anyraid/default.cfg + +function wipe_some_disks_and_verify_content_is_still_okay +{ + typeset pool=$1 + shift + + typeset atfile=0 + set -A files + set -A cksums + typeset newcksum + + while (( atfile < FILE_COUNT )); do + files[$atfile]=/$pool/file.$atfile + log_must file_write -o create -f ${files[$atfile]} \ + -b $FILE_SIZE -c 1 + cksums[$atfile]=$(xxh128digest ${files[$atfile]}) + (( atfile = atfile + 1 )) + done + + for disk in $@; do + log_must punch_hole $((DD_BLOCK * 8)) $((DD_BLOCK * (DD_COUNT - 128))) $disk + done + + # + # Flush out the cache so that we ensure we're reading from disk. + # + log_must zpool status + log_must zpool export $pool + log_must zpool import -d $(dirname $1) + log_must zpool import -d $(dirname $1) $pool + + atfile=0 + typeset -i failedcount=0 + while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + log_note "Wrong checksum of ${files[$atfile]}" + fi + (( atfile = atfile + 1 )) + done + + if (( failedcount > 0 )); then + log_fail "$failedcount of the $FILE_COUNT files did not" \ + "have the same checksum before and after" + fi + + log_must zpool status + log_must zpool scrub $pool + log_must wait_scrubbed $pool + log_must zpool status +} + +function clean_mirror_spec_cases +{ + typeset poolspec=$1 + shift + + typeset tcases + eval "typeset -a tcases=($*)" + + log_note "pool specification: $poolspec" + + for tcase in "${tcases[@]}"; do + log_note "failed disk case: $tcase" + log_must zpool create -f $TESTPOOL $poolspec + wipe_some_disks_and_verify_content_is_still_okay 
$TESTPOOL $tcase + poolexists $TESTPOOL && destroy_pool $TESTPOOL + done +} diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_contract_002_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_contract_002_pos.ksh new file mode 100755 index 000000000000..d1e74590844f --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_contract_002_pos.ksh @@ -0,0 +1,142 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Basic contraction on anyraidz1:2 preserves data integrity. +# Removes one disk from a pool with surplus disks and verifies +# all data checksums are unchanged. +# +# STRATEGY: +# 1. Create anyraidz1:2 pool with 5 disks (min width=3, so 2 surplus) +# 2. Write 10 files and record xxh128 checksums +# 3. Record pool capacity before contraction +# 4. Run zpool contract to remove one disk +# 5. Wait for relocation to complete +# 6. Verify pool capacity decreased +# 7. Verify all file checksums are unchanged +# 8. 
Run scrub, verify no errors +# + +verify_runnable "global" + +cleanup() { + log_note "DEBUG: cleanup started" + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} +} + +log_onexit cleanup + +log_note "DEBUG: creating sparse files" +log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +log_note "DEBUG: setting tile size to 64MiB" +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Basic contraction on anyraidz1:2 preserves data integrity" + +log_note "DEBUG: creating anyraidz1:2 pool with 5 disks" +log_must create_pool $TESTPOOL anyraidz1:2 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +# +# Write several files and record their checksums. +# +log_note "DEBUG: writing test files and recording checksums" +typeset -i file_count=10 +typeset -i idx=0 +set -A cksums + +while (( idx < file_count )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + log_note "DEBUG: file.$idx checksum=${cksums[$idx]}" + (( idx = idx + 1 )) +done + +log_note "DEBUG: recording capacity before contraction" +cap_before=$(zpool get -Hp -o value size $TESTPOOL) +log_note "DEBUG: capacity before=$cap_before" + +# +# Get the actual vdev name from zpool status for anyraidz. +# The vdev name includes the full type spec (e.g., anyraidz1:2-0). +# +log_note "DEBUG: looking up anyraidz vdev name" +vdev_name=$(zpool status $TESTPOOL | awk '/anyraidz/ && /raidz/ {print $1; exit}') +log_note "DEBUG: vdev name=$vdev_name" +[[ -n "$vdev_name" ]] || log_fail "Could not find anyraidz vdev name" + +# +# Contract the pool by removing disk 4. 
+# +log_note "DEBUG: starting contraction to remove vdev_file.4" +log_must zpool contract $TESTPOOL $vdev_name \ + $TEST_BASE_DIR/vdev_file.4 + +log_note "DEBUG: waiting for relocation to complete" +log_must zpool wait -t anyraid_relocate $TESTPOOL +log_must zpool sync $TESTPOOL + +# +# Verify capacity decreased. +# +log_note "DEBUG: checking capacity after contraction" +cap_after=$(zpool get -Hp -o value size $TESTPOOL) +log_note "DEBUG: capacity after=$cap_after" +[[ "$cap_after" -lt "$cap_before" ]] || \ + log_fail "Capacity did not decrease: before=$cap_before after=$cap_after" + +# +# Verify all file checksums are unchanged. +# +log_note "DEBUG: verifying file checksums" +idx=0 +while (( idx < file_count )); do + newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for file.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +# +# Verify pool health and no checksum errors. +# +log_note "DEBUG: running scrub" +log_must zpool scrub -w $TESTPOOL + +log_note "DEBUG: checking pool status" +log_must check_pool_status $TESTPOOL state ONLINE true +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +[[ "$cksum_count" -eq 0 ]] || log_fail "checksum errors detected" + +log_pass "Basic contraction on anyraidz1:2 preserves data integrity" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_contract_raidz2_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_contract_raidz2_pos.ksh new file mode 100755 index 000000000000..b3fe34bfc057 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_contract_raidz2_pos.ksh @@ -0,0 +1,136 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify contraction works on anyraidz2:2 (double parity). +# Minimum width = 4 (2 parity + 2 data), so 6 disks has 2 surplus. +# +# STRATEGY: +# 1. Create anyraidz2:2 pool with 6 disks +# 2. Write data and record checksums +# 3. Contract one disk (6 -> 5), wait, verify +# 4. Scrub, verify no errors +# + +verify_runnable "global" + +cleanup() { + log_note "DEBUG: cleanup started" + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5} +} + +log_onexit cleanup + +log_note "DEBUG: creating sparse files" +log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5} + +log_note "DEBUG: setting tile size to 64MiB" +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Contraction on anyraidz2:2 preserves data integrity" + +log_note "DEBUG: creating anyraidz2:2 pool with 6 disks" +log_must create_pool $TESTPOOL anyraidz2:2 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5} + +# +# Write data and record checksums. 
+# +log_note "DEBUG: writing test files and recording checksums" +typeset -i file_count=10 +typeset -i idx=0 +set -A cksums + +while (( idx < file_count )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + log_note "DEBUG: file.$idx checksum=${cksums[$idx]}" + (( idx = idx + 1 )) +done + +log_note "DEBUG: recording capacity before contraction" +cap_before=$(zpool get -Hp -o value size $TESTPOOL) +log_note "DEBUG: capacity before=$cap_before" + +# +# Get the actual vdev name from zpool status for anyraidz. +# +log_note "DEBUG: looking up anyraidz vdev name" +vdev_name=$(zpool status $TESTPOOL | awk '/anyraidz/ && /raidz/ {print $1; exit}') +log_note "DEBUG: vdev name=$vdev_name" +[[ -n "$vdev_name" ]] || log_fail "Could not find anyraidz vdev name" + +# +# Contract the pool by removing disk 5 (6 -> 5 disks). +# +log_note "DEBUG: starting contraction to remove vdev_file.5" +log_must zpool contract $TESTPOOL $vdev_name \ + $TEST_BASE_DIR/vdev_file.5 + +log_note "DEBUG: waiting for relocation to complete" +log_must zpool wait -t anyraid_relocate $TESTPOOL +log_must zpool sync $TESTPOOL + +# +# Verify capacity decreased. +# +log_note "DEBUG: checking capacity after contraction" +cap_after=$(zpool get -Hp -o value size $TESTPOOL) +log_note "DEBUG: capacity after=$cap_after" +[[ "$cap_after" -lt "$cap_before" ]] || \ + log_fail "Capacity did not decrease: before=$cap_before after=$cap_after" + +# +# Verify all file checksums are unchanged. +# +log_note "DEBUG: verifying file checksums" +idx=0 +while (( idx < file_count )); do + newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for file.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +# +# Verify pool health and no checksum errors. 
+# +log_note "DEBUG: running scrub" +log_must zpool scrub -w $TESTPOOL + +log_note "DEBUG: checking pool status" +log_must check_pool_status $TESTPOOL state ONLINE true +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +[[ "$cksum_count" -eq 0 ]] || log_fail "checksum errors detected" + +log_pass "Contraction on anyraidz2:2 preserves data integrity" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_encryption_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_encryption_001_pos.ksh new file mode 100755 index 000000000000..eb7353e75dc5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_encryption_001_pos.ksh @@ -0,0 +1,147 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify AnyRAID works correctly with ZFS native encryption. +# Create an anymirror1 pool, create an encrypted dataset, write data, +# export/import, load the key, and verify data integrity. +# This test is self-contained and does not depend on any other test. +# +# STRATEGY: +# 1. 
Create an anymirror1 pool with 3 disks. +# 2. Create an encrypted dataset using a passphrase. +# 3. Write data and record xxh128 checksums. +# 4. Export the pool. +# 5. Import the pool and load the encryption key. +# 6. Verify all data checksums match. +# 7. Run scrub, verify no errors. +# + +verify_runnable "global" + +PASSPHRASE="testpassword123" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2} +} + +log_onexit cleanup + +log_assert "AnyRAID with native encryption preserves data across export/import" + +# +# Create backing files and set tile size. +# +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2} +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +# +# Create the pool. +# +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2} + +# +# Create an encrypted dataset. +# +log_must eval "echo '$PASSPHRASE' | zfs create \ + -o encryption=aes-256-gcm \ + -o keyformat=passphrase \ + -o keylocation=prompt \ + $TESTPOOL/encrypted" + +# +# Write files and record checksums. +# +set -A cksums +typeset -i idx=0 + +while (( idx < 5 )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/encrypted/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/encrypted/file.$idx) + (( idx = idx + 1 )) +done + +# +# Also write a larger file for more coverage. +# +log_must file_write -o create -b 1048576 -c 32 -d 'R' \ + -f /$TESTPOOL/encrypted/largefile +typeset large_cksum=$(xxh128digest /$TESTPOOL/encrypted/largefile) + +# +# Sync and export the pool. +# +log_must zpool sync $TESTPOOL +log_must zpool export $TESTPOOL + +# +# Import the pool. The encrypted dataset will not be mounted yet. +# +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Load the encryption key and mount the dataset. 
+# +log_must eval "echo '$PASSPHRASE' | zfs load-key $TESTPOOL/encrypted" +log_must zfs mount $TESTPOOL/encrypted + +# +# Verify all checksums. +# +idx=0 +while (( idx < 5 )); do + typeset newcksum=$(xxh128digest /$TESTPOOL/encrypted/file.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for file.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +typeset new_large_cksum=$(xxh128digest /$TESTPOOL/encrypted/largefile) +[[ "$new_large_cksum" == "$large_cksum" ]] || \ + log_fail "Checksum mismatch for largefile: expected=$large_cksum got=$new_large_cksum" + +# +# Run scrub and verify no errors. +# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +typeset cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | \ + awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "Checksum errors detected after scrub" + +log_pass "AnyRAID with native encryption preserves data across export/import" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_faildisk_write_replace_resilver.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_faildisk_write_replace_resilver.ksh new file mode 100755 index 000000000000..129625b981ad --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_faildisk_write_replace_resilver.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# AnyRAID mirror can resilver a replaced disk. +# +# STRATEGY: +# 1. Fail one disk. +# 2. Write new data to the pool. +# 3. Get that disk replaced and resilvered. +# 4. Repeat to verify sequential resilvering. +# + +verify_runnable "global" + +log_assert "AnyRAID mirror can resilver a replaced disk" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + + +for vdev in "anymirror1" "anyraidz1:1"; do + for replace_flags in '' '-s'; do + + [[ "$vdev" =~ "anyraidz1:1" && "$replace_flags" == "-s" ]] && continue + log_must create_sparse_files "disk" 3 $DEVSIZE + log_must create_sparse_files "spare" 1 $DEVSIZE + log_must zpool create -O compress=off -f $TESTPOOL $vdev $disks + log_must zfs set primarycache=none $TESTPOOL + + # Write initial data + log_must file_write -o create -f /$TESTPOOL/file1.bin -b 1048576 -c 256 -d Z + + # Fail one disk + log_must truncate -s0 $disk0 + + # Read initial data, write new data + log_must dd if=/$TESTPOOL/file1.bin of=/dev/null bs=1M count=256 + log_must file_write -o create -f /$TESTPOOL/file1.bin -b 1048576 -c 256 -d Y + + # Check that disk is faulted + zpool status + log_must check_state $TESTPOOL $disk0 "faulted" + + # Initiate disk replacement + log_must zpool replace -f $replace_flags $TESTPOOL $disk0 $spare0 + + # Wait until resilvering is done and the pool is back online + for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 + done + zpool status + log_must check_state $TESTPOOL 
"" "online" + + destroy_pool $TESTPOOL + done +done + +log_pass "AnyRAID mirror can resilver a replaced disk" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_gang_blocks_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_gang_blocks_001_pos.ksh new file mode 100755 index 000000000000..8e948c0415f7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_gang_blocks_001_pos.ksh @@ -0,0 +1,165 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify AnyRAID handles gang blocks correctly when the pool is +# nearly full. Fill an anymirror1 pool with small disks to near +# capacity, then continue writing to force gang block allocation. +# Verify data integrity after the writes complete. +# This test is self-contained and does not depend on any other test. +# +# STRATEGY: +# 1. Create an anymirror1 pool with small disks (512MiB each). +# 2. Fill the pool to near capacity with known data. +# 3. Continue writing small files to push into gang block territory. +# 4. Record checksums of all successfully written files. +# 5. 
Export and re-import the pool. +# 6. Verify all checksums match. +# 7. Run scrub, verify no errors. +# + +verify_runnable "global" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2} +} + +log_onexit cleanup + +log_assert "AnyRAID handles gang blocks correctly when pool is nearly full" + +# +# Create small backing files and set tile size. +# +log_must truncate -s 512M $TEST_BASE_DIR/vdev_file.{0,1,2} +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +# +# Create the pool. +# +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2} + +# +# Fill the pool with 1MiB files until we approach capacity. +# With anymirror1 (parity=1) and 3 x 512MiB disks, usable +# capacity is roughly 768MiB (1536MiB / 2). Write ~700MiB +# to get close to full. +# +set -A cksums +typeset -i file_idx=0 +typeset -i fill_count=700 + +while (( file_idx < fill_count )); do + file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/fill.$file_idx 2>/dev/null + if (( $? != 0 )); then + break + fi + cksums[$file_idx]=$(xxh128digest /$TESTPOOL/fill.$file_idx) + (( file_idx = file_idx + 1 )) +done + +typeset -i total_files=$file_idx + +# +# Now write small 4K files to push further into fragmented/gang +# block territory. These writes may fail with ENOSPC which is +# expected. +# +typeset -i small_idx=0 +typeset -i small_count=0 + +while (( small_idx < 100 )); do + dd if=/dev/urandom of=/$TESTPOOL/small.$small_idx \ + bs=4096 count=1 2>/dev/null + if (( $? != 0 )); then + break + fi + cksums[$((total_files + small_idx))]=$(xxh128digest /$TESTPOOL/small.$small_idx) + (( small_idx = small_idx + 1 )) +done + +small_count=$small_idx +typeset -i all_files=$((total_files + small_count)) + +# +# Sync and verify pool is still ONLINE. +# +log_must zpool sync $TESTPOOL +log_must check_pool_status $TESTPOOL state ONLINE true + +# +# Export and re-import the pool. 
+# +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Verify all fill file checksums. +# +typeset -i failedcount=0 +file_idx=0 +while (( file_idx < total_files )); do + typeset newcksum=$(xxh128digest /$TESTPOOL/fill.$file_idx) + if [[ "$newcksum" != "${cksums[$file_idx]}" ]]; then + (( failedcount = failedcount + 1 )) + fi + (( file_idx = file_idx + 1 )) +done + +# +# Verify all small file checksums. +# +small_idx=0 +while (( small_idx < small_count )); do + typeset newcksum=$(xxh128digest /$TESTPOOL/small.$small_idx) + typeset expected_idx=$((total_files + small_idx)) + if [[ "$newcksum" != "${cksums[$expected_idx]}" ]]; then + (( failedcount = failedcount + 1 )) + fi + (( small_idx = small_idx + 1 )) +done + +if (( failedcount > 0 )); then + log_fail "$failedcount of $all_files files had wrong checksums after export/import" +fi + +# +# Run scrub and verify no errors. +# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true + +log_pass "AnyRAID handles gang blocks correctly when pool is nearly full" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anymirror_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anymirror_001_pos.ksh new file mode 100755 index 000000000000..845d7b5970f6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anymirror_001_pos.ksh @@ -0,0 +1,136 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026, Klara, Inc.
+#
+
+# Common anyraid helpers; presumably provides create_sparse_files,
+# delete_sparse_files, $DEVSIZE, $FILE_COUNT and $FILE_SIZE -- confirm
+# against anyraid_common.kshlib.
+. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib
+
+#
+# DESCRIPTION:
+# Offlining a disk from an anymirror1 pool with surplus disks
+# preserves data integrity. The pool becomes DEGRADED but all data
+# remains readable. After onlining the disk back, the pool returns
+# to ONLINE. This test is self-contained.
+#
+# STRATEGY:
+# 1. Create an anymirror1 pool with 3 disks (surplus for parity=1).
+# 2. Write data and record xxh128 checksums.
+# 3. Offline one disk.
+# 4. Verify pool is DEGRADED and all checksums match.
+# 5. Online the disk back.
+# 6. Wait for resilver, verify pool returns to ONLINE.
+# 7. Scrub and verify no errors.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+	log_must delete_sparse_files
+}
+
+log_onexit cleanup
+
+log_assert "Offlining a disk from an anymirror1 pool with surplus disks preserves data"
+
+#
+# Create sparse disk files and pool.
+# $disk0..$disk2 are set by create_sparse_files.
+#
+log_must create_sparse_files "disk" 3 $DEVSIZE
+log_must zpool create -f $TESTPOOL anymirror1 $disk0 $disk1 $disk2
+
+#
+# Write files and record checksums.
+#
+typeset -i atfile=0
+set -A files
+set -A cksums
+typeset newcksum
+
+while (( atfile < FILE_COUNT )); do
+	files[$atfile]=/$TESTPOOL/file.$atfile
+	log_must file_write -o create -f ${files[$atfile]} \
+	    -b $FILE_SIZE -c 1
+	cksums[$atfile]=$(xxh128digest ${files[$atfile]})
+	(( atfile = atfile + 1 ))
+done
+
+# Persist all test data before degrading the pool.
+log_must zpool sync $TESTPOOL
+
+#
+# Offline the third disk.
+#
+log_must zpool offline $TESTPOOL $disk2
+
+#
+# Verify the pool is DEGRADED.
+# +log_must check_state $TESTPOOL "" "degraded" +log_must check_state $TESTPOOL $disk2 "offline" + +# +# Verify all file checksums still match while degraded. +# +atfile=0 +typeset -i failedcount=0 +while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + fi + (( atfile = atfile + 1 )) +done + +if (( failedcount > 0 )); then + log_fail "$failedcount of $FILE_COUNT files had wrong checksums" \ + "while pool was degraded" +fi + +# +# Online the disk back. +# +log_must zpool online $TESTPOOL $disk2 + +# +# Wait for resilver to complete. +# +for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 +done + +# +# Verify the pool is back to ONLINE. +# +log_must check_state $TESTPOOL "" "online" + +# +# Run scrub and verify no errors. +# +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +log_pass "Offlining a disk from an anymirror1 pool with surplus disks preserves data" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anymirror_002_neg.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anymirror_002_neg.ksh new file mode 100755 index 000000000000..6c751bee5211 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anymirror_002_neg.ksh @@ -0,0 +1,132 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# Offlining too many disks from an anymirror1 pool is rejected. +# An anymirror1 pool with 2 disks (minimum for parity=1) can +# tolerate 1 offline disk, but offlining the second must fail. +# This test is self-contained. +# +# STRATEGY: +# 1. Create an anymirror1 pool with 2 disks (minimum for parity=1). +# 2. Write data and record xxh128 checksums. +# 3. Offline one disk (succeeds, pool DEGRADED). +# 4. Attempt to offline the second disk (must fail). +# 5. Verify data integrity while degraded. +# 6. Online the first disk back, verify pool returns to ONLINE. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_must delete_sparse_files +} + +log_onexit cleanup + +log_assert "Offlining all disks from a minimum anymirror1 pool is rejected" + +# +# Create sparse disk files and pool with minimum disks for parity=1. +# +log_must create_sparse_files "disk" 2 $DEVSIZE +log_must zpool create -f $TESTPOOL anymirror1 $disk0 $disk1 + +# +# Write files and record checksums. +# +typeset -i atfile=0 +set -A files +set -A cksums +typeset newcksum + +while (( atfile < FILE_COUNT )); do + files[$atfile]=/$TESTPOOL/file.$atfile + log_must file_write -o create -f ${files[$atfile]} \ + -b $FILE_SIZE -c 1 + cksums[$atfile]=$(xxh128digest ${files[$atfile]}) + (( atfile = atfile + 1 )) +done + +log_must zpool sync $TESTPOOL + +# +# Offline the first disk. This should succeed because parity=1 +# can tolerate 1 offline disk. 
+# +log_must zpool offline $TESTPOOL $disk0 +log_must check_state $TESTPOOL "" "degraded" +log_must check_state $TESTPOOL $disk0 "offline" + +# +# Attempt to offline the second disk. This must fail because +# it would exceed the parity tolerance. +# +log_mustnot zpool offline $TESTPOOL $disk1 + +# +# Verify all file checksums still match while degraded. +# +atfile=0 +typeset -i failedcount=0 +while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + fi + (( atfile = atfile + 1 )) +done + +if (( failedcount > 0 )); then + log_fail "$failedcount of $FILE_COUNT files had wrong checksums" \ + "while pool was degraded" +fi + +# +# Online the first disk back. +# +log_must zpool online $TESTPOOL $disk0 + +# +# Wait for resilver to complete. +# +for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 +done + +# +# Verify the pool is back to ONLINE. +# +log_must check_state $TESTPOOL "" "online" + +log_pass "Offlining all disks from a minimum anymirror1 pool is rejected" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anyraidz_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anyraidz_001_pos.ksh new file mode 100755 index 000000000000..00bde30a1658 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anyraidz_001_pos.ksh @@ -0,0 +1,136 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# Offlining a disk from an anyraidz1:2 pool with surplus disks +# preserves data integrity. The pool becomes DEGRADED but all data +# remains readable. After onlining the disk back, the pool returns +# to ONLINE. This test is self-contained. +# +# STRATEGY: +# 1. Create an anyraidz1:2 pool with 4 disks (minimum is 3). +# 2. Write data and record xxh128 checksums. +# 3. Offline one disk. +# 4. Verify pool is DEGRADED and all checksums match. +# 5. Online the disk back. +# 6. Wait for resilver, verify pool returns to ONLINE. +# 7. Scrub and verify no errors. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_must delete_sparse_files +} + +log_onexit cleanup + +log_assert "Offlining a disk from an anyraidz1:2 pool with surplus disks preserves data" + +# +# Create sparse disk files and pool. +# +log_must create_sparse_files "disk" 4 $DEVSIZE +log_must zpool create -f $TESTPOOL anyraidz1:2 $disk0 $disk1 $disk2 $disk3 + +# +# Write files and record checksums. +# +typeset -i atfile=0 +set -A files +set -A cksums +typeset newcksum + +while (( atfile < FILE_COUNT )); do + files[$atfile]=/$TESTPOOL/file.$atfile + log_must file_write -o create -f ${files[$atfile]} \ + -b $FILE_SIZE -c 1 + cksums[$atfile]=$(xxh128digest ${files[$atfile]}) + (( atfile = atfile + 1 )) +done + +log_must zpool sync $TESTPOOL + +# +# Offline the fourth disk. +# +log_must zpool offline $TESTPOOL $disk3 + +# +# Verify the pool is DEGRADED. 
+# +log_must check_state $TESTPOOL "" "degraded" +log_must check_state $TESTPOOL $disk3 "offline" + +# +# Verify all file checksums still match while degraded. +# +atfile=0 +typeset -i failedcount=0 +while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + fi + (( atfile = atfile + 1 )) +done + +if (( failedcount > 0 )); then + log_fail "$failedcount of $FILE_COUNT files had wrong checksums" \ + "while pool was degraded" +fi + +# +# Online the disk back. +# +log_must zpool online $TESTPOOL $disk3 + +# +# Wait for resilver to complete. +# +for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 +done + +# +# Verify the pool is back to ONLINE. +# +log_must check_state $TESTPOOL "" "online" + +# +# Run scrub and verify no errors. +# +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +log_pass "Offlining a disk from an anyraidz1:2 pool with surplus disks preserves data" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anyraidz_002_neg.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anyraidz_002_neg.ksh new file mode 100755 index 000000000000..eff92ea05e36 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_anyraidz_002_neg.ksh @@ -0,0 +1,133 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# Offlining too many disks from an anyraidz1:2 pool is rejected. +# An anyraidz1:2 pool with 3 disks (minimum: parity=1 + data=2) +# can tolerate 1 offline disk, but offlining a second must fail. +# This test is self-contained. +# +# STRATEGY: +# 1. Create an anyraidz1:2 pool with 3 disks (minimum). +# 2. Write data and record xxh128 checksums. +# 3. Offline one disk (succeeds, pool DEGRADED). +# 4. Attempt to offline a second disk (must fail). +# 5. Verify data integrity while degraded. +# 6. Online the first disk back, verify pool returns to ONLINE. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_must delete_sparse_files +} + +log_onexit cleanup + +log_assert "Offlining too many disks from a minimum anyraidz1:2 pool is rejected" + +# +# Create sparse disk files and pool with minimum disks. +# anyraidz1:2 needs parity(1) + data(2) = 3 disks minimum. +# +log_must create_sparse_files "disk" 3 $DEVSIZE +log_must zpool create -f $TESTPOOL anyraidz1:2 $disk0 $disk1 $disk2 + +# +# Write files and record checksums. +# +typeset -i atfile=0 +set -A files +set -A cksums +typeset newcksum + +while (( atfile < FILE_COUNT )); do + files[$atfile]=/$TESTPOOL/file.$atfile + log_must file_write -o create -f ${files[$atfile]} \ + -b $FILE_SIZE -c 1 + cksums[$atfile]=$(xxh128digest ${files[$atfile]}) + (( atfile = atfile + 1 )) +done + +log_must zpool sync $TESTPOOL + +# +# Offline the first disk. 
This should succeed because parity=1 +# can tolerate 1 offline disk. +# +log_must zpool offline $TESTPOOL $disk0 +log_must check_state $TESTPOOL "" "degraded" +log_must check_state $TESTPOOL $disk0 "offline" + +# +# Attempt to offline the second disk. This must fail because +# it would exceed the parity tolerance. +# +log_mustnot zpool offline $TESTPOOL $disk1 + +# +# Verify all file checksums still match while degraded. +# +atfile=0 +typeset -i failedcount=0 +while (( atfile < FILE_COUNT )); do + newcksum=$(xxh128digest ${files[$atfile]}) + if [[ $newcksum != ${cksums[$atfile]} ]]; then + (( failedcount = failedcount + 1 )) + fi + (( atfile = atfile + 1 )) +done + +if (( failedcount > 0 )); then + log_fail "$failedcount of $FILE_COUNT files had wrong checksums" \ + "while pool was degraded" +fi + +# +# Online the first disk back. +# +log_must zpool online $TESTPOOL $disk0 + +# +# Wait for resilver to complete. +# +for i in {1..60}; do + check_state $TESTPOOL "" "online" && break + sleep 1 +done + +# +# Verify the pool is back to ONLINE. +# +log_must check_state $TESTPOOL "" "online" + +log_pass "Offlining too many disks from a minimum anyraidz1:2 pool is rejected" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_write_online_resilver.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_write_online_resilver.ksh new file mode 100755 index 000000000000..e44bc56eda3e --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_offline_write_online_resilver.ksh @@ -0,0 +1,130 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025, Klara, Inc.
+#
+
+. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib
+
+#
+# DESCRIPTION:
+# AnyRAID mirror can resilver a disk after it gets back online.
+#
+# STRATEGY:
+# 1. Offline one disk.
+# 2. Write to the pool.
+# 3. Get that disk back online.
+# 4. Get it resilvered.
+#
+
+verify_runnable "global"
+
+log_assert "AnyRAID mirror can resilver a disk after it gets back online"
+
+cleanup() {
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+	# Remove the sparse backing files, matching the cleanup done by
+	# the other anyraid tests that call create_sparse_files.
+	log_must delete_sparse_files
+}
+
+log_onexit cleanup
+
+# 1-parity
+
+for vdev in "anymirror1" "anyraidz1:1"; do
+	log_must create_sparse_files "disk" 3 $DEVSIZE
+	log_must zpool create -f $TESTPOOL $vdev $disks
+
+	log_must zpool offline $TESTPOOL $disk0
+	log_must check_state $TESTPOOL $disk0 "offline"
+	log_must check_state $TESTPOOL "" "degraded"
+
+	# Write while degraded, then bring the disk back.
+	log_must file_write -o create -f /$TESTPOOL/file.bin -b 1048576 -c 128 -d R
+	log_must zpool online $TESTPOOL $disk0
+	log_must check_state $TESTPOOL $disk0 "online"
+	# Poll for up to 60 seconds for the pool to return to ONLINE.
+	for i in {1..60}; do
+		check_state $TESTPOOL "" "online" && break
+		sleep 1
+	done
+	zpool status
+	log_must check_state $TESTPOOL "" "online"
+
+	log_must destroy_pool $TESTPOOL
+done
+
+
+# anymirror2
+
+log_must create_sparse_files "disk" 5 $DEVSIZE
+log_must zpool create -f $TESTPOOL anymirror2 $disks
+
+log_must zpool offline $TESTPOOL $disk0
+log_must zpool offline $TESTPOOL $disk1
+log_must check_state $TESTPOOL $disk0 "offline"
+log_must check_state $TESTPOOL $disk1 "offline"
+log_must check_state $TESTPOOL "" "degraded"
+
+# Write while two disks are offline, then bring both back;
+# resilver is expected to start once the disks return.
+log_must file_write -o create -f /$TESTPOOL/file.bin -b 1048576 -c 128 -d R
+log_must zpool online $TESTPOOL $disk0
+log_must zpool online $TESTPOOL $disk1
+log_must check_state $TESTPOOL $disk0 "online"
+log_must check_state $TESTPOOL $disk1 "online"
+# Poll for up to 60 seconds for the pool to return to ONLINE.
+for i in {1..60}; do
+	check_state $TESTPOOL "" "online" && break
+	sleep 1
+done
+# Informational only: record pool status in the test log.
+zpool status
+log_must check_state $TESTPOOL "" "online"
+
+log_must destroy_pool $TESTPOOL
+
+
+# anymirror3
+
+log_must create_sparse_files "disk" 7 $DEVSIZE
+log_must zpool create -f $TESTPOOL anymirror3 $disks
+
+# Offline three disks -- the maximum anymirror3 (3-parity) tolerates.
+log_must zpool offline $TESTPOOL $disk0
+log_must zpool offline $TESTPOOL $disk1
+log_must zpool offline $TESTPOOL $disk2
+log_must check_state $TESTPOOL $disk0 "offline"
+log_must check_state $TESTPOOL $disk1 "offline"
+log_must check_state $TESTPOOL $disk2 "offline"
+log_must check_state $TESTPOOL "" "degraded"
+
+# Write while degraded, then bring all three disks back online.
+log_must file_write -o create -f /$TESTPOOL/file.bin -b 1048576 -c 128 -d R
+log_must zpool online $TESTPOOL $disk0
+log_must zpool online $TESTPOOL $disk1
+log_must zpool online $TESTPOOL $disk2
+log_must check_state $TESTPOOL $disk0 "online"
+log_must check_state $TESTPOOL $disk1 "online"
+log_must check_state $TESTPOOL $disk2 "online"
+# Poll for up to 60 seconds for the pool to return to ONLINE.
+for i in {1..60}; do
+	check_state $TESTPOOL "" "online" && break
+	sleep 1
+done
+# Informational only: record pool status in the test log.
+zpool status
+log_must check_state $TESTPOOL "" "online"
+
+log_must destroy_pool $TESTPOOL
+
+log_pass "AnyRAID mirror can resilver a disk after it gets back online"
diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_properties_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_properties_001_pos.ksh
new file mode 100755
index 000000000000..e5b8ad3e87e2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_properties_001_pos.ksh
@@ -0,0 +1,202 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that zpool get and zpool status report correct properties +# for AnyRAID vdevs. Check that feature@anyraid is active, that +# feature@physical_rewrite transitions from enabled to active after +# a rewrite, that zpool status output correctly shows the AnyRAID +# vdev structure, and that pool size/free/allocated are reported +# correctly. This test is self-contained and does not depend on any +# other test. +# +# STRATEGY: +# 1. Create an anymirror1 pool with 3 disks. +# 2. Verify feature@anyraid is active via zpool get. +# 3. Verify feature@physical_rewrite is enabled on fresh pool. +# 4. Verify zpool status shows anymirror1-0 vdev name. +# 5. Verify pool size, free, and allocated are reported correctly. +# 6. Write data, verify allocated increases. +# 7. Run zfs rewrite -P, verify feature@physical_rewrite becomes active. +# 8. Repeat with an anyraidz1:2 pool to verify raidz-style naming. 
+# + +verify_runnable "global" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3} +} + +log_onexit cleanup + +log_assert "zpool get and zpool status report correct properties for AnyRAID" + +# +# Create backing files and set tile size. +# +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2,3} +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +# --------------------------------------------------------------- +# Test 1: anymirror1 pool properties +# --------------------------------------------------------------- +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2} + +# +# Verify feature@anyraid is active. +# +typeset anyraid_feature=$(zpool get -H -o value feature@anyraid $TESTPOOL) +[[ "$anyraid_feature" == "active" ]] || \ + log_fail "feature@anyraid should be active, got: $anyraid_feature" + +# +# Verify feature@physical_rewrite is enabled on a fresh pool +# (per-dataset feature, refcount=0 until a rewrite occurs). +# +typeset physrw_feature=$(zpool get -H -o value feature@physical_rewrite $TESTPOOL) +[[ "$physrw_feature" == "enabled" ]] || \ + log_fail "feature@physical_rewrite should be enabled on fresh pool, got: $physrw_feature" + +# +# Verify zpool status shows the anymirror1-0 vdev name. +# +typeset status_output=$(zpool status $TESTPOOL) + +echo "$status_output" | grep -q "anymirror1-0" || \ + log_fail "zpool status should show anymirror1-0 vdev" + +# +# Verify all 3 disks appear in the status output. +# +echo "$status_output" | grep -q "vdev_file.0" || \ + log_fail "vdev_file.0 not found in zpool status" +echo "$status_output" | grep -q "vdev_file.1" || \ + log_fail "vdev_file.1 not found in zpool status" +echo "$status_output" | grep -q "vdev_file.2" || \ + log_fail "vdev_file.2 not found in zpool status" + +# +# Verify pool health is ONLINE. 
+# +typeset health=$(zpool get -H -o value health $TESTPOOL) +[[ "$health" == "ONLINE" ]] || \ + log_fail "Pool health should be ONLINE, got: $health" + +# +# Verify pool size is non-zero. +# +typeset pool_size=$(zpool get -H -o value -p size $TESTPOOL) +(( pool_size > 0 )) || \ + log_fail "Pool size should be > 0, got: $pool_size" + +# +# Record initial allocated value, write data, verify it increases. +# +typeset alloc_before=$(zpool get -H -o value -p allocated $TESTPOOL) + +log_must file_write -o create -b 1048576 -c 8 -d 'R' \ + -f /$TESTPOOL/proptest_file + +log_must zpool sync $TESTPOOL + +typeset alloc_after=$(zpool get -H -o value -p allocated $TESTPOOL) +(( alloc_after > alloc_before )) || \ + log_fail "Allocated should increase after write: before=$alloc_before after=$alloc_after" + +# +# Verify free space is reported and is less than total size. +# +typeset pool_free=$(zpool get -H -o value -p free $TESTPOOL) +(( pool_free > 0 )) || \ + log_fail "Free space should be > 0, got: $pool_free" +(( pool_free < pool_size )) || \ + log_fail "Free space should be less than pool size" + +# +# Perform a physical rewrite and verify feature@physical_rewrite +# transitions from enabled to active. +# +log_must zfs rewrite -P /$TESTPOOL/proptest_file +log_must zpool sync $TESTPOOL + +physrw_feature=$(zpool get -H -o value feature@physical_rewrite $TESTPOOL) +[[ "$physrw_feature" == "active" ]] || \ + log_fail "feature@physical_rewrite should be active after rewrite, got: $physrw_feature" + +log_must destroy_pool $TESTPOOL + +# --------------------------------------------------------------- +# Test 2: anyraidz1:2 pool properties +# --------------------------------------------------------------- +log_must create_pool $TESTPOOL anyraidz1:2 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3} + +# +# Verify feature@anyraid is active. 
+# +anyraid_feature=$(zpool get -H -o value feature@anyraid $TESTPOOL) +[[ "$anyraid_feature" == "active" ]] || \ + log_fail "feature@anyraid should be active on raidz pool, got: $anyraid_feature" + +# +# Verify zpool status shows the anyraidz vdev name. +# +status_output=$(zpool status $TESTPOOL) + +echo "$status_output" | grep -q "anyraidz.*-0" || \ + log_fail "zpool status should show anyraidz vdev name" + +# +# Verify all 4 disks appear in the status output. +# +echo "$status_output" | grep -q "vdev_file.0" || \ + log_fail "vdev_file.0 not found in zpool status" +echo "$status_output" | grep -q "vdev_file.3" || \ + log_fail "vdev_file.3 not found in zpool status" + +# +# Verify pool health is ONLINE. +# +health=$(zpool get -H -o value health $TESTPOOL) +[[ "$health" == "ONLINE" ]] || \ + log_fail "Raidz pool health should be ONLINE, got: $health" + +# +# Run scrub and verify no errors on raidz pool. +# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true + +log_pass "zpool get and zpool status report correct properties for AnyRAID" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_001.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_001.ksh new file mode 100755 index 000000000000..09ec1beba35c --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_001.ksh @@ -0,0 +1,84 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Anyraid rebalance works correctly
+#
+# STRATEGY:
+# 1. Create an anymirror1 vdev with several small disks
+# 2. Fill the small disks
+# 3. Attach a larger disk
+# 4. Rebalance the vdev
+# 5. Verify that available space has increased after completion
+# 6. Verify that scrub found no errors
+#
+
+verify_runnable "global"
+
+cleanup() {
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+	set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824
+	rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5}
+}
+
+log_onexit cleanup
+
+log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4}
+log_must truncate -s 10G $TEST_BASE_DIR/vdev_file.5
+set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864
+
+log_assert "Anyraid rebalance works correctly"
+
+log_must create_pool $TESTPOOL anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2,3,4}
+
+log_must file_write -o create -b 1048576 -c 950 -d 'R' -f /$TESTPOOL/f1
+
+cap=$(zpool get -Hp -o value size $TESTPOOL)
+[[ "$cap" -eq $((17 * 64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space for anyraid vdev: $cap"
+
+log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.5
+log_must zpool rebalance $TESTPOOL anymirror1-0
+cap=$(zpool get -Hp -o value size $TESTPOOL)
+[[ "$cap" -eq $((18 * 64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space for anyraid vdev: $cap"
+log_must zpool wait -t anyraid_relocate,scrub $TESTPOOL
+log_must zpool sync $TESTPOOL
+
+cap=$(zpool get -Hp -o value size $TESTPOOL)
+[[ "$cap" -eq $((35 * 64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space for anyraid vdev: $cap"
+
+log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +[[ "$cksum_count" -eq 0 ]] || log_fail "checksum errors detected" + +log_pass "Anyraid rebalance works correctly" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_002.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_002.ksh new file mode 100755 index 000000000000..63fe0290f729 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_002.ksh @@ -0,0 +1,100 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Anyraid rebalance works correctly when paused and resumed +# +# STRATEGY: +# 1. Create an anymirror1 vdev with several small disks +# 2. Fill the small disks +# 3. Attach a larger disk +# 4. Rebalance the vdev +# 5. Pause the rebalance +# 6. Export and import the pool +# 7. Resume the rebalance +# 8. Verify that available space has increased after completion +# 9. 
Verify that scrub found no errors
+#
+
+verify_runnable "global"
+
+cleanup() {
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+	set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824
+	restore_tunable ANYRAID_RELOCATE_MAX_BYTES_PAUSE
+	rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5}
+}
+
+log_onexit cleanup
+
+log_must truncate -s 775M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4}
+log_must truncate -s 1088M $TEST_BASE_DIR/vdev_file.5
+set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864
+save_tunable ANYRAID_RELOCATE_MAX_BYTES_PAUSE
+set_tunable64 ANYRAID_RELOCATE_MAX_BYTES_PAUSE $((16 * 1024 * 1024))
+
+log_assert "Anyraid rebalance works correctly when paused and resumed"
+
+log_must create_pool $TESTPOOL anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2,3,4}
+
+log_must file_write -o create -b 1048576 -c 600 -d 'R' -f /$TESTPOOL/f1
+
+cap=$(zpool get -Hp -o value size $TESTPOOL)
+[[ "$cap" -eq $((20 * 64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space for anyraid vdev: $cap"
+
+log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.5
+log_must file_write -o create -b 1048576 -c 240 -d 'R' -f /$TESTPOOL/f2
+
+
+log_must zpool rebalance $TESTPOOL anymirror1-0
+cap=$(zpool get -Hp -o value size $TESTPOOL)
+[[ "$cap" -eq $((26 * 64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space for anyraid vdev: $cap"
+
+log_must sleep 1
+log_must zpool export $TESTPOOL
+log_must zpool import -d $TEST_BASE_DIR $TESTPOOL
+
+set_tunable64 ANYRAID_RELOCATE_MAX_BYTES_PAUSE 0
+
+log_must zpool wait -t anyraid_relocate,scrub $TESTPOOL
+log_must zpool sync $TESTPOOL
+
+cap=$(zpool get -Hp -o value size $TESTPOOL)
+[[ "$cap" -eq $((26 * 64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space for anyraid vdev: $cap"
+
+log_must check_pool_status $TESTPOOL state ONLINE true
+log_must is_pool_scrubbed $TESTPOOL true
+cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l)
+[[ "$cksum_count" -eq 0 ]] || log_fail "checksum errors detected"
+
+log_pass "Anyraid rebalance works correctly when paused and resumed"
diff
--git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_003.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_003.ksh new file mode 100755 index 000000000000..62123b3a8f18 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_003.ksh @@ -0,0 +1,105 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Anyraid rebalance with a same-size disk preserves data integrity +# +# STRATEGY: +# 1. Create an anymirror1 vdev with equal-sized disks +# 2. Write data and record checksums +# 3. Attach a new disk of the same size +# 4. Run rebalance +# 5. Verify all data checksums are unchanged +# 6. 
Verify that scrub found no errors
+#
+
+verify_runnable "global"
+
+cleanup() {
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+	set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824
+	rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5}
+}
+
+log_onexit cleanup
+
+log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5}
+set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864
+
+log_assert "Anyraid rebalance with a same-size disk preserves data integrity"
+
+log_must create_pool $TESTPOOL anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2,3,4}
+
+#
+# Write several files and record their checksums before rebalance.
+#
+typeset -i file_count=10
+typeset -i idx=0
+set -A cksums
+
+while (( idx < file_count )); do
+	log_must file_write -o create -b 1048576 -c 1 -d 'R' \
+	    -f /$TESTPOOL/file.$idx
+	cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx)
+	(( idx = idx + 1 ))
+done
+
+cap_before=$(zpool get -Hp -o value size $TESTPOOL)
+
+#
+# Attach a same-size disk and rebalance. Since the new disk is the
+# same size as the existing disks, the capacity increase should be
+# modest (one more disk contributing tiles).
+#
+log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.5
+log_must zpool rebalance $TESTPOOL anymirror1-0
+log_must zpool wait -t anyraid_relocate,scrub $TESTPOOL
+log_must zpool sync $TESTPOOL
+
+#
+# Verify all file checksums are unchanged.
+#
+idx=0
+while (( idx < file_count )); do
+	newcksum=$(xxh128digest /$TESTPOOL/file.$idx)
+	[[ "$newcksum" == "${cksums[$idx]}" ]] || \
+	    log_fail "Checksum mismatch for file.$idx: expected=${cksums[$idx]} got=$newcksum"
+	(( idx = idx + 1 ))
+done
+
+#
+# Verify pool health and no checksum errors.
+# +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +[[ "$cksum_count" -eq 0 ]] || log_fail "checksum errors detected" + +log_pass "Anyraid rebalance with a same-size disk preserves data integrity" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_004.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_004.ksh new file mode 100755 index 000000000000..80792e335f2a --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_004.ksh @@ -0,0 +1,113 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Anyraid rebalance works correctly after attaching multiple new disks +# +# STRATEGY: +# 1. Create an anymirror1 vdev with 3 small disks +# 2. Write data and record checksums +# 3. Attach 2 larger disks +# 4. Rebalance the vdev +# 5. Verify that available space has increased after completion +# 6. Verify all data checksums are unchanged +# 7. 
Verify that scrub found no errors +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} +} + +log_onexit cleanup + +log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2} +log_must truncate -s 10G $TEST_BASE_DIR/vdev_file.{3,4} +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Anyraid rebalance works correctly after attaching multiple new disks" + +log_must create_pool $TESTPOOL anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2} + +# +# Write several files and record their checksums before attaching new disks. +# +typeset -i file_count=10 +typeset -i idx=0 +set -A cksums + +while (( idx < file_count )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + (( idx = idx + 1 )) +done + +cap_before=$(zpool get -Hp -o value size $TESTPOOL) + +# +# Attach two larger disks and then rebalance. +# +log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.3 +log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.4 +log_must zpool rebalance $TESTPOOL anymirror1-0 +log_must zpool wait -t anyraid_relocate,scrub $TESTPOOL +log_must zpool sync $TESTPOOL + +# +# Verify capacity increased after rebalancing with the new disks. +# +cap_after=$(zpool get -Hp -o value size $TESTPOOL) +[[ "$cap_after" -gt "$cap_before" ]] || \ + log_fail "Capacity did not increase after rebalance: before=$cap_before after=$cap_after" + +# +# Verify all file checksums are unchanged. +# +idx=0 +while (( idx < file_count )); do + newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for file.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +# +# Verify pool health and no checksum errors. 
+# +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +[[ "$cksum_count" -eq 0 ]] || log_fail "checksum errors detected" + +log_pass "Anyraid rebalance works correctly after attaching multiple new disks" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_005.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_005.ksh new file mode 100755 index 000000000000..354f336375c9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_005.ksh @@ -0,0 +1,124 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Anyraid rebalance with a concurrent scrub does not cause errors +# or data corruption. The rebalance operation automatically triggers +# a scrub upon completion. This test fills the pool with enough data +# to make the rebalance take significant time, then issues an explicit +# scrub immediately after starting the rebalance. 
If the explicit scrub +# fails because a scrub is already in progress (auto-started by the +# rebalance), that is acceptable. The test verifies data integrity and +# pool health after both operations complete. +# +# STRATEGY: +# 1. Create an anymirror1 vdev with several small disks +# 2. Fill with substantial data and record checksums +# 3. Attach a larger disk +# 4. Start rebalance +# 5. Immediately attempt to start a scrub +# 6. Wait for rebalance and scrub to complete +# 7. Verify all data checksums are unchanged +# 8. Verify pool health and no checksum errors +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5} +} + +log_onexit cleanup + +log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} +log_must truncate -s 10G $TEST_BASE_DIR/vdev_file.5 + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Scrub during anyraid rebalance does not cause errors or corruption" + +log_must create_pool $TESTPOOL anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +# +# Write substantial data and record checksums. Use enough data so that +# the rebalance takes measurable time. +# +typeset -i file_count=10 +typeset -i idx=0 +set -A cksums + +while (( idx < file_count )); do + log_must file_write -o create -b 1048576 -c 50 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + (( idx = idx + 1 )) +done + +# +# Attach a larger disk and start rebalance. +# +log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.5 +log_must zpool rebalance $TESTPOOL anymirror1-0 + +# +# Immediately attempt a scrub. The rebalance may auto-start a scrub, +# so this may fail with "currently scrubbing" which is acceptable. +# Either way, a scrub will run concurrently with the rebalance. +# +zpool scrub $TESTPOOL +if [[ $? 
-ne 0 ]]; then + log_note "Scrub already in progress (auto-started by rebalance), continuing" +fi + +log_must zpool wait -t anyraid_relocate,scrub $TESTPOOL +log_must zpool sync $TESTPOOL + +# +# Verify all file checksums are unchanged. +# +idx=0 +while (( idx < file_count )); do + newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for file.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +# +# Verify pool health and no checksum errors. +# +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +[[ "$cksum_count" -eq 0 ]] || log_fail "checksum errors detected" + +log_pass "Scrub during anyraid rebalance does not cause errors or corruption" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_006.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_006.ksh new file mode 100755 index 000000000000..da6cad0d8483 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_006.ksh @@ -0,0 +1,130 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Replacing a disk while an anyraid rebalance is running (or has just +# completed) is handled correctly. The pool recovers to ONLINE state +# and all data remains intact. +# +# STRATEGY: +# 1. Create an anymirror1 vdev with several small disks +# 2. Fill with substantial data and record checksums +# 3. Attach a larger disk +# 4. Start rebalance +# 5. Fail one original disk (truncate to 0) +# 6. Replace the failed disk with a spare +# 7. Wait for all operations to complete (rebalance, resilver, scrub) +# 8. Verify pool returns to ONLINE state +# 9. Verify all data checksums are unchanged +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5} + rm -f $TEST_BASE_DIR/vdev_spare +} + +log_onexit cleanup + +log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} +log_must truncate -s 10G $TEST_BASE_DIR/vdev_file.5 +log_must truncate -s 768M $TEST_BASE_DIR/vdev_spare + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Disk replace during anyraid rebalance is handled correctly" + +log_must create_pool $TESTPOOL anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +# +# Write substantial data and record checksums. +# +typeset -i file_count=10 +typeset -i idx=0 +set -A cksums + +while (( idx < file_count )); do + log_must file_write -o create -b 1048576 -c 50 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + (( idx = idx + 1 )) +done + +# +# Attach a larger disk and start rebalance. 
+# +log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.5 +log_must zpool rebalance $TESTPOOL anymirror1-0 + +# +# Fail one of the original disks by truncating it to zero. +# This simulates a disk failure during or immediately after rebalance. +# +log_must truncate -s 0 $TEST_BASE_DIR/vdev_file.1 + +# +# Replace the failed disk with a spare. If replace fails during +# active rebalance, wait for rebalance to finish and retry. +# +zpool replace $TESTPOOL $TEST_BASE_DIR/vdev_file.1 $TEST_BASE_DIR/vdev_spare +if [[ $? -ne 0 ]]; then + log_note "Replace failed during rebalance, waiting and retrying" + log_must zpool wait -t anyraid_relocate $TESTPOOL + log_must zpool replace $TESTPOOL $TEST_BASE_DIR/vdev_file.1 $TEST_BASE_DIR/vdev_spare +fi + +log_must zpool wait -t anyraid_relocate,resilver,scrub $TESTPOOL +log_must zpool sync $TESTPOOL + +# +# Clear any errors from the failed disk so pool can return to ONLINE. +# +log_must zpool clear $TESTPOOL + +# +# Verify all file checksums are unchanged. +# +idx=0 +while (( idx < file_count )); do + newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for file.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +# +# Verify pool health. +# +log_must check_pool_status $TESTPOOL state ONLINE true + +log_pass "Disk replace during anyraid rebalance is handled correctly" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_007.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_007.ksh new file mode 100755 index 000000000000..c8222e4ad82a --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_007.ksh @@ -0,0 +1,135 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Interrupting an anyraid rebalance via export/import and then +# restarting it completes successfully with all data intact. +# +# STRATEGY: +# 1. Create an anymirror1 vdev with several small disks +# 2. Fill with substantial data and record checksums +# 3. Attach a larger disk +# 4. Start rebalance +# 5. Interrupt the rebalance by exporting the pool +# 6. Re-import the pool +# 7. Verify pool is healthy after interrupted rebalance +# 8. Restart rebalance +# 9. Wait for completion +# 10. Verify all data checksums are unchanged +# 11. Verify pool health and no checksum errors +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5} +} + +log_onexit cleanup + +log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} +log_must truncate -s 10G $TEST_BASE_DIR/vdev_file.5 + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Interrupted anyraid rebalance can be restarted and completes correctly" + +log_must create_pool $TESTPOOL anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +# +# Write substantial data and record checksums. 
+# +typeset -i file_count=10 +typeset -i idx=0 +set -A cksums + +while (( idx < file_count )); do + log_must file_write -o create -b 1048576 -c 50 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + (( idx = idx + 1 )) +done + +# +# Attach a larger disk and start rebalance. +# +log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.5 +log_must zpool rebalance $TESTPOOL anymirror1-0 + +# +# Interrupt the rebalance by exporting the pool. The rebalance may +# already be complete by the time we export (since it can be fast), +# but the export/import cycle still validates the interrupted path. +# +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true + +# +# Restart the rebalance. If the rebalance already completed before +# the export, this may fail or be a no-op. Handle both cases. +# NOTE: Known bug - rebalance with no pending work hangs. Only +# restart if the rebalance was actually interrupted. +# +zpool rebalance $TESTPOOL anymirror1-0 +rebalance_rc=$? +if [[ $rebalance_rc -eq 0 ]]; then + log_must zpool wait -t anyraid_relocate,scrub $TESTPOOL +else + log_note "Rebalance restart returned $rebalance_rc, rebalance may have completed before export" +fi + +log_must zpool sync $TESTPOOL + +# +# Wait for any auto-started scrub to finish. +# +zpool wait -t scrub $TESTPOOL + +# +# Verify all file checksums are unchanged. +# +idx=0 +while (( idx < file_count )); do + newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for file.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +# +# Verify pool health. 
+# +log_must check_pool_status $TESTPOOL state ONLINE true + +log_pass "Interrupted anyraid rebalance can be restarted and completes correctly" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_008.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_008.ksh new file mode 100755 index 000000000000..53dd6fe93aad --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_008.ksh @@ -0,0 +1,146 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Resilvering a disk during an AnyRAID rebalance is handled +# correctly. The rebalance pauses, and then resumes after the resilver +# completes. After the rebalance completes, the pool recovers to ONLINE +# state and data integrity is preserved. This test is self-contained +# and does not depend on any other test. +# +# STRATEGY: +# 1. Create an anymirror1 pool with several small disks. +# 2. Fill with substantial data and record checksums. +# 3. Attach a larger disk and start rebalance. +# 4. Resilver one of the original disks during rebalance. +# 5. 
Verify pool handles the disk replacement gracefully. +# 6. Wait for rebalance and any resilver/scrub to complete. +# 7. Verify pool recovers to ONLINE state. +# 8. Verify all data checksums are unchanged. +# + +verify_runnable "global" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5,6} +} + +log_onexit cleanup + +log_assert "Disk resilver during anyraid rebalance is handled correctly" + +# +# Create backing files. +# +log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,6} +log_must truncate -s 10G $TEST_BASE_DIR/vdev_file.5 +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +# +# Create pool and write substantial data. +# +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +typeset -i file_count=10 +typeset -i idx=0 +set -A cksums + +while (( idx < file_count )); do + log_must file_write -o create -b 1048576 -c 50 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + (( idx = idx + 1 )) +done + +log_must zpool sync $TESTPOOL + +# +# Attach a larger disk and start rebalance. +# +log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.5 +log_must zpool rebalance $TESTPOOL anymirror1-0 + +# +# Offline one of the original disks during rebalance, then replace it. +# Use a brief sleep to let rebalance get underway. +# +sleep 1 + +log_must zpool offline $TESTPOOL $TEST_BASE_DIR/vdev_file.2 + +log_must zpool replace $TESTPOOL $TEST_BASE_DIR/vdev_file.2 $TEST_BASE_DIR/vdev_file.6 + +# +# Wait for the rebalance and the resilver to complete (the rebalance may +# continue in degraded mode, or may have finished before the replacement). +# +log_must zpool wait -t anyraid_relocate,resilver $TESTPOOL + +# +# Clear any transient errors from the resilver. +# +log_must zpool clear $TESTPOOL + +# +# Verify the pool is ONLINE. 
+# +log_must check_pool_status $TESTPOOL state ONLINE true + +# +# Verify all file checksums are unchanged. +# +idx=0 +typeset -i failedcount=0 +while (( idx < file_count )); do + typeset newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + if [[ "$newcksum" != "${cksums[$idx]}" ]]; then + (( failedcount = failedcount + 1 )) + fi + (( idx = idx + 1 )) +done + +if (( failedcount > 0 )); then + log_fail "$failedcount of $file_count files had wrong checksums" \ + "after resilver during rebalance" +fi + +# +# Final scrub to confirm no lingering errors. +# +zpool scrub $TESTPOOL +if (( $? != 0 )); then + log_note "Scrub already in progress, waiting" +fi +zpool wait -t scrub $TESTPOOL 2>/dev/null + +log_pass "Disk resilver during anyraid rebalance is handled correctly" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_009.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_009.ksh new file mode 100755 index 000000000000..759784530bf8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_rebalance_009.ksh @@ -0,0 +1,148 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Taking a disk offline during an AnyRAID rebalance is handled +# correctly. The rebalance either completes or pauses gracefully. +# After bringing the disk back online, the pool recovers to ONLINE +# state and data integrity is preserved. This test is self-contained +# and does not depend on any other test. +# +# STRATEGY: +# 1. Create an anymirror1 pool with several small disks. +# 2. Fill with substantial data and record checksums. +# 3. Attach a larger disk and start rebalance. +# 4. Offline one of the original disks during rebalance. +# 5. Verify pool handles the offline gracefully. +# 6. Online the disk back. +# 7. Verify pool recovers to ONLINE state. +# 8. Verify all data checksums are unchanged. +# + +verify_runnable "global" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5} +} + +log_onexit cleanup + +log_assert "Disk offline during anyraid rebalance is handled correctly" + +# +# Create backing files. +# +log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} +log_must truncate -s 10G $TEST_BASE_DIR/vdev_file.5 +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +# +# Create pool and write substantial data. +# +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +typeset -i file_count=10 +typeset -i idx=0 +set -A cksums + +while (( idx < file_count )); do + log_must file_write -o create -b 1048576 -c 50 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + (( idx = idx + 1 )) +done + +log_must zpool sync $TESTPOOL + +# +# Attach a larger disk and start rebalance. +# +log_must zpool attach $TESTPOOL anymirror1-0 $TEST_BASE_DIR/vdev_file.5 +log_must zpool rebalance $TESTPOOL anymirror1-0 + +# +# Offline one of the original disks during rebalance. +# Use a brief sleep to let rebalance get underway. 
+# +sleep 1 + +log_must zpool offline $TESTPOOL $TEST_BASE_DIR/vdev_file.2 + +sleep 1 + +log_must zpool online $TESTPOOL $TEST_BASE_DIR/vdev_file.2 + +# +# Wait for the rebalance to complete (it may continue in degraded +# mode, or it may have already finished before the offline took effect). +# +log_must zpool wait -t anyraid_relocate $TESTPOOL + +# +# Clear any transient errors from the offline/online cycle. +# +log_must zpool clear $TESTPOOL + +# +# Verify the pool is ONLINE. +# +log_must check_pool_status $TESTPOOL state ONLINE true + +# +# Verify all file checksums are unchanged. +# +idx=0 +typeset -i failedcount=0 +while (( idx < file_count )); do + typeset newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + if [[ "$newcksum" != "${cksums[$idx]}" ]]; then + (( failedcount = failedcount + 1 )) + fi + (( idx = idx + 1 )) +done + +if (( failedcount > 0 )); then + log_fail "$failedcount of $file_count files had wrong checksums" \ + "after offline during rebalance" +fi + +# +# Final scrub to confirm no lingering errors. +# +zpool scrub $TESTPOOL +if (( $? != 0 )); then + log_note "Scrub already in progress, waiting" +fi +zpool wait -t scrub $TESTPOOL 2>/dev/null + +log_pass "Disk offline during anyraid rebalance is handled correctly" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_scrub_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_scrub_001_pos.ksh new file mode 100755 index 000000000000..40edc712db30 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_scrub_001_pos.ksh @@ -0,0 +1,135 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that scrub on an AnyRAID pool detects and repairs errors +# from a partially corrupted disk. Create an anymirror1 pool with +# 3 disks, write data, punch a small hole in one disk to cause +# checksum mismatches, run scrub, and verify scrub reports repaired +# blocks and data is still intact. +# This test is self-contained and does not depend on any other test. +# +# STRATEGY: +# 1. Create an anymirror1 pool with 3 disks. +# 2. Write data and record xxh128 checksums. +# 3. Sync and export/import to flush caches. +# 4. Punch a small hole in one disk (enough to cause checksum +# mismatches but not enough to lose data with parity=1). +# 5. Run scrub. +# 6. Verify scrub reports repaired blocks. +# 7. Verify all data checksums are still correct. +# + +verify_runnable "global" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2} +} + +log_onexit cleanup + +log_assert "Scrub on AnyRAID pool detects and repairs errors from corrupted disk" + +# +# Create backing files and set tile size. +# +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2} +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +# +# Create the pool. +# +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2} + +# +# Write files and record checksums. 
+# +set -A cksums +typeset -i idx=0 + +while (( idx < 10 )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/file.$idx + cksums[$idx]=$(xxh128digest /$TESTPOOL/file.$idx) + (( idx = idx + 1 )) +done + +# +# Sync the pool to ensure all data is on disk. +# +log_must zpool sync $TESTPOOL + +# +# Export and re-import to flush all caches. +# +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Punch a 64K hole at offset 512K in one disk to corrupt some blocks; +# mirror redundancy allows repair. NOTE(review): confirm this offset +# overlaps allocated data — vdev labels/boot block occupy the front. +# +log_must punch_hole $((64 * 1024 * 8)) $((64 * 1024)) \ + $TEST_BASE_DIR/vdev_file.1 + +# +# Run scrub to detect and repair the corruption. +# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +# +# Verify the pool is still ONLINE. +# +log_must check_pool_status $TESTPOOL state ONLINE true + +# +# Verify all data checksums are still correct (scrub should have +# repaired the corrupted blocks using parity data). 
+# +idx=0 +typeset -i failedcount=0 +while (( idx < 10 )); do + typeset newcksum=$(xxh128digest /$TESTPOOL/file.$idx) + if [[ "$newcksum" != "${cksums[$idx]}" ]]; then + (( failedcount = failedcount + 1 )) + fi + (( idx = idx + 1 )) +done + +if (( failedcount > 0 )); then + log_fail "$failedcount of 10 files had wrong checksums after scrub repair" +fi + +log_pass "Scrub on AnyRAID pool detects and repairs errors from corrupted disk" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_send_recv_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_send_recv_001_pos.ksh new file mode 100755 index 000000000000..3dde0530f99c --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_send_recv_001_pos.ksh @@ -0,0 +1,155 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify ZFS send/receive works with AnyRAID pool as source and +# destination. Create an anymirror1 source pool, write data, take +# a snapshot, send the snapshot to a second AnyRAID pool, and +# verify the received data matches the source checksums. 
+# This test is self-contained and does not depend on any other test. +# +# STRATEGY: +# 1. Create an anymirror1 source pool with 3 disks. +# 2. Write data and record xxh128 checksums. +# 3. Take a snapshot. +# 4. Create a second anymirror1 destination pool with 3 disks. +# 5. Send the snapshot to the destination pool via zfs send | zfs recv. +# 6. Verify received data matches the source checksums. +# 7. Run scrub on both pools, verify no errors. +# + +verify_runnable "global" + +SRC_POOL="${TESTPOOL}_src" +DST_POOL="${TESTPOOL}_dst" +SNAP_NAME="$SRC_POOL@send_snap" + +cleanup() { + poolexists $SRC_POOL && destroy_pool $SRC_POOL + poolexists $DST_POOL && destroy_pool $DST_POOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/src_vdev.{0,1,2} + rm -f $TEST_BASE_DIR/dst_vdev.{0,1,2} +} + +log_onexit cleanup + +log_assert "ZFS send/receive works correctly with AnyRAID pools" + +# +# Create backing files and set tile size. +# +log_must truncate -s 1G $TEST_BASE_DIR/src_vdev.{0,1,2} +log_must truncate -s 1G $TEST_BASE_DIR/dst_vdev.{0,1,2} +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +# +# Create the source pool. +# +log_must zpool create -f $SRC_POOL anymirror1 \ + $TEST_BASE_DIR/src_vdev.{0,1,2} + +# +# Write files and record checksums on the source pool. +# +set -A cksums +typeset -i idx=0 + +while (( idx < 8 )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$SRC_POOL/file.$idx + cksums[$idx]=$(xxh128digest /$SRC_POOL/file.$idx) + (( idx = idx + 1 )) +done + +# +# Also write a larger file. +# +log_must file_write -o create -b 1048576 -c 16 -d 'R' \ + -f /$SRC_POOL/largefile +typeset large_cksum=$(xxh128digest /$SRC_POOL/largefile) + +log_must zpool sync $SRC_POOL + +# +# Take a snapshot on the source pool. +# +log_must zfs snapshot $SNAP_NAME + +# +# Create the destination pool. +# +log_must zpool create -f $DST_POOL anymirror1 \ + $TEST_BASE_DIR/dst_vdev.{0,1,2} + +# +# Send the snapshot to the destination pool. 
+# +log_must eval "zfs send $SNAP_NAME | zfs recv $DST_POOL/received" + +# +# Verify received data checksums match the source. +# +idx=0 +while (( idx < 8 )); do + typeset newcksum=$(xxh128digest /$DST_POOL/received/file.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for received file.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +typeset new_large_cksum=$(xxh128digest /$DST_POOL/received/largefile) +[[ "$new_large_cksum" == "$large_cksum" ]] || \ + log_fail "Checksum mismatch for received largefile: expected=$large_cksum got=$new_large_cksum" + +# +# Run scrub on both pools and verify no errors. +# +log_must zpool scrub $SRC_POOL +log_must zpool wait -t scrub $SRC_POOL +log_must check_pool_status $SRC_POOL state ONLINE true +log_must is_pool_scrubbed $SRC_POOL true + +typeset src_cksum_count=$(zpool status -v $SRC_POOL | grep ONLINE | \ + awk 'NF > 2 && $5 != 0' | wc -l) +(( src_cksum_count == 0 )) || \ + log_fail "Checksum errors detected on source pool after scrub" + +log_must zpool scrub $DST_POOL +log_must zpool wait -t scrub $DST_POOL +log_must check_pool_status $DST_POOL state ONLINE true +log_must is_pool_scrubbed $DST_POOL true + +typeset dst_cksum_count=$(zpool status -v $DST_POOL | grep ONLINE | \ + awk 'NF > 2 && $5 != 0' | wc -l) +(( dst_cksum_count == 0 )) || \ + log_fail "Checksum errors detected on destination pool after scrub" + +log_pass "ZFS send/receive works correctly with AnyRAID pools" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_snapshot_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_snapshot_001_pos.ksh new file mode 100755 index 000000000000..35cfa7408df5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_snapshot_001_pos.ksh @@ -0,0 +1,150 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution 
License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify snapshots and rollbacks work correctly on AnyRAID pools. +# Write file A, take a snapshot, write file B, rollback to the +# snapshot, and verify file A exists while file B does not. +# This test is self-contained and does not depend on any other test. +# +# STRATEGY: +# 1. Create an anymirror1 pool with 3 disks. +# 2. Write file A and record its checksum. +# 3. Take a snapshot. +# 4. Write file B. +# 5. Rollback to the snapshot. +# 6. Verify file A exists with correct checksum. +# 7. Verify file B does not exist. +# 8. Export/import to verify persistence. +# 9. Run scrub, verify no errors. +# + +verify_runnable "global" + +SNAP_NAME="$TESTPOOL@snap1" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2} +} + +log_onexit cleanup + +log_assert "AnyRAID snapshots and rollbacks preserve data correctly" + +# +# Create backing files and set tile size. +# +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2} +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +# +# Create the pool. 
+# +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2} + +# +# Write file A and record its checksum. +# +log_must file_write -o create -b 1048576 -c 4 -d 'R' \ + -f /$TESTPOOL/file_a +typeset cksum_a=$(xxh128digest /$TESTPOOL/file_a) + +log_must zpool sync $TESTPOOL + +# +# Take a snapshot. +# +log_must zfs snapshot $SNAP_NAME + +# +# Write file B after the snapshot. +# +log_must file_write -o create -b 1048576 -c 4 -d 'R' \ + -f /$TESTPOOL/file_b + +[[ -f /$TESTPOOL/file_b ]] || \ + log_fail "file_b should exist before rollback" + +log_must zpool sync $TESTPOOL + +# +# Rollback to the snapshot. +# +log_must zfs rollback $SNAP_NAME + +# +# Verify file A still exists with the correct checksum. +# +[[ -f /$TESTPOOL/file_a ]] || \ + log_fail "file_a should exist after rollback" + +typeset new_cksum_a=$(xxh128digest /$TESTPOOL/file_a) +[[ "$new_cksum_a" == "$cksum_a" ]] || \ + log_fail "file_a checksum mismatch after rollback: expected=$cksum_a got=$new_cksum_a" + +# +# Verify file B does NOT exist after rollback. +# +[[ ! -f /$TESTPOOL/file_b ]] || \ + log_fail "file_b should not exist after rollback to snapshot" + +# +# Export/import to verify persistence of the rollback state. +# +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +[[ -f /$TESTPOOL/file_a ]] || \ + log_fail "file_a should exist after export/import" + +typeset reimport_cksum_a=$(xxh128digest /$TESTPOOL/file_a) +[[ "$reimport_cksum_a" == "$cksum_a" ]] || \ + log_fail "file_a checksum mismatch after reimport: expected=$cksum_a got=$reimport_cksum_a" + +[[ ! -f /$TESTPOOL/file_b ]] || \ + log_fail "file_b should not exist after export/import post-rollback" + +# +# Run scrub and verify no errors. 
+# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +typeset cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | \ + awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "Checksum errors detected after scrub" + +log_pass "AnyRAID snapshots and rollbacks preserve data correctly" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_001_pos.ksh new file mode 100755 index 000000000000..fe068024d9be --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_001_pos.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# Verify a variety of AnyRAID pools with a special VDEV mirror. +# +# STRATEGY: +# 1. Create an AnyRAID pool with a special VDEV mirror. +# 2. Write to it, sync. +# 3. Export and re-import the pool. +# 4. 
Verify that all the file contents are unchanged on the file system. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} +log_onexit cleanup + +log_assert "Verify a variety of AnyRAID pools with a special VDEV mirror" + +log_must create_sparse_files "disk" 4 $DEVSIZE +log_must create_sparse_files "sdisk" 2 $DEVSIZE + +typeset oldcksum +typeset newcksum +for parity in {0..3}; do + log_must zpool create -f $TESTPOOL anymirror$parity $disks special mirror $sdisks + log_must poolexists $TESTPOOL + log_must zfs set special_small_blocks=4k $TESTPOOL + + log_must file_write -o create -f /$TESTPOOL/file.bin -b 1048576 -c 1 + log_must file_write -o create -f /$TESTPOOL/small.bin -b 4096 -c 1 + oldcksum=$(xxh128digest /$TESTPOOL/file.bin) + oldsmallcksum=$(xxh128digest /$TESTPOOL/small.bin) + log_must zpool export $TESTPOOL + + log_must zpool import -d $(dirname $disk0) $TESTPOOL + newcksum=$(xxh128digest /$TESTPOOL/file.bin) + newsmallcksum=$(xxh128digest /$TESTPOOL/small.bin) + + log_must test "$oldcksum" = "$newcksum" + log_must test "$oldsmallcksum" = "$newsmallcksum" + + log_must destroy_pool $TESTPOOL +done + +log_pass "Verify a variety of AnyRAID pools with a special VDEV mirror" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_002_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_002_pos.ksh new file mode 100755 index 000000000000..55214cac243a --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_special_vdev_002_pos.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/tests/functional/anyraid/anyraid_common.kshlib + +# +# DESCRIPTION: +# Verify a variety of AnyRAID pools with a special VDEV AnyRAID. +# +# STRATEGY: +# 1. Create an AnyRAID pool with a special VDEV AnyRAID. +# 2. Write to it, sync. +# 3. Export and re-import the pool. +# 4. Verify that all the file contents are unchanged on the file system. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} +log_onexit cleanup + +log_assert "Verify a variety of AnyRAID pools with a special VDEV AnyRAID" + +log_must create_sparse_files "disk" 4 $DEVSIZE +log_must create_sparse_files "sdisk" 4 $DEVSIZE + +typeset oldcksum +typeset newcksum +for parity in {0..3}; do + log_must zpool create $TESTPOOL anymirror$parity $disks special \ + anymirror$parity $sdisks + log_must poolexists $TESTPOOL + + log_must file_write -o create -f /$TESTPOOL/file.bin -b 1048576 -c 128 + oldcksum=$(xxh128digest /$TESTPOOL/file.bin) + log_must zpool export $TESTPOOL + + log_must zpool import -d $(dirname $disk0) $TESTPOOL + newcksum=$(xxh128digest /$TESTPOOL/file.bin) + + log_must test "$oldcksum" = "$newcksum" + + log_must destroy_pool $TESTPOOL +done + +log_pass "Verify a variety of AnyRAID pools with a special VDEV AnyRAID" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_tile_layout.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_tile_layout.ksh new file mode 100755 index 000000000000..267cc669b741 --- 
/dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_tile_layout.ksh @@ -0,0 +1,82 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Anyraid disks intelligently select which tiles to use +# +# STRATEGY: +# 1. Create an anymirror1 vdev with 1 large disk and 3 small disks +# 2. Verify that the full space can be used +# + +verify_runnable "global" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3} +} + +log_onexit cleanup + +log_must truncate -s 512M $TEST_BASE_DIR/vdev_file.{0,1,2} +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.3 +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Anyraid disks intelligently select which tiles to use" + +log_must create_pool $TESTPOOL anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2,3} + +cap=$(zpool get -Hp -o value size $TESTPOOL) +[[ "$cap" -eq $((9 * 64 * 1024 * 1024)) ]] || \ + log_fail "Incorrect space for anyraid vdev: $cap" + +# +# This should just about fill the pool, when you account for the 128MiB of +# reserved slop space. 
If the space isn't being selected intelligently, we +# would hit ENOSPC 64MiB early. +# +log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c $((64 * 7 - 1)) -d R + +log_must destroy_pool $TESTPOOL +log_must create_pool $TESTPOOL anyraidz1:2 $TEST_BASE_DIR/vdev_file.{0,1,2,3} + +cap=$(zpool get -Hp -o value size $TESTPOOL) +[[ "$cap" -eq $((12 * 64 * 1024 * 1024)) ]] || \ + log_fail "Incorrect space for anyraid vdev: $cap" + +# +# This should just about fill the pool, when you account for the 192MiB of +# reserved slop space. If the space isn't being selected intelligently, we +# would hit ENOSPC 64MiB early. +# +log_must dd if=/dev/urandom of=/$TESTPOOL/f1 bs=1M count=$((64 * 6 - 1)) + +log_pass "Anyraid disks intelligently select which tiles to use" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_001_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_001_pos.ksh new file mode 100755 index 000000000000..d059df0d8fe1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_001_pos.ksh @@ -0,0 +1,177 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Write various data patterns to an anymirror1 AnyRAID pool and verify +# they read back correctly after export/import. This tests basic +# AnyRAID mirror-style data preservation. +# +# STRATEGY: +# 1. Create pool with anymirror1 (3 disks) +# 2. Write multiple files with different patterns (random, zeros, +# known byte patterns) and varied sizes +# 3. Record xxh128 checksums of all files +# 4. Export and re-import the pool +# 5. Verify all checksums match +# 6. Run scrub, verify no errors +# + +verify_runnable "global" + +cleanup() { + poolexists $TESTPOOL && destroy_pool $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2} +} + +log_onexit cleanup + +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2} + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "anymirror1 pool preserves data correctly across export/import" + +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2} + +# +# Write files of varied sizes and patterns, record checksums. +# +set -A cksums +typeset -i cksum_idx=0 + +# +# Small 4K files with random data (10 files). +# +typeset -i idx=0 +while (( idx < 10 )); do + log_must file_write -o create -b 4096 -c 1 -d 'R' \ + -f /$TESTPOOL/small_random.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/small_random.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Medium 1MiB files with random data (5 files). 
+# +idx=0 +while (( idx < 5 )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/medium_random.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/medium_random.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Large 128MiB file with random data (1 file). +# +log_must file_write -o create -b 1048576 -c 128 -d 'R' \ + -f /$TESTPOOL/large_random.0 +cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/large_random.0) +(( cksum_idx = cksum_idx + 1 )) + +# +# Zero-filled files (3 files at 64K each). +# +idx=0 +while (( idx < 3 )); do + log_must dd if=/dev/zero of=/$TESTPOOL/zeros.$idx \ + bs=65536 count=1 2>/dev/null + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/zeros.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +typeset -i total_files=$cksum_idx + +# +# Export and re-import the pool. +# +log_must zpool sync $TESTPOOL +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Verify all checksums after import. +# +cksum_idx=0 + +# Verify small random files. +idx=0 +while (( idx < 10 )); do + newcksum=$(xxh128digest /$TESTPOOL/small_random.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for small_random.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify medium random files. +idx=0 +while (( idx < 5 )); do + newcksum=$(xxh128digest /$TESTPOOL/medium_random.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for medium_random.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify large random file. +newcksum=$(xxh128digest /$TESTPOOL/large_random.0) +[[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for large_random.0: expected=${cksums[$cksum_idx]} got=$newcksum" +(( cksum_idx = cksum_idx + 1 )) + +# Verify zero files. 
+idx=0 +while (( idx < 3 )); do + newcksum=$(xxh128digest /$TESTPOOL/zeros.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for zeros.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Run scrub and verify no errors. +# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "checksum errors detected after scrub" + +log_pass "anymirror1 pool preserves data correctly across export/import" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_002_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_002_pos.ksh new file mode 100755 index 000000000000..b2b2effa5fba --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_002_pos.ksh @@ -0,0 +1,176 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Write various data patterns to an anyraidz1:1 AnyRAID pool and verify +# they read back correctly after export/import. This tests raidz1 with +# a data width of 1. +# +# STRATEGY: +# 1. Create pool with anyraidz1:1 (3 disks) +# 2. Write multiple files with different patterns and varied sizes +# 3. Record xxh128 checksums of all files +# 4. Export and re-import the pool +# 5. Verify all checksums match +# 6. Run scrub, verify no errors +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2} +} + +log_onexit cleanup + +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2} + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "anyraidz1:1 pool preserves data correctly across export/import" + +log_must create_pool $TESTPOOL anyraidz1:1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2} + +# +# Write files of varied sizes and patterns, record checksums. +# +set -A cksums +typeset -i cksum_idx=0 + +# +# Small 4K files with random data (10 files). +# +typeset -i idx=0 +while (( idx < 10 )); do + log_must file_write -o create -b 4096 -c 1 -d 'R' \ + -f /$TESTPOOL/small_random.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/small_random.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Medium 1MiB files with random data (5 files). +# +idx=0 +while (( idx < 5 )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/medium_random.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/medium_random.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Large 128MiB file with random data (1 file). +# +log_must file_write -o create -b 1048576 -c 128 -d 'R' \ + -f /$TESTPOOL/large_random.0 +cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/large_random.0) +(( cksum_idx = cksum_idx + 1 )) + +# +# Zero-filled files (3 files at 64K each). 
+# +idx=0 +while (( idx < 3 )); do + log_must dd if=/dev/zero of=/$TESTPOOL/zeros.$idx \ + bs=65536 count=1 2>/dev/null + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/zeros.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +typeset -i total_files=$cksum_idx + +# +# Export and re-import the pool. +# +log_must zpool sync $TESTPOOL +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Verify all checksums after import. +# +cksum_idx=0 + +# Verify small random files. +idx=0 +while (( idx < 10 )); do + newcksum=$(xxh128digest /$TESTPOOL/small_random.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for small_random.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify medium random files. +idx=0 +while (( idx < 5 )); do + newcksum=$(xxh128digest /$TESTPOOL/medium_random.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for medium_random.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify large random file. +newcksum=$(xxh128digest /$TESTPOOL/large_random.0) +[[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for large_random.0: expected=${cksums[$cksum_idx]} got=$newcksum" +(( cksum_idx = cksum_idx + 1 )) + +# Verify zero files. +idx=0 +while (( idx < 3 )); do + newcksum=$(xxh128digest /$TESTPOOL/zeros.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for zeros.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Run scrub and verify no errors. 
+# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "checksum errors detected after scrub" + +log_pass "anyraidz1:1 pool preserves data correctly across export/import" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_003_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_003_pos.ksh new file mode 100755 index 000000000000..af3fb12a5d32 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_003_pos.ksh @@ -0,0 +1,152 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Write various data patterns to an anyraidz1:2 pool and verify they +# read back correctly after export/import. This tests raidz-style +# AnyRAID with parity=1 and data_width=2 (minimum 3 disks). +# +# STRATEGY: +# 1. Create pool with anyraidz1:2 (4 disks) +# 2. 
Write files with varied sizes (small 4K, medium 1MiB, large 128MiB) +# 3. Record xxh128 checksums of all files +# 4. Export and re-import the pool +# 5. Verify all checksums match +# 6. Run scrub, verify no errors +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3} +} + +log_onexit cleanup + +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2,3} + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "anyraidz1:2 pool preserves data correctly across export/import" + +log_must create_pool $TESTPOOL anyraidz1:2 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3} + +# +# Write files of varied sizes and record checksums. +# +set -A cksums +typeset -i cksum_idx=0 + +# +# Small 4K files (10 files). +# +typeset -i idx=0 +while (( idx < 10 )); do + log_must file_write -o create -b 4096 -c 1 -d 'R' \ + -f /$TESTPOOL/small.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/small.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Medium 1MiB files (5 files). +# +idx=0 +while (( idx < 5 )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/medium.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/medium.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Large 128MiB file (1 file). +# +log_must file_write -o create -b 1048576 -c 128 -d 'R' \ + -f /$TESTPOOL/large.0 +cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/large.0) +(( cksum_idx = cksum_idx + 1 )) + +typeset -i total_files=$cksum_idx + +# +# Export and re-import the pool. +# +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Verify all checksums after import. +# +cksum_idx=0 + +# Verify small files. 
+idx=0 +while (( idx < 10 )); do + newcksum=$(xxh128digest /$TESTPOOL/small.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for small.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify medium files. +idx=0 +while (( idx < 5 )); do + newcksum=$(xxh128digest /$TESTPOOL/medium.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for medium.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify large file. +newcksum=$(xxh128digest /$TESTPOOL/large.0) +[[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for large.0: expected=${cksums[$cksum_idx]} got=$newcksum" + +# +# Run scrub and verify no errors. +# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +[[ "$cksum_count" -eq 0 ]] || log_fail "checksum errors detected after scrub" + +log_pass "anyraidz1:2 pool preserves data correctly across export/import" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_004_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_004_pos.ksh new file mode 100755 index 000000000000..90f3455554d4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_004_pos.ksh @@ -0,0 +1,176 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Write various data patterns to an anyraidz2:2 AnyRAID pool and verify +# they read back correctly after export/import. This tests raidz2 with +# a data width of 2. +# +# STRATEGY: +# 1. Create pool with anyraidz2:2 (5 disks: 2 parity + 2 data = 4 min, use 5) +# 2. Write multiple files with different patterns and varied sizes +# 3. Record xxh128 checksums of all files +# 4. Export and re-import the pool +# 5. Verify all checksums match +# 6. Run scrub, verify no errors +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} +} + +log_onexit cleanup + +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "anyraidz2:2 pool preserves data correctly across export/import" + +log_must create_pool $TESTPOOL anyraidz2:2 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3,4} + +# +# Write files of varied sizes and patterns, record checksums. +# +set -A cksums +typeset -i cksum_idx=0 + +# +# Small 4K files with random data (10 files). 
+# +typeset -i idx=0 +while (( idx < 10 )); do + log_must file_write -o create -b 4096 -c 1 -d 'R' \ + -f /$TESTPOOL/small_random.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/small_random.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Medium 1MiB files with random data (5 files). +# +idx=0 +while (( idx < 5 )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/medium_random.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/medium_random.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Large 128MiB file with random data (1 file). +# +log_must file_write -o create -b 1048576 -c 128 -d 'R' \ + -f /$TESTPOOL/large_random.0 +cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/large_random.0) +(( cksum_idx = cksum_idx + 1 )) + +# +# Zero-filled files (3 files at 64K each). +# +idx=0 +while (( idx < 3 )); do + log_must dd if=/dev/zero of=/$TESTPOOL/zeros.$idx \ + bs=65536 count=1 2>/dev/null + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/zeros.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +typeset -i total_files=$cksum_idx + +# +# Export and re-import the pool. +# +log_must zpool sync $TESTPOOL +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Verify all checksums after import. +# +cksum_idx=0 + +# Verify small random files. +idx=0 +while (( idx < 10 )); do + newcksum=$(xxh128digest /$TESTPOOL/small_random.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for small_random.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify medium random files. 
+idx=0 +while (( idx < 5 )); do + newcksum=$(xxh128digest /$TESTPOOL/medium_random.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for medium_random.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify large random file. +newcksum=$(xxh128digest /$TESTPOOL/large_random.0) +[[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for large_random.0: expected=${cksums[$cksum_idx]} got=$newcksum" +(( cksum_idx = cksum_idx + 1 )) + +# Verify zero files. +idx=0 +while (( idx < 3 )); do + newcksum=$(xxh128digest /$TESTPOOL/zeros.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for zeros.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Run scrub and verify no errors. +# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "checksum errors detected after scrub" + +log_pass "anyraidz2:2 pool preserves data correctly across export/import" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_005_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_005_pos.ksh new file mode 100755 index 000000000000..362a6751b74a --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_005_pos.ksh @@ -0,0 +1,176 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Write various data patterns to an anyraidz3:3 AnyRAID pool and verify +# they read back correctly after export/import. This tests raidz3 with +# a data width of 3. +# +# STRATEGY: +# 1. Create pool with anyraidz3:3 (7 disks: 3 parity + 3 data = 6 min, use 7) +# 2. Write multiple files with different patterns and varied sizes +# 3. Record xxh128 checksums of all files +# 4. Export and re-import the pool +# 5. Verify all checksums match +# 6. Run scrub, verify no errors +# + +verify_runnable "global" + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5,6} +} + +log_onexit cleanup + +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5,6} + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "anyraidz3:3 pool preserves data correctly across export/import" + +log_must create_pool $TESTPOOL anyraidz3:3 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3,4,5,6} + +# +# Write files of varied sizes and patterns, record checksums. +# +set -A cksums +typeset -i cksum_idx=0 + +# +# Small 4K files with random data (10 files). 
+# +typeset -i idx=0 +while (( idx < 10 )); do + log_must file_write -o create -b 4096 -c 1 -d 'R' \ + -f /$TESTPOOL/small_random.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/small_random.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Medium 1MiB files with random data (5 files). +# +idx=0 +while (( idx < 5 )); do + log_must file_write -o create -b 1048576 -c 1 -d 'R' \ + -f /$TESTPOOL/medium_random.$idx + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/medium_random.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Large 128MiB file with random data (1 file). +# +log_must file_write -o create -b 1048576 -c 128 -d 'R' \ + -f /$TESTPOOL/large_random.0 +cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/large_random.0) +(( cksum_idx = cksum_idx + 1 )) + +# +# Zero-filled files (3 files at 64K each). +# +idx=0 +while (( idx < 3 )); do + log_must dd if=/dev/zero of=/$TESTPOOL/zeros.$idx \ + bs=65536 count=1 2>/dev/null + cksums[$cksum_idx]=$(xxh128digest /$TESTPOOL/zeros.$idx) + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +typeset -i total_files=$cksum_idx + +# +# Export and re-import the pool. +# +log_must zpool sync $TESTPOOL +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Verify all checksums after import. +# +cksum_idx=0 + +# Verify small random files. +idx=0 +while (( idx < 10 )); do + newcksum=$(xxh128digest /$TESTPOOL/small_random.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for small_random.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify medium random files. 
+idx=0 +while (( idx < 5 )); do + newcksum=$(xxh128digest /$TESTPOOL/medium_random.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for medium_random.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# Verify large random file. +newcksum=$(xxh128digest /$TESTPOOL/large_random.0) +[[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for large_random.0: expected=${cksums[$cksum_idx]} got=$newcksum" +(( cksum_idx = cksum_idx + 1 )) + +# Verify zero files. +idx=0 +while (( idx < 3 )); do + newcksum=$(xxh128digest /$TESTPOOL/zeros.$idx) + [[ "$newcksum" == "${cksums[$cksum_idx]}" ]] || \ + log_fail "Checksum mismatch for zeros.$idx: expected=${cksums[$cksum_idx]} got=$newcksum" + (( cksum_idx = cksum_idx + 1 )) + (( idx = idx + 1 )) +done + +# +# Run scrub and verify no errors. +# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "checksum errors detected after scrub" + +log_pass "anyraidz3:3 pool preserves data correctly across export/import" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_006_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_006_pos.ksh new file mode 100755 index 000000000000..16daa42230c3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_006_pos.ksh @@ -0,0 +1,105 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Write a large sequential file that fills a significant portion of an
+# anymirror1 pool with mixed-size disks, then verify data integrity
+# across export/import. This exercises AnyRAID's tile layout with
+# heterogeneous disk sizes under heavy sequential write load.
+#
+# STRATEGY:
+# 1. Create anymirror1 pool with 3 mixed-size disks (1G, 1536M, 2G).
+# 2. Set ANYRAID_MIN_TILE_SIZE to 64MiB.
+# 3. Write a large sequential file (~600MiB), a significant portion of pool capacity.
+# 4. Record xxh128 checksum.
+# 5. Export and re-import the pool.
+# 6. Verify checksum matches.
+# 7. Run scrub, verify no errors.
+#
+
+verify_runnable "global"
+
+cleanup() {
+	zpool destroy $TESTPOOL
+	set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824
+	rm -f $TEST_BASE_DIR/vdev_file.{0,1,2}
+}
+
+log_onexit cleanup
+
+log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.0
+log_must truncate -s 1536M $TEST_BASE_DIR/vdev_file.1
+log_must truncate -s 2G $TEST_BASE_DIR/vdev_file.2
+
+set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864
+
+log_assert "Large sequential write to anymirror1 pool with mixed-size disks preserves data across export/import"
+
+log_must create_pool $TESTPOOL anymirror1 \
+    $TEST_BASE_DIR/vdev_file.0 \
+    $TEST_BASE_DIR/vdev_file.1 \
+    $TEST_BASE_DIR/vdev_file.2
+
+#
+# Write a large sequential file (~600MiB) to consume a significant portion of the pool.
+# Using file_write with 1MiB blocks x 600 count.
+#
+log_must file_write -o create -b 1048576 -c 600 -d 'R' \
+    -f /$TESTPOOL/large_sequential.0
+
+cksum_large=$(xxh128digest /$TESTPOOL/large_sequential.0)
+
+#
+# Export and re-import the pool.
+#
+log_must zpool sync $TESTPOOL
+log_must zpool export $TESTPOOL
+log_must zpool import -d $TEST_BASE_DIR $TESTPOOL
+
+#
+# Verify checksum after import.
+#
+newcksum=$(xxh128digest /$TESTPOOL/large_sequential.0)
+[[ "$newcksum" == "$cksum_large" ]] || \
+    log_fail "Checksum mismatch for large_sequential.0: expected=$cksum_large got=$newcksum"
+
+#
+# Run scrub and verify no errors.
+# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "checksum errors detected after scrub" + +log_pass "Large sequential write to anymirror1 pool with mixed-size disks preserves data across export/import" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_007_neg.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_007_neg.ksh new file mode 100755 index 000000000000..8cd38dff7359 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_007_neg.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that writing more data than an anymirror1 pool with small +# mixed-size disks can hold results in ENOSPC. 
This is the inverse
+# of test 006 -- it confirms the pool correctly enforces capacity
+# limits with heterogeneous disk sizes.
+#
+# STRATEGY:
+# 1. Create anymirror1 pool with 3 small mixed-size disks (512M, 768M, 1G).
+# 2. Set ANYRAID_MIN_TILE_SIZE to 64MiB.
+# 3. Attempt to write a 600MiB file (exceeds usable capacity).
+# 4. Verify the write fails (ENOSPC).
+# 5. Verify pool is still ONLINE and healthy after the failure.
+# 6. Verify any partially written data is still readable (no corruption).
+#
+
+verify_runnable "global"
+
+cleanup() {
+	zpool destroy $TESTPOOL
+	set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824
+	rm -f $TEST_BASE_DIR/vdev_file.{0,1,2}
+}
+
+log_onexit cleanup
+
+log_must truncate -s 512M $TEST_BASE_DIR/vdev_file.0
+log_must truncate -s 768M $TEST_BASE_DIR/vdev_file.1
+log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.2
+
+set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864
+
+log_assert "Writing 600MiB to a small mixed-size anymirror1 pool must fail with ENOSPC"
+
+log_must create_pool $TESTPOOL anymirror1 \
+    $TEST_BASE_DIR/vdev_file.0 \
+    $TEST_BASE_DIR/vdev_file.1 \
+    $TEST_BASE_DIR/vdev_file.2
+
+#
+# Attempt to write 600MiB. This must fail because the pool's usable
+# capacity (~512MiB with mirroring -- TODO confirm: raw/2 for these disks is ~1.1GiB) is less than 600MiB.
+#
+file_write -o create -b 1048576 -c 600 -d 'R' \
+    -f /$TESTPOOL/oversized_file.0
+write_rc=$?
+
+if (( write_rc == 0 )); then
+	log_fail "600MiB write succeeded but should have failed with ENOSPC"
+fi
+
+#
+# Verify the pool is still healthy after the ENOSPC.
+#
+log_must check_pool_status $TESTPOOL state ONLINE true
+
+#
+# Run scrub to verify no corruption from the partial write.
+# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "checksum errors detected after scrub" + +log_pass "Writing 600MiB to a small mixed-size anymirror1 pool must fail with ENOSPC" diff --git a/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_008_pos.ksh b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_008_pos.ksh new file mode 100755 index 000000000000..6d7a3503a770 --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/anyraid_write_verify_008_pos.ksh @@ -0,0 +1,124 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Launch multiple concurrent writers to an anymirror1 AnyRAID pool, +# then verify all data reads back correctly after export/import. +# This exercises AnyRAID under concurrent I/O pressure. +# +# STRATEGY: +# 1. Create anymirror1 pool with 4 disks. +# 2. Set ANYRAID_MIN_TILE_SIZE to 64MiB. +# 3. Launch 8 background file_write processes writing to separate files. 
+# 4. Wait for all writers to complete. +# 5. Record xxh128 checksums of all files. +# 6. Export and re-import the pool. +# 7. Verify all checksums match. +# 8. Run scrub, verify no errors. +# + +verify_runnable "global" + +NUM_WRITERS=8 +FILE_SIZE_MB=32 + +cleanup() { + zpool destroy $TESTPOOL + set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + rm -f $TEST_BASE_DIR/vdev_file.{0,1,2,3} +} + +log_onexit cleanup + +log_must truncate -s 1G $TEST_BASE_DIR/vdev_file.{0,1,2,3} + +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_assert "Concurrent writes to anymirror1 pool preserve data correctly across export/import" + +log_must create_pool $TESTPOOL anymirror1 \ + $TEST_BASE_DIR/vdev_file.{0,1,2,3} + +# +# Launch concurrent writers. Each writes a 32MiB file with random data. +# +typeset -i idx=0 +while (( idx < NUM_WRITERS )); do + file_write -o create -b 1048576 -c $FILE_SIZE_MB -d 'R' \ + -f /$TESTPOOL/concurrent.$idx & + (( idx = idx + 1 )) +done + +wait + +# +# Verify all files exist and record checksums. +# +set -A cksums +idx=0 +while (( idx < NUM_WRITERS )); do + if [[ ! -f /$TESTPOOL/concurrent.$idx ]]; then + log_fail "File concurrent.$idx was not created by background writer" + fi + cksums[$idx]=$(xxh128digest /$TESTPOOL/concurrent.$idx) + (( idx = idx + 1 )) +done + +# +# Export and re-import the pool. +# +log_must zpool sync $TESTPOOL +log_must zpool export $TESTPOOL +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL + +# +# Verify all checksums after import. +# +idx=0 +while (( idx < NUM_WRITERS )); do + newcksum=$(xxh128digest /$TESTPOOL/concurrent.$idx) + [[ "$newcksum" == "${cksums[$idx]}" ]] || \ + log_fail "Checksum mismatch for concurrent.$idx: expected=${cksums[$idx]} got=$newcksum" + (( idx = idx + 1 )) +done + +# +# Run scrub and verify no errors. 
+# +log_must zpool scrub $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +log_must check_pool_status $TESTPOOL state ONLINE true +log_must is_pool_scrubbed $TESTPOOL true + +cksum_count=$(zpool status -v $TESTPOOL | grep ONLINE | awk 'NF > 2 && $5 != 0' | wc -l) +(( cksum_count == 0 )) || log_fail "checksum errors detected after scrub" + +log_pass "Concurrent writes to anymirror1 pool preserve data correctly across export/import" diff --git a/tests/zfs-tests/tests/functional/anyraid/cleanup.ksh b/tests/zfs-tests/tests/functional/anyraid/cleanup.ksh new file mode 100755 index 000000000000..0e239571f23a --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/cleanup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/anyraid/default.cfg + +poolexists $TESTPOOL && destroy_pool $TESTPOOL + +log_must delete_sparse_files +restore_tunable ANYRAID_MIN_TILE_SIZE diff --git a/tests/zfs-tests/tests/functional/anyraid/default.cfg b/tests/zfs-tests/tests/functional/anyraid/default.cfg new file mode 100644 index 000000000000..db3db19fb7aa --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/default.cfg @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +export DEVSIZE=4294967296 +export DD_BLOCK=$(( 64 * 1024 )) +export DD_COUNT=$(( DEVSIZE / DD_BLOCK )) + +export FILE_COUNT=10 +export FILE_SIZE=$(( 1024 * 1024 )) diff --git a/tests/zfs-tests/tests/functional/anyraid/setup.ksh b/tests/zfs-tests/tests/functional/anyraid/setup.ksh new file mode 100755 index 000000000000..3e923fdbb0ff --- /dev/null +++ b/tests/zfs-tests/tests/functional/anyraid/setup.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/anyraid/default.cfg + +verify_runnable "global" + +save_tunable ANYRAID_MIN_TILE_SIZE +set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824 + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib index 08795a7ea257..05b087854627 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib @@ -65,6 +65,7 @@ function setup_filesystem #disklist #pool #fs #mntpoint #type #vdev if [[ $vdev != "" && \ $vdev != "mirror" && \ $vdev != "raidz" && \ + $vdev != "anymirror" && \ $vdev != "draid" ]] ; then log_note "Wrong vdev: (\"$vdev\")" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh index 3c16a6f97f4a..4ffcd5cda088 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh @@ -33,4 +33,6 @@ . $STF_SUITE/include/libtest.shlib . 
$STF_SUITE/tests/functional/cli_root/zpool_add/zpool_add.kshlib +delete_sparse_files + log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh index 82d19e850f28..df28b601762a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh @@ -49,32 +49,32 @@ verify_runnable "global" function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL - rm -f $disk0 $disk1 } log_assert "'zpool add ...' can add devices to the pool." log_onexit cleanup -set -A keywords "" "mirror" "raidz" "raidz1" "draid:1s" "draid1:1s" "spare" +set -A keywords "" "mirror" "raidz" "raidz1" "anymirror" "anymirror1" "anymirror2" "anymirror3" "draid:1s" "draid1:1s" "spare" "anyraidz1:2" "anyraidz2:1" + +create_sparse_files "disk" 4 $MINVDEVSIZE2 +create_sparse_files "extradisk" 4 $MINVDEVSIZE2 pooldevs="${DISK0} \ \"${DISK0} ${DISK1}\" \ \"${DISK0} ${DISK1} ${DISK2}\"" mirrordevs="\"${DISK0} ${DISK1}\"" raidzdevs="\"${DISK0} ${DISK1}\"" +anyraiddevs="\"${extradisks}\"" draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" -disk0=$TEST_BASE_DIR/disk0 -disk1=$TEST_BASE_DIR/disk1 -disk2=$TEST_BASE_DIR/disk2 -truncate -s $MINVDEVSIZE $disk0 $disk1 $disk2 typeset -i i=0 typeset vdev eval set -A poolarray $pooldevs eval set -A mirrorarray $mirrordevs eval set -A raidzarray $raidzdevs +eval set -A anyraidarray $anyraiddevs eval set -A draidarray $draiddevs while (( $i < ${#keywords[*]} )); do @@ -111,6 +111,16 @@ while (( $i < ${#keywords[*]} )); do destroy_pool "$TESTPOOL" done + ;; + any*) + for vdev in "${anyraidarray[@]}"; do + create_pool "$TESTPOOL" "${keywords[i]}" $disks + log_must poolexists "$TESTPOOL" + log_must zpool add "$TESTPOOL" ${keywords[i]} $vdev + log_must vdevs_in_pool "$TESTPOOL" "$vdev" + destroy_pool "$TESTPOOL" + done + ;; draid:1s|draid1:1s) for vdev in 
"${draidarray[@]}"; do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh index 2e1590faf8f5..97749bf6f1c6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh @@ -57,17 +57,19 @@ log_assert "'zpool add' should fail if vdevs are the same or vdev is " \ log_onexit cleanup -create_pool $TESTPOOL $DISK0 +create_sparse_files "disk" 2 $MINVDEVSIZE2 + +create_pool $TESTPOOL $disk0 log_must poolexists $TESTPOOL -log_mustnot zpool add -f $TESTPOOL $DISK0 +log_mustnot zpool add -f $TESTPOOL $disk0 -for type in "" "mirror" "raidz" "draid" "spare" "log" "dedup" "special" "cache" +for type in "" "mirror" "raidz" "anymirror" "draid" "spare" "log" "dedup" "special" "cache" do - log_mustnot zpool add -f $TESTPOOL $type $DISK0 $DISK1 - log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK0 $DISK1 - log_mustnot zpool add -f $TESTPOOL $type $DISK1 $DISK1 - log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK1 $DISK1 + log_mustnot zpool add -f $TESTPOOL $type $disk0 $disk1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $disk0 $disk1 + log_mustnot zpool add -f $TESTPOOL $type $disk1 $disk1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $disk1 $disk1 done log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_001_pos.ksh new file mode 100755 index 000000000000..cd504798a2dd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_001_pos.ksh @@ -0,0 +1,87 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development 
and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that an anymirror1 vdev can be added to an existing +# traditional mirror pool. +# +# STRATEGY: +# 1. Create a pool with a traditional mirror vdev. +# 2. Add an anymirror1 vdev to the pool using -f. +# 3. Verify the pool has both vdev types via zpool status. +# 4. Write data, record checksums. +# 5. Export/import, verify integrity. +# 6. Run scrub, verify no errors. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + delete_sparse_files +} + +log_assert "anymirror1 vdev can be added to a traditional mirror pool" +log_onexit cleanup + +create_sparse_files "disk" 4 $MINVDEVSIZE2 + +log_must zpool create $TESTPOOL mirror $disk0 $disk1 +log_must poolexists $TESTPOOL + +log_must zpool add -f $TESTPOOL anymirror1 $disk2 $disk3 + +log_must zpool status $TESTPOOL +zpool status $TESTPOOL | grep -q "mirror-" +if [ $? -ne 0 ]; then + log_fail "Pool status does not show traditional mirror vdev" +fi +zpool status $TESTPOOL | grep -q "anymirror" +if [ $? 
-ne 0 ]; then
+	log_fail "Pool status does not show anymirror vdev"
+fi
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/testfile1 bs=1M count=32
+checksum1=$(xxh128sum /$TESTPOOL/testfile1)
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d /var/tmp/testdir/sparse_files $TESTPOOL
+
+checksum1_after=$(xxh128sum /$TESTPOOL/testfile1)
+if [ "$checksum1" != "$checksum1_after" ]; then
+	log_fail "Checksum mismatch after import: expected=$checksum1 got=$checksum1_after"
+fi
+
+log_must zpool scrub $TESTPOOL
+log_must wait_scrubbed $TESTPOOL
+log_must eval "zpool status $TESTPOOL | grep -q 'errors: No known data errors'"
+
+log_pass "anymirror1 vdev can be added to a traditional mirror pool"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_002_pos.ksh
new file mode 100755
index 000000000000..4aaf778aac35
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_002_pos.ksh
@@ -0,0 +1,87 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026, Klara, Inc.
+#
+
+. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that an anyraidz1:2 vdev can be added to an existing +# traditional raidz1 pool. +# +# STRATEGY: +# 1. Create a pool with a traditional raidz1 vdev. +# 2. Add an anyraidz1:2 vdev to the pool using -f. +# 3. Verify the pool has both vdev types via zpool status. +# 4. Write data, record checksums. +# 5. Export/import, verify integrity. +# 6. Run scrub, verify no errors. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + delete_sparse_files +} + +log_assert "anyraidz1:2 vdev can be added to a traditional raidz1 pool" +log_onexit cleanup + +create_sparse_files "disk" 6 $MINVDEVSIZE2 + +log_must zpool create $TESTPOOL raidz1 $disk0 $disk1 $disk2 +log_must poolexists $TESTPOOL + +log_must zpool add -f $TESTPOOL anyraidz1:2 $disk3 $disk4 $disk5 + +log_must zpool status $TESTPOOL +zpool status $TESTPOOL | grep -q "raidz1-" +if [ $? -ne 0 ]; then + log_fail "Pool status does not show traditional raidz1 vdev" +fi +zpool status $TESTPOOL | grep -q "anyraidz" +if [ $? 
-ne 0 ]; then
+	log_fail "Pool status does not show anyraidz vdev"
+fi
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/testfile1 bs=1M count=32
+checksum1=$(xxh128sum /$TESTPOOL/testfile1)
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d /var/tmp/testdir/sparse_files $TESTPOOL
+
+checksum1_after=$(xxh128sum /$TESTPOOL/testfile1)
+if [ "$checksum1" != "$checksum1_after" ]; then
+	log_fail "Checksum mismatch after import: expected=$checksum1 got=$checksum1_after"
+fi
+
+log_must zpool scrub $TESTPOOL
+log_must wait_scrubbed $TESTPOOL
+log_must eval "zpool status $TESTPOOL | grep -q 'errors: No known data errors'"
+
+log_pass "anyraidz1:2 vdev can be added to a traditional raidz1 pool"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_003_pos.ksh
new file mode 100755
index 000000000000..fd15f7fd6e49
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_003_pos.ksh
@@ -0,0 +1,87 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026, Klara, Inc.
+#
+
+. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that a traditional mirror vdev can be added to an existing +# anymirror1 pool. +# +# STRATEGY: +# 1. Create a pool with an anymirror1 vdev. +# 2. Add a traditional mirror vdev to the pool using -f. +# 3. Verify the pool has both vdev types via zpool status. +# 4. Write data, record checksums. +# 5. Export/import, verify integrity. +# 6. Run scrub, verify no errors. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + delete_sparse_files +} + +log_assert "Traditional mirror vdev can be added to an anymirror1 pool" +log_onexit cleanup + +create_sparse_files "disk" 4 $MINVDEVSIZE2 + +log_must zpool create $TESTPOOL anymirror1 $disk0 $disk1 +log_must poolexists $TESTPOOL + +log_must zpool add -f $TESTPOOL mirror $disk2 $disk3 + +log_must zpool status $TESTPOOL +zpool status $TESTPOOL | grep -q "anymirror" +if [ $? -ne 0 ]; then + log_fail "Pool status does not show anymirror vdev" +fi +zpool status $TESTPOOL | grep -q "mirror-" +if [ $? 
-ne 0 ]; then
+	log_fail "Pool status does not show traditional mirror vdev"
+fi
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/testfile1 bs=1M count=32
+checksum1=$(xxh128sum /$TESTPOOL/testfile1)
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d /var/tmp/testdir/sparse_files $TESTPOOL
+
+checksum1_after=$(xxh128sum /$TESTPOOL/testfile1)
+if [ "$checksum1" != "$checksum1_after" ]; then
+	log_fail "Checksum mismatch after import: expected=$checksum1 got=$checksum1_after"
+fi
+
+log_must zpool scrub $TESTPOOL
+log_must wait_scrubbed $TESTPOOL
+log_must eval "zpool status $TESTPOOL | grep -q 'errors: No known data errors'"
+
+log_pass "Traditional mirror vdev can be added to an anymirror1 pool"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_004_pos.ksh
new file mode 100755
index 000000000000..2b4412e18d69
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_anyraid_004_pos.ksh
@@ -0,0 +1,87 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026, Klara, Inc.
+#
+
+. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that a traditional raidz1 vdev can be added to an existing +# anyraidz1:2 pool. +# +# STRATEGY: +# 1. Create a pool with an anyraidz1:2 vdev. +# 2. Add a traditional raidz1 vdev to the pool using -f. +# 3. Verify the pool has both vdev types via zpool status. +# 4. Write data, record checksums. +# 5. Export/import, verify integrity. +# 6. Run scrub, verify no errors. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + delete_sparse_files +} + +log_assert "Traditional raidz1 vdev can be added to an anyraidz1:2 pool" +log_onexit cleanup + +create_sparse_files "disk" 6 $MINVDEVSIZE2 + +log_must zpool create $TESTPOOL anyraidz1:2 $disk0 $disk1 $disk2 +log_must poolexists $TESTPOOL + +log_must zpool add -f $TESTPOOL raidz1 $disk3 $disk4 $disk5 + +log_must zpool status $TESTPOOL +zpool status $TESTPOOL | grep -q "anyraidz" +if [ $? -ne 0 ]; then + log_fail "Pool status does not show anyraidz vdev" +fi +zpool status $TESTPOOL | grep -q "raidz1-" +if [ $? 
-ne 0 ]; then
+	log_fail "Pool status does not show traditional raidz1 vdev"
+fi
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/testfile1 bs=1M count=32
+checksum1=$(xxh128sum /$TESTPOOL/testfile1)
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d /var/tmp/testdir/sparse_files $TESTPOOL
+
+checksum1_after=$(xxh128sum /$TESTPOOL/testfile1)
+if [ "$checksum1" != "$checksum1_after" ]; then
+	log_fail "Checksum mismatch after import: expected=$checksum1 got=$checksum1_after"
+fi
+
+log_must zpool scrub $TESTPOOL
+log_must wait_scrubbed $TESTPOOL
+log_must eval "zpool status $TESTPOOL | grep -q 'errors: No known data errors'"
+
+log_pass "Traditional raidz1 vdev can be added to an anyraidz1:2 pool"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_002_pos.ksh
new file mode 100755
index 000000000000..309eb65a1412
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_002_pos.ksh
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 Klara, Inc.
+#
+
+. 
$STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+#	'zpool attach' works to expand mirrors and anyraid vdevs
+#
+# STRATEGY:
+#	1. Create a normal striped pool
+#	2. Verify that attaching creates a mirror
+#	3. Verify that attaching again creates a wider mirror
+#	4. Create an anyraid vdev
+#	5. Verify that attaching expands the anyraid vdev
+#
+
+verify_runnable "global"
+
+cleanup() {
+	log_must zpool destroy $TESTPOOL2
+	restore_tunable ANYRAID_MIN_TILE_SIZE
+}
+
+log_onexit cleanup
+
+log_must truncate -s 8G /$TESTPOOL/vdev_file.{0,1,2,3}
+save_tunable ANYRAID_MIN_TILE_SIZE
+set_tunable64 ANYRAID_MIN_TILE_SIZE 1073741824
+
+log_assert "'zpool attach' works to expand mirrors and anyraid vdevs"
+
+log_must create_pool $TESTPOOL2 /$TESTPOOL/vdev_file.0
+log_must zpool attach $TESTPOOL2 /$TESTPOOL/vdev_file.0 /$TESTPOOL/vdev_file.1
+log_must eval "zpool list -v $TESTPOOL2 | grep \" mirror\""
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.0\""
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.1\""
+log_must zpool attach $TESTPOOL2 /$TESTPOOL/vdev_file.0 /$TESTPOOL/vdev_file.2
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.2\""
+log_must zpool destroy $TESTPOOL2
+
+log_must create_pool $TESTPOOL2 anymirror1 /$TESTPOOL/vdev_file.{0,1,2}
+log_must zpool attach $TESTPOOL2 anymirror1-0 /$TESTPOOL/vdev_file.3
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.3\""
+
+log_must create_pool $TESTPOOL2 anyraidz1:2 /$TESTPOOL/vdev_file.{0,1,2}
+log_must zpool attach $TESTPOOL2 anyraidz1:2-0 /$TESTPOOL/vdev_file.3
+log_must eval "zpool list -v $TESTPOOL2 | grep \" .*_file.3\""
+
+log_pass "'zpool attach' works to expand mirrors and anyraid vdevs"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_003_pos.ksh
new file mode 100755
index 000000000000..cf6fdd8b8b15
--- /dev/null
+++ 
b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_003_pos.ksh @@ -0,0 +1,122 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach' expands size correctly with anyraid vdevs. +# +# STRATEGY: +# 1. Create an anymirror1 vdev with small disks +# 2. Attach larger disk +# 3. Verify that not all the new space can be used +# 4. Attach another larger disk +# 5. Verify that all space is now usable +# 6. 
Repeat steps 1-5 with anymirror2
+#
+
+verify_runnable "global"
+
+cleanup() {
+	log_must zpool destroy $TESTPOOL2
+	rm -f $TEST_BASE_DIR/vdev_file.*
+	restore_tunable ANYRAID_MIN_TILE_SIZE
+}
+
+log_onexit cleanup
+
+log_must truncate -s 512M $TEST_BASE_DIR/vdev_file.{0,1,2,3}
+log_must truncate -s 2G $TEST_BASE_DIR/vdev_file.{4,5,6}
+save_tunable ANYRAID_MIN_TILE_SIZE
+set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864
+
+log_assert "'zpool attach' expands size correctly with anyraid vdevs"
+
+log_must create_pool $TESTPOOL2 anymirror1 $TEST_BASE_DIR/vdev_file.{0,1,2}
+
+cap=$(zpool get -Hp -o value size $TESTPOOL2)
+log_must zpool attach $TESTPOOL2 anymirror1-0 $TEST_BASE_DIR/vdev_file.4
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+
+[[ "$new_cap" -eq $((3 * 64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool attach $TESTPOOL2 anymirror1-0 $TEST_BASE_DIR/vdev_file.5
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+[[ "$new_cap" -eq $(((2048 - 256 - 64) * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool destroy $TESTPOOL2
+log_must create_pool $TESTPOOL2 anymirror2 $TEST_BASE_DIR/vdev_file.{0,1,2,3}
+
+cap=$(zpool get -Hp -o value size $TESTPOOL2)
+log_must zpool attach $TESTPOOL2 anymirror2-0 $TEST_BASE_DIR/vdev_file.4
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+
+[[ "$new_cap" -eq $((64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool attach $TESTPOOL2 anymirror2-0 $TEST_BASE_DIR/vdev_file.5
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+[[ "$new_cap" -eq $((256 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool attach $TESTPOOL2 anymirror2-0 $TEST_BASE_DIR/vdev_file.6
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+[[ "$new_cap" -eq $(((2048 - 256 - 64) * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool destroy $TESTPOOL2
+log_must create_pool $TESTPOOL2 anyraidz1:2 $TEST_BASE_DIR/vdev_file.{0,1,2,3}
+
+cap=$(zpool get -Hp -o value size $TESTPOOL2)
+log_must zpool attach $TESTPOOL2 anyraidz1:2-0 $TEST_BASE_DIR/vdev_file.4
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+
+[[ "$new_cap" -eq $((64 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool attach $TESTPOOL2 anyraidz1:2-0 $TEST_BASE_DIR/vdev_file.5
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+[[ "$new_cap" -eq $((256 * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_must zpool attach $TESTPOOL2 anyraidz1:2-0 $TEST_BASE_DIR/vdev_file.6
+new_cap=$(zpool get -Hp -o value size $TESTPOOL2)
+new_cap=$((new_cap - cap))
+[[ "$new_cap" -eq $(((2048 - 256 - 64) * 1024 * 1024)) ]] || \
+	log_fail "Incorrect space added on attach: $new_cap"
+
+log_pass "'zpool attach' expands size correctly with anyraid vdevs"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh
index f504d15fc0c3..428c769444cf 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh
@@ -34,5 +34,7 @@
 . 
$STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib cleanup_devices $DISKS +delete_sparse_files +rm -rf $TESTDIR $TESTDIR1 log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib index ecab30ed3925..bbe68f8db24f 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib @@ -36,7 +36,7 @@ # Given a pool vdevs list, create the pool,verify the created pool, # and destroy the pool # $1, pool name -# $2, pool type, mirror, raidz, or none +# $2, pool type, mirror, raidz, anyraid, draid or none # $3, vdevs list # function create_pool_test diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh index ad30c0fc87f9..b5b6c5d3f58f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh @@ -49,8 +49,6 @@ verify_runnable "global" function cleanup { poolexists $TESTPOOL && destroy_pool $TESTPOOL - - rm -f $disk1 $disk2 } log_assert "'zpool create ...' can successfully create" \ @@ -58,16 +56,16 @@ log_assert "'zpool create ...' 
can successfully create" \ log_onexit cleanup -typeset disk1=$(create_blockfile $FILESIZE) -typeset disk2=$(create_blockfile $FILESIZE) +create_sparse_files "disk" 5 $MINVDEVSIZE2 pooldevs="${DISK0} \ \"${DISK0} ${DISK1}\" \ \"${DISK0} ${DISK1} ${DISK2}\" \ - \"$disk1 $disk2\"" + \"$disk0 $disk1\"" mirrordevs="\"${DISK0} ${DISK1}\" \ $raidzdevs \ - \"$disk1 $disk2\"" + \"$disk0 $disk1\"" +anyraiddevs="\"$disk0 $disk1 $disk2 $disk3 $disk4\"" raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\"" draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" @@ -75,6 +73,16 @@ create_pool_test "$TESTPOOL" "" "$pooldevs" create_pool_test "$TESTPOOL" "mirror" "$mirrordevs" create_pool_test "$TESTPOOL" "raidz" "$raidzdevs" create_pool_test "$TESTPOOL" "raidz1" "$raidzdevs" +create_pool_test "$TESTPOOL" "anymirror" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror0" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror1" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror2" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror3" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anymirror4" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anyraidz1:2" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anyraidz1:3" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anyraidz2:2" "$anyraiddevs" +create_pool_test "$TESTPOOL" "anyraidz3:1" "$anyraiddevs" create_pool_test "$TESTPOOL" "draid" "$draiddevs" log_pass "'zpool create ...' success." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh index f0c2e69a0c0f..6d4fd4cd6982 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh @@ -55,7 +55,7 @@ log_assert "'zpool create [-R root][-m mountpoint] ...' can create "an alternate pool or a new pool mounted at the specified mountpoint." 
log_onexit cleanup -set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" "draid" "draid2" +set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" "anymirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "anyraidz1:2" "anyraidz2:1" "anyraidz3:1" "draid" "draid2" # # cleanup the pools created in previous case if zpool_create_004_pos timedout @@ -69,7 +69,7 @@ rm -rf $TESTDIR log_must mkdir -p $TESTDIR typeset -i i=1 while (( i < 5 )); do - log_must truncate -s $FILESIZE $TESTDIR/file.$i + log_must truncate -s $MINVDEVSIZE2 $TESTDIR/file.$i (( i = i + 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh index adc47c48de28..b06eea7fa8d6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh @@ -60,7 +60,7 @@ mntpnt=$(get_prop mountpoint $TESTPOOL) typeset -i i=0 while ((i < 10)); do - log_must truncate -s $MINVDEVSIZE $mntpnt/vdev$i + log_must truncate -s $MINVDEVSIZE2 $mntpnt/vdev$i eval vdev$i=$mntpnt/vdev$i ((i += 1)) @@ -98,6 +98,18 @@ set -A valid_args \ "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 \ mirror $vdev4 $vdev5 $vdev6 $vdev7" \ + "anymirror0 $vdev0" \ + "anymirror0 $vdev0 $vdev1 anymirror0 $vdev2 $vdev3" \ + "anymirror1 $vdev0 $vdev1 anymirror1 $vdev2 $vdev3" \ + "anymirror2 $vdev0 $vdev1 $vdev2 anymirror2 $vdev3 $vdev4 $vdev5" \ + "anymirror2 $vdev0 $vdev1 $vdev2 $vdev3 anymirror2 $vdev4 $vdev5 $vdev6" \ + "anymirror3 $vdev0 $vdev1 $vdev2 $vdev3 anymirror3 $vdev4 $vdev5 $vdev6 $vdev7" \ + "anyraidz1:1 $vdev0 $vdev1 anyraidz1:1 $vdev2 $vdev3" \ + "anyraidz1:2 $vdev0 $vdev1 $vdev2 anyraidz1:2 $vdev3 $vdev4 $vdev5" \ + "anyraidz2:1 $vdev0 $vdev1 $vdev2 anyraidz2:1 $vdev3 $vdev4 $vdev5" \ + "anyraidz3:1 $vdev0 $vdev1 $vdev2 $vdev3 
anyraidz3:1 $vdev4 $vdev5 $vdev6 $vdev7" \ + "anyraidz3:1 $vdev0 $vdev1 $vdev2 $vdev3 $vdev4 anyraidz3:1 $vdev5 $vdev6 $vdev7 $vdev8" \ + "anyraidz3:2 $vdev0 $vdev1 $vdev2 $vdev3 $vdev4 anyraidz3:2 $vdev5 $vdev6 $vdev7 $vdev8 $vdev9" \ "draid $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4" \ "draid $vdev0 $vdev1 $vdev2 raidz1 $vdev3 $vdev4 $vdev5" \ "draid $vdev0 $vdev1 $vdev2 draid1 $vdev3 $vdev4 $vdev5" \ @@ -133,6 +145,9 @@ set -A forced_args \ spare $vdev4 raidz2 $vdev5 $vdev6 $vdev7" \ "mirror $vdev0 $vdev1 draid $vdev2 $vdev3 $vdev4 \ draid2 $vdev5 $vdev6 $vdev7 $vdev8 spare $vdev9" \ + "anymirror0 $vdev0 anymirror $vdev1 $vdev2" \ + "anymirror1 $vdev0 $vdev1 anymirror2 $vdev2 $vdev3 $vdev4" \ + "anymirror3 $vdev0 $vdev1 $vdev2 $vdev3 anymirror0 $vdev4" \ "draid $vdev0 $vdev1 $vdev2 $vdev3 \ draid2 $vdev4 $vdev5 $vdev6 $vdev7 $vdev8" \ "draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh index 2e377bc3b522..41672dcdf91d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh @@ -56,6 +56,16 @@ set -A args "" "-?" "-n" "-f" "-nf" "-fn" "-f -n" "--f" "-e" "-s" \ "$TESTPOOL mirror" "$TESTPOOL raidz" "$TESTPOOL mirror raidz" \ "$TESTPOOL raidz1" "$TESTPOOL mirror raidz1" \ "$TESTPOOL draid1" "$TESTPOOL mirror draid1" \ + "$TESTPOOL anymirror" "$TESTPOOL mirror anymirror" \ + "$TESTPOOL anymirror0" "$TESTPOOL mirror anymirror0" \ + "$TESTPOOL anymirror1 $DISK0" \ + "$TESTPOOL anymirror2 $DISK0 $DISK1" \ + "$TESTPOOL anymirror3 $DISK0 $DISK1 $DISK2" \ + "$TESTPOOL anyraidz1:1 $DISK0" \ + "$TESTPOOL anyraidz1:2 $DISK0" \ + "$TESTPOOL anyraidz1:2 $DISK0 $DISK1" \ + "$TESTPOOL anyraidz2:1 $DISK0 $DISK2" \ + "$TESTPOOL anyraidz3:1 $DISK0 $DISK2 $DISK3" \ "$TESTPOOL mirror c?t?d?" 
"$TESTPOOL mirror $DISK0 c0t1d?" \ "$TESTPOOL RAIDZ $DISK0 $DISK1" \ "$TESTPOOL $DISK0 log $DISK1 log $DISK2" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh index 7656f5bb4fdf..f70fc2912d1b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh @@ -39,10 +39,12 @@ # devices, 'zpool create' should failed. # # STRATEGY: -# 1. Loop to create the following three kinds of pools. +# 1. Loop to create the following kinds of pools: # - Regular pool # - Mirror # - Raidz +# - AnyRAID +# - dRAID # 2. Create two pools but using the same disks, expect failed. # 3. Create one pool but using the same disks twice, expect failed. # @@ -62,13 +64,15 @@ log_assert "Create a pool with same devices twice or create two pools with " \ "same devices, 'zpool create' should fail." log_onexit cleanup +create_sparse_files "file" 4 $MINVDEVSIZE2 + unset NOINUSE_CHECK typeset opt -for opt in "" "mirror" "raidz" "draid"; do +for opt in "" "mirror" "raidz" "anymirror" "anyraidz1:1" "draid"; do if [[ $opt == "" ]]; then - typeset disks=$DISK0 + typeset disks=$file0 else - typeset disks=$DISKS + typeset disks=$files fi # Create two pools but using the same disks. @@ -78,7 +82,7 @@ for opt in "" "mirror" "raidz" "draid"; do # Create two pools and part of the devices were overlapped create_pool $TESTPOOL $opt $disks - log_mustnot zpool create -f $TESTPOOL1 $opt $DISK0 + log_mustnot zpool create -f $TESTPOOL1 $opt $file0 destroy_pool $TESTPOOL # Create one pool but using the same disks twice. 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh index 6d43227481bf..bb2241ac71b3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh @@ -73,6 +73,8 @@ set -A args \ "$TOOSMALL $TESTDIR/file1" "$TESTPOOL1 $TESTDIR/file1 $TESTDIR/file2" \ "$TOOSMALL mirror $TESTDIR/file1 $TESTDIR/file2" \ "$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" \ + "$TOOSMALL anymirror0 $TESTDIR/file1" \ + "$TOOSMALL anyraidz1:1 $TESTDIR/file1 $TESTDIR/file2" \ "$TOOSMALL draid $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3" typeset -i i=0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_001_pos.ksh new file mode 100755 index 000000000000..fb875f81e9c7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_001_pos.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Create a variety of AnyRAID pools using the minimal vdev syntax. +# +# STRATEGY: +# 1. Create the required number of allowed vdevs. +# 2. Create few pools of various sizes using the anymirror* syntax. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "'zpool create ...' can create a pool." +log_onexit cleanup + +create_sparse_files "disk" 7 $MINVDEVSIZE2 + +# Verify the default parity +log_must zpool create $TESTPOOL anymirror $disks +log_must poolexists $TESTPOOL +destroy_pool $TESTPOOL + +# Verify specified parity +for parity in {0..6}; do + log_must zpool create $TESTPOOL anymirror$parity $disks + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL +done + +log_pass "'zpool create ...' can create a pool." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_002_pos.ksh new file mode 100755 index 000000000000..fe7cefff505e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_002_pos.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Create AnyRAID pool using the maximum number of vdevs (255). Then verify +# that creating a pool with 256 fails as expected. +# +# STRATEGY: +# 1. Verify a pool with fewer than the required vdevs fails. +# 2. Verify pools with a valid number of vdevs succeed. +# 3. Verify a pool which exceeds the maximum number of vdevs fails. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + log_pos rm -f $all_vdevs + log_pos rmdir $TESTDIR +} + +log_assert "'zpool create anyraid ...' can create a pool with maximum number of vdevs." +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/file.{01..256}) + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE2 $all_vdevs + +for type in "anymirror3" "anyraidz3:10"; do + # Verify pool sizes from 254-255. + for (( i=254; i<=255; i++ )); do + log_must zpool create $TESTPOOL $type \ + $(echo $TESTDIR/file.{01..$i}) + log_must destroy_pool $TESTPOOL + done + + # Exceeds maximum AnyRAID vdev count (256). + log_mustnot zpool create $TESTPOOL $type $(echo $TESTDIR/file.{01..256}) +done +log_pass "'zpool create anyraid ...' can create a pool with maximum number of vdevs." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_003_pos.ksh new file mode 100755 index 000000000000..f1235b094b33 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_003_pos.ksh @@ -0,0 +1,61 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib + +# +# DESCRIPTION: +# Verify that AnyRAID vdev can be created using disks of different sizes. +# +# STRATEGY: +# 1. Create a pool using disks of different sizes. +# 2. Verify the pool created successfully. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "'zpool create anyraid* ...' can create a pool with disks of various sizes." 
+log_onexit cleanup + +create_sparse_files "Adisk" 3 $(( $MINVDEVSIZE2 * 1 )) +create_sparse_files "Bdisk" 2 $(( $MINVDEVSIZE2 * 2 )) +create_sparse_files "Cdisk" 1 $(( $MINVDEVSIZE2 * 3 )) +ls -lh $Adisks $Bdisks $Cdisks + +for parity in mirror{0..3} raidz1:{1..3} raidz2:{1..3} raidz3:{1..3}; do + log_must zpool create $TESTPOOL any$parity $Cdisks $Adisks $Bdisks + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL +done + +log_pass "'zpool create anyraid* ...' can create a pool with disks of various sizes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_004_pos.ksh new file mode 100755 index 000000000000..0cb3e106c1c1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_004_pos.ksh @@ -0,0 +1,57 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib + +# +# DESCRIPTION: +# Verify that AnyRAID vdevs with different disk counts can be mixed in a pool +# +# STRATEGY: +# 1. Create a pool with two anyraid vdevs with different disk counts +# 2. Verify the pool created successfully +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Pools can have multiple anyraid children with different disk counts" +log_onexit cleanup + +create_sparse_files "disk" 5 $MINVDEVSIZE2 + +# Two anymirror vdevs with different disk counts (3 and 2) in one pool +log_must zpool create $TESTPOOL anymirror $disk0 $disk1 $disk2 anymirror $disk3 $disk4 +log_must poolexists $TESTPOOL +destroy_pool $TESTPOOL + +log_pass "Pools can have multiple anyraid children with different disk counts." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_005_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_005_neg.ksh new file mode 100755 index 000000000000..7402d21e0a42 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_005_neg.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Run negative tests relating to anyraid vdevs and pool creation +# +# STRATEGY: +# 1. Try to create a pool with an invalid parity string +# 2. Try to create a pool with too large a parity +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "anyraid vdev specifications detect problems correctly" +log_onexit cleanup + +create_sparse_files "disk" 4 $MINVDEVSIZE2 + +log_mustnot zpool create $TESTPOOL anymirrorq $disks +log_mustnot zpool create $TESTPOOL anymirrorq1 $disks +log_mustnot zpool create $TESTPOOL anymirror-1 $disks +log_mustnot zpool create $TESTPOOL anymirror4 $disks +log_mustnot zpool create $TESTPOOL anyraidz $disks +log_mustnot zpool create $TESTPOOL anyraidz1 $disks +log_mustnot zpool create $TESTPOOL anyraidz1:0 $disks +log_mustnot zpool create $TESTPOOL anyraidzz1:1 $disks + +# +# vdev names should be reserved so they can't accidentally be used as a pool +# name. +# +log_mustnot zpool create anymirror $disks + +log_pass "anyraid vdev specifications detect problems correctly" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_006_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_006_neg.ksh new file mode 100755 index 000000000000..576ea2ee93f6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_006_neg.ksh @@ -0,0 +1,100 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that creating an anymirror pool with too few disks for +# the requested parity level produces a clear error and does not +# crash or create a pool. +# +# STRATEGY: +# 1. Create sparse file vdevs. +# 2. Attempt to create pools where the disk count is less than +# or equal to the parity level (need parity+1 disks minimum). +# 3. Verify each attempt fails with log_mustnot. +# 4. Verify no pool was created. +# 5. Verify stderr output is meaningful (not a panic or stack trace). +# + +verify_runnable "global" + +function cleanup +{ + log_note "DEBUG: cleanup - destroying pool if it exists" + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_note "DEBUG: cleanup - deleting sparse files" + delete_sparse_files +} + +log_assert "anymirror with too few disks for parity level must fail" +log_onexit cleanup + +create_sparse_files "disk" 3 $MINVDEVSIZE2 + +# +# anymirror1 requires at least 2 disks. +# With only 1 disk it must fail. +# +log_mustnot zpool create $TESTPOOL anymirror1 $disk0 +log_mustnot poolexists $TESTPOOL + +# +# anymirror2 requires at least 3 disks. +# With only 2 disks it must fail. 
+# +log_mustnot zpool create $TESTPOOL anymirror2 $disk0 $disk1 +log_mustnot poolexists $TESTPOOL + +# +# anymirror3 requires at least 4 disks. +# With only 3 disks it must fail. +# +log_mustnot zpool create $TESTPOOL anymirror3 $disk0 $disk1 $disk2 +log_mustnot poolexists $TESTPOOL + +# +# anymirror6 requires at least 7 disks. +# With only 3 disks it must fail. +# +log_mustnot zpool create $TESTPOOL anymirror6 $disk0 $disk1 $disk2 +log_mustnot poolexists $TESTPOOL + +# +# Verify the error output is not a panic or stack trace. +# +log_note "DEBUG: verifying error message is meaningful for anymirror3 with 3 disks" +errmsg=$(zpool create $TESTPOOL anymirror3 $disk0 $disk1 $disk2 2>&1) +log_note "DEBUG: error output was: $errmsg" +if echo "$errmsg" | grep -qi "panic\|stack\|dump\|segfault"; then + log_fail "Error output contains panic/crash indicators: $errmsg" +fi +log_note "DEBUG: error message verified as meaningful (no panic/crash)" +log_mustnot poolexists $TESTPOOL + +log_pass "anymirror with too few disks for parity level must fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_007_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_007_neg.ksh new file mode 100755 index 000000000000..e9f35ff79a1f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_007_neg.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that creating an anyraidz pool where the data width exceeds +# the available disk count produces a clear error and does not crash +# or create a pool. +# +# STRATEGY: +# 1. Create sparse file vdevs. +# 2. Attempt to create anyraidz pools where parity + data width +# exceeds the number of disks provided. +# 3. Verify each attempt fails with log_mustnot. +# 4. Verify no pool was created. +# 5. Verify stderr output is meaningful (not a panic or stack trace). +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + delete_sparse_files +} + +log_assert "anyraidz with data width larger than disk count must fail" +log_onexit cleanup + +create_sparse_files "disk" 3 $MINVDEVSIZE2 + +# +# anyraidz1:4 requires 1 parity + 4 data = 5 disks minimum. +# With only 3 disks it must fail. +# +log_mustnot zpool create $TESTPOOL anyraidz1:4 $disk0 $disk1 $disk2 +log_note "DEBUG: anyraidz1:4 with 3 disks failed as expected" + +# +# anyraidz1:3 requires 1 parity + 3 data = 4 disks minimum. +# With only 3 disks it must fail. +# +log_mustnot zpool create $TESTPOOL anyraidz1:3 $disk0 $disk1 $disk2 +log_mustnot poolexists $TESTPOOL + +# +# anyraidz2:3 requires 2 parity + 3 data = 5 disks minimum. +# With only 3 disks it must fail. +# +log_mustnot zpool create $TESTPOOL anyraidz2:3 $disk0 $disk1 $disk2 +log_mustnot poolexists $TESTPOOL + +# +# anyraidz3:2 requires 3 parity + 2 data = 5 disks minimum. +# With only 3 disks it must fail. 
+# +log_mustnot zpool create $TESTPOOL anyraidz3:2 $disk0 $disk1 $disk2 +log_mustnot poolexists $TESTPOOL + +# +# Verify the error output is not a panic or stack trace. +# +log_note "DEBUG: verifying error message is meaningful for anyraidz1:4 with 3 disks" +errmsg=$(zpool create $TESTPOOL anyraidz1:4 $disk0 $disk1 $disk2 2>&1) +log_note "DEBUG: error output was: $errmsg" +if echo "$errmsg" | grep -qi "panic\|stack\|dump\|segfault"; then + log_fail "Error output contains panic/crash indicators: $errmsg" +fi +log_mustnot poolexists $TESTPOOL + +log_pass "anyraidz with data width larger than disk count must fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_008_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_008_neg.ksh new file mode 100755 index 000000000000..7ddde3f277b1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_008_neg.ksh @@ -0,0 +1,89 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that creating an anyraidz pool with a data width of 0 +# produces a clear error and does not crash or create a pool. +# Note: anyraidz1:0 is already tested in 005_neg. This test provides +# additional coverage with different parity levels. +# +# STRATEGY: +# 1. Create sparse file vdevs. +# 2. Attempt to create anyraidz pools with data width of 0 at +# each valid parity level (1, 2, 3). +# 3. Verify each attempt fails with log_mustnot. +# 4. Verify no pool was created. +# 5. Verify stderr output is meaningful (not a panic or stack trace). +# + +verify_runnable "global" + +function cleanup +{ + log_note "DEBUG: cleanup - destroying pool if it exists" + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_note "DEBUG: cleanup - deleting sparse files" + delete_sparse_files +} + +log_assert "anyraidz with data width of 0 must fail" +log_onexit cleanup + +create_sparse_files "disk" 4 $MINVDEVSIZE2 + +# +# anyraidz1:0 - parity 1 with 0 data width must fail. +# +log_mustnot zpool create $TESTPOOL anyraidz1:0 $disks +log_mustnot poolexists $TESTPOOL + +# +# anyraidz2:0 - parity 2 with 0 data width must fail. +# +log_mustnot zpool create $TESTPOOL anyraidz2:0 $disks +log_mustnot poolexists $TESTPOOL + +# +# anyraidz3:0 - parity 3 with 0 data width must fail. +# +log_mustnot zpool create $TESTPOOL anyraidz3:0 $disks +log_mustnot poolexists $TESTPOOL + +# +# Verify the error output is not a panic or stack trace. 
+# +errmsg=$(zpool create $TESTPOOL anyraidz2:0 $disks 2>&1) +log_note "DEBUG: error output was: $errmsg" +if echo "$errmsg" | grep -qi "panic\|stack\|dump\|segfault"; then + log_fail "Error output contains panic/crash indicators: $errmsg" +fi +log_mustnot poolexists $TESTPOOL + +log_pass "anyraidz with data width of 0 must fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_009_neg.ksh new file mode 100755 index 000000000000..114c27c3aaf5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_009_neg.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that creating an anyraidz pool with a parity level exceeding +# 3 produces a clear error and does not crash or create a pool. +# Valid anyraidz parity levels are 1, 2, and 3 only. +# +# STRATEGY: +# 1. Create sparse file vdevs. +# 2. 
Attempt to create anyraidz pools with parity levels 4, 5, 6 +# and other invalid values. +# 3. Verify each attempt fails with log_mustnot. +# 4. Verify no pool was created. +# 5. Verify stderr output is meaningful (not a panic or stack trace). +# + +verify_runnable "global" + +function cleanup +{ + log_note "DEBUG: cleanup - destroying pool if it exists" + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_note "DEBUG: cleanup - deleting sparse files" + delete_sparse_files +} + +log_assert "anyraidz with parity level exceeding 3 must fail" +log_onexit cleanup + +create_sparse_files "disk" 7 $MINVDEVSIZE2 + +# +# anyraidz4:1 - parity 4 is not valid for anyraidz. +# +log_mustnot zpool create $TESTPOOL anyraidz4:1 $disks +log_mustnot poolexists $TESTPOOL + +# +# anyraidz5:1 - parity 5 is not valid for anyraidz. +# +log_mustnot zpool create $TESTPOOL anyraidz5:1 $disks +log_mustnot poolexists $TESTPOOL + +# +# anyraidz6:1 - parity 6 is not valid for anyraidz. +# +log_mustnot zpool create $TESTPOOL anyraidz6:1 $disks +log_mustnot poolexists $TESTPOOL + +# +# anyraidz0:2 - parity 0 is not valid for anyraidz. +# +log_mustnot zpool create $TESTPOOL anyraidz0:2 $disks +log_mustnot poolexists $TESTPOOL + +# +# Verify the error output is not a panic or stack trace. 
+# +errmsg=$(zpool create $TESTPOOL anyraidz4:1 $disks 2>&1) +log_note "DEBUG: error output was: $errmsg" +if echo "$errmsg" | grep -qi "panic\|stack\|dump\|segfault"; then + log_fail "Error output contains panic/crash indicators: $errmsg" +fi +log_mustnot poolexists $TESTPOOL + +log_pass "anyraidz with parity level exceeding 3 must fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_010_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_010_neg.ksh new file mode 100755 index 000000000000..1cafebaefa22 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_010_neg.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that mixing anymirror and anyraidz keywords in the same +# vdev group produces a clear error and does not crash or create +# a pool. Each anyraid vdev group must use a single type. +# +# STRATEGY: +# 1. Create sparse file vdevs. +# 2. 
Attempt to create pools that combine anymirror and anyraidz +# in a single vdev specification (not as separate top-level vdevs). +# 3. Verify each attempt fails with log_mustnot. +# 4. Verify no pool was created. +# 5. Verify stderr output is meaningful (not a panic or stack trace). +# + +verify_runnable "global" + +function cleanup +{ + log_note "DEBUG: cleanup - destroying pool if it exists" + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_note "DEBUG: cleanup - deleting sparse files" + delete_sparse_files +} + +log_assert "Mixing anymirror and anyraidz in the same vdev group must fail" +log_onexit cleanup + +create_sparse_files "disk" 6 $MINVDEVSIZE2 + +# +# Attempt to specify both anymirror and anyraidz as if they were +# the same vdev group by placing them adjacent without separation. +# The parser should reject this. +# +log_mustnot zpool create $TESTPOOL anymirror1 anyraidz1:1 $disks +log_mustnot poolexists $TESTPOOL + +# +# Attempt with anyraidz first, then anymirror. +# +log_mustnot zpool create $TESTPOOL anyraidz1:1 anymirror1 $disks +log_mustnot poolexists $TESTPOOL + +# +# Attempt with anyraidz first again, this time using parity 2. +# +log_mustnot zpool create $TESTPOOL anyraidz1:2 anymirror1 $disks +log_mustnot poolexists $TESTPOOL + +# +# Attempt with anymirror first again, using higher parity levels. +# +log_mustnot zpool create $TESTPOOL anymirror2 anyraidz2:1 $disks +log_mustnot poolexists $TESTPOOL + +# +# Verify the error output is not a panic or stack trace for one +# of the clearly invalid cases.
+# +log_note "DEBUG: verifying error message is meaningful" +errmsg=$(zpool create $TESTPOOL anymirror1 anyraidz1:1 $disks 2>&1) +log_note "DEBUG: error output was: $errmsg" +if echo "$errmsg" | grep -qi "panic\|stack\|dump\|segfault"; then + log_fail "Error output contains panic/crash indicators: $errmsg" +fi +log_mustnot poolexists $TESTPOOL + +log_pass "Mixing anymirror and anyraidz in the same vdev group must fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_011_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_011_neg.ksh new file mode 100755 index 000000000000..afe3cf845d23 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_anyraid_011_neg.ksh @@ -0,0 +1,101 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that creating an anyraid pool with a single disk and +# non-zero parity produces a clear error and does not crash or +# create a pool. A single disk cannot provide redundancy. +# +# STRATEGY: +# 1. 
Create a single sparse file vdev. +# 2. Attempt to create pools with various non-zero parity levels +# using only the single disk. +# 3. Verify each attempt fails with log_mustnot. +# 4. Verify no pool was created. +# 5. Verify stderr output is meaningful (not a panic or stack trace). +# + +verify_runnable "global" + +function cleanup +{ + log_note "DEBUG: cleanup - destroying pool if it exists" + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_note "DEBUG: cleanup - deleting sparse files" + delete_sparse_files +} + +log_assert "Single disk with non-zero parity must fail" +log_onexit cleanup + +create_sparse_files "disk" 1 $MINVDEVSIZE2 + +# +# anymirror1 with 1 disk - needs at least 2 disks for parity 1. +# +log_mustnot zpool create $TESTPOOL anymirror1 $disk0 +log_mustnot poolexists $TESTPOOL + +# +# anymirror2 with 1 disk - needs at least 3 disks for parity 2. +# +log_mustnot zpool create $TESTPOOL anymirror2 $disk0 +log_mustnot poolexists $TESTPOOL + +# +# anymirror3 with 1 disk - needs at least 4 disks for parity 3. +# +log_mustnot zpool create $TESTPOOL anymirror3 $disk0 +log_mustnot poolexists $TESTPOOL + +# +# anymirror6 with 1 disk - needs at least 7 disks for parity 6. +# +log_mustnot zpool create $TESTPOOL anymirror6 $disk0 +log_mustnot poolexists $TESTPOOL + +# +# anyraidz1:1 with 1 disk - needs at least 2 disks (1 parity + 1 data). +# +log_mustnot zpool create $TESTPOOL anyraidz1:1 $disk0 +log_mustnot poolexists $TESTPOOL + +# +# Verify the error output is not a panic or stack trace. 
+# +log_note "DEBUG: verifying error message is meaningful for anymirror1 with 1 disk" +errmsg=$(zpool create $TESTPOOL anymirror1 $disk0 2>&1) +log_note "DEBUG: error output was: $errmsg" +if echo "$errmsg" | grep -qi "panic\|stack\|dump\|segfault"; then + log_fail "Error output contains panic/crash indicators: $errmsg" +fi +log_mustnot poolexists $TESTPOOL + +log_pass "Single disk with non-zero parity must fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh index 66de31744a96..5dce6bec18fd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh @@ -28,4 +28,5 @@ . $STF_SUITE/include/libtest.shlib +delete_sparse_files default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_anyraid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_anyraid_001_pos.ksh new file mode 100755 index 000000000000..8be23c681da7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_anyraid_001_pos.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# An AnyRAID pool should be exportable and not visible from 'zpool list'. +# +# STRATEGY: +# 1. Create AnyRAID pool. +# 2. Export the pool. +# 3. Verify the pool is no longer present in the list output. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_assert "Verify an AnyRAID pool can be exported." +log_onexit cleanup + +poolexists $TESTPOOL && destroy_pool $TESTPOOL + +create_sparse_files "disk" 4 $MINVDEVSIZE2 + +for type in "anymirror3" "anyraidz1:2"; do + log_must zpool create $TESTPOOL $type $disks + log_must poolexists $TESTPOOL + log_must zpool export $TESTPOOL + + poolexists $TESTPOOL && \ + log_fail "$TESTPOOL unexpectedly found in 'zpool list' output." +done + +log_pass "Successfully exported an AnyRAID pool." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 99a4556f70d5..2652c3af7f6b 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -95,6 +95,7 @@ typeset -a properties=( "feature@redaction_list_spill" "feature@dynamic_gang_header" "feature@physical_rewrite" + "feature@anyraid" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh index ce1c103cd3c3..3fe1fea0bc3a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh @@ -43,15 +43,18 @@ # 3. Create a draid2 pool C with dev2/3/4/5. Then destroy it. # 4. Create a raidz pool D with dev3/4. Then destroy it. # 5. Create a stripe pool E with dev4. Then destroy it. -# 6. Verify 'zpool import -D -a' recover all the pools. +# 6. Create an anyraid pool F with dev6. Then destroy it. +# 7. Verify 'zpool import -D -a' recover all the pools. # verify_runnable "global" +VDEV6="$DEVICE_DIR/disk6_anyraid" + function cleanup { typeset dt - for dt in $poolE $poolD $poolC $poolB $poolA; do + for dt in $poolF $poolE $poolD $poolC $poolB $poolA; do destroy_pool $dt done @@ -67,7 +70,7 @@ log_assert "'zpool -D -a' can import all the specified directories " \ "destroyed pools." 
log_onexit cleanup -poolA=poolA.$$; poolB=poolB.$$; poolC=poolC.$$; poolD=poolD.$$; poolE=poolE.$$ +poolA=poolA.$$; poolB=poolB.$$; poolC=poolC.$$; poolD=poolD.$$; poolE=poolE.$$; poolF=poolF.$$; log_must zpool create $poolA mirror $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4 log_must zpool destroy $poolA @@ -84,9 +87,13 @@ log_must zpool destroy $poolD log_must zpool create $poolE $VDEV4 log_must zpool destroy $poolE +truncate -s 24G $VDEV6 +log_must zpool create $poolF anymirror0 $VDEV6 +log_must zpool destroy $poolF + log_must zpool import -d $DEVICE_DIR -D -f -a -for dt in $poolA $poolB $poolC $poolD $poolE; do +for dt in $poolA $poolB $poolC $poolD $poolE $poolF; do log_must datasetexists $dt done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh index a3beee135954..b4204014d573 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh @@ -29,4 +29,6 @@ verify_runnable "global" +restore_tunable ANYRAID_MIN_TILE_SIZE + default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/setup.ksh new file mode 100755 index 000000000000..1210475b12f7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/setup.ksh @@ -0,0 +1,35 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +save_tunable ANYRAID_MIN_TILE_SIZE +set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_anyraid_attach.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_anyraid_attach.ksh new file mode 100755 index 000000000000..dd4616670183 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_anyraid_attach.ksh @@ -0,0 +1,56 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# +. $STF_SUITE/include/libtest.shlib +.
$STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Attaching data devices works with initializing for AnyRAID1. +# +# STRATEGY: +# 1. Create an AnyRAID1 pool. +# 2. Start initializing of the first disk. +# 3. Attach a third disk, ensure initializing continues. +# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" + +log_must zpool create -f $TESTPOOL anymirror1 $DISK1 $DISK2 + +log_must zpool initialize $TESTPOOL $DISK1 +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool attach $TESTPOOL anymirror1-0 $DISK3 +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Lost initializing progress on AnyRAID1 attach" +progress="$new_progress" + +log_pass "Attaching data devices works with initializing for AnyRAID1" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh index 26c369be5bee..c37cc2016eec 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh @@ -30,31 +30,42 @@ # 1. Create a pool with a two-way mirror. # 2. Start initializing, fault, export, import, online and verify along # the way that the initializing was cancelled and not restarted. +# 3. Repeat for AnyRAID1. 
# DISK1="$(echo $DISKS | cut -d' ' -f1)" DISK2="$(echo $DISKS | cut -d' ' -f2)" -log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 +for type in "mirror" "anymirror1"; do + log_must zpool create -O compress=off -f $TESTPOOL $type $DISK1 $DISK2 + if [[ "$type" == "anymirror1" ]]; then + log_must dd if=/dev/zero of=/$TESTPOOL/f1 bs=1M count=2k + log_must zpool sync + log_must rm /$TESTPOOL/f1 + fi -log_must zpool initialize $TESTPOOL $DISK1 -progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$progress" ]] && log_fail "Initializing did not start" + log_must zpool initialize $TESTPOOL $DISK1 + progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$progress" ]] && log_fail "Initializing did not start" -log_must zpool offline -f $TESTPOOL $DISK1 -log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" -log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + log_must zpool offline -f $TESTPOOL $DISK1 + log_must zpool sync $TESTPOOL + log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" + log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" -log_must zpool export $TESTPOOL -log_must zpool import $TESTPOOL + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL -log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" -log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + log_must check_vdev_state $TESTPOOL $DISK1 "FAULTED" + log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" -log_must zpool online $TESTPOOL $DISK1 -log_must zpool clear $TESTPOOL $DISK1 -log_must check_vdev_state $TESTPOOL $DISK1 "ONLINE" -log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + log_must zpool online $TESTPOOL $DISK1 + log_must zpool clear $TESTPOOL $DISK1 + log_must check_vdev_state $TESTPOOL $DISK1 "ONLINE" + log_must eval "zpool status -i $TESTPOOL | grep $DISK1 | grep uninitialized" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL +done 
log_pass "Initializing behaves as expected at each step of:" \ "initialize + fault + export + import + online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh index 341f4f75cf7d..7f386a9c9ec3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh @@ -32,7 +32,7 @@ # Initializing automatically resumes across import/export. # # STRATEGY: -# 1. Create a one-disk pool. +# 1. Create a pool. # 2. Start initializing and verify that initializing is active. # 3. Export the pool. # 4. Import the pool. @@ -40,40 +40,52 @@ # 6. Suspend initializing. # 7. Repeat steps 3-4. # 8. Verify that progress does not regress but initializing is still suspended. +# 9. Repeat for other VDEV types. # -DISK1=${DISKS%% *} +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" -log_must zpool create -f $TESTPOOL $DISK1 -log_must zpool initialize $TESTPOOL +for type in "" "anymirror1"; do + if [[ "$type" = "" ]]; then + VDEVS="$DISK1" + elif [[ "$type" = "anymirror1" ]]; then + VDEVS="$DISK1 $DISK2" + fi -sleep 2 + log_must zpool create -f $TESTPOOL $type $VDEVS + log_must zpool initialize $TESTPOOL -progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$progress" ]] && log_fail "Initializing did not start" + sleep 2 -log_must zpool export $TESTPOOL -log_must zpool import $TESTPOOL + progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$progress" ]] && log_fail "Initializing did not start" -new_progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$new_progress" ]] && log_fail "Initializing did not restart after import" -[[ "$progress" -le "$new_progress" ]] || \ - log_fail "Initializing lost progress after import" -log_mustnot eval 
"initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL -log_must zpool initialize -s $TESTPOOL $DISK1 -action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ - sed 's/.*ed at \(.*\)).*/\1/g')" -log_must zpool export $TESTPOOL -log_must zpool import $TESTPOOL -new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ - sed 's/.*ed at \(.*\)).*/\1/g') -[[ "$action_date" != "$new_action_date" ]] && \ - log_fail "Initializing action date did not persist across export/import" + new_progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$new_progress" ]] && log_fail "Initializing did not restart after import" + [[ "$progress" -le "$new_progress" ]] || \ + log_fail "Initializing lost progress after import" + log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" -[[ "$new_progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ - log_fail "Initializing lost progress after import" + log_must zpool initialize -s $TESTPOOL $DISK1 + action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g')" + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g') + [[ "$action_date" != "$new_action_date" ]] && \ + log_fail "Initializing action date did not persist across export/import" -log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + [[ "$new_progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initializing lost progress after import" + + log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL +done log_pass "Initializing retains state as expected across export/import" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh index 89eace601577..33c747edc6c7 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh @@ -36,32 +36,45 @@ # 2. Start initializing, offline, export, import, online and verify that # initializing state is preserved / initializing behaves as expected # at each step. +# 3. Repeat for other VDEV types. # DISK1="$(echo $DISKS | cut -d' ' -f1)" DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" -log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 +for type in "mirror" "anymirror1"; do -log_must zpool initialize $TESTPOOL $DISK1 -log_must zpool offline $TESTPOOL $DISK1 -progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$progress" ]] && log_fail "Initializing did not start" -log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + if [[ "$type" =~ "anymirror" ]]; then + export disks="$DISK1 $DISK2 $DISK3" + else + export disks="$DISK1 $DISK2" + fi + log_must zpool create -f $TESTPOOL $type $disks -log_must zpool export $TESTPOOL -log_must zpool import $TESTPOOL + log_must zpool initialize $TESTPOOL $DISK1 + log_must zpool offline $TESTPOOL $DISK1 + progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$progress" ]] && log_fail "Initializing did not start" + log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" -new_progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$new_progress" ]] && log_fail "Initializing did not start after import" -[[ "$new_progress" -ge "$progress" ]] || \ - log_fail "Initializing lost progress after import" -log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL -log_must zpool 
online $TESTPOOL $DISK1 -new_progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ "$new_progress" -ge "$progress" ]] || \ - log_fail "Initializing lost progress after online" + new_progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$new_progress" ]] && log_fail "Initializing did not start after import" + [[ "$new_progress" -ge "$progress" ]] || \ + log_fail "Initializing lost progress after import" + log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + + log_must zpool online $TESTPOOL $DISK1 + new_progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ "$new_progress" -ge "$progress" ]] || \ + log_fail "Initializing lost progress after online" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Initializing behaves as expected at each step of:" \ "initialize + offline + export + import + online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh index 10721c1f6cb2..614fb1149425 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh @@ -23,6 +23,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2025 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib @@ -39,37 +40,57 @@ # 5. Verify that initializing resumes and progress does not regress. # 6. Suspend initializing. # 7. Repeat steps 3-4 and verify that initializing does not resume. +# 8. 
Repeat the scenario for other VDEVs # DISK1=${DISKS%% *} DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" -log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 -log_must zpool initialize $TESTPOOL $DISK1 +log_onexit_push zpool status -v -log_must zpool offline $TESTPOOL $DISK1 +for type in "mirror" "anymirror1"; do -progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$progress" ]] && log_fail "Initializing did not start" + if [[ "$type" == "mirror" ]]; then + log_must zpool create -f $TESTPOOL $type $DISK1 $DISK2 + else + log_must zpool create -f $TESTPOOL $type $DISK1 $DISK2 $DISK3 + log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c 400 -d R + log_must zpool sync + log_must rm /$TESTPOOL/f1 + log_must zpool sync + fi + log_must zinject -D 10:1 -d $DISK1 -T write $TESTPOOL + log_must zpool initialize $TESTPOOL $DISK1 -log_must zpool online $TESTPOOL $DISK1 + log_must zpool offline $TESTPOOL $DISK1 -new_progress="$(initialize_progress $TESTPOOL $DISK1)" -[[ -z "$new_progress" ]] && \ - log_fail "Initializing did not restart after onlining" -[[ "$progress" -le "$new_progress" ]] || \ - log_fail "Initializing lost progress after onlining" -log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + progress="$(initialize_progress $TESTPOOL $DISK1)" + [[ -z "$progress" ]] && log_fail "Initializing did not start" -log_must zpool initialize -s $TESTPOOL $DISK1 -action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ - sed 's/.*ed at \(.*\)).*/\1/g')" -log_must zpool offline $TESTPOOL $DISK1 -log_must zpool online $TESTPOOL $DISK1 -new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ - sed 's/.*ed at \(.*\)).*/\1/g') -[[ "$action_date" != "$new_action_date" ]] && \ - log_fail "Initializing action date did not persist across offline/online" -log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + log_must zpool online $TESTPOOL $DISK1 + + new_progress="$(initialize_progress 
$TESTPOOL $DISK1)" + [[ -z "$new_progress" ]] && \ + log_fail "Initializing did not restart after onlining" + [[ "$progress" -le "$new_progress" ]] || \ + log_fail "Initializing lost progress after onlining" + log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + + log_must zpool initialize -s $TESTPOOL $DISK1 + log_must zinject -c all + action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g')" + log_must zpool offline $TESTPOOL $DISK1 + log_must zpool online $TESTPOOL $DISK1 + new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g') + [[ "$action_date" != "$new_action_date" ]] && \ + log_fail "Initializing action date did not persist across offline/online" + log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Initializing performs as expected across offline/online" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh index 79bf0b6a2d08..3313a11e9f54 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh @@ -37,25 +37,37 @@ # 2. Start initializing and verify that initializing is active. # 3. Try to cancel and suspend initializing on the non-initializing disks. # 4. Try to re-initialize the currently initializing disk. +# 5. 
Repeat for other VDEVs # DISK1=${DISKS%% *} DISK2="$(echo $DISKS | cut -d' ' -f2)" DISK3="$(echo $DISKS | cut -d' ' -f3)" -log_must zpool list -v -log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 -log_must zpool initialize $TESTPOOL $DISK1 +for type in "" "anymirror2"; do -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ - log_fail "Initialize did not start" + log_must zpool list -v + log_must zpool create -O compress=off -f $TESTPOOL $type $DISK1 $DISK2 $DISK3 + if [[ "$type" == "anymirror2" ]]; then + log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c 2000 -d Z + log_must zpool sync + log_must rm /$TESTPOOL/f1 + fi + log_must zpool initialize $TESTPOOL $DISK1 -log_mustnot zpool initialize -c $TESTPOOL $DISK2 -log_mustnot zpool initialize -c $TESTPOOL $DISK2 $DISK3 + [[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initialize did not start" -log_mustnot zpool initialize -s $TESTPOOL $DISK2 -log_mustnot zpool initialize -s $TESTPOOL $DISK2 $DISK3 + log_mustnot zpool initialize -c $TESTPOOL $DISK2 + log_mustnot zpool initialize -c $TESTPOOL $DISK2 $DISK3 -log_mustnot zpool initialize $TESTPOOL $DISK1 + log_mustnot zpool initialize -s $TESTPOOL $DISK2 + log_mustnot zpool initialize -s $TESTPOOL $DISK2 $DISK3 + + log_mustnot zpool initialize $TESTPOOL $DISK1 + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Nonsensical initialize operations fail" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh index f872246a0661..65b56a067f0d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh @@ -35,19 +35,26 @@ # 1. Create a one-disk pool. # 2. 
Start initializing and verify that initializing is active. # 3. Cancel initializing and verify that initializing is not active. +# 4. Repeat for other VDEVs # DISK1=${DISKS%% *} -log_must zpool create -f $TESTPOOL $DISK1 -log_must zpool initialize $TESTPOOL +for type in "" "anymirror0"; do -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ - log_fail "Initialize did not start" + log_must zpool create -f $TESTPOOL $type $DISK1 + log_must zpool initialize $TESTPOOL -log_must zpool initialize -c $TESTPOOL + [[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initialize did not start" -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ - log_fail "Initialize did not stop" + log_must zpool initialize -c $TESTPOOL + + [[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initialize did not stop" + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Initialize start + cancel works" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh index 6c75146af6b7..2040ab42eba3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh @@ -40,7 +40,8 @@ # b. Verify uninitialize fails when actively initializing. # c. Cancel or suspend initializing and verify that initializing is not active. # d. Verify uninitialize succeeds after being cancelled. -# 4. Verify per-disk cancel|suspend + uninit +# 4. Verify per-disk cancel|suspend + uninit. +# 5. Repeat for other VDEVs. # DISK1="$(echo $DISKS | cut -d' ' -f1)" @@ -78,65 +79,76 @@ function status_check_all # pool disk-state status_check "$pool" "$disk_state" "$disk_state" "$disk_state" } -# 1. Create a one-disk pool. 
-log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 -status_check_all $TESTPOOL "uninitialized" +for type in "" "anymirror1"; do -# 2. Verify uninitialize succeeds for uninitialized pool. -log_must zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "uninitialized" + # 1. Create a one-disk pool. + log_must zpool create -O compress=off -f $TESTPOOL $type $DISK1 $DISK2 $DISK3 + status_check_all $TESTPOOL "uninitialized" + if [[ "$type" == "anymirror1" ]]; then + log_must file_write -o create -f /$TESTPOOL/f1 -b 1048576 -c 2000 -d Z + log_must zpool sync + log_must rm /$TESTPOOL/f1 + fi + + # 2. Verify uninitialize succeeds for uninitialized pool. + log_must zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "uninitialized" + + # 3. Verify pool wide cancel + uninit + log_must zpool initialize $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" + + log_mustnot zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" -# 3. Verify pool wide cancel + uninit -log_must zpool initialize $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -c $TESTPOOL + status_check_all $TESTPOOL "uninitialized" -log_mustnot zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "uninitialized" -log_must zpool initialize -c $TESTPOOL -status_check_all $TESTPOOL "uninitialized" + # 3. Verify pool wide suspend + uninit + log_must zpool initialize $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" -log_must zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "uninitialized" + log_mustnot zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" -# 3. 
Verify pool wide suspend + uninit -log_must zpool initialize $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -s $TESTPOOL + status_check_all $TESTPOOL "suspended" -log_mustnot zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL + status_check_all $TESTPOOL "uninitialized" -log_must zpool initialize -s $TESTPOOL -status_check_all $TESTPOOL "suspended" + # 4. Verify per-disk cancel|suspend + uninit + log_must zpool initialize $TESTPOOL + status_check_all $TESTPOOL "[[:digit:]]* initialized" -log_must zpool initialize -u $TESTPOOL -status_check_all $TESTPOOL "uninitialized" + log_must zpool initialize -c $TESTPOOL $DISK1 + log_must zpool initialize -s $TESTPOOL $DISK2 + log_mustnot zpool initialize -u $TESTPOOL $DISK3 + status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" -# 4. Verify per-disk cancel|suspend + uninit -log_must zpool initialize $TESTPOOL -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL $DISK1 + status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" -log_must zpool initialize -c $TESTPOOL $DISK1 -log_must zpool initialize -s $TESTPOOL $DISK2 -log_mustnot zpool initialize -u $TESTPOOL $DISK3 -status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL $DISK2 + status_check $TESTPOOL "uninitialized" "uninitialized" "[[:digit:]]* initialized" -log_must zpool initialize -u $TESTPOOL $DISK1 -status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" + log_must zpool initialize $TESTPOOL $DISK1 + status_check $TESTPOOL "[[:digit:]]* initialized" "uninitialized" "[[:digit:]]* initialized" -log_must zpool initialize -u $TESTPOOL $DISK2 -status_check $TESTPOOL "uninitialized" "uninitialized" "[[:digit:]]* initialized" + log_must zpool initialize $TESTPOOL $DISK2 + 
status_check_all $TESTPOOL "[[:digit:]]* initialized" -log_must zpool initialize $TESTPOOL $DISK1 -status_check $TESTPOOL "[[:digit:]]* initialized" "uninitialized" "[[:digit:]]* initialized" + log_must zpool initialize -s $TESTPOOL + status_check_all $TESTPOOL "suspended" -log_must zpool initialize $TESTPOOL $DISK2 -status_check_all $TESTPOOL "[[:digit:]]* initialized" + log_must zpool initialize -u $TESTPOOL $DISK1 $DISK2 $DISK3 + status_check_all $TESTPOOL "uninitialized" -log_must zpool initialize -s $TESTPOOL -status_check_all $TESTPOOL "suspended" + poolexists $TESTPOOL && destroy_pool $TESTPOOL -log_must zpool initialize -u $TESTPOOL $DISK1 $DISK2 $DISK3 -status_check_all $TESTPOOL "uninitialized" +done log_pass "Initialize start + cancel/suspend + uninit + start works" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh index a8d06d464851..a25fabfaee7d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh @@ -37,24 +37,31 @@ # 3. Start initializing and verify that initializing is active. # 4. Write more data to the pool. # 5. Run zdb to validate checksums. +# 6. Repeat for other VDEVs. 
# DISK1=${DISKS%% *} -log_must zpool create -f $TESTPOOL $DISK1 -log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=30 -sync_all_pools +for type in "" "anymirror0"; do -log_must zpool initialize $TESTPOOL + log_must zpool create -f $TESTPOOL $type $DISK1 + log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=30 + sync_all_pools -log_must zdb -cc $TESTPOOL + log_must zpool initialize $TESTPOOL -[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ - log_fail "Initializing did not start" + log_must zdb -cc $TESTPOOL -log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=30 -sync_all_pools + [[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initializing did not start" -log_must zdb -cc $TESTPOOL + log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=30 + sync_all_pools + + log_must zdb -cc $TESTPOOL + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done log_pass "Initializing does not corrupt existing or new data" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh index 92e6164d637d..00a9f21896da 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh @@ -59,30 +59,37 @@ log_must set_tunable64 INITIALIZE_VALUE $(printf %llu 0x$PATTERN) log_must mkdir "$TESTDIR" log_must truncate -s $MINVDEVSIZE "$SMALLFILE" -log_must zpool create $TESTPOOL "$SMALLFILE" -log_must zpool initialize -w $TESTPOOL -log_must zpool export $TESTPOOL -metaslabs=0 -bs=512 -zdb -p $TESTDIR -Pme $TESTPOOL | awk '/metaslab[ ]+[0-9]+/ { print $4, $8 }' | -while read -r offset size; do - log_note "offset: '$offset'" - log_note "size: '$size'" +for type in "" "anymirror0"; do - metaslabs=$((metaslabs + 1)) - offset=$(((4 * 1024 * 
1024) + 16#$offset)) - log_note "vdev file offset: '$offset'" + log_must zpool create $TESTPOOL $type "$SMALLFILE" + log_must zpool initialize -w $TESTPOOL + log_must zpool export $TESTPOOL - # Note we use '-t x4' instead of '-t x8' here because x8 is not - # a supported format on FreeBSD. - dd if=$SMALLFILE skip=$((offset / bs)) count=$((size / bs)) bs=$bs | - od -t x4 -Ad | grep -qE "deadbeef +deadbeef +deadbeef +deadbeef" || - log_fail "Pattern not found in metaslab free space" -done + metaslabs=0 + bs=512 + zdb -p $TESTDIR -Pme $TESTPOOL | awk '/metaslab[ ]+[0-9]+/ { print $4, $8 }' | + while read -r offset size; do + log_note "offset: '$offset'" + log_note "size: '$size'" + + metaslabs=$((metaslabs + 1)) + offset=$(((4 * 1024 * 1024) + 16#$offset)) + log_note "vdev file offset: '$offset'" + + # Note we use '-t x4' instead of '-t x8' here because x8 is not + # a supported format on FreeBSD. + dd if=$SMALLFILE skip=$((offset / bs)) count=$((size / bs)) bs=$bs | + od -t x4 -Ad | grep -qE "deadbeef +deadbeef +deadbeef +deadbeef" || + log_fail "Pattern not found in metaslab free space" + done -if [[ $metaslabs -eq 0 ]]; then - log_fail "Did not find any metaslabs to check" -else - log_pass "Initializing wrote to each metaslab" -fi + if [[ $metaslabs -eq 0 ]]; then + log_fail "Did not find any metaslabs to check" + else + log_pass "Initializing wrote to each metaslab" + fi + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + +done diff --git a/tests/zfs-tests/tests/functional/direct/dio.kshlib b/tests/zfs-tests/tests/functional/direct/dio.kshlib index 33564ccc71e6..c8a6e5c00ac6 100644 --- a/tests/zfs-tests/tests/functional/direct/dio.kshlib +++ b/tests/zfs-tests/tests/functional/direct/dio.kshlib @@ -261,19 +261,6 @@ function check_read # pool file bs count skip flags buf_rd dio_rd fi } -function get_file_size -{ - typeset filename="$1" - - if is_linux; then - filesize=$(stat -c %s $filename) - else - filesize=$(stat -s $filename | awk '{print $8}' | grep -o 
'[0-9]\+') - fi - - echo $filesize -} - function do_truncate_reduce { typeset filename=$1 diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh index 6397e26b5d89..9feded3dfe03 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh @@ -56,7 +56,14 @@ zed_events_drain TESTFILE="/$TESTPOOL/$TESTFS/testfile" -for type in "mirror" "raidz" "raidz2" "draid:1s"; do +for type in "mirror" "raidz" "raidz2" "draid:1s" "anymirror1" "anymirror2" "anymirror3" "anyraidz2:2" ; do + if [[ "$type" =~ "any" ]]; then + export VDEVSIZE=1073741824 + export TESTFILE_SIZE=268435456 + else + export VDEVSIZE=$MINVDEVSIZE + export TESTFILE_SIZE=67108864 + fi if [ "$type" = "draid:1s" ]; then # 1. Create a dRAID pool with a distributed hot spare # @@ -64,13 +71,13 @@ for type in "mirror" "raidz" "raidz2" "draid:1s"; do # vdev since the dRAID permutation at these offsets maps # to distributed spare space and not data devices. # - log_must truncate -s $MINVDEVSIZE $VDEV_FILES + log_must truncate -s $VDEVSIZE $VDEV_FILES log_must zpool create -f $TESTPOOL $type $VDEV_FILES SPARE="draid1-0-0" FAULT="$TEST_BASE_DIR/file-2" else # 1. Create a pool with hot spares - log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must truncate -s $VDEVSIZE $VDEV_FILES $SPARE_FILE log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ spare $SPARE_FILE SPARE=$SPARE_FILE @@ -79,14 +86,14 @@ for type in "mirror" "raidz" "raidz2" "draid:1s"; do # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS - log_must zfs set recordsize=16k $TESTPOOL/$TESTFS + log_must zfs set recordsize=16k compression=off $TESTPOOL/$TESTFS # 3. 
Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 + log_must file_write -o create -f $TESTFILE -b 1048576 -c $(( TESTFILE_SIZE / 1024 / 1024 )) -d R # 4. Inject IO ERRORS on read with a zinject error handler log_must zinject -d $FAULT -e io -T read $TESTPOOL - log_must cp $TESTFILE /dev/null + log_must dd if=$TESTFILE of=/dev/null bs=1M count=$(( TESTFILE_SIZE / 1024 / 1024 )) # 5. Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare" diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh index 1d104fe6c106..fc5b028b7915 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh @@ -58,23 +58,30 @@ if ! is_freebsd; then fi TESTFILE="/$TESTPOOL/$TESTFS/testfile" - -for type in "mirror" "raidz" "raidz2"; do +for type in "mirror" "raidz" "raidz2" "anymirror1" "anymirror2" "anymirror3" "anyraidz2:2" ; do + if [[ "$type" =~ "any" ]]; then + export VDEVSIZE=1073741824 + export TESTFILE_SIZE=268435456 + else + export VDEVSIZE=$MINVDEVSIZE + export TESTFILE_SIZE=67108864 + fi # 1. Create a pool with hot spares - log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must truncate -s $VDEVSIZE $VDEV_FILES $SPARE_FILE log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ spare $SPARE_FILE # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS - log_must zfs set recordsize=16k $TESTPOOL/$TESTFS + log_must zfs set recordsize=16k compression=off $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 + log_must file_write -o create -f $TESTFILE -b 1048576 -c $(( TESTFILE_SIZE / 1024 / 1024 )) -d R # 4. 
Inject CHECKSUM ERRORS on read with a zinject error handler - log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL - log_must dd if=$TESTFILE of=/dev/null bs=1M count=64 + log_must zinject -d $FAULT_FILE -e corrupt -f 100 -T read $TESTPOOL + log_must dd if=$TESTFILE of=/dev/null bs=1M count=$(( TESTFILE_SIZE / 1024 / 1024 )) + log_must zinject # 5. Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare" diff --git a/tests/zfs-tests/tests/functional/fault/cleanup.ksh b/tests/zfs-tests/tests/functional/fault/cleanup.ksh index 8801991263cc..bab3de0fdbfb 100755 --- a/tests/zfs-tests/tests/functional/fault/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/fault/cleanup.ksh @@ -35,4 +35,6 @@ zed_stop zed_cleanup resilver_finish-start-scrub.sh zed_events_drain +restore_tunable ANYRAID_MIN_TILE_SIZE + log_pass diff --git a/tests/zfs-tests/tests/functional/fault/fault.cfg b/tests/zfs-tests/tests/functional/fault/fault.cfg index 30887f290ed4..7773709ba23b 100644 --- a/tests/zfs-tests/tests/functional/fault/fault.cfg +++ b/tests/zfs-tests/tests/functional/fault/fault.cfg @@ -50,6 +50,6 @@ if is_linux; then fi export VDEV_FILES="$TEST_BASE_DIR/file-1 $TEST_BASE_DIR/file-2 \ - $TEST_BASE_DIR/file-3 $TEST_BASE_DIR/file-4" + $TEST_BASE_DIR/file-3 $TEST_BASE_DIR/file-4 $TEST_BASE_DIR/file-5" export SPARE_FILE="$TEST_BASE_DIR/spare-1" export FAULT_FILE="$TEST_BASE_DIR/file-1" diff --git a/tests/zfs-tests/tests/functional/fault/setup.ksh b/tests/zfs-tests/tests/functional/fault/setup.ksh index 6ca860ed6153..0357e35785b6 100755 --- a/tests/zfs-tests/tests/functional/fault/setup.ksh +++ b/tests/zfs-tests/tests/functional/fault/setup.ksh @@ -29,6 +29,9 @@ verify_runnable "global" +log_must save_tunable ANYRAID_MIN_TILE_SIZE +log_must set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + zed_events_drain zed_setup resilver_finish-start-scrub.sh zed_start diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh 
b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh index a8deedfb8c3c..d72fdaafa82c 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh @@ -26,7 +26,7 @@ # # DESCRIPTION: -# Check various pool geometries stripe, mirror, raidz) +# Check various pool geometries stripe, mirror, anyraid, raidz. # # STRATEGY: # 1. Create a pool on file vdevs to trim. @@ -36,7 +36,7 @@ # 5. Remove all files making it possible to trim the entire pool. # 6. Wait for auto trim to issue trim IOs for the free blocks. # 7. Verify the disks contain 30% or less allocated blocks. -# 8. Repeat for test for striped, mirrored, and RAIDZ pools. +# 8. Repeat for test for striped, mirrored, AnyRAID, and RAIDZ pools. verify_runnable "global" @@ -70,13 +70,21 @@ log_must set_tunable64 VDEV_MIN_MS_COUNT 32 typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) +typeset TXGS=64 -for type in "" "mirror" "raidz2" "draid"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "anyraidz1:2" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" + elif [[ "$type" =~ "any" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + + # The per-vdev utilization is lower due to the capacity + # used by the tile map + VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) + TXGS=128 elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" elif [[ "$type" = "draid" ]]; then @@ -101,7 +109,7 @@ for type in "" "mirror" "raidz2" "draid"; do # Remove the file, wait for trim, verify the vdevs are now sparse. 
log_must rm /$TESTPOOL/file - wait_trim_io $TESTPOOL "ind" 64 + wait_trim_io $TESTPOOL "ind" $TXGS verify_vdevs "-le" "$VDEV_MIN_MB" $VDEVS log_must zpool destroy $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh index 1995dbe6fa5c..c08b4e187277 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh @@ -34,7 +34,7 @@ # 3. Generate some interesting pool data which can be trimmed. # 4. Verify trim IOs of the expected type were issued for the pool. # 5. Verify data integrity of the pool after trim. -# 6. Repeat test for striped, mirrored, and RAIDZ pools. +# 6. Repeat test for striped, mirrored, AnyRAID, and RAIDZ pools. verify_runnable "global" @@ -61,7 +61,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "draid"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "anyraidz1:2" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh index 440f2bd1302a..9ec15cc372f9 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh @@ -35,7 +35,7 @@ # 4. While generating data issue manual trims. # 4. Verify trim IOs of the expected type were issued for the pool. # 5. Verify data integrity of the pool after trim. -# 6. Repeat test for striped, mirrored, and RAIDZ pools. +# 6. Repeat test for striped, mirrored, AnyRAID, and RAIDZ pools. 
verify_runnable "global" @@ -62,7 +62,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "draid" "draid2"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "anyraidz1:2" "raidz" "raidz2" "draid" "draid2"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/cleanup.ksh b/tests/zfs-tests/tests/functional/trim/cleanup.ksh index faeefb8e5acd..ada38bd1d4fa 100755 --- a/tests/zfs-tests/tests/functional/trim/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/trim/cleanup.ksh @@ -46,4 +46,6 @@ TRIM_VDEVS="$TRIM_DIR/trim-vdev1 $TRIM_DIR/trim-vdev2 \ rm -rf $TRIM_VDEVS +restore_tunable ANYRAID_MIN_TILE_SIZE + default_cleanup diff --git a/tests/zfs-tests/tests/functional/trim/setup.ksh b/tests/zfs-tests/tests/functional/trim/setup.ksh index 7be2a316a873..de44ff82f5d7 100755 --- a/tests/zfs-tests/tests/functional/trim/setup.ksh +++ b/tests/zfs-tests/tests/functional/trim/setup.ksh @@ -41,4 +41,7 @@ else fi fi +log_must save_tunable ANYRAID_MIN_TILE_SIZE +log_must set_tunable64 ANYRAID_MIN_TILE_SIZE 67108864 + log_pass diff --git a/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/tests/zfs-tests/tests/functional/trim/trim_config.ksh index ff569177357b..01e5a5f87b4d 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_config.ksh @@ -36,7 +36,7 @@ # 5. Manually trim the pool. # 6. Wait for trim to issue trim IOs for the free blocks. # 7. Verify the disks contain 30% or less allocated blocks. -# 8. Repeat for test for striped, mirrored, and RAIDZ pools. +# 8. Repeat for test for striped, mirrored, AnyRAID, and RAIDZ pools. 
verify_runnable "global" @@ -68,23 +68,43 @@ log_must set_tunable64 TRIM_TXG_BATCH 8 typeset vdev_min_ms_count=$(get_tunable VDEV_MIN_MS_COUNT) log_must set_tunable64 VDEV_MIN_MS_COUNT 32 -typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) -typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) +typeset VDEV_MAX_MB=$(( 4 * MINVDEVSIZE / 1024 / 1024 )) +typeset VDEV_MIN_MB=0 -for type in "" "mirror" "raidz2" "draid"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "anyraidz1:2" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" + elif [[ "$type" = "anymirror0" ]]; then + VDEVS="$TRIM_VDEV1" + elif [[ "$type" = "anymirror1" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" + elif [[ "$type" = "anymirror2" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" + elif [[ "$type" = "anymirror3" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + elif [[ "$type" = "anyraidz1:2" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" elif [[ "$type" = "draid" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + fi + if [[ "$type" =~ "anymirror" ]]; then + # The AnyRAID VDEV takes some space for the mapping itself + VDEV_MAX_MB=$(( floor(3 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) + VDEV_MIN_MB=$(( floor(3 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) + elif [[ "$type" = "draid" ]]; then # The per-vdev utilization is lower due to the capacity # resilverd for the distributed spare. 
VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) + VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) + else + VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) + VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) fi log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS diff --git a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh index f298f66a44d8..e3dd1aed11c5 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh @@ -34,7 +34,7 @@ # 3. Manually trim the pool. # 4. Verify trim IOs of the expected type were issued for the pool. # 5. Verify data integrity of the pool after trim. -# 6. Repeat test for striped, mirrored, and RAIDZ pools. +# 6. Repeat test for striped, mirrored, AnyRAID, and RAIDZ pools. verify_runnable "global" @@ -61,7 +61,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "draid"; do +for type in "" "mirror" "anymirror0" "anymirror1" "anymirror2" "anymirror3" "anyraidz1:2" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS