LCOV - code coverage report
Current view: top level - lib/blob - blobstore.c (source / functions) Hit Total Coverage
Test: ut_cov_unit.info Lines: 4056 5084 79.8 %
Date: 2024-12-02 15:19:40 Functions: 339 361 93.9 %

          Line data    Source code
       1             : /*   SPDX-License-Identifier: BSD-3-Clause
       2             :  *   Copyright (C) 2017 Intel Corporation.
       3             :  *   All rights reserved.
       4             :  *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
       5             :  */
       6             : 
       7             : #include "spdk/stdinc.h"
       8             : 
       9             : #include "spdk/blob.h"
      10             : #include "spdk/crc32.h"
      11             : #include "spdk/env.h"
      12             : #include "spdk/queue.h"
      13             : #include "spdk/thread.h"
      14             : #include "spdk/bit_array.h"
      15             : #include "spdk/bit_pool.h"
      16             : #include "spdk/likely.h"
      17             : #include "spdk/util.h"
      18             : #include "spdk/string.h"
      19             : #include "spdk/trace.h"
      20             : 
      21             : #include "spdk_internal/assert.h"
      22             : #include "spdk_internal/trace_defs.h"
      23             : #include "spdk/log.h"
      24             : 
      25             : #include "blobstore.h"
      26             : 
      27             : #define BLOB_CRC32C_INITIAL    0xffffffffUL
      28             : 
      29             : static int bs_register_md_thread(struct spdk_blob_store *bs);
      30             : static int bs_unregister_md_thread(struct spdk_blob_store *bs);
      31             : static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
      32             : static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
      33             :                 uint64_t cluster, uint32_t extent, struct spdk_blob_md_page *page,
      34             :                 spdk_blob_op_complete cb_fn, void *cb_arg);
      35             : static void blob_free_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
      36             :                 uint32_t extent_page, struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg);
      37             : 
      38             : static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
      39             :                           uint16_t value_len, bool internal);
      40             : static int blob_get_xattr_value(struct spdk_blob *blob, const char *name,
      41             :                                 const void **value, size_t *value_len, bool internal);
      42             : static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal);
      43             : 
      44             : static void blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
      45             :                                    struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg);
      46             : static void blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg);
      47             : 
      48             : static void bs_shallow_copy_cluster_find_next(void *cb_arg);
      49             : 
      50             : /*
      51             :  * External snapshots require a channel per thread per esnap bdev.  The tree
      52             :  * is populated lazily as blob IOs are handled by the back_bs_dev. When this
      53             :  * channel is destroyed, all the channels in the tree are destroyed.
      54             :  */
      55             : 
      56             : struct blob_esnap_channel {
      57             :         RB_ENTRY(blob_esnap_channel)    node;
      58             :         spdk_blob_id                    blob_id;
      59             :         struct spdk_io_channel          *channel;
      60             : };
      61             : 
      62             : static int blob_esnap_channel_compare(struct blob_esnap_channel *c1, struct blob_esnap_channel *c2);
      63             : static void blob_esnap_destroy_bs_dev_channels(struct spdk_blob *blob, bool abort_io,
      64             :                 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg);
      65             : static void blob_esnap_destroy_bs_channel(struct spdk_bs_channel *ch);
      66             : static void blob_set_back_bs_dev_frozen(void *_ctx, int bserrno);
      67       19201 : RB_GENERATE_STATIC(blob_esnap_channel_tree, blob_esnap_channel, node, blob_esnap_channel_compare)
      68             : 
      69             : static inline bool
      70       68456 : blob_is_esnap_clone(const struct spdk_blob *blob)
      71             : {
      72       68456 :         assert(blob != NULL);
      73       68456 :         return !!(blob->invalid_flags & SPDK_BLOB_EXTERNAL_SNAPSHOT);
      74             : }
      75             : 
      76             : static int
      77        2875 : blob_id_cmp(struct spdk_blob *blob1, struct spdk_blob *blob2)
      78             : {
      79        2875 :         assert(blob1 != NULL && blob2 != NULL);
      80        2875 :         return (blob1->id < blob2->id ? -1 : blob1->id > blob2->id);
      81             : }
      82             : 
      83       18769 : RB_GENERATE_STATIC(spdk_blob_tree, spdk_blob, link, blob_id_cmp);
      84             : 
      85             : static void
      86       46177 : blob_verify_md_op(struct spdk_blob *blob)
      87             : {
      88       46177 :         assert(blob != NULL);
      89       46177 :         assert(spdk_get_thread() == blob->bs->md_thread);
      90       46177 :         assert(blob->state != SPDK_BLOB_STATE_LOADING);
      91       46177 : }
      92             : 
      93             : static struct spdk_blob_list *
      94        4783 : bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid)
      95             : {
      96        4783 :         struct spdk_blob_list *snapshot_entry = NULL;
      97             : 
      98        6018 :         TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
      99        2207 :                 if (snapshot_entry->id == blobid) {
     100         972 :                         break;
     101             :                 }
     102             :         }
     103             : 
     104        4783 :         return snapshot_entry;
     105             : }
     106             : 
     107             : static void
     108        3807 : bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page)
     109             : {
     110        3807 :         assert(spdk_spin_held(&bs->used_lock));
     111        3807 :         assert(page < spdk_bit_array_capacity(bs->used_md_pages));
     112        3807 :         assert(spdk_bit_array_get(bs->used_md_pages, page) == false);
     113             : 
     114        3807 :         spdk_bit_array_set(bs->used_md_pages, page);
     115        3807 : }
     116             : 
     117             : static void
     118        2901 : bs_release_md_page(struct spdk_blob_store *bs, uint32_t page)
     119             : {
     120        2901 :         assert(spdk_spin_held(&bs->used_lock));
     121        2901 :         assert(page < spdk_bit_array_capacity(bs->used_md_pages));
     122        2901 :         assert(spdk_bit_array_get(bs->used_md_pages, page) == true);
     123             : 
     124        2901 :         spdk_bit_array_clear(bs->used_md_pages, page);
     125        2901 : }
     126             : 
     127             : static uint32_t
     128       10283 : bs_claim_cluster(struct spdk_blob_store *bs)
     129             : {
     130             :         uint32_t cluster_num;
     131             : 
     132       10283 :         assert(spdk_spin_held(&bs->used_lock));
     133             : 
     134       10283 :         cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters);
     135       10283 :         if (cluster_num == UINT32_MAX) {
     136           0 :                 return UINT32_MAX;
     137             :         }
     138             : 
     139       10283 :         SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num);
     140       10283 :         bs->num_free_clusters--;
     141             : 
     142       10283 :         return cluster_num;
     143             : }
     144             : 
     145             : static void
     146        2996 : bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
     147             : {
     148        2996 :         assert(spdk_spin_held(&bs->used_lock));
     149        2996 :         assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters));
     150        2996 :         assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true);
     151        2996 :         assert(bs->num_free_clusters < bs->total_clusters);
     152             : 
     153        2996 :         SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num);
     154             : 
     155        2996 :         spdk_bit_pool_free_bit(bs->used_clusters, cluster_num);
     156        2996 :         bs->num_free_clusters++;
     157        2996 : }
     158             : 
     159             : static int
     160       10283 : blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster)
     161             : {
     162       10283 :         uint64_t *cluster_lba = &blob->active.clusters[cluster_num];
     163             : 
     164       10283 :         blob_verify_md_op(blob);
     165             : 
     166       10283 :         if (*cluster_lba != 0) {
     167           5 :                 return -EEXIST;
     168             :         }
     169             : 
     170       10278 :         *cluster_lba = bs_cluster_to_lba(blob->bs, cluster);
     171       10278 :         blob->active.num_allocated_clusters++;
     172             : 
     173       10278 :         return 0;
     174             : }
     175             : 
     176             : static int
     177       10283 : bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num,
     178             :                     uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map)
     179             : {
     180       10283 :         uint32_t *extent_page = 0;
     181             : 
     182       10283 :         assert(spdk_spin_held(&blob->bs->used_lock));
     183             : 
     184       10283 :         *cluster = bs_claim_cluster(blob->bs);
     185       10283 :         if (*cluster == UINT32_MAX) {
     186             :                 /* No more free clusters. Cannot satisfy the request */
     187           0 :                 return -ENOSPC;
     188             :         }
     189             : 
     190       10283 :         if (blob->use_extent_table) {
     191        6227 :                 extent_page = bs_cluster_to_extent_page(blob, cluster_num);
     192        6227 :                 if (*extent_page == 0) {
     193             :                         /* Extent page shall never occupy md_page so start the search from 1 */
     194        1087 :                         if (*lowest_free_md_page == 0) {
     195        1084 :                                 *lowest_free_md_page = 1;
     196             :                         }
     197             :                         /* No extent_page is allocated for the cluster */
     198        1087 :                         *lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages,
     199             :                                                *lowest_free_md_page);
     200        1087 :                         if (*lowest_free_md_page == UINT32_MAX) {
     201             :                                 /* No more free md pages. Cannot satisfy the request */
     202           0 :                                 bs_release_cluster(blob->bs, *cluster);
     203           0 :                                 return -ENOSPC;
     204             :                         }
     205        1087 :                         bs_claim_md_page(blob->bs, *lowest_free_md_page);
     206             :                 }
     207             :         }
     208             : 
     209       10283 :         SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob 0x%" PRIx64 "\n", *cluster,
     210             :                       blob->id);
     211             : 
     212       10283 :         if (update_map) {
     213        9253 :                 blob_insert_cluster(blob, cluster_num, *cluster);
     214        9253 :                 if (blob->use_extent_table && *extent_page == 0) {
     215         958 :                         *extent_page = *lowest_free_md_page;
     216             :                 }
     217             :         }
     218             : 
     219       10283 :         return 0;
     220             : }
     221             : 
     222             : static void
     223        6977 : blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs)
     224             : {
     225        6977 :         xattrs->count = 0;
     226        6977 :         xattrs->names = NULL;
     227        6977 :         xattrs->ctx = NULL;
     228        6977 :         xattrs->get_value = NULL;
     229        6977 : }
     230             : 
     231             : void
     232        4611 : spdk_blob_opts_init(struct spdk_blob_opts *opts, size_t opts_size)
     233             : {
     234        4611 :         if (!opts) {
     235           0 :                 SPDK_ERRLOG("opts should not be NULL\n");
     236           0 :                 return;
     237             :         }
     238             : 
     239        4611 :         if (!opts_size) {
     240           0 :                 SPDK_ERRLOG("opts_size should not be zero value\n");
     241           0 :                 return;
     242             :         }
     243             : 
     244        4611 :         memset(opts, 0, opts_size);
     245        4611 :         opts->opts_size = opts_size;
     246             : 
     247             : #define FIELD_OK(field) \
     248             :         offsetof(struct spdk_blob_opts, field) + sizeof(opts->field) <= opts_size
     249             : 
     250             : #define SET_FIELD(field, value) \
     251             :         if (FIELD_OK(field)) { \
     252             :                 opts->field = value; \
     253             :         } \
     254             : 
     255        4611 :         SET_FIELD(num_clusters, 0);
     256        4611 :         SET_FIELD(thin_provision, false);
     257        4611 :         SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);
     258             : 
     259        4611 :         if (FIELD_OK(xattrs)) {
     260        4611 :                 blob_xattrs_init(&opts->xattrs);
     261             :         }
     262             : 
     263        4611 :         SET_FIELD(use_extent_table, true);
     264             : 
     265             : #undef FIELD_OK
     266             : #undef SET_FIELD
     267             : }
     268             : 
     269             : void
     270        4346 : spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts, size_t opts_size)
     271             : {
     272        4346 :         if (!opts) {
     273           0 :                 SPDK_ERRLOG("opts should not be NULL\n");
     274           0 :                 return;
     275             :         }
     276             : 
     277        4346 :         if (!opts_size) {
     278           0 :                 SPDK_ERRLOG("opts_size should not be zero value\n");
     279           0 :                 return;
     280             :         }
     281             : 
     282        4346 :         memset(opts, 0, opts_size);
     283        4346 :         opts->opts_size = opts_size;
     284             : 
     285             : #define FIELD_OK(field) \
     286             :         offsetof(struct spdk_blob_open_opts, field) + sizeof(opts->field) <= opts_size
     287             : 
     288             : #define SET_FIELD(field, value) \
     289             :         if (FIELD_OK(field)) { \
     290             :                 opts->field = value; \
     291             :         } \
     292             : 
     293        4346 :         SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);
     294             : 
     295             : #undef FIELD_OK
     296             : #undef SET_FILED
     297             : }
     298             : 
     299             : static struct spdk_blob *
     300        6707 : blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
     301             : {
     302             :         struct spdk_blob *blob;
     303             : 
     304        6707 :         blob = calloc(1, sizeof(*blob));
     305        6707 :         if (!blob) {
     306           0 :                 return NULL;
     307             :         }
     308             : 
     309        6707 :         blob->id = id;
     310        6707 :         blob->bs = bs;
     311             : 
     312        6707 :         blob->parent_id = SPDK_BLOBID_INVALID;
     313             : 
     314        6707 :         blob->state = SPDK_BLOB_STATE_DIRTY;
     315        6707 :         blob->extent_rle_found = false;
     316        6707 :         blob->extent_table_found = false;
     317        6707 :         blob->active.num_pages = 1;
     318        6707 :         blob->active.pages = calloc(1, sizeof(*blob->active.pages));
     319        6707 :         if (!blob->active.pages) {
     320           0 :                 free(blob);
     321           0 :                 return NULL;
     322             :         }
     323             : 
     324        6707 :         blob->active.pages[0] = bs_blobid_to_page(id);
     325             : 
     326        6707 :         TAILQ_INIT(&blob->xattrs);
     327        6707 :         TAILQ_INIT(&blob->xattrs_internal);
     328        6707 :         TAILQ_INIT(&blob->pending_persists);
     329        6707 :         TAILQ_INIT(&blob->persists_to_complete);
     330             : 
     331        6707 :         return blob;
     332             : }
     333             : 
     334             : static void
     335       13414 : xattrs_free(struct spdk_xattr_tailq *xattrs)
     336             : {
     337             :         struct spdk_xattr       *xattr, *xattr_tmp;
     338             : 
     339       15601 :         TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
     340        2187 :                 TAILQ_REMOVE(xattrs, xattr, link);
     341        2187 :                 free(xattr->name);
     342        2187 :                 free(xattr->value);
     343        2187 :                 free(xattr);
     344             :         }
     345       13414 : }
     346             : 
     347             : static void
     348        1398 : blob_unref_back_bs_dev(struct spdk_blob *blob)
     349             : {
     350        1398 :         blob->back_bs_dev->destroy(blob->back_bs_dev);
     351        1398 :         blob->back_bs_dev = NULL;
     352        1398 : }
     353             : 
     354             : static void
     355        6707 : blob_free(struct spdk_blob *blob)
     356             : {
     357        6707 :         assert(blob != NULL);
     358        6707 :         assert(TAILQ_EMPTY(&blob->pending_persists));
     359        6707 :         assert(TAILQ_EMPTY(&blob->persists_to_complete));
     360             : 
     361        6707 :         free(blob->active.extent_pages);
     362        6707 :         free(blob->clean.extent_pages);
     363        6707 :         free(blob->active.clusters);
     364        6707 :         free(blob->clean.clusters);
     365        6707 :         free(blob->active.pages);
     366        6707 :         free(blob->clean.pages);
     367             : 
     368        6707 :         xattrs_free(&blob->xattrs);
     369        6707 :         xattrs_free(&blob->xattrs_internal);
     370             : 
     371        6707 :         if (blob->back_bs_dev) {
     372        1363 :                 blob_unref_back_bs_dev(blob);
     373             :         }
     374             : 
     375        6707 :         free(blob);
     376        6707 : }
     377             : 
     378             : static void
     379         406 : blob_back_bs_destroy_esnap_done(void *ctx, struct spdk_blob *blob, int bserrno)
     380             : {
     381         406 :         struct spdk_bs_dev      *bs_dev = ctx;
     382             : 
     383         406 :         if (bserrno != 0) {
     384             :                 /*
     385             :                  * This is probably due to a memory allocation failure when creating the
     386             :                  * blob_esnap_destroy_ctx before iterating threads.
     387             :                  */
     388           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 ": Unable to destroy bs dev channels: error %d\n",
     389             :                             blob->id, bserrno);
     390           0 :                 assert(false);
     391             :         }
     392             : 
     393         406 :         if (bs_dev == NULL) {
     394             :                 /*
     395             :                  * This check exists to make scanbuild happy.
     396             :                  *
     397             :                  * blob->back_bs_dev for an esnap is NULL during the first iteration of blobs while
     398             :                  * the blobstore is being loaded. It could also be NULL if there was an error
     399             :                  * opening the esnap device. In each of these cases, no channels could have been
     400             :                  * created because back_bs_dev->create_channel() would have led to a NULL pointer
     401             :                  * deref.
     402             :                  */
     403           0 :                 assert(false);
     404             :                 return;
     405             :         }
     406             : 
     407         406 :         SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": calling destroy on back_bs_dev\n", blob->id);
     408         406 :         bs_dev->destroy(bs_dev);
     409             : }
     410             : 
     411             : static void
     412         406 : blob_back_bs_destroy(struct spdk_blob *blob)
     413             : {
     414         406 :         SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": preparing to destroy back_bs_dev\n",
     415             :                       blob->id);
     416             : 
     417         406 :         blob_esnap_destroy_bs_dev_channels(blob, false, blob_back_bs_destroy_esnap_done,
     418         406 :                                            blob->back_bs_dev);
     419         406 :         blob->back_bs_dev = NULL;
     420         406 : }
     421             : 
     422             : struct blob_parent {
     423             :         union {
     424             :                 struct {
     425             :                         spdk_blob_id id;
     426             :                         struct spdk_blob *blob;
     427             :                 } snapshot;
     428             : 
     429             :                 struct {
     430             :                         void *id;
     431             :                         uint32_t id_len;
     432             :                         struct spdk_bs_dev *back_bs_dev;
     433             :                 } esnap;
     434             :         } u;
     435             : };
     436             : 
     437             : typedef int (*set_parent_refs_cb)(struct spdk_blob *blob, struct blob_parent *parent);
     438             : 
     439             : struct set_bs_dev_ctx {
     440             :         struct spdk_blob        *blob;
     441             :         struct spdk_bs_dev      *back_bs_dev;
     442             : 
     443             :         /*
     444             :          * This callback is used during a set parent operation to change the references
     445             :          * to the parent of the blob.
     446             :          */
     447             :         set_parent_refs_cb      parent_refs_cb_fn;
     448             :         struct blob_parent      *parent_refs_cb_arg;
     449             : 
     450             :         spdk_blob_op_complete   cb_fn;
     451             :         void                    *cb_arg;
     452             :         int                     bserrno;
     453             : };
     454             : 
     455             : static void
     456          35 : blob_set_back_bs_dev(struct spdk_blob *blob, struct spdk_bs_dev *back_bs_dev,
     457             :                      set_parent_refs_cb parent_refs_cb_fn, struct blob_parent *parent_refs_cb_arg,
     458             :                      spdk_blob_op_complete cb_fn, void *cb_arg)
     459             : {
     460             :         struct set_bs_dev_ctx   *ctx;
     461             : 
     462          35 :         ctx = calloc(1, sizeof(*ctx));
     463          35 :         if (ctx == NULL) {
     464           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 ": out of memory while setting back_bs_dev\n",
     465             :                             blob->id);
     466           0 :                 cb_fn(cb_arg, -ENOMEM);
     467           0 :                 return;
     468             :         }
     469             : 
     470          35 :         ctx->parent_refs_cb_fn = parent_refs_cb_fn;
     471          35 :         ctx->parent_refs_cb_arg = parent_refs_cb_arg;
     472          35 :         ctx->cb_fn = cb_fn;
     473          35 :         ctx->cb_arg = cb_arg;
     474          35 :         ctx->back_bs_dev = back_bs_dev;
     475          35 :         ctx->blob = blob;
     476             : 
     477          35 :         blob_freeze_io(blob, blob_set_back_bs_dev_frozen, ctx);
     478             : }
     479             : 
     480             : struct freeze_io_ctx {
     481             :         struct spdk_bs_cpl cpl;
     482             :         struct spdk_blob *blob;
     483             : };
     484             : 
     485             : static void
     486         663 : blob_io_sync(struct spdk_io_channel_iter *i)
     487             : {
     488         663 :         spdk_for_each_channel_continue(i, 0);
     489         663 : }
     490             : 
     491             : static void
     492         648 : blob_execute_queued_io(struct spdk_io_channel_iter *i)
     493             : {
     494         648 :         struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
     495         648 :         struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
     496         648 :         struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
     497             :         struct spdk_bs_request_set      *set;
     498             :         struct spdk_bs_user_op_args     *args;
     499             :         spdk_bs_user_op_t *op, *tmp;
     500             : 
     501         653 :         TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
     502           5 :                 set = (struct spdk_bs_request_set *)op;
     503           5 :                 args = &set->u.user_op;
     504             : 
     505           5 :                 if (args->blob == ctx->blob) {
     506           5 :                         TAILQ_REMOVE(&ch->queued_io, op, link);
     507           5 :                         bs_user_op_execute(op);
     508             :                 }
     509             :         }
     510             : 
     511         648 :         spdk_for_each_channel_continue(i, 0);
     512         648 : }
     513             : 
     514             : static void
     515        1271 : blob_io_cpl(struct spdk_io_channel_iter *i, int status)
     516             : {
     517        1271 :         struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
     518             : 
     519        1271 :         ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);
     520             : 
     521        1271 :         free(ctx);
     522        1271 : }
     523             : 
     524             : static void
     525         643 : blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
     526             : {
     527             :         struct freeze_io_ctx *ctx;
     528             : 
     529         643 :         blob_verify_md_op(blob);
     530             : 
     531         643 :         ctx = calloc(1, sizeof(*ctx));
     532         643 :         if (!ctx) {
     533           0 :                 cb_fn(cb_arg, -ENOMEM);
     534           0 :                 return;
     535             :         }
     536             : 
     537         643 :         ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
     538         643 :         ctx->cpl.u.blob_basic.cb_fn = cb_fn;
     539         643 :         ctx->cpl.u.blob_basic.cb_arg = cb_arg;
     540         643 :         ctx->blob = blob;
     541             : 
     542             :         /* Freeze I/O on blob */
     543         643 :         blob->frozen_refcnt++;
     544             : 
     545         643 :         spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl);
     546             : }
     547             : 
     548             : static void
     549         628 : blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
     550             : {
     551             :         struct freeze_io_ctx *ctx;
     552             : 
     553         628 :         blob_verify_md_op(blob);
     554             : 
     555         628 :         ctx = calloc(1, sizeof(*ctx));
     556         628 :         if (!ctx) {
     557           0 :                 cb_fn(cb_arg, -ENOMEM);
     558           0 :                 return;
     559             :         }
     560             : 
     561         628 :         ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
     562         628 :         ctx->cpl.u.blob_basic.cb_fn = cb_fn;
     563         628 :         ctx->cpl.u.blob_basic.cb_arg = cb_arg;
     564         628 :         ctx->blob = blob;
     565             : 
     566         628 :         assert(blob->frozen_refcnt > 0);
     567             : 
     568         628 :         blob->frozen_refcnt--;
     569             : 
     570         628 :         spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl);
     571             : }
     572             : 
     573             : static int
     574       10498 : blob_mark_clean(struct spdk_blob *blob)
     575             : {
     576       10498 :         uint32_t *extent_pages = NULL;
     577       10498 :         uint64_t *clusters = NULL;
     578       10498 :         uint32_t *pages = NULL;
     579             : 
     580       10498 :         assert(blob != NULL);
     581             : 
     582       10498 :         if (blob->active.num_extent_pages) {
     583        4258 :                 assert(blob->active.extent_pages);
     584        4258 :                 extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages));
     585        4258 :                 if (!extent_pages) {
     586           0 :                         return -ENOMEM;
     587             :                 }
     588        4258 :                 memcpy(extent_pages, blob->active.extent_pages,
     589        4258 :                        blob->active.num_extent_pages * sizeof(*extent_pages));
     590             :         }
     591             : 
     592       10498 :         if (blob->active.num_clusters) {
     593        7349 :                 assert(blob->active.clusters);
     594        7349 :                 clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters));
     595        7349 :                 if (!clusters) {
     596           0 :                         free(extent_pages);
     597           0 :                         return -ENOMEM;
     598             :                 }
     599        7349 :                 memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
     600             :         }
     601             : 
     602       10498 :         if (blob->active.num_pages) {
     603        8641 :                 assert(blob->active.pages);
     604        8641 :                 pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages));
     605        8641 :                 if (!pages) {
     606           0 :                         free(extent_pages);
     607           0 :                         free(clusters);
     608           0 :                         return -ENOMEM;
     609             :                 }
     610        8641 :                 memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
     611             :         }
     612             : 
     613       10498 :         free(blob->clean.extent_pages);
     614       10498 :         free(blob->clean.clusters);
     615       10498 :         free(blob->clean.pages);
     616             : 
     617       10498 :         blob->clean.num_extent_pages = blob->active.num_extent_pages;
     618       10498 :         blob->clean.extent_pages = blob->active.extent_pages;
     619       10498 :         blob->clean.num_clusters = blob->active.num_clusters;
     620       10498 :         blob->clean.clusters = blob->active.clusters;
     621       10498 :         blob->clean.num_allocated_clusters = blob->active.num_allocated_clusters;
     622       10498 :         blob->clean.num_pages = blob->active.num_pages;
     623       10498 :         blob->clean.pages = blob->active.pages;
     624             : 
     625       10498 :         blob->active.extent_pages = extent_pages;
     626       10498 :         blob->active.clusters = clusters;
     627       10498 :         blob->active.pages = pages;
     628             : 
     629             :         /* If the metadata was dirtied again while the metadata was being written to disk,
     630             :          *  we do not want to revert the DIRTY state back to CLEAN here.
     631             :          */
     632       10498 :         if (blob->state == SPDK_BLOB_STATE_LOADING) {
     633        4259 :                 blob->state = SPDK_BLOB_STATE_CLEAN;
     634             :         }
     635             : 
     636       10498 :         return 0;
     637             : }
     638             : 
     639             : static int
     640        1592 : blob_deserialize_xattr(struct spdk_blob *blob,
     641             :                        struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal)
     642             : {
     643             :         struct spdk_xattr                       *xattr;
     644             : 
     645        1592 :         if (desc_xattr->length != sizeof(desc_xattr->name_length) +
     646             :             sizeof(desc_xattr->value_length) +
     647        1592 :             desc_xattr->name_length + desc_xattr->value_length) {
     648           0 :                 return -EINVAL;
     649             :         }
     650             : 
     651        1592 :         xattr = calloc(1, sizeof(*xattr));
     652        1592 :         if (xattr == NULL) {
     653           0 :                 return -ENOMEM;
     654             :         }
     655             : 
     656        1592 :         xattr->name = malloc(desc_xattr->name_length + 1);
     657        1592 :         if (xattr->name == NULL) {
     658           0 :                 free(xattr);
     659           0 :                 return -ENOMEM;
     660             :         }
     661             : 
     662        1592 :         xattr->value = malloc(desc_xattr->value_length);
     663        1592 :         if (xattr->value == NULL) {
     664           0 :                 free(xattr->name);
     665           0 :                 free(xattr);
     666           0 :                 return -ENOMEM;
     667             :         }
     668             : 
     669        1592 :         memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length);
     670        1592 :         xattr->name[desc_xattr->name_length] = '\0';
     671        1592 :         xattr->value_len = desc_xattr->value_length;
     672        1592 :         memcpy(xattr->value,
     673        1592 :                (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
     674        1592 :                desc_xattr->value_length);
     675             : 
     676        1592 :         TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link);
     677             : 
     678        1592 :         return 0;
     679             : }
     680             : 
     681             : 
     682             : static int
     683        5980 : blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob)
     684             : {
     685             :         struct spdk_blob_md_descriptor *desc;
     686        5980 :         size_t  cur_desc = 0;
     687             :         void *tmp;
     688             : 
     689        5980 :         desc = (struct spdk_blob_md_descriptor *)page->descriptors;
     690       17389 :         while (cur_desc < sizeof(page->descriptors)) {
     691       17389 :                 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
     692        5920 :                         if (desc->length == 0) {
     693             :                                 /* If padding and length are 0, this terminates the page */
     694        5920 :                                 break;
     695             :                         }
     696       11469 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
     697             :                         struct spdk_blob_md_descriptor_flags    *desc_flags;
     698             : 
     699        4301 :                         desc_flags = (struct spdk_blob_md_descriptor_flags *)desc;
     700             : 
     701        4301 :                         if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) {
     702           0 :                                 return -EINVAL;
     703             :                         }
     704             : 
     705        4301 :                         if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) !=
     706             :                             SPDK_BLOB_INVALID_FLAGS_MASK) {
     707          10 :                                 return -EINVAL;
     708             :                         }
     709             : 
     710        4291 :                         if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) !=
     711             :                             SPDK_BLOB_DATA_RO_FLAGS_MASK) {
     712          15 :                                 blob->data_ro = true;
     713          15 :                                 blob->md_ro = true;
     714             :                         }
     715             : 
     716        4291 :                         if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) !=
     717             :                             SPDK_BLOB_MD_RO_FLAGS_MASK) {
     718          15 :                                 blob->md_ro = true;
     719             :                         }
     720             : 
     721        4291 :                         if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
     722         712 :                                 blob->data_ro = true;
     723         712 :                                 blob->md_ro = true;
     724             :                         }
     725             : 
     726        4291 :                         blob->invalid_flags = desc_flags->invalid_flags;
     727        4291 :                         blob->data_ro_flags = desc_flags->data_ro_flags;
     728        4291 :                         blob->md_ro_flags = desc_flags->md_ro_flags;
     729             : 
     730        7168 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
     731             :                         struct spdk_blob_md_descriptor_extent_rle       *desc_extent_rle;
     732             :                         unsigned int                            i, j;
     733        1396 :                         unsigned int                            cluster_count = blob->active.num_clusters;
     734             : 
     735        1396 :                         if (blob->extent_table_found) {
     736             :                                 /* Extent Table already present in the md,
     737             :                                  * both descriptors should never be at the same time. */
     738           0 :                                 return -EINVAL;
     739             :                         }
     740        1396 :                         blob->extent_rle_found = true;
     741             : 
     742        1396 :                         desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
     743             : 
     744        1396 :                         if (desc_extent_rle->length == 0 ||
     745        1396 :                             (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) {
     746           0 :                                 return -EINVAL;
     747             :                         }
     748             : 
     749        2968 :                         for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
     750       21240 :                                 for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
     751       19668 :                                         if (desc_extent_rle->extents[i].cluster_idx != 0) {
     752        6692 :                                                 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters,
     753        6692 :                                                                                 desc_extent_rle->extents[i].cluster_idx + j)) {
     754           0 :                                                         return -EINVAL;
     755             :                                                 }
     756             :                                         }
     757       19668 :                                         cluster_count++;
     758             :                                 }
     759             :                         }
     760             : 
     761        1396 :                         if (cluster_count == 0) {
     762           0 :                                 return -EINVAL;
     763             :                         }
     764        1396 :                         tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters));
     765        1396 :                         if (tmp == NULL) {
     766           0 :                                 return -ENOMEM;
     767             :                         }
     768        1396 :                         blob->active.clusters = tmp;
     769        1396 :                         blob->active.cluster_array_size = cluster_count;
     770             : 
     771        2968 :                         for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
     772       21240 :                                 for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
     773       19668 :                                         if (desc_extent_rle->extents[i].cluster_idx != 0) {
     774        6692 :                                                 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
     775        6692 :                                                                 desc_extent_rle->extents[i].cluster_idx + j);
     776        6692 :                                                 blob->active.num_allocated_clusters++;
     777       12976 :                                         } else if (spdk_blob_is_thin_provisioned(blob)) {
     778       12976 :                                                 blob->active.clusters[blob->active.num_clusters++] = 0;
     779             :                                         } else {
     780           0 :                                                 return -EINVAL;
     781             :                                         }
     782             :                                 }
     783             :                         }
     784        5772 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
     785             :                         struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
     786        2624 :                         uint32_t num_extent_pages = blob->active.num_extent_pages;
     787             :                         uint32_t i, j;
     788             :                         size_t extent_pages_length;
     789             : 
     790        2624 :                         desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
     791        2624 :                         extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);
     792             : 
     793        2624 :                         if (blob->extent_rle_found) {
     794             :                                 /* This means that Extent RLE is present in MD,
     795             :                                  * both should never be at the same time. */
     796           0 :                                 return -EINVAL;
     797        2624 :                         } else if (blob->extent_table_found &&
     798           0 :                                    desc_extent_table->num_clusters != blob->remaining_clusters_in_et) {
     799             :                                 /* Number of clusters in this ET does not match number
     800             :                                  * from previously read EXTENT_TABLE. */
     801           0 :                                 return -EINVAL;
     802             :                         }
     803             : 
     804        2624 :                         if (desc_extent_table->length == 0 ||
     805        2624 :                             (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
     806           0 :                                 return -EINVAL;
     807             :                         }
     808             : 
     809        2624 :                         blob->extent_table_found = true;
     810             : 
     811        4825 :                         for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
     812        2201 :                                 num_extent_pages += desc_extent_table->extent_page[i].num_pages;
     813             :                         }
     814             : 
     815        2624 :                         if (num_extent_pages > 0) {
     816        2177 :                                 tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t));
     817        2177 :                                 if (tmp == NULL) {
     818           0 :                                         return -ENOMEM;
     819             :                                 }
     820        2177 :                                 blob->active.extent_pages = tmp;
     821             :                         }
     822        2624 :                         blob->active.extent_pages_array_size = num_extent_pages;
     823             : 
     824        2624 :                         blob->remaining_clusters_in_et = desc_extent_table->num_clusters;
     825             : 
     826             :                         /* Extent table entries contain md page numbers for extent pages.
     827             :                          * Zeroes represent unallocated extent pages, those are run-length-encoded.
     828             :                          */
     829        4825 :                         for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
     830        2201 :                                 if (desc_extent_table->extent_page[i].page_idx != 0) {
     831        1565 :                                         assert(desc_extent_table->extent_page[i].num_pages == 1);
     832        3130 :                                         blob->active.extent_pages[blob->active.num_extent_pages++] =
     833        1565 :                                                 desc_extent_table->extent_page[i].page_idx;
     834         636 :                                 } else if (spdk_blob_is_thin_provisioned(blob)) {
     835        1272 :                                         for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) {
     836         636 :                                                 blob->active.extent_pages[blob->active.num_extent_pages++] = 0;
     837             :                                         }
     838             :                                 } else {
     839           0 :                                         return -EINVAL;
     840             :                                 }
     841             :                         }
     842        3148 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
     843             :                         struct spdk_blob_md_descriptor_extent_page      *desc_extent;
     844             :                         unsigned int                                    i;
     845        1556 :                         unsigned int                                    cluster_count = 0;
     846             :                         size_t                                          cluster_idx_length;
     847             : 
     848        1556 :                         if (blob->extent_rle_found) {
     849             :                                 /* This means that Extent RLE is present in MD,
     850             :                                  * both should never be at the same time. */
     851           0 :                                 return -EINVAL;
     852             :                         }
     853             : 
     854        1556 :                         desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
     855        1556 :                         cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);
     856             : 
     857        1556 :                         if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
     858        1556 :                             (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
     859           0 :                                 return -EINVAL;
     860             :                         }
     861             : 
     862       24472 :                         for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
     863       22916 :                                 if (desc_extent->cluster_idx[i] != 0) {
     864       10415 :                                         if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) {
     865           0 :                                                 return -EINVAL;
     866             :                                         }
     867             :                                 }
     868       22916 :                                 cluster_count++;
     869             :                         }
     870             : 
     871        1556 :                         if (cluster_count == 0) {
     872           0 :                                 return -EINVAL;
     873             :                         }
     874             : 
     875             :                         /* When reading extent pages sequentially starting cluster idx should match
     876             :                          * current size of a blob.
     877             :                          * If changed to batch reading, this check shall be removed. */
     878        1556 :                         if (desc_extent->start_cluster_idx != blob->active.num_clusters) {
     879           0 :                                 return -EINVAL;
     880             :                         }
     881             : 
     882        1556 :                         tmp = realloc(blob->active.clusters,
     883        1556 :                                       (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters));
     884        1556 :                         if (tmp == NULL) {
     885           0 :                                 return -ENOMEM;
     886             :                         }
     887        1556 :                         blob->active.clusters = tmp;
     888        1556 :                         blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters);
     889             : 
     890       24472 :                         for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
     891       22916 :                                 if (desc_extent->cluster_idx[i] != 0) {
     892       10415 :                                         blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
     893             :                                                         desc_extent->cluster_idx[i]);
     894       10415 :                                         blob->active.num_allocated_clusters++;
     895       12501 :                                 } else if (spdk_blob_is_thin_provisioned(blob)) {
     896       12501 :                                         blob->active.clusters[blob->active.num_clusters++] = 0;
     897             :                                 } else {
     898           0 :                                         return -EINVAL;
     899             :                                 }
     900             :                         }
     901        1556 :                         assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters);
     902        1556 :                         assert(blob->remaining_clusters_in_et >= cluster_count);
     903        1556 :                         blob->remaining_clusters_in_et -= cluster_count;
     904        1592 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
     905             :                         int rc;
     906             : 
     907         476 :                         rc = blob_deserialize_xattr(blob,
     908             :                                                     (struct spdk_blob_md_descriptor_xattr *) desc, false);
     909         476 :                         if (rc != 0) {
     910           0 :                                 return rc;
     911             :                         }
     912        1116 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
     913             :                         int rc;
     914             : 
     915        1116 :                         rc = blob_deserialize_xattr(blob,
     916             :                                                     (struct spdk_blob_md_descriptor_xattr *) desc, true);
     917        1116 :                         if (rc != 0) {
     918           0 :                                 return rc;
     919             :                         }
     920             :                 } else {
     921             :                         /* Unrecognized descriptor type.  Do not fail - just continue to the
     922             :                          *  next descriptor.  If this descriptor is associated with some feature
     923             :                          *  defined in a newer version of blobstore, that version of blobstore
     924             :                          *  should create and set an associated feature flag to specify if this
     925             :                          *  blob can be loaded or not.
     926             :                          */
     927             :                 }
     928             : 
     929             :                 /* Advance to the next descriptor */
     930       11459 :                 cur_desc += sizeof(*desc) + desc->length;
     931       11459 :                 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
     932          50 :                         break;
     933             :                 }
     934       11409 :                 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
     935             :         }
     936             : 
     937        5970 :         return 0;
     938             : }
     939             : 
     940             : static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page);
     941             : 
     942             : static int
     943        1556 : blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob)
     944             : {
     945        1556 :         assert(blob != NULL);
     946        1556 :         assert(blob->state == SPDK_BLOB_STATE_LOADING);
     947             : 
     948        1556 :         if (bs_load_cur_extent_page_valid(extent_page) == false) {
     949           0 :                 return -ENOENT;
     950             :         }
     951             : 
     952        1556 :         return blob_parse_page(extent_page, blob);
     953             : }
     954             : 
     955             : static int
     956        4306 : blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count,
     957             :            struct spdk_blob *blob)
     958             : {
     959             :         const struct spdk_blob_md_page *page;
     960             :         uint32_t i;
     961             :         int rc;
     962             :         void *tmp;
     963             : 
     964        4306 :         assert(page_count > 0);
     965        4306 :         assert(pages[0].sequence_num == 0);
     966        4306 :         assert(blob != NULL);
     967        4306 :         assert(blob->state == SPDK_BLOB_STATE_LOADING);
     968        4306 :         assert(blob->active.clusters == NULL);
     969             : 
     970             :         /* The blobid provided doesn't match what's in the MD, this can
     971             :          * happen for example if a bogus blobid is passed in through open.
     972             :          */
     973        4306 :         if (blob->id != pages[0].id) {
     974           5 :                 SPDK_ERRLOG("Blobid (0x%" PRIx64 ") doesn't match what's in metadata "
     975             :                             "(0x%" PRIx64 ")\n", blob->id, pages[0].id);
     976           5 :                 return -ENOENT;
     977             :         }
     978             : 
     979        4301 :         tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages));
     980        4301 :         if (!tmp) {
     981           0 :                 return -ENOMEM;
     982             :         }
     983        4301 :         blob->active.pages = tmp;
     984             : 
     985        4301 :         blob->active.pages[0] = pages[0].id;
     986             : 
     987        4424 :         for (i = 1; i < page_count; i++) {
     988         123 :                 assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next));
     989         123 :                 blob->active.pages[i] = pages[i - 1].next;
     990             :         }
     991        4301 :         blob->active.num_pages = page_count;
     992             : 
     993        8715 :         for (i = 0; i < page_count; i++) {
     994        4424 :                 page = &pages[i];
     995             : 
     996        4424 :                 assert(page->id == blob->id);
     997        4424 :                 assert(page->sequence_num == i);
     998             : 
     999        4424 :                 rc = blob_parse_page(page, blob);
    1000        4424 :                 if (rc != 0) {
    1001          10 :                         return rc;
    1002             :                 }
    1003             :         }
    1004             : 
    1005        4291 :         return 0;
    1006             : }
    1007             : 
    1008             : static int
    1009        5532 : blob_serialize_add_page(const struct spdk_blob *blob,
    1010             :                         struct spdk_blob_md_page **pages,
    1011             :                         uint32_t *page_count,
    1012             :                         struct spdk_blob_md_page **last_page)
    1013             : {
    1014             :         struct spdk_blob_md_page *page, *tmp_pages;
    1015             : 
    1016        5532 :         assert(pages != NULL);
    1017        5532 :         assert(page_count != NULL);
    1018             : 
    1019        5532 :         *last_page = NULL;
    1020        5532 :         if (*page_count == 0) {
    1021        5423 :                 assert(*pages == NULL);
    1022        5423 :                 *pages = spdk_malloc(blob->bs->md_page_size, 0,
    1023             :                                      NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    1024        5423 :                 if (*pages == NULL) {
    1025           0 :                         return -ENOMEM;
    1026             :                 }
    1027        5423 :                 *page_count = 1;
    1028             :         } else {
    1029         109 :                 assert(*pages != NULL);
    1030         109 :                 tmp_pages = spdk_realloc(*pages, blob->bs->md_page_size * (*page_count + 1), 0);
    1031         109 :                 if (tmp_pages == NULL) {
    1032           0 :                         return -ENOMEM;
    1033             :                 }
    1034         109 :                 (*page_count)++;
    1035         109 :                 *pages = tmp_pages;
    1036             :         }
    1037             : 
    1038        5532 :         page = &(*pages)[*page_count - 1];
    1039        5532 :         memset(page, 0, sizeof(*page));
    1040        5532 :         page->id = blob->id;
    1041        5532 :         page->sequence_num = *page_count - 1;
    1042        5532 :         page->next = SPDK_INVALID_MD_PAGE;
    1043        5532 :         *last_page = page;
    1044             : 
    1045        5532 :         return 0;
    1046             : }
    1047             : 
    1048             : /* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor.
    1049             :  * Update required_sz on both success and failure.
    1050             :  *
                     :  * The descriptor is written at the start of 'buf': the fixed descriptor
                     :  * structure, then strlen(name) bytes of the name (so no NUL terminator is
                     :  * copied), then value_len bytes of the value.
                     :  *
                     :  * required_sz is set to sizeof(descriptor structure) + strlen(name) + value_len.
                     :  * 'internal' selects the XATTR_INTERNAL descriptor type instead of XATTR.
                     :  *
                     :  * Returns 0 on success, or -1 (leaving 'buf' untouched) when buf_sz is
                     :  * smaller than the required size.
    1051             :  */
    1052             : static int
    1053        2160 : blob_serialize_xattr(const struct spdk_xattr *xattr,
    1054             :                      uint8_t *buf, size_t buf_sz,
    1055             :                      size_t *required_sz, bool internal)
    1056             : {
    1057             :         struct spdk_blob_md_descriptor_xattr    *desc;
    1058             : 
    1059        4320 :         *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) +
    1060        2160 :                        strlen(xattr->name) +
    1061        2160 :                        xattr->value_len;
    1062             :         /* Fail before writing anything; *required_sz has already been reported. */
    1063        2160 :         if (buf_sz < *required_sz) {
    1064          60 :                 return -1;
    1065             :         }
    1066             : 
    1067        2100 :         desc = (struct spdk_blob_md_descriptor_xattr *)buf;
    1068             : 
    1069        2100 :         desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR;
    1070        2100 :         desc->length = sizeof(desc->name_length) +
    1071             :                        sizeof(desc->value_length) +
    1072        2100 :                        strlen(xattr->name) +
    1073        2100 :                        xattr->value_len;
    1074        2100 :         desc->name_length = strlen(xattr->name);
    1075        2100 :         desc->value_length = xattr->value_len;
    1076             :         /* The value bytes are stored immediately after the name bytes. */
    1077        2100 :         memcpy(desc->name, xattr->name, desc->name_length);
    1078        2100 :         memcpy((void *)((uintptr_t)desc->name + desc->name_length),
    1079        2100 :                xattr->value,
    1080        2100 :                desc->value_length);
    1081             : 
    1082        2100 :         return 0;
    1083             : }
    1084             : /*
                     :  * Serialize one EXTENT_TABLE descriptor at *buf, covering the blob's extent
                     :  * pages starting at index start_ep.
                     :  *
                     :  * A run of consecutive unallocated (0) extent pages is collapsed into a single
                     :  * entry whose num_pages is the run length; every other entry has num_pages 1.
                     :  *
                     :  * On return *next_ep holds the index of the first extent page that was not
                     :  * serialized (it equals start_ep when no entry fit). If there is not even room
                     :  * for the descriptor header plus num_clusters, nothing is written and *buf and
                     :  * *remaining_sz are left unchanged; otherwise both are advanced past the
                     :  * descriptor that was written.
                     :  */
    1085             : static void
    1086        2516 : blob_serialize_extent_table_entry(const struct spdk_blob *blob,
    1087             :                                   uint64_t start_ep, uint64_t *next_ep,
    1088             :                                   uint8_t **buf, size_t *remaining_sz)
    1089             : {
    1090             :         struct spdk_blob_md_descriptor_extent_table *desc;
    1091             :         size_t cur_sz;
    1092             :         uint64_t i, et_idx;
    1093             :         uint32_t extent_page, ep_len;
    1094             : 
    1095             :         /* The buffer must have room for at least num_clusters entry */
    1096        2516 :         cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters);
    1097        2516 :         if (*remaining_sz < cur_sz) {
    1098          30 :                 *next_ep = start_ep;
    1099          30 :                 return;
    1100             :         }
    1101             : 
    1102        2486 :         desc = (struct spdk_blob_md_descriptor_extent_table *)*buf;
    1103        2486 :         desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE;
    1104             : 
    1105        2486 :         desc->num_clusters = blob->active.num_clusters;
    1106             :         /* ep_len is the length of the run that the next emitted entry will describe. */
    1107        2486 :         ep_len = 1;
    1108        2486 :         et_idx = 0;
    1109        6343 :         for (i = start_ep; i < blob->active.num_extent_pages; i++) {
    1110        3857 :                 if (*remaining_sz < cur_sz  + sizeof(desc->extent_page[0])) {
    1111             :                         /* If we ran out of buffer space, return */
    1112           0 :                         break;
    1113             :                 }
    1114             : 
    1115        3857 :                 extent_page = blob->active.extent_pages[i];
    1116             :                 /* Verify that next extent_page is unallocated */
    1117        3857 :                 if (extent_page == 0 &&
    1118        2283 :                     (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) {
    1119        1617 :                         ep_len++;
    1120        1617 :                         continue;
    1121             :                 }
    1122        2240 :                 desc->extent_page[et_idx].page_idx = extent_page;
    1123        2240 :                 desc->extent_page[et_idx].num_pages = ep_len;
    1124        2240 :                 et_idx++;
    1125             : 
    1126        2240 :                 ep_len = 1;
    1127        2240 :                 cur_sz += sizeof(desc->extent_page[et_idx]);
    1128             :         }
    1129        2486 :         *next_ep = i;
    1130             :         /* length counts num_clusters plus the entries written, not the generic descriptor header. */
    1131        2486 :         desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx;
    1132        2486 :         *remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length;
    1133        2486 :         *buf += sizeof(struct spdk_blob_md_descriptor) + desc->length;
    1134             : }
    1135             : /*
                     :  * Serialize the blob's complete extent table, adding metadata pages to the
                     :  * chain whenever the current one fills up.
                     :  *
                     :  * blob_serialize_extent_table_entry() is called repeatedly. Each time it stops
                     :  * before reaching num_extent_pages, a page is appended with
                     :  * blob_serialize_add_page() and *buf / *remaining_sz are reset to that page's
                     :  * descriptor area.
                     :  *
                     :  * Returns 0 on success, or the negative value returned by
                     :  * blob_serialize_add_page().
                     :  */
    1136             : static int
    1137        2489 : blob_serialize_extent_table(const struct spdk_blob *blob,
    1138             :                             struct spdk_blob_md_page **pages,
    1139             :                             struct spdk_blob_md_page *cur_page,
    1140             :                             uint32_t *page_count, uint8_t **buf,
    1141             :                             size_t *remaining_sz)
    1142             : {
    1143             :         uint64_t                                last_extent_page;
    1144             :         int                                     rc;
    1145             : 
    1146        2489 :         last_extent_page = 0;
    1147             :         /* At least single extent table entry has to be always persisted.
    1148             :          * Such case occurs with num_extent_pages == 0. */
    1149        2516 :         while (last_extent_page <= blob->active.num_extent_pages) {
    1150        2516 :                 blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf,
    1151             :                                                   remaining_sz);
    1152             :                 /* All extent pages have been serialized. */
    1153        2516 :                 if (last_extent_page == blob->active.num_extent_pages) {
    1154        2489 :                         break;
    1155             :                 }
    1156             :                 /* The current page is full: continue on a freshly added page. */
    1157          27 :                 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
    1158          27 :                 if (rc < 0) {
    1159           0 :                         return rc;
    1160             :                 }
    1161             : 
    1162          27 :                 *buf = (uint8_t *)cur_page->descriptors;
    1163          27 :                 *remaining_sz = sizeof(cur_page->descriptors);
    1164             :         }
    1165             : 
    1166        2489 :         return 0;
    1167             : }
    1168             : /*
                     :  * Serialize one EXTENT_RLE descriptor at *buf for the blob's clusters starting
                     :  * at index start_cluster.
                     :  *
                     :  * A run of clusters whose LBAs follow one another, and a run of unallocated
                     :  * (LBA 0) clusters, is each stored as a single extent: cluster_idx is the
                     :  * run's first LBA divided by the LBAs per cluster, length is the number of
                     :  * clusters in the run.
                     :  *
                     :  * *next_cluster is set to:
                     :  *  - num_clusters when every remaining cluster was encoded,
                     :  *  - the index of the first cluster that was not encoded when the buffer
                     :  *    filled up part way through,
                     :  *  - start_cluster when there is no room for even one extent (nothing is
                     :  *    written and *buf / *buf_sz are left unchanged).
                     :  *
                     :  * Otherwise *buf and *buf_sz are advanced past the descriptor that was written.
                     :  */
    1169             : static void
    1170        1751 : blob_serialize_extent_rle(const struct spdk_blob *blob,
    1171             :                           uint64_t start_cluster, uint64_t *next_cluster,
    1172             :                           uint8_t **buf, size_t *buf_sz)
    1173             : {
    1174             :         struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
    1175             :         size_t cur_sz;
    1176             :         uint64_t i, extent_idx;
    1177             :         uint64_t lba, lba_per_cluster, lba_count;
    1178             : 
    1179             :         /* The buffer must have room for at least one extent */
    1180        1751 :         cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]);
    1181        1751 :         if (*buf_sz < cur_sz) {
    1182          18 :                 *next_cluster = start_cluster;
    1183          18 :                 return;
    1184             :         }
    1185             : 
    1186        1733 :         desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf;
    1187        1733 :         desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE;
    1188             : 
    1189        1733 :         lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);
    1190             :         /* Assert for scan-build false positive */
    1191        1733 :         assert(lba_per_cluster > 0);
    1192             :         /* lba / lba_count track the start and the size (in LBAs) of the run being built. */
    1193        1733 :         lba = blob->active.clusters[start_cluster];
    1194        1733 :         lba_count = lba_per_cluster;
    1195        1733 :         extent_idx = 0;
    1196      810464 :         for (i = start_cluster + 1; i < blob->active.num_clusters; i++) {
    1197      808735 :                 if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) {
    1198             :                         /* Run-length encode sequential non-zero LBA */
    1199        7276 :                         lba_count += lba_per_cluster;
    1200        7276 :                         continue;
    1201      801459 :                 } else if (lba == 0 && blob->active.clusters[i] == 0) {
    1202             :                         /* Run-length encode unallocated clusters */
    1203      800266 :                         lba_count += lba_per_cluster;
    1204      800266 :                         continue;
    1205             :                 }
    1206        1193 :                 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
    1207        1193 :                 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
    1208        1193 :                 extent_idx++;
    1209             :                 /* Reserve room for the extent that the run starting at cluster i will need. */
    1210        1193 :                 cur_sz += sizeof(desc_extent_rle->extents[extent_idx]);
    1211             : 
    1212        1193 :                 if (*buf_sz < cur_sz) {
    1213             :                         /* If we ran out of buffer space, return */
    1214           4 :                         *next_cluster = i;
    1215           4 :                         break;
    1216             :                 }
    1217             : 
    1218        1189 :                 lba = blob->active.clusters[i];
    1219        1189 :                 lba_count = lba_per_cluster;
    1220             :         }
    1221             :         /* The loop ended without running out of space: emit the final pending run. */
    1222        1733 :         if (*buf_sz >= cur_sz) {
    1223        1729 :                 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
    1224        1729 :                 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
    1225        1729 :                 extent_idx++;
    1226             : 
    1227        1729 :                 *next_cluster = blob->active.num_clusters;
    1228             :         }
    1229             : 
    1230        1733 :         desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx;
    1231        1733 :         *buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
    1232        1733 :         *buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
    1233             : }
    1234             : /*
                     :  * Serialize all of the blob's clusters as EXTENT_RLE descriptors, adding
                     :  * metadata pages to the chain whenever the current one fills up.
                     :  *
                     :  * blob_serialize_extent_rle() is called until it reports that num_clusters
                     :  * was reached. After each partial pass a page is appended with
                     :  * blob_serialize_add_page() and *buf / *remaining_sz are reset to that page's
                     :  * descriptor area. Nothing is written for a blob with zero clusters.
                     :  *
                     :  * Returns 0 on success, or the negative value returned by
                     :  * blob_serialize_add_page().
                     :  */
    1235             : static int
    1236        1943 : blob_serialize_extents_rle(const struct spdk_blob *blob,
    1237             :                            struct spdk_blob_md_page **pages,
    1238             :                            struct spdk_blob_md_page *cur_page,
    1239             :                            uint32_t *page_count, uint8_t **buf,
    1240             :                            size_t *remaining_sz)
    1241             : {
    1242             :         uint64_t                                last_cluster;
    1243             :         int                                     rc;
    1244             : 
    1245        1943 :         last_cluster = 0;
    1246        1965 :         while (last_cluster < blob->active.num_clusters) {
    1247        1751 :                 blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz);
    1248             :                 /* Every cluster has been encoded. */
    1249        1751 :                 if (last_cluster == blob->active.num_clusters) {
    1250        1729 :                         break;
    1251             :                 }
    1252             :                 /* The current page is full: continue on a freshly added page. */
    1253          22 :                 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
    1254          22 :                 if (rc < 0) {
    1255           0 :                         return rc;
    1256             :                 }
    1257             : 
    1258          22 :                 *buf = (uint8_t *)cur_page->descriptors;
    1259          22 :                 *remaining_sz = sizeof(cur_page->descriptors);
    1260             :         }
    1261             : 
    1262        1943 :         return 0;
    1263             : }
    1264             : /*
                     :  * Fill the descriptor area of 'page' with a single EXTENT_PAGE descriptor for
                     :  * the extent page that contains 'cluster'.
                     :  *
                     :  * The descriptor starts at 'cluster' rounded down to a multiple of
                     :  * SPDK_EXTENTS_PER_EP and stores one cluster index (LBA divided by the LBAs
                     :  * per cluster) for each blob cluster from there on, stopping after
                     :  * SPDK_EXTENTS_PER_EP entries or at num_clusters, whichever comes first.
                     :  */
    1265             : static void
    1266        1648 : blob_serialize_extent_page(const struct spdk_blob *blob,
    1267             :                            uint64_t cluster, struct spdk_blob_md_page *page)
    1268             : {
    1269             :         struct spdk_blob_md_descriptor_extent_page *desc_extent;
    1270             :         uint64_t i, extent_idx;
    1271             :         uint64_t lba, lba_per_cluster;
    1272        1648 :         uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;
    1273             : 
    1274        1648 :         desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors;
    1275        1648 :         desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE;
    1276             : 
    1277        1648 :         lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);
    1278             : 
    1279        1648 :         desc_extent->start_cluster_idx = start_cluster_idx;
    1280        1648 :         extent_idx = 0;
    1281       63582 :         for (i = start_cluster_idx; i < blob->active.num_clusters; i++) {
    1282       62033 :                 lba = blob->active.clusters[i];
    1283       62033 :                 desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster;
    1284       62033 :                 if (extent_idx >= SPDK_EXTENTS_PER_EP) {
    1285          99 :                         break;
    1286             :                 }
    1287             :         }
    1288        1648 :         desc_extent->length = sizeof(desc_extent->start_cluster_idx) +
    1289             :                               sizeof(desc_extent->cluster_idx[0]) * extent_idx;
    1290        1648 : }
    1291             : /*
                     :  * Write the FLAGS descriptor (the blob's invalid, data_ro and md_ro flags) at
                     :  * 'buf' and reduce *buf_sz by the size of that descriptor.
                     :  *
                     :  * 'buf' is passed by value, so the caller has to advance its own pointer past
                     :  * the descriptor. Insufficient space is only caught by an assert.
                     :  */
    1292             : static void
    1293        4432 : blob_serialize_flags(const struct spdk_blob *blob,
    1294             :                      uint8_t *buf, size_t *buf_sz)
    1295             : {
    1296             :         struct spdk_blob_md_descriptor_flags *desc;
    1297             : 
    1298             :         /*
    1299             :          * Flags get serialized first, so we should always have room for the flags
    1300             :          *  descriptor.
    1301             :          */
    1302        4432 :         assert(*buf_sz >= sizeof(*desc));
    1303             : 
    1304        4432 :         desc = (struct spdk_blob_md_descriptor_flags *)buf;
    1305        4432 :         desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS;
    1306        4432 :         desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor);
    1307        4432 :         desc->invalid_flags = blob->invalid_flags;
    1308        4432 :         desc->data_ro_flags = blob->data_ro_flags;
    1309        4432 :         desc->md_ro_flags = blob->md_ro_flags;
    1310             : 
    1311        4432 :         *buf_sz -= sizeof(*desc);
    1312        4432 : }
    1313             : /*
                     :  * Serialize every xattr on the list 'xattrs' with blob_serialize_xattr(),
                     :  * advancing *buf and reducing *remaining_sz by the size of each descriptor.
                     :  *
                     :  * When an xattr does not fit in the space that is left, a new page is added
                     :  * with blob_serialize_add_page() and the xattr is retried once at the start
                     :  * of that page's descriptor area.
                     :  *
                     :  * On failure (adding the page failed, or the xattr does not fit even in an
                     :  * empty page) *pages is freed and set to NULL, *page_count is set to 0 and a
                     :  * negative value is returned. Returns 0 on success.
                     :  *
                     :  * NOTE(review): cur_page is a by-value parameter, so a page added here is not
                     :  * reflected in the caller's own cur_page variable - confirm no caller relies
                     :  * on it afterwards.
                     :  */
    1314             : static int
    1315        8864 : blob_serialize_xattrs(const struct spdk_blob *blob,
    1316             :                       const struct spdk_xattr_tailq *xattrs, bool internal,
    1317             :                       struct spdk_blob_md_page **pages,
    1318             :                       struct spdk_blob_md_page *cur_page,
    1319             :                       uint32_t *page_count, uint8_t **buf,
    1320             :                       size_t *remaining_sz)
    1321             : {
    1322             :         const struct spdk_xattr *xattr;
    1323             :         int     rc;
    1324             : 
    1325       10964 :         TAILQ_FOREACH(xattr, xattrs, link) {
    1326        2100 :                 size_t required_sz = 0;
    1327             : 
    1328        2100 :                 rc = blob_serialize_xattr(xattr,
    1329             :                                           *buf, *remaining_sz,
    1330             :                                           &required_sz, internal);
    1331        2100 :                 if (rc < 0) {
    1332             :                         /* Need to add a new page to the chain */
    1333          60 :                         rc = blob_serialize_add_page(blob, pages, page_count,
    1334             :                                                      &cur_page);
    1335          60 :                         if (rc < 0) {
    1336           0 :                                 spdk_free(*pages);
    1337           0 :                                 *pages = NULL;
    1338           0 :                                 *page_count = 0;
    1339           0 :                                 return rc;
    1340             :                         }
    1341             : 
    1342          60 :                         *buf = (uint8_t *)cur_page->descriptors;
    1343          60 :                         *remaining_sz = sizeof(cur_page->descriptors);
    1344             : 
    1345             :                         /* Try again */
    1346          60 :                         required_sz = 0;
    1347          60 :                         rc = blob_serialize_xattr(xattr,
    1348             :                                                   *buf, *remaining_sz,
    1349             :                                                   &required_sz, internal);
    1350             :                         /* Still failing here means the xattr does not fit in an empty page. */
    1351          60 :                         if (rc < 0) {
    1352           0 :                                 spdk_free(*pages);
    1353           0 :                                 *pages = NULL;
    1354           0 :                                 *page_count = 0;
    1355           0 :                                 return rc;
    1356             :                         }
    1357             :                 }
    1358             : 
    1359        2100 :                 *remaining_sz -= required_sz;
    1360        2100 :                 *buf += required_sz;
    1361             :         }
    1362             : 
    1363        8864 :         return 0;
    1364             : }
    1365             : /*
                     :  * Serialize a dirty blob into a newly allocated array of metadata pages.
                     :  *
                     :  * The descriptors are written in this order: flags, xattrs, internal xattrs,
                     :  * then either the extent table (blob->use_extent_table set) or the
                     :  * run-length encoded extents.
                     :  *
                     :  * *pages and *page_count are outputs; they are reset to NULL / 0 before the
                     :  * first page is allocated. Returns 0 on success or a negative value from the
                     :  * failing step.
                     :  */
    1366             : static int
    1367        4432 : blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages,
    1368             :                uint32_t *page_count)
    1369             : {
    1370             :         struct spdk_blob_md_page                *cur_page;
    1371             :         int                                     rc;
    1372             :         uint8_t                                 *buf;
    1373             :         size_t                                  remaining_sz;
    1374             : 
    1375        4432 :         assert(pages != NULL);
    1376        4432 :         assert(page_count != NULL);
    1377        4432 :         assert(blob != NULL);
    1378        4432 :         assert(blob->state == SPDK_BLOB_STATE_DIRTY);
    1379             : 
    1380        4432 :         *pages = NULL;
    1381        4432 :         *page_count = 0;
    1382             : 
    1383             :         /* A blob always has at least 1 page, even if it has no descriptors */
    1384        4432 :         rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
    1385        4432 :         if (rc < 0) {
    1386           0 :                 return rc;
    1387             :         }
    1388             : 
    1389        4432 :         buf = (uint8_t *)cur_page->descriptors;
    1390        4432 :         remaining_sz = sizeof(cur_page->descriptors);
    1391             : 
    1392             :         /* Serialize flags */
    1393        4432 :         blob_serialize_flags(blob, buf, &remaining_sz);
    1394        4432 :         buf += sizeof(struct spdk_blob_md_descriptor_flags);
    1395             : 
    1396             :         /* Serialize xattrs */
    1397        4432 :         rc = blob_serialize_xattrs(blob, &blob->xattrs, false,
    1398             :                                    pages, cur_page, page_count, &buf, &remaining_sz);
    1399        4432 :         if (rc < 0) {
    1400           0 :                 return rc;
    1401             :         }
    1402             : 
    1403             :         /* Serialize internal xattrs */
    1404        4432 :         rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true,
    1405             :                                    pages, cur_page, page_count, &buf, &remaining_sz);
    1406        4432 :         if (rc < 0) {
    1407           0 :                 return rc;
    1408             :         }
    1409             : 
    1410        4432 :         if (blob->use_extent_table) {
    1411             :                 /* Serialize extent table */
    1412        2489 :                 rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz);
    1413             :         } else {
    1414             :                 /* Serialize extents */
    1415        1943 :                 rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz);
    1416             :         }
    1417             : 
    1418        4432 :         return rc;
    1419             : }
    1420             : /*
                     :  * Context carried through the steps of loading a blob.
                     :  *
                     :  * blob_load_final() invokes cb_fn(seq, cb_arg, bserrno), then releases 'pages'
                     :  * with spdk_free() and the context itself with free().
                     :  */
    1421             : struct spdk_blob_load_ctx {
    1422             :         struct spdk_blob                *blob;
    1423             : 
    1424             :         struct spdk_blob_md_page        *pages;
    1425             :         uint32_t                        num_pages;
    1426             :         uint32_t                        next_extent_page;
    1427             :         spdk_bs_sequence_t              *seq;
    1428             : 
    1429             :         spdk_bs_sequence_cpl            cb_fn;
    1430             :         void                            *cb_arg;
    1431             : };
    1432             : /*
                     :  * Compute the CRC-32C of a metadata page.
                     :  *
                     :  * The checksum is seeded with BLOB_CRC32C_INITIAL, covers the first
                     :  * SPDK_BS_PAGE_SIZE - 4 bytes of 'page', and is XORed with
                     :  * BLOB_CRC32C_INITIAL again before being returned.
                     :  *
                     :  * NOTE(review): the 4 excluded trailing bytes are presumably the page's
                     :  * stored crc field - confirm against the page structure layout.
                     :  */
    1433             : static uint32_t
    1434       25719 : blob_md_page_calc_crc(void *page)
    1435             : {
    1436             :         uint32_t                crc;
    1437             : 
    1438       25719 :         crc = BLOB_CRC32C_INITIAL;
    1439       25719 :         crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc);
    1440       25719 :         crc ^= BLOB_CRC32C_INITIAL;
    1441             : 
    1442       25719 :         return crc;
    1443             : 
    1444             : }
    1445             : /*
                     :  * Finish a blob load: mark the blob clean when bserrno is 0, report the
                     :  * result through ctx->cb_fn, then free ctx->pages and ctx itself.
                     :  *
                     :  * ctx must not be used by the caller after this returns.
                     :  */
    1446             : static void
    1447        4341 : blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno)
    1448             : {
    1449        4341 :         struct spdk_blob                *blob = ctx->blob;
    1450             : 
    1451        4341 :         if (bserrno == 0) {
    1452        4259 :                 blob_mark_clean(blob);
    1453             :         }
    1454             :         /* The callback runs while ctx and its pages are still allocated. */
    1455        4341 :         ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno);
    1456             : 
    1457             :         /* Free the memory */
    1458        4341 :         spdk_free(ctx->pages);
    1459        4341 :         free(ctx);
    1460        4341 : }
    1461             : /*
                     :  * Completion callback for opening the snapshot that backs a blob.
                     :  *
                     :  * On success the opened snapshot is wrapped with bs_create_blob_bs_dev() and
                     :  * stored as the blob's back_bs_dev; a NULL result is reported as -ENOMEM.
                     :  * Any failure is logged, and the load is always completed through
                     :  * blob_load_final(). cb_arg is the struct spdk_blob_load_ctx of the load.
                     :  */
    1462             : static void
    1463         575 : blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno)
    1464             : {
    1465         575 :         struct spdk_blob_load_ctx       *ctx = cb_arg;
    1466         575 :         struct spdk_blob                *blob = ctx->blob;
    1467             : 
    1468         575 :         if (bserrno == 0) {
    1469         567 :                 blob->back_bs_dev = bs_create_blob_bs_dev(snapshot);
    1470         567 :                 if (blob->back_bs_dev == NULL) {
    1471           0 :                         bserrno = -ENOMEM;
    1472             :                 }
    1473             :         }
    1474         575 :         if (bserrno != 0) {
    1475           8 :                 SPDK_ERRLOG("Snapshot fail\n");
    1476             :         }
    1477             : 
    1478         575 :         blob_load_final(ctx, bserrno);
    1479         575 : }
    1480             : 
    1481             : static void blob_update_clear_method(struct spdk_blob *blob);
    1482             : /*
                     :  * Create the backing device of an esnap clone through the blobstore's
                     :  * esnap_bs_dev_create callback.
                     :  *
                     :  * The external snapshot ID is read from the BLOB_EXTERNAL_SNAPSHOT_ID xattr
                     :  * and passed to the callback together with bs->esnap_ctx and blob_ctx.
                     :  *
                     :  * Returns:
                     :  *  - 0 on success; blob->back_bs_dev is set (possibly to NULL, see below) and
                     :  *    blob->parent_id becomes SPDK_BLOBID_EXTERNAL_SNAPSHOT,
                     :  *  - -ENOTSUP when the blobstore has no esnap_bs_dev_create callback,
                     :  *  - -EINVAL when the esnap ID xattr cannot be read, or when the blobstore
                     :  *    io_unit_size is not a multiple of the created device's blocklen (the
                     :  *    device is destroyed in that case),
                     :  *  - the callback's own non-zero return value when it fails.
                     :  */
    1483             : static int
    1484         150 : blob_load_esnap(struct spdk_blob *blob, void *blob_ctx)
    1485             : {
    1486         150 :         struct spdk_blob_store *bs = blob->bs;
    1487         150 :         struct spdk_bs_dev *bs_dev = NULL;
    1488         150 :         const void *esnap_id = NULL;
    1489         150 :         size_t id_len = 0;
    1490             :         int rc;
    1491             : 
    1492         150 :         if (bs->esnap_bs_dev_create == NULL) {
    1493          10 :                 SPDK_NOTICELOG("blob 0x%" PRIx64 " is an esnap clone but the blobstore was opened "
    1494             :                                "without support for esnap clones\n", blob->id);
    1495          10 :                 return -ENOTSUP;
    1496             :         }
    1497         140 :         assert(blob->back_bs_dev == NULL);
    1498             : 
    1499         140 :         rc = blob_get_xattr_value(blob, BLOB_EXTERNAL_SNAPSHOT_ID, &esnap_id, &id_len, true);
    1500         140 :         if (rc != 0) {
    1501           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 " is an esnap clone but has no esnap ID\n", blob->id);
    1502           0 :                 return -EINVAL;
    1503             :         }
    1504         140 :         assert(id_len > 0 && id_len < UINT32_MAX);
    1505             : 
    1506         140 :         SPDK_INFOLOG(blob, "Creating external snapshot device\n");
    1507             : 
    1508         140 :         rc = bs->esnap_bs_dev_create(bs->esnap_ctx, blob_ctx, blob, esnap_id, (uint32_t)id_len,
    1509             :                                      &bs_dev);
    1510         140 :         if (rc != 0) {
    1511           0 :                 SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": failed to load back_bs_dev "
    1512             :                               "with error %d\n", blob->id, rc);
    1513           0 :                 return rc;
    1514             :         }
    1515             : 
    1516             :         /*
    1517             :          * Note: bs_dev might be NULL if the consumer chose to not open the external snapshot.
    1518             :          * This especially might happen during spdk_bs_load() iteration.
    1519             :          */
    1520         140 :         if (bs_dev != NULL) {
    1521         140 :                 SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": loaded back_bs_dev\n", blob->id);
    1522         140 :                 if ((bs->io_unit_size % bs_dev->blocklen) != 0) {
    1523           5 :                         SPDK_NOTICELOG("blob 0x%" PRIx64 " external snapshot device block size %u "
    1524             :                                        "is not compatible with blobstore block size %u\n",
    1525             :                                        blob->id, bs_dev->blocklen, bs->io_unit_size);
    1526           5 :                         bs_dev->destroy(bs_dev);
    1527           5 :                         return -EINVAL;
    1528             :                 }
    1529             :         }
    1530             : 
    1531         135 :         blob->back_bs_dev = bs_dev;
    1532         135 :         blob->parent_id = SPDK_BLOBID_EXTERNAL_SNAPSHOT;
    1533             : 
    1534         135 :         return 0;
    1535             : }
    1536             : /*
                     :  * Select and attach the backing device of a blob being loaded, then complete
                     :  * the load. cb_arg is the struct spdk_blob_load_ctx of the load.
                     :  *
                     :  *  - esnap clone: blob_load_esnap() is called with the esnap context taken
                     :  *    from the sequence, and its result completes the load.
                     :  *  - thin provisioned with a BLOB_SNAPSHOT xattr: the xattr value must be
                     :  *    exactly sizeof(spdk_blob_id) bytes (otherwise the load fails with
                     :  *    -EINVAL); it becomes parent_id, the snapshot is opened and the load
                     :  *    continues in blob_load_snapshot_cpl().
                     :  *  - thin provisioned without that xattr: a zeroes device is used.
                     :  *  - anything else: no backing device (back_bs_dev is NULL).
                     :  */
    1537             : static void
    1538        4282 : blob_load_backing_dev(spdk_bs_sequence_t *seq, void *cb_arg)
    1539             : {
    1540        4282 :         struct spdk_blob_load_ctx       *ctx = cb_arg;
    1541        4282 :         struct spdk_blob                *blob = ctx->blob;
    1542             :         const void                      *value;
    1543             :         size_t                          len;
    1544             :         int                             rc;
    1545             : 
    1546        4282 :         if (blob_is_esnap_clone(blob)) {
    1547         150 :                 rc = blob_load_esnap(blob, seq->cpl.u.blob_handle.esnap_ctx);
    1548         150 :                 blob_load_final(ctx, rc);
    1549         725 :                 return;
    1550             :         }
    1551             : 
    1552        4132 :         if (spdk_blob_is_thin_provisioned(blob)) {
    1553        1301 :                 rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true);
    1554        1301 :                 if (rc == 0) {
    1555         575 :                         if (len != sizeof(spdk_blob_id)) {
    1556           0 :                                 blob_load_final(ctx, -EINVAL);
    1557           0 :                                 return;
    1558             :                         }
    1559             :                         /* open snapshot blob and continue in the callback function */
    1560         575 :                         blob->parent_id = *(spdk_blob_id *)value;
    1561         575 :                         spdk_bs_open_blob(blob->bs, blob->parent_id,
    1562             :                                           blob_load_snapshot_cpl, ctx);
    1563         575 :                         return;
    1564             :                 } else {
    1565             :                         /* add zeroes_dev for thin provisioned blob */
    1566         726 :                         blob->back_bs_dev = bs_create_zeroes_dev();
    1567             :                 }
    1568             :         } else {
    1569             :                 /* standard blob */
    1570        2831 :                 blob->back_bs_dev = NULL;
    1571             :         }
    1572        3557 :         blob_load_final(ctx, 0);
    1573             : }
    1574             : 
    1575             : static void
    1576        4189 : blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    1577             : {
                               /* Step function for loading the extent pages of a blob.
                                *
                                * It is called directly by blob_load_cpl() (with bserrno 0) to start the
                                * walk, and again as the completion of every extent page read that it
                                * issues itself. Each call validates and parses the page read by the
                                * previous call, if any, then advances through
                                * blob->active.extent_pages until it either issues the next read or
                                * reaches the end and hands over to blob_load_backing_dev().
                                * Every failure is reported through blob_load_final().
                                *
                                * seq     - sequence the reads are issued on
                                * cb_arg  - the struct spdk_blob_load_ctx of this load
                                * bserrno - status of the read that just completed (0 on the first call)
                                */
    1578        4189 :         struct spdk_blob_load_ctx       *ctx = cb_arg;
    1579        4189 :         struct spdk_blob                *blob = ctx->blob;
    1580             :         struct spdk_blob_md_page        *page;
    1581             :         uint64_t                        i;
    1582             :         uint32_t                        crc;
    1583             :         uint64_t                        lba;
    1584             :         void                            *tmp;
    1585             :         uint64_t                        sz;
    1586             : 
    1587        4189 :         if (bserrno) {
    1588           9 :                 SPDK_ERRLOG("Extent page read failed: %d\n", bserrno);
    1589           9 :                 blob_load_final(ctx, bserrno);
    1590           9 :                 return;
    1591             :         }
    1592             : 
    1593        4180 :         if (ctx->pages == NULL) {
    1594             :                 /* First iteration of this function, allocate buffer for single EXTENT_PAGE */
    1595        2624 :                 ctx->pages = spdk_zmalloc(blob->bs->md_page_size, 0,
    1596             :                                           NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    1597        2624 :                 if (!ctx->pages) {
    1598           0 :                         blob_load_final(ctx, -ENOMEM);
    1599           0 :                         return;
    1600             :                 }
    1601        2624 :                 ctx->num_pages = 1;
    1602        2624 :                 ctx->next_extent_page = 0;
    1603             :         } else {
                                       /* Later iterations: ctx->pages[0] is the buffer the read further
                                        * down was issued into. Check its CRC before using the contents. */
    1604        1556 :                 page = &ctx->pages[0];
    1605        1556 :                 crc = blob_md_page_calc_crc(page);
    1606        1556 :                 if (crc != page->crc) {
    1607           0 :                         blob_load_final(ctx, -EINVAL);
    1608           0 :                         return;
    1609             :                 }
    1610             : 
                                       /* An extent page that links to a next page is rejected. */
    1611        1556 :                 if (page->next != SPDK_INVALID_MD_PAGE) {
    1612           0 :                         blob_load_final(ctx, -EINVAL);
    1613           0 :                         return;
    1614             :                 }
    1615             : 
    1616        1556 :                 bserrno = blob_parse_extent_page(page, blob);
    1617        1556 :                 if (bserrno) {
    1618           0 :                         blob_load_final(ctx, bserrno);
    1619           0 :                         return;
    1620             :                 }
    1621             :         }
    1622             : 
                               /* Continue with the entry after the last one that was read. */
    1623        4816 :         for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
    1624        2201 :                 if (blob->active.extent_pages[i] != 0) {
    1625             :                         /* Extent page was allocated, read and parse it. */
    1626        1565 :                         lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]);
                                               /* Remember where to resume once this read has completed. */
    1627        1565 :                         ctx->next_extent_page = i + 1;
    1628             : 
    1629        1565 :                         bs_sequence_read_dev(seq, &ctx->pages[0], lba,
    1630        1565 :                                              bs_byte_to_lba(blob->bs, blob->bs->md_page_size),
    1631             :                                              blob_load_cpl_extents_cpl, ctx);
    1632        1565 :                         return;
    1633             :                 } else {
    1634             :                         /* Thin provisioned blobs can point to unallocated extent pages.
    1635             :                          * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */
    1636             : 
    1637         636 :                         sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP);
    1638         636 :                         blob->active.num_clusters += sz;
    1639         636 :                         blob->remaining_clusters_in_et -= sz;
    1640             : 
    1641         636 :                         assert(spdk_blob_is_thin_provisioned(blob));
    1642         636 :                         assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0);
    1643             : 
                                               /* Grow the cluster array so it covers the clusters accounted for above. */
    1644         636 :                         tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
    1645         636 :                         if (tmp == NULL) {
    1646           0 :                                 blob_load_final(ctx, -ENOMEM);
    1647           0 :                                 return;
    1648             :                         }
                                               /* Zero only the newly added entries, i.e. those from the old
                                                * cluster_array_size up to the new num_clusters.
                                                * NOTE(review): tmp is a void *, so this pointer arithmetic relies
                                                * on a compiler extension rather than standard C. */
    1649         636 :                         memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0,
    1650         636 :                                sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size));
    1651         636 :                         blob->active.clusters = tmp;
    1652         636 :                         blob->active.cluster_array_size = blob->active.num_clusters;
    1653             :                 }
    1654             :         }
    1655             : 
                               /* Every extent page entry has been handled; move on to the backing device. */
    1656        2615 :         blob_load_backing_dev(seq, ctx);
    1657             : }
    1658             : 
    1659             : static void
    1660        4464 : blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    1661             : {
                               /* Completion callback for every metadata page read of a blob load.
                                *
                                * The newest page (ctx->pages[ctx->num_pages - 1]) is CRC-checked. If it
                                * links to another page, ctx->pages is grown by one entry and the next
                                * page is read with this function as the completion again. Once the
                                * chain ends, all pages are handed to blob_parse() and the load continues
                                * with either the extent pages or the backing device.
                                * Every failure is reported through blob_load_final().
                                *
                                * seq     - sequence the reads are issued on
                                * cb_arg  - the struct spdk_blob_load_ctx of this load
                                * bserrno - status of the read that just completed
                                */
    1662        4464 :         struct spdk_blob_load_ctx       *ctx = cb_arg;
    1663        4464 :         struct spdk_blob                *blob = ctx->blob;
    1664             :         struct spdk_blob_md_page        *page;
    1665             :         int                             rc;
    1666             :         uint32_t                        crc;
    1667             :         uint32_t                        current_page;
    1668             : 
                               /* Work out which md page number the completed read was for. The value
                                * is only used in the error messages below. The first page comes from
                                * the blob id, later ones from the next field of the preceding page. */
    1669        4464 :         if (ctx->num_pages == 1) {
    1670        4341 :                 current_page = bs_blobid_to_page(blob->id);
    1671             :         } else {
    1672         123 :                 assert(ctx->num_pages != 0);
    1673         123 :                 page = &ctx->pages[ctx->num_pages - 2];
    1674         123 :                 current_page = page->next;
    1675             :         }
    1676             : 
    1677        4464 :         if (bserrno) {
    1678          25 :                 SPDK_ERRLOG("Metadata page %d read failed for blobid 0x%" PRIx64 ": %d\n",
    1679             :                             current_page, blob->id, bserrno);
    1680          25 :                 blob_load_final(ctx, bserrno);
    1681          25 :                 return;
    1682             :         }
    1683             : 
    1684        4439 :         page = &ctx->pages[ctx->num_pages - 1];
    1685        4439 :         crc = blob_md_page_calc_crc(page);
    1686        4439 :         if (crc != page->crc) {
    1687          10 :                 SPDK_ERRLOG("Metadata page %d crc mismatch for blobid 0x%" PRIx64 "\n",
    1688             :                             current_page, blob->id);
    1689          10 :                 blob_load_final(ctx, -EINVAL);
    1690          10 :                 return;
    1691             :         }
    1692             : 
    1693        4429 :         if (page->next != SPDK_INVALID_MD_PAGE) {
    1694             :                 struct spdk_blob_md_page *tmp_pages;
    1695         123 :                 uint32_t next_page = page->next;
    1696         123 :                 uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page);
    1697             : 
    1698             :                 /* Read the next page */
                                       /* NOTE(review): the buffer is grown and the read is sized with
                                        * sizeof(*page), while blob_load() allocates and reads
                                        * bs->md_page_size bytes - confirm the two are always equal. */
    1699         123 :                 tmp_pages = spdk_realloc(ctx->pages, (sizeof(*page) * (ctx->num_pages + 1)), 0);
    1700         123 :                 if (tmp_pages == NULL) {
    1701           0 :                         blob_load_final(ctx, -ENOMEM);
    1702           0 :                         return;
    1703             :                 }
    1704         123 :                 ctx->num_pages++;
    1705         123 :                 ctx->pages = tmp_pages;
    1706             : 
    1707         123 :                 bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1],
    1708             :                                      next_lba,
    1709         123 :                                      bs_byte_to_lba(blob->bs, sizeof(*page)),
    1710             :                                      blob_load_cpl, ctx);
    1711         123 :                 return;
    1712             :         }
    1713             : 
    1714             :         /* Parse the pages */
    1715        4306 :         rc = blob_parse(ctx->pages, ctx->num_pages, blob);
    1716        4306 :         if (rc) {
    1717          15 :                 blob_load_final(ctx, rc);
    1718          15 :                 return;
    1719             :         }
    1720             : 
    1721        4291 :         if (blob->extent_table_found == true) {
    1722             :                 /* If EXTENT_TABLE was found, that means support for it should be enabled. */
    1723        2624 :                 assert(blob->extent_rle_found == false);
    1724        2624 :                 blob->use_extent_table = true;
    1725             :         } else {
    1726             :                 /* If EXTENT_RLE or no extent_* descriptor was found disable support
    1727             :                  * for extent table. No extent_* descriptors means that blob has length of 0
    1728             :                  * and no extent_rle descriptors were persisted for it.
    1729             :                  * EXTENT_TABLE if used, is always present in metadata regardless of length. */
    1730        1667 :                 blob->use_extent_table = false;
    1731             :         }
    1732             : 
    1733             :         /* Check the clear_method stored in metadata vs what may have been passed
    1734             :          * via spdk_bs_open_blob_ext() and update accordingly.
    1735             :          */
    1736        4291 :         blob_update_clear_method(blob);
    1737             : 
                               /* The raw pages are not needed after parsing. ctx->pages is reset to
                                * NULL, which is what blob_load_cpl_extents_cpl() tests to detect its
                                * first iteration. */
    1738        4291 :         spdk_free(ctx->pages);
    1739        4291 :         ctx->pages = NULL;
    1740             : 
                               /* Blobs with an extent table still need their extent pages loaded, so
                                * start that walk with a zero status. Otherwise continue directly with
                                * the backing device. */
    1741        4291 :         if (blob->extent_table_found) {
    1742        2624 :                 blob_load_cpl_extents_cpl(seq, ctx, 0);
    1743             :         } else {
    1744        1667 :                 blob_load_backing_dev(seq, ctx);
    1745             :         }
    1746             : }
    1747             : 
    1748             : /* Load a blob from disk given a blobid */
    1749             : static void
    1750        4341 : blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
    1751             :           spdk_bs_sequence_cpl cb_fn, void *cb_arg)
    1752             : {
                               /* Entry point of the load chain. Allocates the load context and a
                                * buffer for one md page, then reads the blob's first md page with
                                * blob_load_cpl() as the completion.
                                *
                                * seq    - sequence to issue the read on
                                * blob   - blob to load; blob->id selects the first md page
                                * cb_fn  - stored in the context together with cb_arg and seq. It is
                                *          called directly from here with -ENOMEM if an allocation fails.
                                * cb_arg - argument for cb_fn
                                */
    1753             :         struct spdk_blob_load_ctx *ctx;
    1754             :         struct spdk_blob_store *bs;
    1755             :         uint32_t page_num;
    1756             :         uint64_t lba;
    1757             : 
    1758        4341 :         blob_verify_md_op(blob);
    1759             : 
    1760        4341 :         bs = blob->bs;
    1761             : 
    1762        4341 :         ctx = calloc(1, sizeof(*ctx));
    1763        4341 :         if (!ctx) {
    1764           0 :                 cb_fn(seq, cb_arg, -ENOMEM);
    1765           0 :                 return;
    1766             :         }
    1767             : 
    1768        4341 :         ctx->blob = blob;
                               /* ctx was zeroed by calloc(), so ctx->pages is NULL when it is passed
                                * to spdk_realloc() here; presumably this acts as a fresh allocation
                                * of one md page - TODO confirm against the spdk_realloc() contract. */
    1769        4341 :         ctx->pages = spdk_realloc(ctx->pages, bs->md_page_size, 0);
    1770        4341 :         if (!ctx->pages) {
    1771           0 :                 free(ctx);
    1772           0 :                 cb_fn(seq, cb_arg, -ENOMEM);
    1773           0 :                 return;
    1774             :         }
    1775        4341 :         ctx->num_pages = 1;
    1776        4341 :         ctx->cb_fn = cb_fn;
    1777        4341 :         ctx->cb_arg = cb_arg;
    1778        4341 :         ctx->seq = seq;
    1779             : 
                               /* The first md page of a blob is derived from its blob id. */
    1780        4341 :         page_num = bs_blobid_to_page(blob->id);
    1781        4341 :         lba = bs_md_page_to_lba(blob->bs, page_num);
    1782             : 
    1783        4341 :         blob->state = SPDK_BLOB_STATE_LOADING;
    1784             : 
    1785        4341 :         bs_sequence_read_dev(seq, &ctx->pages[0], lba,
    1786        4341 :                              bs_byte_to_lba(bs, bs->md_page_size),
    1787             :                              blob_load_cpl, ctx);
    1788             : }
    1789             : 
    1790             : struct spdk_blob_persist_ctx {
                               /* Per-request state of one blob persist operation. */

                               /* Blob this persist operates on. */
    1791             :         struct spdk_blob                *blob;
    1792             : 
                               /* Released with spdk_free() in blob_persist_complete_cb(). */
    1793             :         struct spdk_blob_md_page        *pages;
                               /* next_extent_page and extent_page are not referenced by the persist
                                * completion functions that follow this definition; presumably they are
                                * used while extent pages are written - TODO confirm. */
    1794             :         uint32_t                        next_extent_page;
    1795             :         struct spdk_blob_md_page        *extent_page;
    1796             : 
                               /* Completion to invoke: blob_persist_complete_cb() calls
                                * cb_fn(seq, cb_arg, 0). */
    1797             :         spdk_bs_sequence_t              *seq;
    1798             :         spdk_bs_sequence_cpl            cb_fn;
    1799             :         void                            *cb_arg;
                               /* Linkage for the blob's persists_to_complete and pending_persists
                                * queues. */
    1800             :         TAILQ_ENTRY(spdk_blob_persist_ctx) link;
    1801             : };
    1802             : 
    1803             : static void
    1804        1584 : bs_batch_clear_dev(struct spdk_blob *blob, spdk_bs_batch_t *batch, uint64_t lba,
    1805             :                    uint64_t lba_count)
    1806             : {
                               /* Add to batch the operation that clears lba_count blocks starting at
                                * lba, selected by blob->clear_method:
                                *   BLOB_CLEAR_WITH_DEFAULT, BLOB_CLEAR_WITH_UNMAP -> unmap
                                *   BLOB_CLEAR_WITH_WRITE_ZEROES                   -> write zeroes
                                *   BLOB_CLEAR_WITH_NONE and any other value       -> nothing is added
                                */
    1807        1584 :         switch (blob->clear_method) {
    1808        1584 :         case BLOB_CLEAR_WITH_DEFAULT:
    1809             :         case BLOB_CLEAR_WITH_UNMAP:
    1810        1584 :                 bs_batch_unmap_dev(batch, lba, lba_count);
    1811        1584 :                 break;
    1812           0 :         case BLOB_CLEAR_WITH_WRITE_ZEROES:
    1813           0 :                 bs_batch_write_zeroes_dev(batch, lba, lba_count);
    1814           0 :                 break;
    1815           0 :         case BLOB_CLEAR_WITH_NONE:
    1816             :         default:
    1817           0 :                 break;
    1818             :         }
    1819        1584 : }
    1820             : 
    1821             : static int
    1822        1437 : bs_super_validate(struct spdk_bs_super_block *super, struct spdk_blob_store *bs)
    1823             : {
                               /* Check a super block against the blobstore it is being loaded into.
                                *
                                * super - super block contents to validate
                                * bs    - blobstore providing the expected bstype and the device geometry
                                *
                                * Returns 0 when the super block is acceptable, otherwise:
                                *   -EILSEQ  version outside [SPDK_BS_INITIAL_VERSION, SPDK_BS_VERSION],
                                *            signature mismatch, CRC mismatch, or a recorded size larger
                                *            than the device (blockcnt * blocklen)
                                *   -ENXIO   bstype differs and bs->bstype is not all zeroes
                                */
    1824             :         uint32_t        crc;
                               /* Static storage, hence all zeroes; compared against bs->bstype below. */
    1825             :         static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];
    1826             : 
    1827        1437 :         if (super->version > SPDK_BS_VERSION ||
    1828        1432 :             super->version < SPDK_BS_INITIAL_VERSION) {
    1829          10 :                 return -EILSEQ;
    1830             :         }
    1831             : 
    1832        1427 :         if (memcmp(super->signature, SPDK_BS_SUPER_BLOCK_SIG,
    1833             :                    sizeof(super->signature)) != 0) {
    1834           0 :                 return -EILSEQ;
    1835             :         }
    1836             : 
    1837        1427 :         crc = blob_md_page_calc_crc(super);
    1838        1427 :         if (crc != super->crc) {
    1839           5 :                 return -EILSEQ;
    1840             :         }
    1841             : 
                               /* The bstype must either match exactly, or the caller's bstype must be
                                * all zeroes, which acts as a wildcard. */
    1842        1422 :         if (memcmp(&bs->bstype, &super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
    1843        1405 :                 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n");
    1844          17 :         } else if (memcmp(&bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
    1845           7 :                 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n");
    1846             :         } else {
    1847          10 :                 SPDK_DEBUGLOG(blob, "Unexpected bstype\n");
    1848          10 :                 SPDK_LOGDUMP(blob, "Expected:", bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
    1849          10 :                 SPDK_LOGDUMP(blob, "Found:", super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
    1850          10 :                 return -ENXIO;
    1851             :         }
    1852             : 
                               /* A blobstore recorded as larger than the device cannot be loaded. */
    1853        1412 :         if (super->size > bs->dev->blockcnt * bs->dev->blocklen) {
    1854          10 :                 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n",
    1855             :                                bs->dev->blockcnt * bs->dev->blocklen, super->size);
    1856          10 :                 return -EILSEQ;
    1857             :         }
    1858             : 
    1859        1402 :         return 0;
    1860             : }
    1861             : 
    1862             : static void bs_mark_dirty(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
    1863             :                           spdk_bs_sequence_cpl cb_fn, void *cb_arg);
    1864             : 
    1865             : static void
    1866        6304 : blob_persist_complete_cb(void *arg)
    1867             : {
                               /* Message handler scheduled by blob_persist_complete() through
                                * spdk_thread_send_msg(). It invokes the callback stored in the persist
                                * context and then frees the context and its pages buffer.
                                *
                                * arg - the struct spdk_blob_persist_ctx being completed
                                *
                                * NOTE(review): the status argument passed to cb_fn is always 0. The
                                * bserrno seen by blob_persist_complete() is not forwarded through this
                                * path - confirm that errors reach the caller some other way.
                                */
    1868        6304 :         struct spdk_blob_persist_ctx *ctx = arg;
    1869             : 
    1870             :         /* Call user callback */
    1871        6304 :         ctx->cb_fn(ctx->seq, ctx->cb_arg, 0);
    1872             : 
    1873             :         /* Free the memory */
    1874        6304 :         spdk_free(ctx->pages);
    1875        6304 :         free(ctx);
    1876        6304 : }
    1877             : 
    1878             : static void blob_persist_start(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
    1879             : 
    1880             : static void
    1881        6304 : blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno)
    1882             : {
                               /* Finish the persist described by ctx.
                                *
                                * On success (bserrno == 0) the blob is marked clean. Every context
                                * queued on blob->persists_to_complete is then removed and its
                                * completion is scheduled as a message. If blob->pending_persists is not
                                * empty, its entries become the new persists_to_complete list, the blob
                                * is marked dirty again and the next persist is started through
                                * bs_mark_dirty() with blob_persist_start() as the continuation.
                                *
                                * seq     - sequence of the persist that finished
                                * ctx     - its context; asserted to be the head of persists_to_complete
                                * bserrno - result of the persist
                                */
    1883             :         struct spdk_blob_persist_ctx    *next_persist, *tmp;
    1884        6304 :         struct spdk_blob                *blob = ctx->blob;
    1885             : 
    1886        6304 :         if (bserrno == 0) {
    1887        6239 :                 blob_mark_clean(blob);
    1888             :         }
    1889             : 
    1890        6304 :         assert(ctx == TAILQ_FIRST(&blob->persists_to_complete));
    1891             : 
    1892             :         /* Complete all persists that were pending when the current persist started */
    1893       12608 :         TAILQ_FOREACH_SAFE(next_persist, &blob->persists_to_complete, link, tmp) {
    1894        6304 :                 TAILQ_REMOVE(&blob->persists_to_complete, next_persist, link);
                                       /* The callback is not invoked inline; it is sent as a message to the
                                        * thread returned by spdk_get_thread(). */
    1895        6304 :                 spdk_thread_send_msg(spdk_get_thread(), blob_persist_complete_cb, next_persist);
    1896             :         }
    1897             : 
    1898        6304 :         if (TAILQ_EMPTY(&blob->pending_persists)) {
    1899        6276 :                 return;
    1900             :         }
    1901             : 
    1902             :         /* Queue up all pending persists for completion and start blob persist with first one */
    1903          28 :         TAILQ_SWAP(&blob->persists_to_complete, &blob->pending_persists, spdk_blob_persist_ctx, link);
    1904          28 :         next_persist = TAILQ_FIRST(&blob->persists_to_complete);
    1905             : 
    1906          28 :         blob->state = SPDK_BLOB_STATE_DIRTY;
    1907          28 :         bs_mark_dirty(seq, blob->bs, blob_persist_start, next_persist);
    1908             : }
    1909             : 
    1910             : static void
    1911        6239 : blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    1912             : {
                               /* Callback of the batch built by blob_persist_clear_extents().
                                *
                                * On error the persist is completed with bserrno right away. Otherwise
                                * the md pages of all allocated extent pages in the truncated range
                                * [num_extent_pages, extent_pages_array_size) are released, the
                                * extent_pages array is freed or shrunk to num_extent_pages entries, and
                                * the persist is completed.
                                *
                                * seq     - sequence of the persist
                                * cb_arg  - the struct spdk_blob_persist_ctx of the persist
                                * bserrno - result of the batch
                                */
    1913        6239 :         struct spdk_blob_persist_ctx    *ctx = cb_arg;
    1914        6239 :         struct spdk_blob                *blob = ctx->blob;
    1915        6239 :         struct spdk_blob_store          *bs = blob->bs;
    1916             :         size_t                          i;
    1917             : 
    1918        6239 :         if (bserrno != 0) {
    1919           0 :                 blob_persist_complete(seq, ctx, bserrno);
    1920           0 :                 return;
    1921             :         }
    1922             : 
                               /* bs->used_lock is held around the bs_release_md_page() calls. */
    1923        6239 :         spdk_spin_lock(&bs->used_lock);
    1924             : 
    1925             :         /* Release all extent_pages that were truncated */
    1926        8837 :         for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
    1927             :                 /* Nothing to release if it was not allocated */
    1928        2598 :                 if (blob->active.extent_pages[i] != 0) {
    1929         936 :                         bs_release_md_page(bs, blob->active.extent_pages[i]);
    1930             :                 }
    1931             :         }
    1932             : 
    1933        6239 :         spdk_spin_unlock(&bs->used_lock);
    1934             : 
                               /* Bring the array allocation in line with num_extent_pages. */
    1935        6239 :         if (blob->active.num_extent_pages == 0) {
    1936        4134 :                 free(blob->active.extent_pages);
    1937        4134 :                 blob->active.extent_pages = NULL;
    1938        4134 :                 blob->active.extent_pages_array_size = 0;
    1939        2105 :         } else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) {
    1940             : #ifndef __clang_analyzer__
    1941             :                 void *tmp;
    1942             : 
    1943             :                 /* scan-build really can't figure reallocs, workaround it */
                                       /* NOTE(review): the realloc() result is only checked by assert(). In
                                        * a build where assert() is compiled out, a NULL result would be
                                        * stored in extent_pages - confirm this cannot happen here. */
    1944           3 :                 tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages);
    1945           3 :                 assert(tmp != NULL);
    1946           3 :                 blob->active.extent_pages = tmp;
    1947             : #endif
    1948           3 :                 blob->active.extent_pages_array_size = blob->active.num_extent_pages;
    1949             :         }
    1950             : 
                               /* bserrno is 0 on this path. */
    1951        6239 :         blob_persist_complete(seq, ctx, bserrno);
    1952             : }
    1953             : 
    1954             : static void
    1955        6239 : blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
    1956             : {
                               /* Build and close a batch that writes zeroes over every allocated extent
                                * page in the truncated range [num_extent_pages,
                                * extent_pages_array_size). blob_persist_clear_extents_cpl() is
                                * registered as the batch callback.
                                *
                                * seq - sequence of the persist
                                * ctx - persist context; passed on to the batch callback
                                */
    1957        6239 :         struct spdk_blob                *blob = ctx->blob;
    1958        6239 :         struct spdk_blob_store          *bs = blob->bs;
    1959             :         size_t                          i;
    1960             :         uint64_t                        lba;
    1961             :         uint64_t                        lba_count;
    1962             :         spdk_bs_batch_t                 *batch;
    1963             : 
    1964        6239 :         batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx);
                               /* Every write below covers one md page, converted with bs_byte_to_lba(). */
    1965        6239 :         lba_count = bs_byte_to_lba(bs, bs->md_page_size);
    1966             : 
    1967             :         /* Clear all extent_pages that were truncated */
    1968        8837 :         for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
    1969             :                 /* Nothing to clear if it was not allocated */
    1970        2598 :                 if (blob->active.extent_pages[i] != 0) {
    1971         936 :                         lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]);
    1972         936 :                         bs_batch_write_zeroes_dev(batch, lba, lba_count);
    1973             :                 }
    1974             :         }
    1975             : 
    1976        6239 :         bs_batch_close(batch);
    1977        6239 : }
    1978             : 
    1979             : static void
    1980        6239 : blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    1981             : {
                               /* Callback of the batch built by blob_persist_clear_clusters().
                                *
                                * On error the persist is completed with bserrno right away. Otherwise
                                * all allocated clusters in the truncated range [num_clusters,
                                * cluster_array_size) are released, the clusters array is freed or shrunk
                                * to num_clusters entries, and the persist continues with
                                * blob_persist_clear_extents().
                                *
                                * seq     - sequence of the persist
                                * cb_arg  - the struct spdk_blob_persist_ctx of the persist
                                * bserrno - result of the batch
                                */
    1982        6239 :         struct spdk_blob_persist_ctx    *ctx = cb_arg;
    1983        6239 :         struct spdk_blob                *blob = ctx->blob;
    1984        6239 :         struct spdk_blob_store          *bs = blob->bs;
    1985             :         size_t                          i;
    1986             : 
    1987        6239 :         if (bserrno != 0) {
    1988           0 :                 blob_persist_complete(seq, ctx, bserrno);
    1989           0 :                 return;
    1990             :         }
    1991             : 
                               /* bs->used_lock is held around the bs_release_cluster() calls. */
    1992        6239 :         spdk_spin_lock(&bs->used_lock);
    1993             :         /* Release all clusters that were truncated */
    1994     1342490 :         for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
                                       /* The array stores LBAs; convert to a cluster number for release. */
    1995     1336251 :                 uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]);
    1996             : 
    1997             :                 /* Nothing to release if it was not allocated */
    1998     1336251 :                 if (blob->active.clusters[i] != 0) {
    1999        2926 :                         bs_release_cluster(bs, cluster_num);
    2000             :                 }
    2001             :         }
    2002        6239 :         spdk_spin_unlock(&bs->used_lock);
    2003             : 
                               /* Bring the array allocation in line with num_clusters. */
    2004        6239 :         if (blob->active.num_clusters == 0) {
    2005        2423 :                 free(blob->active.clusters);
    2006        2423 :                 blob->active.clusters = NULL;
    2007        2423 :                 blob->active.cluster_array_size = 0;
    2008        3816 :         } else if (blob->active.num_clusters != blob->active.cluster_array_size) {
    2009             : #ifndef __clang_analyzer__
    2010             :                 void *tmp;
    2011             : 
    2012             :                 /* scan-build really can't figure reallocs, workaround it */
                                       /* NOTE(review): the realloc() result is only checked by assert(). In
                                        * a build where assert() is compiled out, a NULL result would be
                                        * stored in clusters - confirm this cannot happen here. */
    2013          22 :                 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters);
    2014          22 :                 assert(tmp != NULL);
    2015          22 :                 blob->active.clusters = tmp;
    2016             : 
    2017             : #endif
    2018          22 :                 blob->active.cluster_array_size = blob->active.num_clusters;
    2019             :         }
    2020             : 
    2021             :         /* Move on to clearing extent pages */
    2022        6239 :         blob_persist_clear_extents(seq, ctx);
    2023             : }
    2024             : 
    2025             : static void
    2026        6239 : blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
    2027             : {
                               /* Build and close a batch that clears the data of every allocated
                                * cluster in the truncated range [num_clusters, cluster_array_size).
                                * Clusters whose LBAs directly follow each other are merged into one
                                * bs_batch_clear_dev() call. blob_persist_clear_clusters_cpl() is
                                * registered as the batch callback.
                                *
                                * seq - sequence of the persist
                                * ctx - persist context; passed on to the batch callback
                                */
    2028        6239 :         struct spdk_blob                *blob = ctx->blob;
    2029        6239 :         struct spdk_blob_store          *bs = blob->bs;
    2030             :         spdk_bs_batch_t                 *batch;
    2031             :         size_t                          i;
    2032             :         uint64_t                        lba;
    2033             :         uint64_t                        lba_count;
    2034             : 
    2035             :         /* Clusters don't move around in blobs. The list shrinks or grows
    2036             :          * at the end, but no changes ever occur in the middle of the list.
    2037             :          */
    2038             : 
    2039        6239 :         batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx);
    2040             : 
    2041             :         /* Clear all clusters that were truncated */
                               /* lba and lba_count describe the run currently being accumulated;
                                * lba_count == 0 means no run is open. */
    2042        6239 :         lba = 0;
    2043        6239 :         lba_count = 0;
    2044     1342490 :         for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
    2045     1336251 :                 uint64_t next_lba = blob->active.clusters[i];
    2046     1336251 :                 uint64_t next_lba_count = bs_cluster_to_lba(bs, 1);
    2047             : 
    2048     1336251 :                 if (next_lba > 0 && (lba + lba_count) == next_lba) {
    2049             :                         /* This cluster is contiguous with the previous one. */
    2050        1347 :                         lba_count += next_lba_count;
    2051        1347 :                         continue;
    2052     1334904 :                 } else if (next_lba == 0) {
                                               /* Unallocated entry: skipped without closing the open run. */
    2053     1333325 :                         continue;
    2054             :                 }
    2055             : 
    2056             :                 /* This cluster is not contiguous with the previous one. */
    2057             : 
    2058             :                 /* If a run of LBAs previously existing, clear them now */
    2059        1579 :                 if (lba_count > 0) {
    2060          45 :                         bs_batch_clear_dev(ctx->blob, batch, lba, lba_count);
    2061             :                 }
    2062             : 
    2063             :                 /* Start building the next batch */
    2064        1579 :                 lba = next_lba;
                                       /* next_lba is always non-zero here because zero entries were skipped
                                        * above, so the else branch below is not reachable. */
    2065        1579 :                 if (next_lba > 0) {
    2066        1579 :                         lba_count = next_lba_count;
    2067             :                 } else {
    2068           0 :                         lba_count = 0;
    2069             :                 }
    2070             :         }
    2071             : 
    2072             :         /* If we ended with a contiguous set of LBAs, clear them now */
    2073        6239 :         if (lba_count > 0) {
    2074        1534 :                 bs_batch_clear_dev(ctx->blob, batch, lba, lba_count);
    2075             :         }
    2076             : 
    2077        6239 :         bs_batch_close(batch);
    2078        6239 : }
    2079             : 
    2080             : static void
    2081        6244 : blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2082             : {
                               /* Callback of the batch built by blob_persist_zero_pages().
                                *
                                * On error the persist is completed with bserrno right away. Otherwise
                                * the md pages of the clean page list (except the first) are released,
                                * the blob's first md page is released too when active.num_pages is 0,
                                * and the persist continues with blob_persist_clear_clusters().
                                *
                                * seq     - sequence of the persist
                                * cb_arg  - the struct spdk_blob_persist_ctx of the persist
                                * bserrno - result of the batch
                                */
    2083        6244 :         struct spdk_blob_persist_ctx    *ctx = cb_arg;
    2084        6244 :         struct spdk_blob                *blob = ctx->blob;
    2085        6244 :         struct spdk_blob_store          *bs = blob->bs;
    2086             :         size_t                          i;
    2087             : 
    2088        6244 :         if (bserrno != 0) {
    2089           5 :                 blob_persist_complete(seq, ctx, bserrno);
    2090           5 :                 return;
    2091             :         }
    2092             : 
                               /* bs->used_lock is held around the bs_release_md_page() calls. */
    2093        6239 :         spdk_spin_lock(&bs->used_lock);
    2094             : 
    2095             :         /* This loop starts at 1 because the first page is special and handled
    2096             :          * below. The pages (except the first) are never written in place,
    2097             :          * so any pages in the clean list must be zeroed.
    2098             :          */
                               /* In this function the pages are released; the zeroing itself is queued
                                * by blob_persist_zero_pages() before this callback runs. */
    2099        6324 :         for (i = 1; i < blob->clean.num_pages; i++) {
    2100          85 :                 bs_release_md_page(bs, blob->clean.pages[i]);
    2101             :         }
    2102             : 
                               /* No active pages left: also release the first md page, whose number is
                                * derived from the blob id. */
    2103        6239 :         if (blob->active.num_pages == 0) {
    2104             :                 uint32_t page_num;
    2105             : 
    2106        1857 :                 page_num = bs_blobid_to_page(blob->id);
    2107        1857 :                 bs_release_md_page(bs, page_num);
    2108             :         }
    2109             : 
    2110        6239 :         spdk_spin_unlock(&bs->used_lock);
    2111             : 
    2112             :         /* Move on to clearing clusters */
    2113        6239 :         blob_persist_clear_clusters(seq, ctx);
    2114             : }
    2115             : 
    2116             : static void
    2117        6294 : blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2118             : {
    2119        6294 :         struct spdk_blob_persist_ctx    *ctx = cb_arg;
    2120        6294 :         struct spdk_blob                *blob = ctx->blob;
    2121        6294 :         struct spdk_blob_store          *bs = blob->bs;
    2122             :         uint64_t                        lba;
    2123             :         uint64_t                        lba_count;
    2124             :         spdk_bs_batch_t                 *batch;
    2125             :         size_t                          i;
    2126             : 
    2127        6294 :         if (bserrno != 0) {
    2128          50 :                 blob_persist_complete(seq, ctx, bserrno);
    2129          50 :                 return;
    2130             :         }
    2131             : 
    2132        6244 :         batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx);
    2133             : 
    2134        6244 :         lba_count = bs_byte_to_lba(bs, bs->md_page_size);
    2135             : 
    2136             :         /* This loop starts at 1 because the first page is special and handled
    2137             :          * below. The pages (except the first) are never written in place,
    2138             :          * so any pages in the clean list must be zeroed.
    2139             :          */
    2140        6329 :         for (i = 1; i < blob->clean.num_pages; i++) {
    2141          85 :                 lba = bs_md_page_to_lba(bs, blob->clean.pages[i]);
    2142             : 
    2143          85 :                 bs_batch_write_zeroes_dev(batch, lba, lba_count);
    2144             :         }
    2145             : 
    2146             :         /* The first page will only be zeroed if this is a delete. */
    2147        6244 :         if (blob->active.num_pages == 0) {
    2148             :                 uint32_t page_num;
    2149             : 
    2150             :                 /* The first page in the metadata goes where the blobid indicates */
    2151        1862 :                 page_num = bs_blobid_to_page(blob->id);
    2152        1862 :                 lba = bs_md_page_to_lba(bs, page_num);
    2153             : 
    2154        1862 :                 bs_batch_write_zeroes_dev(batch, lba, lba_count);
    2155             :         }
    2156             : 
    2157        6244 :         bs_batch_close(batch);
    2158             : }
    2159             : 
    2160             : static void
    2161        4432 : blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2162             : {
    2163        4432 :         struct spdk_blob_persist_ctx    *ctx = cb_arg;
    2164        4432 :         struct spdk_blob                *blob = ctx->blob;
    2165        4432 :         struct spdk_blob_store          *bs = blob->bs;
    2166             :         uint64_t                        lba;
    2167             :         uint32_t                        lba_count;
    2168             :         struct spdk_blob_md_page        *page;
    2169             : 
    2170        4432 :         if (bserrno != 0) {
    2171           0 :                 blob_persist_complete(seq, ctx, bserrno);
    2172           0 :                 return;
    2173             :         }
    2174             : 
    2175        4432 :         if (blob->active.num_pages == 0) {
    2176             :                 /* Move on to the next step */
    2177           0 :                 blob_persist_zero_pages(seq, ctx, 0);
    2178           0 :                 return;
    2179             :         }
    2180             : 
    2181        4432 :         lba_count = bs_byte_to_lba(bs, bs->md_page_size);
    2182             : 
    2183        4432 :         page = &ctx->pages[0];
    2184             :         /* The first page in the metadata goes where the blobid indicates */
    2185        4432 :         lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id));
    2186             : 
    2187        4432 :         bs_sequence_write_dev(seq, page, lba, lba_count,
    2188             :                               blob_persist_zero_pages, ctx);
    2189             : }
    2190             : 
    2191             : static void
    2192        4432 : blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
    2193             : {
    2194        4432 :         struct spdk_blob                *blob = ctx->blob;
    2195        4432 :         struct spdk_blob_store          *bs = blob->bs;
    2196             :         uint64_t                        lba;
    2197             :         uint32_t                        lba_count;
    2198             :         struct spdk_blob_md_page        *page;
    2199             :         spdk_bs_batch_t                 *batch;
    2200             :         size_t                          i;
    2201             : 
    2202             :         /* Clusters don't move around in blobs. The list shrinks or grows
    2203             :          * at the end, but no changes ever occur in the middle of the list.
    2204             :          */
    2205             : 
    2206        4432 :         lba_count = bs_byte_to_lba(bs, sizeof(*page));
    2207             : 
    2208        4432 :         batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx);
    2209             : 
    2210             :         /* This starts at 1. The root page is not written until
    2211             :          * all of the others are finished
    2212             :          */
    2213        4541 :         for (i = 1; i < blob->active.num_pages; i++) {
    2214         109 :                 page = &ctx->pages[i];
    2215         109 :                 assert(page->sequence_num == i);
    2216             : 
    2217         109 :                 lba = bs_md_page_to_lba(bs, blob->active.pages[i]);
    2218             : 
    2219         109 :                 bs_batch_write_dev(batch, page, lba, lba_count);
    2220             :         }
    2221             : 
    2222        4432 :         bs_batch_close(batch);
    2223        4432 : }
    2224             : 
    2225             : static int
    2226        4465 : blob_resize(struct spdk_blob *blob, uint64_t sz)
    2227             : {
    2228             :         uint64_t        i;
    2229             :         uint64_t        *tmp;
    2230             :         uint64_t        cluster;
    2231             :         uint32_t        lfmd; /*  lowest free md page */
    2232             :         uint64_t        num_clusters;
    2233             :         uint32_t        *ep_tmp;
    2234        4465 :         uint64_t        new_num_ep = 0, current_num_ep = 0;
    2235             :         struct spdk_blob_store *bs;
    2236             :         int             rc;
    2237             : 
    2238        4465 :         bs = blob->bs;
    2239             : 
    2240        4465 :         blob_verify_md_op(blob);
    2241             : 
    2242        4465 :         if (blob->active.num_clusters == sz) {
    2243         566 :                 return 0;
    2244             :         }
    2245             : 
    2246        3899 :         if (blob->active.num_clusters < blob->active.cluster_array_size) {
    2247             :                 /* If this blob was resized to be larger, then smaller, then
    2248             :                  * larger without syncing, then the cluster array already
    2249             :                  * contains spare assigned clusters we can use.
    2250             :                  */
    2251           0 :                 num_clusters = spdk_min(blob->active.cluster_array_size,
    2252             :                                         sz);
    2253             :         } else {
    2254        3899 :                 num_clusters = blob->active.num_clusters;
    2255             :         }
    2256             : 
    2257        3899 :         if (blob->use_extent_table) {
    2258             :                 /* Round up since every cluster beyond current Extent Table size,
    2259             :                  * requires new extent page. */
    2260        2359 :                 new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP);
    2261        2359 :                 current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP);
    2262             :         }
    2263             : 
    2264        3899 :         assert(!spdk_spin_held(&bs->used_lock));
    2265             : 
    2266             :         /* Check first that we have enough clusters and md pages before we start claiming them.
    2267             :          * bs->used_lock is held to ensure that clusters we think are free are still free when we go
    2268             :          * to claim them later in this function.
    2269             :          */
    2270        3899 :         if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) {
    2271        1624 :                 spdk_spin_lock(&bs->used_lock);
    2272        1624 :                 if ((sz - num_clusters) > bs->num_free_clusters) {
    2273          10 :                         rc = -ENOSPC;
    2274          10 :                         goto out;
    2275             :                 }
    2276        1614 :                 lfmd = 0;
    2277        2572 :                 for (i = current_num_ep; i < new_num_ep ; i++) {
    2278         958 :                         lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd);
    2279         958 :                         if (lfmd == UINT32_MAX) {
    2280             :                                 /* No more free md pages. Cannot satisfy the request */
    2281           0 :                                 rc = -ENOSPC;
    2282           0 :                                 goto out;
    2283             :                         }
    2284             :                 }
    2285             :         }
    2286             : 
    2287        3889 :         if (sz > num_clusters) {
    2288             :                 /* Expand the cluster array if necessary.
    2289             :                  * We only shrink the array when persisting.
    2290             :                  */
    2291        2130 :                 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz);
    2292        2130 :                 if (sz > 0 && tmp == NULL) {
    2293           0 :                         rc = -ENOMEM;
    2294           0 :                         goto out;
    2295             :                 }
    2296        2130 :                 memset(tmp + blob->active.cluster_array_size, 0,
    2297        2130 :                        sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size));
    2298        2130 :                 blob->active.clusters = tmp;
    2299        2130 :                 blob->active.cluster_array_size = sz;
    2300             : 
    2301             :                 /* Expand the extents table, only if enough clusters were added */
    2302        2130 :                 if (new_num_ep > current_num_ep && blob->use_extent_table) {
    2303        1255 :                         ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep);
    2304        1255 :                         if (new_num_ep > 0 && ep_tmp == NULL) {
    2305           0 :                                 rc = -ENOMEM;
    2306           0 :                                 goto out;
    2307             :                         }
    2308        1255 :                         memset(ep_tmp + blob->active.extent_pages_array_size, 0,
    2309        1255 :                                sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size));
    2310        1255 :                         blob->active.extent_pages = ep_tmp;
    2311        1255 :                         blob->active.extent_pages_array_size = new_num_ep;
    2312             :                 }
    2313             :         }
    2314             : 
    2315        3889 :         blob->state = SPDK_BLOB_STATE_DIRTY;
    2316             : 
    2317        3889 :         if (spdk_blob_is_thin_provisioned(blob) == false) {
    2318        3028 :                 cluster = 0;
    2319        3028 :                 lfmd = 0;
    2320       12281 :                 for (i = num_clusters; i < sz; i++) {
    2321        9253 :                         bs_allocate_cluster(blob, i, &cluster, &lfmd, true);
    2322             :                         /* Do not increment lfmd here.  lfmd will get updated
    2323             :                          * to the md_page allocated (if any) when a new extent
    2324             :                          * page is needed.  Just pass that value again,
    2325             :                          * bs_allocate_cluster will just start at that index
    2326             :                          * to find the next free md_page when needed.
    2327             :                          */
    2328             :                 }
    2329             :         }
    2330             : 
    2331             :         /* If we are shrinking the blob, we must adjust num_allocated_clusters */
    2332     1340190 :         for (i = sz; i < num_clusters; i++) {
    2333     1336301 :                 if (blob->active.clusters[i] != 0) {
    2334        2926 :                         blob->active.num_allocated_clusters--;
    2335             :                 }
    2336             :         }
    2337             : 
    2338        3889 :         blob->active.num_clusters = sz;
    2339        3889 :         blob->active.num_extent_pages = new_num_ep;
    2340             : 
    2341        3889 :         rc = 0;
    2342        3899 : out:
    2343        3899 :         if (spdk_spin_held(&bs->used_lock)) {
    2344        1624 :                 spdk_spin_unlock(&bs->used_lock);
    2345             :         }
    2346             : 
    2347        3899 :         return rc;
    2348             : }
    2349             : 
    2350             : static void
    2351        4432 : blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx)
    2352             : {
    2353        4432 :         spdk_bs_sequence_t *seq = ctx->seq;
    2354        4432 :         struct spdk_blob *blob = ctx->blob;
    2355        4432 :         struct spdk_blob_store *bs = blob->bs;
    2356             :         uint64_t i;
    2357             :         uint32_t page_num;
    2358             :         void *tmp;
    2359             :         int rc;
    2360             : 
    2361             :         /* Generate the new metadata */
    2362        4432 :         rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages);
    2363        4432 :         if (rc < 0) {
    2364           0 :                 blob_persist_complete(seq, ctx, rc);
    2365           0 :                 return;
    2366             :         }
    2367             : 
    2368        4432 :         assert(blob->active.num_pages >= 1);
    2369             : 
    2370             :         /* Resize the cache of page indices */
    2371        4432 :         tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
    2372        4432 :         if (!tmp) {
    2373           0 :                 blob_persist_complete(seq, ctx, -ENOMEM);
    2374           0 :                 return;
    2375             :         }
    2376        4432 :         blob->active.pages = tmp;
    2377             : 
    2378             :         /* Assign this metadata to pages. This requires two passes - one to verify that there are
    2379             :          * enough pages and a second to actually claim them. The used_lock is held across
    2380             :          * both passes to ensure things don't change in the middle.
    2381             :          */
    2382        4432 :         spdk_spin_lock(&bs->used_lock);
    2383        4432 :         page_num = 0;
    2384             :         /* Note that this loop starts at one. The first page location is fixed by the blobid. */
    2385        4541 :         for (i = 1; i < blob->active.num_pages; i++) {
    2386         109 :                 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
    2387         109 :                 if (page_num == UINT32_MAX) {
    2388           0 :                         spdk_spin_unlock(&bs->used_lock);
    2389           0 :                         blob_persist_complete(seq, ctx, -ENOMEM);
    2390           0 :                         return;
    2391             :                 }
    2392         109 :                 page_num++;
    2393             :         }
    2394             : 
    2395        4432 :         page_num = 0;
    2396        4432 :         blob->active.pages[0] = bs_blobid_to_page(blob->id);
    2397        4541 :         for (i = 1; i < blob->active.num_pages; i++) {
    2398         109 :                 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
    2399         109 :                 ctx->pages[i - 1].next = page_num;
    2400             :                 /* Now that previous metadata page is complete, calculate the crc for it. */
    2401         109 :                 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]);
    2402         109 :                 blob->active.pages[i] = page_num;
    2403         109 :                 bs_claim_md_page(bs, page_num);
    2404         109 :                 SPDK_DEBUGLOG(blob, "Claiming page %u for blob 0x%" PRIx64 "\n", page_num,
    2405             :                               blob->id);
    2406         109 :                 page_num++;
    2407             :         }
    2408        4432 :         spdk_spin_unlock(&bs->used_lock);
    2409        4432 :         ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]);
    2410             :         /* Start writing the metadata from last page to first */
    2411        4432 :         blob->state = SPDK_BLOB_STATE_CLEAN;
    2412        4432 :         blob_persist_write_page_chain(seq, ctx);
    2413             : }
    2414             : 
    2415             : static void
    2416        3108 : blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2417             : {
    2418        3108 :         struct spdk_blob_persist_ctx    *ctx = cb_arg;
    2419        3108 :         struct spdk_blob                *blob = ctx->blob;
    2420             :         size_t                          i;
    2421             :         uint32_t                        extent_page_id;
    2422        3108 :         uint32_t                        page_count = 0;
    2423             :         int                             rc;
    2424             : 
    2425        3108 :         if (ctx->extent_page != NULL) {
    2426         991 :                 spdk_free(ctx->extent_page);
    2427         991 :                 ctx->extent_page = NULL;
    2428             :         }
    2429             : 
    2430        3108 :         if (bserrno != 0) {
    2431           0 :                 blob_persist_complete(seq, ctx, bserrno);
    2432         991 :                 return;
    2433             :         }
    2434             : 
    2435             :         /* Only write out Extent Pages when blob was resized. */
    2436        6492 :         for (i = ctx->next_extent_page; i < blob->active.extent_pages_array_size; i++) {
    2437        4375 :                 extent_page_id = blob->active.extent_pages[i];
    2438        4375 :                 if (extent_page_id == 0) {
    2439             :                         /* No Extent Page to persist */
    2440        3384 :                         assert(spdk_blob_is_thin_provisioned(blob));
    2441        3384 :                         continue;
    2442             :                 }
    2443         991 :                 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id));
    2444         991 :                 ctx->next_extent_page = i + 1;
    2445         991 :                 rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page);
    2446         991 :                 if (rc < 0) {
    2447           0 :                         blob_persist_complete(seq, ctx, rc);
    2448           0 :                         return;
    2449             :                 }
    2450             : 
    2451         991 :                 blob->state = SPDK_BLOB_STATE_DIRTY;
    2452         991 :                 blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page);
    2453             : 
    2454         991 :                 ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page);
    2455             : 
    2456         991 :                 bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id),
    2457         991 :                                       bs_byte_to_lba(blob->bs, blob->bs->md_page_size),
    2458             :                                       blob_persist_write_extent_pages, ctx);
    2459         991 :                 return;
    2460             :         }
    2461             : 
    2462        2117 :         blob_persist_generate_new_md(ctx);
    2463             : }
    2464             : 
    2465             : static void
    2466        6304 : blob_persist_start(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2467             : {
    2468        6304 :         struct spdk_blob_persist_ctx *ctx = cb_arg;
    2469        6304 :         struct spdk_blob *blob = ctx->blob;
    2470             : 
    2471        6304 :         if (bserrno != 0) {
    2472          10 :                 blob_persist_complete(seq, ctx, bserrno);
    2473          10 :                 return;
    2474             :         }
    2475             : 
    2476        6294 :         if (blob->active.num_pages == 0) {
    2477             :                 /* This is the signal that the blob should be deleted.
    2478             :                  * Immediately jump to the clean up routine. */
    2479        1862 :                 assert(blob->clean.num_pages > 0);
    2480        1862 :                 blob->state = SPDK_BLOB_STATE_CLEAN;
    2481        1862 :                 blob_persist_zero_pages(seq, ctx, 0);
    2482        1862 :                 return;
    2483             : 
    2484             :         }
    2485             : 
    2486        4432 :         if (blob->clean.num_clusters < blob->active.num_clusters) {
    2487             :                 /* Blob was resized up */
    2488        2095 :                 assert(blob->clean.num_extent_pages <= blob->active.num_extent_pages);
    2489        2095 :                 ctx->next_extent_page = spdk_max(1, blob->clean.num_extent_pages) - 1;
    2490        2337 :         } else if (blob->active.num_clusters < blob->active.cluster_array_size) {
    2491             :                 /* Blob was resized down */
    2492          22 :                 assert(blob->clean.num_extent_pages >= blob->active.num_extent_pages);
    2493          22 :                 ctx->next_extent_page = spdk_max(1, blob->active.num_extent_pages) - 1;
    2494             :         } else {
    2495             :                 /* No change in size occurred */
    2496        2315 :                 blob_persist_generate_new_md(ctx);
    2497        2315 :                 return;
    2498             :         }
    2499             : 
    2500        2117 :         blob_persist_write_extent_pages(seq, ctx, 0);
    2501             : }
    2502             : 
    2503             : struct spdk_bs_mark_dirty {
    2504             :         struct spdk_blob_store          *bs;
    2505             :         struct spdk_bs_super_block      *super;
    2506             :         spdk_bs_sequence_cpl            cb_fn;
    2507             :         void                            *cb_arg;
    2508             : };
    2509             : 
    2510             : static void
    2511         197 : bs_mark_dirty_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2512             : {
    2513         197 :         struct spdk_bs_mark_dirty *ctx = cb_arg;
    2514             : 
    2515         197 :         if (bserrno == 0) {
    2516         187 :                 ctx->bs->clean = 0;
    2517             :         }
    2518             : 
    2519         197 :         ctx->cb_fn(seq, ctx->cb_arg, bserrno);
    2520             : 
    2521         197 :         spdk_free(ctx->super);
    2522         197 :         free(ctx);
    2523         197 : }
    2524             : 
    2525             : static void bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
    2526             :                            struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg);
    2527             : 
    2528             : 
    2529             : static void
    2530         197 : bs_mark_dirty_write(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2531             : {
    2532         197 :         struct spdk_bs_mark_dirty *ctx = cb_arg;
    2533             :         int rc;
    2534             : 
    2535         197 :         if (bserrno != 0) {
    2536           5 :                 bs_mark_dirty_write_cpl(seq, ctx, bserrno);
    2537           5 :                 return;
    2538             :         }
    2539             : 
    2540         192 :         rc = bs_super_validate(ctx->super, ctx->bs);
    2541         192 :         if (rc != 0) {
    2542           0 :                 bs_mark_dirty_write_cpl(seq, ctx, rc);
    2543           0 :                 return;
    2544             :         }
    2545             : 
    2546         192 :         ctx->super->clean = 0;
    2547         192 :         if (ctx->super->size == 0) {
    2548           5 :                 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
    2549             :         }
    2550             : 
    2551         192 :         bs_write_super(seq, ctx->bs, ctx->super, bs_mark_dirty_write_cpl, ctx);
    2552             : }
    2553             : 
    2554             : static void
    2555        6961 : bs_mark_dirty(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
    2556             :               spdk_bs_sequence_cpl cb_fn, void *cb_arg)
    2557             : {
    2558             :         struct spdk_bs_mark_dirty *ctx;
    2559             : 
    2560             :         /* Blobstore is already marked dirty */
    2561        6961 :         if (bs->clean == 0) {
    2562        6764 :                 cb_fn(seq, cb_arg, 0);
    2563        6764 :                 return;
    2564             :         }
    2565             : 
    2566         197 :         ctx = calloc(1, sizeof(*ctx));
    2567         197 :         if (!ctx) {
    2568           0 :                 cb_fn(seq, cb_arg, -ENOMEM);
    2569           0 :                 return;
    2570             :         }
    2571         197 :         ctx->bs = bs;
    2572         197 :         ctx->cb_fn = cb_fn;
    2573         197 :         ctx->cb_arg = cb_arg;
    2574             : 
    2575         197 :         ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
    2576             :                                   SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    2577         197 :         if (!ctx->super) {
    2578           0 :                 free(ctx);
    2579           0 :                 cb_fn(seq, cb_arg, -ENOMEM);
    2580           0 :                 return;
    2581             :         }
    2582             : 
    2583         197 :         bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0),
    2584         197 :                              bs_byte_to_lba(bs, sizeof(*ctx->super)),
    2585             :                              bs_mark_dirty_write, ctx);
    2586             : }
    2587             : 
    2588             : /* Write a blob to disk */
    2589             : static void
    2590       11346 : blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
    2591             :              spdk_bs_sequence_cpl cb_fn, void *cb_arg)
    2592             : {
    2593             :         struct spdk_blob_persist_ctx *ctx;
    2594             : 
    2595       11346 :         blob_verify_md_op(blob);
    2596             : 
    2597       11346 :         if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->persists_to_complete)) {
    2598        5042 :                 cb_fn(seq, cb_arg, 0);
    2599        5042 :                 return;
    2600             :         }
    2601             : 
    2602        6304 :         ctx = calloc(1, sizeof(*ctx));
    2603        6304 :         if (!ctx) {
    2604           0 :                 cb_fn(seq, cb_arg, -ENOMEM);
    2605           0 :                 return;
    2606             :         }
    2607        6304 :         ctx->blob = blob;
    2608        6304 :         ctx->seq = seq;
    2609        6304 :         ctx->cb_fn = cb_fn;
    2610        6304 :         ctx->cb_arg = cb_arg;
    2611             : 
    2612             :         /* Multiple blob persists can affect one another, via blob->state or
    2613             :          * blob mutable data changes. To prevent it, queue up the persists. */
    2614        6304 :         if (!TAILQ_EMPTY(&blob->persists_to_complete)) {
    2615          28 :                 TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link);
    2616          28 :                 return;
    2617             :         }
    2618        6276 :         TAILQ_INSERT_HEAD(&blob->persists_to_complete, ctx, link);
    2619             : 
    2620        6276 :         bs_mark_dirty(seq, blob->bs, blob_persist_start, ctx);
    2621             : }
    2622             : 
    2623             : struct spdk_blob_copy_cluster_ctx {
    2624             :         struct spdk_blob *blob;
    2625             :         uint8_t *buf;
    2626             :         uint64_t io_unit;
    2627             :         uint64_t new_cluster;
    2628             :         uint32_t new_extent_page;
    2629             :         spdk_bs_sequence_t *seq;
    2630             :         struct spdk_blob_md_page *new_cluster_page;
    2631             : };
    2632             : 
    2633             : struct spdk_blob_free_cluster_ctx {
    2634             :         struct spdk_blob *blob;
    2635             :         uint64_t page;
    2636             :         struct spdk_blob_md_page *md_page;
    2637             :         uint64_t cluster_num;
    2638             :         uint32_t extent_page;
    2639             :         spdk_bs_sequence_t *seq;
    2640             : };
    2641             : 
    /*
     * Final completion callback of the sequence started in
     * bs_allocate_and_copy_cluster() (installed there as cpl.u.blob_basic.cb_fn).
     *
     * Every user op queued on the channel's need_cluster_alloc list is moved to a
     * local list; each one is then executed when bserrno is 0, or aborted with
     * bserrno otherwise. Finally the copy buffer and the context are freed.
     *
     *   cb_arg  - struct spdk_blob_copy_cluster_ctx * of the allocation.
     *   bserrno - result the sequence was finished with.
     */
    2642             : static void
    2643        1025 : blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno)
    2644             : {
    2645        1025 :         struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
                               /* The sequence is viewed as a request set in order to reach its channel. */
    2646        1025 :         struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq;
    2647             :         TAILQ_HEAD(, spdk_bs_request_set) requests;
    2648             :         spdk_bs_user_op_t *op;
    2649             : 
    2650        1025 :         TAILQ_INIT(&requests);
                               /*
                                * Detach the whole pending list before touching any op.
                                * NOTE(review): presumably so that ops which get queued again while
                                * being executed are not processed by the loop below - confirm.
                                */
    2651        1025 :         TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link);
    2652             : 
    2653        2050 :         while (!TAILQ_EMPTY(&requests)) {
    2654        1025 :                 op = TAILQ_FIRST(&requests);
    2655        1025 :                 TAILQ_REMOVE(&requests, op, link);
    2656        1025 :                 if (bserrno == 0) {
    2657        1025 :                         bs_user_op_execute(op);
    2658             :                 } else {
    2659           0 :                         bs_user_op_abort(op, bserrno);
    2660             :                 }
    2661             :         }
    2662             : 
                               /* ctx->buf is NULL when no read buffer was allocated for this allocation. */
    2663        1025 :         spdk_free(ctx->buf);
    2664        1025 :         free(ctx);
    2665        1025 : }
    2666             : 
    /*
     * Completion callback handed to blob_free_cluster_on_md_thread() by
     * spdk_free_cluster_unmap_complete(). Finishes the sequence stored in the
     * context with bserrno and then frees the context.
     *
     *   cb_arg  - struct spdk_blob_free_cluster_ctx *.
     *   bserrno - result that is passed on to bs_sequence_finish().
     */
    2667             : static void
    2668          75 : blob_free_cluster_cpl(void *cb_arg, int bserrno)
    2669             : {
    2670          75 :         struct spdk_blob_free_cluster_ctx *ctx = cb_arg;
    2671          75 :         spdk_bs_sequence_t *seq = ctx->seq;
    2672             : 
    2673          75 :         bs_sequence_finish(seq, bserrno);
    2674             : 
    2675          75 :         free(ctx);
    2676          75 : }
    2677             : 
    /*
     * Undo the allocation recorded in ctx: release ctx->new_cluster and, when
     * ctx->new_extent_page is non-zero, release that md page as well. Both
     * releases happen while holding the blobstore's used_lock.
     */
    2678             : static void
    2679           5 : blob_insert_cluster_revert(struct spdk_blob_copy_cluster_ctx *ctx)
    2680             : {
    2681           5 :         spdk_spin_lock(&ctx->blob->bs->used_lock);
    2682           5 :         bs_release_cluster(ctx->blob->bs, ctx->new_cluster);
                               /* A value of 0 is treated as "no extent page was allocated". */
    2683           5 :         if (ctx->new_extent_page != 0) {
    2684           3 :                 bs_release_md_page(ctx->blob->bs, ctx->new_extent_page);
    2685             :         }
    2686           5 :         spdk_spin_unlock(&ctx->blob->bs->used_lock);
    2687           5 : }
    2688             : 
    /*
     * Completion of the clear issued by blob_insert_cluster_clear(); that function
     * also calls it directly with -ENOMEM when the batch cannot be opened.
     *
     * A failure is only logged as a warning: the allocation is reverted in either
     * case and the sequence is finished with bserrno.
     *
     * NOTE(review): the caller's comment in blob_insert_cluster_cpl() says the
     * -EEXIST case should "continue without error", yet a non-zero bserrno from
     * the clear is passed on to bs_sequence_finish() here - confirm this is
     * intended.
     *
     *   cb_arg  - struct spdk_blob_copy_cluster_ctx *.
     *   bserrno - result of the clear operation.
     */
    2689             : static void
    2690           5 : blob_insert_cluster_clear_cpl(void *cb_arg, int bserrno)
    2691             : {
    2692           5 :         struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
    2693             : 
    2694           5 :         if (bserrno) {
    2695           0 :                 SPDK_WARNLOG("Failed to clear cluster: %d\n", bserrno);
    2696             :         }
    2697             : 
    2698           5 :         blob_insert_cluster_revert(ctx);
    2699           5 :         bs_sequence_finish(ctx->seq, bserrno);
    2700           5 : }
    2701             : 
    /*
     * Clear the data of ctx->new_cluster on the device and continue in
     * blob_insert_cluster_clear_cpl(), which releases the cluster and finishes
     * the sequence. If the batch cannot be opened, that completion is invoked
     * directly with -ENOMEM.
     */
    2702             : static void
    2703           5 : blob_insert_cluster_clear(struct spdk_blob_copy_cluster_ctx *ctx)
    2704             : {
    2705             :         struct spdk_bs_cpl cpl;
    2706             :         spdk_bs_batch_t *batch;
    2707           5 :         struct spdk_io_channel *ch = spdk_io_channel_from_ctx(ctx->seq->channel);
    2708             : 
    2709             :         /*
    2710             :          * We allocated a cluster and we copied data to it. But now, we realized that we don't need
    2711             :          * this cluster and we want to release it. We must ensure that we clear the data on this
    2712             :          * cluster.
    2713             :          * The cluster may later be re-allocated by a thick-provisioned blob for example. When
    2714             :          * reading from this thick-provisioned blob before writing data, we should read zeroes.
    2715             :          */
    2716             : 
    2717           5 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    2718           5 :         cpl.u.blob_basic.cb_fn = blob_insert_cluster_clear_cpl;
    2719           5 :         cpl.u.blob_basic.cb_arg = ctx;
    2720             : 
    2721           5 :         batch = bs_batch_open(ch, &cpl, ctx->blob);
    2722           5 :         if (!batch) {
    2723           0 :                 blob_insert_cluster_clear_cpl(ctx, -ENOMEM);
    2724           0 :                 return;
    2725             :         }
    2726             : 
                               /*
                                * The range starts at the first LBA of the new cluster; its length is
                                * bs_cluster_to_lba(bs, 1), used here as the LBA count of one cluster.
                                */
    2727           5 :         bs_batch_clear_dev(ctx->blob, batch, bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster),
    2728           5 :                            bs_cluster_to_lba(ctx->blob->bs, 1));
    2729           5 :         bs_batch_close(batch);
    2730             : }
    2731             : 
    /*
     * Completion callback passed to blob_insert_cluster_on_md_thread().
     *
     *   - bserrno == 0:       the sequence is finished successfully.
     *   - bserrno == -EEXIST: the new cluster is cleared and released through
     *                         blob_insert_cluster_clear(), whose completion
     *                         finishes the sequence.
     *   - any other error:    the allocation is reverted and the sequence is
     *                         finished with that error.
     *
     *   cb_arg  - struct spdk_blob_copy_cluster_ctx *.
     *   bserrno - result of the metadata insert.
     */
    2732             : static void
    2733        1025 : blob_insert_cluster_cpl(void *cb_arg, int bserrno)
    2734             : {
    2735        1025 :         struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
    2736             : 
    2737        1025 :         if (bserrno) {
    2738           5 :                 if (bserrno == -EEXIST) {
    2739             :                         /* The metadata insert failed because another thread
    2740             :                          * allocated the cluster first. Clear and free our cluster
    2741             :                          * but continue without error. */
    2742           5 :                         blob_insert_cluster_clear(ctx);
    2743           5 :                         return;
    2744             :                 }
    2745             : 
    2746           0 :                 blob_insert_cluster_revert(ctx);
    2747             :         }
    2748             : 
    2749        1020 :         bs_sequence_finish(ctx->seq, bserrno);
    2750             : }
    2751             : 
    /*
     * Completion of the step that filled the new cluster with data: it is used by
     * both the write issued in blob_write_copy() and the device copy issued in
     * blob_copy(). On success the new cluster is handed to
     * blob_insert_cluster_on_md_thread(); on failure the sequence is finished
     * with the error.
     *
     * NOTE(review): the failure path does not call blob_insert_cluster_revert(),
     * and nothing visible in this part of the file releases ctx->new_cluster or
     * ctx->new_extent_page afterwards - confirm they are not leaked.
     *
     *   seq     - sequence of the allocation.
     *   cb_arg  - struct spdk_blob_copy_cluster_ctx *.
     *   bserrno - result of the write or copy.
     */
    2752             : static void
    2753         515 : blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2754             : {
    2755         515 :         struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
    2756             :         uint32_t cluster_number;
    2757             : 
    2758         515 :         if (bserrno) {
    2759             :                 /* The write failed, so jump to the final completion handler */
    2760           0 :                 bs_sequence_finish(seq, bserrno);
    2761           0 :                 return;
    2762             :         }
    2763             : 
                               /* ctx->io_unit holds the first io_unit of the cluster being allocated. */
    2764         515 :         cluster_number = bs_io_unit_to_cluster(ctx->blob->bs, ctx->io_unit);
    2765             : 
    2766         515 :         blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
    2767             :                                          ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx);
    2768             : }
    2769             : 
    /*
     * Completion of the backing-device read issued by
     * bs_allocate_and_copy_cluster(). On success the data in ctx->buf is written
     * to the new cluster, with blob_write_copy_cpl() as the next step; on failure
     * the sequence is finished with the error.
     *
     * NOTE(review): as in blob_write_copy_cpl(), the failure path does not revert
     * the cluster allocation - confirm it is released elsewhere.
     *
     *   seq     - sequence of the allocation.
     *   cb_arg  - struct spdk_blob_copy_cluster_ctx *.
     *   bserrno - result of the read.
     */
    2770             : static void
    2771         385 : blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    2772             : {
    2773         385 :         struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
    2774             : 
    2775         385 :         if (bserrno != 0) {
    2776             :                 /* The read failed, so jump to the final completion handler */
    2777           0 :                 bs_sequence_finish(seq, bserrno);
    2778           0 :                 return;
    2779             :         }
    2780             : 
    2781             :         /* Write whole cluster */
    2782         385 :         bs_sequence_write_dev(seq, ctx->buf,
    2783         385 :                               bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster),
    2784         385 :                               bs_cluster_to_lba(ctx->blob->bs, 1),
    2785             :                               blob_write_copy_cpl, ctx);
    2786             : }
    2787             : 
    /*
     * Decide whether the cluster starting at cluster_start_io_unit can be filled
     * with a device copy instead of a read followed by a write.
     *
     * Returns true only if the blob is not an esnap clone, the blobstore device
     * provides a copy callback, and the backing device's translate_lba() succeeds
     * for the cluster's starting LBA. translate_lba() receives base_lba as its
     * output argument and is only called when the first two conditions hold.
     */
    2788             : static bool
    2789        1005 : blob_can_copy(struct spdk_blob *blob, uint64_t cluster_start_io_unit, uint64_t *base_lba)
    2790             : {
    2791        1005 :         uint64_t lba = bs_dev_io_unit_to_lba(blob, blob->back_bs_dev, cluster_start_io_unit);
    2792             : 
    2793        1359 :         return (!blob_is_esnap_clone(blob) && blob->bs->dev->copy != NULL) &&
    2794         354 :                blob->back_bs_dev->translate_lba(blob->back_bs_dev, lba, base_lba);
    2795             : }
    2796             : 
    /*
     * Fill the new cluster with a device copy: lba_count blocks starting at
     * src_lba are copied to the first LBA of ctx->new_cluster as part of
     * ctx->seq, continuing in blob_write_copy_cpl().
     *
     * NOTE(review): the 'op' parameter is not used in this function.
     */
    2797             : static void
    2798         130 : blob_copy(struct spdk_blob_copy_cluster_ctx *ctx, spdk_bs_user_op_t *op, uint64_t src_lba)
    2799             : {
    2800         130 :         struct spdk_blob *blob = ctx->blob;
                               /* One cluster worth of bytes converted with the backing device's helper. */
    2801         130 :         uint64_t lba_count = bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz);
    2802             : 
    2803         130 :         bs_sequence_copy_dev(ctx->seq,
    2804         130 :                              bs_cluster_to_lba(blob->bs, ctx->new_cluster),
    2805             :                              src_lba,
    2806             :                              lba_count,
    2807             :                              blob_write_copy_cpl, ctx);
    2808         130 : }
    2809             : 
    /*
     * Allocate a cluster for the cluster of 'blob' that contains 'io_unit',
     * populate it from the backing device when necessary, and insert it into the
     * blob's metadata. 'op' is queued on the channel and is executed (or aborted)
     * by blob_allocate_and_copy_cluster_cpl() once the sequence finishes.
     *
     * Flow:
     *   1. If the channel already has ops waiting in need_cluster_alloc, 'op' is
     *      appended to that list and nothing else is done.
     *   2. A context is allocated; a DMA buffer is allocated only when the blob
     *      has a parent, the backing range is not all zeroes and a device copy is
     *      not possible.
     *   3. A cluster is allocated under used_lock and a sequence is started.
     *   4. With a parent and non-zero backing data, the data is either copied on
     *      the device or read from the backing device (then written by
     *      blob_write_copy()); otherwise the cluster is inserted right away.
     *
     * Every failure before the op is queued aborts 'op' with the error.
     *
     *   blob    - blob that needs the cluster.
     *   _ch     - I/O channel the operation was submitted on.
     *   io_unit - io_unit inside the cluster to allocate.
     *   op      - user operation waiting for the allocation.
     */
    2810             : static void
    2811        1025 : bs_allocate_and_copy_cluster(struct spdk_blob *blob,
    2812             :                              struct spdk_io_channel *_ch,
    2813             :                              uint64_t io_unit, spdk_bs_user_op_t *op)
    2814             : {
    2815             :         struct spdk_bs_cpl cpl;
    2816             :         struct spdk_bs_channel *ch;
    2817             :         struct spdk_blob_copy_cluster_ctx *ctx;
    2818             :         uint64_t cluster_start_io_unit;
    2819             :         uint32_t cluster_number;
    2820             :         bool is_zeroes;
    2821             :         bool can_copy;
    2822             :         bool is_valid_range;
    2823             :         uint64_t copy_src_lba;
    2824             :         int rc;
    2825             : 
    2826        1025 :         ch = spdk_io_channel_get_ctx(_ch);
    2827             : 
    2828        1025 :         if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) {
    2829             :                 /* There are already operations pending. Queue this user op
    2830             :                  * and return because it will be re-executed when the outstanding
    2831             :                  * cluster allocation completes. */
    2832           0 :                 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
    2833           0 :                 return;
    2834             :         }
    2835             : 
    2836             :         /* Round the io_unit offset down to the first io_unit in the cluster */
    2837        1025 :         cluster_start_io_unit = bs_io_unit_to_cluster_start(blob, io_unit);
    2838             : 
    2839             :         /* Calculate which index in the metadata cluster array the corresponding
    2840             :          * cluster is supposed to be at. */
    2841        1025 :         cluster_number = bs_io_unit_to_cluster_number(blob, io_unit);
    2842             : 
    2843        1025 :         ctx = calloc(1, sizeof(*ctx));
    2844        1025 :         if (!ctx) {
    2845           0 :                 bs_user_op_abort(op, -ENOMEM);
    2846           0 :                 return;
    2847             :         }
    2848             : 
    2849        1025 :         assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0);
    2850             : 
    2851        1025 :         ctx->blob = blob;
    2852        1025 :         ctx->io_unit = cluster_start_io_unit;
                               /* Use the channel's new_cluster_page as scratch space, zeroed for this allocation. */
    2853        1025 :         ctx->new_cluster_page = ch->new_cluster_page;
    2854        1025 :         memset(ctx->new_cluster_page, 0, blob->bs->md_page_size);
    2855             : 
    2856             :         /* Check if the cluster that we intend to do CoW for is valid for
    2857             :          * the backing dev. For zeroes backing dev, it'll be always valid.
    2858             :          * For other backing dev e.g. a snapshot, it could be invalid if
    2859             :          * the blob has been resized after snapshot was taken. */
    2860        2050 :         is_valid_range = blob->back_bs_dev->is_range_valid(blob->back_bs_dev,
    2861             :                          bs_dev_io_unit_to_lba(blob, blob->back_bs_dev, cluster_start_io_unit),
    2862        1025 :                          bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz));
    2863             : 
                               /* copy_src_lba is only written when blob_can_copy() is called; it is only read when can_copy is true. */
    2864        1025 :         can_copy = is_valid_range && blob_can_copy(blob, cluster_start_io_unit, &copy_src_lba);
    2865             : 
    2866        2030 :         is_zeroes = is_valid_range && blob->back_bs_dev->is_zeroes(blob->back_bs_dev,
    2867             :                         bs_dev_io_unit_to_lba(blob, blob->back_bs_dev, cluster_start_io_unit),
    2868        1005 :                         bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz));
                               /* A bounce buffer is needed only for the read-then-write path taken further below. */
    2869        1025 :         if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes && !can_copy) {
    2870         385 :                 ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen,
    2871             :                                        NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    2872         385 :                 if (!ctx->buf) {
    2873           0 :                         SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n",
    2874             :                                     blob->bs->cluster_sz);
    2875           0 :                         free(ctx);
    2876           0 :                         bs_user_op_abort(op, -ENOMEM);
    2877           0 :                         return;
    2878             :                 }
    2879             :         }
    2880             : 
    2881        1025 :         spdk_spin_lock(&blob->bs->used_lock);
    2882        1025 :         rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page,
    2883             :                                  false);
    2884        1025 :         spdk_spin_unlock(&blob->bs->used_lock);
    2885        1025 :         if (rc != 0) {
    2886           0 :                 spdk_free(ctx->buf);
    2887           0 :                 free(ctx);
    2888           0 :                 bs_user_op_abort(op, rc);
    2889           0 :                 return;
    2890             :         }
    2891             : 
    2892        1025 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    2893        1025 :         cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl;
    2894        1025 :         cpl.u.blob_basic.cb_arg = ctx;
    2895             : 
    2896        1025 :         ctx->seq = bs_sequence_start_blob(_ch, &cpl, blob);
    2897        1025 :         if (!ctx->seq) {
                                       /*
                                        * NOTE(review): only ctx->new_cluster is released here, whereas
                                        * blob_insert_cluster_revert() also releases ctx->new_extent_page
                                        * when it is non-zero. If bs_allocate_cluster() handed out an
                                        * extent page, it looks like it stays claimed on this path -
                                        * confirm and consider calling blob_insert_cluster_revert(ctx).
                                        */
    2898           0 :                 spdk_spin_lock(&blob->bs->used_lock);
    2899           0 :                 bs_release_cluster(blob->bs, ctx->new_cluster);
    2900           0 :                 spdk_spin_unlock(&blob->bs->used_lock);
    2901           0 :                 spdk_free(ctx->buf);
    2902           0 :                 free(ctx);
    2903           0 :                 bs_user_op_abort(op, -ENOMEM);
    2904           0 :                 return;
    2905             :         }
    2906             : 
    2907             :         /* Queue the user op to block other incoming operations */
    2908        1025 :         TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
    2909             : 
    2910        1025 :         if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) {
    2911         515 :                 if (can_copy) {
    2912         130 :                         blob_copy(ctx, op, copy_src_lba);
    2913             :                 } else {
    2914             :                         /* Read cluster from backing device */
    2915         385 :                         bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf,
    2916             :                                                 bs_dev_io_unit_to_lba(blob, blob->back_bs_dev, cluster_start_io_unit),
    2917         385 :                                                 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz),
    2918             :                                                 blob_write_copy, ctx);
    2919             :                 }
    2920             : 
    2921             :         } else {
                                       /* Nothing to populate: insert the new cluster into the metadata directly. */
    2922         510 :                 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
    2923             :                                                  ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx);
    2924             :         }
    2925             : }
    2926             : 
    /*
     * Translate an io_unit range of a blob into an LBA and an LBA count.
     *
     * If 'io_unit' is allocated in the blob, *lba is obtained with
     * bs_blob_io_unit_to_lba(), *lba_count is left equal to 'length' and true is
     * returned. Otherwise the blob must have a backing device: both *lba and
     * *lba_count are converted with bs_io_unit_to_back_dev_lba() and false is
     * returned.
     *
     *   blob      - blob the range belongs to.
     *   io_unit   - first io_unit of the range.
     *   length    - number of io_units in the range.
     *   lba       - out: starting LBA.
     *   lba_count - out: number of LBAs.
     */
    2927             : static inline bool
    2928       56755 : blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length,
    2929             :                                  uint64_t *lba, uint64_t *lba_count)
    2930             : {
    2931       56755 :         *lba_count = length;
    2932             : 
    2933       56755 :         if (!bs_io_unit_is_allocated(blob, io_unit)) {
    2934        5196 :                 assert(blob->back_bs_dev != NULL);
    2935        5196 :                 *lba = bs_io_unit_to_back_dev_lba(blob, io_unit);
    2936        5196 :                 *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count);
    2937        5196 :                 return false;
    2938             :         } else {
    2939       51559 :                 *lba = bs_blob_io_unit_to_lba(blob, io_unit);
    2940       51559 :                 return true;
    2941             :         }
    2942             : }
    2943             : 
    /*
     * State of an operation that blob_request_submit_op_split() splits into
     * pieces which each end at a cluster boundary.
     *
     *   blob, channel           - target of every piece.
     *   io_unit_offset          - offset of the next piece to submit.
     *   io_units_remaining      - io_units not submitted yet.
     *   curr_payload            - payload position of the next piece; advanced only
     *                             for reads and writes.
     *   op_type                 - operation issued for every piece.
     *   seq                     - sequence finished once all pieces are done or one
     *                             of them fails.
     *   in_submit_ctx           - true while a piece is being submitted from
     *                             blob_request_submit_op_split_next().
     *   completed_in_submit_ctx - set when a piece completed during its own
     *                             submission, so the submitter loops instead of
     *                             recursing.
     *   done                    - set when the operation finished during a
     *                             submission; the submitter then frees the context.
     */
    2944             : struct op_split_ctx {
    2945             :         struct spdk_blob *blob;
    2946             :         struct spdk_io_channel *channel;
    2947             :         uint64_t io_unit_offset;
    2948             :         uint64_t io_units_remaining;
    2949             :         void *curr_payload;
    2950             :         enum spdk_blob_op_type op_type;
    2951             :         spdk_bs_sequence_t *seq;
    2952             :         bool in_submit_ctx;
    2953             :         bool completed_in_submit_ctx;
    2954             :         bool done;
    2955             : };
    2956             : 
    /*
     * Submit the next piece of a split operation. It is called once by
     * blob_request_submit_op_split() and afterwards serves as the completion
     * callback of every piece.
     *
     * - On error, or when nothing remains, the sequence is finished with bserrno.
     *   The context is freed right away unless this call happens inside a
     *   submission, in which case 'done' is set and the submitter frees it.
     * - When called from inside a submission, only completed_in_submit_ctx is set
     *   and the function returns, so the submitting invocation loops rather than
     *   recursing.
     * - Otherwise pieces of at most "io_units up to the next cluster boundary" are
     *   submitted until one does not complete within its own submission.
     *
     *   cb_arg  - struct op_split_ctx *.
     *   bserrno - result of the previous piece (0 for the initial call).
     */
    2957             : static void
    2958         966 : blob_request_submit_op_split_next(void *cb_arg, int bserrno)
    2959             : {
    2960         966 :         struct op_split_ctx     *ctx = cb_arg;
    2961         966 :         struct spdk_blob        *blob = ctx->blob;
    2962         966 :         struct spdk_io_channel  *ch = ctx->channel;
    2963         966 :         enum spdk_blob_op_type  op_type = ctx->op_type;
    2964             :         uint8_t                 *buf;
    2965             :         uint64_t                offset;
    2966             :         uint64_t                length;
    2967             :         uint64_t                op_length;
    2968             : 
    2969         966 :         if (bserrno != 0 || ctx->io_units_remaining == 0) {
    2970         222 :                 bs_sequence_finish(ctx->seq, bserrno);
    2971         222 :                 if (ctx->in_submit_ctx) {
    2972             :                         /* Defer freeing of the ctx object, since it will be
    2973             :                          * accessed when this unwinds back to the submission
    2974             :                          * context.
    2975             :                          */
    2976          50 :                         ctx->done = true;
    2977             :                 } else {
    2978         172 :                         free(ctx);
    2979             :                 }
    2980         222 :                 return;
    2981             :         }
    2982             : 
    2983         744 :         if (ctx->in_submit_ctx) {
    2984             :                 /* If this split operation completed in the context
    2985             :                  * of its submission, mark the flag and return immediately
    2986             :                  * to avoid recursion.
    2987             :                  */
    2988          85 :                 ctx->completed_in_submit_ctx = true;
    2989          85 :                 return;
    2990             :         }
    2991             : 
    2992             :         while (true) {
    2993         744 :                 ctx->completed_in_submit_ctx = false;
    2994             : 
                                       /* Snapshot the current position; the context is advanced before submitting. */
    2995         744 :                 offset = ctx->io_unit_offset;
    2996         744 :                 length = ctx->io_units_remaining;
    2997         744 :                 buf = ctx->curr_payload;
    2998         744 :                 op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob,
    2999             :                                      offset));
    3000             : 
    3001             :                 /* Update length and payload for next operation */
    3002         744 :                 ctx->io_units_remaining -= op_length;
    3003         744 :                 ctx->io_unit_offset += op_length;
    3004         744 :                 if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) {
    3005         659 :                         ctx->curr_payload += op_length * blob->bs->io_unit_size;
    3006             :                 }
    3007             : 
    3008         744 :                 assert(!ctx->in_submit_ctx);
    3009         744 :                 ctx->in_submit_ctx = true;
    3010             : 
    3011         744 :                 switch (op_type) {
    3012         522 :                 case SPDK_BLOB_READ:
    3013         522 :                         spdk_blob_io_read(blob, ch, buf, offset, op_length,
    3014             :                                           blob_request_submit_op_split_next, ctx);
    3015         522 :                         break;
    3016         137 :                 case SPDK_BLOB_WRITE:
    3017         137 :                         spdk_blob_io_write(blob, ch, buf, offset, op_length,
    3018             :                                            blob_request_submit_op_split_next, ctx);
    3019         137 :                         break;
    3020          45 :                 case SPDK_BLOB_UNMAP:
    3021          45 :                         spdk_blob_io_unmap(blob, ch, offset, op_length,
    3022             :                                            blob_request_submit_op_split_next, ctx);
    3023          45 :                         break;
    3024          40 :                 case SPDK_BLOB_WRITE_ZEROES:
    3025          40 :                         spdk_blob_io_write_zeroes(blob, ch, offset, op_length,
    3026             :                                                   blob_request_submit_op_split_next, ctx);
    3027          40 :                         break;
    3028           0 :                 case SPDK_BLOB_READV:
    3029             :                 case SPDK_BLOB_WRITEV:
                                               /*
                                                * Vectored operations are rejected here: the sequence fails with
                                                * -EINVAL and the context is freed.
                                                * NOTE(review): the message below says "readv/write"; it
                                                * probably means "readv/writev".
                                                */
    3030           0 :                         SPDK_ERRLOG("readv/write not valid\n");
    3031           0 :                         bs_sequence_finish(ctx->seq, -EINVAL);
    3032           0 :                         free(ctx);
    3033           0 :                         return;
    3034             :                 }
    3035             : 
    3036             : #ifndef __clang_analyzer__
    3037             :                 /* scan-build reports a false positive around accessing the ctx here. It
    3038             :                  * forms a path that recursively calls this function, but then says
    3039             :                  * "assuming ctx->in_submit_ctx is false", when that isn't possible.
    3040             :                  * This path does free(ctx), returns to here, and reports a use-after-free
    3041             :                  * bug.  Wrapping this bit of code so that scan-build doesn't see it
    3042             :                  * works around the scan-build bug.
    3043             :                  */
    3044         744 :                 assert(ctx->in_submit_ctx);
    3045         744 :                 ctx->in_submit_ctx = false;
    3046             : 
    3047             :                 /* If the operation completed immediately, loop back and submit the
    3048             :                  * next operation.  Otherwise we can return and the next split
    3049             :                  * operation will get submitted when this current operation is
    3050             :                  * later completed asynchronously.
    3051             :                  */
    3052         744 :                 if (ctx->completed_in_submit_ctx) {
    3053          85 :                         continue;
    3054         659 :                 } else if (ctx->done) {
                                               /* The whole operation finished during the submission above; the deferred free happens here. */
    3055          50 :                         free(ctx);
    3056             :                 }
    3057             : #endif
    3058         659 :                 break;
    3059             :         }
    3060             : }
    3061             : 
    /*
     * Start an operation that is carried out as a series of smaller operations,
     * each ending at a cluster boundary (see blob_request_submit_op_split_next()).
     *
     * A context and a sequence are allocated; the sequence completes towards
     * cb_fn/cb_arg. If either allocation fails, cb_fn is called with -ENOMEM.
     *
     *   ch      - I/O channel to submit on.
     *   blob    - target blob; must not be NULL.
     *   payload - data buffer, stored as the starting payload position.
     *   offset  - first io_unit of the operation.
     *   length  - number of io_units.
     *   cb_fn   - completion callback.
     *   cb_arg  - argument for cb_fn.
     *   op_type - operation to perform for every piece.
     */
    3062             : static void
    3063         222 : blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob,
    3064             :                              void *payload, uint64_t offset, uint64_t length,
    3065             :                              spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
    3066             : {
    3067             :         struct op_split_ctx *ctx;
    3068             :         spdk_bs_sequence_t *seq;
    3069             :         struct spdk_bs_cpl cpl;
    3070             : 
    3071         222 :         assert(blob != NULL);
    3072             : 
                               /* calloc() leaves in_submit_ctx, completed_in_submit_ctx and done cleared. */
    3073         222 :         ctx = calloc(1, sizeof(struct op_split_ctx));
    3074         222 :         if (ctx == NULL) {
    3075           0 :                 cb_fn(cb_arg, -ENOMEM);
    3076           0 :                 return;
    3077             :         }
    3078             : 
    3079         222 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    3080         222 :         cpl.u.blob_basic.cb_fn = cb_fn;
    3081         222 :         cpl.u.blob_basic.cb_arg = cb_arg;
    3082             : 
    3083         222 :         seq = bs_sequence_start_blob(ch, &cpl, blob);
    3084         222 :         if (!seq) {
    3085           0 :                 free(ctx);
    3086           0 :                 cb_fn(cb_arg, -ENOMEM);
    3087           0 :                 return;
    3088             :         }
    3089             : 
    3090         222 :         ctx->blob = blob;
    3091         222 :         ctx->channel = ch;
    3092         222 :         ctx->curr_payload = payload;
    3093         222 :         ctx->io_unit_offset = offset;
    3094         222 :         ctx->io_units_remaining = length;
    3095         222 :         ctx->op_type = op_type;
    3096         222 :         ctx->seq = seq;
    3097             : 
                               /* Kick off the first piece; later pieces are driven by completions. */
    3098         222 :         blob_request_submit_op_split_next(ctx, 0);
    3099             : }
    3100             : 
    /*
     * Completion callback taking a struct spdk_blob_free_cluster_ctx. On failure
     * the sequence is finished with the error and the context is freed. On
     * success the cluster described by the context is handed to
     * blob_free_cluster_on_md_thread(), with blob_free_cluster_cpl() as the final
     * completion.
     *
     *   cb_arg  - struct spdk_blob_free_cluster_ctx *.
     *   bserrno - result of the preceding step.
     */
    3101             : static void
    3102          75 : spdk_free_cluster_unmap_complete(void *cb_arg, int bserrno)
    3103             : {
    3104          75 :         struct spdk_blob_free_cluster_ctx *ctx = cb_arg;
    3105             : 
    3106          75 :         if (bserrno) {
    3107           0 :                 bs_sequence_finish(ctx->seq, bserrno);
    3108           0 :                 free(ctx);
    3109           0 :                 return;
    3110             :         }
    3111             : 
    3112          75 :         blob_free_cluster_on_md_thread(ctx->blob, ctx->cluster_num,
    3113             :                                        ctx->extent_page, ctx->md_page, blob_free_cluster_cpl, ctx);
    3114             : }
    3115             : 
    3116             : static void
    3117       52830 : blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob,
    3118             :                               void *payload, uint64_t offset, uint64_t length,
    3119             :                               spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
    3120             : {
    3121             :         struct spdk_bs_cpl cpl;
    3122             :         uint64_t lba;
    3123             :         uint64_t lba_count;
    3124             :         bool is_allocated;
    3125             : 
    3126       52830 :         assert(blob != NULL);
    3127             : 
    3128       52830 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    3129       52830 :         cpl.u.blob_basic.cb_fn = cb_fn;
    3130       52830 :         cpl.u.blob_basic.cb_arg = cb_arg;
    3131             : 
    3132       52830 :         if (blob->frozen_refcnt) {
    3133             :                 /* This blob I/O is frozen */
    3134             :                 spdk_bs_user_op_t *op;
    3135           5 :                 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch);
    3136             : 
    3137           5 :                 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
    3138           5 :                 if (!op) {
    3139           0 :                         cb_fn(cb_arg, -ENOMEM);
    3140           5 :                         return;
    3141             :                 }
    3142             : 
    3143           5 :                 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);
    3144             : 
    3145           5 :                 return;
    3146             :         }
    3147             : 
    3148       52825 :         is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);
    3149             : 
    3150       52825 :         switch (op_type) {
    3151       25377 :         case SPDK_BLOB_READ: {
    3152             :                 spdk_bs_batch_t *batch;
    3153             : 
    3154       25377 :                 batch = bs_batch_open(_ch, &cpl, blob);
    3155       25377 :                 if (!batch) {
    3156           0 :                         cb_fn(cb_arg, -ENOMEM);
    3157           0 :                         return;
    3158             :                 }
    3159             : 
    3160       25377 :                 if (is_allocated) {
    3161             :                         /* Read from the blob */
    3162       23531 :                         bs_batch_read_dev(batch, payload, lba, lba_count);
    3163             :                 } else {
    3164             :                         /* Read from the backing block device */
    3165        1846 :                         bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count);
    3166             :                 }
    3167             : 
    3168       25377 :                 bs_batch_close(batch);
    3169       25377 :                 break;
    3170             :         }
    3171       27328 :         case SPDK_BLOB_WRITE:
    3172             :         case SPDK_BLOB_WRITE_ZEROES: {
    3173       27328 :                 if (is_allocated) {
    3174             :                         /* Write to the blob */
    3175             :                         spdk_bs_batch_t *batch;
    3176             : 
    3177       26888 :                         if (lba_count == 0) {
    3178           0 :                                 cb_fn(cb_arg, 0);
    3179           0 :                                 return;
    3180             :                         }
    3181             : 
    3182       26888 :                         batch = bs_batch_open(_ch, &cpl, blob);
    3183       26888 :                         if (!batch) {
    3184           0 :                                 cb_fn(cb_arg, -ENOMEM);
    3185           0 :                                 return;
    3186             :                         }
    3187             : 
    3188       26888 :                         if (op_type == SPDK_BLOB_WRITE) {
    3189       26848 :                                 bs_batch_write_dev(batch, payload, lba, lba_count);
    3190             :                         } else {
    3191          40 :                                 bs_batch_write_zeroes_dev(batch, lba, lba_count);
    3192             :                         }
    3193             : 
    3194       26888 :                         bs_batch_close(batch);
    3195             :                 } else {
    3196             :                         /* Queue this operation and allocate the cluster */
    3197             :                         spdk_bs_user_op_t *op;
    3198             : 
    3199         440 :                         op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
    3200         440 :                         if (!op) {
    3201           0 :                                 cb_fn(cb_arg, -ENOMEM);
    3202           0 :                                 return;
    3203             :                         }
    3204             : 
    3205         440 :                         bs_allocate_and_copy_cluster(blob, _ch, offset, op);
    3206             :                 }
    3207       27328 :                 break;
    3208             :         }
    3209         120 :         case SPDK_BLOB_UNMAP: {
    3210         120 :                 struct spdk_blob_free_cluster_ctx *ctx = NULL;
    3211             :                 spdk_bs_batch_t *batch;
    3212             : 
    3213             :                 /* if aligned with cluster release cluster */
    3214         210 :                 if (spdk_blob_is_thin_provisioned(blob) && is_allocated &&
    3215         175 :                     blob_backed_with_zeroes_dev(blob) &&
    3216          85 :                     bs_io_units_per_cluster(blob) == length) {
    3217          75 :                         struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch);
    3218             :                         uint64_t cluster_start_page;
    3219             :                         uint32_t cluster_number;
    3220             : 
    3221          75 :                         assert(offset % bs_io_units_per_cluster(blob) == 0);
    3222             : 
    3223             :                         /* Round the io_unit offset down to the first page in the cluster */
    3224          75 :                         cluster_start_page = bs_io_unit_to_cluster_start(blob, offset);
    3225             : 
    3226             :                         /* Calculate which index in the metadata cluster array the corresponding
    3227             :                          * cluster is supposed to be at. */
    3228          75 :                         cluster_number = bs_io_unit_to_cluster_number(blob, offset);
    3229             : 
    3230          75 :                         ctx = calloc(1, sizeof(*ctx));
    3231          75 :                         if (!ctx) {
    3232           0 :                                 cb_fn(cb_arg, -ENOMEM);
    3233           0 :                                 return;
    3234             :                         }
    3235             :                         /* When freeing a cluster the flow should be (in order):
    3236             :                          * 1. Unmap the underlying area (so if the cluster is reclaimed in the future, it won't leak
    3237             :                          * old data)
    3238             :                          * 2. Once the unmap completes (to avoid any races with incoming writes that may claim the
    3239             :                          * cluster), update and sync metadata freeing the cluster
    3240             :                          * 3. Once metadata update is done, complete the user unmap request
    3241             :                          */
    3242          75 :                         ctx->blob = blob;
    3243          75 :                         ctx->page = cluster_start_page;
    3244          75 :                         ctx->cluster_num = cluster_number;
    3245          75 :                         ctx->md_page = bs_channel->new_cluster_page;
    3246          75 :                         ctx->seq = bs_sequence_start_bs(_ch, &cpl);
    3247          75 :                         if (!ctx->seq) {
    3248           0 :                                 free(ctx);
    3249           0 :                                 cb_fn(cb_arg, -ENOMEM);
    3250           0 :                                 return;
    3251             :                         }
    3252             : 
    3253          75 :                         if (blob->use_extent_table) {
    3254          45 :                                 ctx->extent_page = *bs_cluster_to_extent_page(blob, cluster_number);
    3255             :                         }
    3256             : 
    3257          75 :                         cpl.u.blob_basic.cb_fn = spdk_free_cluster_unmap_complete;
    3258          75 :                         cpl.u.blob_basic.cb_arg = ctx;
    3259             :                 }
    3260             : 
    3261         120 :                 batch = bs_batch_open(_ch, &cpl, blob);
    3262         120 :                 if (!batch) {
    3263           0 :                         free(ctx);
    3264           0 :                         cb_fn(cb_arg, -ENOMEM);
    3265           0 :                         return;
    3266             :                 }
    3267             : 
    3268         120 :                 if (is_allocated) {
    3269         120 :                         bs_batch_unmap_dev(batch, lba, lba_count);
    3270             :                 }
    3271             : 
    3272         120 :                 bs_batch_close(batch);
    3273         120 :                 break;
    3274             :         }
    3275           0 :         case SPDK_BLOB_READV:
    3276             :         case SPDK_BLOB_WRITEV:
    3277           0 :                 SPDK_ERRLOG("readv/write not valid\n");
    3278           0 :                 cb_fn(cb_arg, -EINVAL);
    3279           0 :                 break;
    3280             :         }
    3281             : }
    3282             : 
    3283             : static void
    3284       53692 : blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel,
    3285             :                        void *payload, uint64_t offset, uint64_t length,
    3286             :                        spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
    3287             : {
    3288       53692 :         assert(blob != NULL);
    3289             : 
    3290       53692 :         if (blob->data_ro && op_type != SPDK_BLOB_READ) {
    3291           5 :                 cb_fn(cb_arg, -EPERM);
    3292           5 :                 return;
    3293             :         }
    3294             : 
    3295       53687 :         if (length == 0) {
    3296         615 :                 cb_fn(cb_arg, 0);
    3297         615 :                 return;
    3298             :         }
    3299             : 
    3300       53072 :         if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
    3301          30 :                 cb_fn(cb_arg, -EINVAL);
    3302          30 :                 return;
    3303             :         }
    3304       53042 :         if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) {
    3305       52820 :                 blob_request_submit_op_single(_channel, blob, payload, offset, length,
    3306             :                                               cb_fn, cb_arg, op_type);
    3307             :         } else {
    3308         222 :                 blob_request_submit_op_split(_channel, blob, payload, offset, length,
    3309             :                                              cb_fn, cb_arg, op_type);
    3310             :         }
    3311             : }
    3312             : 
    3313             : struct rw_iov_ctx {
    3314             :         struct spdk_blob *blob;
    3315             :         struct spdk_io_channel *channel;
    3316             :         spdk_blob_op_complete cb_fn;
    3317             :         void *cb_arg;
    3318             :         bool read;
    3319             :         int iovcnt;
    3320             :         struct iovec *orig_iov;
    3321             :         uint64_t io_unit_offset;
    3322             :         uint64_t io_units_remaining;
    3323             :         uint64_t io_units_done;
    3324             :         struct spdk_blob_ext_io_opts *ext_io_opts;
    3325             :         struct iovec iov[0];
    3326             : };
    3327             : 
    3328             : static void
    3329        3910 : rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    3330             : {
    3331        3910 :         assert(cb_arg == NULL);
    3332        3910 :         bs_sequence_finish(seq, bserrno);
    3333        3910 : }
    3334             : 
    3335             : static void
    3336         930 : rw_iov_split_next(void *cb_arg, int bserrno)
    3337             : {
    3338         930 :         struct rw_iov_ctx *ctx = cb_arg;
    3339         930 :         struct spdk_blob *blob = ctx->blob;
    3340             :         struct iovec *iov, *orig_iov;
    3341             :         int iovcnt;
    3342             :         size_t orig_iovoff;
    3343             :         uint64_t io_units_count, io_units_to_boundary, io_unit_offset;
    3344             :         uint64_t byte_count;
    3345             : 
    3346         930 :         if (bserrno != 0 || ctx->io_units_remaining == 0) {
    3347         255 :                 ctx->cb_fn(ctx->cb_arg, bserrno);
    3348         255 :                 free(ctx);
    3349         255 :                 return;
    3350             :         }
    3351             : 
    3352         675 :         io_unit_offset = ctx->io_unit_offset;
    3353         675 :         io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset);
    3354         675 :         io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary);
    3355             :         /*
    3356             :          * Get index and offset into the original iov array for our current position in the I/O sequence.
    3357             :          *  byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will
    3358             :          *  point to the current position in the I/O sequence.
    3359             :          */
    3360         675 :         byte_count = ctx->io_units_done * blob->bs->io_unit_size;
    3361         675 :         orig_iov = &ctx->orig_iov[0];
    3362         675 :         orig_iovoff = 0;
    3363        1435 :         while (byte_count > 0) {
    3364         760 :                 if (byte_count >= orig_iov->iov_len) {
    3365         440 :                         byte_count -= orig_iov->iov_len;
    3366         440 :                         orig_iov++;
    3367             :                 } else {
    3368         320 :                         orig_iovoff = byte_count;
    3369         320 :                         byte_count = 0;
    3370             :                 }
    3371             :         }
    3372             : 
    3373             :         /*
    3374             :          * Build an iov array for the next I/O in the sequence.  byte_count will keep track of how many
    3375             :          *  bytes of this next I/O remain to be accounted for in the new iov array.
    3376             :          */
    3377         675 :         byte_count = io_units_count * blob->bs->io_unit_size;
    3378         675 :         iov = &ctx->iov[0];
    3379         675 :         iovcnt = 0;
    3380        1725 :         while (byte_count > 0) {
    3381        1050 :                 assert(iovcnt < ctx->iovcnt);
    3382        1050 :                 iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff);
    3383        1050 :                 iov->iov_base = orig_iov->iov_base + orig_iovoff;
    3384        1050 :                 byte_count -= iov->iov_len;
    3385        1050 :                 orig_iovoff = 0;
    3386        1050 :                 orig_iov++;
    3387        1050 :                 iov++;
    3388        1050 :                 iovcnt++;
    3389             :         }
    3390             : 
    3391         675 :         ctx->io_unit_offset += io_units_count;
    3392         675 :         ctx->io_units_remaining -= io_units_count;
    3393         675 :         ctx->io_units_done += io_units_count;
    3394         675 :         iov = &ctx->iov[0];
    3395             : 
    3396         675 :         if (ctx->read) {
    3397         510 :                 spdk_blob_io_readv_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
    3398             :                                        io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts);
    3399             :         } else {
    3400         165 :                 spdk_blob_io_writev_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
    3401             :                                         io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts);
    3402             :         }
    3403             : }
    3404             : 
    3405             : static void
    3406        4195 : blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel,
    3407             :                            struct iovec *iov, int iovcnt,
    3408             :                            uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg, bool read,
    3409             :                            struct spdk_blob_ext_io_opts *ext_io_opts)
    3410             : {
    3411             :         struct spdk_bs_cpl      cpl;
    3412             : 
    3413        4195 :         assert(blob != NULL);
    3414             : 
    3415        4195 :         if (!read && blob->data_ro) {
    3416           5 :                 cb_fn(cb_arg, -EPERM);
    3417          10 :                 return;
    3418             :         }
    3419             : 
    3420        4190 :         if (length == 0) {
    3421           0 :                 cb_fn(cb_arg, 0);
    3422           0 :                 return;
    3423             :         }
    3424             : 
    3425        4190 :         if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
    3426           0 :                 cb_fn(cb_arg, -EINVAL);
    3427           0 :                 return;
    3428             :         }
    3429             : 
    3430             :         /*
    3431             :          * For now, we implement readv/writev using a sequence (instead of a batch) to account for having
    3432             :          *  to split a request that spans a cluster boundary.  For I/O that do not span a cluster boundary,
    3433             :          *  there will be no noticeable difference compared to using a batch.  For I/O that do span a cluster
    3434             :          *  boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need
    3435             :          *  to allocate a separate iov array and split the I/O such that none of the resulting
    3436             :          *  smaller I/O cross a cluster boundary.  These smaller I/O will be issued in sequence (not in parallel)
    3437             :          *  but since this case happens very infrequently, any performance impact will be negligible.
    3438             :          *
    3439             :          * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs
    3440             :          *  for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them
    3441             :          *  in a batch.  That would also require creating an intermediate spdk_bs_cpl that would get called
    3442             :          *  when the batch was completed, to allow for freeing the memory for the iov arrays.
    3443             :          */
    3444        4190 :         if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) {
    3445             :                 uint64_t lba_count;
    3446             :                 uint64_t lba;
    3447             :                 bool is_allocated;
    3448             : 
    3449        3930 :                 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    3450        3930 :                 cpl.u.blob_basic.cb_fn = cb_fn;
    3451        3930 :                 cpl.u.blob_basic.cb_arg = cb_arg;
    3452             : 
    3453        3930 :                 if (blob->frozen_refcnt) {
    3454             :                         /* This blob I/O is frozen */
    3455             :                         enum spdk_blob_op_type op_type;
    3456             :                         spdk_bs_user_op_t *op;
    3457           0 :                         struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel);
    3458             : 
    3459           0 :                         op_type = read ? SPDK_BLOB_READV : SPDK_BLOB_WRITEV;
    3460           0 :                         op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length);
    3461           0 :                         if (!op) {
    3462           0 :                                 cb_fn(cb_arg, -ENOMEM);
    3463           0 :                                 return;
    3464             :                         }
    3465             : 
    3466           0 :                         TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);
    3467             : 
    3468           0 :                         return;
    3469             :                 }
    3470             : 
    3471        3930 :                 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);
    3472             : 
    3473        3930 :                 if (read) {
    3474             :                         spdk_bs_sequence_t *seq;
    3475             : 
    3476        3565 :                         seq = bs_sequence_start_blob(_channel, &cpl, blob);
    3477        3565 :                         if (!seq) {
    3478           0 :                                 cb_fn(cb_arg, -ENOMEM);
    3479           0 :                                 return;
    3480             :                         }
    3481             : 
    3482        3565 :                         seq->ext_io_opts = ext_io_opts;
    3483             : 
    3484        3565 :                         if (is_allocated) {
    3485         675 :                                 bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL);
    3486             :                         } else {
    3487        2890 :                                 bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count,
    3488             :                                                          rw_iov_done, NULL);
    3489             :                         }
    3490             :                 } else {
    3491         365 :                         if (is_allocated) {
    3492             :                                 spdk_bs_sequence_t *seq;
    3493             : 
    3494         345 :                                 seq = bs_sequence_start_blob(_channel, &cpl, blob);
    3495         345 :                                 if (!seq) {
    3496           0 :                                         cb_fn(cb_arg, -ENOMEM);
    3497           0 :                                         return;
    3498             :                                 }
    3499             : 
    3500         345 :                                 seq->ext_io_opts = ext_io_opts;
    3501             : 
    3502         345 :                                 bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL);
    3503             :                         } else {
    3504             :                                 /* Queue this operation and allocate the cluster */
    3505             :                                 spdk_bs_user_op_t *op;
    3506             : 
    3507          20 :                                 op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset,
    3508             :                                                       length);
    3509          20 :                                 if (!op) {
    3510           0 :                                         cb_fn(cb_arg, -ENOMEM);
    3511           0 :                                         return;
    3512             :                                 }
    3513             : 
    3514          20 :                                 op->ext_io_opts = ext_io_opts;
    3515             : 
    3516          20 :                                 bs_allocate_and_copy_cluster(blob, _channel, offset, op);
    3517             :                         }
    3518             :                 }
    3519             :         } else {
    3520             :                 struct rw_iov_ctx *ctx;
    3521             : 
    3522         260 :                 ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec));
    3523         260 :                 if (ctx == NULL) {
    3524           5 :                         cb_fn(cb_arg, -ENOMEM);
    3525           5 :                         return;
    3526             :                 }
    3527             : 
    3528         255 :                 ctx->blob = blob;
    3529         255 :                 ctx->channel = _channel;
    3530         255 :                 ctx->cb_fn = cb_fn;
    3531         255 :                 ctx->cb_arg = cb_arg;
    3532         255 :                 ctx->read = read;
    3533         255 :                 ctx->orig_iov = iov;
    3534         255 :                 ctx->iovcnt = iovcnt;
    3535         255 :                 ctx->io_unit_offset = offset;
    3536         255 :                 ctx->io_units_remaining = length;
    3537         255 :                 ctx->io_units_done = 0;
    3538         255 :                 ctx->ext_io_opts = ext_io_opts;
    3539             : 
    3540         255 :                 rw_iov_split_next(ctx, 0);
    3541             :         }
    3542             : }
    3543             : 
    3544             : static struct spdk_blob *
    3545        9668 : blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid)
    3546             : {
    3547             :         struct spdk_blob find;
    3548             : 
    3549        9668 :         if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) {
    3550        8681 :                 return NULL;
    3551             :         }
    3552             : 
    3553         987 :         find.id = blobid;
    3554         987 :         return RB_FIND(spdk_blob_tree, &bs->open_blobs, &find);
    3555             : }
    3556             : 
    3557             : static void
    3558        2256 : blob_get_snapshot_and_clone_entries(struct spdk_blob *blob,
    3559             :                                     struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry)
    3560             : {
    3561        2256 :         assert(blob != NULL);
    3562        2256 :         *snapshot_entry = NULL;
    3563        2256 :         *clone_entry = NULL;
    3564             : 
    3565        2256 :         if (blob->parent_id == SPDK_BLOBID_INVALID) {
    3566        1901 :                 return;
    3567             :         }
    3568             : 
    3569         535 :         TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) {
    3570         470 :                 if ((*snapshot_entry)->id == blob->parent_id) {
    3571         290 :                         break;
    3572             :                 }
    3573             :         }
    3574             : 
    3575         355 :         if (*snapshot_entry != NULL) {
    3576         345 :                 TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) {
    3577         345 :                         if ((*clone_entry)->id == blob->id) {
    3578         290 :                                 break;
    3579             :                         }
    3580             :                 }
    3581             : 
    3582         290 :                 assert(*clone_entry != NULL);
    3583             :         }
    3584             : }
    3585             : 
    3586             : static int
    3587         993 : bs_channel_create(void *io_device, void *ctx_buf)
    3588             : {
    3589         993 :         struct spdk_blob_store          *bs = io_device;
    3590         993 :         struct spdk_bs_channel          *channel = ctx_buf;
    3591             :         struct spdk_bs_dev              *dev;
    3592         993 :         uint32_t                        max_ops = bs->max_channel_ops;
    3593             :         uint32_t                        i;
    3594             : 
    3595         993 :         dev = bs->dev;
    3596             : 
    3597         993 :         channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set));
    3598         993 :         if (!channel->req_mem) {
    3599           0 :                 return -1;
    3600             :         }
    3601             : 
    3602         993 :         TAILQ_INIT(&channel->reqs);
    3603             : 
    3604      509409 :         for (i = 0; i < max_ops; i++) {
    3605      508416 :                 TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link);
    3606             :         }
    3607             : 
    3608         993 :         channel->bs = bs;
    3609         993 :         channel->dev = dev;
    3610         993 :         channel->dev_channel = dev->create_channel(dev);
    3611             : 
    3612         993 :         if (!channel->dev_channel) {
    3613           0 :                 SPDK_ERRLOG("Failed to create device channel.\n");
    3614           0 :                 free(channel->req_mem);
    3615           0 :                 return -1;
    3616             :         }
    3617             : 
    3618         993 :         channel->new_cluster_page = spdk_zmalloc(bs->md_page_size, 0, NULL, SPDK_ENV_NUMA_ID_ANY,
    3619             :                                     SPDK_MALLOC_DMA);
    3620         993 :         if (!channel->new_cluster_page) {
    3621           0 :                 SPDK_ERRLOG("Failed to allocate new cluster page\n");
    3622           0 :                 free(channel->req_mem);
    3623           0 :                 channel->dev->destroy_channel(channel->dev, channel->dev_channel);
    3624           0 :                 return -1;
    3625             :         }
    3626             : 
    3627         993 :         TAILQ_INIT(&channel->need_cluster_alloc);
    3628         993 :         TAILQ_INIT(&channel->queued_io);
    3629         993 :         RB_INIT(&channel->esnap_channels);
    3630             : 
    3631         993 :         return 0;
    3632             : }
    3633             : 
    3634             : static void
    3635         993 : bs_channel_destroy(void *io_device, void *ctx_buf)
    3636             : {
    3637         993 :         struct spdk_bs_channel *channel = ctx_buf;
    3638             :         spdk_bs_user_op_t *op;
    3639             : 
    3640         993 :         while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) {
    3641           0 :                 op = TAILQ_FIRST(&channel->need_cluster_alloc);
    3642           0 :                 TAILQ_REMOVE(&channel->need_cluster_alloc, op, link);
    3643           0 :                 bs_user_op_abort(op, -EIO);
    3644             :         }
    3645             : 
    3646         993 :         while (!TAILQ_EMPTY(&channel->queued_io)) {
    3647           0 :                 op = TAILQ_FIRST(&channel->queued_io);
    3648           0 :                 TAILQ_REMOVE(&channel->queued_io, op, link);
    3649           0 :                 bs_user_op_abort(op, -EIO);
    3650             :         }
    3651             : 
    3652         993 :         blob_esnap_destroy_bs_channel(channel);
    3653             : 
    3654         993 :         free(channel->req_mem);
    3655         993 :         spdk_free(channel->new_cluster_page);
    3656         993 :         channel->dev->destroy_channel(channel->dev, channel->dev_channel);
    3657         993 : }
    3658             : 
    3659             : static void
    3660         973 : bs_dev_destroy(void *io_device)
    3661             : {
    3662         973 :         struct spdk_blob_store *bs = io_device;
    3663             :         struct spdk_blob        *blob, *blob_tmp;
    3664             : 
    3665         973 :         bs->dev->destroy(bs->dev);
    3666             : 
    3667         973 :         RB_FOREACH_SAFE(blob, spdk_blob_tree, &bs->open_blobs, blob_tmp) {
    3668           0 :                 RB_REMOVE(spdk_blob_tree, &bs->open_blobs, blob);
    3669           0 :                 spdk_bit_array_clear(bs->open_blobids, blob->id);
    3670           0 :                 blob_free(blob);
    3671             :         }
    3672             : 
    3673         973 :         spdk_spin_destroy(&bs->used_lock);
    3674             : 
    3675         973 :         spdk_bit_array_free(&bs->open_blobids);
    3676         973 :         spdk_bit_array_free(&bs->used_blobids);
    3677         973 :         spdk_bit_array_free(&bs->used_md_pages);
    3678         973 :         spdk_bit_pool_free(&bs->used_clusters);
    3679             :         /*
    3680             :          * If this function is called for any reason except a successful unload,
    3681             :          * the unload_cpl type will be NONE and this will be a nop.
    3682             :          */
    3683         973 :         bs_call_cpl(&bs->unload_cpl, bs->unload_err);
    3684             : 
    3685         973 :         free(bs);
    3686         973 : }
    3687             : 
    /*
     * Record a blob as a clone in the blobstore's in-memory snapshot list.
     *
     * Blobs whose parent_id is SPDK_BLOBID_INVALID or
     * SPDK_BLOBID_EXTERNAL_SNAPSHOT are ignored. Otherwise the entry for the
     * parent snapshot is looked up with bs_get_snapshot_entry() (and created and
     * appended to bs->snapshots when missing), and an entry for blob->id is
     * appended to that snapshot's clone list unless one is already present.
     *
     * Returns 0 on success or when there is nothing to record, -ENOMEM when an
     * entry cannot be allocated.
     */
    3688             : static int
    3689        1139 : bs_blob_list_add(struct spdk_blob *blob)
    3690             : {
    3691             :         spdk_blob_id snapshot_id;
    3692        1139 :         struct spdk_blob_list *snapshot_entry = NULL;
    3693        1139 :         struct spdk_blob_list *clone_entry = NULL;
    3694             : 
    3695        1139 :         assert(blob != NULL);
    3696             : 
    3697        1139 :         snapshot_id = blob->parent_id;
    3698        1139 :         if (snapshot_id == SPDK_BLOBID_INVALID ||
    3699             :             snapshot_id == SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
    3700         617 :                 return 0;
    3701             :         }
    3702             : 
    3703         522 :         snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id);
    3704         522 :         if (snapshot_entry == NULL) {
    3705             :                 /* Snapshot not found */
    3706         362 :                 snapshot_entry = calloc(1, sizeof(struct spdk_blob_list));
    3707         362 :                 if (snapshot_entry == NULL) {
    3708           0 :                         return -ENOMEM;
    3709             :                 }
    3710         362 :                 snapshot_entry->id = snapshot_id;
    3711         362 :                 TAILQ_INIT(&snapshot_entry->clones);
    3712         362 :                 TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link);
    3713             :         } else {
                                       /*
                                        * Snapshot already tracked: look for an existing entry for this
                                        * clone. clone_entry ends up NULL when the loop finds no match.
                                        */
    3714         255 :                 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
    3715          95 :                         if (clone_entry->id == blob->id) {
    3716           0 :                                 break;
    3717             :                         }
    3718             :                 }
    3719             :         }
    3720             : 
    3721         522 :         if (clone_entry == NULL) {
    3722             :                 /* Clone not found */
    3723         522 :                 clone_entry = calloc(1, sizeof(struct spdk_blob_list));
    3724         522 :                 if (clone_entry == NULL) {
                                               /*
                                                * A snapshot entry created above stays linked on
                                                * bs->snapshots (with no clones) on this path.
                                                */
    3725           0 :                         return -ENOMEM;
    3726             :                 }
    3727         522 :                 clone_entry->id = blob->id;
    3728         522 :                 TAILQ_INIT(&clone_entry->clones);
    3729         522 :                 TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link);
    3730         522 :                 snapshot_entry->clone_count++;
    3731             :         }
    3732             : 
    3733         522 :         return 0;
    3734             : }
    3735             : 
    /*
     * Drop a blob from its parent snapshot's in-memory clone list.
     *
     * The snapshot and clone entries are obtained from
     * blob_get_snapshot_and_clone_entries(). When no snapshot entry is returned
     * nothing is changed. Otherwise blob->parent_id is reset to
     * SPDK_BLOBID_INVALID, the clone entry is unlinked and freed, and the
     * snapshot's clone_count is decremented. The snapshot entry itself is kept.
     */
    3736             : static void
    3737        2158 : bs_blob_list_remove(struct spdk_blob *blob)
    3738             : {
    3739        2158 :         struct spdk_blob_list *snapshot_entry = NULL;
    3740        2158 :         struct spdk_blob_list *clone_entry = NULL;
    3741             : 
    3742        2158 :         blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry);
    3743             : 
    3744        2158 :         if (snapshot_entry == NULL) {
    3745        1888 :                 return;
    3746             :         }
    3747             : 
                               /*
                                * NOTE(review): clone_entry is used without a NULL check; presumably
                                * the helper always sets it when it sets snapshot_entry - confirm.
                                */
    3748         270 :         blob->parent_id = SPDK_BLOBID_INVALID;
    3749         270 :         TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
    3750         270 :         free(clone_entry);
    3751             : 
    3752         270 :         snapshot_entry->clone_count--;
    3753             : }
    3754             : 
    /*
     * Free the whole in-memory snapshot list of a blobstore: every entry on
     * bs->snapshots and every clone entry hanging off each of them.
     *
     * Always returns 0.
     */
    3755             : static int
    3756         973 : bs_blob_list_free(struct spdk_blob_store *bs)
    3757             : {
    3758             :         struct spdk_blob_list *snapshot_entry;
    3759             :         struct spdk_blob_list *snapshot_entry_tmp;
    3760             :         struct spdk_blob_list *clone_entry;
    3761             :         struct spdk_blob_list *clone_entry_tmp;
    3762             : 
                               /* The _SAFE iterators are needed because entries are freed while walking. */
    3763        1155 :         TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) {
    3764         374 :                 TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) {
    3765         192 :                         TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
    3766         192 :                         free(clone_entry);
    3767             :                 }
    3768         182 :                 TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link);
    3769         182 :                 free(snapshot_entry);
    3770             :         }
    3771             : 
    3772         973 :         return 0;
    3773             : }
    3774             : 
    /*
     * Start tearing down a blobstore: free the snapshot list, unregister the
     * metadata thread and unregister the blobstore io_device, passing
     * bs_dev_destroy as the unregister callback.
     *
     * The int results of bs_blob_list_free() and bs_unregister_md_thread() are
     * not checked. NOTE(review): bs itself is not freed here; presumably
     * bs_dev_destroy releases it once the io_device is gone - confirm.
     */
    3775             : static void
    3776         973 : bs_free(struct spdk_blob_store *bs)
    3777             : {
    3778         973 :         bs_blob_list_free(bs);
    3779             : 
    3780         973 :         bs_unregister_md_thread(bs);
    3781         973 :         spdk_io_device_unregister(bs, bs_dev_destroy);
    3782         973 : }
    3783             : 
    /*
     * Fill a caller-provided spdk_bs_opts with default values.
     *
     * opts_size is the size of the structure as the caller knows it. The first
     * opts_size bytes are zeroed, opts->opts_size is recorded, and each default
     * is written only when the field lies entirely within opts_size, so a caller
     * built against a smaller structure is never written past its end.
     *
     * Logs an error and returns without touching anything when opts is NULL or
     * opts_size is 0.
     */
    3784             : void
    3785        1303 : spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size)
    3786             : {
    3787             : 
    3788        1303 :         if (!opts) {
    3789           0 :                 SPDK_ERRLOG("opts should not be NULL\n");
    3790           0 :                 return;
    3791             :         }
    3792             : 
    3793        1303 :         if (!opts_size) {
    3794           0 :                 SPDK_ERRLOG("opts_size should not be zero value\n");
    3795           0 :                 return;
    3796             :         }
    3797             : 
    3798        1303 :         memset(opts, 0, opts_size);
    3799        1303 :         opts->opts_size = opts_size;
    3800             : 
                       /* True when `field` fits completely inside the first opts_size bytes. */
    3801             : #define FIELD_OK(field) \
    3802             :         offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size
    3803             : 
                       /* Assign `value` to `field` only when the caller's structure contains it. */
    3804             : #define SET_FIELD(field, value) \
    3805             :         if (FIELD_OK(field)) { \
    3806             :                 opts->field = value; \
    3807             :         } \
    3808             : 
    3809        1303 :         SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ);
    3810        1303 :         SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES);
                               /*
                                * NOTE(review): max_md_ops is defaulted from the NUM_MD_PAGES constant,
                                * the same one used for num_md_pages above. This looks like a
                                * copy-paste slip - confirm which default was intended.
                                */
    3811        1303 :         SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES);
    3812        1303 :         SET_FIELD(max_channel_ops, SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS);
    3813        1303 :         SET_FIELD(clear_method,  BS_CLEAR_WITH_UNMAP);
    3814             : 
                               /* bstype is not assigned with '=', so it is cleared with memset instead. */
    3815        1303 :         if (FIELD_OK(bstype)) {
    3816        1303 :                 memset(&opts->bstype, 0, sizeof(opts->bstype));
    3817             :         }
    3818             : 
    3819        1303 :         SET_FIELD(iter_cb_fn, NULL);
    3820        1303 :         SET_FIELD(iter_cb_arg, NULL);
    3821        1303 :         SET_FIELD(force_recover, false);
    3822        1303 :         SET_FIELD(esnap_bs_dev_create, NULL);
    3823        1303 :         SET_FIELD(esnap_ctx, NULL);
    3824             : 
    3825             : #undef FIELD_OK
    3826             : #undef SET_FIELD
    3827             : }
    3828             : 
    /*
     * Sanity-check blobstore options.
     *
     * Rejects options where cluster_sz, num_md_pages, max_md_ops or
     * max_channel_ops is 0, and options whose cluster_sz is not a multiple of
     * SPDK_BS_PAGE_SIZE.
     *
     * Returns 0 when the options pass, -1 (not a negative errno) otherwise.
     */
    3829             : static int
    3830         602 : bs_opts_verify(struct spdk_bs_opts *opts)
    3831             : {
    3832         602 :         if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 ||
    3833         597 :             opts->max_channel_ops == 0) {
    3834           5 :                 SPDK_ERRLOG("Blobstore options cannot be set to 0\n");
    3835           5 :                 return -1;
    3836             :         }
    3837             : 
                               /* The message says "blocklen", but the value checked is SPDK_BS_PAGE_SIZE. */
    3838         597 :         if ((opts->cluster_sz % SPDK_BS_PAGE_SIZE) != 0) {
    3839           5 :                 SPDK_ERRLOG("Cluster size %" PRIu32 " is not an integral multiple of blocklen %" PRIu32"\n",
    3840             :                             opts->cluster_sz, SPDK_BS_PAGE_SIZE);
    3841           5 :                 return -1;
    3842             :         }
    3843             : 
    3844         592 :         return 0;
    3845             : }
    3846             : 
    3847             : /* START spdk_bs_load */
    3848             : 
    3849             : /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */
    3850             : 
    3851             : struct spdk_bs_load_ctx {
                               /* Blobstore this context operates on (set in bs_alloc). */
    3852             :         struct spdk_blob_store          *bs;
                               /* DMA buffer holding the super block (allocated in bs_alloc). */
    3853             :         struct spdk_bs_super_block      *super;
    3854             : 
                               /* DMA buffer for a metadata mask, e.g. the one written by bs_write_used_*(). */
    3855             :         struct spdk_bs_md_mask          *mask;
    3856             :         bool                            in_page_chain;
    3857             :         uint32_t                        page_index;
    3858             :         uint32_t                        cur_page;
    3859             :         struct spdk_blob_md_page        *page;
    3860             : 
    3861             :         uint64_t                        num_extent_pages;
    3862             :         uint32_t                        *extent_page_num;
    3863             :         struct spdk_blob_md_page        *extent_pages;
                               /*
                                * Cluster bit array owned by this context (created in bs_alloc); it is
                                * the source of the cluster mask when bs->used_clusters is not set.
                                */
    3864             :         struct spdk_bit_array           *used_clusters;
    3865             : 
                               /* Sequence completed with the error code by bs_load_ctx_fail(). */
    3866             :         spdk_bs_sequence_t                      *seq;
                               /* Per-blob callback and its argument, copied from spdk_bs_opts in bs_alloc. */
    3867             :         spdk_blob_op_with_handle_complete       iter_cb_fn;
    3868             :         void                                    *iter_cb_arg;
                               /* Blob, and its id, currently being examined or repaired during load. */
    3869             :         struct spdk_blob                        *blob;
    3870             :         spdk_blob_id                            blobid;
    3871             : 
                               /* Copied from spdk_bs_opts.force_recover in bs_alloc. */
    3872             :         bool                                    force_recover;
    3873             : 
    3874             :         /* These fields are used in the spdk_bs_dump path. */
    3875             :         bool                                    dumping;
    3876             :         FILE                                    *fp;
    3877             :         spdk_bs_dump_print_xattr                print_xattr_fn;
    3878             :         char                                    xattr_name[4096];
    3879             : };
    3880             : 
    /*
     * Derive the per-cluster counts of a blobstore from cluster_sz,
     * md_page_size and io_unit_size, which must already be set (and non-zero,
     * since they are used as divisors).
     *
     * The matching *_shift field is written only when the count is a power of
     * two; otherwise it is left as it was.
     */
    3881             : static void
    3882        1324 : bs_init_per_cluster_fields(struct spdk_blob_store *bs)
    3883             : {
    3884        1324 :         bs->pages_per_cluster = bs->cluster_sz / bs->md_page_size;
    3885        1324 :         if (spdk_u32_is_pow2(bs->pages_per_cluster)) {
    3886        1324 :                 bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster);
    3887             :         }
    3888        1324 :         bs->io_units_per_cluster = bs->cluster_sz / bs->io_unit_size;
    3889        1324 :         if (spdk_u32_is_pow2(bs->io_units_per_cluster)) {
    3890        1324 :                 bs->io_units_per_cluster_shift = spdk_u32log2(bs->io_units_per_cluster);
    3891             :         }
    3892        1324 : }
    3893             : 
    3894             : static int
    3895         973 : bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs,
    3896             :          struct spdk_bs_load_ctx **_ctx)
    3897             : {
    3898             :         struct spdk_blob_store  *bs;
    3899             :         struct spdk_bs_load_ctx *ctx;
    3900             :         uint64_t dev_size;
    3901             :         uint32_t md_page_size;
    3902             :         int rc;
    3903             : 
    3904         973 :         dev_size = dev->blocklen * dev->blockcnt;
    3905         973 :         if (dev_size < opts->cluster_sz) {
    3906             :                 /* Device size cannot be smaller than cluster size of blobstore */
    3907           0 :                 SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n",
    3908             :                              dev_size, opts->cluster_sz);
    3909           0 :                 return -ENOSPC;
    3910             :         }
    3911             : 
    3912         973 :         md_page_size = spdk_max(spdk_max(dev->phys_blocklen, SPDK_BS_PAGE_SIZE),
    3913             :                                 opts->md_page_size);
    3914         973 :         if (opts->cluster_sz < md_page_size) {
    3915             :                 /* Cluster size cannot be smaller than page size */
    3916           0 :                 SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n",
    3917             :                             opts->cluster_sz, md_page_size);
    3918           0 :                 return -EINVAL;
    3919             :         }
    3920         973 :         bs = calloc(1, sizeof(struct spdk_blob_store));
    3921         973 :         if (!bs) {
    3922           0 :                 return -ENOMEM;
    3923             :         }
    3924             : 
    3925         973 :         ctx = calloc(1, sizeof(struct spdk_bs_load_ctx));
    3926         973 :         if (!ctx) {
    3927           0 :                 free(bs);
    3928           0 :                 return -ENOMEM;
    3929             :         }
    3930             : 
    3931         973 :         ctx->bs = bs;
    3932         973 :         ctx->iter_cb_fn = opts->iter_cb_fn;
    3933         973 :         ctx->iter_cb_arg = opts->iter_cb_arg;
    3934         973 :         ctx->force_recover = opts->force_recover;
    3935             : 
    3936         973 :         ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
    3937             :                                   SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    3938         973 :         if (!ctx->super) {
    3939           0 :                 free(ctx);
    3940           0 :                 free(bs);
    3941           0 :                 return -ENOMEM;
    3942             :         }
    3943             : 
    3944         973 :         RB_INIT(&bs->open_blobs);
    3945         973 :         TAILQ_INIT(&bs->snapshots);
    3946         973 :         bs->dev = dev;
    3947         973 :         bs->md_page_size = md_page_size;
    3948         973 :         bs->md_thread = spdk_get_thread();
    3949         973 :         assert(bs->md_thread != NULL);
    3950             : 
    3951             :         /*
    3952             :          * Do not use bs_lba_to_cluster() here since blockcnt may not be an
    3953             :          *  even multiple of the cluster size.
    3954             :          */
    3955         973 :         bs->cluster_sz = opts->cluster_sz;
    3956         973 :         bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen);
    3957         973 :         ctx->used_clusters = spdk_bit_array_create(bs->total_clusters);
    3958         973 :         if (!ctx->used_clusters) {
    3959           0 :                 spdk_free(ctx->super);
    3960           0 :                 free(ctx);
    3961           0 :                 free(bs);
    3962           0 :                 return -ENOMEM;
    3963             :         }
    3964             : 
    3965         973 :         bs->num_free_clusters = bs->total_clusters;
    3966         973 :         bs->io_unit_size = dev->blocklen;
    3967         973 :         bs_init_per_cluster_fields(bs);
    3968             : 
    3969         973 :         bs->max_channel_ops = opts->max_channel_ops;
    3970         973 :         bs->super_blob = SPDK_BLOBID_INVALID;
    3971         973 :         memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype));
    3972         973 :         bs->esnap_bs_dev_create = opts->esnap_bs_dev_create;
    3973         973 :         bs->esnap_ctx = opts->esnap_ctx;
    3974             : 
    3975             :         /* The metadata is assumed to be at least 1 page */
    3976         973 :         bs->used_md_pages = spdk_bit_array_create(1);
    3977         973 :         bs->used_blobids = spdk_bit_array_create(0);
    3978         973 :         bs->open_blobids = spdk_bit_array_create(0);
    3979             : 
    3980         973 :         spdk_spin_init(&bs->used_lock);
    3981             : 
    3982         973 :         spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy,
    3983             :                                 sizeof(struct spdk_bs_channel), "blobstore");
    3984         973 :         rc = bs_register_md_thread(bs);
    3985         973 :         if (rc == -1) {
    3986           0 :                 spdk_io_device_unregister(bs, NULL);
    3987           0 :                 spdk_spin_destroy(&bs->used_lock);
    3988           0 :                 spdk_bit_array_free(&bs->open_blobids);
    3989           0 :                 spdk_bit_array_free(&bs->used_blobids);
    3990           0 :                 spdk_bit_array_free(&bs->used_md_pages);
    3991           0 :                 spdk_bit_array_free(&ctx->used_clusters);
    3992           0 :                 spdk_free(ctx->super);
    3993           0 :                 free(ctx);
    3994           0 :                 free(bs);
    3995             :                 /* FIXME: this is a lie but don't know how to get a proper error code here */
    3996           0 :                 return -ENOMEM;
    3997             :         }
    3998             : 
    3999         973 :         *_ctx = ctx;
    4000         973 :         *_bs = bs;
    4001         973 :         return 0;
    4002             : }
    4003             : 
    /*
     * Abort a load/init style operation with a non-zero error code.
     *
     * Frees the super block buffer, completes ctx->seq with bserrno, tears the
     * blobstore down through bs_free(), then frees the context's used_clusters
     * bit array and the context itself. ctx must not be used afterwards.
     *
     * NOTE(review): ctx->mask and ctx->page are not released here; presumably
     * callers free them before calling - confirm.
     */
    4004             : static void
    4005          30 : bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno)
    4006             : {
    4007          30 :         assert(bserrno != 0);
    4008             : 
    4009          30 :         spdk_free(ctx->super);
    4010          30 :         bs_sequence_finish(ctx->seq, bserrno);
    4011          30 :         bs_free(ctx->bs);
    4012          30 :         spdk_bit_array_free(&ctx->used_clusters);
    4013          30 :         free(ctx);
    4014          30 : }
    4015             : 
    /*
     * Refresh and write the super block.
     *
     * Copies super_blob and bstype from bs into super, recomputes super->crc
     * with blob_md_page_calc_crc(), and queues a write of the super block at the
     * LBA of page 0 on seq. cb_fn/cb_arg are handed to bs_sequence_write_dev().
     */
    4016             : static void
    4017        1026 : bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
    4018             :                struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
    4019             : {
    4020             :         /* Update the values in the super block */
    4021        1026 :         super->super_blob = bs->super_blob;
    4022        1026 :         memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype));
    4023        1026 :         super->crc = blob_md_page_calc_crc(super);
    4024        1026 :         bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0),
    4025        1026 :                               bs_byte_to_lba(bs, sizeof(*super)),
    4026             :                               cb_fn, cb_arg);
    4027        1026 : }
    4028             : 
    /*
     * Build the used-clusters mask and queue its write on seq.
     *
     * arg is the struct spdk_bs_load_ctx. A zeroed DMA buffer covering
     * used_cluster_mask_len metadata pages is allocated into ctx->mask, filled
     * in, and written at used_cluster_mask_start; cb_fn is passed to
     * bs_sequence_write_dev() with arg as its argument. The buffer is left in
     * ctx->mask and is not freed here.
     *
     * On allocation failure bs_load_ctx_fail(ctx, -ENOMEM) is called and cb_fn
     * is not invoked.
     */
    4029             : static void
    4030         948 : bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
    4031             : {
    4032         948 :         struct spdk_bs_load_ctx *ctx = arg;
    4033             :         uint64_t        mask_size, lba, lba_count;
    4034             : 
    4035             :         /* Write out the used clusters mask */
    4036         948 :         mask_size = ctx->super->used_cluster_mask_len * ctx->bs->md_page_size;
    4037         948 :         ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
    4038             :                                  SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    4039         948 :         if (!ctx->mask) {
    4040           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4041           0 :                 return;
    4042             :         }
    4043             : 
    4044         948 :         ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS;
    4045         948 :         ctx->mask->length = ctx->bs->total_clusters;
    4046             :         /* We could get here through the normal unload path, or through dirty
    4047             :          * shutdown recovery.  For the normal unload path, we use the mask from
    4048             :          * the bit pool.  For dirty shutdown recovery, we don't have a bit pool yet -
    4049             :          * only the bit array from the load ctx.
    4050             :          */
    4051         948 :         if (ctx->bs->used_clusters) {
    4052         814 :                 assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters));
    4053         814 :                 spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask);
    4054             :         } else {
    4055         134 :                 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters));
    4056         134 :                 spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask);
    4057             :         }
                               /* Both the start and the length are given in pages and converted to LBAs. */
    4058         948 :         lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
    4059         948 :         lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
    4060         948 :         bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
    4061             : }
    4062             : 
    /*
     * Build the used-metadata-pages mask and queue its write on seq.
     *
     * arg is the struct spdk_bs_load_ctx. A zeroed DMA buffer covering
     * used_page_mask_len metadata pages is allocated into ctx->mask, filled from
     * bs->used_md_pages, and written at used_page_mask_start; cb_fn is passed to
     * bs_sequence_write_dev() with arg as its argument. The buffer is left in
     * ctx->mask and is not freed here.
     *
     * On allocation failure bs_load_ctx_fail(ctx, -ENOMEM) is called and cb_fn
     * is not invoked.
     */
    4063             : static void
    4064         948 : bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
    4065             : {
    4066         948 :         struct spdk_bs_load_ctx *ctx = arg;
    4067             :         uint64_t        mask_size, lba, lba_count;
    4068             : 
    4069         948 :         mask_size = ctx->super->used_page_mask_len * ctx->bs->md_page_size;
    4070         948 :         ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
    4071             :                                  SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    4072         948 :         if (!ctx->mask) {
    4073           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4074           0 :                 return;
    4075             :         }
    4076             : 
    4077         948 :         ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES;
    4078         948 :         ctx->mask->length = ctx->super->md_len;
    4079         948 :         assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages));
    4080             : 
    4081         948 :         spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask);
    4082         948 :         lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
    4083         948 :         lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
    4084         948 :         bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
    4085             : }
    4086             : 
    /*
     * Build the used-blobids mask and queue its write on seq.
     *
     * arg is the struct spdk_bs_load_ctx. When the super block reports a
     * used_blobid_mask_len of 0 nothing is written and cb_fn(seq, arg, 0) is
     * called directly. Otherwise a zeroed DMA buffer covering
     * used_blobid_mask_len metadata pages is allocated into ctx->mask, filled
     * from bs->used_blobids, and written at used_blobid_mask_start. The buffer
     * is left in ctx->mask and is not freed here.
     *
     * On allocation failure bs_load_ctx_fail(ctx, -ENOMEM) is called and cb_fn
     * is not invoked.
     */
    4087             : static void
    4088         948 : bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
    4089             : {
    4090         948 :         struct spdk_bs_load_ctx *ctx = arg;
    4091             :         uint64_t        mask_size, lba, lba_count;
    4092             : 
    4093         948 :         if (ctx->super->used_blobid_mask_len == 0) {
    4094             :                 /*
    4095             :                  * This is a pre-v3 on-disk format where the blobid mask does not get
    4096             :                  *  written to disk.
    4097             :                  */
    4098          30 :                 cb_fn(seq, arg, 0);
    4099          30 :                 return;
    4100             :         }
    4101             : 
    4102         918 :         mask_size = ctx->super->used_blobid_mask_len * ctx->bs->md_page_size;
    4103         918 :         ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_NUMA_ID_ANY,
    4104             :                                  SPDK_MALLOC_DMA);
    4105         918 :         if (!ctx->mask) {
    4106           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4107           0 :                 return;
    4108             :         }
    4109             : 
    4110         918 :         ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS;
    4111         918 :         ctx->mask->length = ctx->super->md_len;
    4112         918 :         assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids));
    4113             : 
    4114         918 :         spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask);
    4115         918 :         lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
    4116         918 :         lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
    4117         918 :         bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
    4118             : }
    4119             : 
    /*
     * Mark a blob as thin provisioned: after blob_verify_md_op(), set
     * SPDK_BLOB_THIN_PROV in blob->invalid_flags and move the blob to
     * SPDK_BLOB_STATE_DIRTY. Nothing is written to disk here.
     */
    4120             : static void
    4121         882 : blob_set_thin_provision(struct spdk_blob *blob)
    4122             : {
    4123         882 :         blob_verify_md_op(blob);
    4124         882 :         blob->invalid_flags |= SPDK_BLOB_THIN_PROV;
    4125         882 :         blob->state = SPDK_BLOB_STATE_DIRTY;
    4126         882 : }
    4127             : 
    /*
     * Record the clear method of a blob: after blob_verify_md_op(), store it in
     * blob->clear_method, OR it (shifted by SPDK_BLOB_CLEAR_METHOD_SHIFT) into
     * blob->md_ro_flags and move the blob to SPDK_BLOB_STATE_DIRTY.
     *
     * NOTE(review): the flag bits are OR-ed in without clearing an earlier
     * value; presumably this is only called on a blob whose clear-method bits
     * are still zero - confirm.
     */
    4128             : static void
    4129        2617 : blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method)
    4130             : {
    4131        2617 :         blob_verify_md_op(blob);
    4132        2617 :         blob->clear_method = clear_method;
    4133        2617 :         blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT);
    4134        2617 :         blob->state = SPDK_BLOB_STATE_DIRTY;
    4135        2617 : }
    4136             : 
    4137             : static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno);
    4138             : 
    /*
     * Completion callback for the spdk_bs_delete_blob() issued by
     * bs_delete_corrupted_close_cb(); cb_arg is the struct spdk_bs_load_ctx.
     *
     * Resumes the load-time blob iteration at the first blob id set in
     * bs->used_blobids after the page of the deleted blob (ctx->blobid). When
     * there is none, bs_load_iter() is called with -ENOENT.
     *
     * bserrno is not inspected: iteration continues whether or not the delete
     * succeeded.
     */
    4139             : static void
    4140          30 : bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno)
    4141             : {
    4142          30 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4143             :         spdk_blob_id id;
    4144             :         int64_t page_num;
    4145             : 
    4146             :         /* Iterate to next blob (we can't use spdk_bs_iter_next function as our
    4147             :          * last blob has been removed) */
    4148          30 :         page_num = bs_blobid_to_page(ctx->blobid);
    4149          30 :         page_num++;
    4150          30 :         page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num);
                               /* A result at or beyond the capacity means no further blob id is set. */
    4151          30 :         if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) {
    4152          30 :                 bs_load_iter(ctx, NULL, -ENOENT);
    4153          30 :                 return;
    4154             :         }
    4155             : 
    4156           0 :         id = bs_page_to_blobid(page_num);
    4157             : 
    4158           0 :         spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx);
    4159             : }
    4160             : 
    /*
     * Close callback for the corrupted blob closed by bs_delete_corrupted_blob();
     * cb_arg is the struct spdk_bs_load_ctx.
     *
     * On a close error the failure is logged and iteration moves on with
     * spdk_bs_iter_next(). Otherwise the blob saved in ctx->blobid is deleted,
     * with bs_delete_corrupted_blob_cpl() as the completion callback.
     */
    4161             : static void
    4162          30 : bs_delete_corrupted_close_cb(void *cb_arg, int bserrno)
    4163             : {
    4164          30 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4165             : 
    4166          30 :         if (bserrno != 0) {
    4167           0 :                 SPDK_ERRLOG("Failed to close corrupted blob\n");
    4168           0 :                 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
    4169           0 :                 return;
    4170             :         }
    4171             : 
    4172          30 :         spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx);
    4173             : }
    4174             : 
    /*
     * Close callback used by bs_examine_clone() when the corrupted snapshot in
     * ctx->blob has to be removed; cb_arg is the struct spdk_bs_load_ctx.
     *
     * On a close error the failure is logged and iteration moves on. Otherwise
     * the snapshot's in-memory cluster map and extent page list are zeroed, it
     * is made writable and thin provisioned, its id is saved in ctx->blobid and
     * it is closed; bs_delete_corrupted_close_cb() then deletes it.
     */
    4175             : static void
    4176          30 : bs_delete_corrupted_blob(void *cb_arg, int bserrno)
    4177             : {
    4178          30 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4179             :         uint64_t i;
    4180             : 
    4181          30 :         if (bserrno != 0) {
    4182           0 :                 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
    4183           0 :                 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
    4184           0 :                 return;
    4185             :         }
    4186             : 
    4187             :         /* Snapshot and clone have the same copy of cluster map and extent pages
    4188             :          * at this point. Let's clear both for snapshot now,
    4189             :          * so that it won't be cleared for clone later when we remove snapshot.
    4190             :          * Also set thin provision to pass data corruption check */
    4191         330 :         for (i = 0; i < ctx->blob->active.num_clusters; i++) {
    4192         300 :                 ctx->blob->active.clusters[i] = 0;
    4193             :         }
    4194          48 :         for (i = 0; i < ctx->blob->active.num_extent_pages; i++) {
    4195          18 :                 ctx->blob->active.extent_pages[i] = 0;
    4196             :         }
    4197             : 
    4198          30 :         ctx->blob->active.num_allocated_clusters = 0;
    4199             : 
    4200          30 :         ctx->blob->md_ro = false;
    4201             : 
    4202          30 :         blob_set_thin_provision(ctx->blob);
    4203             : 
                               /* Save the id now: the delete is issued after the blob handle is closed. */
    4204          30 :         ctx->blobid = ctx->blob->id;
    4205             : 
    4206          30 :         spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx);
    4207             : }
    4208             : 
    /*
     * Close callback used by bs_examine_clone() when the snapshot in ctx->blob
     * is to be kept; cb_arg is the struct spdk_bs_load_ctx.
     *
     * On a close error the failure is logged and iteration moves on. Otherwise
     * the snapshot is made writable, its SNAPSHOT_PENDING_REMOVAL and
     * SNAPSHOT_IN_PROGRESS xattrs are removed, it is set read-only again,
     * reported to the user's iteration callback (if any), added to the snapshot
     * list, and iteration continues with the next blob.
     *
     * The return values of blob_remove_xattr(), spdk_blob_set_read_only() and
     * bs_blob_list_add() are not checked.
     */
    4209             : static void
    4210          15 : bs_update_corrupted_blob(void *cb_arg, int bserrno)
    4211             : {
    4212          15 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4213             : 
    4214          15 :         if (bserrno != 0) {
    4215           0 :                 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
    4216           0 :                 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
    4217           0 :                 return;
    4218             :         }
    4219             : 
    4220          15 :         ctx->blob->md_ro = false;
    4221          15 :         blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true);
    4222          15 :         blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true);
    4223          15 :         spdk_blob_set_read_only(ctx->blob);
    4224             : 
    4225          15 :         if (ctx->iter_cb_fn) {
    4226           0 :                 ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0);
    4227             :         }
    4228          15 :         bs_blob_list_add(ctx->blob);
    4229             : 
    4230          15 :         spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
    4231             : }
    4232             : 
    /*
     * Open callback for the clone of a snapshot that was found in a corrupted
     * (interrupted) state; cb_arg is the struct spdk_bs_load_ctx and ctx->blob is
     * the snapshot under examination.
     *
     * On an open error the failure is logged and iteration moves on. Otherwise
     * the clone is closed again and the close callback decides the snapshot's
     * fate: bs_update_corrupted_blob() keeps it when the clone still names it as
     * parent, bs_delete_corrupted_blob() removes it when it does not.
     */
    4233             : static void
    4234          45 : bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno)
    4235             : {
    4236          45 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4237             : 
    4238          45 :         if (bserrno != 0) {
    4239           0 :                 SPDK_ERRLOG("Failed to open clone of a corrupted blob\n");
    4240           0 :                 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
    4241           0 :                 return;
    4242             :         }
    4243             : 
    4244          45 :         if (blob->parent_id == ctx->blob->id) {
    4245             :                 /* Power failure occurred before updating clone (snapshot delete case)
    4246             :                  * or after updating clone (creating snapshot case) - keep snapshot */
    4247          15 :                 spdk_blob_close(blob, bs_update_corrupted_blob, ctx);
    4248             :         } else {
    4249             :                 /* Power failure occurred after updating clone (snapshot delete case)
    4250             :                  * or before updating clone (creating snapshot case) - remove snapshot */
    4251          30 :                 spdk_blob_close(blob, bs_delete_corrupted_blob, ctx);
    4252             :         }
    4253             : }
    4254             : 
    4255             : static void
    4256         903 : bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno)
    4257             : {
                               /* Blob iteration callback used while loading a blobstore; it is passed to
                                * spdk_bs_iter_first()/spdk_bs_iter_next() with the load context as arg.
                                *  - bserrno == 0: look up the blob's SNAPSHOT_PENDING_REMOVAL and
                                *    SNAPSHOT_IN_PROGRESS xattrs. If neither lookup succeeds, report the
                                *    blob through ctx->iter_cb_fn (when set), add it to the blob list and
                                *    move on to the next blob. Otherwise open the blob whose id is stored
                                *    in the xattr value; bs_examine_clone continues from there.
                                *  - bserrno == -ENOENT: iteration is over, finish the sequence with 0.
                                *  - anything else: log it and finish the sequence with that error.
                                * The terminal path frees ctx->super, ctx->mask and ctx itself. */
    4258         903 :         struct spdk_bs_load_ctx *ctx = arg;
    4259             :         const void *value;
    4260             :         size_t len;
    4261         903 :         int rc = 0;
    4262             : 
    4263         903 :         if (bserrno == 0) {
    4264             :                 /* Examine blob if it is corrupted after power failure. Fix
    4265             :                  * the ones that can be fixed and remove any other corrupted
    4266             :                  * ones. If it is not corrupted just process it */
    4267         552 :                 rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true);
    4268         552 :                 if (rc != 0) {
    4269         527 :                         rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true);
    4270         527 :                         if (rc != 0) {
    4271             :                                 /* Not corrupted - process it and continue with iterating through blobs */
    4272         507 :                                 if (ctx->iter_cb_fn) {
    4273          42 :                                         ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0);
    4274             :                                 }
    4275         507 :                                 bs_blob_list_add(blob);
    4276         507 :                                 spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx);
    4277         552 :                                 return;
    4278             :                         }
    4279             : 
    4280             :                 }
    4281             : 
                                       /* NOTE(review): the xattr length is validated only by this assert, so
                                        * with NDEBUG the cast below reads *value without any size check. */
    4282          45 :                 assert(len == sizeof(spdk_blob_id));
    4283             : 
                                       /* Remember the corrupted blob; bs_examine_clone reads it back. */
    4284          45 :                 ctx->blob = blob;
    4285             : 
    4286             :                 /* Open clone to check if we are able to fix this blob or should we remove it */
    4287          45 :                 spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx);
    4288          45 :                 return;
    4289         351 :         } else if (bserrno == -ENOENT) {
    4290         351 :                 bserrno = 0;
    4291             :         } else {
    4292             :                 /*
    4293             :                  * This case needs to be looked at further.  Same problem
    4294             :                  *  exists with applications that rely on explicit blob
    4295             :                  *  iteration.  We should just skip the blob that failed
    4296             :                  *  to load and continue on to the next one.
    4297             :                  */
    4298           0 :                 SPDK_ERRLOG("Error in iterating blobs\n");
    4299             :         }
    4300             : 
                               /* Terminal path: tear down the load context and complete the sequence. */
    4301         351 :         ctx->iter_cb_fn = NULL;
    4302             : 
    4303         351 :         spdk_free(ctx->super);
    4304         351 :         spdk_free(ctx->mask);
    4305         351 :         bs_sequence_finish(ctx->seq, bserrno);
    4306         351 :         free(ctx);
    4307             : }
    4308             : 
    4309             : static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg);
    4310             : 
    4311             : static void
    4312         351 : bs_load_complete(struct spdk_bs_load_ctx *ctx)
    4313             : {
                               /* Common final step of the load paths in this section (called from
                                * bs_load_used_blobids_cpl and bs_load_write_used_clusters_cpl): build the
                                * blobstore's used_clusters bit pool from the ctx->used_clusters bit
                                * array, then either continue the dump flow (ctx->dumping) or start
                                * iterating blobs with bs_load_iter.
                                * NOTE(review): the result of spdk_bit_pool_create_from_array() is not
                                * checked here - confirm whether it can fail. */
    4314         351 :         ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters);
    4315         351 :         if (ctx->dumping) {
    4316           0 :                 bs_dump_read_md_page(ctx->seq, ctx);
    4317           0 :                 return;
    4318             :         }
    4319         351 :         spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx);
    4320             : }
    4321             : 
    4322             : static void
    4323         217 : bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4324             : {
                               /* Completion of the used-blobids mask read issued by
                                * bs_load_used_clusters_cpl. Sanity-checks the mask header (asserts only),
                                * resizes bs->used_blobids to mask->length bits, loads the mask into it
                                * and calls bs_load_complete(). ctx->mask is not released on the success
                                * path here; the terminal path of bs_load_iter frees it.
                                * NOTE(review): unlike bs_load_used_clusters_cpl and
                                * bs_load_used_pages_cpl, bserrno is never examined, so the buffer of a
                                * failed read would still be interpreted as a mask. */
    4325         217 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4326             :         int rc;
    4327             : 
    4328             :         /* The type must be correct */
    4329         217 :         assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS);
    4330             : 
    4331             :         /* The length of the mask (in bits) must not be greater than
    4332             :          * the length of the buffer (converted to bits) */
    4333         217 :         assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * ctx->super->md_page_size * 8));
    4334             : 
    4335             :         /* The length of the mask must be exactly equal to the size
    4336             :          * (in pages) of the metadata region */
    4337         217 :         assert(ctx->mask->length == ctx->super->md_len);
    4338             : 
    4339         217 :         rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length);
    4340         217 :         if (rc < 0) {
    4341           0 :                 spdk_free(ctx->mask);
    4342           0 :                 bs_load_ctx_fail(ctx, rc);
    4343           0 :                 return;
    4344             :         }
    4345             : 
    4346         217 :         spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask);
    4347         217 :         bs_load_complete(ctx);
    4348             : }
    4349             : 
    4350             : static void
    4351         217 : bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4352             : {
                               /* Completion of the used-clusters mask read issued by
                                * bs_load_used_pages_cpl. Loads the mask into ctx->used_clusters, derives
                                * bs->num_free_clusters from the number of clear bits, frees the mask
                                * buffer and issues the read of the used-blobids mask.
                                * NOTE(review): on the bserrno != 0 path ctx->mask is not freed before
                                * bs_load_ctx_fail(), whereas the resize-failure path frees it first -
                                * confirm whether bs_load_ctx_fail() releases the mask. */
    4353         217 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4354             :         uint64_t                lba, lba_count, mask_size;
    4355             :         int                     rc;
    4356             : 
    4357         217 :         if (bserrno != 0) {
    4358           0 :                 bs_load_ctx_fail(ctx, bserrno);
    4359           0 :                 return;
    4360             :         }
    4361             : 
    4362             :         /* The type must be correct */
    4363         217 :         assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
                               /* NOTE(review): the bound below uses sizeof(struct spdk_blob_md_page) while
                                * the buffer was allocated with ctx->super->md_page_size per page (see
                                * bs_load_used_pages_cpl) - confirm the two are always equal. */
    4364             :         /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
    4365         217 :         assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
    4366             :                                              struct spdk_blob_md_page) * 8));
    4367             :         /*
    4368             :          * The length of the mask must be equal to or larger than the total number of clusters. It may be
    4369             :          * larger than the total number of clusters due to a failure spdk_bs_grow.
    4370             :          */
    4371         217 :         assert(ctx->mask->length >= ctx->bs->total_clusters);
    4372         217 :         if (ctx->mask->length > ctx->bs->total_clusters) {
                                       /* NOTE(review): the message has a typo ("used_custers") and, unlike
                                        * the other log calls in this section, no trailing newline. */
    4373           5 :                 SPDK_WARNLOG("Shrink the used_custers mask length to total_clusters");
    4374           5 :                 ctx->mask->length = ctx->bs->total_clusters;
    4375             :         }
    4376             : 
    4377         217 :         rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length);
    4378         217 :         if (rc < 0) {
    4379           0 :                 spdk_free(ctx->mask);
    4380           0 :                 bs_load_ctx_fail(ctx, rc);
    4381           0 :                 return;
    4382             :         }
    4383             : 
    4384         217 :         spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask);
    4385         217 :         ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters);
    4386         217 :         assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters);
    4387             : 
                               /* Done with the clusters mask; ctx->mask is reused for the next read. */
    4388         217 :         spdk_free(ctx->mask);
    4389             : 
    4390             :         /* Read the used blobids mask */
    4391         217 :         mask_size = ctx->super->used_blobid_mask_len * ctx->super->md_page_size;
    4392         217 :         ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_NUMA_ID_ANY,
    4393             :                                  SPDK_MALLOC_DMA);
    4394         217 :         if (!ctx->mask) {
    4395           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4396           0 :                 return;
    4397             :         }
    4398         217 :         lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
    4399         217 :         lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
    4400         217 :         bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
    4401             :                              bs_load_used_blobids_cpl, ctx);
    4402             : }
    4403             : 
    4404             : static void
    4405         217 : bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4406             : {
                               /* Completion of the used-pages mask read issued by
                                * bs_load_read_used_pages. Loads the mask into bs->used_md_pages, frees
                                * the buffer, then allocates a new one and issues the read of the
                                * used-clusters mask.
                                * NOTE(review): the bserrno != 0 path does not free ctx->mask before
                                * bs_load_ctx_fail() while the resize-failure path does - confirm which
                                * of the two is correct. */
    4407         217 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4408             :         uint64_t                lba, lba_count, mask_size;
    4409             :         int                     rc;
    4410             : 
    4411         217 :         if (bserrno != 0) {
    4412           0 :                 bs_load_ctx_fail(ctx, bserrno);
    4413           0 :                 return;
    4414             :         }
    4415             : 
    4416             :         /* The type must be correct */
    4417         217 :         assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES);
    4418             :         /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
    4419         217 :         assert(ctx->mask->length <= (ctx->super->used_page_mask_len * ctx->super->md_page_size *
    4420             :                                      8));
    4421             :         /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */
    4422         217 :         if (ctx->mask->length != ctx->super->md_len) {
    4423           0 :                 SPDK_ERRLOG("mismatched md_len in used_pages mask: "
    4424             :                             "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n",
    4425             :                             ctx->mask->length, ctx->super->md_len);
                                       /* Debug builds stop here; with NDEBUG the assert is compiled out and
                                        * loading continues with the mismatched length. */
    4426           0 :                 assert(false);
    4427             :         }
    4428             : 
    4429         217 :         rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length);
    4430         217 :         if (rc < 0) {
    4431           0 :                 spdk_free(ctx->mask);
    4432           0 :                 bs_load_ctx_fail(ctx, rc);
    4433           0 :                 return;
    4434             :         }
    4435             : 
    4436         217 :         spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask);
    4437         217 :         spdk_free(ctx->mask);
    4438             : 
    4439             :         /* Read the used clusters mask */
    4440         217 :         mask_size = ctx->super->used_cluster_mask_len * ctx->super->md_page_size;
    4441         217 :         ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_NUMA_ID_ANY,
    4442             :                                  SPDK_MALLOC_DMA);
    4443         217 :         if (!ctx->mask) {
    4444           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4445           0 :                 return;
    4446             :         }
    4447         217 :         lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
    4448         217 :         lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
    4449         217 :         bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
    4450             :                              bs_load_used_clusters_cpl, ctx);
    4451             : }
    4452             : 
    4453             : static void
    4454         217 : bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx)
    4455             : {
                               /* First step of the mask-loading chain: allocate a buffer of
                                * used_page_mask_len pages (md_page_size bytes each) with spdk_zmalloc()
                                * and read the used-pages mask from the device into it.
                                * bs_load_used_pages_cpl takes over once the read completes; an
                                * allocation failure ends the load with -ENOMEM. */
    4456             :         uint64_t lba, lba_count, mask_size;
    4457             : 
    4458             :         /* Read the used pages mask */
    4459         217 :         mask_size = ctx->super->used_page_mask_len * ctx->super->md_page_size;
    4460         217 :         ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
    4461             :                                  SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    4462         217 :         if (!ctx->mask) {
    4463           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4464           0 :                 return;
    4465             :         }
    4466             : 
                               /* Both the start and the length of the mask are stored in pages in the
                                * super block and are converted with bs_page_to_lba() for the read. */
    4467         217 :         lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
    4468         217 :         lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
    4469         217 :         bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
    4470             :                              bs_load_used_pages_cpl, ctx);
    4471             : }
    4472             : 
    4473             : static int
    4474         323 : bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page)
    4475             : {
                               /* Parse the descriptors of one metadata page during replay and fold what
                                * they reference into the load context:
                                *  - EXTENT_RLE / EXTENT_PAGE: mark every non-zero cluster in
                                *    ctx->used_clusters and decrement bs->num_free_clusters for it.
                                *  - EXTENT_TABLE: append each non-zero extent page index to
                                *    ctx->extent_page_num (grown with realloc) and bump
                                *    ctx->num_extent_pages.
                                *  - XATTR, XATTR_INTERNAL, FLAGS and non-terminating PADDING are skipped.
                                * Returns 0 on success, -EINVAL for an unknown or malformed descriptor,
                                * -ENOSPC when bs->num_free_clusters is already 0, and -ENOMEM when the
                                * extent page list cannot be grown. */
    4476         323 :         struct spdk_blob_store *bs = ctx->bs;
    4477             :         struct spdk_blob_md_descriptor *desc;
    4478         323 :         size_t  cur_desc = 0;
    4479             : 
    4480         323 :         desc = (struct spdk_blob_md_descriptor *)page->descriptors;
    4481         933 :         while (cur_desc < sizeof(page->descriptors)) {
    4482         933 :                 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
    4483         298 :                         if (desc->length == 0) {
    4484             :                                 /* If padding and length are 0, this terminates the page */
    4485         298 :                                 break;
    4486             :                         }
    4487         635 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
    4488             :                         struct spdk_blob_md_descriptor_extent_rle       *desc_extent_rle;
    4489             :                         unsigned int                            i, j;
    4490          68 :                         unsigned int                            cluster_count = 0;
    4491             :                         uint32_t                                cluster_idx;
    4492             : 
    4493          68 :                         desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
    4494             : 
                                               /* NOTE(review): desc->length comes straight from the page and is
                                                * not checked against the remaining descriptor space before the
                                                * extents are walked; the spdk_bit_array_set() result is ignored. */
    4495         136 :                         for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
    4496         828 :                                 for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
    4497         760 :                                         cluster_idx = desc_extent_rle->extents[i].cluster_idx;
    4498             :                                         /*
    4499             :                                          * cluster_idx = 0 means an unallocated cluster - don't mark that
    4500             :                                          * in the used cluster map.
    4501             :                                          */
    4502         760 :                                         if (cluster_idx != 0) {
    4503         540 :                                                 SPDK_NOTICELOG("Recover: cluster %" PRIu32 "\n", cluster_idx + j);
    4504         540 :                                                 spdk_bit_array_set(ctx->used_clusters, cluster_idx + j);
    4505         540 :                                                 if (bs->num_free_clusters == 0) {
    4506           0 :                                                         return -ENOSPC;
    4507             :                                                 }
    4508         540 :                                                 bs->num_free_clusters--;
    4509             :                                         }
    4510         760 :                                         cluster_count++;
    4511             :                                 }
    4512             :                         }
                                               /* A descriptor that covers no clusters at all is rejected. */
    4513          68 :                         if (cluster_count == 0) {
    4514           0 :                                 return -EINVAL;
    4515             :                         }
    4516         567 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
    4517             :                         struct spdk_blob_md_descriptor_extent_page      *desc_extent;
    4518             :                         uint32_t                                        i;
    4519          78 :                         uint32_t                                        cluster_count = 0;
    4520             :                         uint32_t                                        cluster_idx;
    4521             :                         size_t                                          cluster_idx_length;
    4522             : 
    4523          78 :                         desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
    4524          78 :                         cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);
    4525             : 
                                               /* The payload must hold at least one whole cluster_idx entry after
                                                * the start_cluster_idx field. */
    4526          78 :                         if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
    4527          78 :                             (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
    4528           0 :                                 return -EINVAL;
    4529             :                         }
    4530             : 
    4531         978 :                         for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
    4532         900 :                                 cluster_idx = desc_extent->cluster_idx[i];
    4533             :                                 /*
    4534             :                                  * cluster_idx = 0 means an unallocated cluster - don't mark that
    4535             :                                  * in the used cluster map.
    4536             :                                  */
    4537         900 :                                 if (cluster_idx != 0) {
                                                               /* NOTE(review): "idx < start && idx >= start + count" can
                                                                * only hold if the addition wraps, so this rejection is
                                                                * effectively dead code - confirm the intended range check. */
    4538         900 :                                         if (cluster_idx < desc_extent->start_cluster_idx &&
    4539           0 :                                             cluster_idx >= desc_extent->start_cluster_idx + cluster_count) {
    4540           0 :                                                 return -EINVAL;
    4541             :                                         }
    4542         900 :                                         spdk_bit_array_set(ctx->used_clusters, cluster_idx);
    4543         900 :                                         if (bs->num_free_clusters == 0) {
    4544           0 :                                                 return -ENOSPC;
    4545             :                                         }
    4546         900 :                                         bs->num_free_clusters--;
    4547             :                                 }
    4548         900 :                                 cluster_count++;
    4549             :                         }
    4550             : 
    4551          78 :                         if (cluster_count == 0) {
    4552           0 :                                 return -EINVAL;
    4553             :                         }
    4554         489 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
    4555             :                         /* Skip this item */
    4556         394 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
    4557             :                         /* Skip this item */
    4558         318 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
    4559             :                         /* Skip this item */
    4560         123 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
    4561             :                         struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
    4562         123 :                         uint32_t num_extent_pages = ctx->num_extent_pages;
    4563             :                         uint32_t i;
    4564             :                         size_t extent_pages_length;
    4565             :                         void *tmp;
    4566             : 
    4567         123 :                         desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
                                               /* NOTE(review): only length == 0 is rejected below; a length smaller
                                                * than sizeof(num_clusters) makes this subtraction wrap. The
                                                * EXTENT_PAGE branch guards its equivalent subtraction with "<=". */
    4568         123 :                         extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);
    4569             : 
    4570         123 :                         if (desc_extent_table->length == 0 ||
    4571         123 :                             (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
    4572           0 :                                 return -EINVAL;
    4573             :                         }
    4574             : 
                                               /* First pass: validate the entries and count allocated extent pages
                                                * on top of those already collected, so the list is grown once. */
    4575         240 :                         for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
    4576         117 :                                 if (desc_extent_table->extent_page[i].page_idx != 0) {
    4577          78 :                                         if (desc_extent_table->extent_page[i].num_pages != 1) {
    4578           0 :                                                 return -EINVAL;
    4579             :                                         }
    4580          78 :                                         num_extent_pages += 1;
    4581             :                                 }
    4582             :                         }
    4583             : 
    4584         123 :                         if (num_extent_pages > 0) {
                                                       /* realloc result goes through tmp so the existing list is
                                                        * not lost when the allocation fails. */
    4585          78 :                                 tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t));
    4586          78 :                                 if (tmp == NULL) {
    4587           0 :                                         return -ENOMEM;
    4588             :                                 }
    4589          78 :                                 ctx->extent_page_num = tmp;
    4590             : 
    4591             :                                 /* Extent table entries contain md page numbers for extent pages.
    4592             :                                  * Zeroes represent unallocated extent pages, those are run-length-encoded.
    4593             :                                  */
    4594         156 :                                 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
    4595          78 :                                         if (desc_extent_table->extent_page[i].page_idx != 0) {
    4596          78 :                                                 ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx;
    4597          78 :                                                 ctx->num_extent_pages += 1;
    4598             :                                         }
    4599             :                                 }
    4600             :                         }
    4601             :                 } else {
    4602             :                         /* Error */
    4603           0 :                         return -EINVAL;
    4604             :                 }
    4605             :                 /* Advance to the next descriptor */
    4606         635 :                 cur_desc += sizeof(*desc) + desc->length;
                                       /* Stop when there is no room left for another descriptor header. */
    4607         635 :                 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
    4608          25 :                         break;
    4609             :                 }
    4610         610 :                 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
    4611             :         }
    4612         323 :         return 0;
    4613             : }
    4614             : 
    4615             : static bool
    4616        1884 : bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page)
    4617             : {
                               /* Report whether `page` looks like a well-formed extent page: its CRC
                                * matches, its sequence number is 0, its first descriptor is of type
                                * EXTENT_PAGE and fits in the descriptor area, and - when there is room
                                * for another descriptor header after it - that following descriptor
                                * has length 0. Returns true only if every check passes. */
    4618             :         uint32_t crc;
    4619        1884 :         struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors;
    4620             :         size_t desc_len;
    4621             : 
    4622        1884 :         crc = blob_md_page_calc_crc(page);
    4623        1884 :         if (crc != page->crc) {
    4624           0 :                 return false;
    4625             :         }
    4626             : 
    4627             :         /* Extent page should always be of sequence num 0. */
    4628        1884 :         if (page->sequence_num != 0) {
    4629          55 :                 return false;
    4630             :         }
    4631             : 
    4632             :         /* Descriptor type must be EXTENT_PAGE. */
    4633        1829 :         if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
    4634         195 :                 return false;
    4635             :         }
    4636             : 
    4637             :         /* Descriptor length cannot exceed the page. */
    4638        1634 :         desc_len = sizeof(*desc) + desc->length;
    4639        1634 :         if (desc_len > sizeof(page->descriptors)) {
    4640           0 :                 return false;
    4641             :         }
    4642             : 
    4643             :         /* It has to be the only descriptor in the page. */
    4644        1634 :         if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) {
    4645        1634 :                 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len);
                                       /* Only the length of the trailing descriptor is inspected here,
                                        * not its type. */
    4646        1634 :                 if (desc->length != 0) {
    4647           0 :                         return false;
    4648             :                 }
    4649             :         }
    4650             : 
    4651        1634 :         return true;
    4652             : }
    4653             : 
    4654             : static bool
    4655        8531 : bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx)
    4656             : {
                               /* Report whether ctx->page is a usable blob metadata page: the CRC must
                                * match and, for the first page of a chain (sequence_num == 0), the blob
                                * id derived from ctx->cur_page must equal page->id. */
    4657             :         uint32_t crc;
    4658        8531 :         struct spdk_blob_md_page *page = ctx->page;
    4659             : 
    4660        8531 :         crc = blob_md_page_calc_crc(page);
    4661        8531 :         if (crc != page->crc) {
    4662        8254 :                 return false;
    4663             :         }
    4664             : 
    4665             :         /* First page of a sequence should match the blobid. */
    4666         277 :         if (page->sequence_num == 0 &&
    4667         222 :             bs_page_to_blobid(ctx->cur_page) != page->id) {
    4668          27 :                 return false;
    4669             :         }
                               /* Debug-only cross-check (compiled out with NDEBUG): a page accepted here
                                * must not also pass as an extent page. */
    4670         250 :         assert(bs_load_cur_extent_page_valid(page) == false);
    4671             : 
    4672         250 :         return true;
    4673             : }
    4674             : 
    4675             : static void bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx);
    4676             : 
    4677             : static void
    4678         134 : bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4679             : {
                               /* Last completion of the mask write-back chain started by
                                * bs_load_write_used_md: fail the load on error, otherwise hand over to
                                * bs_load_complete(). Unlike the two earlier completions in the chain,
                                * ctx->mask is not freed here. */
    4680         134 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4681             : 
    4682         134 :         if (bserrno != 0) {
    4683           0 :                 bs_load_ctx_fail(ctx, bserrno);
    4684           0 :                 return;
    4685             :         }
    4686             : 
    4687         134 :         bs_load_complete(ctx);
    4688             : }
    4689             : 
    4690             : static void
    4691         134 : bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4692             : {
                               /* Completion of the used-blobids mask write: release ctx->mask
                                * (regardless of the outcome) and, on success, write the used-clusters
                                * mask next. */
    4693         134 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4694             : 
    4695         134 :         spdk_free(ctx->mask);
    4696         134 :         ctx->mask = NULL;
    4697             : 
    4698         134 :         if (bserrno != 0) {
    4699           0 :                 bs_load_ctx_fail(ctx, bserrno);
    4700           0 :                 return;
    4701             :         }
    4702             : 
    4703         134 :         bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl);
    4704             : }
    4705             : 
    4706             : static void
    4707         134 : bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4708             : {
                               /* Completion of the write started by bs_load_write_used_md: release
                                * ctx->mask (regardless of the outcome) and, on success, write the
                                * used-blobids mask next. */
    4709         134 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4710             : 
    4711         134 :         spdk_free(ctx->mask);
    4712         134 :         ctx->mask = NULL;
    4713             : 
    4714         134 :         if (bserrno != 0) {
    4715           0 :                 bs_load_ctx_fail(ctx, bserrno);
    4716           0 :                 return;
    4717             :         }
    4718             : 
    4719         134 :         bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl);
    4720             : }
    4721             : 
    4722             : static void
    4723         134 : bs_load_write_used_md(struct spdk_bs_load_ctx *ctx)
    4724             : {
                               /* Start the mask write-back chain with bs_write_used_md(); its completion
                                * callback bs_load_write_used_pages_cpl chains the used-blobids write,
                                * which in turn chains the used-clusters write. */
    4725         134 :         bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl);
    4726         134 : }
    4727             : 
    4728             : static void
    4729        8481 : bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx)
    4730             : {
                               /* Called when replay of one metadata page chain is done. Advances
                                * ctx->page_index to the next page not marked in bs->used_md_pages; if
                                * that index is still below super->md_len, replay continues with that
                                * page. Otherwise replay is finished: the clusters that hold the
                                * metadata are claimed, ctx->page is freed and the rebuilt masks are
                                * written out via bs_load_write_used_md(). */
    4731             :         uint64_t num_md_clusters;
    4732             :         uint64_t i;
    4733             : 
    4734        8481 :         ctx->in_page_chain = false;
    4735             : 
                               /* NOTE(review): termination relies on spdk_bit_array_get() returning false
                                * once page_index runs past the end of the array - TODO confirm. */
    4736             :         do {
    4737        8576 :                 ctx->page_index++;
    4738        8576 :         } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true);
    4739             : 
    4740        8481 :         if (ctx->page_index < ctx->super->md_len) {
    4741        8347 :                 ctx->cur_page = ctx->page_index;
    4742        8347 :                 bs_load_replay_cur_md_page(ctx);
    4743             :         } else {
    4744             :                 /* Claim all of the clusters used by the metadata */
    4745         134 :                 num_md_clusters = spdk_divide_round_up(
    4746         268 :                                           ctx->super->md_start + ctx->super->md_len, ctx->bs->pages_per_cluster);
    4747         629 :                 for (i = 0; i < num_md_clusters; i++) {
    4748         495 :                         spdk_bit_array_set(ctx->used_clusters, i);
    4749             :                 }
                                       /* NOTE(review): unconditional subtraction - assumes none of these
                                        * clusters was already accounted for during replay and that at least
                                        * num_md_clusters free clusters remain; otherwise the counter wraps. */
    4750         134 :                 ctx->bs->num_free_clusters -= num_md_clusters;
    4751         134 :                 spdk_free(ctx->page);
    4752         134 :                 bs_load_write_used_md(ctx);
    4753             :         }
    4754        8481 : }
    4755             : 
    4756             : static void
    4757          78 : bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4758             : {
    4759          78 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4760             :         uint32_t page_num;
    4761             :         uint64_t i;
    4762             : 
    4763          78 :         if (bserrno != 0) {
    4764           0 :                 spdk_free(ctx->extent_pages);
    4765           0 :                 bs_load_ctx_fail(ctx, bserrno);
    4766           0 :                 return;
    4767             :         }
    4768             : 
    4769         156 :         for (i = 0; i < ctx->num_extent_pages; i++) {
    4770             :                 /* Extent pages are only read when present within in chain md.
    4771             :                  * Integrity of md is not right if that page was not a valid extent page. */
    4772          78 :                 if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) {
    4773           0 :                         spdk_free(ctx->extent_pages);
    4774           0 :                         bs_load_ctx_fail(ctx, -EILSEQ);
    4775           0 :                         return;
    4776             :                 }
    4777             : 
    4778          78 :                 page_num = ctx->extent_page_num[i];
    4779          78 :                 spdk_bit_array_set(ctx->bs->used_md_pages, page_num);
    4780          78 :                 if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) {
    4781           0 :                         spdk_free(ctx->extent_pages);
    4782           0 :                         bs_load_ctx_fail(ctx, -EILSEQ);
    4783           0 :                         return;
    4784             :                 }
    4785             :         }
    4786             : 
    4787          78 :         spdk_free(ctx->extent_pages);
    4788          78 :         free(ctx->extent_page_num);
    4789          78 :         ctx->extent_page_num = NULL;
    4790          78 :         ctx->num_extent_pages = 0;
    4791             : 
    4792          78 :         bs_load_replay_md_chain_cpl(ctx);
    4793             : }
    4794             : 
    4795             : static void
    4796          78 : bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx)
    4797             : {
    4798             :         spdk_bs_batch_t *batch;
    4799             :         uint32_t page;
    4800             :         uint64_t lba;
    4801             :         uint64_t i;
    4802             : 
    4803          78 :         ctx->extent_pages = spdk_zmalloc(ctx->super->md_page_size * ctx->num_extent_pages, 0,
    4804             :                                          NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    4805          78 :         if (!ctx->extent_pages) {
    4806           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4807           0 :                 return;
    4808             :         }
    4809             : 
    4810          78 :         batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx);
    4811             : 
    4812         156 :         for (i = 0; i < ctx->num_extent_pages; i++) {
    4813          78 :                 page = ctx->extent_page_num[i];
    4814          78 :                 assert(page < ctx->super->md_len);
    4815          78 :                 lba = bs_md_page_to_lba(ctx->bs, page);
    4816          78 :                 bs_batch_read_dev(batch, &ctx->extent_pages[i], lba,
    4817          78 :                                   bs_byte_to_lba(ctx->bs, ctx->super->md_page_size));
    4818             :         }
    4819             : 
    4820          78 :         bs_batch_close(batch);
    4821             : }
    4822             : 
    4823             : static void
    4824        8531 : bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4825             : {
    4826        8531 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4827             :         uint32_t page_num;
    4828             :         struct spdk_blob_md_page *page;
    4829             : 
    4830        8531 :         if (bserrno != 0) {
    4831           0 :                 bs_load_ctx_fail(ctx, bserrno);
    4832           0 :                 return;
    4833             :         }
    4834             : 
    4835        8531 :         page_num = ctx->cur_page;
    4836        8531 :         page = ctx->page;
    4837        8531 :         if (bs_load_cur_md_page_valid(ctx) == true) {
    4838         250 :                 if (page->sequence_num == 0 || ctx->in_page_chain == true) {
    4839         245 :                         spdk_spin_lock(&ctx->bs->used_lock);
    4840         245 :                         bs_claim_md_page(ctx->bs, page_num);
    4841         245 :                         spdk_spin_unlock(&ctx->bs->used_lock);
    4842         245 :                         if (page->sequence_num == 0) {
    4843         195 :                                 SPDK_NOTICELOG("Recover: blob 0x%" PRIx32 "\n", page_num);
    4844         195 :                                 spdk_bit_array_set(ctx->bs->used_blobids, page_num);
    4845             :                         }
    4846         245 :                         if (bs_load_replay_md_parse_page(ctx, page)) {
    4847           0 :                                 bs_load_ctx_fail(ctx, -EILSEQ);
    4848           0 :                                 return;
    4849             :                         }
    4850         245 :                         if (page->next != SPDK_INVALID_MD_PAGE) {
    4851          50 :                                 ctx->in_page_chain = true;
    4852          50 :                                 ctx->cur_page = page->next;
    4853          50 :                                 bs_load_replay_cur_md_page(ctx);
    4854          50 :                                 return;
    4855             :                         }
    4856         195 :                         if (ctx->num_extent_pages != 0) {
    4857          78 :                                 bs_load_replay_extent_pages(ctx);
    4858          78 :                                 return;
    4859             :                         }
    4860             :                 }
    4861             :         }
    4862        8403 :         bs_load_replay_md_chain_cpl(ctx);
    4863             : }
    4864             : 
    4865             : static void
    4866        8531 : bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx)
    4867             : {
    4868             :         uint64_t lba;
    4869             : 
    4870        8531 :         assert(ctx->cur_page < ctx->super->md_len);
    4871        8531 :         lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page);
    4872        8531 :         bs_sequence_read_dev(ctx->seq, ctx->page, lba,
    4873        8531 :                              bs_byte_to_lba(ctx->bs, ctx->super->md_page_size),
    4874             :                              bs_load_replay_md_cpl, ctx);
    4875        8531 : }
    4876             : 
    4877             : static void
    4878         134 : bs_load_replay_md(struct spdk_bs_load_ctx *ctx)
    4879             : {
    4880         134 :         ctx->page_index = 0;
    4881         134 :         ctx->cur_page = 0;
    4882         134 :         ctx->page = spdk_zmalloc(ctx->bs->md_page_size, 0,
    4883             :                                  NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    4884         134 :         if (!ctx->page) {
    4885           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4886           0 :                 return;
    4887             :         }
    4888         134 :         bs_load_replay_cur_md_page(ctx);
    4889             : }
    4890             : 
    4891             : static void
    4892         134 : bs_recover(struct spdk_bs_load_ctx *ctx)
    4893             : {
    4894             :         int             rc;
    4895             : 
    4896         134 :         SPDK_NOTICELOG("Performing recovery on blobstore\n");
    4897         134 :         rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len);
    4898         134 :         if (rc < 0) {
    4899           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4900           0 :                 return;
    4901             :         }
    4902             : 
    4903         134 :         rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len);
    4904         134 :         if (rc < 0) {
    4905           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4906           0 :                 return;
    4907             :         }
    4908             : 
    4909         134 :         rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
    4910         134 :         if (rc < 0) {
    4911           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4912           0 :                 return;
    4913             :         }
    4914             : 
    4915         134 :         rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len);
    4916         134 :         if (rc < 0) {
    4917           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    4918           0 :                 return;
    4919             :         }
    4920             : 
    4921         134 :         ctx->bs->num_free_clusters = ctx->bs->total_clusters;
    4922         134 :         bs_load_replay_md(ctx);
    4923             : }
    4924             : 
    4925             : static int
    4926         346 : bs_parse_super(struct spdk_bs_load_ctx *ctx)
    4927             : {
    4928             :         int rc;
    4929             : 
    4930         346 :         if (ctx->super->size == 0) {
    4931          10 :                 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
    4932             :         }
    4933             : 
    4934         346 :         if (ctx->super->io_unit_size == 0) {
    4935          10 :                 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
    4936             :         }
    4937         346 :         if (ctx->super->md_page_size == 0) {
    4938           5 :                 ctx->super->md_page_size = SPDK_BS_PAGE_SIZE;
    4939             :         }
    4940             : 
    4941         346 :         ctx->bs->clean = 1;
    4942         346 :         ctx->bs->cluster_sz = ctx->super->cluster_size;
    4943         346 :         ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
    4944         346 :         ctx->bs->io_unit_size = ctx->super->io_unit_size;
    4945         346 :         ctx->bs->md_page_size = ctx->super->md_page_size;
    4946         346 :         bs_init_per_cluster_fields(ctx->bs);
    4947         346 :         rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
    4948         346 :         if (rc < 0) {
    4949           0 :                 return -ENOMEM;
    4950             :         }
    4951         346 :         ctx->bs->md_start = ctx->super->md_start;
    4952         346 :         ctx->bs->md_len = ctx->super->md_len;
    4953         346 :         rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len);
    4954         346 :         if (rc < 0) {
    4955           0 :                 return -ENOMEM;
    4956             :         }
    4957             : 
    4958        1038 :         ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
    4959         692 :                                                ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
    4960         346 :         ctx->bs->super_blob = ctx->super->super_blob;
    4961         346 :         memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));
    4962             : 
    4963         346 :         return 0;
    4964             : }
    4965             : 
    4966             : static void
    4967         376 : bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    4968             : {
    4969         376 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    4970             :         int rc;
    4971             : 
    4972         376 :         rc = bs_super_validate(ctx->super, ctx->bs);
    4973         376 :         if (rc != 0) {
    4974          30 :                 bs_load_ctx_fail(ctx, rc);
    4975          30 :                 return;
    4976             :         }
    4977             : 
    4978         346 :         rc = bs_parse_super(ctx);
    4979         346 :         if (rc < 0) {
    4980           0 :                 bs_load_ctx_fail(ctx, rc);
    4981           0 :                 return;
    4982             :         }
    4983             : 
    4984         346 :         if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0 || ctx->force_recover) {
    4985         134 :                 bs_recover(ctx);
    4986             :         } else {
    4987         212 :                 bs_load_read_used_pages(ctx);
    4988             :         }
    4989             : }
    4990             : 
    4991             : static inline int
    4992         380 : bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst)
    4993             : {
    4994             : 
    4995         380 :         if (!src->opts_size) {
    4996           0 :                 SPDK_ERRLOG("opts_size should not be zero value\n");
    4997           0 :                 return -1;
    4998             :         }
    4999             : 
    5000             : #define FIELD_OK(field) \
    5001             :         offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size
    5002             : 
    5003             : #define SET_FIELD(field) \
    5004             :         if (FIELD_OK(field)) { \
    5005             :                 dst->field = src->field; \
    5006             :         } \
    5007             : 
    5008         380 :         SET_FIELD(cluster_sz);
    5009         380 :         SET_FIELD(num_md_pages);
    5010         380 :         SET_FIELD(max_md_ops);
    5011         380 :         SET_FIELD(max_channel_ops);
    5012         380 :         SET_FIELD(clear_method);
    5013             : 
    5014         380 :         if (FIELD_OK(bstype)) {
    5015         380 :                 memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype));
    5016             :         }
    5017         380 :         SET_FIELD(md_page_size);
    5018         380 :         SET_FIELD(iter_cb_fn);
    5019         380 :         SET_FIELD(iter_cb_arg);
    5020         380 :         SET_FIELD(force_recover);
    5021         380 :         SET_FIELD(esnap_bs_dev_create);
    5022         380 :         SET_FIELD(esnap_ctx);
    5023             : 
    5024         380 :         dst->opts_size = src->opts_size;
    5025             : 
    5026             :         /* You should not remove this statement, but need to update the assert statement
    5027             :          * if you add a new field, and also add a corresponding SET_FIELD statement */
    5028             :         SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 88, "Incorrect size");
    5029             : 
    5030             : #undef FIELD_OK
    5031             : #undef SET_FIELD
    5032             : 
    5033         380 :         return 0;
    5034             : }
    5035             : 
    5036             : void
    5037         391 : spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
    5038             :              spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
    5039             : {
    5040             :         struct spdk_blob_store  *bs;
    5041             :         struct spdk_bs_cpl      cpl;
    5042             :         struct spdk_bs_load_ctx *ctx;
    5043         391 :         struct spdk_bs_opts     opts = {};
    5044             :         int err;
    5045             : 
    5046         391 :         SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev);
    5047             : 
    5048         391 :         if ((dev->phys_blocklen % dev->blocklen) != 0) {
    5049           5 :                 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen);
    5050           5 :                 dev->destroy(dev);
    5051           5 :                 cb_fn(cb_arg, NULL, -EINVAL);
    5052          15 :                 return;
    5053             :         }
    5054             : 
    5055         386 :         spdk_bs_opts_init(&opts, sizeof(opts));
    5056         386 :         if (o) {
    5057         152 :                 if (bs_opts_copy(o, &opts)) {
    5058           0 :                         return;
    5059             :                 }
    5060             :         }
    5061             : 
    5062         386 :         if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
    5063          10 :                 dev->destroy(dev);
    5064          10 :                 cb_fn(cb_arg, NULL, -EINVAL);
    5065          10 :                 return;
    5066             :         }
    5067             : 
    5068         376 :         err = bs_alloc(dev, &opts, &bs, &ctx);
    5069         376 :         if (err) {
    5070           0 :                 dev->destroy(dev);
    5071           0 :                 cb_fn(cb_arg, NULL, err);
    5072           0 :                 return;
    5073             :         }
    5074             : 
    5075         376 :         cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
    5076         376 :         cpl.u.bs_handle.cb_fn = cb_fn;
    5077         376 :         cpl.u.bs_handle.cb_arg = cb_arg;
    5078         376 :         cpl.u.bs_handle.bs = bs;
    5079             : 
    5080         376 :         ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    5081         376 :         if (!ctx->seq) {
    5082           0 :                 spdk_free(ctx->super);
    5083           0 :                 free(ctx);
    5084           0 :                 bs_free(bs);
    5085           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    5086           0 :                 return;
    5087             :         }
    5088             : 
    5089             :         /* Read the super block */
    5090         376 :         bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
    5091         376 :                              bs_byte_to_lba(bs, sizeof(*ctx->super)),
    5092             :                              bs_load_super_cpl, ctx);
    5093             : }
    5094             : 
    5095             : /* END spdk_bs_load */
    5096             : 
    5097             : /* START spdk_bs_dump */
    5098             : 
    5099             : static void
    5100           0 : bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno)
    5101             : {
    5102           0 :         spdk_free(ctx->super);
    5103             : 
    5104             :         /*
    5105             :          * We need to defer calling bs_call_cpl() until after
    5106             :          * dev destruction, so tuck these away for later use.
    5107             :          */
    5108           0 :         ctx->bs->unload_err = bserrno;
    5109           0 :         memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
    5110           0 :         seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
    5111             : 
    5112           0 :         bs_sequence_finish(seq, 0);
    5113           0 :         bs_free(ctx->bs);
    5114           0 :         free(ctx);
    5115           0 : }
    5116             : 
    5117             : static void
    5118           0 : bs_dump_print_xattr(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc)
    5119             : {
    5120             :         struct spdk_blob_md_descriptor_xattr *desc_xattr;
    5121             :         uint32_t i;
    5122             :         const char *type;
    5123             : 
    5124           0 :         desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc;
    5125             : 
    5126           0 :         if (desc_xattr->length !=
    5127             :             sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) +
    5128           0 :             desc_xattr->name_length + desc_xattr->value_length) {
    5129             :         }
    5130             : 
    5131           0 :         memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length);
    5132           0 :         ctx->xattr_name[desc_xattr->name_length] = '\0';
    5133           0 :         if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
    5134           0 :                 type = "XATTR";
    5135           0 :         } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
    5136           0 :                 type = "XATTR_INTERNAL";
    5137             :         } else {
    5138           0 :                 assert(false);
    5139             :                 type = "XATTR_?";
    5140             :         }
    5141           0 :         fprintf(ctx->fp, "%s: name = \"%s\"\n", type, ctx->xattr_name);
    5142           0 :         fprintf(ctx->fp, "       value = \"");
    5143           0 :         ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name,
    5144           0 :                             (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
    5145           0 :                             desc_xattr->value_length);
    5146           0 :         fprintf(ctx->fp, "\"\n");
    5147           0 :         for (i = 0; i < desc_xattr->value_length; i++) {
    5148           0 :                 if (i % 16 == 0) {
    5149           0 :                         fprintf(ctx->fp, "               ");
    5150             :                 }
    5151           0 :                 fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i));
    5152           0 :                 if ((i + 1) % 16 == 0) {
    5153           0 :                         fprintf(ctx->fp, "\n");
    5154             :                 }
    5155             :         }
    5156           0 :         if (i % 16 != 0) {
    5157           0 :                 fprintf(ctx->fp, "\n");
    5158             :         }
    5159           0 : }
    5160             : 
    5161             : struct type_flag_desc {
    5162             :         uint64_t mask;
    5163             :         uint64_t val;
    5164             :         const char *name;
    5165             : };
    5166             : 
    5167             : static void
    5168           0 : bs_dump_print_type_bits(struct spdk_bs_load_ctx *ctx, uint64_t flags,
    5169             :                         struct type_flag_desc *desc, size_t numflags)
    5170             : {
    5171           0 :         uint64_t covered = 0;
    5172             :         size_t i;
    5173             : 
    5174           0 :         for (i = 0; i < numflags; i++) {
    5175           0 :                 if ((desc[i].mask & flags) != desc[i].val) {
    5176           0 :                         continue;
    5177             :                 }
    5178           0 :                 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " %s", desc[i].val, desc[i].name);
    5179           0 :                 if (desc[i].mask != desc[i].val) {
    5180           0 :                         fprintf(ctx->fp, " (mask 0x%" PRIx64 " value 0x%" PRIx64 ")",
    5181           0 :                                 desc[i].mask, desc[i].val);
    5182             :                 }
    5183           0 :                 fprintf(ctx->fp, "\n");
    5184           0 :                 covered |= desc[i].mask;
    5185             :         }
    5186           0 :         if ((flags & ~covered) != 0) {
    5187           0 :                 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " Unknown\n", flags & ~covered);
    5188             :         }
    5189           0 : }
    5190             : 
    5191             : static void
    5192           0 : bs_dump_print_type_flags(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc)
    5193             : {
    5194             :         struct spdk_blob_md_descriptor_flags *type_desc;
    5195             : #define ADD_FLAG(f) { f, f, #f }
    5196             : #define ADD_MASK_VAL(m, v) { m, v, #v }
    5197             :         static struct type_flag_desc invalid[] = {
    5198             :                 ADD_FLAG(SPDK_BLOB_THIN_PROV),
    5199             :                 ADD_FLAG(SPDK_BLOB_INTERNAL_XATTR),
    5200             :                 ADD_FLAG(SPDK_BLOB_EXTENT_TABLE),
    5201             :         };
    5202             :         static struct type_flag_desc data_ro[] = {
    5203             :                 ADD_FLAG(SPDK_BLOB_READ_ONLY),
    5204             :         };
    5205             :         static struct type_flag_desc md_ro[] = {
    5206             :                 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_DEFAULT),
    5207             :                 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_NONE),
    5208             :                 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_UNMAP),
    5209             :                 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_WRITE_ZEROES),
    5210             :         };
    5211             : #undef ADD_FLAG
    5212             : #undef ADD_MASK_VAL
    5213             : 
    5214           0 :         type_desc = (struct spdk_blob_md_descriptor_flags *)desc;
    5215           0 :         fprintf(ctx->fp, "Flags:\n");
    5216           0 :         fprintf(ctx->fp, "\tinvalid: 0x%016" PRIx64 "\n", type_desc->invalid_flags);
    5217           0 :         bs_dump_print_type_bits(ctx, type_desc->invalid_flags, invalid,
    5218             :                                 SPDK_COUNTOF(invalid));
    5219           0 :         fprintf(ctx->fp, "\tdata_ro: 0x%016" PRIx64 "\n", type_desc->data_ro_flags);
    5220           0 :         bs_dump_print_type_bits(ctx, type_desc->data_ro_flags, data_ro,
    5221             :                                 SPDK_COUNTOF(data_ro));
    5222           0 :         fprintf(ctx->fp, "\t  md_ro: 0x%016" PRIx64 "\n", type_desc->md_ro_flags);
    5223           0 :         bs_dump_print_type_bits(ctx, type_desc->md_ro_flags, md_ro,
    5224             :                                 SPDK_COUNTOF(md_ro));
    5225           0 : }
    5226             : 
    5227             : static void
    5228           0 : bs_dump_print_extent_table(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc)
    5229             : {
    5230             :         struct spdk_blob_md_descriptor_extent_table *et_desc;
    5231             :         uint64_t num_extent_pages;
    5232             :         uint32_t et_idx;
    5233             : 
    5234           0 :         et_desc = (struct spdk_blob_md_descriptor_extent_table *)desc;
    5235           0 :         num_extent_pages = (et_desc->length - sizeof(et_desc->num_clusters)) /
    5236             :                            sizeof(et_desc->extent_page[0]);
    5237             : 
    5238           0 :         fprintf(ctx->fp, "Extent table:\n");
    5239           0 :         for (et_idx = 0; et_idx < num_extent_pages; et_idx++) {
    5240           0 :                 if (et_desc->extent_page[et_idx].page_idx == 0) {
    5241             :                         /* Zeroes represent unallocated extent pages. */
    5242           0 :                         continue;
    5243             :                 }
    5244           0 :                 fprintf(ctx->fp, "\tExtent page: %5" PRIu32 " length %3" PRIu32
    5245             :                         " at LBA %" PRIu64 "\n", et_desc->extent_page[et_idx].page_idx,
    5246             :                         et_desc->extent_page[et_idx].num_pages,
    5247             :                         bs_md_page_to_lba(ctx->bs, et_desc->extent_page[et_idx].page_idx));
    5248             :         }
    5249           0 : }
    5250             : 
    5251             : static void
    5252           0 : bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx)
    5253             : {
    5254           0 :         uint32_t page_idx = ctx->cur_page;
    5255           0 :         struct spdk_blob_md_page *page = ctx->page;
    5256             :         struct spdk_blob_md_descriptor *desc;
    5257           0 :         size_t cur_desc = 0;
    5258             :         uint32_t crc;
    5259             : 
    5260           0 :         fprintf(ctx->fp, "=========\n");
    5261           0 :         fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx);
    5262           0 :         fprintf(ctx->fp, "Start LBA: %" PRIu64 "\n", bs_md_page_to_lba(ctx->bs, page_idx));
    5263           0 :         fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id);
    5264           0 :         fprintf(ctx->fp, "Sequence: %" PRIu32 "\n", page->sequence_num);
    5265           0 :         if (page->next == SPDK_INVALID_MD_PAGE) {
    5266           0 :                 fprintf(ctx->fp, "Next: None\n");
    5267             :         } else {
    5268           0 :                 fprintf(ctx->fp, "Next: %" PRIu32 "\n", page->next);
    5269             :         }
    5270           0 :         fprintf(ctx->fp, "In used bit array%s:", ctx->super->clean ? "" : " (not clean: dubious)");
    5271           0 :         if (spdk_bit_array_get(ctx->bs->used_md_pages, page_idx)) {
    5272           0 :                 fprintf(ctx->fp, " md");
    5273             :         }
    5274           0 :         if (spdk_bit_array_get(ctx->bs->used_blobids, page_idx)) {
    5275           0 :                 fprintf(ctx->fp, " blob");
    5276             :         }
    5277           0 :         fprintf(ctx->fp, "\n");
    5278             : 
    5279           0 :         crc = blob_md_page_calc_crc(page);
    5280           0 :         fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? "OK" : "Mismatch");
    5281             : 
    5282           0 :         desc = (struct spdk_blob_md_descriptor *)page->descriptors;
    5283           0 :         while (cur_desc < sizeof(page->descriptors)) {
    5284           0 :                 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
    5285           0 :                         if (desc->length == 0) {
    5286             :                                 /* If padding and length are 0, this terminates the page */
    5287           0 :                                 break;
    5288             :                         }
    5289           0 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
    5290             :                         struct spdk_blob_md_descriptor_extent_rle       *desc_extent_rle;
    5291             :                         unsigned int                            i;
    5292             : 
    5293           0 :                         desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
    5294             : 
    5295           0 :                         for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
    5296           0 :                                 if (desc_extent_rle->extents[i].cluster_idx != 0) {
    5297           0 :                                         fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
    5298             :                                                 desc_extent_rle->extents[i].cluster_idx);
    5299             :                                 } else {
    5300           0 :                                         fprintf(ctx->fp, "Unallocated Extent - ");
    5301             :                                 }
    5302           0 :                                 fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length);
    5303           0 :                                 fprintf(ctx->fp, "\n");
    5304             :                         }
    5305           0 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
    5306             :                         struct spdk_blob_md_descriptor_extent_page      *desc_extent;
    5307             :                         unsigned int                                    i;
    5308             : 
    5309           0 :                         desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
    5310             : 
    5311           0 :                         for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) {
    5312           0 :                                 if (desc_extent->cluster_idx[i] != 0) {
    5313           0 :                                         fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
    5314             :                                                 desc_extent->cluster_idx[i]);
    5315             :                                 } else {
    5316           0 :                                         fprintf(ctx->fp, "Unallocated Extent");
    5317             :                                 }
    5318           0 :                                 fprintf(ctx->fp, "\n");
    5319             :                         }
    5320           0 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
    5321           0 :                         bs_dump_print_xattr(ctx, desc);
    5322           0 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
    5323           0 :                         bs_dump_print_xattr(ctx, desc);
    5324           0 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
    5325           0 :                         bs_dump_print_type_flags(ctx, desc);
    5326           0 :                 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
    5327           0 :                         bs_dump_print_extent_table(ctx, desc);
    5328             :                 } else {
    5329             :                         /* Error */
    5330           0 :                         fprintf(ctx->fp, "Unknown descriptor type %" PRIu8 "\n", desc->type);
    5331             :                 }
    5332             :                 /* Advance to the next descriptor */
    5333           0 :                 cur_desc += sizeof(*desc) + desc->length;
    5334           0 :                 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
    5335           0 :                         break;
    5336             :                 }
    5337           0 :                 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
    5338             :         }
    5339           0 : }
    5340             : 
    5341             : static void
    5342           0 : bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5343             : {
    5344           0 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5345             : 
    5346           0 :         if (bserrno != 0) {
    5347           0 :                 bs_dump_finish(seq, ctx, bserrno);
    5348           0 :                 return;
    5349             :         }
    5350             : 
    5351           0 :         if (ctx->page->id != 0) {
    5352           0 :                 bs_dump_print_md_page(ctx);
    5353             :         }
    5354             : 
    5355           0 :         ctx->cur_page++;
    5356             : 
    5357           0 :         if (ctx->cur_page < ctx->super->md_len) {
    5358           0 :                 bs_dump_read_md_page(seq, ctx);
    5359             :         } else {
    5360           0 :                 spdk_free(ctx->page);
    5361           0 :                 bs_dump_finish(seq, ctx, 0);
    5362             :         }
    5363             : }
    5364             : 
    5365             : static void
    5366           0 : bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg)
    5367             : {
    5368           0 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5369             :         uint64_t lba;
    5370             : 
    5371           0 :         assert(ctx->cur_page < ctx->super->md_len);
    5372           0 :         lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page);
    5373           0 :         bs_sequence_read_dev(seq, ctx->page, lba,
    5374           0 :                              bs_byte_to_lba(ctx->bs, ctx->super->md_page_size),
    5375             :                              bs_dump_read_md_page_cpl, ctx);
    5376           0 : }
    5377             : 
    5378             : static void
    5379           0 : bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5380             : {
    5381           0 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5382             :         int rc;
    5383             : 
    5384           0 :         fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature);
    5385           0 :         if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
    5386             :                    sizeof(ctx->super->signature)) != 0) {
    5387           0 :                 fprintf(ctx->fp, "(Mismatch)\n");
    5388           0 :                 bs_dump_finish(seq, ctx, bserrno);
    5389           0 :                 return;
    5390             :         } else {
    5391           0 :                 fprintf(ctx->fp, "(OK)\n");
    5392             :         }
    5393           0 :         fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version);
    5394           0 :         fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc,
    5395           0 :                 (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch");
    5396           0 :         fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype);
    5397           0 :         fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size);
    5398           0 :         fprintf(ctx->fp, "Super Blob ID: ");
    5399           0 :         if (ctx->super->super_blob == SPDK_BLOBID_INVALID) {
    5400           0 :                 fprintf(ctx->fp, "(None)\n");
    5401             :         } else {
    5402           0 :                 fprintf(ctx->fp, "0x%" PRIx64 "\n", ctx->super->super_blob);
    5403             :         }
    5404           0 :         fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean);
    5405           0 :         fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start);
    5406           0 :         fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len);
    5407           0 :         fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start);
    5408           0 :         fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len);
    5409           0 :         fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start);
    5410           0 :         fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len);
    5411           0 :         fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start);
    5412           0 :         fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len);
    5413             : 
    5414           0 :         ctx->cur_page = 0;
    5415           0 :         ctx->page = spdk_zmalloc(ctx->super->md_page_size, 0,
    5416             :                                  NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    5417           0 :         if (!ctx->page) {
    5418           0 :                 bs_dump_finish(seq, ctx, -ENOMEM);
    5419           0 :                 return;
    5420             :         }
    5421             : 
    5422           0 :         rc = bs_parse_super(ctx);
    5423           0 :         if (rc < 0) {
    5424           0 :                 bs_load_ctx_fail(ctx, rc);
    5425           0 :                 return;
    5426             :         }
    5427             : 
    5428           0 :         bs_load_read_used_pages(ctx);
    5429             : }
    5430             : 
    5431             : void
    5432           0 : spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn,
    5433             :              spdk_bs_op_complete cb_fn, void *cb_arg)
    5434             : {
    5435             :         struct spdk_blob_store  *bs;
    5436             :         struct spdk_bs_cpl      cpl;
    5437             :         struct spdk_bs_load_ctx *ctx;
    5438           0 :         struct spdk_bs_opts     opts = {};
    5439             :         int err;
    5440             : 
    5441           0 :         SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev);
    5442             : 
    5443           0 :         spdk_bs_opts_init(&opts, sizeof(opts));
    5444             : 
    5445           0 :         err = bs_alloc(dev, &opts, &bs, &ctx);
    5446           0 :         if (err) {
    5447           0 :                 dev->destroy(dev);
    5448           0 :                 cb_fn(cb_arg, err);
    5449           0 :                 return;
    5450             :         }
    5451             : 
    5452           0 :         ctx->dumping = true;
    5453           0 :         ctx->fp = fp;
    5454           0 :         ctx->print_xattr_fn = print_xattr_fn;
    5455             : 
    5456           0 :         cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
    5457           0 :         cpl.u.bs_basic.cb_fn = cb_fn;
    5458           0 :         cpl.u.bs_basic.cb_arg = cb_arg;
    5459             : 
    5460           0 :         ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    5461           0 :         if (!ctx->seq) {
    5462           0 :                 spdk_free(ctx->super);
    5463           0 :                 free(ctx);
    5464           0 :                 bs_free(bs);
    5465           0 :                 cb_fn(cb_arg, -ENOMEM);
    5466           0 :                 return;
    5467             :         }
    5468             : 
    5469             :         /* Read the super block */
    5470           0 :         bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
    5471           0 :                              bs_byte_to_lba(bs, sizeof(*ctx->super)),
    5472             :                              bs_dump_super_cpl, ctx);
    5473             : }
    5474             : 
    5475             : /* END spdk_bs_dump */
    5476             : 
    5477             : /* START spdk_bs_init */
    5478             : 
    5479             : static void
    5480         587 : bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5481             : {
    5482         587 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5483             : 
    5484         587 :         ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters);
    5485         587 :         spdk_free(ctx->super);
    5486         587 :         free(ctx);
    5487             : 
    5488         587 :         bs_sequence_finish(seq, bserrno);
    5489         587 : }
    5490             : 
    5491             : static void
    5492         587 : bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5493             : {
    5494         587 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5495             : 
    5496             :         /* Write super block */
    5497         587 :         bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
    5498         587 :                               bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
    5499             :                               bs_init_persist_super_cpl, ctx);
    5500         587 : }
    5501             : 
    5502             : void
    5503         607 : spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
    5504             :              spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
    5505             : {
    5506             :         struct spdk_bs_load_ctx *ctx;
    5507             :         struct spdk_blob_store  *bs;
    5508             :         struct spdk_bs_cpl      cpl;
    5509             :         spdk_bs_sequence_t      *seq;
    5510             :         spdk_bs_batch_t         *batch;
    5511             :         uint64_t                num_md_lba;
    5512             :         uint64_t                num_md_pages;
    5513             :         uint64_t                num_md_clusters;
    5514             :         uint64_t                max_used_cluster_mask_len;
    5515             :         uint32_t                i;
    5516         607 :         struct spdk_bs_opts     opts = {};
    5517             :         int                     rc;
    5518             :         uint64_t                lba, lba_count;
    5519             : 
    5520         607 :         SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev);
    5521         607 :         if ((dev->phys_blocklen % dev->blocklen) != 0) {
    5522           5 :                 SPDK_ERRLOG("unsupported dev block length of %d\n",
    5523             :                             dev->blocklen);
    5524           5 :                 dev->destroy(dev);
    5525           5 :                 cb_fn(cb_arg, NULL, -EINVAL);
    5526          20 :                 return;
    5527             :         }
    5528             : 
    5529         602 :         spdk_bs_opts_init(&opts, sizeof(opts));
    5530         602 :         if (o) {
    5531         223 :                 if (bs_opts_copy(o, &opts)) {
    5532           0 :                         return;
    5533             :                 }
    5534             :         }
    5535             : 
    5536         602 :         if (bs_opts_verify(&opts) != 0) {
    5537          10 :                 dev->destroy(dev);
    5538          10 :                 cb_fn(cb_arg, NULL, -EINVAL);
    5539          10 :                 return;
    5540             :         }
    5541             : 
    5542         592 :         rc = bs_alloc(dev, &opts, &bs, &ctx);
    5543         592 :         if (rc) {
    5544           0 :                 dev->destroy(dev);
    5545           0 :                 cb_fn(cb_arg, NULL, rc);
    5546           0 :                 return;
    5547             :         }
    5548             : 
    5549         592 :         if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) {
    5550             :                 /* By default, allocate 1 page per cluster.
    5551             :                  * Technically, this over-allocates metadata
    5552             :                  * because more metadata will reduce the number
    5553             :                  * of usable clusters. This can be addressed with
    5554             :                  * more complex math in the future.
    5555             :                  */
    5556         582 :                 bs->md_len = bs->total_clusters;
    5557             :         } else {
    5558          10 :                 bs->md_len = opts.num_md_pages;
    5559             :         }
    5560         592 :         rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len);
    5561         592 :         if (rc < 0) {
    5562           0 :                 spdk_free(ctx->super);
    5563           0 :                 free(ctx);
    5564           0 :                 bs_free(bs);
    5565           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    5566           0 :                 return;
    5567             :         }
    5568             : 
    5569         592 :         rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len);
    5570         592 :         if (rc < 0) {
    5571           0 :                 spdk_free(ctx->super);
    5572           0 :                 free(ctx);
    5573           0 :                 bs_free(bs);
    5574           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    5575           0 :                 return;
    5576             :         }
    5577             : 
    5578         592 :         rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len);
    5579         592 :         if (rc < 0) {
    5580           0 :                 spdk_free(ctx->super);
    5581           0 :                 free(ctx);
    5582           0 :                 bs_free(bs);
    5583           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    5584           0 :                 return;
    5585             :         }
    5586             : 
    5587         592 :         memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
    5588             :                sizeof(ctx->super->signature));
    5589         592 :         ctx->super->version = SPDK_BS_VERSION;
    5590         592 :         ctx->super->length = sizeof(*ctx->super);
    5591         592 :         ctx->super->super_blob = bs->super_blob;
    5592         592 :         ctx->super->clean = 0;
    5593         592 :         ctx->super->cluster_size = bs->cluster_sz;
    5594         592 :         ctx->super->io_unit_size = bs->io_unit_size;
    5595         592 :         ctx->super->md_page_size = bs->md_page_size;
    5596         592 :         memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype));
    5597             : 
    5598             :         /* Calculate how many pages the metadata consumes at the front
    5599             :          * of the disk.
    5600             :          */
    5601             : 
    5602             :         /* The super block uses 1 page */
    5603         592 :         num_md_pages = 1;
    5604             : 
    5605             :         /* The used_md_pages mask requires 1 bit per metadata page, rounded
    5606             :          * up to the nearest page, plus a header.
    5607             :          */
    5608         592 :         ctx->super->used_page_mask_start = num_md_pages;
    5609         592 :         ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
    5610         592 :                                          spdk_divide_round_up(bs->md_len, 8),
    5611         592 :                                          ctx->super->md_page_size);
    5612         592 :         num_md_pages += ctx->super->used_page_mask_len;
    5613             : 
    5614             :         /* The used_clusters mask requires 1 bit per cluster, rounded
    5615             :          * up to the nearest page, plus a header.
    5616             :          */
    5617         592 :         ctx->super->used_cluster_mask_start = num_md_pages;
    5618         592 :         ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
    5619         592 :                                             spdk_divide_round_up(bs->total_clusters, 8),
    5620         592 :                                             ctx->super->md_page_size);
    5621             :         /* The blobstore might be extended, then the used_cluster bitmap will need more space.
    5622             :          * Here we calculate the max clusters we can support according to the
    5623             :          * num_md_pages (bs->md_len).
    5624             :          */
    5625         592 :         max_used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
    5626         592 :                                     spdk_divide_round_up(bs->md_len, 8),
    5627         592 :                                     ctx->super->md_page_size);
    5628         592 :         max_used_cluster_mask_len = spdk_max(max_used_cluster_mask_len,
    5629             :                                              ctx->super->used_cluster_mask_len);
    5630         592 :         num_md_pages += max_used_cluster_mask_len;
    5631             : 
    5632             :         /* The used_blobids mask requires 1 bit per metadata page, rounded
    5633             :          * up to the nearest page, plus a header.
    5634             :          */
    5635         592 :         ctx->super->used_blobid_mask_start = num_md_pages;
    5636         592 :         ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
    5637         592 :                                            spdk_divide_round_up(bs->md_len, 8),
    5638         592 :                                            ctx->super->md_page_size);
    5639         592 :         num_md_pages += ctx->super->used_blobid_mask_len;
    5640             : 
    5641             :         /* The metadata region size was chosen above */
    5642         592 :         ctx->super->md_start = bs->md_start = num_md_pages;
    5643         592 :         ctx->super->md_len = bs->md_len;
    5644         592 :         num_md_pages += bs->md_len;
    5645             : 
    5646         592 :         num_md_lba = bs_page_to_lba(bs, num_md_pages);
    5647             : 
    5648         592 :         ctx->super->size = dev->blockcnt * dev->blocklen;
    5649             : 
    5650         592 :         ctx->super->crc = blob_md_page_calc_crc(ctx->super);
    5651             : 
    5652         592 :         num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster);
    5653         592 :         if (num_md_clusters > bs->total_clusters) {
    5654           5 :                 SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, "
    5655             :                             "please decrease number of pages reserved for metadata "
    5656             :                             "or increase cluster size.\n");
    5657           5 :                 spdk_free(ctx->super);
    5658           5 :                 spdk_bit_array_free(&ctx->used_clusters);
    5659           5 :                 free(ctx);
    5660           5 :                 bs_free(bs);
    5661           5 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    5662           5 :                 return;
    5663             :         }
    5664             :         /* Claim all of the clusters used by the metadata */
    5665       79357 :         for (i = 0; i < num_md_clusters; i++) {
    5666       78770 :                 spdk_bit_array_set(ctx->used_clusters, i);
    5667             :         }
    5668             : 
    5669         587 :         bs->num_free_clusters -= num_md_clusters;
    5670         587 :         bs->total_data_clusters = bs->num_free_clusters;
    5671             : 
    5672         587 :         cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
    5673         587 :         cpl.u.bs_handle.cb_fn = cb_fn;
    5674         587 :         cpl.u.bs_handle.cb_arg = cb_arg;
    5675         587 :         cpl.u.bs_handle.bs = bs;
    5676             : 
    5677         587 :         seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    5678         587 :         if (!seq) {
    5679           0 :                 spdk_free(ctx->super);
    5680           0 :                 free(ctx);
    5681           0 :                 bs_free(bs);
    5682           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    5683           0 :                 return;
    5684             :         }
    5685             : 
    5686         587 :         batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx);
    5687             : 
    5688             :         /* Clear metadata space */
    5689         587 :         bs_batch_write_zeroes_dev(batch, 0, num_md_lba);
    5690             : 
    5691         587 :         lba = num_md_lba;
    5692         587 :         lba_count = ctx->bs->dev->blockcnt - lba;
    5693         587 :         switch (opts.clear_method) {
    5694         567 :         case BS_CLEAR_WITH_UNMAP:
    5695             :                 /* Trim data clusters */
    5696         567 :                 bs_batch_unmap_dev(batch, lba, lba_count);
    5697         567 :                 break;
    5698           0 :         case BS_CLEAR_WITH_WRITE_ZEROES:
    5699             :                 /* Write_zeroes to data clusters */
    5700           0 :                 bs_batch_write_zeroes_dev(batch, lba, lba_count);
    5701           0 :                 break;
    5702          20 :         case BS_CLEAR_WITH_NONE:
    5703             :         default:
    5704          20 :                 break;
    5705             :         }
    5706             : 
    5707         587 :         bs_batch_close(batch);
    5708             : }
    5709             : 
    5710             : /* END spdk_bs_init */
    5711             : 
    5712             : /* START spdk_bs_destroy */
    5713             : 
    5714             : static void
    5715           5 : bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5716             : {
    5717           5 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5718           5 :         struct spdk_blob_store *bs = ctx->bs;
    5719             : 
    5720             :         /*
    5721             :          * We need to defer calling bs_call_cpl() until after
    5722             :          * dev destruction, so tuck these away for later use.
    5723             :          */
    5724           5 :         bs->unload_err = bserrno;
    5725           5 :         memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
    5726           5 :         seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
    5727             : 
    5728           5 :         bs_sequence_finish(seq, bserrno);
    5729             : 
    5730           5 :         bs_free(bs);
    5731           5 :         free(ctx);
    5732           5 : }
    5733             : 
    5734             : void
    5735           5 : spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn,
    5736             :                 void *cb_arg)
    5737             : {
    5738             :         struct spdk_bs_cpl      cpl;
    5739             :         spdk_bs_sequence_t      *seq;
    5740             :         struct spdk_bs_load_ctx *ctx;
    5741             : 
    5742           5 :         SPDK_DEBUGLOG(blob, "Destroying blobstore\n");
    5743             : 
    5744           5 :         if (!RB_EMPTY(&bs->open_blobs)) {
    5745           0 :                 SPDK_ERRLOG("Blobstore still has open blobs\n");
    5746           0 :                 cb_fn(cb_arg, -EBUSY);
    5747           0 :                 return;
    5748             :         }
    5749             : 
    5750           5 :         cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
    5751           5 :         cpl.u.bs_basic.cb_fn = cb_fn;
    5752           5 :         cpl.u.bs_basic.cb_arg = cb_arg;
    5753             : 
    5754           5 :         ctx = calloc(1, sizeof(*ctx));
    5755           5 :         if (!ctx) {
    5756           0 :                 cb_fn(cb_arg, -ENOMEM);
    5757           0 :                 return;
    5758             :         }
    5759             : 
    5760           5 :         ctx->bs = bs;
    5761             : 
    5762           5 :         seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    5763           5 :         if (!seq) {
    5764           0 :                 free(ctx);
    5765           0 :                 cb_fn(cb_arg, -ENOMEM);
    5766           0 :                 return;
    5767             :         }
    5768             : 
    5769             :         /* Write zeroes to the super block */
    5770           5 :         bs_sequence_write_zeroes_dev(seq,
    5771             :                                      bs_page_to_lba(bs, 0),
    5772             :                                      bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)),
    5773             :                                      bs_destroy_trim_cpl, ctx);
    5774             : }
    5775             : 
    5776             : /* END spdk_bs_destroy */
    5777             : 
    5778             : /* START spdk_bs_unload */
    5779             : 
    5780             : static void
    5781         814 : bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno)
    5782             : {
    5783         814 :         spdk_bs_sequence_t *seq = ctx->seq;
    5784             : 
    5785         814 :         spdk_free(ctx->super);
    5786             : 
    5787             :         /*
    5788             :          * We need to defer calling bs_call_cpl() until after
    5789             :          * dev destruction, so tuck these away for later use.
    5790             :          */
    5791         814 :         ctx->bs->unload_err = bserrno;
    5792         814 :         memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
    5793         814 :         seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
    5794             : 
    5795         814 :         bs_sequence_finish(seq, bserrno);
    5796             : 
    5797         814 :         bs_free(ctx->bs);
    5798         814 :         free(ctx);
    5799         814 : }
    5800             : 
    5801             : static void
    5802         814 : bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5803             : {
    5804         814 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5805             : 
    5806         814 :         bs_unload_finish(ctx, bserrno);
    5807         814 : }
    5808             : 
    5809             : static void
    5810         814 : bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5811             : {
    5812         814 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5813             : 
    5814         814 :         spdk_free(ctx->mask);
    5815             : 
    5816         814 :         if (bserrno != 0) {
    5817           0 :                 bs_unload_finish(ctx, bserrno);
    5818           0 :                 return;
    5819             :         }
    5820             : 
    5821         814 :         ctx->super->clean = 1;
    5822             : 
    5823         814 :         bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx);
    5824             : }
    5825             : 
    5826             : static void
    5827         814 : bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5828             : {
    5829         814 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5830             : 
    5831         814 :         spdk_free(ctx->mask);
    5832         814 :         ctx->mask = NULL;
    5833             : 
    5834         814 :         if (bserrno != 0) {
    5835           0 :                 bs_unload_finish(ctx, bserrno);
    5836           0 :                 return;
    5837             :         }
    5838             : 
    5839         814 :         bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl);
    5840             : }
    5841             : 
    5842             : static void
    5843         814 : bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5844             : {
    5845         814 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5846             : 
    5847         814 :         spdk_free(ctx->mask);
    5848         814 :         ctx->mask = NULL;
    5849             : 
    5850         814 :         if (bserrno != 0) {
    5851           0 :                 bs_unload_finish(ctx, bserrno);
    5852           0 :                 return;
    5853             :         }
    5854             : 
    5855         814 :         bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl);
    5856             : }
    5857             : 
    5858             : static void
    5859         814 : bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5860             : {
    5861         814 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    5862             :         int rc;
    5863             : 
    5864         814 :         if (bserrno != 0) {
    5865           0 :                 bs_unload_finish(ctx, bserrno);
    5866           0 :                 return;
    5867             :         }
    5868             : 
    5869         814 :         rc = bs_super_validate(ctx->super, ctx->bs);
    5870         814 :         if (rc != 0) {
    5871           0 :                 bs_unload_finish(ctx, rc);
    5872           0 :                 return;
    5873             :         }
    5874             : 
    5875         814 :         bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl);
    5876             : }
    5877             : 
    5878             : void
    5879         824 : spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg)
    5880             : {
    5881             :         struct spdk_bs_cpl      cpl;
    5882             :         struct spdk_bs_load_ctx *ctx;
    5883             : 
    5884         824 :         SPDK_DEBUGLOG(blob, "Syncing blobstore\n");
    5885             : 
    5886             :         /*
    5887             :          * If external snapshot channels are being destroyed while the blobstore is unloaded, the
    5888             :          * unload is deferred until after the channel destruction completes.
    5889             :          */
    5890         824 :         if (bs->esnap_channels_unloading != 0) {
    5891           5 :                 if (bs->esnap_unload_cb_fn != NULL) {
    5892           0 :                         SPDK_ERRLOG("Blobstore unload in progress\n");
    5893           0 :                         cb_fn(cb_arg, -EBUSY);
    5894          10 :                         return;
    5895             :                 }
    5896           5 :                 SPDK_DEBUGLOG(blob_esnap, "Blobstore unload deferred: %" PRIu32
    5897             :                               " esnap clones are unloading\n", bs->esnap_channels_unloading);
    5898           5 :                 bs->esnap_unload_cb_fn = cb_fn;
    5899           5 :                 bs->esnap_unload_cb_arg = cb_arg;
    5900           5 :                 return;
    5901             :         }
    5902         819 :         if (bs->esnap_unload_cb_fn != NULL) {
    5903           5 :                 SPDK_DEBUGLOG(blob_esnap, "Blobstore deferred unload progressing\n");
    5904           5 :                 assert(bs->esnap_unload_cb_fn == cb_fn);
    5905           5 :                 assert(bs->esnap_unload_cb_arg == cb_arg);
    5906           5 :                 bs->esnap_unload_cb_fn = NULL;
    5907           5 :                 bs->esnap_unload_cb_arg = NULL;
    5908             :         }
    5909             : 
    5910         819 :         if (!RB_EMPTY(&bs->open_blobs)) {
    5911           5 :                 SPDK_ERRLOG("Blobstore still has open blobs\n");
    5912           5 :                 cb_fn(cb_arg, -EBUSY);
    5913           5 :                 return;
    5914             :         }
    5915             : 
    5916         814 :         ctx = calloc(1, sizeof(*ctx));
    5917         814 :         if (!ctx) {
    5918           0 :                 cb_fn(cb_arg, -ENOMEM);
    5919           0 :                 return;
    5920             :         }
    5921             : 
    5922         814 :         ctx->bs = bs;
    5923             : 
    5924         814 :         ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
    5925             :                                   SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    5926         814 :         if (!ctx->super) {
    5927           0 :                 free(ctx);
    5928           0 :                 cb_fn(cb_arg, -ENOMEM);
    5929           0 :                 return;
    5930             :         }
    5931             : 
    5932         814 :         cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
    5933         814 :         cpl.u.bs_basic.cb_fn = cb_fn;
    5934         814 :         cpl.u.bs_basic.cb_arg = cb_arg;
    5935             : 
    5936         814 :         ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    5937         814 :         if (!ctx->seq) {
    5938           0 :                 spdk_free(ctx->super);
    5939           0 :                 free(ctx);
    5940           0 :                 cb_fn(cb_arg, -ENOMEM);
    5941           0 :                 return;
    5942             :         }
    5943             : 
    5944             :         /* Read super block */
    5945         814 :         bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
    5946         814 :                              bs_byte_to_lba(bs, sizeof(*ctx->super)),
    5947             :                              bs_unload_read_super_cpl, ctx);
    5948             : }
    5949             : 
    5950             : /* END spdk_bs_unload */
    5951             : 
    5952             : /* START spdk_bs_set_super */
    5953             : 
    5954             : struct spdk_bs_set_super_ctx {
    5955             :         struct spdk_blob_store          *bs;
    5956             :         struct spdk_bs_super_block      *super;
    5957             : };
    5958             : 
    5959             : static void
    5960          10 : bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5961             : {
    5962          10 :         struct spdk_bs_set_super_ctx    *ctx = cb_arg;
    5963             : 
    5964          10 :         if (bserrno != 0) {
    5965           0 :                 SPDK_ERRLOG("Unable to write to super block of blobstore\n");
    5966             :         }
    5967             : 
    5968          10 :         spdk_free(ctx->super);
    5969             : 
    5970          10 :         bs_sequence_finish(seq, bserrno);
    5971             : 
    5972          10 :         free(ctx);
    5973          10 : }
    5974             : 
    5975             : static void
    5976          10 : bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    5977             : {
    5978          10 :         struct spdk_bs_set_super_ctx    *ctx = cb_arg;
    5979             :         int rc;
    5980             : 
    5981          10 :         if (bserrno != 0) {
    5982           0 :                 SPDK_ERRLOG("Unable to read super block of blobstore\n");
    5983           0 :                 spdk_free(ctx->super);
    5984           0 :                 bs_sequence_finish(seq, bserrno);
    5985           0 :                 free(ctx);
    5986           0 :                 return;
    5987             :         }
    5988             : 
    5989          10 :         rc = bs_super_validate(ctx->super, ctx->bs);
    5990          10 :         if (rc != 0) {
    5991           0 :                 SPDK_ERRLOG("Not a valid super block\n");
    5992           0 :                 spdk_free(ctx->super);
    5993           0 :                 bs_sequence_finish(seq, rc);
    5994           0 :                 free(ctx);
    5995           0 :                 return;
    5996             :         }
    5997             : 
    5998          10 :         bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx);
    5999             : }
    6000             : 
    6001             : void
    6002          10 : spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid,
    6003             :                   spdk_bs_op_complete cb_fn, void *cb_arg)
    6004             : {
    6005             :         struct spdk_bs_cpl              cpl;
    6006             :         spdk_bs_sequence_t              *seq;
    6007             :         struct spdk_bs_set_super_ctx    *ctx;
    6008             : 
    6009          10 :         SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n");
    6010             : 
    6011          10 :         ctx = calloc(1, sizeof(*ctx));
    6012          10 :         if (!ctx) {
    6013           0 :                 cb_fn(cb_arg, -ENOMEM);
    6014           0 :                 return;
    6015             :         }
    6016             : 
    6017          10 :         ctx->bs = bs;
    6018             : 
    6019          10 :         ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
    6020             :                                   SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    6021          10 :         if (!ctx->super) {
    6022           0 :                 free(ctx);
    6023           0 :                 cb_fn(cb_arg, -ENOMEM);
    6024           0 :                 return;
    6025             :         }
    6026             : 
    6027          10 :         cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
    6028          10 :         cpl.u.bs_basic.cb_fn = cb_fn;
    6029          10 :         cpl.u.bs_basic.cb_arg = cb_arg;
    6030             : 
    6031          10 :         seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    6032          10 :         if (!seq) {
    6033           0 :                 spdk_free(ctx->super);
    6034           0 :                 free(ctx);
    6035           0 :                 cb_fn(cb_arg, -ENOMEM);
    6036           0 :                 return;
    6037             :         }
    6038             : 
    6039          10 :         bs->super_blob = blobid;
    6040             : 
    6041             :         /* Read super block */
    6042          10 :         bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0),
    6043          10 :                              bs_byte_to_lba(bs, sizeof(*ctx->super)),
    6044             :                              bs_set_super_read_cpl, ctx);
    6045             : }
    6046             : 
    6047             : /* END spdk_bs_set_super */
    6048             : 
    6049             : void
    6050          15 : spdk_bs_get_super(struct spdk_blob_store *bs,
    6051             :                   spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
    6052             : {
    6053          15 :         if (bs->super_blob == SPDK_BLOBID_INVALID) {
    6054           5 :                 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT);
    6055             :         } else {
    6056          10 :                 cb_fn(cb_arg, bs->super_blob, 0);
    6057             :         }
    6058          15 : }
    6059             : 
    6060             : uint64_t
    6061         254 : spdk_bs_get_cluster_size(struct spdk_blob_store *bs)
    6062             : {
    6063         254 :         return bs->cluster_sz;
    6064             : }
    6065             : 
    6066             : uint64_t
    6067         114 : spdk_bs_get_page_size(struct spdk_blob_store *bs)
    6068             : {
    6069         114 :         return bs->md_page_size;
    6070             : }
    6071             : 
    6072             : uint64_t
    6073        1006 : spdk_bs_get_io_unit_size(struct spdk_blob_store *bs)
    6074             : {
    6075        1006 :         return bs->io_unit_size;
    6076             : }
    6077             : 
    6078             : uint64_t
    6079         700 : spdk_bs_free_cluster_count(struct spdk_blob_store *bs)
    6080             : {
    6081         700 :         return bs->num_free_clusters;
    6082             : }
    6083             : 
    6084             : uint64_t
    6085         194 : spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs)
    6086             : {
    6087         194 :         return bs->total_data_clusters;
    6088             : }
    6089             : 
    6090             : static int
    6091         973 : bs_register_md_thread(struct spdk_blob_store *bs)
    6092             : {
    6093         973 :         bs->md_channel = spdk_get_io_channel(bs);
    6094         973 :         if (!bs->md_channel) {
    6095           0 :                 SPDK_ERRLOG("Failed to get IO channel.\n");
    6096           0 :                 return -1;
    6097             :         }
    6098             : 
    6099         973 :         return 0;
    6100             : }
    6101             : 
    6102             : static int
    6103         973 : bs_unregister_md_thread(struct spdk_blob_store *bs)
    6104             : {
    6105         973 :         spdk_put_io_channel(bs->md_channel);
    6106             : 
    6107         973 :         return 0;
    6108             : }
    6109             : 
    6110             : spdk_blob_id
    6111         712 : spdk_blob_get_id(struct spdk_blob *blob)
    6112             : {
    6113         712 :         assert(blob != NULL);
    6114             : 
    6115         712 :         return blob->id;
    6116             : }
    6117             : 
    6118             : uint64_t
    6119          30 : spdk_blob_get_num_io_units(struct spdk_blob *blob)
    6120             : {
    6121          30 :         assert(blob != NULL);
    6122             : 
    6123          30 :         return bs_cluster_to_io_unit(blob->bs, blob->active.num_clusters);
    6124             : }
    6125             : 
    6126             : uint64_t
    6127         707 : spdk_blob_get_num_clusters(struct spdk_blob *blob)
    6128             : {
    6129         707 :         assert(blob != NULL);
    6130             : 
    6131         707 :         return blob->active.num_clusters;
    6132             : }
    6133             : 
    6134             : uint64_t
    6135         415 : spdk_blob_get_num_allocated_clusters(struct spdk_blob *blob)
    6136             : {
    6137         415 :         assert(blob != NULL);
    6138             : 
    6139         415 :         return blob->active.num_allocated_clusters;
    6140             : }
    6141             : 
    6142             : static uint64_t
    6143          30 : blob_find_io_unit(struct spdk_blob *blob, uint64_t offset, bool is_allocated)
    6144             : {
    6145          30 :         uint64_t blob_io_unit_num = spdk_blob_get_num_io_units(blob);
    6146             : 
    6147          55 :         while (offset < blob_io_unit_num) {
    6148          50 :                 if (bs_io_unit_is_allocated(blob, offset) == is_allocated) {
    6149          25 :                         return offset;
    6150             :                 }
    6151             : 
    6152          25 :                 offset += bs_num_io_units_to_cluster_boundary(blob, offset);
    6153             :         }
    6154             : 
    6155           5 :         return UINT64_MAX;
    6156             : }
    6157             : 
    6158             : uint64_t
    6159          15 : spdk_blob_get_next_allocated_io_unit(struct spdk_blob *blob, uint64_t offset)
    6160             : {
    6161          15 :         return blob_find_io_unit(blob, offset, true);
    6162             : }
    6163             : 
    6164             : uint64_t
    6165          15 : spdk_blob_get_next_unallocated_io_unit(struct spdk_blob *blob, uint64_t offset)
    6166             : {
    6167          15 :         return blob_find_io_unit(blob, offset, false);
    6168             : }
    6169             : 
    6170             : /* START spdk_bs_create_blob */
    6171             : 
    6172             : static void
    6173        2346 : bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    6174             : {
    6175        2346 :         struct spdk_blob *blob = cb_arg;
    6176        2346 :         uint32_t page_idx = bs_blobid_to_page(blob->id);
    6177             : 
    6178        2346 :         if (bserrno != 0) {
    6179           0 :                 spdk_spin_lock(&blob->bs->used_lock);
    6180           0 :                 spdk_bit_array_clear(blob->bs->used_blobids, page_idx);
    6181           0 :                 bs_release_md_page(blob->bs, page_idx);
    6182           0 :                 spdk_spin_unlock(&blob->bs->used_lock);
    6183             :         }
    6184             : 
    6185        2346 :         blob_free(blob);
    6186             : 
    6187        2346 :         bs_sequence_finish(seq, bserrno);
    6188        2346 : }
    6189             : 
    6190             : static int
    6191        4717 : blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs,
    6192             :                 bool internal)
    6193             : {
    6194             :         uint64_t i;
    6195        4717 :         size_t value_len = 0;
    6196             :         int rc;
    6197        4717 :         const void *value = NULL;
    6198        4717 :         if (xattrs->count > 0 && xattrs->get_value == NULL) {
    6199          10 :                 return -EINVAL;
    6200             :         }
    6201        5103 :         for (i = 0; i < xattrs->count; i++) {
    6202         401 :                 xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len);
    6203         401 :                 if (value == NULL || value_len == 0) {
    6204           5 :                         return -EINVAL;
    6205             :                 }
    6206         396 :                 rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal);
    6207         396 :                 if (rc < 0) {
    6208           0 :                         return rc;
    6209             :                 }
    6210             :         }
    6211        4702 :         return 0;
    6212             : }
    6213             : 
    6214             : static void
    6215        2330 : blob_opts_copy(const struct spdk_blob_opts *src, struct spdk_blob_opts *dst)
    6216             : {
    6217             : #define FIELD_OK(field) \
    6218             :         offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size
    6219             : 
    6220             : #define SET_FIELD(field) \
    6221             :         if (FIELD_OK(field)) { \
    6222             :                 dst->field = src->field; \
    6223             :         } \
    6224             : 
    6225        2330 :         SET_FIELD(num_clusters);
    6226        2330 :         SET_FIELD(thin_provision);
    6227        2330 :         SET_FIELD(clear_method);
    6228             : 
    6229        2330 :         if (FIELD_OK(xattrs)) {
    6230        2330 :                 memcpy(&dst->xattrs, &src->xattrs, sizeof(src->xattrs));
    6231             :         }
    6232             : 
    6233        2330 :         SET_FIELD(use_extent_table);
    6234        2330 :         SET_FIELD(esnap_id);
    6235        2330 :         SET_FIELD(esnap_id_len);
    6236             : 
    6237        2330 :         dst->opts_size = src->opts_size;
    6238             : 
    6239             :         /* You should not remove this statement, but need to update the assert statement
    6240             :          * if you add a new field, and also add a corresponding SET_FIELD statement */
    6241             :         SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_opts) == 80, "Incorrect size");
    6242             : 
    6243             : #undef FIELD_OK
    6244             : #undef SET_FIELD
    6245        2330 : }
    6246             : 
    6247             : static void
    6248        2366 : bs_create_blob(struct spdk_blob_store *bs,
    6249             :                const struct spdk_blob_opts *opts,
    6250             :                const struct spdk_blob_xattr_opts *internal_xattrs,
    6251             :                spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
    6252             : {
    6253             :         struct spdk_blob        *blob;
    6254             :         uint32_t                page_idx;
    6255             :         struct spdk_bs_cpl      cpl;
    6256             :         struct spdk_blob_opts   opts_local;
    6257             :         struct spdk_blob_xattr_opts internal_xattrs_default;
    6258             :         spdk_bs_sequence_t      *seq;
    6259             :         spdk_blob_id            id;
    6260             :         int rc;
    6261             : 
    6262        2366 :         assert(spdk_get_thread() == bs->md_thread);
    6263             : 
    6264        2366 :         spdk_spin_lock(&bs->used_lock);
    6265        2366 :         page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0);
    6266        2366 :         if (page_idx == UINT32_MAX) {
    6267           0 :                 spdk_spin_unlock(&bs->used_lock);
    6268           0 :                 cb_fn(cb_arg, 0, -ENOMEM);
    6269        2346 :                 return;
    6270             :         }
    6271        2366 :         spdk_bit_array_set(bs->used_blobids, page_idx);
    6272        2366 :         bs_claim_md_page(bs, page_idx);
    6273        2366 :         spdk_spin_unlock(&bs->used_lock);
    6274             : 
    6275        2366 :         id = bs_page_to_blobid(page_idx);
    6276             : 
    6277        2366 :         SPDK_DEBUGLOG(blob, "Creating blob with id 0x%" PRIx64 " at page %u\n", id, page_idx);
    6278             : 
    6279        2366 :         spdk_blob_opts_init(&opts_local, sizeof(opts_local));
    6280        2366 :         if (opts) {
    6281        2330 :                 blob_opts_copy(opts, &opts_local);
    6282             :         }
    6283             : 
    6284        2366 :         blob = blob_alloc(bs, id);
    6285        2366 :         if (!blob) {
    6286           0 :                 rc = -ENOMEM;
    6287           0 :                 goto error;
    6288             :         }
    6289             : 
    6290        2366 :         blob->use_extent_table = opts_local.use_extent_table;
    6291        2366 :         if (blob->use_extent_table) {
    6292        1440 :                 blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE;
    6293             :         }
    6294             : 
    6295        2366 :         if (!internal_xattrs) {
    6296        2025 :                 blob_xattrs_init(&internal_xattrs_default);
    6297        2025 :                 internal_xattrs = &internal_xattrs_default;
    6298             :         }
    6299             : 
    6300        2366 :         rc = blob_set_xattrs(blob, &opts_local.xattrs, false);
    6301        2366 :         if (rc < 0) {
    6302          15 :                 goto error;
    6303             :         }
    6304             : 
    6305        2351 :         rc = blob_set_xattrs(blob, internal_xattrs, true);
    6306        2351 :         if (rc < 0) {
    6307           0 :                 goto error;
    6308             :         }
    6309             : 
    6310        2351 :         if (opts_local.thin_provision) {
    6311         446 :                 blob_set_thin_provision(blob);
    6312             :         }
    6313             : 
    6314        2351 :         blob_set_clear_method(blob, opts_local.clear_method);
    6315             : 
    6316        2351 :         if (opts_local.esnap_id != NULL) {
    6317          75 :                 if (opts_local.esnap_id_len > UINT16_MAX) {
    6318           0 :                         SPDK_ERRLOG("esnap id length %" PRIu64 "is too long\n",
    6319             :                                     opts_local.esnap_id_len);
    6320           0 :                         rc = -EINVAL;
    6321           0 :                         goto error;
    6322             : 
    6323             :                 }
    6324          75 :                 blob_set_thin_provision(blob);
    6325          75 :                 blob->invalid_flags |= SPDK_BLOB_EXTERNAL_SNAPSHOT;
    6326          75 :                 rc = blob_set_xattr(blob, BLOB_EXTERNAL_SNAPSHOT_ID,
    6327          75 :                                     opts_local.esnap_id, opts_local.esnap_id_len, true);
    6328          75 :                 if (rc != 0) {
    6329           0 :                         goto error;
    6330             :                 }
    6331             :         }
    6332             : 
    6333        2351 :         rc = blob_resize(blob, opts_local.num_clusters);
    6334        2351 :         if (rc < 0) {
    6335           5 :                 goto error;
    6336             :         }
    6337        2346 :         cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
    6338        2346 :         cpl.u.blobid.cb_fn = cb_fn;
    6339        2346 :         cpl.u.blobid.cb_arg = cb_arg;
    6340        2346 :         cpl.u.blobid.blobid = blob->id;
    6341             : 
    6342        2346 :         seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    6343        2346 :         if (!seq) {
    6344           0 :                 rc = -ENOMEM;
    6345           0 :                 goto error;
    6346             :         }
    6347             : 
    6348        2346 :         blob_persist(seq, blob, bs_create_blob_cpl, blob);
    6349        2346 :         return;
    6350             : 
    6351          20 : error:
    6352          20 :         SPDK_ERRLOG("Failed to create blob: %s, size in clusters/size: %lu (clusters)\n",
    6353             :                     spdk_strerror(rc), opts_local.num_clusters);
    6354          20 :         if (blob != NULL) {
    6355          20 :                 blob_free(blob);
    6356             :         }
    6357          20 :         spdk_spin_lock(&bs->used_lock);
    6358          20 :         spdk_bit_array_clear(bs->used_blobids, page_idx);
    6359          20 :         bs_release_md_page(bs, page_idx);
    6360          20 :         spdk_spin_unlock(&bs->used_lock);
    6361          20 :         cb_fn(cb_arg, 0, rc);
    6362             : }
    6363             : 
    6364             : void
    6365          16 : spdk_bs_create_blob(struct spdk_blob_store *bs,
    6366             :                     spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
    6367             : {
    6368          16 :         bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg);
    6369          16 : }
    6370             : 
    6371             : void
    6372        1999 : spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts,
    6373             :                         spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
    6374             : {
    6375        1999 :         bs_create_blob(bs, opts, NULL, cb_fn, cb_arg);
    6376        1999 : }
    6377             : 
    6378             : /* END spdk_bs_create_blob */
    6379             : 
    6380             : /* START blob_cleanup */
    6381             : 
    6382             : struct spdk_clone_snapshot_ctx {
    6383             :         struct spdk_bs_cpl      cpl;
    6384             :         int bserrno;
    6385             :         bool frozen;
    6386             : 
    6387             :         struct spdk_io_channel *channel;
    6388             : 
    6389             :         /* Current cluster for inflate operation */
    6390             :         uint64_t cluster;
    6391             : 
    6392             :         /* For inflation force allocation of all unallocated clusters and remove
    6393             :          * thin-provisioning. Otherwise only decouple parent and keep clone thin. */
    6394             :         bool allocate_all;
    6395             : 
    6396             :         struct {
    6397             :                 spdk_blob_id id;
    6398             :                 struct spdk_blob *blob;
    6399             :                 bool md_ro;
    6400             :         } original;
    6401             :         struct {
    6402             :                 spdk_blob_id id;
    6403             :                 struct spdk_blob *blob;
    6404             :         } new;
    6405             : 
    6406             :         /* xattrs specified for snapshot/clones only. They have no impact on
    6407             :          * the original blobs xattrs. */
    6408             :         const struct spdk_blob_xattr_opts *xattrs;
    6409             : };
    6410             : 
    6411             : static void
    6412         429 : bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno)
    6413             : {
    6414         429 :         struct spdk_clone_snapshot_ctx *ctx = cb_arg;
    6415         429 :         struct spdk_bs_cpl *cpl = &ctx->cpl;
    6416             : 
    6417         429 :         if (bserrno != 0) {
    6418           8 :                 if (ctx->bserrno != 0) {
    6419           0 :                         SPDK_ERRLOG("Cleanup error %d\n", bserrno);
    6420             :                 } else {
    6421           8 :                         ctx->bserrno = bserrno;
    6422             :                 }
    6423             :         }
    6424             : 
    6425         429 :         switch (cpl->type) {
    6426         354 :         case SPDK_BS_CPL_TYPE_BLOBID:
    6427         354 :                 cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno);
    6428         354 :                 break;
    6429          75 :         case SPDK_BS_CPL_TYPE_BLOB_BASIC:
    6430          75 :                 cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno);
    6431          75 :                 break;
    6432           0 :         default:
    6433           0 :                 SPDK_UNREACHABLE();
    6434             :                 break;
    6435             :         }
    6436             : 
    6437         429 :         free(ctx);
    6438         429 : }
    6439             : 
    /*
     * Last step of the original-blob cleanup chain.
     *
     * Passed as the completion of blob_unfreeze_io() and also called directly,
     * with bserrno 0, by bs_clone_snapshot_origblob_cleanup() when ctx->frozen
     * is not set.
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * bserrno - status of the preceding step. The first non-zero status is
     *           kept in ctx->bserrno; if one is already stored, the new one is
     *           only logged.
     *
     * Copies the blob id into ctx->original.id, clears
     * locked_operation_in_progress, restores md_ro from ctx->original.md_ro and
     * closes the blob with bs_clone_snapshot_cleanup_finish() as the close
     * completion.
     */
    6440             : static void
    6441         411 : bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
    6442             : {
    6443         411 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6444         411 :         struct spdk_blob *origblob = ctx->original.blob;
    6445             : 
    6446         411 :         if (bserrno != 0) {
    6447           0 :                 if (ctx->bserrno != 0) {
    6448           0 :                         SPDK_ERRLOG("Unfreeze error %d\n", bserrno);
    6449             :                 } else {
    6450           0 :                         ctx->bserrno = bserrno;
    6451             :                 }
    6452             :         }
    6453             : 
    6454         411 :         ctx->original.id = origblob->id;
    6455         411 :         origblob->locked_operation_in_progress = false;
    6456             : 
    6457             :         /* Revert md_ro to original state */
    6458         411 :         origblob->md_ro = ctx->original.md_ro;
    6459             : 
    6460         411 :         spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx);
    6461         411 : }
    6462             : 
    /*
     * Starts cleanup of the original blob.
     *
     * Has the (void *cb_arg, int bserrno) callback shape, so it is used both as
     * a completion (e.g. for spdk_blob_sync_md() and spdk_blob_close()) and as
     * a direct call on error paths.
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * bserrno - status to fold into the context. The first non-zero status is
     *           kept in ctx->bserrno; later ones are only logged.
     *
     * If ctx->frozen is set, I/O on the original blob is unfrozen first and
     * bs_snapshot_unfreeze_cpl() runs as that call's completion. Otherwise
     * bs_snapshot_unfreeze_cpl() is invoked right away with status 0.
     */
    6463             : static void
    6464         411 : bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno)
    6465             : {
    6466         411 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6467         411 :         struct spdk_blob *origblob = ctx->original.blob;
    6468             : 
    6469         411 :         if (bserrno != 0) {
    6470          30 :                 if (ctx->bserrno != 0) {
    6471           5 :                         SPDK_ERRLOG("Cleanup error %d\n", bserrno);
    6472             :                 } else {
    6473          25 :                         ctx->bserrno = bserrno;
    6474             :                 }
    6475             :         }
    6476             : 
    6477         411 :         if (ctx->frozen) {
    6478             :                 /* Unfreeze any outstanding I/O */
    6479         266 :                 blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx);
    6480             :         } else {
    6481         145 :                 bs_snapshot_unfreeze_cpl(ctx, 0);
    6482             :         }
    6483             : 
    6484         411 : }
    6485             : 
    /*
     * Starts cleanup of the newly created blob, then continues with the
     * original blob.
     *
     * ctx     - operation context; ctx->new.blob is dereferenced
     *           unconditionally, so it must already point at the opened new
     *           blob.
     * bserrno - status to fold into the context. The first non-zero status is
     *           kept in ctx->bserrno; later ones are only logged.
     *
     * Records the new blob's id in ctx->new.id and closes the new blob, with
     * bs_clone_snapshot_origblob_cleanup() as the close completion.
     */
    6486             : static void
    6487           5 : bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno)
    6488             : {
    6489           5 :         struct spdk_blob *newblob = ctx->new.blob;
    6490             : 
    6491           5 :         if (bserrno != 0) {
    6492           5 :                 if (ctx->bserrno != 0) {
    6493           0 :                         SPDK_ERRLOG("Cleanup error %d\n", bserrno);
    6494             :                 } else {
    6495           5 :                         ctx->bserrno = bserrno;
    6496             :                 }
    6497             :         }
    6498             : 
    6499           5 :         ctx->new.id = newblob->id;
    6500           5 :         spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
    6501           5 : }
    6502             : 
    6503             : /* END blob_cleanup */
    6504             : 
    6505             : /* START spdk_bs_create_snapshot */
    6506             : 
    /*
     * Exchanges the active cluster bookkeeping of two blobs.
     *
     * Three fields of ->active are swapped: the clusters pointer, the
     * num_allocated_clusters counter and the extent_pages pointer. Only the
     * pointers change hands; no array contents are copied. num_clusters and
     * num_extent_pages are not touched here.
     *
     * Calling the function a second time with the same pair puts both blobs
     * back into their previous state, which is how the snapshot error paths
     * undo the swap.
     */
    6507             : static void
    6508         276 : bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2)
    6509             : {
    6510             :         uint64_t *cluster_temp;
    6511             :         uint64_t num_allocated_clusters_temp;
    6512             :         uint32_t *extent_page_temp;
    6513             : 
    6514         276 :         cluster_temp = blob1->active.clusters;
    6515         276 :         blob1->active.clusters = blob2->active.clusters;
    6516         276 :         blob2->active.clusters = cluster_temp;
    6517             : 
    6518         276 :         num_allocated_clusters_temp = blob1->active.num_allocated_clusters;
    6519         276 :         blob1->active.num_allocated_clusters = blob2->active.num_allocated_clusters;
    6520         276 :         blob2->active.num_allocated_clusters = num_allocated_clusters_temp;
    6521             : 
    6522         276 :         extent_page_temp = blob1->active.extent_pages;
    6523         276 :         blob1->active.extent_pages = blob2->active.extent_pages;
    6524         276 :         blob2->active.extent_pages = extent_page_temp;
    6525         276 : }
    6526             : 
    /*
     * Reads the xattr called `name` from fromblob and stores the same value
     * under the same name on toblob.
     *
     * Both helper calls pass `true` as their last argument; presumably that
     * selects the internal xattr namespace (see the one-line comment below) -
     * TODO confirm against blob_get_xattr_value()/blob_set_xattr().
     *
     * Returns 0 on success. If the lookup or the store fails, an error is
     * logged and that call's error code is returned.
     */
    6527             : /* Copies an internal xattr */
    6528             : static int
    6529          25 : bs_snapshot_copy_xattr(struct spdk_blob *toblob, struct spdk_blob *fromblob, const char *name)
    6530             : {
    6531          25 :         const void      *val = NULL;
    6532             :         size_t          len;
    6533             :         int             bserrno;
    6534             : 
    6535          25 :         bserrno = blob_get_xattr_value(fromblob, name, &val, &len, true);
    6536          25 :         if (bserrno != 0) {
    6537           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 " missing %s XATTR\n", fromblob->id, name);
    6538           0 :                 return bserrno;
    6539             :         }
    6540             : 
    6541          25 :         bserrno = blob_set_xattr(toblob, name, val, len, true);
    6542          25 :         if (bserrno != 0) {
    6543           0 :                 SPDK_ERRLOG("could not set %s XATTR on blob 0x%" PRIx64 "\n",
    6544             :                             name, toblob->id);
    6545           0 :                 return bserrno;
    6546             :         }
    6547          25 :         return 0;
    6548             : }
    6549             : 
    /*
     * Completion of the metadata sync of the original blob that
     * bs_snapshot_newblob_sync_cpl() issues.
     *
     * On failure the cluster maps are swapped back. If the new blob is an esnap
     * clone, the BLOB_EXTERNAL_SNAPSHOT_ID xattr is copied back onto the
     * original blob and its SPDK_BLOB_EXTERNAL_SNAPSHOT invalid flag is set
     * again. Cleanup then starts from the original blob.
     *
     * On success the SNAPSHOT_IN_PROGRESS xattr is removed from the new blob,
     * the original blob is added to the blob list, the new blob is marked
     * read-only and its metadata is synced, with
     * bs_clone_snapshot_origblob_cleanup() as the completion.
     *
     * NOTE(review): the return value of bs_snapshot_copy_xattr() is ignored on
     * the failure path.
     * NOTE(review): neither error path in this function closes ctx->new.blob
     * (both go straight to the origblob cleanup) - confirm who releases it.
     * NOTE(review): the failure path does not visibly restore
     * origblob->back_bs_dev, origblob->parent_id or the thin-provision setting
     * changed in bs_snapshot_newblob_sync_cpl() - confirm whether that is
     * intended.
     */
    6550             : static void
    6551         261 : bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
    6552             : {
    6553         261 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6554         261 :         struct spdk_blob *origblob = ctx->original.blob;
    6555         261 :         struct spdk_blob *newblob = ctx->new.blob;
    6556             : 
    6557         261 :         if (bserrno != 0) {
    6558           5 :                 bs_snapshot_swap_cluster_maps(newblob, origblob);
    6559           5 :                 if (blob_is_esnap_clone(newblob)) {
    6560           0 :                         bs_snapshot_copy_xattr(origblob, newblob, BLOB_EXTERNAL_SNAPSHOT_ID);
    6561           0 :                         origblob->invalid_flags |= SPDK_BLOB_EXTERNAL_SNAPSHOT;
    6562             :                 }
    6563           5 :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
    6564           5 :                 return;
    6565             :         }
    6566             : 
    6567             :         /* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
    6568         256 :         bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
    6569         256 :         if (bserrno != 0) {
    6570           0 :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
    6571           0 :                 return;
    6572             :         }
    6573             : 
    6574         256 :         bs_blob_list_add(ctx->original.blob);
    6575             : 
    6576         256 :         spdk_blob_set_read_only(newblob);
    6577             : 
    6578             :         /* sync snapshot metadata */
    6579         256 :         spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
    6580             : }
    6581             : 
    /*
     * Completion of the metadata sync of the new (snapshot) blob that
     * bs_snapshot_freeze_cpl() issues after swapping the cluster maps.
     *
     * On success the original blob is turned into a clone of the new blob:
     *   1. the BLOB_SNAPSHOT xattr on the original is set to the new blob's id,
     *   2. a back_bs_dev backed by the new blob is created for the original,
     *   3. for an esnap clone, the SPDK_BLOB_EXTERNAL_SNAPSHOT flag and the
     *      BLOB_EXTERNAL_SNAPSHOT_ID xattr are dropped from the original,
     *   4. the original is removed from the blob list, gets the new blob as
     *      parent_id and is marked thin provisioned; the new blob is added to
     *      the blob list,
     *   5. the original's metadata is synced, with
     *      bs_snapshot_origblob_sync_cpl() as the completion.
     *
     * Every failure path swaps the cluster maps back, marks the new blob thin
     * provisioned again and goes through bs_clone_snapshot_newblob_cleanup().
     *
     * NOTE(review): in the -ENOENT branch execution continues past
     * assert(false) when asserts are compiled out.
     * NOTE(review): the esnap xattr-removal failure path runs after
     * origblob->back_bs_dev was replaced above and does not restore it -
     * confirm.
     */
    6582             : static void
    6583         266 : bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
    6584             : {
    6585         266 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6586         266 :         struct spdk_blob *origblob = ctx->original.blob;
    6587         266 :         struct spdk_blob *newblob = ctx->new.blob;
    6588             : 
    6589         266 :         if (bserrno != 0) {
    6590             :                 /* return cluster map back to original */
    6591           5 :                 bs_snapshot_swap_cluster_maps(newblob, origblob);
    6592             : 
    6593             :                 /* Newblob md sync failed. Valid clusters are only present in origblob.
    6594             :                  * Since I/O is frozen on origblob, no changes to the zeroed-out cluster map should have occurred.
    6595             :                  * Newblob needs to be reverted to thin_provisioned state at creation to properly close. */
    6596           5 :                 blob_set_thin_provision(newblob);
    6597           5 :                 assert(spdk_mem_all_zero(newblob->active.clusters,
    6598             :                                          newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
    6599           5 :                 assert(spdk_mem_all_zero(newblob->active.extent_pages,
    6600             :                                          newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));
    6601             : 
    6602           5 :                 bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
    6603           5 :                 return;
    6604             :         }
    6605             : 
    6606             :         /* Set internal xattr for snapshot id */
    6607         261 :         bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true);
    6608         261 :         if (bserrno != 0) {
    6609             :                 /* return cluster map back to original */
    6610           0 :                 bs_snapshot_swap_cluster_maps(newblob, origblob);
    6611           0 :                 blob_set_thin_provision(newblob);
    6612           0 :                 bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
    6613           0 :                 return;
    6614             :         }
    6615             : 
    6616             :         /* Create new back_bs_dev for snapshot */
    6617         261 :         origblob->back_bs_dev = bs_create_blob_bs_dev(newblob);
    6618         261 :         if (origblob->back_bs_dev == NULL) {
    6619             :                 /* return cluster map back to original */
    6620           0 :                 bs_snapshot_swap_cluster_maps(newblob, origblob);
    6621           0 :                 blob_set_thin_provision(newblob);
    6622           0 :                 bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL);
    6623           0 :                 return;
    6624             :         }
    6625             : 
    6626             :         /* Remove the xattr that references an external snapshot */
    6627         261 :         if (blob_is_esnap_clone(origblob)) {
    6628          15 :                 origblob->invalid_flags &= ~SPDK_BLOB_EXTERNAL_SNAPSHOT;
    6629          15 :                 bserrno = blob_remove_xattr(origblob, BLOB_EXTERNAL_SNAPSHOT_ID, true);
    6630          15 :                 if (bserrno != 0) {
    6631           0 :                         if (bserrno == -ENOENT) {
    6632           0 :                                 SPDK_ERRLOG("blob 0x%" PRIx64 " has no " BLOB_EXTERNAL_SNAPSHOT_ID
    6633             :                                             " xattr to remove\n", origblob->id);
    6634           0 :                                 assert(false);
    6635             :                         } else {
    6636             :                                 /* return cluster map back to original */
    6637           0 :                                 bs_snapshot_swap_cluster_maps(newblob, origblob);
    6638           0 :                                 blob_set_thin_provision(newblob);
    6639           0 :                                 bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
    6640           0 :                                 return;
    6641             :                         }
    6642             :                 }
    6643             :         }
    6644             : 
    6645         261 :         bs_blob_list_remove(origblob);
    6646         261 :         origblob->parent_id = newblob->id;
    6647             :         /* set clone blob as thin provisioned */
    6648         261 :         blob_set_thin_provision(origblob);
    6649             : 
    6650         261 :         bs_blob_list_add(newblob);
    6651             : 
    6652             :         /* sync clone metadata */
    6653         261 :         spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx);
    6654             : }
    6655             : 
    /*
     * Completion of blob_freeze_io() on the original blob; performs the actual
     * hand-over of data from the original blob to the new snapshot blob.
     *
     * cb_arg - the struct spdk_clone_snapshot_ctx of the running operation.
     * rc     - status of the freeze; non-zero aborts through
     *          bs_clone_snapshot_newblob_cleanup().
     *
     * Steps on success:
     *   - ctx->frozen is set so that cleanup later unfreezes I/O,
     *   - for an esnap clone, the bs_dev channels of the original are destroyed,
     *   - the new blob's own back_bs_dev (if any) is destroyed and replaced by
     *     the original's, and invalid_flags and parent_id are inherited,
     *   - depending on the parent id, either the external snapshot id xattr is
     *     copied or the BLOB_SNAPSHOT xattr is set on the new blob,
     *   - the cluster maps are swapped and the clear method is copied,
     *   - the new blob's metadata is synced, with
     *     bs_snapshot_newblob_sync_cpl() as the completion.
     *
     * NOTE(review): on the two xattr failure paths inside the switch,
     * newblob->back_bs_dev already aliases origblob->back_bs_dev when the new
     * blob is closed by the cleanup - confirm the close path cannot destroy a
     * device the original blob still uses.
     */
    6656             : static void
    6657         266 : bs_snapshot_freeze_cpl(void *cb_arg, int rc)
    6658             : {
    6659         266 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6660         266 :         struct spdk_blob *origblob = ctx->original.blob;
    6661         266 :         struct spdk_blob *newblob = ctx->new.blob;
    6662             :         int bserrno;
    6663             : 
    6664         266 :         if (rc != 0) {
    6665           0 :                 bs_clone_snapshot_newblob_cleanup(ctx, rc);
    6666           0 :                 return;
    6667             :         }
    6668             : 
    6669         266 :         ctx->frozen = true;
    6670             : 
    6671         266 :         if (blob_is_esnap_clone(origblob)) {
    6672             :                 /* Clean up any channels associated with the original blob id because future IO will
    6673             :                  * perform IO using the snapshot blob_id.
    6674             :                  */
    6675          15 :                 blob_esnap_destroy_bs_dev_channels(origblob, false, NULL, NULL);
    6676             :         }
    6677         266 :         if (newblob->back_bs_dev) {
    6678         266 :                 blob_back_bs_destroy(newblob);
    6679             :         }
    6680             :         /* set new back_bs_dev for snapshot */
    6681         266 :         newblob->back_bs_dev = origblob->back_bs_dev;
    6682             :         /* Set invalid flags from origblob */
    6683         266 :         newblob->invalid_flags = origblob->invalid_flags;
    6684             : 
    6685             :         /* inherit parent from original blob if set */
    6686         266 :         newblob->parent_id = origblob->parent_id;
    6687         266 :         switch (origblob->parent_id) {
    6688          15 :         case SPDK_BLOBID_EXTERNAL_SNAPSHOT:
    6689          15 :                 bserrno = bs_snapshot_copy_xattr(newblob, origblob, BLOB_EXTERNAL_SNAPSHOT_ID);
    6690          15 :                 if (bserrno != 0) {
    6691           0 :                         bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
    6692           0 :                         return;
    6693             :                 }
    6694          15 :                 break;
    6695         186 :         case SPDK_BLOBID_INVALID:
    6696         186 :                 break;
    6697          65 :         default:
    6698             :                 /* Set internal xattr for snapshot id */
    6699          65 :                 bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT,
    6700          65 :                                          &origblob->parent_id, sizeof(spdk_blob_id), true);
    6701          65 :                 if (bserrno != 0) {
    6702           0 :                         bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
    6703           0 :                         return;
    6704             :                 }
    6705             :         }
    6706             : 
    6707             :         /* swap cluster maps */
    6708         266 :         bs_snapshot_swap_cluster_maps(newblob, origblob);
    6709             : 
    6710             :         /* Set the clear method on the new blob to match the original. */
    6711         266 :         blob_set_clear_method(newblob, origblob->clear_method);
    6712             : 
    6713             :         /* sync snapshot metadata */
    6714         266 :         spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx);
    6715             : }
    6716             : 
    /*
     * Completion of spdk_bs_open_blob() for the freshly created snapshot blob.
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * _blob   - the opened new blob; only used when bserrno is 0.
     * bserrno - open status; non-zero starts cleanup of the original blob.
     *
     * Stores the blob in ctx->new.blob, asserts that it is thin provisioned
     * with all-zero cluster and extent-page arrays, and then freezes I/O on the
     * original blob; bs_snapshot_freeze_cpl() continues from there.
     */
    6717             : static void
    6718         271 : bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
    6719             : {
    6720         271 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6721         271 :         struct spdk_blob *origblob = ctx->original.blob;
    6722         271 :         struct spdk_blob *newblob = _blob;
    6723             : 
    6724         271 :         if (bserrno != 0) {
    6725           5 :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
    6726           5 :                 return;
    6727             :         }
    6728             : 
    6729         266 :         ctx->new.blob = newblob;
    6730         266 :         assert(spdk_blob_is_thin_provisioned(newblob));
    6731         266 :         assert(spdk_mem_all_zero(newblob->active.clusters,
    6732             :                                  newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
    6733         266 :         assert(spdk_mem_all_zero(newblob->active.extent_pages,
    6734             :                                  newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));
    6735             : 
    6736         266 :         blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx);
    6737             : }
    6738             : 
    /*
     * Completion of bs_create_blob() for the snapshot blob.
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * blobid  - id of the created blob; only used when bserrno is 0.
     * bserrno - creation status; non-zero starts cleanup of the original blob.
     *
     * Records the id both in ctx->new.id and in the completion that is handed
     * to the user (ctx->cpl.u.blobid.blobid), then opens the new blob;
     * bs_snapshot_newblob_open_cpl() continues from there.
     */
    6739             : static void
    6740         276 : bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
    6741             : {
    6742         276 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6743         276 :         struct spdk_blob *origblob = ctx->original.blob;
    6744             : 
    6745         276 :         if (bserrno != 0) {
    6746           5 :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
    6747           5 :                 return;
    6748             :         }
    6749             : 
    6750         271 :         ctx->new.id = blobid;
    6751         271 :         ctx->cpl.u.blobid.blobid = blobid;
    6752             : 
    6753         271 :         spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx);
    6754             : }
    6755             : 
    6756             : 
    /*
     * get_value callback for the internal SNAPSHOT_IN_PROGRESS xattr.
     *
     * bs_snapshot_origblob_open_cpl() installs it with the original blob as the
     * callback context, so the value reported is that blob's id.
     *
     * arg       - the struct spdk_blob registered as xattr context.
     * name      - xattr being queried; asserted to be SNAPSHOT_IN_PROGRESS.
     * value     - out: set to the address of blob->id (no copy is made here).
     * value_len - out: set to sizeof(blob->id).
     */
    6757             : static void
    6758         276 : bs_xattr_snapshot(void *arg, const char *name,
    6759             :                   const void **value, size_t *value_len)
    6760             : {
    6761         276 :         assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0);
    6762             : 
    6763         276 :         struct spdk_blob *blob = (struct spdk_blob *)arg;
    6764         276 :         *value = &blob->id;
    6765         276 :         *value_len = sizeof(blob->id);
    6766         276 : }
    6767             : 
    /*
     * Completion of spdk_bs_open_blob() for the blob a snapshot is taken of.
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * _blob   - the opened original blob; only used when bserrno is 0.
     * bserrno - open status; non-zero finishes the operation immediately via
     *           bs_clone_snapshot_cleanup_finish().
     *
     * Rejects the request with -EINVAL when the blob is data_ro or md_ro, and
     * with -EBUSY when another locked operation is in progress; in both cases
     * the blob is closed and the error reported through ctx->bserrno.
     *
     * Otherwise the blob is marked locked_operation_in_progress and a thin
     * provisioned blob with the same cluster count and extent-table setting is
     * created. User xattrs from ctx->xattrs (if any) are copied into the
     * options, and SNAPSHOT_IN_PROGRESS is passed as internal xattr with
     * bs_xattr_snapshot() as value provider.
     *
     * NOTE(review): opts, internal_xattrs and xattrs_names are locals handed to
     * bs_create_blob() by pointer - presumably consumed before it returns;
     * confirm.
     */
    6768             : static void
    6769         289 : bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
    6770             : {
    6771         289 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6772             :         struct spdk_blob_opts opts;
    6773             :         struct spdk_blob_xattr_opts internal_xattrs;
    6774         289 :         char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS };
    6775             : 
    6776         289 :         if (bserrno != 0) {
    6777           8 :                 bs_clone_snapshot_cleanup_finish(ctx, bserrno);
    6778          13 :                 return;
    6779             :         }
    6780             : 
    6781         281 :         ctx->original.blob = _blob;
    6782             : 
    6783         281 :         if (_blob->data_ro || _blob->md_ro) {
    6784           5 :                 SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id 0x%"
    6785             :                               PRIx64 "\n", _blob->id);
    6786           5 :                 ctx->bserrno = -EINVAL;
    6787           5 :                 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
    6788           5 :                 return;
    6789             :         }
    6790             : 
    6791         276 :         if (_blob->locked_operation_in_progress) {
    6792           0 :                 SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n");
    6793           0 :                 ctx->bserrno = -EBUSY;
    6794           0 :                 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
    6795           0 :                 return;
    6796             :         }
    6797             : 
    6798         276 :         _blob->locked_operation_in_progress = true;
    6799             : 
    6800         276 :         spdk_blob_opts_init(&opts, sizeof(opts));
    6801         276 :         blob_xattrs_init(&internal_xattrs);
    6802             : 
    6803             :         /* Change the size of new blob to the same as in original blob,
    6804             :          * but do not allocate clusters */
    6805         276 :         opts.thin_provision = true;
    6806         276 :         opts.num_clusters = spdk_blob_get_num_clusters(_blob);
    6807         276 :         opts.use_extent_table = _blob->use_extent_table;
    6808             : 
    6809             :         /* If there are any xattrs specified for snapshot, set them now */
    6810         276 :         if (ctx->xattrs) {
    6811           5 :                 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
    6812             :         }
    6813             :         /* Set internal xattr SNAPSHOT_IN_PROGRESS */
    6814         276 :         internal_xattrs.count = 1;
    6815         276 :         internal_xattrs.ctx = _blob;
    6816         276 :         internal_xattrs.names = xattrs_names;
    6817         276 :         internal_xattrs.get_value = bs_xattr_snapshot;
    6818             : 
    6819         276 :         bs_create_blob(_blob->bs, &opts, &internal_xattrs,
    6820             :                        bs_snapshot_newblob_create_cpl, ctx);
    6821             : }
    6822             : 
    /*
     * Entry point for creating a snapshot of an existing blob.
     *
     * bs              - blobstore that holds the blob.
     * blobid          - id of the blob to snapshot.
     * snapshot_xattrs - optional xattrs for the snapshot; may be NULL. Only the
     *                   pointer is stored here and it is read again in
     *                   bs_snapshot_origblob_open_cpl(), so it has to stay
     *                   valid until that completion has run.
     * cb_fn, cb_arg   - user completion; receives the snapshot's blob id (or
     *                   SPDK_BLOBID_INVALID) and a status code.
     *
     * Allocates the operation context and opens the original blob. If the
     * allocation fails, cb_fn is called straight away with SPDK_BLOBID_INVALID
     * and -ENOMEM. All further work happens in bs_snapshot_origblob_open_cpl()
     * and the completions chained after it.
     */
    6823             : void
    6824         289 : spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid,
    6825             :                         const struct spdk_blob_xattr_opts *snapshot_xattrs,
    6826             :                         spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
    6827             : {
    6828         289 :         struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
    6829             : 
    6830         289 :         if (!ctx) {
    6831           0 :                 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
    6832           0 :                 return;
    6833             :         }
    6834         289 :         ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
    6835         289 :         ctx->cpl.u.blobid.cb_fn = cb_fn;
    6836         289 :         ctx->cpl.u.blobid.cb_arg = cb_arg;
    6837         289 :         ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
    6838         289 :         ctx->bserrno = 0;
    6839         289 :         ctx->frozen = false;
    6840         289 :         ctx->original.id = blobid;
    6841         289 :         ctx->xattrs = snapshot_xattrs;
    6842             : 
    6843         289 :         spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx);
    6844             : }
    6845             : /* END spdk_bs_create_snapshot */
    6846             : 
    6847             : /* START spdk_bs_create_clone */
    6848             : 
    /*
     * get_value callback for the internal BLOB_SNAPSHOT xattr of a new clone.
     *
     * bs_clone_origblob_open_cpl() installs it with the source blob as the
     * callback context, so the value reported is the id of the blob being
     * cloned.
     *
     * arg       - the struct spdk_blob registered as xattr context.
     * name      - xattr being queried; asserted to be BLOB_SNAPSHOT.
     * value     - out: set to the address of blob->id (no copy is made here).
     * value_len - out: set to sizeof(blob->id).
     */
    6849             : static void
    6850          60 : bs_xattr_clone(void *arg, const char *name,
    6851             :                const void **value, size_t *value_len)
    6852             : {
    6853          60 :         assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0);
    6854             : 
    6855          60 :         struct spdk_blob *blob = (struct spdk_blob *)arg;
    6856          60 :         *value = &blob->id;
    6857          60 :         *value_len = sizeof(blob->id);
    6858          60 : }
    6859             : 
    /*
     * Completion of spdk_bs_open_blob() for the freshly created clone.
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * _blob   - the opened clone; only used when bserrno is 0.
     * bserrno - open status. A non-zero status starts cleanup of the original
     *           blob, the same way bs_snapshot_newblob_open_cpl() does, so the
     *           blob handle is never touched after a failed open.
     *
     * On success the clone is stored in ctx->new.blob, added to the blob list
     * and closed again; bs_clone_snapshot_origblob_cleanup() runs as the close
     * completion and finishes the operation.
     *
     * Lines with an empty coverage gutter were added by review and carry no
     * coverage data.
     */
    6860             : static void
    6861          60 : bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
    6862             : {
    6863          60 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6864          60 :         struct spdk_blob *clone = _blob;
    6865             : 
                     :         if (bserrno != 0) {
                     :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
                     :                 return;
                     :         }
                     : 
    6866          60 :         ctx->new.blob = clone;
    6867          60 :         bs_blob_list_add(clone);
    6868             : 
    6869          60 :         spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx);
    6870          60 : }
    6871             : 
    /*
     * Completion of bs_create_blob() for the clone.
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * blobid  - id of the created clone; only used when bserrno is 0.
     * bserrno - creation status. A non-zero status starts cleanup of the
     *           original blob, the same way bs_snapshot_newblob_create_cpl()
     *           does, instead of trying to open a blob that was never created.
     *
     * On success the id is stored in the completion handed to the user and the
     * clone is opened; bs_clone_newblob_open_cpl() continues from there.
     *
     * Lines with an empty coverage gutter were added by review and carry no
     * coverage data.
     */
    6872             : static void
    6873          60 : bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
    6874             : {
    6875          60 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6876             : 
                     :         if (bserrno != 0) {
                     :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
                     :                 return;
                     :         }
                     : 
    6877          60 :         ctx->cpl.u.blobid.blobid = blobid;
    6878          60 :         spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx);
    6879          60 : }
    6880             : 
    /*
     * Completion of spdk_bs_open_blob() for the blob that is being cloned.
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * _blob   - the opened source blob; only used when bserrno is 0.
     * bserrno - open status; non-zero finishes the operation immediately via
     *           bs_clone_snapshot_cleanup_finish().
     *
     * The source blob and its current md_ro value are saved in ctx->original;
     * bs_snapshot_unfreeze_cpl() writes that md_ro value back during cleanup.
     *
     * A clone is only allowed from a blob that is both data_ro and md_ro;
     * anything else is rejected with -EINVAL. A blob with another locked
     * operation in progress is rejected with -EBUSY. In both cases the blob is
     * closed and the error reported through ctx->bserrno.
     *
     * Otherwise the blob is marked locked_operation_in_progress and a thin
     * provisioned blob with the same cluster count and extent-table setting is
     * created. User xattrs from ctx->xattrs (if any) are copied into the
     * options, and BLOB_SNAPSHOT is passed as internal xattr with
     * bs_xattr_clone() as value provider.
     *
     * NOTE(review): opts, internal_xattrs and xattr_names are locals handed to
     * bs_create_blob() by pointer - presumably consumed before it returns;
     * confirm.
     */
    6881             : static void
    6882          65 : bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
    6883             : {
    6884          65 :         struct spdk_clone_snapshot_ctx  *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6885             :         struct spdk_blob_opts           opts;
    6886             :         struct spdk_blob_xattr_opts internal_xattrs;
    6887          65 :         char *xattr_names[] = { BLOB_SNAPSHOT };
    6888             : 
    6889          65 :         if (bserrno != 0) {
    6890           0 :                 bs_clone_snapshot_cleanup_finish(ctx, bserrno);
    6891           5 :                 return;
    6892             :         }
    6893             : 
    6894          65 :         ctx->original.blob = _blob;
    6895          65 :         ctx->original.md_ro = _blob->md_ro;
    6896             : 
    6897          65 :         if (!_blob->data_ro || !_blob->md_ro) {
    6898           5 :                 SPDK_DEBUGLOG(blob, "Clone not from read-only blob\n");
    6899           5 :                 ctx->bserrno = -EINVAL;
    6900           5 :                 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
    6901           5 :                 return;
    6902             :         }
    6903             : 
    6904          60 :         if (_blob->locked_operation_in_progress) {
    6905           0 :                 SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n");
    6906           0 :                 ctx->bserrno = -EBUSY;
    6907           0 :                 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
    6908           0 :                 return;
    6909             :         }
    6910             : 
    6911          60 :         _blob->locked_operation_in_progress = true;
    6912             : 
    6913          60 :         spdk_blob_opts_init(&opts, sizeof(opts));
    6914          60 :         blob_xattrs_init(&internal_xattrs);
    6915             : 
    6916          60 :         opts.thin_provision = true;
    6917          60 :         opts.num_clusters = spdk_blob_get_num_clusters(_blob);
    6918          60 :         opts.use_extent_table = _blob->use_extent_table;
    6919          60 :         if (ctx->xattrs) {
    6920           5 :                 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
    6921             :         }
    6922             : 
    6923             :         /* Set internal xattr BLOB_SNAPSHOT */
    6924          60 :         internal_xattrs.count = 1;
    6925          60 :         internal_xattrs.ctx = _blob;
    6926          60 :         internal_xattrs.names = xattr_names;
    6927          60 :         internal_xattrs.get_value = bs_xattr_clone;
    6928             : 
    6929          60 :         bs_create_blob(_blob->bs, &opts, &internal_xattrs,
    6930             :                        bs_clone_newblob_create_cpl, ctx);
    6931             : }
    6932             : 
    /*
     * Entry point for creating a clone of an existing blob.
     *
     * bs            - blobstore that holds the blob.
     * blobid        - id of the blob to clone. bs_clone_origblob_open_cpl()
     *                 rejects it with -EINVAL unless it is both data_ro and
     *                 md_ro.
     * clone_xattrs  - optional xattrs for the clone; may be NULL. Only the
     *                 pointer is stored here and it is read again in
     *                 bs_clone_origblob_open_cpl(), so it has to stay valid
     *                 until that completion has run.
     * cb_fn, cb_arg - user completion; receives the clone's blob id (or
     *                 SPDK_BLOBID_INVALID) and a status code.
     *
     * Allocates the operation context and opens the source blob. If the
     * allocation fails, cb_fn is called straight away with SPDK_BLOBID_INVALID
     * and -ENOMEM. ctx->frozen is left at the zero value calloc() provides, so
     * the cleanup chain skips the unfreeze step for clones.
     */
    6933             : void
    6934          65 : spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid,
    6935             :                      const struct spdk_blob_xattr_opts *clone_xattrs,
    6936             :                      spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
    6937             : {
    6938          65 :         struct spdk_clone_snapshot_ctx  *ctx = calloc(1, sizeof(*ctx));
    6939             : 
    6940          65 :         if (!ctx) {
    6941           0 :                 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
    6942           0 :                 return;
    6943             :         }
    6944             : 
    6945          65 :         ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
    6946          65 :         ctx->cpl.u.blobid.cb_fn = cb_fn;
    6947          65 :         ctx->cpl.u.blobid.cb_arg = cb_arg;
    6948          65 :         ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
    6949          65 :         ctx->bserrno = 0;
    6950          65 :         ctx->xattrs = clone_xattrs;
    6951          65 :         ctx->original.id = blobid;
    6952             : 
    6953          65 :         spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx);
    6954             : }
    6955             : 
    6956             : /* END spdk_bs_create_clone */
    6957             : 
    6958             : /* START spdk_bs_inflate_blob */
    6959             : 
    /*
     * Completion of spdk_bs_open_blob() for the new parent of a blob whose
     * current parent is being decoupled (issued from bs_inflate_blob_done()).
     *
     * cb_arg  - the struct spdk_clone_snapshot_ctx of the running operation.
     * _parent - the opened new parent; only used when bserrno is 0.
     * bserrno - open status; non-zero starts cleanup of the original blob.
     *
     * On success md_ro is cleared on the blob so its metadata can be changed
     * (bs_snapshot_unfreeze_cpl() restores it from ctx->original.md_ro), the
     * BLOB_SNAPSHOT xattr and parent_id are pointed at the new parent, the
     * back_bs_dev is replaced by one backed by the new parent, and the
     * metadata is synced with bs_clone_snapshot_origblob_cleanup() as the
     * completion.
     *
     * The _parent assertion now runs before the first dereference of _parent;
     * previously it came after _parent->id had already been read. Lines with
     * an empty coverage gutter were added by review and carry no coverage
     * data.
     *
     * NOTE(review): _parent is not closed when blob_set_xattr() fails -
     * confirm whether that leaks the open reference.
     * NOTE(review): the result of bs_create_blob_bs_dev() is not checked for
     * NULL here, unlike in bs_snapshot_newblob_sync_cpl().
     */
    6960             : static void
    6961          15 : bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno)
    6962             : {
    6963          15 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    6964          15 :         struct spdk_blob *_blob = ctx->original.blob;
    6965             : 
    6966          15 :         if (bserrno != 0) {
    6967           0 :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
    6968           0 :                 return;
    6969             :         }
    6970             : 
                     :         /* Validate the handle before its id is read below. */
                     :         assert(_parent != NULL);
                     : 
    6971             :         /* Temporarily override md_ro flag for MD modification */
    6972          15 :         _blob->md_ro = false;
    6973             : 
    6974          15 :         bserrno = blob_set_xattr(_blob, BLOB_SNAPSHOT, &_parent->id, sizeof(spdk_blob_id), true);
    6975          15 :         if (bserrno != 0) {
    6976           0 :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
    6977           0 :                 return;
    6978             :         }
    6979             : 
    6982          15 :         bs_blob_list_remove(_blob);
    6983          15 :         _blob->parent_id = _parent->id;
    6984             : 
    6985          15 :         blob_back_bs_destroy(_blob);
    6986          15 :         _blob->back_bs_dev = bs_create_blob_bs_dev(_parent);
    6987          15 :         bs_blob_list_add(_blob);
    6988             : 
    6989          15 :         spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
    6990             : }
    6991             : 
    /*
     * Final step of inflate/decouple, called from bs_inflate_blob_touch_next()
     * once no cluster is left to process.
     *  - allocate_all: drop the parent reference (snapshot or external-snapshot
     *    xattr), clear SPDK_BLOB_THIN_PROV, destroy the back device and mark
     *    the blob parentless.
     *  - otherwise: if the current parent itself has a parent, open that blob
     *    and let bs_inflate_blob_set_parent_cpl() finish the job; if it has
     *    none, make the blob parentless on top of a zeroes device.
     * Both synchronous paths end by marking the blob dirty and syncing its
     * metadata, completing through bs_clone_snapshot_origblob_cleanup().
     */
    6992             : static void
    6993          70 : bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx)
    6994             : {
    6995          70 :         struct spdk_blob *_blob = ctx->original.blob;
    6996             :         struct spdk_blob *_parent;
    6997             : 
    6998          70 :         if (ctx->allocate_all) {
    6999             :                 /* remove thin provisioning */
    7000          40 :                 bs_blob_list_remove(_blob);
    7001          40 :                 if (_blob->parent_id == SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
    7002          10 :                         blob_remove_xattr(_blob, BLOB_EXTERNAL_SNAPSHOT_ID, true);
    7003          10 :                         _blob->invalid_flags &= ~SPDK_BLOB_EXTERNAL_SNAPSHOT;
    7004             :                 } else {
    7005          30 :                         blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
    7006             :                 }
    7007          40 :                 _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV;
    7008          40 :                 blob_back_bs_destroy(_blob);
    7009          40 :                 _blob->parent_id = SPDK_BLOBID_INVALID;
    7010             :         } else {
    7011             :                 /* For now, esnap clones always have allocate_all set. */
    7012          30 :                 assert(!blob_is_esnap_clone(_blob));
    7013             : 
                                       /* Not an esnap clone here, so the back device wraps the parent blob. */
    7014          30 :                 _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob;
    7015          30 :                 if (_parent->parent_id != SPDK_BLOBID_INVALID) {
    7016             :                         /* We must change the parent of the inflated blob */
    7017          15 :                         spdk_bs_open_blob(_blob->bs, _parent->parent_id,
    7018             :                                           bs_inflate_blob_set_parent_cpl, ctx);
    7019          15 :                         return;
    7020             :                 }
    7021             : 
    7022          15 :                 bs_blob_list_remove(_blob);
    7023          15 :                 _blob->parent_id = SPDK_BLOBID_INVALID;
    7024          15 :                 blob_back_bs_destroy(_blob);
    7025          15 :                 _blob->back_bs_dev = bs_create_zeroes_dev();
    7026             :         }
    7027             : 
    7028             :         /* Temporarily override md_ro flag for MD modification */
    7029          55 :         _blob->md_ro = false;
                               /*
                                * NOTE(review): on the allocate_all, non-esnap path BLOB_SNAPSHOT was
                                * already removed above; the return value is ignored here, so the
                                * repeated removal looks harmless - confirm.
                                */
    7030          55 :         blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
    7031          55 :         _blob->state = SPDK_BLOB_STATE_DIRTY;
    7032             : 
    7033          55 :         spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
    7034             : }
    7035             : 
    /*
     * Decide whether `cluster` of `blob` must be allocated by inflate/decouple.
     *  - already allocated in the blob               -> false
     *  - blob has no parent                          -> allocate_all
     *  - blob's parent is an external snapshot       -> true
     *  - otherwise -> allocate_all, or true when the parent blob behind
     *    back_bs_dev has this cluster allocated.
     * NOTE(review): assumes the parent's cluster array covers index `cluster`
     * - TODO confirm against callers.
     */
    7036             : /* Check if cluster needs allocation */
    7037             : static inline bool
    7038        1500 : bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all)
    7039             : {
    7040             :         struct spdk_blob_bs_dev *b;
    7041             : 
    7042        1500 :         assert(blob != NULL);
    7043             : 
    7044        1500 :         if (blob->active.clusters[cluster] != 0) {
    7045             :                 /* Cluster is already allocated */
    7046          40 :                 return false;
    7047             :         }
    7048             : 
    7049        1460 :         if (blob->parent_id == SPDK_BLOBID_INVALID) {
    7050             :                 /* Blob has no parent blob */
    7051         100 :                 return allocate_all;
    7052             :         }
    7053             : 
    7054        1360 :         if (blob->parent_id == SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
                                       /* Every unallocated cluster of an esnap clone is treated as needing a copy. */
    7055          80 :                 return true;
    7056             :         }
    7057             : 
    7058        1280 :         b = (struct spdk_blob_bs_dev *)blob->back_bs_dev;
    7059        1280 :         return (allocate_all || b->blob->active.clusters[cluster] != 0);
    7060             : }
    7061             : 
    /*
     * One step of the inflate loop; also the completion callback of each
     * cluster copy. Skips clusters that need no allocation, then starts
     * bs_allocate_and_copy_cluster() for the next one, using a zero-length
     * read user op whose completion re-enters this function. When ctx->cluster
     * reaches the blob's cluster count, bs_inflate_blob_done() is called.
     * Any error ends the operation via bs_clone_snapshot_origblob_cleanup().
     */
    7062             : static void
    7063         635 : bs_inflate_blob_touch_next(void *cb_arg, int bserrno)
    7064             : {
    7065         635 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    7066         635 :         struct spdk_blob *_blob = ctx->original.blob;
    7067             :         struct spdk_bs_cpl cpl;
    7068             :         spdk_bs_user_op_t *op;
    7069             :         uint64_t offset;
    7070             : 
    7071         635 :         if (bserrno != 0) {
    7072           0 :                 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
    7073           0 :                 return;
    7074             :         }
    7075             : 
                               /* Advance ctx->cluster to the next cluster that needs allocation, if any. */
    7076         820 :         for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) {
    7077         750 :                 if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) {
    7078         565 :                         break;
    7079             :                 }
    7080             :         }
    7081             : 
    7082         635 :         if (ctx->cluster < _blob->active.num_clusters) {
    7083         565 :                 offset = bs_cluster_to_lba(_blob->bs, ctx->cluster);
    7084             : 
    7085             :                 /* We may safely increment a cluster before copying: offset is already computed */
    7086         565 :                 ctx->cluster++;
    7087             : 
    7088             :                 /* Use a dummy 0B read as a context for cluster copy */
    7089         565 :                 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    7090         565 :                 cpl.u.blob_basic.cb_fn = bs_inflate_blob_touch_next;
    7091         565 :                 cpl.u.blob_basic.cb_arg = ctx;
    7092             : 
    7093         565 :                 op = bs_user_op_alloc(ctx->channel, &cpl, SPDK_BLOB_READ, _blob,
    7094             :                                       NULL, 0, offset, 0);
    7095         565 :                 if (!op) {
    7096           0 :                         bs_clone_snapshot_origblob_cleanup(ctx, -ENOMEM);
    7097           0 :                         return;
    7098             :                 }
    7099             : 
    7100         565 :                 bs_allocate_and_copy_cluster(_blob, ctx->channel, offset, op);
    7101             :         } else {
    7102          70 :                 bs_inflate_blob_done(ctx);
    7103             :         }
    7104             : }
    7105             : 
    /*
     * Blob-open completion that starts an inflate/decouple operation on _blob.
     * Records the blob and its md_ro flag in ctx, takes the
     * locked_operation_in_progress flag (-EBUSY if it is already set),
     * validates the request against the blob's parent type, checks that the
     * blobstore has enough free clusters and then starts the per-cluster loop
     * in bs_inflate_blob_touch_next().
     */
    7106             : static void
    7107          75 : bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
    7108             : {
    7109          75 :         struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
    7110             :         uint64_t clusters_needed;
    7111             :         uint64_t i;
    7112             : 
    7113          75 :         if (bserrno != 0) {
    7114           0 :                 bs_clone_snapshot_cleanup_finish(ctx, bserrno);
    7115           0 :                 return;
    7116             :         }
    7117             : 
    7118          75 :         ctx->original.blob = _blob;
    7119          75 :         ctx->original.md_ro = _blob->md_ro;
    7120             : 
    7121          75 :         if (_blob->locked_operation_in_progress) {
    7122           0 :                 SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n");
    7123           0 :                 ctx->bserrno = -EBUSY;
                                       /* The lock flag belongs to the other operation: only close the blob here. */
    7124           0 :                 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
    7125           0 :                 return;
    7126             :         }
    7127             : 
    7128          75 :         _blob->locked_operation_in_progress = true;
    7129             : 
    7130          75 :         switch (_blob->parent_id) {
    7131          10 :         case SPDK_BLOBID_INVALID:
    7132          10 :                 if (!ctx->allocate_all) {
    7133             :                         /* This blob has no parent, so we cannot decouple it. */
    7134           5 :                         SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n");
    7135           5 :                         bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL);
    7136           5 :                         return;
    7137             :                 }
    7138           5 :                 break;
    7139          10 :         case SPDK_BLOBID_EXTERNAL_SNAPSHOT:
    7140             :                 /*
    7141             :                  * It would be better to rely on back_bs_dev->is_zeroes(), to determine which
    7142             :                  * clusters require allocation. Until there is a blobstore consumer that
    7143             :                  * uses esnaps with an spdk_bs_dev that implements a useful is_zeroes() it is not
    7144             :                  * worth the effort.
    7145             :                  */
    7146          10 :                 ctx->allocate_all = true;
    7147          10 :                 break;
    7148          55 :         default:
    7149          55 :                 break;
    7150             :         }
    7151             : 
    7152          70 :         if (spdk_blob_is_thin_provisioned(_blob) == false) {
    7153             :                 /* This is not a thin provisioned blob. No need to inflate. */
    7154           0 :                 bs_clone_snapshot_origblob_cleanup(ctx, 0);
    7155           0 :                 return;
    7156             :         }
    7157             : 
    7158             :         /* Do two passes - one to verify that we can obtain enough clusters
    7159             :          * and another to actually claim them (the loop in bs_inflate_blob_touch_next()).
    7160             :          */
    7161          70 :         clusters_needed = 0;
    7162         820 :         for (i = 0; i < _blob->active.num_clusters; i++) {
    7163         750 :                 if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) {
    7164         565 :                         clusters_needed++;
    7165             :                 }
    7166             :         }
    7167             : 
    7168          70 :         if (clusters_needed > _blob->bs->num_free_clusters) {
    7169             :                 /* Not enough free clusters. Cannot satisfy the request. */
    7170           0 :                 bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC);
    7171           0 :                 return;
    7172             :         }
    7173             : 
    7174          70 :         ctx->cluster = 0;
    7175          70 :         bs_inflate_blob_touch_next(ctx, 0);
    7176             : }
    7177             : 
    /*
     * Common entry point for inflate (allocate_all == true) and decouple-parent
     * (allocate_all == false). Allocates the operation context, stores the
     * user completion and opens the blob; the work continues in
     * bs_inflate_blob_open_cpl(). If the context cannot be allocated, cb_fn is
     * called directly with -ENOMEM.
     */
    7178             : static void
    7179          75 : bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
    7180             :                 spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg)
    7181             : {
    7182          75 :         struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
    7183             : 
    7184          75 :         if (!ctx) {
    7185           0 :                 cb_fn(cb_arg, -ENOMEM);
    7186           0 :                 return;
    7187             :         }
                               /*
                                * NOTE(review): the completion type is BLOB_BASIC but the callback is
                                * stored through the bs_basic union member - confirm the code that
                                * consumes ctx->cpl reads a layout-compatible member.
                                */
    7188          75 :         ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    7189          75 :         ctx->cpl.u.bs_basic.cb_fn = cb_fn;
    7190          75 :         ctx->cpl.u.bs_basic.cb_arg = cb_arg;
    7191          75 :         ctx->bserrno = 0;
    7192          75 :         ctx->original.id = blobid;
    7193          75 :         ctx->channel = channel;
    7194          75 :         ctx->allocate_all = allocate_all;
    7195             : 
    7196          75 :         spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx);
    7197             : }
    7198             : 
    /*
     * Public entry point: inflate blob `blobid`. Forwards to bs_inflate_blob()
     * with allocate_all set to true; the result is reported through cb_fn.
     */
    7199             : void
    7200          35 : spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
    7201             :                      spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
    7202             : {
    7203          35 :         bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg);
    7204          35 : }
    7205             : 
    /*
     * Public entry point: decouple blob `blobid` from its parent. Forwards to
     * bs_inflate_blob() with allocate_all set to false; the result is reported
     * through cb_fn.
     */
    7206             : void
    7207          40 : spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
    7208             :                              spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
    7209             : {
    7210          40 :         bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg);
    7211          40 : }
    7212             : /* END spdk_bs_inflate_blob */
    7213             : 
    7214             : /* START spdk_bs_blob_shallow_copy */
    7215             : 
    /*
     * State of one spdk_bs_blob_shallow_copy() operation. Allocated with
     * calloc() by the entry point and freed in bs_shallow_copy_cleanup_finish().
     */
    7216             : struct shallow_copy_ctx {
                               /* User completion, invoked from bs_shallow_copy_cleanup_finish() */
    7217             :         struct spdk_bs_cpl cpl;
                               /* Status handed to the user completion */
    7218             :         int bserrno;
    7219             : 
    7220             :         /* Blob source for copy */
    7221             :         struct spdk_blob_store *bs;
    7222             :         spdk_blob_id blobid;
                               /* Stays NULL (calloc) until the open completion has validated the blob */
    7223             :         struct spdk_blob *blob;
    7224             :         struct spdk_io_channel *blob_channel;
    7225             : 
    7226             :         /* Destination device for copy */
    7227             :         struct spdk_bs_dev *ext_dev;
    7228             :         struct spdk_io_channel *ext_channel;
    7229             : 
    7230             :         /* Current cluster for copy operation */
    7231             :         uint64_t cluster;
    7232             : 
    7233             :         /* Buffer for blob reading (one cluster, from spdk_malloc) */
    7234             :         uint8_t *read_buff;
    7235             : 
    7236             :         /* Struct for external device writing */
    7237             :         struct spdk_bs_dev_cb_args ext_args;
    7238             : 
    7239             :         /* Actual number of copied clusters (only counted when status_cb is set) */
    7240             :         uint64_t copied_clusters_count;
    7241             : 
    7242             :         /* Status callback for updates about the ongoing operation */
    7243             :         spdk_blob_shallow_copy_status status_cb;
    7244             : 
    7245             :         /* Argument passed to function status_cb */
    7246             :         void *status_cb_arg;
    7247             : };
    7248             : 
    /*
     * Last step of a shallow copy, normally reached as the completion of
     * spdk_blob_close(). Records a close error in ctx->bserrno, destroys the
     * external device channel, frees the read buffer, invokes the user
     * completion with ctx->bserrno and frees ctx.
     *
     * The error log uses ctx->blobid rather than ctx->blob->id: ctx->blob is
     * still NULL when the blob is closed from the early validation failures in
     * bs_shallow_copy_blob_open_cpl(), so dereferencing it here would crash if
     * that close reported an error.
     */
    7249             : static void
    7250          20 : bs_shallow_copy_cleanup_finish(void *cb_arg, int bserrno)
    7251             : {
    7252          20 :         struct shallow_copy_ctx *ctx = cb_arg;
    7253          20 :         struct spdk_bs_cpl *cpl = &ctx->cpl;
    7254             : 
    7255          20 :         if (bserrno != 0) {
    7256           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 " shallow copy, cleanup error %d\n", ctx->blobid, bserrno);
    7257           0 :                 ctx->bserrno = bserrno;
    7258             :         }
    7259             : 
    7260          20 :         ctx->ext_dev->destroy_channel(ctx->ext_dev, ctx->ext_channel);
    7261          20 :         spdk_free(ctx->read_buff);
    7262             : 
    7263          20 :         cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno);
    7264             : 
    7265          20 :         free(ctx);
    7266          20 : }
    7267             : 
    /*
     * Completion of the write of one cluster to the external device. On error
     * the operation is aborted: the status is recorded, the lock flag released
     * and the blob closed. On success the cluster index advances, the optional
     * status callback is told the running count of copied clusters, and the
     * search for the next allocated cluster resumes.
     */
    7268             : static void
    7269          10 : bs_shallow_copy_bdev_write_cpl(struct spdk_io_channel *channel, void *cb_arg, int bserrno)
    7270             : {
    7271          10 :         struct shallow_copy_ctx *ctx = cb_arg;
    7272          10 :         struct spdk_blob *_blob = ctx->blob;
    7273             : 
    7274          10 :         if (bserrno != 0) {
    7275           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 " shallow copy, ext dev write error %d\n", ctx->blob->id, bserrno);
    7276           0 :                 ctx->bserrno = bserrno;
    7277           0 :                 _blob->locked_operation_in_progress = false;
    7278           0 :                 spdk_blob_close(_blob, bs_shallow_copy_cleanup_finish, ctx);
    7279           0 :                 return;
    7280             :         }
    7281             : 
    7282          10 :         ctx->cluster++;
    7283          10 :         if (ctx->status_cb) {
                                       /* The counter is maintained only when somebody is listening. */
    7284          10 :                 ctx->copied_clusters_count++;
    7285          10 :                 ctx->status_cb(ctx->copied_clusters_count, ctx->status_cb_arg);
    7286             :         }
    7287             : 
    7288          10 :         bs_shallow_copy_cluster_find_next(ctx);
    7289             : }
    7290             : 
    /*
     * Completion of reading one cluster of the blob into ctx->read_buff. On
     * error the operation is aborted (status recorded, lock flag released,
     * blob closed). On success the buffer is written to the external device
     * at the LBA computed for the current cluster, with
     * bs_shallow_copy_bdev_write_cpl() as completion.
     */
    7291             : static void
    7292          10 : bs_shallow_copy_blob_read_cpl(void *cb_arg, int bserrno)
    7293             : {
    7294          10 :         struct shallow_copy_ctx *ctx = cb_arg;
    7295          10 :         struct spdk_bs_dev *ext_dev = ctx->ext_dev;
    7296          10 :         struct spdk_blob *_blob = ctx->blob;
    7297             : 
    7298          10 :         if (bserrno != 0) {
    7299           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 " shallow copy, blob read error %d\n", ctx->blob->id, bserrno);
    7300           0 :                 ctx->bserrno = bserrno;
    7301           0 :                 _blob->locked_operation_in_progress = false;
    7302           0 :                 spdk_blob_close(_blob, bs_shallow_copy_cleanup_finish, ctx);
    7303           0 :                 return;
    7304             :         }
    7305             : 
    7306          10 :         ctx->ext_args.channel = ctx->ext_channel;
    7307          10 :         ctx->ext_args.cb_fn = bs_shallow_copy_bdev_write_cpl;
    7308          10 :         ctx->ext_args.cb_arg = ctx;
    7309             : 
                               /*
                                * NOTE(review): the LBA and length passed below are derived from the
                                * blobstore and its own device (_blob->bs, _blob->bs->dev), presumably
                                * in that device's block units. The open completion only requires the
                                * blobstore block size to be a multiple of ext_dev->blocklen - confirm
                                * the units are right when the two block sizes differ.
                                */
    7310          20 :         ext_dev->write(ext_dev, ctx->ext_channel, ctx->read_buff,
    7311          10 :                        bs_cluster_to_lba(_blob->bs, ctx->cluster),
    7312          10 :                        bs_dev_byte_to_lba(_blob->bs->dev, _blob->bs->cluster_sz),
    7313             :                        &ctx->ext_args);
    7314             : }
    7315             : 
    /*
     * Drive the shallow-copy loop: advance ctx->cluster past clusters that are
     * not allocated in the blob itself, then either submit a one-cluster read
     * of the next allocated cluster (continuing in
     * bs_shallow_copy_blob_read_cpl()) or, when all clusters were visited,
     * release the lock flag and close the blob.
     */
    7316             : static void
    7317          15 : bs_shallow_copy_cluster_find_next(void *cb_arg)
    7318             : {
    7319          15 :         struct shallow_copy_ctx *ctx = cb_arg;
    7320          15 :         struct spdk_blob *_blob = ctx->blob;
    7321             : 
    7322          25 :         while (ctx->cluster < _blob->active.num_clusters) {
    7323          20 :                 if (_blob->active.clusters[ctx->cluster] != 0) {
                                               /* Found a cluster owned by this blob: copy it. */
    7324          10 :                         break;
    7325             :                 }
    7326             : 
    7327          10 :                 ctx->cluster++;
    7328             :         }
    7329             : 
    7330          15 :         if (ctx->cluster < _blob->active.num_clusters) {
    7331          20 :                 blob_request_submit_op_single(ctx->blob_channel, _blob, ctx->read_buff,
    7332          10 :                                               bs_cluster_to_lba(_blob->bs, ctx->cluster),
    7333          10 :                                               bs_dev_byte_to_lba(_blob->bs->dev, _blob->bs->cluster_sz),
    7334             :                                               bs_shallow_copy_blob_read_cpl, ctx, SPDK_BLOB_READ);
    7335             :         } else {
    7336           5 :                 _blob->locked_operation_in_progress = false;
    7337           5 :                 spdk_blob_close(_blob, bs_shallow_copy_cleanup_finish, ctx);
    7338             :         }
    7339          15 : }
    7340             : 
    /*
     * Blob-open completion that validates and starts a shallow copy.
     * Requirements checked, in order: the blob is read only (-EPERM), the
     * external device is at least as large as the blob (-EINVAL), the
     * blobstore block size is a multiple of the external device block size
     * (-EINVAL), and no other locked operation is running on the blob
     * (-EBUSY). Each failure closes the blob and finishes through
     * bs_shallow_copy_cleanup_finish(). On success the lock flag is taken and
     * the copy loop starts at cluster 0.
     */
    7341             : static void
    7342          20 : bs_shallow_copy_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
    7343             : {
    7344          20 :         struct shallow_copy_ctx *ctx = cb_arg;
    7345          20 :         struct spdk_bs_dev *ext_dev = ctx->ext_dev;
    7346             :         uint32_t blob_block_size;
    7347             :         uint64_t blob_total_size;
    7348             : 
    7349          20 :         if (bserrno != 0) {
    7350           0 :                 SPDK_ERRLOG("Shallow copy blob open error %d\n", bserrno);
    7351           0 :                 ctx->bserrno = bserrno;
                                       /* No blob to close: go straight to the final cleanup. */
    7352           0 :                 bs_shallow_copy_cleanup_finish(ctx, 0);
    7353           0 :                 return;
    7354             :         }
    7355             : 
    7356          20 :         if (!spdk_blob_is_read_only(_blob)) {
    7357           5 :                 SPDK_ERRLOG("blob 0x%" PRIx64 " shallow copy, blob must be read only\n", _blob->id);
    7358           5 :                 ctx->bserrno = -EPERM;
    7359           5 :                 spdk_blob_close(_blob, bs_shallow_copy_cleanup_finish, ctx);
    7360           5 :                 return;
    7361             :         }
    7362             : 
    7363          15 :         blob_block_size = _blob->bs->dev->blocklen;
    7364          15 :         blob_total_size = spdk_blob_get_num_clusters(_blob) * spdk_bs_get_cluster_size(_blob->bs);
    7365             : 
    7366          15 :         if (blob_total_size > ext_dev->blockcnt * ext_dev->blocklen) {
    7367           5 :                 SPDK_ERRLOG("blob 0x%" PRIx64 " shallow copy, external device must have at least blob size\n",
    7368             :                             _blob->id);
    7369           5 :                 ctx->bserrno = -EINVAL;
    7370           5 :                 spdk_blob_close(_blob, bs_shallow_copy_cleanup_finish, ctx);
    7371           5 :                 return;
    7372             :         }
    7373             : 
    7374          10 :         if (blob_block_size % ext_dev->blocklen != 0) {
    7375           5 :                 SPDK_ERRLOG("blob 0x%" PRIx64 " shallow copy, external device block size is not compatible with \
    7376             : blobstore block size\n", _blob->id);
    7377           5 :                 ctx->bserrno = -EINVAL;
    7378           5 :                 spdk_blob_close(_blob, bs_shallow_copy_cleanup_finish, ctx);
    7379           5 :                 return;
    7380             :         }
    7381             : 
                               /*
                                * ctx->blob is assigned only here; on the validation failures above it
                                * is still NULL (ctx comes from calloc), so completion code reached
                                * from those paths must not dereference it.
                                */
    7382           5 :         ctx->blob = _blob;
    7383             : 
    7384           5 :         if (_blob->locked_operation_in_progress) {
    7385           0 :                 SPDK_DEBUGLOG(blob, "blob 0x%" PRIx64 " shallow copy - another operation in progress\n", _blob->id);
    7386           0 :                 ctx->bserrno = -EBUSY;
    7387           0 :                 spdk_blob_close(_blob, bs_shallow_copy_cleanup_finish, ctx);
    7388           0 :                 return;
    7389             :         }
    7390             : 
    7391           5 :         _blob->locked_operation_in_progress = true;
    7392             : 
    7393           5 :         ctx->cluster = 0;
    7394           5 :         bs_shallow_copy_cluster_find_next(ctx);
    7395             : }
    7396             : 
    /*
     * Public entry point: copy the clusters allocated in blob `blobid` itself
     * to the external device `ext_dev`.
     *
     * Allocates the operation context, a one-cluster DMA read buffer and a
     * channel on ext_dev, then opens the blob; validation and the copy loop
     * continue in bs_shallow_copy_blob_open_cpl().
     *
     * status_cb_fn (optional) receives the running count of copied clusters;
     * cb_fn receives the final status.
     *
     * Returns 0 once the blob open has been issued. Returns -ENOMEM, without
     * calling cb_fn, when the context, the read buffer or the external
     * channel cannot be obtained.
     *
     * The completion is stored through cpl.u.blob_basic so that it matches
     * both cpl.type and the member read by bs_shallow_copy_cleanup_finish().
     */
    7397             : int
    7398          20 : spdk_bs_blob_shallow_copy(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
    7399             :                           spdk_blob_id blobid, struct spdk_bs_dev *ext_dev,
    7400             :                           spdk_blob_shallow_copy_status status_cb_fn, void *status_cb_arg,
    7401             :                           spdk_blob_op_complete cb_fn, void *cb_arg)
    7402             : {
    7403             :         struct shallow_copy_ctx *ctx;
    7404             :         struct spdk_io_channel *ext_channel;
    7405             : 
    7406          20 :         ctx = calloc(1, sizeof(*ctx));
    7407          20 :         if (!ctx) {
    7408           0 :                 return -ENOMEM;
    7409             :         }
    7410             : 
    7411          20 :         ctx->bs = bs;
    7412          20 :         ctx->blobid = blobid;
    7413          20 :         ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    7414          20 :         ctx->cpl.u.blob_basic.cb_fn = cb_fn;
    7415          20 :         ctx->cpl.u.blob_basic.cb_arg = cb_arg;
    7416          20 :         ctx->bserrno = 0;
    7417          20 :         ctx->blob_channel = channel;
    7418          20 :         ctx->status_cb = status_cb_fn;
    7419          20 :         ctx->status_cb_arg = status_cb_arg;
    7420          20 :         ctx->read_buff = spdk_malloc(bs->cluster_sz, bs->dev->blocklen, NULL,
    7421             :                                      SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
    7422          20 :         if (!ctx->read_buff) {
    7423           0 :                 free(ctx);
    7424           0 :                 return -ENOMEM;
    7425             :         }
    7426             : 
    7427          20 :         ext_channel = ext_dev->create_channel(ext_dev);
    7428          20 :         if (!ext_channel) {
    7429           0 :                 spdk_free(ctx->read_buff);
    7430           0 :                 free(ctx);
    7431           0 :                 return -ENOMEM;
    7432             :         }
    7433          20 :         ctx->ext_dev = ext_dev;
    7434          20 :         ctx->ext_channel = ext_channel;
    7435             : 
    7436          20 :         spdk_bs_open_blob(ctx->bs, ctx->blobid, bs_shallow_copy_blob_open_cpl, ctx);
    7437             : 
    7438          20 :         return 0;
    7439             : }
    7440             : /* END spdk_bs_blob_shallow_copy */
    7441             : 
    7442             : /* START spdk_bs_blob_set_parent */
    7443             : 
    /*
     * State of one set-parent operation; freed in
     * bs_set_parent_cleanup_finish().
     */
    7444             : struct set_parent_ctx {
    7445             :         struct spdk_blob_store *bs;
                               /* Status handed to cb_fn; an error already recorded is not overwritten at the end */
    7446             :         int                     bserrno;
                               /* User completion and its argument */
    7447             :         spdk_bs_op_complete     cb_fn;
    7448             :         void                    *cb_arg;
    7449             : 
                               /* Blob whose parent is being changed */
    7450             :         struct spdk_blob        *blob;
                               /* md_ro value restored on the blob in bs_set_parent_close_blob() */
    7451             :         bool                    blob_md_ro;
    7452             : 
                               /* New parent; u.snapshot.blob and u.snapshot.id are filled once the snapshot is open */
    7453             :         struct blob_parent      parent;
    7454             : };
    7455             : 
    /*
     * Last step of set-parent: log a non-zero bserrno, keep the first error
     * (ctx->bserrno is only overwritten when it is still 0), invoke the user
     * completion with ctx->bserrno and free ctx.
     */
    7456             : static void
    7457          30 : bs_set_parent_cleanup_finish(void *cb_arg, int bserrno)
    7458             : {
    7459          30 :         struct set_parent_ctx *ctx = cb_arg;
    7460             : 
    7461          30 :         assert(ctx != NULL);
    7462             : 
    7463          30 :         if (bserrno != 0) {
    7464           0 :                 SPDK_ERRLOG("blob set parent finish error %d\n", bserrno);
    7465           0 :                 if (ctx->bserrno == 0) {
    7466           0 :                         ctx->bserrno = bserrno;
    7467             :                 }
    7468             :         }
    7469             : 
    7470          30 :         ctx->cb_fn(ctx->cb_arg, ctx->bserrno);
    7471             : 
    7472          30 :         free(ctx);
    7473          30 : }
    7474             : 
    /*
     * Completion of closing ctx->blob. When the operation has already failed
     * (ctx->bserrno != 0) the opened snapshot is closed as well, finishing in
     * bs_set_parent_cleanup_finish(); the blob-close status in bserrno is not
     * examined on that path. Otherwise a blob-close error is recorded and the
     * operation finishes directly, without closing the snapshot here.
     */
    7475             : static void
    7476          25 : bs_set_parent_close_snapshot(void *cb_arg, int bserrno)
    7477             : {
    7478          25 :         struct set_parent_ctx *ctx = cb_arg;
    7479             : 
    7480          25 :         if (ctx->bserrno != 0) {
    7481          10 :                 spdk_blob_close(ctx->parent.u.snapshot.blob, bs_set_parent_cleanup_finish, ctx);
    7482          10 :                 return;
    7483             :         }
    7484             : 
    7485          15 :         if (bserrno != 0) {
    7486           0 :                 SPDK_ERRLOG("blob close error %d\n", bserrno);
    7487           0 :                 ctx->bserrno = bserrno;
    7488             :         }
    7489             : 
    7490          15 :         bs_set_parent_cleanup_finish(ctx, ctx->bserrno);
    7491             : }
    7492             : 
    /*
     * Completion of the metadata sync (also called directly when setting the
     * back device failed). Records a new error if none is stored yet, restores
     * the blob's original md_ro flag, releases the lock flag on both the blob
     * and the snapshot, then closes the blob and continues in
     * bs_set_parent_close_snapshot().
     */
    7493             : static void
    7494          15 : bs_set_parent_close_blob(void *cb_arg, int bserrno)
    7495             : {
    7496          15 :         struct set_parent_ctx *ctx = cb_arg;
    7497          15 :         struct spdk_blob *blob = ctx->blob;
    7498          15 :         struct spdk_blob *snapshot = ctx->parent.u.snapshot.blob;
    7499             : 
    7500          15 :         if (bserrno != 0 && ctx->bserrno == 0) {
    7501           0 :                 SPDK_ERRLOG("error %d in metadata sync\n", bserrno);
    7502           0 :                 ctx->bserrno = bserrno;
    7503             :         }
    7504             : 
    7505             :         /* Revert md_ro to original state */
    7506          15 :         blob->md_ro = ctx->blob_md_ro;
    7507             : 
    7508          15 :         blob->locked_operation_in_progress = false;
    7509          15 :         snapshot->locked_operation_in_progress = false;
    7510             : 
    7511          15 :         spdk_blob_close(blob, bs_set_parent_close_snapshot, ctx);
    7512          15 : }
    7513             : 
    /*
     * Completion of replacing the blob's back device. On error the status is
     * stored and teardown starts immediately in bs_set_parent_close_blob();
     * on success the blob metadata is synced first, with the same function as
     * completion.
     */
    7514             : static void
    7515          15 : bs_set_parent_set_back_bs_dev_done(void *cb_arg, int bserrno)
    7516             : {
    7517          15 :         struct set_parent_ctx *ctx = cb_arg;
    7518          15 :         struct spdk_blob *blob = ctx->blob;
    7519             : 
    7520          15 :         if (bserrno != 0) {
    7521           0 :                 SPDK_ERRLOG("error %d setting back_bs_dev\n", bserrno);
    7522           0 :                 ctx->bserrno = bserrno;
    7523           0 :                 bs_set_parent_close_blob(ctx, bserrno);
    7524           0 :                 return;
    7525             :         }
    7526             : 
    7527          15 :         spdk_blob_sync_md(blob, bs_set_parent_close_blob, ctx);
    7528             : }
    7529             : 
    /*
     * Update the references of `blob` so that it points at the snapshot in
     * `parent`: remove the blob from the blob list, set the BLOB_SNAPSHOT xattr
     * and parent_id to the snapshot id, drop the external-snapshot flag and
     * xattr if the blob is an esnap clone, and add the blob back to the list.
     *
     * Returns 0 on success or the error returned by blob_set_xattr().
     */
    7530             : static int
    7531          15 : bs_set_parent_refs(struct spdk_blob *blob, struct blob_parent *parent)
    7532             : {
    7533             :         int rc;
    7534             : 
    7535          15 :         bs_blob_list_remove(blob);
    7536             : 
    7537          15 :         rc = blob_set_xattr(blob, BLOB_SNAPSHOT, &parent->u.snapshot.id, sizeof(spdk_blob_id), true);
    7538          15 :         if (rc != 0) {
                                       /*
                                        * NOTE(review): the blob was already removed from the blob list above
                                        * and is not re-added on this path - confirm callers cope with that.
                                        */
    7539           0 :                 SPDK_ERRLOG("error %d setting snapshot xattr\n", rc);
    7540           0 :                 return rc;
    7541             :         }
    7542          15 :         blob->parent_id = parent->u.snapshot.id;
    7543             : 
    7544          15 :         if (blob_is_esnap_clone(blob)) {
    7545             :                 /* Remove the xattr that references the external snapshot */
    7546           5 :                 blob->invalid_flags &= ~SPDK_BLOB_EXTERNAL_SNAPSHOT;
    7547           5 :                 blob_remove_xattr(blob, BLOB_EXTERNAL_SNAPSHOT_ID, true);
    7548             :         }
    7549             : 
    7550          15 :         bs_blob_list_add(blob);
    7551             : 
    7552          15 :         return 0;
    7553             : }
    7554             : 
    7555             : static void
    7556          25 : bs_set_parent_snapshot_open_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno)
    7557             : {
    7558          25 :         struct set_parent_ctx *ctx = cb_arg;
    7559          25 :         struct spdk_blob *blob = ctx->blob;
    7560             :         struct spdk_bs_dev *back_bs_dev;
    7561             : 
    7562          25 :         if (bserrno != 0) {
    7563           0 :                 SPDK_ERRLOG("snapshot open error %d\n", bserrno);
    7564           0 :                 ctx->bserrno = bserrno;
    7565           0 :                 spdk_blob_close(blob, bs_set_parent_cleanup_finish, ctx);
    7566           0 :                 return;
    7567             :         }
    7568             : 
    7569          25 :         ctx->parent.u.snapshot.blob = snapshot;
    7570          25 :         ctx->parent.u.snapshot.id = snapshot->id;
    7571             : 
    7572          25 :         if (!spdk_blob_is_snapshot(snapshot)) {
    7573           5 :                 SPDK_ERRLOG("parent blob is not a snapshot\n");
    7574           5 :                 ctx->bserrno = -EINVAL;
    7575           5 :                 spdk_blob_close(blob, bs_set_parent_close_snapshot, ctx);
    7576           5 :                 return;
    7577             :         }
    7578             : 
    7579          20 :         if (blob->active.num_clusters != snapshot->active.num_clusters) {
    7580           5 :                 SPDK_ERRLOG("parent blob has a number of clusters different from child's ones\n");
    7581           5 :                 ctx->bserrno = -EINVAL;
    7582           5 :                 spdk_blob_close(blob, bs_set_parent_close_snapshot, ctx);
    7583           5 :                 return;
    7584             :         }
    7585             : 
    7586          15 :         if (blob->locked_operation_in_progress || snapshot->locked_operation_in_progress) {
    7587           0 :                 SPDK_ERRLOG("cannot set parent of blob, another operation in progress\n");
    7588           0 :                 ctx->bserrno = -EBUSY;
    7589           0 :                 spdk_blob_close(blob, bs_set_parent_close_snapshot, ctx);
    7590           0 :                 return;
    7591             :         }
    7592             : 
    7593          15 :         blob->locked_operation_in_progress = true;
    7594          15 :         snapshot->locked_operation_in_progress = true;
    7595             : 
    7596             :         /* Temporarily override md_ro flag for MD modification */
    7597          15 :         blob->md_ro = false;
    7598             : 
    7599          15 :         back_bs_dev = bs_create_blob_bs_dev(snapshot);
    7600             : 
    7601          15 :         blob_set_back_bs_dev(blob, back_bs_dev, bs_set_parent_refs, &ctx->parent,
    7602             :                              bs_set_parent_set_back_bs_dev_done,
    7603             :                              ctx);
    7604             : }
    7605             : 
    7606             : static void
    7607          30 : bs_set_parent_blob_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno)
    7608             : {
    7609          30 :         struct set_parent_ctx *ctx = cb_arg;
    7610             : 
    7611          30 :         if (bserrno != 0) {
    7612           0 :                 SPDK_ERRLOG("blob open error %d\n", bserrno);
    7613           0 :                 ctx->bserrno = bserrno;
    7614           0 :                 bs_set_parent_cleanup_finish(ctx, 0);
    7615           0 :                 return;
    7616             :         }
    7617             : 
    7618          30 :         if (!spdk_blob_is_thin_provisioned(blob)) {
    7619           5 :                 SPDK_ERRLOG("blob is not thin-provisioned\n");
    7620           5 :                 ctx->bserrno = -EINVAL;
    7621           5 :                 spdk_blob_close(blob, bs_set_parent_cleanup_finish, ctx);
    7622           5 :                 return;
    7623             :         }
    7624             : 
    7625          25 :         ctx->blob = blob;
    7626          25 :         ctx->blob_md_ro = blob->md_ro;
    7627             : 
    7628          25 :         spdk_bs_open_blob(ctx->bs, ctx->parent.u.snapshot.id, bs_set_parent_snapshot_open_cpl, ctx);
    7629             : }
    7630             : 
    7631             : void
    7632          45 : spdk_bs_blob_set_parent(struct spdk_blob_store *bs, spdk_blob_id blob_id,
    7633             :                         spdk_blob_id snapshot_id, spdk_blob_op_complete cb_fn, void *cb_arg)
    7634             : {
    7635             :         struct set_parent_ctx *ctx;
    7636             : 
    7637          45 :         if (snapshot_id == SPDK_BLOBID_INVALID) {
    7638           5 :                 SPDK_ERRLOG("snapshot id not valid\n");
    7639           5 :                 cb_fn(cb_arg, -EINVAL);
    7640           5 :                 return;
    7641             :         }
    7642             : 
    7643          40 :         if (blob_id == snapshot_id) {
    7644           5 :                 SPDK_ERRLOG("blob id and snapshot id cannot be the same\n");
    7645           5 :                 cb_fn(cb_arg, -EINVAL);
    7646           5 :                 return;
    7647             :         }
    7648             : 
    7649          35 :         if (spdk_blob_get_parent_snapshot(bs, blob_id) == snapshot_id) {
    7650           5 :                 SPDK_NOTICELOG("snapshot is already the parent of blob\n");
    7651           5 :                 cb_fn(cb_arg, -EEXIST);
    7652           5 :                 return;
    7653             :         }
    7654             : 
    7655          30 :         ctx = calloc(1, sizeof(*ctx));
    7656          30 :         if (!ctx) {
    7657           0 :                 cb_fn(cb_arg, -ENOMEM);
    7658           0 :                 return;
    7659             :         }
    7660             : 
    7661          30 :         ctx->bs = bs;
    7662          30 :         ctx->parent.u.snapshot.id = snapshot_id;
    7663          30 :         ctx->cb_fn = cb_fn;
    7664          30 :         ctx->cb_arg = cb_arg;
    7665          30 :         ctx->bserrno = 0;
    7666             : 
    7667          30 :         spdk_bs_open_blob(bs, blob_id, bs_set_parent_blob_open_cpl, ctx);
    7668             : }
    7669             : /* END spdk_bs_blob_set_parent */
    7670             : 
    7671             : /* START spdk_bs_blob_set_external_parent */
    7672             : 
    7673             : static void
    7674          20 : bs_set_external_parent_cleanup_finish(void *cb_arg, int bserrno)
    7675             : {
    7676          20 :         struct set_parent_ctx *ctx = cb_arg;
    7677             : 
    7678          20 :         if (bserrno != 0) {
    7679           0 :                 SPDK_ERRLOG("blob set external parent finish error %d\n", bserrno);
    7680           0 :                 if (ctx->bserrno == 0) {
    7681           0 :                         ctx->bserrno = bserrno;
    7682             :                 }
    7683             :         }
    7684             : 
    7685          20 :         ctx->cb_fn(ctx->cb_arg, ctx->bserrno);
    7686             : 
    7687          20 :         free(ctx->parent.u.esnap.id);
    7688          20 :         free(ctx);
    7689          20 : }
    7690             : 
    7691             : static void
    7692          10 : bs_set_external_parent_close_blob(void *cb_arg, int bserrno)
    7693             : {
    7694          10 :         struct set_parent_ctx *ctx = cb_arg;
    7695          10 :         struct spdk_blob *blob = ctx->blob;
    7696             : 
    7697          10 :         if (bserrno != 0 && ctx->bserrno == 0) {
    7698           0 :                 SPDK_ERRLOG("error %d in metadata sync\n", bserrno);
    7699           0 :                 ctx->bserrno = bserrno;
    7700             :         }
    7701             : 
    7702             :         /* Revert md_ro to original state */
    7703          10 :         blob->md_ro = ctx->blob_md_ro;
    7704             : 
    7705          10 :         blob->locked_operation_in_progress = false;
    7706             : 
    7707          10 :         spdk_blob_close(blob, bs_set_external_parent_cleanup_finish, ctx);
    7708          10 : }
    7709             : 
    7710             : static void
    7711          10 : bs_set_external_parent_unfrozen(void *cb_arg, int bserrno)
    7712             : {
    7713          10 :         struct set_parent_ctx *ctx = cb_arg;
    7714          10 :         struct spdk_blob *blob = ctx->blob;
    7715             : 
    7716          10 :         if (bserrno != 0) {
    7717           0 :                 SPDK_ERRLOG("error %d setting back_bs_dev\n", bserrno);
    7718           0 :                 ctx->bserrno = bserrno;
    7719           0 :                 bs_set_external_parent_close_blob(ctx, bserrno);
    7720           0 :                 return;
    7721             :         }
    7722             : 
    7723          10 :         spdk_blob_sync_md(blob, bs_set_external_parent_close_blob, ctx);
    7724             : }
    7725             : 
    7726             : static int
    7727          10 : bs_set_external_parent_refs(struct spdk_blob *blob, struct blob_parent *parent)
    7728             : {
    7729             :         int rc;
    7730             : 
    7731          10 :         bs_blob_list_remove(blob);
    7732             : 
    7733          10 :         if (spdk_blob_is_clone(blob)) {
    7734             :                 /* Remove the xattr that references the snapshot */
    7735           0 :                 blob->parent_id = SPDK_BLOBID_INVALID;
    7736           0 :                 blob_remove_xattr(blob, BLOB_SNAPSHOT, true);
    7737             :         }
    7738             : 
    7739          10 :         rc = blob_set_xattr(blob, BLOB_EXTERNAL_SNAPSHOT_ID, parent->u.esnap.id,
    7740          10 :                             parent->u.esnap.id_len, true);
    7741          10 :         if (rc != 0) {
    7742           0 :                 SPDK_ERRLOG("error %d setting external snapshot xattr\n", rc);
    7743           0 :                 return rc;
    7744             :         }
    7745          10 :         blob->invalid_flags |= SPDK_BLOB_EXTERNAL_SNAPSHOT;
    7746          10 :         blob->parent_id = SPDK_BLOBID_EXTERNAL_SNAPSHOT;
    7747             : 
    7748          10 :         bs_blob_list_add(blob);
    7749             : 
    7750          10 :         return 0;
    7751             : }
    7752             : 
    7753             : static void
    7754          20 : bs_set_external_parent_blob_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno)
    7755             : {
    7756          20 :         struct set_parent_ctx *ctx = cb_arg;
    7757             :         const void *esnap_id;
    7758             :         size_t esnap_id_len;
    7759             :         int rc;
    7760             : 
    7761          20 :         if (bserrno != 0) {
    7762           0 :                 SPDK_ERRLOG("blob open error %d\n", bserrno);
    7763           0 :                 ctx->bserrno = bserrno;
    7764           0 :                 bs_set_parent_cleanup_finish(ctx, 0);
    7765          10 :                 return;
    7766             :         }
    7767             : 
    7768          20 :         ctx->blob = blob;
    7769          20 :         ctx->blob_md_ro = blob->md_ro;
    7770             : 
    7771          20 :         rc = spdk_blob_get_esnap_id(blob, &esnap_id, &esnap_id_len);
    7772          20 :         if (rc == 0 && esnap_id != NULL && esnap_id_len == ctx->parent.u.esnap.id_len &&
    7773           5 :             memcmp(esnap_id, ctx->parent.u.esnap.id, esnap_id_len) == 0) {
    7774           5 :                 SPDK_ERRLOG("external snapshot is already the parent of blob\n");
    7775           5 :                 ctx->bserrno = -EEXIST;
    7776           5 :                 goto error;
    7777             :         }
    7778             : 
    7779          15 :         if (!spdk_blob_is_thin_provisioned(blob)) {
    7780           5 :                 SPDK_ERRLOG("blob is not thin-provisioned\n");
    7781           5 :                 ctx->bserrno = -EINVAL;
    7782           5 :                 goto error;
    7783             :         }
    7784             : 
    7785          10 :         if (blob->locked_operation_in_progress) {
    7786           0 :                 SPDK_ERRLOG("cannot set external parent of blob, another operation in progress\n");
    7787           0 :                 ctx->bserrno = -EBUSY;
    7788           0 :                 goto error;
    7789             :         }
    7790             : 
    7791          10 :         blob->locked_operation_in_progress = true;
    7792             : 
    7793             :         /* Temporarily override md_ro flag for MD modification */
    7794          10 :         blob->md_ro = false;
    7795             : 
    7796          10 :         blob_set_back_bs_dev(blob, ctx->parent.u.esnap.back_bs_dev, bs_set_external_parent_refs,
    7797             :                              &ctx->parent, bs_set_external_parent_unfrozen, ctx);
    7798          10 :         return;
    7799             : 
    7800          10 : error:
    7801          10 :         spdk_blob_close(blob, bs_set_external_parent_cleanup_finish, ctx);
    7802             : }
    7803             : 
    7804             : void
    7805          30 : spdk_bs_blob_set_external_parent(struct spdk_blob_store *bs, spdk_blob_id blob_id,
    7806             :                                  struct spdk_bs_dev *esnap_bs_dev, const void *esnap_id,
    7807             :                                  uint32_t esnap_id_len, spdk_blob_op_complete cb_fn, void *cb_arg)
    7808             : {
    7809             :         struct set_parent_ctx *ctx;
    7810             :         uint64_t esnap_dev_size, cluster_sz;
    7811             : 
    7812          30 :         if (sizeof(blob_id) == esnap_id_len && memcmp(&blob_id, esnap_id, sizeof(blob_id)) == 0) {
    7813           5 :                 SPDK_ERRLOG("blob id and external snapshot id cannot be the same\n");
    7814           5 :                 cb_fn(cb_arg, -EINVAL);
    7815           5 :                 return;
    7816             :         }
    7817             : 
    7818          25 :         esnap_dev_size = esnap_bs_dev->blockcnt * esnap_bs_dev->blocklen;
    7819          25 :         cluster_sz = spdk_bs_get_cluster_size(bs);
    7820          25 :         if ((esnap_dev_size % cluster_sz) != 0) {
    7821           5 :                 SPDK_ERRLOG("Esnap device size %" PRIu64 " is not an integer multiple of "
    7822             :                             "cluster size %" PRIu64 "\n", esnap_dev_size, cluster_sz);
    7823           5 :                 cb_fn(cb_arg, -EINVAL);
    7824           5 :                 return;
    7825             :         }
    7826             : 
    7827          20 :         ctx = calloc(1, sizeof(*ctx));
    7828          20 :         if (!ctx) {
    7829           0 :                 cb_fn(cb_arg, -ENOMEM);
    7830           0 :                 return;
    7831             :         }
    7832             : 
    7833          20 :         ctx->parent.u.esnap.id = calloc(1, esnap_id_len);
    7834          20 :         if (!ctx->parent.u.esnap.id) {
    7835           0 :                 free(ctx);
    7836           0 :                 cb_fn(cb_arg, -ENOMEM);
    7837           0 :                 return;
    7838             :         }
    7839             : 
    7840          20 :         ctx->bs = bs;
    7841          20 :         ctx->parent.u.esnap.back_bs_dev = esnap_bs_dev;
    7842          20 :         memcpy(ctx->parent.u.esnap.id, esnap_id, esnap_id_len);
    7843          20 :         ctx->parent.u.esnap.id_len = esnap_id_len;
    7844          20 :         ctx->cb_fn = cb_fn;
    7845          20 :         ctx->cb_arg = cb_arg;
    7846          20 :         ctx->bserrno = 0;
    7847             : 
    7848          20 :         spdk_bs_open_blob(bs, blob_id, bs_set_external_parent_blob_open_cpl, ctx);
    7849             : }
    7850             : /* END spdk_bs_blob_set_external_parent */
    7851             : 
    7852             : /* START spdk_blob_resize */
/*
 * State carried through the asynchronous spdk_blob_resize() operation
 * (freeze I/O -> resize -> unfreeze I/O).  Allocated in spdk_blob_resize()
 * and freed by whichever callback completes the operation.
 */
struct spdk_bs_resize_ctx {
	/* User completion callback and its argument */
	spdk_blob_op_complete cb_fn;
	void *cb_arg;
	/* Blob being resized */
	struct spdk_blob *blob;
	/* Requested new size, passed to blob_resize() */
	uint64_t sz;
	/* Result of blob_resize(); reported to the user in preference to an unfreeze error */
	int rc;
};
    7860             : 
    7861             : static void
    7862         252 : bs_resize_unfreeze_cpl(void *cb_arg, int rc)
    7863             : {
    7864         252 :         struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
    7865             : 
    7866         252 :         if (rc != 0) {
    7867           0 :                 SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc);
    7868             :         }
    7869             : 
    7870         252 :         if (ctx->rc != 0) {
    7871           5 :                 SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc);
    7872           5 :                 rc = ctx->rc;
    7873             :         }
    7874             : 
    7875         252 :         ctx->blob->locked_operation_in_progress = false;
    7876             : 
    7877         252 :         ctx->cb_fn(ctx->cb_arg, rc);
    7878         252 :         free(ctx);
    7879         252 : }
    7880             : 
    7881             : static void
    7882         252 : bs_resize_freeze_cpl(void *cb_arg, int rc)
    7883             : {
    7884         252 :         struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
    7885             : 
    7886         252 :         if (rc != 0) {
    7887           0 :                 ctx->blob->locked_operation_in_progress = false;
    7888           0 :                 ctx->cb_fn(ctx->cb_arg, rc);
    7889           0 :                 free(ctx);
    7890           0 :                 return;
    7891             :         }
    7892             : 
    7893         252 :         ctx->rc = blob_resize(ctx->blob, ctx->sz);
    7894             : 
    7895         252 :         blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx);
    7896             : }
    7897             : 
    7898             : void
    7899         269 : spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg)
    7900             : {
    7901             :         struct spdk_bs_resize_ctx *ctx;
    7902             : 
    7903         269 :         blob_verify_md_op(blob);
    7904             : 
    7905         269 :         SPDK_DEBUGLOG(blob, "Resizing blob 0x%" PRIx64 " to %" PRIu64 " clusters\n", blob->id, sz);
    7906             : 
    7907         269 :         if (blob->md_ro) {
    7908           5 :                 cb_fn(cb_arg, -EPERM);
    7909           5 :                 return;
    7910             :         }
    7911             : 
    7912         264 :         if (sz == blob->active.num_clusters) {
    7913          12 :                 cb_fn(cb_arg, 0);
    7914          12 :                 return;
    7915             :         }
    7916             : 
    7917         252 :         if (blob->locked_operation_in_progress) {
    7918           0 :                 cb_fn(cb_arg, -EBUSY);
    7919           0 :                 return;
    7920             :         }
    7921             : 
    7922         252 :         ctx = calloc(1, sizeof(*ctx));
    7923         252 :         if (!ctx) {
    7924           0 :                 cb_fn(cb_arg, -ENOMEM);
    7925           0 :                 return;
    7926             :         }
    7927             : 
    7928         252 :         blob->locked_operation_in_progress = true;
    7929         252 :         ctx->cb_fn = cb_fn;
    7930         252 :         ctx->cb_arg = cb_arg;
    7931         252 :         ctx->blob = blob;
    7932         252 :         ctx->sz = sz;
    7933         252 :         blob_freeze_io(blob, bs_resize_freeze_cpl, ctx);
    7934             : }
    7935             : 
    7936             : /* END spdk_blob_resize */
    7937             : 
    7938             : 
    7939             : /* START spdk_bs_delete_blob */
    7940             : 
    7941             : static void
    7942        1862 : bs_delete_close_cpl(void *cb_arg, int bserrno)
    7943             : {
    7944        1862 :         spdk_bs_sequence_t *seq = cb_arg;
    7945             : 
    7946        1862 :         bs_sequence_finish(seq, bserrno);
    7947        1862 : }
    7948             : 
    7949             : static void
    7950        1862 : bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    7951             : {
    7952        1862 :         struct spdk_blob *blob = cb_arg;
    7953             : 
    7954        1862 :         if (bserrno != 0) {
    7955             :                 /*
    7956             :                  * We already removed this blob from the blobstore tailq, so
    7957             :                  *  we need to free it here since this is the last reference
    7958             :                  *  to it.
    7959             :                  */
    7960           0 :                 blob_free(blob);
    7961           0 :                 bs_delete_close_cpl(seq, bserrno);
    7962           0 :                 return;
    7963             :         }
    7964             : 
    7965             :         /*
    7966             :          * This will immediately decrement the ref_count and call
    7967             :          *  the completion routine since the metadata state is clean.
    7968             :          *  By calling spdk_blob_close, we reduce the number of call
    7969             :          *  points into code that touches the blob->open_ref count
    7970             :          *  and the blobstore's blob list.
    7971             :          */
    7972        1862 :         spdk_blob_close(blob, bs_delete_close_cpl, seq);
    7973             : }
    7974             : 
/*
 * State carried through the asynchronous snapshot-delete operation and its
 * cleanup callbacks.  Freed in delete_blob_cleanup_finish().
 */
struct delete_snapshot_ctx {
	/* When non-NULL, the snapshot's back_bs_dev is cleared before its MD is synced */
	struct spdk_blob_list *parent_snapshot_entry;
	/* Snapshot being deleted; handed to cb_fn on completion */
	struct spdk_blob *snapshot;
	/* Released with spdk_free() on completion */
	struct spdk_blob_md_page *page;
	/* Original md_ro value of the snapshot, restored after the MD update */
	bool snapshot_md_ro;
	/* Clone of the snapshot that is updated as part of the removal */
	struct spdk_blob *clone;
	/* Original md_ro value of the clone, restored after the MD update */
	bool clone_md_ro;
	/* User completion callback and its argument */
	spdk_blob_op_with_handle_complete cb_fn;
	void *cb_arg;
	/* Error recorded for the operation (0 if none); passed to cb_fn */
	int bserrno;
	/* NOTE(review): presumably the extent page currently being updated - not used in this part of the file, confirm */
	uint32_t next_extent_page;
};
    7987             : 
    7988             : static void
    7989         138 : delete_blob_cleanup_finish(void *cb_arg, int bserrno)
    7990             : {
    7991         138 :         struct delete_snapshot_ctx *ctx = cb_arg;
    7992             : 
    7993         138 :         if (bserrno != 0) {
    7994           0 :                 SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno);
    7995             :         }
    7996             : 
    7997         138 :         assert(ctx != NULL);
    7998             : 
    7999         138 :         if (bserrno != 0 && ctx->bserrno == 0) {
    8000           0 :                 ctx->bserrno = bserrno;
    8001             :         }
    8002             : 
    8003         138 :         ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno);
    8004         138 :         spdk_free(ctx->page);
    8005         138 :         free(ctx);
    8006         138 : }
    8007             : 
    8008             : static void
    8009          28 : delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno)
    8010             : {
    8011          28 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8012             : 
    8013          28 :         if (bserrno != 0) {
    8014           0 :                 ctx->bserrno = bserrno;
    8015           0 :                 SPDK_ERRLOG("Clone cleanup error %d\n", bserrno);
    8016             :         }
    8017             : 
    8018          28 :         if (ctx->bserrno != 0) {
    8019          28 :                 assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL);
    8020          28 :                 RB_INSERT(spdk_blob_tree, &ctx->snapshot->bs->open_blobs, ctx->snapshot);
    8021          28 :                 spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id);
    8022             :         }
    8023             : 
    8024          28 :         ctx->snapshot->locked_operation_in_progress = false;
    8025          28 :         ctx->snapshot->md_ro = ctx->snapshot_md_ro;
    8026             : 
    8027          28 :         spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx);
    8028          28 : }
    8029             : 
    8030             : static void
    8031          15 : delete_snapshot_cleanup_clone(void *cb_arg, int bserrno)
    8032             : {
    8033          15 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8034             : 
    8035          15 :         ctx->clone->locked_operation_in_progress = false;
    8036          15 :         ctx->clone->md_ro = ctx->clone_md_ro;
    8037             : 
    8038          15 :         spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx);
    8039          15 : }
    8040             : 
    8041             : static void
    8042          60 : delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
    8043             : {
    8044          60 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8045             : 
    8046          60 :         if (bserrno) {
    8047           0 :                 ctx->bserrno = bserrno;
    8048           0 :                 delete_snapshot_cleanup_clone(ctx, 0);
    8049           0 :                 return;
    8050             :         }
    8051             : 
    8052          60 :         ctx->clone->locked_operation_in_progress = false;
    8053          60 :         spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx);
    8054             : }
    8055             : 
    8056             : static void
    8057          65 : delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno)
    8058             : {
    8059          65 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8060          65 :         struct spdk_blob_list *parent_snapshot_entry = NULL;
    8061          65 :         struct spdk_blob_list *snapshot_entry = NULL;
    8062          65 :         struct spdk_blob_list *clone_entry = NULL;
    8063          65 :         struct spdk_blob_list *snapshot_clone_entry = NULL;
    8064             : 
    8065          65 :         if (bserrno) {
    8066           5 :                 SPDK_ERRLOG("Failed to sync MD on blob\n");
    8067           5 :                 ctx->bserrno = bserrno;
    8068           5 :                 delete_snapshot_cleanup_clone(ctx, 0);
    8069           5 :                 return;
    8070             :         }
    8071             : 
    8072             :         /* Get snapshot entry for the snapshot we want to remove */
    8073          60 :         snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id);
    8074             : 
    8075          60 :         assert(snapshot_entry != NULL);
    8076             : 
    8077             :         /* Remove clone entry in this snapshot (at this point there can be only one clone) */
    8078          60 :         clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
    8079          60 :         assert(clone_entry != NULL);
    8080          60 :         TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
    8081          60 :         snapshot_entry->clone_count--;
    8082          60 :         assert(TAILQ_EMPTY(&snapshot_entry->clones));
    8083             : 
    8084          60 :         switch (ctx->snapshot->parent_id) {
    8085          50 :         case SPDK_BLOBID_INVALID:
    8086             :         case SPDK_BLOBID_EXTERNAL_SNAPSHOT:
    8087             :                 /* No parent snapshot - just remove clone entry */
    8088          50 :                 free(clone_entry);
    8089          50 :                 break;
    8090          10 :         default:
    8091             :                 /* This snapshot is at the same time a clone of another snapshot - we need to
    8092             :                  * update parent snapshot (remove current clone, add new one inherited from
    8093             :                  * the snapshot that is being removed) */
    8094             : 
    8095             :                 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for
    8096             :                  * snapshot that we are removing */
    8097          10 :                 blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry,
    8098             :                                                     &snapshot_clone_entry);
    8099             : 
    8100             :                 /* Switch clone entry in parent snapshot */
    8101          10 :                 TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link);
    8102          10 :                 TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link);
    8103          10 :                 free(snapshot_clone_entry);
    8104             :         }
    8105             : 
    8106             :         /* Restore md_ro flags */
    8107          60 :         ctx->clone->md_ro = ctx->clone_md_ro;
    8108          60 :         ctx->snapshot->md_ro = ctx->snapshot_md_ro;
    8109             : 
    8110          60 :         blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx);
    8111             : }
    8112             : 
                       /*
                        * Completion callback for the clone metadata sync started by
                        * delete_snapshot_update_extent_pages_cpl().
                        *
                        * cb_arg is the struct delete_snapshot_ctx, bserrno the result of the sync.
                        *
                        * On failure the SNAPSHOT_PENDING_REMOVAL xattr is removed from the snapshot
                        * again and the operation continues in delete_snapshot_cleanup_clone().
                        * On success every cluster / extent page entry that the snapshot has in
                        * common with the clone is zeroed in the snapshot's in-memory map, and the
                        * snapshot metadata is synced (continues in delete_snapshot_sync_snapshot_cpl()).
                        */
    8113             : static void
    8114          70 : delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno)
    8115             : {
    8116          70 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8117             :         uint64_t i;
    8118             : 
                               /* Both branches below modify snapshot metadata, so the md_ro override
                                * is (re-)applied first. blob_sync_md_cpl() sets md_ro back to true
                                * after a successful sync of a blob flagged SPDK_BLOB_READ_ONLY. */
    8119          70 :         ctx->snapshot->md_ro = false;
    8120             : 
    8121          70 :         if (bserrno) {
    8122           5 :                 SPDK_ERRLOG("Failed to sync MD on clone\n");
    8123           5 :                 ctx->bserrno = bserrno;
    8124             : 
    8125             :                 /* Restore snapshot to previous state */
    8126           5 :                 bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true);
    8127           5 :                 if (bserrno != 0) {
                                       /* ctx->bserrno keeps the original sync error; the xattr
                                        * removal error is handed to the cleanup step directly. */
    8128           0 :                         delete_snapshot_cleanup_clone(ctx, bserrno);
    8129           0 :                         return;
    8130             :                 }
    8131             : 
    8132           5 :                 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx);
    8133           5 :                 return;
    8134             :         }
    8135             : 
    8136             :         /* Clear cluster map entries for snapshot */
                               /* Only entries equal to the clone's entry are cleared; the allocated
                                * cluster counter is decremented only for non-zero entries. */
    8137         690 :         for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
    8138         625 :                 if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) {
    8139         615 :                         if (ctx->snapshot->active.clusters[i] != 0) {
    8140         410 :                                 ctx->snapshot->active.num_allocated_clusters--;
    8141             :                         }
    8142         615 :                         ctx->snapshot->active.clusters[i] = 0;
    8143             :                 }
    8144             :         }
                               /* Same for extent pages the clone now references. */
    8145         104 :         for (i = 0; i < ctx->snapshot->active.num_extent_pages &&
    8146          78 :              i < ctx->clone->active.num_extent_pages; i++) {
    8147          39 :                 if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) {
    8148          36 :                         ctx->snapshot->active.extent_pages[i] = 0;
    8149             :                 }
    8150             :         }
    8151             : 
    8152          65 :         blob_set_thin_provision(ctx->snapshot);
    8153          65 :         ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY;
    8154             : 
                               /* When a parent snapshot exists, the clone was given the snapshot's
                                * back_bs_dev pointer in delete_snapshot_update_extent_pages_cpl();
                                * drop the snapshot's copy of that pointer. */
    8155          65 :         if (ctx->parent_snapshot_entry != NULL) {
    8156          10 :                 ctx->snapshot->back_bs_dev = NULL;
    8157             :         }
    8158             : 
    8159          65 :         spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx);
    8160             : }
    8161             : 
                       /*
                        * Called once all extent pages of the clone have been updated.
                        *
                        * Destroys the clone's current back_bs_dev and re-parents the clone to one of:
                        *  - the external snapshot of the snapshot being removed,
                        *  - the parent snapshot of the snapshot being removed, or
                        *  - no parent at all (zeroes device),
                        * then syncs the clone metadata (continues in delete_snapshot_sync_clone_cpl()).
                        */
    8162             : static void
    8163          70 : delete_snapshot_update_extent_pages_cpl(struct delete_snapshot_ctx *ctx)
    8164             : {
    8165             :         int bserrno;
    8166             : 
    8167             :         /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */
    8168          70 :         blob_back_bs_destroy(ctx->clone);
    8169             : 
    8170             :         /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */
    8171          70 :         if (ctx->snapshot->parent_id == SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
                                       /* ...to the external snapshot the removed snapshot was based on */
    8172          10 :                 bserrno = bs_snapshot_copy_xattr(ctx->clone, ctx->snapshot,
    8173             :                                                  BLOB_EXTERNAL_SNAPSHOT_ID);
    8174          10 :                 if (bserrno != 0) {
    8175           0 :                         ctx->bserrno = bserrno;
    8176             : 
    8177             :                         /* Restore snapshot to previous state */
    8178           0 :                         bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true);
    8179           0 :                         if (bserrno != 0) {
    8180           0 :                                 delete_snapshot_cleanup_clone(ctx, bserrno);
    8181           0 :                                 return;
    8182             :                         }
    8183             : 
    8184           0 :                         spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx);
    8185           0 :                         return;
    8186             :                 }
    8187          10 :                 ctx->clone->parent_id = SPDK_BLOBID_EXTERNAL_SNAPSHOT;
    8188          10 :                 ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev;
    8189             :                 /* Do not delete the external snapshot along with this snapshot */
    8190          10 :                 ctx->snapshot->back_bs_dev = NULL;
    8191          10 :                 ctx->clone->invalid_flags |= SPDK_BLOB_EXTERNAL_SNAPSHOT;
    8192          60 :         } else if (ctx->parent_snapshot_entry != NULL) {
    8193             :                 /* ...to parent snapshot */
    8194          10 :                 ctx->clone->parent_id = ctx->parent_snapshot_entry->id;
    8195          10 :                 ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev;
                                       /* NOTE(review): the return value of blob_set_xattr() is not
                                        * checked here, unlike in delete_snapshot_freeze_io_cb(). */
    8196          10 :                 blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id,
    8197             :                                sizeof(spdk_blob_id),
    8198             :                                true);
    8199             :         } else {
    8200             :                 /* ...to blobid invalid and zeroes dev */
    8201          50 :                 ctx->clone->parent_id = SPDK_BLOBID_INVALID;
                                       /* NOTE(review): neither the result of bs_create_zeroes_dev()
                                        * nor of blob_remove_xattr() is checked - confirm this is
                                        * intentional. */
    8202          50 :                 ctx->clone->back_bs_dev = bs_create_zeroes_dev();
    8203          50 :                 blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true);
    8204             :         }
    8205             : 
    8206          70 :         spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx);
    8207             : }
    8208             : 
    8209             : static void
    8210          73 : delete_snapshot_update_extent_pages(void *cb_arg, int bserrno)
    8211             : {
    8212          73 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8213             :         uint32_t *extent_page;
    8214             :         uint64_t i;
    8215             : 
    8216         112 :         for (i = ctx->next_extent_page; i < ctx->snapshot->active.num_extent_pages &&
    8217          81 :              i < ctx->clone->active.num_extent_pages; i++) {
    8218          42 :                 if (ctx->snapshot->active.extent_pages[i] == 0) {
    8219             :                         /* No extent page to use from snapshot */
    8220          12 :                         continue;
    8221             :                 }
    8222             : 
    8223          30 :                 extent_page = &ctx->clone->active.extent_pages[i];
    8224          30 :                 if (*extent_page == 0) {
    8225             :                         /* Copy extent page from snapshot when clone did not have a matching one */
    8226          27 :                         *extent_page = ctx->snapshot->active.extent_pages[i];
    8227          27 :                         continue;
    8228             :                 }
    8229             : 
    8230             :                 /* Clone and snapshot both contain partially filled matching extent pages.
    8231             :                  * Update the clone extent page in place with cluster map containing the mix of both. */
    8232           3 :                 ctx->next_extent_page = i + 1;
    8233           3 :                 memset(ctx->page, 0, SPDK_BS_PAGE_SIZE);
    8234             : 
    8235           3 :                 blob_write_extent_page(ctx->clone, *extent_page, i * SPDK_EXTENTS_PER_EP, ctx->page,
    8236             :                                        delete_snapshot_update_extent_pages, ctx);
    8237           3 :                 return;
    8238             :         }
    8239          70 :         delete_snapshot_update_extent_pages_cpl(ctx);
    8240             : }
    8241             : 
                       /*
                        * Completion callback for the snapshot metadata sync that persisted the
                        * SNAPSHOT_PENDING_REMOVAL xattr.
                        *
                        * cb_arg is the struct delete_snapshot_ctx, bserrno the result of the sync.
                        * On success the snapshot's cluster map is merged into the clone's in-memory
                        * map and the extent page update is started from index 0.
                        */
    8242             : static void
    8243          75 : delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno)
    8244             : {
    8245          75 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8246             :         uint64_t i;
    8247             : 
    8248             :         /* Temporarily override md_ro flag for clone for MD modification */
                               /* The previous value is saved before the error check, so it is also
                                * recorded in ctx on the failure path below. */
    8249          75 :         ctx->clone_md_ro = ctx->clone->md_ro;
    8250          75 :         ctx->clone->md_ro = false;
    8251             : 
    8252          75 :         if (bserrno) {
    8253           5 :                 SPDK_ERRLOG("Failed to sync MD with xattr on blob\n");
    8254           5 :                 ctx->bserrno = bserrno;
    8255           5 :                 delete_snapshot_cleanup_clone(ctx, 0);
    8256           5 :                 return;
    8257             :         }
    8258             : 
    8259             :         /* Copy snapshot map to clone map (only unallocated clusters in clone) */
                               /* The clone's allocated cluster counter grows only when the copied
                                * snapshot entry is itself non-zero. */
    8260         745 :         for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
    8261         675 :                 if (ctx->clone->active.clusters[i] == 0) {
    8262         665 :                         ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i];
    8263         665 :                         if (ctx->clone->active.clusters[i] != 0) {
    8264         460 :                                 ctx->clone->active.num_allocated_clusters++;
    8265             :                         }
    8266             :                 }
    8267             :         }
                               /* Start the (possibly asynchronous) extent page merge at index 0. */
    8268          70 :         ctx->next_extent_page = 0;
    8269          70 :         delete_snapshot_update_extent_pages(ctx, 0);
    8270             : }
    8271             : 
                       /*
                        * Callback passed to blob_esnap_destroy_bs_dev_channels() by
                        * delete_snapshot_freeze_io_cb().
                        *
                        * A failure is only logged; the snapshot metadata sync is started either way.
                        */
    8272             : static void
    8273          10 : delete_snapshot_esnap_channels_destroyed_cb(void *cb_arg, struct spdk_blob *blob, int bserrno)
    8274             : {
    8275          10 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8276             : 
    8277          10 :         if (bserrno != 0) {
    8278           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 ": failed to destroy esnap channels: %d\n",
    8279             :                             blob->id, bserrno);
    8280             :                 /* That error should not stop us from syncing metadata. */
    8281             :         }
    8282             : 
    8283          10 :         spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx);
    8284          10 : }
    8285             : 
                       /*
                        * Completion callback of blob_freeze_io() on the clone.
                        *
                        * Marks the snapshot with the SNAPSHOT_PENDING_REMOVAL xattr (its value is
                        * the clone's blob id) and syncs the snapshot metadata. For an esnap clone
                        * the esnap bs_dev channels are destroyed first and the sync is issued from
                        * delete_snapshot_esnap_channels_destroyed_cb().
                        */
    8286             : static void
    8287          75 : delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno)
    8288             : {
    8289          75 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8290             : 
    8291          75 :         if (bserrno) {
    8292           0 :                 SPDK_ERRLOG("Failed to freeze I/O on clone\n");
    8293           0 :                 ctx->bserrno = bserrno;
                                       /* NOTE(review): ctx->snapshot_md_ro and ctx->clone_md_ro have not
                                        * been captured yet on this path - confirm the cleanup chain does
                                        * not restore md_ro from them in this case. */
    8294           0 :                 delete_snapshot_cleanup_clone(ctx, 0);
    8295           0 :                 return;
    8296             :         }
    8297             : 
    8298             :         /* Temporarily override md_ro flag for snapshot for MD modification */
    8299          75 :         ctx->snapshot_md_ro = ctx->snapshot->md_ro;
    8300          75 :         ctx->snapshot->md_ro = false;
    8301             : 
    8302             :         /* Mark blob as pending for removal for power failure safety, use clone id for recovery */
    8303          75 :         ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id,
    8304             :                                       sizeof(spdk_blob_id), true);
    8305          75 :         if (ctx->bserrno != 0) {
    8306           0 :                 delete_snapshot_cleanup_clone(ctx, 0);
    8307           0 :                 return;
    8308             :         }
    8309             : 
    8310          75 :         if (blob_is_esnap_clone(ctx->snapshot)) {
    8311          10 :                 blob_esnap_destroy_bs_dev_channels(ctx->snapshot, false,
    8312             :                                                    delete_snapshot_esnap_channels_destroyed_cb,
    8313             :                                                    ctx);
    8314          10 :                 return;
    8315             :         }
    8316             : 
    8317          65 :         spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx);
    8318             : }
    8319             : 
                       /*
                        * Completion callback of spdk_bs_open_blob() on the snapshot's only clone.
                        *
                        * On success the clone is stored in ctx, marked as having a locked operation
                        * in progress, and its I/O is frozen (continues in
                        * delete_snapshot_freeze_io_cb()). If the clone is already locked by another
                        * operation, it is closed again and the deletion fails with -EBUSY.
                        */
    8320             : static void
    8321          88 : delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno)
    8322             : {
    8323          88 :         struct delete_snapshot_ctx *ctx = cb_arg;
    8324             : 
    8325          88 :         if (bserrno) {
    8326          13 :                 SPDK_ERRLOG("Failed to open clone\n");
    8327          13 :                 ctx->bserrno = bserrno;
    8328          13 :                 delete_snapshot_cleanup_snapshot(ctx, 0);
    8329          13 :                 return;
    8330             :         }
    8331             : 
    8332          75 :         ctx->clone = clone;
    8333             : 
    8334          75 :         if (clone->locked_operation_in_progress) {
    8335           0 :                 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n");
    8336           0 :                 ctx->bserrno = -EBUSY;
                                       /* Drop the reference taken by the open above before cleaning up. */
    8337           0 :                 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx);
    8338           0 :                 return;
    8339             :         }
    8340             : 
    8341          75 :         clone->locked_operation_in_progress = true;
    8342             : 
    8343          75 :         blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx);
    8344             : }
    8345             : 
                       /*
                        * First step of deleting a snapshot that has exactly one clone.
                        *
                        * Looks up the snapshot's list entry, records the parent snapshot entry (if
                        * any) in ctx->parent_snapshot_entry and opens the clone; the work continues
                        * in delete_snapshot_open_clone_cb().
                        *
                        * The preconditions (snapshot entry exists, clone_count == 1) are enforced
                        * with assert() only.
                        */
    8346             : static void
    8347          88 : update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx)
    8348             : {
    8349          88 :         struct spdk_blob_list *snapshot_entry = NULL;
    8350          88 :         struct spdk_blob_list *clone_entry = NULL;
    8351          88 :         struct spdk_blob_list *snapshot_clone_entry = NULL;
    8352             : 
    8353             :         /* Get snapshot entry for the snapshot we want to remove */
    8354          88 :         snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id);
    8355             : 
    8356          88 :         assert(snapshot_entry != NULL);
    8357             : 
    8358             :         /* Get clone of the snapshot (at this point there can be only one clone) */
    8359          88 :         clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
    8360          88 :         assert(snapshot_entry->clone_count == 1);
    8361          88 :         assert(clone_entry != NULL);
    8362             : 
    8363             :         /* Get snapshot entry for parent snapshot and clone entry within that snapshot for
    8364             :          * snapshot that we are removing */
                               /* Only ctx->parent_snapshot_entry is used later; snapshot_clone_entry
                                * is not read again in this function. */
    8365          88 :         blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry,
    8366             :                                             &snapshot_clone_entry);
    8367             : 
    8368          88 :         spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx);
    8369          88 : }
    8370             : 
                       /*
                        * Final step of blob deletion.
                        *
                        * cb_arg is the deletion sequence (spdk_bs_sequence_t). On error the sequence
                        * is finished with bserrno. Otherwise the blob's snapshot list entry (if it
                        * has one) is removed, its blob id is released, the blob is resized to zero
                        * with no metadata pages and persisted (continues in bs_delete_persist_cpl()).
                        */
    8371             : static void
    8372        1940 : bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno)
    8373             : {
    8374        1940 :         spdk_bs_sequence_t *seq = cb_arg;
    8375        1940 :         struct spdk_blob_list *snapshot_entry = NULL;
    8376             :         uint32_t page_num;
    8377             : 
    8378        1940 :         if (bserrno) {
    8379          78 :                 SPDK_ERRLOG("Failed to remove blob\n");
    8380          78 :                 bs_sequence_finish(seq, bserrno);
    8381          78 :                 return;
    8382             :         }
    8383             : 
    8384             :         /* Remove snapshot from the list */
    8385        1862 :         snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
    8386        1862 :         if (snapshot_entry != NULL) {
    8387         180 :                 TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link);
    8388         180 :                 free(snapshot_entry);
    8389             :         }
    8390             : 
                               /* Release the blob id and mark the blob as having no metadata pages
                                * and no clusters before persisting it. */
    8391        1862 :         page_num = bs_blobid_to_page(blob->id);
    8392        1862 :         spdk_bit_array_clear(blob->bs->used_blobids, page_num);
    8393        1862 :         blob->state = SPDK_BLOB_STATE_DIRTY;
    8394        1862 :         blob->active.num_pages = 0;
                               /* NOTE(review): the result of blob_resize() is not checked. */
    8395        1862 :         blob_resize(blob, 0);
    8396             : 
    8397        1862 :         blob_persist(seq, blob, bs_delete_persist_cpl, blob);
    8398             : }
    8399             : 
                       /*
                        * Decide whether the (already opened) blob may be deleted.
                        *
                        * \param blob          Blob to check; the deleting context holds one open
                        *                      reference on it.
                        * \param update_clone  Output. Set to true when the blob is a snapshot with
                        *                      exactly one clone that must be updated before the
                        *                      snapshot is removed, false when it can be removed
                        *                      directly. Not written when an error is returned.
                        *
                        * \return 0 if the blob can be deleted, -EBUSY if it is a snapshot with more
                        * than one clone or is held open by someone else.
                        */
    8400             : static int
    8401        1940 : bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone)
    8402             : {
    8403        1940 :         struct spdk_blob_list *snapshot_entry = NULL;
    8404        1940 :         struct spdk_blob_list *clone_entry = NULL;
    8405        1940 :         struct spdk_blob *clone = NULL;
    8406        1940 :         bool has_one_clone = false;
    8407             : 
    8408             :         /* Check if this is a snapshot with clones */
    8409        1940 :         snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
    8410        1940 :         if (snapshot_entry != NULL) {
    8411         243 :                 if (snapshot_entry->clone_count > 1) {
    8412          30 :                         SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n");
    8413          30 :                         return -EBUSY;
    8414         213 :                 } else if (snapshot_entry->clone_count == 1) {
    8415          88 :                         has_one_clone = true;
    8416             :                 }
    8417             :         }
    8418             : 
    8419             :         /* Check if someone has this blob open (besides this delete context):
    8420             :          * - open_ref = 1 - only this context opened blob, so it is ok to remove it
    8421             :          * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot
    8422             :          *      and that is ok, because we will update it accordingly */
    8423        1910 :         if (blob->open_ref <= 2 && has_one_clone) {
                                       /* has_one_clone implies snapshot_entry != NULL here. */
    8424          88 :                 clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
    8425          88 :                 assert(clone_entry != NULL);
                                       /* A non-NULL result means the clone is currently open. */
    8426          88 :                 clone = blob_lookup(blob->bs, clone_entry->id);
    8427             : 
    8428          88 :                 if (blob->open_ref == 2 && clone == NULL) {
    8429             :                         /* Clone is closed and someone else opened this blob */
    8430           0 :                         SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
    8431           0 :                         return -EBUSY;
    8432             :                 }
    8433             : 
    8434          88 :                 *update_clone = true;
    8435          88 :                 return 0;
    8436             :         }
    8437             : 
                               /* Reached for blobs without a single clone, and for snapshots with one
                                * clone but more than two open references. The message below says
                                * "snapshot" although the blob need not be one. */
    8438        1822 :         if (blob->open_ref > 1) {
    8439          20 :                 SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
    8440          20 :                 return -EBUSY;
    8441             :         }
    8442             : 
    8443        1802 :         assert(has_one_clone == false);
    8444        1802 :         *update_clone = false;
    8445        1802 :         return 0;
    8446             : }
    8447             : 
                       /*
                        * Close completion used when the deletion context could not be allocated.
                        * Finishes the deletion sequence (cb_arg) with -ENOMEM; the result of the
                        * close itself (bserrno) is ignored.
                        */
    8448             : static void
    8449           0 : bs_delete_enomem_close_cpl(void *cb_arg, int bserrno)
    8450             : {
    8451           0 :         spdk_bs_sequence_t *seq = cb_arg;
    8452             : 
    8453           0 :         bs_sequence_finish(seq, -ENOMEM);
    8454           0 : }
    8455             : 
    8456             : static void
    8457        1953 : bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno)
    8458             : {
    8459        1953 :         spdk_bs_sequence_t *seq = cb_arg;
    8460             :         struct delete_snapshot_ctx *ctx;
    8461        1953 :         bool update_clone = false;
    8462             : 
    8463        1953 :         if (bserrno != 0) {
    8464          13 :                 bs_sequence_finish(seq, bserrno);
    8465          63 :                 return;
    8466             :         }
    8467             : 
    8468        1940 :         blob_verify_md_op(blob);
    8469             : 
    8470        1940 :         ctx = calloc(1, sizeof(*ctx));
    8471        1940 :         if (ctx == NULL) {
    8472           0 :                 spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq);
    8473           0 :                 return;
    8474             :         }
    8475             : 
    8476        1940 :         ctx->snapshot = blob;
    8477        1940 :         ctx->cb_fn = bs_delete_blob_finish;
    8478        1940 :         ctx->cb_arg = seq;
    8479             : 
    8480             :         /* Check if blob can be removed and if it is a snapshot with clone on top of it */
    8481        1940 :         ctx->bserrno = bs_is_blob_deletable(blob, &update_clone);
    8482        1940 :         if (ctx->bserrno) {
    8483          50 :                 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
    8484          50 :                 return;
    8485             :         }
    8486             : 
    8487        1890 :         if (blob->locked_operation_in_progress) {
    8488           0 :                 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n");
    8489           0 :                 ctx->bserrno = -EBUSY;
    8490           0 :                 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
    8491           0 :                 return;
    8492             :         }
    8493             : 
    8494        1890 :         blob->locked_operation_in_progress = true;
    8495             : 
    8496             :         /*
    8497             :          * Remove the blob from the blob_store list now, to ensure it does not
    8498             :          *  get returned after this point by blob_lookup().
    8499             :          */
    8500        1890 :         spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
    8501        1890 :         RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob);
    8502             : 
    8503        1890 :         if (update_clone) {
    8504          88 :                 ctx->page = spdk_zmalloc(blob->bs->md_page_size, 0, NULL, SPDK_ENV_NUMA_ID_ANY,
    8505             :                                          SPDK_MALLOC_DMA);
    8506          88 :                 if (!ctx->page) {
    8507           0 :                         ctx->bserrno = -ENOMEM;
    8508           0 :                         spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
    8509           0 :                         return;
    8510             :                 }
    8511             :                 /* This blob is a snapshot with active clone - update clone first */
    8512          88 :                 update_clone_on_snapshot_deletion(blob, ctx);
    8513             :         } else {
    8514             :                 /* This blob does not have any clones - just remove it */
    8515        1802 :                 bs_blob_list_remove(blob);
    8516        1802 :                 bs_delete_blob_finish(seq, blob, 0);
    8517        1802 :                 free(ctx);
    8518             :         }
    8519             : }
    8520             : 
                       /*
                        * Delete the blob identified by blobid.
                        *
                        * Must be called on the blobstore's metadata thread (asserted). Starts a
                        * sequence on the metadata channel and opens the blob; the deletion itself
                        * is driven from bs_delete_open_cpl(). cb_fn(cb_arg, bserrno) is invoked on
                        * completion, or immediately with -ENOMEM if no sequence could be started.
                        */
    8521             : void
    8522        1953 : spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
    8523             :                     spdk_blob_op_complete cb_fn, void *cb_arg)
    8524             : {
    8525             :         struct spdk_bs_cpl      cpl;
    8526             :         spdk_bs_sequence_t      *seq;
    8527             : 
    8528        1953 :         SPDK_DEBUGLOG(blob, "Deleting blob 0x%" PRIx64 "\n", blobid);
    8529             : 
    8530        1953 :         assert(spdk_get_thread() == bs->md_thread);
    8531             : 
    8532        1953 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    8533        1953 :         cpl.u.blob_basic.cb_fn = cb_fn;
    8534        1953 :         cpl.u.blob_basic.cb_arg = cb_arg;
    8535             : 
    8536        1953 :         seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    8537        1953 :         if (!seq) {
    8538           0 :                 cb_fn(cb_arg, -ENOMEM);
    8539           0 :                 return;
    8540             :         }
    8541             : 
                               /* The sequence is handed to the open callback as its cb_arg. */
    8542        1953 :         spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq);
    8543             : }
    8544             : 
    8545             : /* END spdk_bs_delete_blob */
    8546             : 
    8547             : /* START spdk_bs_open_blob */
    8548             : 
                       /*
                        * Completion callback of blob_load() issued by bs_open_blob().
                        *
                        * cb_arg is the freshly allocated blob. On load failure the blob is freed and
                        * the sequence finishes with a NULL handle. On success the blob is registered
                        * as open, unless a blob with the same id is already registered, in which
                        * case the new copy is dropped and the existing blob is returned instead.
                        */
    8549             : static void
    8550        4341 : bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    8551             : {
    8552        4341 :         struct spdk_blob *blob = cb_arg;
    8553             :         struct spdk_blob *existing;
    8554             : 
    8555        4341 :         if (bserrno != 0) {
    8556          82 :                 blob_free(blob);
    8557          82 :                 seq->cpl.u.blob_handle.blob = NULL;
    8558          82 :                 bs_sequence_finish(seq, bserrno);
    8559          82 :                 return;
    8560             :         }
    8561             : 
                               /* Presumably another open of the same id completed while this one
                                * was still loading; hand out the already registered blob. */
    8562        4259 :         existing = blob_lookup(blob->bs, blob->id);
    8563        4259 :         if (existing) {
    8564           5 :                 blob_free(blob);
    8565           5 :                 existing->open_ref++;
    8566           5 :                 seq->cpl.u.blob_handle.blob = existing;
    8567           5 :                 bs_sequence_finish(seq, 0);
    8568           5 :                 return;
    8569             :         }
    8570             : 
    8571        4254 :         blob->open_ref++;
    8572             : 
                               /* Register the blob so later blob_lookup() calls find it. */
    8573        4254 :         spdk_bit_array_set(blob->bs->open_blobids, blob->id);
    8574        4254 :         RB_INSERT(spdk_blob_tree, &blob->bs->open_blobs, blob);
    8575             : 
    8576        4254 :         bs_sequence_finish(seq, bserrno);
    8577             : }
    8578             : 
                       /*
                        * Copy caller supplied open options into a local structure.
                        *
                        * A field is copied only when it lies completely within the first
                        * src->opts_size bytes of the structure; other fields of dst are left
                        * untouched. dst->opts_size is set to src->opts_size.
                        */
    8579             : static inline void
    8580           5 : blob_open_opts_copy(const struct spdk_blob_open_opts *src, struct spdk_blob_open_opts *dst)
    8581             : {
                       /* True when 'field' ends at or before src->opts_size. */
    8582             : #define FIELD_OK(field) \
    8583             :         offsetof(struct spdk_blob_open_opts, field) + sizeof(src->field) <= src->opts_size
    8584             : 
                       /* Copy 'field' from src to dst if the caller's structure contains it. */
    8585             : #define SET_FIELD(field) \
    8586             :         if (FIELD_OK(field)) { \
    8587             :                 dst->field = src->field; \
    8588             :         } \
    8589             : 
    8590           5 :         SET_FIELD(clear_method);
    8591           5 :         SET_FIELD(esnap_ctx);
    8592             : 
    8593           5 :         dst->opts_size = src->opts_size;
    8594             : 
    8595             :         /* You should not remove this statement, but need to update the assert statement
    8596             :          * if you add a new field, and also add a corresponding SET_FIELD statement */
    8597             :         SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_open_opts) == 24, "Incorrect size");
    8598             : 
    8599             : #undef FIELD_OK
    8600             : #undef SET_FIELD
    8601           5 : }
    8602             : 
                       /*
                        * Common implementation of spdk_bs_open_blob() and spdk_bs_open_blob_ext().
                        *
                        * \param bs      Blobstore; must be called on its metadata thread (asserted).
                        * \param blobid  Id of the blob to open.
                        * \param opts    Optional open options, may be NULL for defaults.
                        * \param cb_fn   Called with (cb_arg, blob, bserrno).
                        * \param cb_arg  Passed through to cb_fn.
                        *
                        * cb_fn is invoked directly from this function with -ENOENT for an unused
                        * blob id, with -ENOMEM on allocation failure, and with the existing handle
                        * when the blob is already open. Otherwise it is invoked when the sequence
                        * started here for the metadata load finishes.
                        */
    8603             : static void
    8604        5353 : bs_open_blob(struct spdk_blob_store *bs,
    8605             :              spdk_blob_id blobid,
    8606             :              struct spdk_blob_open_opts *opts,
    8607             :              spdk_blob_op_with_handle_complete cb_fn,
    8608             :              void *cb_arg)
    8609             : {
    8610             :         struct spdk_blob                *blob;
    8611             :         struct spdk_bs_cpl              cpl;
    8612             :         struct spdk_blob_open_opts      opts_local;
    8613             :         spdk_bs_sequence_t              *seq;
    8614             :         uint32_t                        page_num;
    8615             : 
    8616        5353 :         SPDK_DEBUGLOG(blob, "Opening blob 0x%" PRIx64 "\n", blobid);
    8617        5353 :         assert(spdk_get_thread() == bs->md_thread);
    8618             : 
    8619        5353 :         page_num = bs_blobid_to_page(blobid);
    8620        5353 :         if (spdk_bit_array_get(bs->used_blobids, page_num) == false) {
    8621             :                 /* Invalid blobid */
    8622          60 :                 cb_fn(cb_arg, NULL, -ENOENT);
    8623        1012 :                 return;
    8624             :         }
    8625             : 
                               /* Already open: take another reference. Note that opts is not
                                * consulted on this path. */
    8626        5293 :         blob = blob_lookup(bs, blobid);
    8627        5293 :         if (blob) {
    8628         952 :                 blob->open_ref++;
    8629         952 :                 cb_fn(cb_arg, blob, 0);
    8630         952 :                 return;
    8631             :         }
    8632             : 
    8633        4341 :         blob = blob_alloc(bs, blobid);
    8634        4341 :         if (!blob) {
    8635           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    8636           0 :                 return;
    8637             :         }
    8638             : 
                               /* Start from defaults, then overlay whatever the caller provided. */
    8639        4341 :         spdk_blob_open_opts_init(&opts_local, sizeof(opts_local));
    8640        4341 :         if (opts) {
    8641           5 :                 blob_open_opts_copy(opts, &opts_local);
    8642             :         }
    8643             : 
    8644        4341 :         blob->clear_method = opts_local.clear_method;
    8645             : 
    8646        4341 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE;
    8647        4341 :         cpl.u.blob_handle.cb_fn = cb_fn;
    8648        4341 :         cpl.u.blob_handle.cb_arg = cb_arg;
    8649        4341 :         cpl.u.blob_handle.blob = blob;
    8650        4341 :         cpl.u.blob_handle.esnap_ctx = opts_local.esnap_ctx;
    8651             : 
    8652        4341 :         seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    8653        4341 :         if (!seq) {
    8654           0 :                 blob_free(blob);
    8655           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    8656           0 :                 return;
    8657             :         }
    8658             : 
                               /* The blob is registered as open in bs_open_blob_cpl(). */
    8659        4341 :         blob_load(seq, blob, bs_open_blob_cpl, blob);
    8660             : }
    8661             : 
                       /* Open a blob with default open options (opts == NULL); see bs_open_blob(). */
    8662             : void
    8663        5348 : spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
    8664             :                   spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
    8665             : {
    8666        5348 :         bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg);
    8667        5348 : }
    8668             : 
                       /* Open a blob with caller supplied open options; see bs_open_blob(). */
    8669             : void
    8670           5 : spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid,
    8671             :                       struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
    8672             : {
    8673           5 :         bs_open_blob(bs, blobid, opts, cb_fn, cb_arg);
    8674           5 : }
    8675             : 
    8676             : /* END spdk_bs_open_blob */
    8677             : 
    8678             : /* START spdk_blob_set_read_only */
                       /*
                        * Flag the blob as read-only in its in-memory metadata.
                        *
                        * Only sets SPDK_BLOB_READ_ONLY in data_ro_flags and marks the blob dirty.
                        * The data_ro / md_ro fields themselves are set by blob_sync_md_cpl() once a
                        * metadata sync has completed successfully.
                        *
                        * \return Always 0.
                        */
    8679             : int
    8680         296 : spdk_blob_set_read_only(struct spdk_blob *blob)
    8681             : {
    8682         296 :         blob_verify_md_op(blob);
    8683             : 
    8684         296 :         blob->data_ro_flags |= SPDK_BLOB_READ_ONLY;
    8685             : 
    8686         296 :         blob->state = SPDK_BLOB_STATE_DIRTY;
    8687         296 :         return 0;
    8688             : }
    8689             : /* END spdk_blob_set_read_only */
    8690             : 
    8691             : /* START spdk_blob_sync_md */
    8692             : 
    /* Completion of the blob_persist() started by blob_sync_md().
     * cb_arg is the blob that was persisted. The sequence is always finished
     * with the bserrno that was passed in. */
    8693             : static void
    8694        1927 : blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    8695             : {
    8696        1927 :         struct spdk_blob *blob = cb_arg;
    8697             : 
                               /* The read-only request only takes effect in memory after the
                                * flag was persisted successfully. */
    8698        1927 :         if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
    8699         497 :                 blob->data_ro = true;
    8700         497 :                 blob->md_ro = true;
    8701             :         }
    8702             : 
    8703        1927 :         bs_sequence_finish(seq, bserrno);
    8704        1927 : }
    8705             : 
    /* Internal metadata sync: persist the blob through a new sequence on the
     * blobstore's md_channel and report the result to cb_fn(cb_arg, bserrno).
     * Unlike spdk_blob_sync_md(), this helper performs no blob_verify_md_op()
     * call and no md_ro check of its own.
     * If the sequence cannot be started, cb_fn is called directly with -ENOMEM. */
    8706             : static void
    8707        1927 : blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
    8708             : {
    8709             :         struct spdk_bs_cpl      cpl;
    8710             :         spdk_bs_sequence_t      *seq;
    8711             : 
    8712        1927 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    8713        1927 :         cpl.u.blob_basic.cb_fn = cb_fn;
    8714        1927 :         cpl.u.blob_basic.cb_arg = cb_arg;
    8715             : 
    8716        1927 :         seq = bs_sequence_start_bs(blob->bs->md_channel, &cpl);
    8717        1927 :         if (!seq) {
    8718           0 :                 cb_fn(cb_arg, -ENOMEM);
    8719           0 :                 return;
    8720             :         }
    8721             : 
    8722        1927 :         blob_persist(seq, blob, blob_sync_md_cpl, blob);
    8723             : }
    8724             : 
    /* Public metadata sync for a blob.
     * Calls blob_verify_md_op(), then either completes immediately with 0 when
     * the blob's metadata is read-only, or hands off to blob_sync_md(). */
    8725             : void
    8726        1370 : spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
    8727             : {
    8728        1370 :         blob_verify_md_op(blob);
    8729             : 
    8730        1370 :         SPDK_DEBUGLOG(blob, "Syncing blob 0x%" PRIx64 "\n", blob->id);
    8731             : 
                               /* Read-only metadata is expected to be clean already, so there is
                                * nothing to persist; cb_fn is invoked synchronously. */
    8732        1370 :         if (blob->md_ro) {
    8733           5 :                 assert(blob->state == SPDK_BLOB_STATE_CLEAN);
    8734           5 :                 cb_fn(cb_arg, 0);
    8735           5 :                 return;
    8736             :         }
    8737             : 
    8738        1365 :         blob_sync_md(blob, cb_fn, cb_arg);
    8739             : }
    8740             : 
    8741             : /* END spdk_blob_sync_md */
    8742             : 
    /* Context carried from the requesting thread to the metadata thread and
     * back for a cluster insert or free operation.
     * Allocated with calloc() in blob_insert_cluster_on_md_thread() and
     * blob_free_cluster_on_md_thread(); freed in blob_op_cluster_msg_cpl().
     * NOTE(review): `cluster` is uint32_t, but blob_insert_cluster_on_md_thread()
     * assigns it from a uint64_t argument - confirm the value always fits. */
    8743             : struct spdk_blob_cluster_op_ctx {
    8744             :         struct spdk_thread      *thread;        /* thread that issued the request; receives the completion */
    8745             :         struct spdk_blob        *blob;
    8746             :         uint32_t                cluster_num;    /* cluster index in blob */
    8747             :         uint32_t                cluster;        /* cluster on disk */
    8748             :         uint32_t                extent_page;    /* extent page on disk */
    8749             :         struct spdk_blob_md_page *page; /* preallocated extent page */
    8750             :         int                     rc;             /* result handed to cb_fn */
    8751             :         spdk_blob_op_complete   cb_fn;
    8752             :         void                    *cb_arg;
    8753             : };
    8754             : 
    /* Final step of a cluster operation: report ctx->rc to the user callback
     * and free the context. Scheduled via spdk_thread_send_msg() to ctx->thread. */
    8755             : static void
    8756        1105 : blob_op_cluster_msg_cpl(void *arg)
    8757             : {
    8758        1105 :         struct spdk_blob_cluster_op_ctx *ctx = arg;
    8759             : 
    8760        1105 :         ctx->cb_fn(ctx->cb_arg, ctx->rc);
    8761        1105 :         free(ctx);
    8762        1105 : }
    8763             : 
    /* Record the operation result in the context and send
     * blob_op_cluster_msg_cpl() to the thread stored in ctx->thread. */
    8764             : static void
    8765        1061 : blob_op_cluster_msg_cb(void *arg, int bserrno)
    8766             : {
    8767        1061 :         struct spdk_blob_cluster_op_ctx *ctx = arg;
    8768             : 
    8769        1061 :         ctx->rc = bserrno;
    8770        1061 :         spdk_thread_send_msg(ctx->thread, blob_op_cluster_msg_cpl, ctx);
    8771        1061 : }
    8772             : 
    /* Callback passed to blob_write_extent_page() when a new extent page is
     * written for a cluster insert. Stores ctx->extent_page in the slot returned
     * by bs_cluster_to_extent_page(), marks the blob dirty and syncs metadata;
     * the result of that sync is reported via blob_op_cluster_msg_cb().
     * NOTE(review): bserrno is not examined, so the slot is updated and the
     * sync is issued even if the extent page write reported an error - confirm
     * this is intended. */
    8773             : static void
    8774         126 : blob_insert_new_ep_cb(void *arg, int bserrno)
    8775             : {
    8776         126 :         struct spdk_blob_cluster_op_ctx *ctx = arg;
    8777             :         uint32_t *extent_page;
    8778             : 
    8779         126 :         extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);
    8780         126 :         *extent_page = ctx->extent_page;
    8781         126 :         ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
    8782         126 :         blob_sync_md(ctx->blob, blob_op_cluster_msg_cb, ctx);
    8783         126 : }
    8784             : 
    /* Context for a single extent page write started by blob_write_extent_page().
     * Freed in blob_persist_extent_page_cpl(). */
    8785             : struct spdk_blob_write_extent_page_ctx {
    8786             :         struct spdk_blob_store          *bs;
    8787             : 
    8788             :         uint32_t                        extent;         /* md page index the page is written to */
    8789             :         struct spdk_blob_md_page        *page;          /* buffer that is written */
    8790             : };
    8791             : 
    /* Last metadata-thread step of a cluster free on the extent-table path:
     * release ctx->cluster under used_lock, then send the completion (carrying
     * bserrno) to the requesting thread.
     * NOTE(review): the cluster is released regardless of bserrno - confirm
     * that releasing after a failed metadata update is intended. */
    8792             : static void
    8793          39 : blob_free_cluster_msg_cb(void *arg, int bserrno)
    8794             : {
    8795          39 :         struct spdk_blob_cluster_op_ctx *ctx = arg;
    8796             : 
    8797          39 :         spdk_spin_lock(&ctx->blob->bs->used_lock);
    8798          39 :         bs_release_cluster(ctx->blob->bs, ctx->cluster);
    8799          39 :         spdk_spin_unlock(&ctx->blob->bs->used_lock);
    8800             : 
    8801          39 :         ctx->rc = bserrno;
    8802          39 :         spdk_thread_send_msg(ctx->thread, blob_op_cluster_msg_cpl, ctx);
    8803          39 : }
    8804             : 
    /* Callback after an extent page that still holds other clusters has been
     * rewritten during a cluster free. */
    8805             : static void
    8806          39 : blob_free_cluster_update_ep_cb(void *arg, int bserrno)
    8807             : {
    8808          39 :         struct spdk_blob_cluster_op_ctx *ctx = arg;
    8809             : 
                               /* On error, or when bs->clean is 0, skip the metadata sync and
                                * finish the free right away. */
    8810          39 :         if (bserrno != 0 || ctx->blob->bs->clean == 0) {
    8811          39 :                 blob_free_cluster_msg_cb(ctx, bserrno);
    8812          39 :                 return;
    8813             :         }
    8814             : 
    8815           0 :         ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
    8816           0 :         blob_sync_md(ctx->blob, blob_free_cluster_msg_cb, ctx);
    8817             : }
    8818             : 
    /* Callback used when the freed cluster was the last one in its extent page:
     * release the extent page's md page under used_lock, mark the blob dirty and
     * sync metadata, finishing in blob_free_cluster_msg_cb().
     * NOTE(review): bserrno from the extent page write is not examined here -
     * confirm this is intended. */
    8819             : static void
    8820           0 : blob_free_cluster_free_ep_cb(void *arg, int bserrno)
    8821             : {
    8822           0 :         struct spdk_blob_cluster_op_ctx *ctx = arg;
    8823             : 
    8824           0 :         spdk_spin_lock(&ctx->blob->bs->used_lock);
    8825           0 :         assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
    8826           0 :         bs_release_md_page(ctx->blob->bs, ctx->extent_page);
    8827           0 :         spdk_spin_unlock(&ctx->blob->bs->used_lock);
    8828           0 :         ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
    8829           0 :         blob_sync_md(ctx->blob, blob_free_cluster_msg_cb, ctx);
    8830           0 : }
    8831             : 
    /* End of an extent page write: free the write context and finish the
     * sequence with bserrno. */
    8832             : static void
    8833         657 : blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    8834             : {
    8835         657 :         struct spdk_blob_write_extent_page_ctx *ctx = cb_arg;
    8836             : 
    8837         657 :         free(ctx);
    8838         657 :         bs_sequence_finish(seq, bserrno);
    8839         657 : }
    8840             : 
    /* Callback passed to bs_mark_dirty() by blob_write_extent_page().
     * On error the sequence is finished immediately; otherwise ctx->page is
     * written to the LBA of md page ctx->extent. */
    8841             : static void
    8842         657 : blob_write_extent_page_ready(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    8843             : {
    8844         657 :         struct spdk_blob_write_extent_page_ctx *ctx = cb_arg;
    8845             : 
    8846         657 :         if (bserrno != 0) {
    8847           0 :                 blob_persist_extent_page_cpl(seq, ctx, bserrno);
    8848           0 :                 return;
    8849             :         }
                               /* Write length is one md page, converted from bytes to LBAs. */
    8850         657 :         bs_sequence_write_dev(seq, ctx->page, bs_md_page_to_lba(ctx->bs, ctx->extent),
    8851         657 :                               bs_byte_to_lba(ctx->bs, ctx->bs->md_page_size),
    8852             :                               blob_persist_extent_page_cpl, ctx);
    8853             : }
    8854             : 
    /* Serialize and write one extent page of a blob.
     *   extent      - md page index the page is written to; asserted to be
     *                 marked in used_md_pages
     *   cluster_num - passed to blob_serialize_extent_page()
     *   page        - caller-provided buffer; asserted non-NULL and overwritten
     *   cb_fn       - called with -ENOMEM if the context or the sequence cannot
     *                 be allocated, otherwise with the sequence result */
    8855             : static void
    8856         657 : blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
    8857             :                        struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg)
    8858             : {
    8859             :         struct spdk_blob_write_extent_page_ctx  *ctx;
    8860             :         spdk_bs_sequence_t                      *seq;
    8861             :         struct spdk_bs_cpl                      cpl;
    8862             : 
    8863         657 :         ctx = calloc(1, sizeof(*ctx));
    8864         657 :         if (!ctx) {
    8865           0 :                 cb_fn(cb_arg, -ENOMEM);
    8866           0 :                 return;
    8867             :         }
    8868         657 :         ctx->bs = blob->bs;
    8869         657 :         ctx->extent = extent;
    8870         657 :         ctx->page = page;
    8871             : 
    8872         657 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    8873         657 :         cpl.u.blob_basic.cb_fn = cb_fn;
    8874         657 :         cpl.u.blob_basic.cb_arg = cb_arg;
    8875             : 
    8876         657 :         seq = bs_sequence_start_bs(blob->bs->md_channel, &cpl);
    8877         657 :         if (!seq) {
    8878           0 :                 free(ctx);
    8879           0 :                 cb_fn(cb_arg, -ENOMEM);
    8880           0 :                 return;
    8881             :         }
    8882             : 
                               /* Fill the page header before serializing the extent entries. */
    8883         657 :         assert(page);
    8884         657 :         page->next = SPDK_INVALID_MD_PAGE;
    8885         657 :         page->id = blob->id;
    8886         657 :         page->sequence_num = 0;
    8887             : 
    8888         657 :         blob_serialize_extent_page(blob, cluster_num, page);
    8889             : 
                               /* The CRC is computed last, after the page content is final. */
    8890         657 :         page->crc = blob_md_page_calc_crc(page);
    8891             : 
    8892         657 :         assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true);
    8893             : 
                               /* The device write itself is issued from blob_write_extent_page_ready(). */
    8894         657 :         bs_mark_dirty(seq, blob->bs, blob_write_extent_page_ready, ctx);
    8895             : }
    8896             : 
    /* Handler sent to the metadata thread by blob_insert_cluster_on_md_thread().
     * Inserts the cluster into the blob, then persists the change either through
     * a plain metadata sync (no extent table) or through an extent page write.
     * Every path ends by sending blob_op_cluster_msg_cpl() to ctx->thread. */
    8897             : static void
    8898        1030 : blob_insert_cluster_msg(void *arg)
    8899             : {
    8900        1030 :         struct spdk_blob_cluster_op_ctx *ctx = arg;
    8901             :         uint32_t *extent_page;
    8902             : 
    8903        1030 :         ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster);
    8904        1030 :         if (ctx->rc != 0) {
    8905           5 :                 spdk_thread_send_msg(ctx->thread, blob_op_cluster_msg_cpl, ctx);
    8906           5 :                 return;
    8907             :         }
    8908             : 
    8909        1025 :         if (ctx->blob->use_extent_table == false) {
    8910             :                 /* Extent table is not used, proceed with sync of md that will only use extents_rle. */
    8911         410 :                 ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
    8912         410 :                 blob_sync_md(ctx->blob, blob_op_cluster_msg_cb, ctx);
    8913         410 :                 return;
    8914             :         }
    8915             : 
    8916         615 :         extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);
    8917         615 :         if (*extent_page == 0) {
    8918             :                 /* Extent page requires allocation.
    8919             :                  * It was already claimed in the used_md_pages map and placed in ctx. */
    8920         126 :                 assert(ctx->extent_page != 0);
    8921         126 :                 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
    8922         126 :                 blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, ctx->page,
    8923             :                                        blob_insert_new_ep_cb, ctx);
    8924             :         } else {
    8925             :                 /* It is possible for the original thread to allocate an extent page for
    8926             :                  * a different cluster in the same extent page. In such a case proceed with
    8927             :                  * updating the existing extent page, but release the additional one. */
    8928         489 :                 if (ctx->extent_page != 0) {
    8929           0 :                         spdk_spin_lock(&ctx->blob->bs->used_lock);
    8930           0 :                         assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
    8931           0 :                         bs_release_md_page(ctx->blob->bs, ctx->extent_page);
    8932           0 :                         spdk_spin_unlock(&ctx->blob->bs->used_lock);
    8933           0 :                         ctx->extent_page = 0;
    8934             :                 }
    8935             :                 /* Extent page already allocated.
    8936             :                  * Every cluster allocation requires just an update of a single extent page. */
    8937         489 :                 blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, ctx->page,
    8938             :                                        blob_op_cluster_msg_cb, ctx);
    8939             :         }
    8940             : }
    8941             : 
    /* Queue a cluster insert on the blobstore's metadata thread.
     * Captures the calling thread (spdk_get_thread()) so the completion can be
     * sent back to it, then sends blob_insert_cluster_msg() to bs->md_thread.
     * If the context cannot be allocated, cb_fn is called directly with -ENOMEM. */
    8942             : static void
    8943        1030 : blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
    8944             :                                  uint64_t cluster, uint32_t extent_page, struct spdk_blob_md_page *page,
    8945             :                                  spdk_blob_op_complete cb_fn, void *cb_arg)
    8946             : {
    8947             :         struct spdk_blob_cluster_op_ctx *ctx;
    8948             : 
    8949        1030 :         ctx = calloc(1, sizeof(*ctx));
    8950        1030 :         if (ctx == NULL) {
    8951           0 :                 cb_fn(cb_arg, -ENOMEM);
    8952           0 :                 return;
    8953             :         }
    8954             : 
    8955        1030 :         ctx->thread = spdk_get_thread();
    8956        1030 :         ctx->blob = blob;
    8957        1030 :         ctx->cluster_num = cluster_num;
                               /* NOTE(review): uint64_t argument stored in a uint32_t field. */
    8958        1030 :         ctx->cluster = cluster;
    8959        1030 :         ctx->extent_page = extent_page;
    8960        1030 :         ctx->page = page;
    8961        1030 :         ctx->cb_fn = cb_fn;
    8962        1030 :         ctx->cb_arg = cb_arg;
    8963             : 
    8964        1030 :         spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx);
    8965             : }
    8966             : 
    /* Handler sent to the metadata thread by blob_free_cluster_on_md_thread().
     * Clears the blob's entry for ctx->cluster_num and persists the change.
     * Without an extent table the cluster is released here, before the metadata
     * sync; with an extent table it is released later, in
     * blob_free_cluster_msg_cb(), after the extent page write. */
    8967             : static void
    8968          75 : blob_free_cluster_msg(void *arg)
    8969             : {
    8970          75 :         struct spdk_blob_cluster_op_ctx *ctx = arg;
    8971             :         uint32_t *extent_page;
    8972             :         uint32_t start_cluster_idx;
    8973          75 :         bool free_extent_page = true;
    8974             :         size_t i;
    8975             : 
    8976          75 :         ctx->cluster = bs_lba_to_cluster(ctx->blob->bs, ctx->blob->active.clusters[ctx->cluster_num]);
    8977             : 
    8978             :         /* There were concurrent unmaps to the same cluster, only release the cluster on the first one */
    8979          75 :         if (ctx->cluster == 0) {
    8980          10 :                 blob_op_cluster_msg_cb(ctx, 0);
    8981          10 :                 return;
    8982             :         }
    8983             : 
    8984          65 :         ctx->blob->active.clusters[ctx->cluster_num] = 0;
                               /* NOTE(review): this check is always true here, because the
                                * ctx->cluster == 0 case returned above. */
    8985          65 :         if (ctx->cluster != 0) {
    8986          65 :                 ctx->blob->active.num_allocated_clusters--;
    8987             :         }
    8988             : 
    8989          65 :         if (ctx->blob->use_extent_table == false) {
    8990             :                 /* Extent table is not used, proceed with sync of md that will only use extents_rle. */
    8991          26 :                 spdk_spin_lock(&ctx->blob->bs->used_lock);
    8992          26 :                 bs_release_cluster(ctx->blob->bs, ctx->cluster);
    8993          26 :                 spdk_spin_unlock(&ctx->blob->bs->used_lock);
    8994          26 :                 ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
    8995          26 :                 blob_sync_md(ctx->blob, blob_op_cluster_msg_cb, ctx);
    8996          26 :                 return;
    8997             :         }
    8998             : 
    8999          39 :         extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);
    9000             : 
    9001             :         /* There shouldn't be parallel release operations on same cluster */
    9002          39 :         assert(*extent_page == ctx->extent_page);
    9003             : 
                               /* Scan every cluster slot covered by this extent page; the page can
                                * be freed only if all of them are now zero.
                                * NOTE(review): the loop reads SPDK_EXTENTS_PER_EP entries starting at
                                * start_cluster_idx without a bound check visible here - confirm that
                                * active.clusters always covers the whole range. */
    9004          39 :         start_cluster_idx = (ctx->cluster_num / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;
    9005          72 :         for (i = 0; i < SPDK_EXTENTS_PER_EP; ++i) {
    9006          72 :                 if (ctx->blob->active.clusters[start_cluster_idx + i] != 0) {
    9007          39 :                         free_extent_page = false;
    9008          39 :                         break;
    9009             :                 }
    9010             :         }
    9011             : 
    9012          39 :         if (free_extent_page) {
    9013           0 :                 assert(ctx->extent_page != 0);
    9014           0 :                 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
    9015           0 :                 ctx->blob->active.extent_pages[bs_cluster_to_extent_table_id(ctx->cluster_num)] = 0;
    9016           0 :                 blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, ctx->page,
    9017             :                                        blob_free_cluster_free_ep_cb, ctx);
    9018             :         } else {
    9019          39 :                 blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, ctx->page,
    9020             :                                        blob_free_cluster_update_ep_cb, ctx);
    9021             :         }
    9022             : }
    9023             : 
    9024             : 
    /* Queue a cluster free on the blobstore's metadata thread.
     * Captures the calling thread for the completion and sends
     * blob_free_cluster_msg() to bs->md_thread. ctx->cluster stays zero from
     * calloc() here and is filled in by blob_free_cluster_msg().
     * If the context cannot be allocated, cb_fn is called directly with -ENOMEM. */
    9025             : static void
    9026          75 : blob_free_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, uint32_t extent_page,
    9027             :                                struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg)
    9028             : {
    9029             :         struct spdk_blob_cluster_op_ctx *ctx;
    9030             : 
    9031          75 :         ctx = calloc(1, sizeof(*ctx));
    9032          75 :         if (ctx == NULL) {
    9033           0 :                 cb_fn(cb_arg, -ENOMEM);
    9034           0 :                 return;
    9035             :         }
    9036             : 
    9037          75 :         ctx->thread = spdk_get_thread();
    9038          75 :         ctx->blob = blob;
    9039          75 :         ctx->cluster_num = cluster_num;
    9040          75 :         ctx->extent_page = extent_page;
    9041          75 :         ctx->page = page;
    9042          75 :         ctx->cb_fn = cb_fn;
    9043          75 :         ctx->cb_arg = cb_arg;
    9044             : 
    9045          75 :         spdk_thread_send_msg(blob->bs->md_thread, blob_free_cluster_msg, ctx);
    9046             : }
    9047             : 
    9048             : /* START spdk_blob_close */
    9049             : 
    /* Completion of the metadata persist issued while closing a blob.
     * On success drops one open reference; when the last reference goes away
     * the blob is removed from the open-blob tracking (unless it is a deleted
     * blob) and freed. On failure the reference count is left untouched.
     * The sequence is always finished with bserrno. */
    9050             : static void
    9051        5211 : blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    9052             : {
    9053        5211 :         struct spdk_blob *blob = cb_arg;
    9054             : 
    9055        5211 :         if (bserrno == 0) {
    9056        5211 :                 blob->open_ref--;
    9057        5211 :                 if (blob->open_ref == 0) {
    9058             :                         /*
    9059             :                          * Blobs with active.num_pages == 0 are deleted blobs.
    9060             :                          * These blobs are removed from the blob_store list
    9061             :                          * when the deletion process starts - so don't try to
    9062             :                          * remove them again.
    9063             :                          */
    9064        4254 :                         if (blob->active.num_pages > 0) {
    9065        2392 :                                 spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
    9066        2392 :                                 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob);
    9067             :                         }
    9068        4254 :                         blob_free(blob);
    9069             :                 }
    9070             :         }
    9071             : 
    9072        5211 :         bs_sequence_finish(seq, bserrno);
    9073        5211 : }
    9074             : 
    /* Callback passed to blob_esnap_destroy_bs_dev_channels() by spdk_blob_close().
     * cb_arg is the close sequence. On error the sequence is finished with that
     * error; otherwise the close continues with the metadata persist. */
    9075             : static void
    9076         140 : blob_close_esnap_done(void *cb_arg, struct spdk_blob *blob, int bserrno)
    9077             : {
    9078         140 :         spdk_bs_sequence_t      *seq = cb_arg;
    9079             : 
    9080         140 :         if (bserrno != 0) {
    9081           0 :                 SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": close failed with error %d\n",
    9082             :                               blob->id, bserrno);
    9083           0 :                 bs_sequence_finish(seq, bserrno);
    9084           0 :                 return;
    9085             :         }
    9086             : 
    9087         140 :         SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": closed, syncing metadata on thread %s\n",
    9088             :                       blob->id, spdk_thread_get_name(spdk_get_thread()));
    9089             : 
    9090             :         /* Sync metadata */
    9091         140 :         blob_persist(seq, blob, blob_close_cpl, blob);
    9092             : }
    9093             : 
    /* Close one open reference to a blob.
     * cb_fn receives -EBADF if the blob has no open references, -ENOMEM if the
     * sequence cannot be started, and otherwise the result of the metadata
     * persist. The reference itself is dropped in blob_close_cpl(). */
    9094             : void
    9095        5211 : spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
    9096             : {
    9097             :         struct spdk_bs_cpl      cpl;
    9098             :         spdk_bs_sequence_t      *seq;
    9099             : 
    9100        5211 :         blob_verify_md_op(blob);
    9101             : 
    9102        5211 :         SPDK_DEBUGLOG(blob, "Closing blob 0x%" PRIx64 "\n", blob->id);
    9103             : 
    9104        5211 :         if (blob->open_ref == 0) {
    9105           0 :                 cb_fn(cb_arg, -EBADF);
    9106         140 :                 return;
    9107             :         }
    9108             : 
    9109        5211 :         cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
    9110        5211 :         cpl.u.blob_basic.cb_fn = cb_fn;
    9111        5211 :         cpl.u.blob_basic.cb_arg = cb_arg;
    9112             : 
    9113        5211 :         seq = bs_sequence_start_bs(blob->bs->md_channel, &cpl);
    9114        5211 :         if (!seq) {
    9115           0 :                 cb_fn(cb_arg, -ENOMEM);
    9116           0 :                 return;
    9117             :         }
    9118             : 
                               /* Last reference to an esnap clone: destroy its bs_dev channels
                                * first; blob_close_esnap_done() then continues with the persist. */
    9119        5211 :         if (blob->open_ref == 1 && blob_is_esnap_clone(blob)) {
    9120         140 :                 blob_esnap_destroy_bs_dev_channels(blob, false, blob_close_esnap_done, seq);
    9121         140 :                 return;
    9122             :         }
    9123             : 
    9124             :         /* Sync metadata */
    9125        5071 :         blob_persist(seq, blob, blob_close_cpl, blob);
    9126             : }
    9127             : 
    9128             : /* END spdk_blob_close */
    9129             : 
    /* Return an I/O channel for the blobstore, obtained by passing bs to
     * spdk_get_io_channel(). */
    9130         276 : struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs)
    9131             : {
    9132         276 :         return spdk_get_io_channel(bs);
    9133             : }
    9134             : 
    /* Release a blobstore I/O channel: first calls
     * blob_esnap_destroy_bs_channel() on the channel's context, then drops the
     * channel with spdk_put_io_channel(). */
    9135             : void
    9136         276 : spdk_bs_free_io_channel(struct spdk_io_channel *channel)
    9137             : {
    9138         276 :         blob_esnap_destroy_bs_channel(spdk_io_channel_get_ctx(channel));
    9139         276 :         spdk_put_io_channel(channel);
    9140         276 : }
    9141             : 
    /* Submit an unmap request for the given offset/length of a blob.
     * Forwards to blob_request_submit_op() with a NULL payload and
     * SPDK_BLOB_UNMAP. */
    9142             : void
    9143         140 : spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel,
    9144             :                    uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
    9145             : {
    9146         140 :         blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
    9147             :                                SPDK_BLOB_UNMAP);
    9148         140 : }
    9149             : 
    /* Submit a write-zeroes request for the given offset/length of a blob.
     * Forwards to blob_request_submit_op() with a NULL payload and
     * SPDK_BLOB_WRITE_ZEROES. */
    9150             : void
    9151          60 : spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel,
    9152             :                           uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
    9153             : {
    9154          60 :         blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
    9155             :                                SPDK_BLOB_WRITE_ZEROES);
    9156          60 : }
    9157             : 
    /* Submit a write of a single contiguous payload buffer to a blob.
     * Forwards to blob_request_submit_op() with SPDK_BLOB_WRITE. */
    9158             : void
    9159       27349 : spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel,
    9160             :                    void *payload, uint64_t offset, uint64_t length,
    9161             :                    spdk_blob_op_complete cb_fn, void *cb_arg)
    9162             : {
    9163       27349 :         blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
    9164             :                                SPDK_BLOB_WRITE);
    9165       27349 : }
    9166             : 
    /* Submit a read from a blob into a single contiguous payload buffer.
     * Forwards to blob_request_submit_op() with SPDK_BLOB_READ. */
    9167             : void
    9168       26143 : spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel,
    9169             :                   void *payload, uint64_t offset, uint64_t length,
    9170             :                   spdk_blob_op_complete cb_fn, void *cb_arg)
    9171             : {
    9172       26143 :         blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
    9173             :                                SPDK_BLOB_READ);
    9174       26143 : }
    9175             : 
    /* Submit a vectored write to a blob.
     * Forwards to blob_request_submit_rw_iov() with `false` as the boolean
     * argument (presumably the "read" flag - the readv wrappers pass `true`)
     * and NULL ext I/O options. */
    9176             : void
    9177         175 : spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel,
    9178             :                     struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
    9179             :                     spdk_blob_op_complete cb_fn, void *cb_arg)
    9180             : {
    9181         175 :         blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, NULL);
    9182         175 : }
    9183             : 
    /* Submit a vectored read from a blob.
     * Forwards to blob_request_submit_rw_iov() with `true` as the boolean
     * argument (presumably the "read" flag - the writev wrappers pass `false`)
     * and NULL ext I/O options. */
    9184             : void
    9185        1655 : spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel,
    9186             :                    struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
    9187             :                    spdk_blob_op_complete cb_fn, void *cb_arg)
    9188             : {
    9189        1655 :         blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true, NULL);
    9190        1655 : }
    9191             : 
    /* Submit a vectored write to a blob with caller-supplied ext I/O options.
     * Same as spdk_blob_io_writev(), except that io_opts is forwarded to
     * blob_request_submit_rw_iov() instead of NULL. */
    9192             : void
    9193         260 : spdk_blob_io_writev_ext(struct spdk_blob *blob, struct spdk_io_channel *channel,
    9194             :                         struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
    9195             :                         spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts)
    9196             : {
    9197         260 :         blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false,
    9198             :                                    io_opts);
    9199         260 : }
    9200             : 
    /* Submit a vectored read from a blob with caller-supplied ext I/O options.
     * Same as spdk_blob_io_readv(), except that io_opts is forwarded to
     * blob_request_submit_rw_iov() instead of NULL. */
    9201             : void
    9202        2105 : spdk_blob_io_readv_ext(struct spdk_blob *blob, struct spdk_io_channel *channel,
    9203             :                        struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
    9204             :                        spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts)
    9205             : {
    9206        2105 :         blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true,
    9207             :                                    io_opts);
    9208        2105 : }
    9209             : 
    /* State of one blob iteration step (spdk_bs_iter_first / spdk_bs_iter_next).
     * Allocated per call and freed in bs_iter_cpl() when the user callback is
     * invoked. */
    9210             : struct spdk_bs_iter_ctx {
                               /* Current position in bs->used_blobids; signed so that
                                * spdk_bs_iter_first() can start at -1. */
    9211             :         int64_t page_num;
    9212             :         struct spdk_blob_store *bs;
    9213             : 
    9214             :         spdk_blob_op_with_handle_complete cb_fn;
    9215             :         void *cb_arg;
    9216             : };
    9217             : 
    /* Iteration step and open-completion callback in one.
     * With bserrno == 0 the opened blob is delivered to the user callback and
     * the context is freed. With any non-zero bserrno - the -1 passed by
     * spdk_bs_iter_first() and bs_iter_close_cpl(), or a failed open - the
     * search advances to the next set bit in bs->used_blobids and that blob is
     * opened with this function as its callback. When no further bit is found
     * the user callback receives (NULL, -ENOENT). */
    9218             : static void
    9219        1460 : bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
    9220             : {
    9221        1460 :         struct spdk_bs_iter_ctx *ctx = cb_arg;
    9222        1460 :         struct spdk_blob_store *bs = ctx->bs;
    9223             :         spdk_blob_id id;
    9224             : 
    9225        1460 :         if (bserrno == 0) {
    9226         557 :                 ctx->cb_fn(ctx->cb_arg, _blob, bserrno);
    9227         557 :                 free(ctx);
    9228         557 :                 return;
    9229             :         }
    9230             : 
    9231         903 :         ctx->page_num++;
    9232         903 :         ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num);
                               /* A result at or beyond the array capacity means no blob id is left. */
    9233         903 :         if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) {
    9234         336 :                 ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT);
    9235         336 :                 free(ctx);
    9236         336 :                 return;
    9237             :         }
    9238             : 
    9239         567 :         id = bs_page_to_blobid(ctx->page_num);
    9240             : 
    9241         567 :         spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx);
    9242             : }
    9243             : 
                       /*
                        * Start iterating over the blobs in bs. cb_fn receives the first blob that
                        * could be opened, or NULL with -ENOENT when there is none (see bs_iter_cpl).
                        * If the iteration context cannot be allocated, cb_fn is called right away
                        * with NULL and -ENOMEM.
                        */
    9244             : void
    9245         366 : spdk_bs_iter_first(struct spdk_blob_store *bs,
    9246             :                    spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
    9247             : {
    9248             :         struct spdk_bs_iter_ctx *ctx;
    9249             : 
    9250         366 :         ctx = calloc(1, sizeof(*ctx));
    9251         366 :         if (!ctx) {
    9252           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    9253           0 :                 return;
    9254             :         }
    9255             : 
                               /* -1 so that the increment in bs_iter_cpl starts the search at index 0. */
    9256         366 :         ctx->page_num = -1;
    9257         366 :         ctx->bs = bs;
    9258         366 :         ctx->cb_fn = cb_fn;
    9259         366 :         ctx->cb_arg = cb_arg;
    9260             : 
                               /* A non-zero status sends bs_iter_cpl down its "find next blob" branch. */
    9261         366 :         bs_iter_cpl(ctx, NULL, -1);
    9262             : }
    9263             : 
                       /*
                        * Completion of the spdk_blob_close() issued by spdk_bs_iter_next. The close
                        * status (bserrno) is not examined; iteration always continues with the next
                        * blob.
                        */
    9264             : static void
    9265         527 : bs_iter_close_cpl(void *cb_arg, int bserrno)
    9266             : {
    9267         527 :         struct spdk_bs_iter_ctx *ctx = cb_arg;
    9268             : 
    9269         527 :         bs_iter_cpl(ctx, NULL, -1);
    9270         527 : }
    9271             : 
                       /*
                        * Continue an iteration: close blob (the handle delivered by the previous
                        * step) and deliver the next openable blob to cb_fn, or NULL with -ENOENT when
                        * none remains. blob must not be NULL.
                        *
                        * NOTE(review): when the context allocation fails, cb_fn is called with
                        * -ENOMEM and blob is NOT closed, so the caller still holds the open handle.
                        */
    9272             : void
    9273         527 : spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob,
    9274             :                   spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
    9275             : {
    9276             :         struct spdk_bs_iter_ctx *ctx;
    9277             : 
    9278         527 :         assert(blob != NULL);
    9279             : 
    9280         527 :         ctx = calloc(1, sizeof(*ctx));
    9281         527 :         if (!ctx) {
    9282           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
    9283           0 :                 return;
    9284             :         }
    9285             : 
                               /* Record the position now; blob is no longer usable once it is closed. */
    9286         527 :         ctx->page_num = bs_blobid_to_page(blob->id);
    9287         527 :         ctx->bs = bs;
    9288         527 :         ctx->cb_fn = cb_fn;
    9289         527 :         ctx->cb_arg = cb_arg;
    9290             : 
    9291             :         /* Close the existing blob */
    9292         527 :         spdk_blob_close(blob, bs_iter_close_cpl, ctx);
    9293             : }
    9294             : 
                       /*
                        * Create or overwrite the xattr called name on blob, in the public list or,
                        * when internal is true, in the internal list. The value bytes are copied.
                        *
                        * Returns 0 on success and marks the blob dirty; -EPERM when blob->md_ro is
                        * set; -ENOMEM when the descriptor would exceed SPDK_BS_MAX_DESC_SIZE or an
                        * allocation fails. On failure the xattr lists and invalid_flags are left
                        * as they were.
                        *
                        * NOTE(review): with value_len == 0, malloc(0) may legally return NULL, which
                        * would be reported as -ENOMEM - confirm whether empty values are expected.
                        */
    9295             : static int
    9296        1178 : blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
    9297             :                uint16_t value_len, bool internal)
    9298             : {
    9299             :         struct spdk_xattr_tailq *xattrs;
    9300             :         struct spdk_xattr       *xattr;
    9301             :         size_t                  desc_size;
    9302             :         void                    *tmp;
    9303             : 
    9304        1178 :         blob_verify_md_op(blob);
    9305             : 
    9306        1178 :         if (blob->md_ro) {
    9307           5 :                 return -EPERM;
    9308             :         }
    9309             : 
                               /* The descriptor header, name and value must fit together. */
    9310        1173 :         desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len;
    9311        1173 :         if (desc_size > SPDK_BS_MAX_DESC_SIZE) {
    9312           5 :                 SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fit into single page %zu\n", name,
    9313             :                               desc_size, SPDK_BS_MAX_DESC_SIZE);
    9314           5 :                 return -ENOMEM;
    9315             :         }
    9316             : 
    9317        1168 :         if (internal) {
    9318         917 :                 xattrs = &blob->xattrs_internal;
    9320             :         } else {
    9321         251 :                 xattrs = &blob->xattrs;
    9322             :         }
    9323             : 
    9324        1438 :         TAILQ_FOREACH(xattr, xattrs, link) {
    9325         402 :                 if (!strcmp(name, xattr->name)) {
                                               /* Allocate first so the old value survives a failed allocation. */
    9326         132 :                         tmp = malloc(value_len);
    9327         132 :                         if (!tmp) {
    9328           0 :                                 return -ENOMEM;
    9329             :                         }
    9330             : 
    9331         132 :                         free(xattr->value);
    9332         132 :                         xattr->value_len = value_len;
    9333         132 :                         xattr->value = tmp;
    9334         132 :                         memcpy(xattr->value, value, value_len);
    9335             : 
                                               /* Flag only once the update can no longer fail. */
                                               if (internal) {
                                                       blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
                                               }
    9336         132 :                         blob->state = SPDK_BLOB_STATE_DIRTY;
    9337             : 
    9338         132 :                         return 0;
    9339             :                 }
    9340             :         }
    9341             : 
                               /* No xattr of that name yet: build a new entry and append it. */
    9342        1036 :         xattr = calloc(1, sizeof(*xattr));
    9343        1036 :         if (!xattr) {
    9344           0 :                 return -ENOMEM;
    9345             :         }
    9346             : 
    9347        1036 :         xattr->name = strdup(name);
    9348        1036 :         if (!xattr->name) {
    9349           0 :                 free(xattr);
    9350           0 :                 return -ENOMEM;
    9351             :         }
    9352             : 
    9353        1036 :         xattr->value_len = value_len;
    9354        1036 :         xattr->value = malloc(value_len);
    9355        1036 :         if (!xattr->value) {
    9356           0 :                 free(xattr->name);
    9357           0 :                 free(xattr);
    9358           0 :                 return -ENOMEM;
    9359             :         }
    9360        1036 :         memcpy(xattr->value, value, value_len);
    9361        1036 :         TAILQ_INSERT_TAIL(xattrs, xattr, link);
    9362             : 
                               /* Set the flag only after the insert, so it is never left set for an
                                * empty internal list (blob_remove_xattr clears it when the list empties). */
                               if (internal) {
                                       blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
                               }
    9363        1036 :         blob->state = SPDK_BLOB_STATE_DIRTY;
    9364             : 
    9365        1036 :         return 0;
    9366             : }
    9367             : 
                       /* Public entry point: set an xattr in the blob's non-internal xattr list.
                        * Return values are those of blob_set_xattr. */
    9368             : int
    9369         216 : spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
    9370             :                     uint16_t value_len)
    9371             : {
    9372         216 :         return blob_set_xattr(blob, name, value, value_len, false);
    9373             : }
    9374             : 
                       /*
                        * Remove the xattr called name from the blob's public list or, when internal
                        * is true, from its internal list.
                        *
                        * Returns 0 after removing the entry and marking the blob dirty, -EPERM when
                        * blob->md_ro is set, -ENOENT when no xattr of that name is in the list.
                        */
    9375             : static int
    9376         511 : blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal)
    9377             : {
    9378             :         struct spdk_xattr_tailq *xattrs;
    9379             :         struct spdk_xattr       *xattr;
    9380             : 
    9381         511 :         blob_verify_md_op(blob);
    9382             : 
    9383         511 :         if (blob->md_ro) {
    9384           5 :                 return -EPERM;
    9385             :         }
    9386         506 :         xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
    9387             : 
    9388         521 :         TAILQ_FOREACH(xattr, xattrs, link) {
    9389         456 :                 if (!strcmp(name, xattr->name)) {
                                               /* Returning right after the removal keeps the plain
                                                * TAILQ_FOREACH safe: the freed node is never revisited. */
    9390         441 :                         TAILQ_REMOVE(xattrs, xattr, link);
    9391         441 :                         free(xattr->value);
    9392         441 :                         free(xattr->name);
    9393         441 :                         free(xattr);
    9394             : 
                                               /* Drop the flag once the last internal xattr is gone. */
    9395         441 :                         if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) {
    9396         306 :                                 blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR;
    9397             :                         }
    9398         441 :                         blob->state = SPDK_BLOB_STATE_DIRTY;
    9399             : 
    9400         441 :                         return 0;
    9401             :                 }
    9402             :         }
    9403             : 
    9404          65 :         return -ENOENT;
    9405             : }
    9406             : 
                       /* Public entry point: remove an xattr from the blob's non-internal xattr list.
                        * Return values are those of blob_remove_xattr. */
    9407             : int
    9408          45 : spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name)
    9409             : {
    9410          45 :         return blob_remove_xattr(blob, name, false);
    9411             : }
    9412             : 
                       /*
                        * Look up the xattr called name in the public or (internal == true) internal
                        * list of blob.
                        *
                        * On a match, *value is set to the buffer stored in the xattr entry itself (no
                        * copy is made) and *value_len to its length, and 0 is returned. Returns
                        * -ENOENT without touching the outputs when there is no match.
                        */
    9413             : static int
    9414        2852 : blob_get_xattr_value(struct spdk_blob *blob, const char *name,
    9415             :                      const void **value, size_t *value_len, bool internal)
    9416             : {
    9417             :         struct spdk_xattr       *xattr;
    9418             :         struct spdk_xattr_tailq *xattrs;
    9419             : 
    9420        2852 :         xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
    9421             : 
    9422        3636 :         TAILQ_FOREACH(xattr, xattrs, link) {
    9423        1728 :                 if (!strcmp(name, xattr->name)) {
    9424         944 :                         *value = xattr->value;
    9425         944 :                         *value_len = xattr->value_len;
    9426         944 :                         return 0;
    9427             :                 }
    9428             :         }
    9429        1908 :         return -ENOENT;
    9430             : }
    9431             : 
                       /* Public entry point: runs blob_verify_md_op() and then looks up name in the
                        * blob's non-internal xattr list. Return values are those of
                        * blob_get_xattr_value. */
    9432             : int
    9433         192 : spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
    9434             :                           const void **value, size_t *value_len)
    9435             : {
    9436         192 :         blob_verify_md_op(blob);
    9437             : 
    9438         192 :         return blob_get_xattr_value(blob, name, value, value_len, false);
    9439             : }
    9440             : 
                       /*
                        * Variable-length list of xattr names: a count followed by that many name
                        * pointers, allocated as a single block. The trailing array is a C99
                        * flexible array member, so sizeof(struct spdk_xattr_names) covers only the
                        * header.
                        */
    9441             : struct spdk_xattr_names {
    9442             :         uint32_t        count;
    9443             :         const char      *names[];
    9444             : };
    9445             : 
                       /*
                        * Build a spdk_xattr_names list for every entry in xattrs.
                        *
                        * The list is walked twice: once to count entries so a single allocation can
                        * hold the header plus one pointer per entry, and once to fill the pointers.
                        * The stored pointers are the xattr entries' own name strings, not copies.
                        *
                        * Returns 0 with *names set to the new list, or -ENOMEM (with *names NULL)
                        * when the allocation fails.
                        */
    9446             : static int
    9447           5 : blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names)
    9448             : {
    9449             :         struct spdk_xattr       *xattr;
    9450           5 :         int                     count = 0;
    9451             : 
    9452          15 :         TAILQ_FOREACH(xattr, xattrs, link) {
    9453          10 :                 count++;
    9454             :         }
    9455             : 
    9456           5 :         *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *));
    9457           5 :         if (*names == NULL) {
    9458           0 :                 return -ENOMEM;
    9459             :         }
    9460             : 
                               /* calloc zeroed (*names)->count, so it doubles as the write index. */
    9461          15 :         TAILQ_FOREACH(xattr, xattrs, link) {
    9462          10 :                 (*names)->names[(*names)->count++] = xattr->name;
    9463             :         }
    9464             : 
    9465           5 :         return 0;
    9466             : }
    9467             : 
                       /* Public entry point: runs blob_verify_md_op() and then lists the names in the
                        * blob's non-internal xattr list. Return values are those of
                        * blob_get_xattr_names. */
    9468             : int
    9469           5 : spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names)
    9470             : {
    9471           5 :         blob_verify_md_op(blob);
    9472             : 
    9473           5 :         return blob_get_xattr_names(&blob->xattrs, names);
    9474             : }
    9475             : 
                       /* Return the number of names held in the list. names must not be NULL. */
    9476             : uint32_t
    9477           5 : spdk_xattr_names_get_count(struct spdk_xattr_names *names)
    9478             : {
    9479           5 :         assert(names != NULL);
    9480             : 
    9481           5 :         return names->count;
    9482             : }
    9483             : 
                       /*
                        * Return the name stored at position index, or NULL when index is not below
                        * names->count.
                        *
                        * NOTE(review): unlike spdk_xattr_names_get_count there is no assert that
                        * names is non-NULL before it is dereferenced.
                        */
    9484             : const char *
    9485          10 : spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index)
    9486             : {
    9487          10 :         if (index >= names->count) {
    9488           0 :                 return NULL;
    9489             :         }
    9490             : 
    9491          10 :         return names->names[index];
    9492             : }
    9493             : 
                       /* Release a names list. Only the list block itself is freed; the name strings
                        * it points to are not freed here. */
    9494             : void
    9495           5 : spdk_xattr_names_free(struct spdk_xattr_names *names)
    9496             : {
    9497           5 :         free(names);
    9498           5 : }
    9499             : 
                       /* Return a by-value copy of the blobstore's bstype field. */
    9500             : struct spdk_bs_type
    9501           2 : spdk_bs_get_bstype(struct spdk_blob_store *bs)
    9502             : {
    9503           2 :         return bs->bstype;
    9504             : }
    9505             : 
                       /* Overwrite the bstype field of the in-memory blobstore with bstype. Nothing is
                        * written to the device here. */
    9506             : void
    9507           0 : spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype)
    9508             : {
    9509           0 :         memcpy(&bs->bstype, &bstype, sizeof(bstype));
    9510           0 : }
    9511             : 
                       /* True when either the blob's data (data_ro) or its metadata (md_ro) is marked
                        * read-only. blob must not be NULL. */
    9512             : bool
    9513          60 : spdk_blob_is_read_only(struct spdk_blob *blob)
    9514             : {
    9515          60 :         assert(blob != NULL);
    9516          60 :         return (blob->data_ro || blob->md_ro);
    9517             : }
    9518             : 
                       /* True when bs_get_snapshot_entry() finds an entry for this blob's id in its
                        * blobstore. blob must not be NULL. */
    9519             : bool
    9520          65 : spdk_blob_is_snapshot(struct spdk_blob *blob)
    9521             : {
    9522             :         struct spdk_blob_list *snapshot_entry;
    9523             : 
    9524          65 :         assert(blob != NULL);
    9525             : 
    9526          65 :         snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
    9527          65 :         if (snapshot_entry == NULL) {
    9528          35 :                 return false;
    9529             :         }
    9530             : 
    9531          30 :         return true;
    9532             : }
    9533             : 
                       /*
                        * True when the blob has a parent that is a blob in this blobstore, i.e. its
                        * parent_id is neither SPDK_BLOBID_INVALID nor SPDK_BLOBID_EXTERNAL_SNAPSHOT.
                        * Such a blob is asserted to be thin provisioned. blob must not be NULL.
                        */
    9534             : bool
    9535          85 : spdk_blob_is_clone(struct spdk_blob *blob)
    9536             : {
    9537          85 :         assert(blob != NULL);
    9538             : 
    9539          85 :         if (blob->parent_id != SPDK_BLOBID_INVALID &&
    9540          65 :             blob->parent_id != SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
    9541          50 :                 assert(spdk_blob_is_thin_provisioned(blob));
    9542          50 :                 return true;
    9543             :         }
    9544             : 
    9545          35 :         return false;
    9546             : }
    9547             : 
                       /* True when SPDK_BLOB_THIN_PROV is set in the blob's invalid_flags. blob must
                        * not be NULL. */
    9548             : bool
    9549       46657 : spdk_blob_is_thin_provisioned(struct spdk_blob *blob)
    9550             : {
    9551       46657 :         assert(blob != NULL);
    9552       46657 :         return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV);
    9553             : }
    9554             : 
                       /* Public wrapper that forwards to blob_is_esnap_clone(). */
    9555             : bool
    9556       57607 : spdk_blob_is_esnap_clone(const struct spdk_blob *blob)
    9557             : {
    9558       57607 :         return blob_is_esnap_clone(blob);
    9559             : }
    9560             : 
                       /*
                        * Reconcile blob->clear_method with the clear method recorded in the blob's
                        * md_ro_flags. The blob's md_ro_flags are only read, never modified.
                        * blob must not be NULL.
                        */
    9561             : static void
    9562        4291 : blob_update_clear_method(struct spdk_blob *blob)
    9563             : {
    9564             :         enum blob_clear_method stored_cm;
    9565             : 
    9566        4291 :         assert(blob != NULL);
    9567             : 
    9568             :         /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored
    9569             :          * in metadata previously.  If something other than the default was
    9570             :          * specified, ignore stored value and used what was passed in.
    9571             :          */
    9572        4291 :         stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT);
    9573             : 
    9574        4291 :         if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) {
    9575        4291 :                 blob->clear_method = stored_cm;
    9576           0 :         } else if (blob->clear_method != stored_cm) {
                                       /* Keep the explicitly requested method; only report the mismatch. */
    9577           0 :                 SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n",
    9578             :                              blob->clear_method, stored_cm);
    9579             :         }
    9580        4291 : }
    9581             : 
                       /*
                        * Find the snapshot whose clone list contains blob_id by scanning every entry
                        * of bs->snapshots and each entry's clones list.
                        *
                        * Returns the id of the first snapshot entry that lists blob_id as a clone, or
                        * SPDK_BLOBID_INVALID when no entry does.
                        */
    9582             : spdk_blob_id
    9583         324 : spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id)
    9584             : {
    9585         324 :         struct spdk_blob_list *snapshot_entry = NULL;
    9586         324 :         struct spdk_blob_list *clone_entry = NULL;
    9587             : 
    9588         619 :         TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
    9589         916 :                 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
    9590         621 :                         if (clone_entry->id == blob_id) {
    9591         211 :                                 return snapshot_entry->id;
    9592             :                         }
    9593             :                 }
    9594             :         }
    9595             : 
    9596         113 :         return SPDK_BLOBID_INVALID;
    9597             : }
    9598             : 
                       /*
                        * Copy the ids of the clones of snapshot blobid into ids.
                        *
                        * count is in/out: on entry the capacity of ids, on return the snapshot's
                        * clone count.
                        *  - blobid has no snapshot entry: *count = 0, returns 0.
                        *  - ids is NULL or *count is too small: *count = required number of entries,
                        *    returns -ENOMEM and writes nothing to ids. This lets a caller query the
                        *    size first.
                        *  - otherwise: ids[0..*count-1] are filled in, returns 0.
                        */
    9599             : int
    9600         246 : spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids,
    9601             :                      size_t *count)
    9602             : {
    9603             :         struct spdk_blob_list *snapshot_entry, *clone_entry;
    9604             :         size_t n;
    9605             : 
    9606         246 :         snapshot_entry = bs_get_snapshot_entry(bs, blobid);
    9607         246 :         if (snapshot_entry == NULL) {
    9608          35 :                 *count = 0;
    9609          35 :                 return 0;
    9610             :         }
    9611             : 
    9612         211 :         if (ids == NULL || *count < snapshot_entry->clone_count) {
    9613          10 :                 *count = snapshot_entry->clone_count;
    9614          10 :                 return -ENOMEM;
    9615             :         }
    9616         201 :         *count = snapshot_entry->clone_count;
    9617             : 
    9618         201 :         n = 0;
    9619         427 :         TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
    9620         226 :                 ids[n++] = clone_entry->id;
    9621             :         }
    9622             : 
    9623         201 :         return 0;
    9624             : }
    9625             : 
                       /*
                        * Populate the in-memory blobstore from ctx->super and carry on with the load
                        * by reading the used-pages mask.
                        *
                        * Reached from bs_load_try_to_grow when no grow is performed and from
                        * bs_load_grow_super_write_cpl once the grown super block has been written.
                        * Any failure is reported through bs_load_ctx_fail(): -ENOMEM when a bit array
                        * cannot be resized, -EIO when the super block is not marked clean or has a
                        * zero used_blobid_mask_len.
                        */
    9626             : static void
    9627           5 : bs_load_grow_continue(struct spdk_bs_load_ctx *ctx)
    9628             : {
    9629             :         int rc;
    9630             : 
                               /* Fill in defaults for super block fields that are stored as zero. */
    9631           5 :         if (ctx->super->size == 0) {
    9632           0 :                 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
    9633             :         }
    9634             : 
    9635           5 :         if (ctx->super->io_unit_size == 0) {
    9636           0 :                 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
    9637             :         }
    9638           5 :         if (ctx->super->md_page_size == 0) {
    9639           0 :                 ctx->super->md_page_size = SPDK_BS_PAGE_SIZE;
    9640             :         }
    9641             : 
    9642             :         /* Parse the super block */
    9643           5 :         ctx->bs->clean = 1;
    9644           5 :         ctx->bs->cluster_sz = ctx->super->cluster_size;
    9645           5 :         ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
    9646           5 :         ctx->bs->md_page_size = ctx->super->md_page_size;
    9647           5 :         ctx->bs->io_unit_size = ctx->super->io_unit_size;
    9648           5 :         bs_init_per_cluster_fields(ctx->bs);
    9649           5 :         rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
    9650           5 :         if (rc < 0) {
    9651           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    9652           0 :                 return;
    9653             :         }
    9654           5 :         ctx->bs->md_start = ctx->super->md_start;
    9655           5 :         ctx->bs->md_len = ctx->super->md_len;
    9656           5 :         rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len);
    9657           5 :         if (rc < 0) {
    9658           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    9659           0 :                 return;
    9660             :         }
    9661             : 
                               /* Data clusters are all clusters minus those covering the range up to
                                * the end of the metadata region (md_start + md_len pages, rounded up). */
    9662          15 :         ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
    9663          10 :                                                ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
    9664           5 :         ctx->bs->super_blob = ctx->super->super_blob;
    9665           5 :         memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));
    9666             : 
    9667           5 :         if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) {
    9668           0 :                 SPDK_ERRLOG("Can not grow an unclean blobstore, please load it normally to clean it.\n");
    9669           0 :                 bs_load_ctx_fail(ctx, -EIO);
    9670           0 :                 return;
    9671             :         } else {
    9672           5 :                 bs_load_read_used_pages(ctx);
    9673             :         }
    9674             : }
    9675             : 
                       /* Completion of the super block write issued by
                        * bs_load_grow_used_clusters_write_cpl: fail the load on error, otherwise
                        * continue with bs_load_grow_continue. */
    9676             : static void
    9677           5 : bs_load_grow_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    9678             : {
    9679           5 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    9680             : 
    9681           5 :         if (bserrno != 0) {
    9682           0 :                 bs_load_ctx_fail(ctx, bserrno);
    9683           0 :                 return;
    9684             :         }
    9685           5 :         bs_load_grow_continue(ctx);
    9686             : }
    9687             : 
                       /*
                        * Completion of the used-clusters mask write issued by
                        * bs_load_grow_used_clusters_read_cpl. On success the mask buffer is released
                        * and the updated super block is written at page 0.
                        *
                        * NOTE(review): ctx->mask is freed only on the success path here; whether
                        * bs_load_ctx_fail() releases it on the error path is not visible from this
                        * function - verify.
                        */
    9688             : static void
    9689           5 : bs_load_grow_used_clusters_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    9690             : {
    9691           5 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    9692             : 
    9693           5 :         if (bserrno != 0) {
    9694           0 :                 bs_load_ctx_fail(ctx, bserrno);
    9695           0 :                 return;
    9696             :         }
    9697             : 
    9698           5 :         spdk_free(ctx->mask);
    9699             : 
    9700           5 :         bs_sequence_write_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
    9701           5 :                               bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
    9702             :                               bs_load_grow_super_write_cpl, ctx);
    9703             : }
    9704             : 
                       /*
                        * Completion of the used-clusters mask read issued by bs_load_try_to_grow.
                        * Updates the mask's length field to the cluster count of the whole device and
                        * writes the mask back to its on-disk location.
                        *
                        * NOTE(review): on a read error ctx->mask is not freed in this function;
                        * whether bs_load_ctx_fail() releases it is not visible here - verify.
                        */
    9705             : static void
    9706           5 : bs_load_grow_used_clusters_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    9707             : {
    9708           5 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    9709             :         uint64_t                lba, lba_count;
    9710             :         uint64_t                dev_size;
    9711             :         uint64_t                total_clusters;
    9712             : 
    9713           5 :         if (bserrno != 0) {
    9714           0 :                 bs_load_ctx_fail(ctx, bserrno);
    9715           0 :                 return;
    9716             :         }
    9717             : 
    9718             :         /* The type must be correct */
    9719           5 :         assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
    9720             :         /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
    9721           5 :         assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
    9722             :                                              struct spdk_blob_md_page) * 8));
                               /* Same device-size computation as in bs_load_try_to_grow. */
    9723           5 :         dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
    9724           5 :         total_clusters = dev_size / ctx->super->cluster_size;
    9725           5 :         ctx->mask->length = total_clusters;
    9726             : 
                               /* used_cluster_mask_len already holds the grown length that
                                * bs_load_try_to_grow stored in the super block. */
    9727           5 :         lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
    9728           5 :         lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
    9729           5 :         bs_sequence_write_dev(ctx->seq, ctx->mask, lba, lba_count,
    9730             :                               bs_load_grow_used_clusters_write_cpl, ctx);
    9731             : }
    9732             : 
                       /*
                        * Decide whether the blobstore described by ctx->super can be grown to the
                        * current device size and, if so, start the grow sequence.
                        *
                        * No grow happens when the recorded size already covers the device, or when a
                        * used-clusters mask for the larger size would not fit in the pages between
                        * used_cluster_mask_start and used_blobid_mask_start; the load then continues
                        * unchanged. Otherwise the in-memory super block gets the new size, mask
                        * length and CRC, and the existing mask is read from disk into a freshly
                        * allocated buffer (-ENOMEM via bs_load_ctx_fail if that allocation fails).
                        */
    9733             : static void
    9734           5 : bs_load_try_to_grow(struct spdk_bs_load_ctx *ctx)
    9735             : {
    9736             :         uint64_t dev_size, total_clusters, used_cluster_mask_len, max_used_cluster_mask;
    9737             :         uint64_t lba, lba_count, mask_size;
    9738             : 
    9739           5 :         dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
    9740           5 :         total_clusters = dev_size / ctx->super->cluster_size;
                               /* Mask pages needed: the mask header plus one bit per cluster, rounded
                                * up to whole metadata pages. */
    9741           5 :         used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
    9742           5 :                                 spdk_divide_round_up(total_clusters, 8),
    9743           5 :                                 ctx->super->md_page_size);
    9744           5 :         max_used_cluster_mask = ctx->super->used_blobid_mask_start - ctx->super->used_cluster_mask_start;
    9745             :         /* No need to grow, or no room for a larger used-clusters mask */
    9746           5 :         if (ctx->super->size >= dev_size || used_cluster_mask_len > max_used_cluster_mask) {
    9747           0 :                 SPDK_DEBUGLOG(blob, "No grow\n");
    9748           0 :                 bs_load_grow_continue(ctx);
    9749           0 :                 return;
    9750             :         }
    9751             : 
    9752           5 :         SPDK_DEBUGLOG(blob, "Resize blobstore\n");
    9753             : 
    9754           5 :         ctx->super->size = dev_size;
    9755           5 :         ctx->super->used_cluster_mask_len = used_cluster_mask_len;
    9756           5 :         ctx->super->crc = blob_md_page_calc_crc(ctx->super);
    9757             : 
    9758           5 :         mask_size = used_cluster_mask_len * ctx->super->md_page_size;
    9759           5 :         ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_NUMA_ID_ANY,
    9760             :                                  SPDK_MALLOC_DMA);
    9761           5 :         if (!ctx->mask) {
    9762           0 :                 bs_load_ctx_fail(ctx, -ENOMEM);
    9763           0 :                 return;
    9764             :         }
                               /* The read length uses the new mask length assigned just above. */
    9765           5 :         lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
    9766           5 :         lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
    9767           5 :         bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
    9768             :                              bs_load_grow_used_clusters_read_cpl, ctx);
    9769             : }
    9770             : 
                       /*
                        * Sequence completion callback that validates ctx->super and, when it is
                        * valid, hands over to bs_load_try_to_grow. Presumably the completion of the
                        * super block read on the grow path (the caller is not visible here).
                        *
                        * A non-zero bserrno or a validation failure ends the load through
                        * bs_load_ctx_fail() with the corresponding error code.
                        */
    9771             : static void
    9772           5 : bs_grow_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    9773             : {
    9774           5 :         struct spdk_bs_load_ctx *ctx = cb_arg;
    9775             :         int rc;
    9776             : 
                               /* After a failed I/O the buffer contents are not trustworthy; report the
                                * I/O error itself rather than whatever validation makes of the buffer. */
                               if (bserrno != 0) {
                                       bs_load_ctx_fail(ctx, bserrno);
                                       return;
                               }

    9777           5 :         rc = bs_super_validate(ctx->super, ctx->bs);
    9778           5 :         if (rc != 0) {
    9779           0 :                 bs_load_ctx_fail(ctx, rc);
    9780           0 :                 return;
    9781             :         }
    9782             : 
    9783           5 :         bs_load_try_to_grow(ctx);
    9784             : }
    9785             : 
                       /* Context for growing a blobstore while it is loaded (the bs_grow_live_* functions). */
    9786             : struct spdk_bs_grow_ctx {
    9787             :         struct spdk_blob_store          *bs;
                               /* Super block buffer; released with spdk_free() in bs_grow_live_done. */
    9788             :         struct spdk_bs_super_block      *super;
    9789             : 
                               /* Bit pool for the grown cluster count. On success it is installed as
                                * bs->used_clusters; on failure bs_grow_live_done frees it. */
    9790             :         struct spdk_bit_pool            *new_used_clusters;
                               /* Scratch mask used to carry the bits of the old pool into the new one;
                                * released with free() in bs_grow_live_done. */
    9791             :         struct spdk_bs_md_mask          *new_used_clusters_mask;
    9792             : 
    9793             :         spdk_bs_sequence_t              *seq;
    9794             : };
    9795             : 
                       /*
                        * Finish a live grow attempt: complete the sequence with bserrno and release
                        * everything owned by ctx, including ctx itself.
                        *
                        * The new bit pool is freed only on failure; on success
                        * bs_grow_live_super_write_cpl has already installed it as bs->used_clusters.
                        */
    9796             : static void
    9797          40 : bs_grow_live_done(struct spdk_bs_grow_ctx *ctx, int bserrno)
    9798             : {
    9799          40 :         if (bserrno != 0) {
    9800          10 :                 spdk_bit_pool_free(&ctx->new_used_clusters);
    9801             :         }
    9802             : 
    9803          40 :         bs_sequence_finish(ctx->seq, bserrno);
    9804          40 :         free(ctx->new_used_clusters_mask);
    9805          40 :         spdk_free(ctx->super);
    9806          40 :         free(ctx);
    9807          40 : }
    9808             : 
                       /*
                        * Completion of the super block write during a live grow. On success the
                        * in-memory blobstore is switched to the larger cluster bit pool: the old
                        * pool's bits are copied into the new pool through the scratch mask, the old
                        * pool is freed, and the cluster totals and free count are recomputed. All of
                        * this happens under bs->used_lock. On a write error the grow is abandoned
                        * through bs_grow_live_done() without touching the blobstore.
                        */
    9809             : static void
    9810          10 : bs_grow_live_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    9811             : {
    9812          10 :         struct spdk_bs_grow_ctx *ctx = cb_arg;
    9813          10 :         struct spdk_blob_store *bs = ctx->bs;
    9814             :         uint64_t total_clusters;
    9815             : 
    9816          10 :         if (bserrno != 0) {
    9817           0 :                 bs_grow_live_done(ctx, bserrno);
    9818           0 :                 return;
    9819             :         }
    9820             : 
    9821             :         /*
    9822             :          * Blobstore is not clean until unload, for now only the super block is up to date.
    9823             :          * This is similar to state right after blobstore init, when bs_write_used_md() didn't
    9824             :          * yet execute.
    9825             :          * When cleanly unloaded, the used md pages will be written out.
    9826             :          * In case of unclean shutdown, loading blobstore will go through recovery path correctly
    9827             :          * filling out the used_clusters with new size and writing it out.
    9828             :          */
    9829          10 :         bs->clean = 0;
    9830             : 
    9831             :         /* Reverting the super->size past this point is complex, avoid any error paths
    9832             :          * that require to do so. */
    9833          10 :         spdk_spin_lock(&bs->used_lock);
    9834             : 
    9835          10 :         total_clusters = ctx->super->size / ctx->super->cluster_size;
    9836             : 
                               /* Export the current pool into the scratch mask ... */
    9837          10 :         assert(total_clusters >= spdk_bit_pool_capacity(bs->used_clusters));
    9838          10 :         spdk_bit_pool_store_mask(bs->used_clusters, ctx->new_used_clusters_mask);
    9839             : 
                               /* ... and import it into the pool sized for the grown blobstore. */
    9840          10 :         assert(total_clusters == spdk_bit_pool_capacity(ctx->new_used_clusters));
    9841          10 :         spdk_bit_pool_load_mask(ctx->new_used_clusters, ctx->new_used_clusters_mask);
    9842             : 
                               /* The blobstore takes ownership of the new pool from here on. */
    9843          10 :         spdk_bit_pool_free(&bs->used_clusters);
    9844          10 :         bs->used_clusters = ctx->new_used_clusters;
    9845             : 
    9846          10 :         bs->total_clusters = total_clusters;
    9847          20 :         bs->total_data_clusters = bs->total_clusters - spdk_divide_round_up(
    9848          10 :                                           bs->md_start + bs->md_len, bs->pages_per_cluster);
    9849             : 
    9850          10 :         bs->num_free_clusters = spdk_bit_pool_count_free(bs->used_clusters);
    9851          10 :         assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters);
    9852          10 :         spdk_spin_unlock(&bs->used_lock);
    9853             : 
    9854          10 :         bs_grow_live_done(ctx, 0);
    9855             : }
    9856             : /*
                     :  * Completion callback for the super block read issued by spdk_bs_grow_live().
                     :  *
                     :  * Validates the super block that was read, compares the current device size
                     :  * (blockcnt * blocklen) with the size recorded in the super block and, when the
                     :  * device is larger and the bigger used-cluster mask still fits in the metadata
                     :  * layout, updates the super block buffer and writes it out. The operation then
                     :  * continues in bs_grow_live_super_write_cpl().
                     :  *
                     :  * seq      - sequence the read completed on; reused for the super block write.
                     :  * cb_arg   - the struct spdk_bs_grow_ctx of this grow operation.
                     :  * bserrno  - status of the read; a non-zero value is handed to bs_grow_live_done().
                     :  *
                     :  * Every early exit reports through bs_grow_live_done(): 0 when the size is
                     :  * unchanged, -ENOSPC when growing is not possible, -ENOMEM on allocation
                     :  * failure, or the error returned by bs_super_validate().
                     :  */
    9857             : static void
    9858          40 : bs_grow_live_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
    9859             : {
    9860          40 :         struct spdk_bs_grow_ctx *ctx = cb_arg;
    9861             :         uint64_t dev_size, total_clusters, used_cluster_mask_len, max_used_cluster_mask;
    9862             :         int rc;
    9863             : 
    9864          40 :         if (bserrno != 0) {
    9865           0 :                 bs_grow_live_done(ctx, bserrno);
    9866           0 :                 return;
    9867             :         }
    9868             : 
    9869          40 :         rc = bs_super_validate(ctx->super, ctx->bs);
    9870          40 :         if (rc != 0) {
    9871           5 :                 bs_grow_live_done(ctx, rc);
    9872           5 :                 return;
    9873             :         }
    9874             :         /*
                     :          * The new mask length is the mask header plus one bit per cluster, rounded up
                     :          * to whole units of md_page_size. The room available for it is the distance
                     :          * between the start of the used-cluster mask and the start of the used-blobid
                     :          * mask.
                     :          */
    9875          35 :         dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
    9876          35 :         total_clusters = dev_size / ctx->super->cluster_size;
    9877          35 :         used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
    9878          35 :                                 spdk_divide_round_up(total_clusters, 8),
    9879          35 :                                 ctx->super->md_page_size);
    9880          35 :         max_used_cluster_mask = ctx->super->used_blobid_mask_start - ctx->super->used_cluster_mask_start;
    9881             :         /* Only dev_size is compared: the device size can change while total_clusters stays the same. */
    9882          35 :         if (dev_size == ctx->super->size) {
    9883          20 :                 SPDK_DEBUGLOG(blob, "No need to grow blobstore\n");
    9884          20 :                 bs_grow_live_done(ctx, 0);
    9885          20 :                 return;
    9886             :         }
    9887             :         /*
    9888             :          * Blobstore cannot be shrunk, so check before if:
    9889             :          * - new size of the device is smaller than size in super_block
    9890             :          * - new total number of clusters is smaller than used_clusters bit_pool
    9891             :          * - there is enough space in metadata for used_cluster_mask to be written out
    9892             :          */
    9893          15 :         if (dev_size < ctx->super->size ||
    9894          15 :             total_clusters < spdk_bit_pool_capacity(ctx->bs->used_clusters) ||
    9895             :             used_cluster_mask_len > max_used_cluster_mask) {
    9896           5 :                 SPDK_DEBUGLOG(blob, "No space to grow blobstore\n");
    9897           5 :                 bs_grow_live_done(ctx, -ENOSPC);
    9898           5 :                 return;
    9899             :         }
    9900             : 
    9901          10 :         SPDK_DEBUGLOG(blob, "Resizing blobstore\n");
    9902             :         /*
                     :          * Allocate a zeroed mask buffer and a new bit pool with total_clusters entries;
                     :          * both are stored in ctx for the super block write completion.
                     :          * NOTE(review): the buffer is total_clusters bytes, while the mask length
                     :          * computed above uses one bit per cluster - presumably a deliberate
                     :          * over-allocation; confirm.
                     :          */
    9903          10 :         ctx->new_used_clusters_mask = calloc(1, total_clusters);
    9904          10 :         if (!ctx->new_used_clusters_mask) {
    9905           0 :                 bs_grow_live_done(ctx, -ENOMEM);
    9906           0 :                 return;
    9907             :         }
    9908          10 :         ctx->new_used_clusters = spdk_bit_pool_create(total_clusters);
    9909          10 :         if (!ctx->new_used_clusters) {
    9910           0 :                 bs_grow_live_done(ctx, -ENOMEM);
    9911           0 :                 return;
    9912             :         }
    9913             :         /* Mark the super block not clean and record the new size and mask length before writing it. */
    9914          10 :         ctx->super->clean = 0;
    9915          10 :         ctx->super->size = dev_size;
    9916          10 :         ctx->super->used_cluster_mask_len = used_cluster_mask_len;
    9917          10 :         bs_write_super(seq, ctx->bs, ctx->super, bs_grow_live_super_write_cpl, ctx);
    9918             : }
    9919             : /*
                     :  * Start growing an already loaded blobstore to match the size of its device.
                     :  *
                     :  * Must be called on bs->md_thread (asserted). Allocates a grow context and a
                     :  * buffer for the super block, starts a sequence on bs->md_channel whose
                     :  * completion carries cb_fn/cb_arg, and reads the super block from page 0. The
                     :  * work continues in bs_grow_live_load_super_cpl().
                     :  *
                     :  * bs      - blobstore to grow.
                     :  * cb_fn   - completion callback; called directly with -ENOMEM when the context,
                     :  *           the super block buffer or the sequence cannot be allocated.
                     :  * cb_arg  - argument passed to cb_fn.
                     :  */
    9920             : void
    9921          40 : spdk_bs_grow_live(struct spdk_blob_store *bs,
    9922             :                   spdk_bs_op_complete cb_fn, void *cb_arg)
    9923             : {
    9924             :         struct spdk_bs_cpl      cpl;
    9925             :         struct spdk_bs_grow_ctx *ctx;
    9926             : 
    9927          40 :         assert(spdk_get_thread() == bs->md_thread);
    9928             : 
    9929          40 :         SPDK_DEBUGLOG(blob, "Growing blobstore on dev %p\n", bs->dev);
    9930             : 
    9931          40 :         cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
    9932          40 :         cpl.u.bs_basic.cb_fn = cb_fn;
    9933          40 :         cpl.u.bs_basic.cb_arg = cb_arg;
    9934             : 
    9935          40 :         ctx = calloc(1, sizeof(struct spdk_bs_grow_ctx));
    9936          40 :         if (!ctx) {
    9937           0 :                 cb_fn(cb_arg, -ENOMEM);
    9938           0 :                 return;
    9939             :         }
    9940          40 :         ctx->bs = bs;
    9941             :         /* Zeroed buffer, allocated with SPDK_MALLOC_DMA, that the super block is read into. */
    9942          40 :         ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
    9943             :                                   SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
    9944          40 :         if (!ctx->super) {
    9945           0 :                 free(ctx);
    9946           0 :                 cb_fn(cb_arg, -ENOMEM);
    9947           0 :                 return;
    9948             :         }
    9949             : 
    9950          40 :         ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
    9951          40 :         if (!ctx->seq) {
    9952           0 :                 spdk_free(ctx->super);
    9953           0 :                 free(ctx);
    9954           0 :                 cb_fn(cb_arg, -ENOMEM);
    9955           0 :                 return;
    9956             :         }
    9957             : 
    9958             :         /* Read the super block */
    9959          40 :         bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
    9960          40 :                              bs_byte_to_lba(bs, sizeof(*ctx->super)),
    9961             :                              bs_grow_live_load_super_cpl, ctx);
    9962             : }
    9963             : /*
                     :  * Start a grow operation on a blobstore that is read from dev.
                     :  *
                     :  * Checks the device block lengths, builds the options (defaults from
                     :  * spdk_bs_opts_init(), overridden by o when it is not NULL), allocates the
                     :  * blobstore and its load context with bs_alloc(), starts a sequence on
                     :  * bs->md_channel and reads the super block from page 0. The work continues in
                     :  * bs_grow_load_super_cpl().
                     :  *
                     :  * dev     - device holding the blobstore; destroyed here on the validation and
                     :  *           bs_alloc() failure paths.
                     :  * o       - optional options; may be NULL.
                     :  * cb_fn   - completion callback; on the failure paths below it receives a NULL
                     :  *           blobstore handle and a negative error code.
                     :  * cb_arg  - argument passed to cb_fn.
                     :  */
    9964             : void
    9965           5 : spdk_bs_grow(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
    9966             :              spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
    9967             : {
    9968             :         struct spdk_blob_store  *bs;
    9969             :         struct spdk_bs_cpl      cpl;
    9970             :         struct spdk_bs_load_ctx *ctx;
    9971           5 :         struct spdk_bs_opts     opts = {};
    9972             :         int err;
    9973             : 
    9974           5 :         SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev);
    9975             : 
    9976           5 :         if ((dev->phys_blocklen % dev->blocklen) != 0) {
    9977           0 :                 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen);
    9978           0 :                 dev->destroy(dev);
    9979           0 :                 cb_fn(cb_arg, NULL, -EINVAL);
    9980           0 :                 return;
    9981             :         }
    9982             :         /*
                     :          * NOTE(review): when bs_opts_copy() fails, the function returns without
                     :          * destroying dev and without calling cb_fn, unlike every other error path in
                     :          * this function. The caller gets no completion in that case - confirm whether
                     :          * this is intended.
                     :          */
    9983           5 :         spdk_bs_opts_init(&opts, sizeof(opts));
    9984           5 :         if (o) {
    9985           5 :                 if (bs_opts_copy(o, &opts)) {
    9986           0 :                         return;
    9987             :                 }
    9988             :         }
    9989             : 
    9990           5 :         if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
    9991           0 :                 dev->destroy(dev);
    9992           0 :                 cb_fn(cb_arg, NULL, -EINVAL);
    9993           0 :                 return;
    9994             :         }
    9995             : 
    9996           5 :         err = bs_alloc(dev, &opts, &bs, &ctx);
    9997           5 :         if (err) {
    9998           0 :                 dev->destroy(dev);
    9999           0 :                 cb_fn(cb_arg, NULL, err);
   10000           0 :                 return;
   10001             :         }
   10002             : 
   10003           5 :         cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
   10004           5 :         cpl.u.bs_handle.cb_fn = cb_fn;
   10005           5 :         cpl.u.bs_handle.cb_arg = cb_arg;
   10006           5 :         cpl.u.bs_handle.bs = bs;
   10007             : 
   10008           5 :         ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
   10009           5 :         if (!ctx->seq) {
   10010           0 :                 spdk_free(ctx->super);
   10011           0 :                 free(ctx);
   10012           0 :                 bs_free(bs);
   10013           0 :                 cb_fn(cb_arg, NULL, -ENOMEM);
   10014           0 :                 return;
   10015             :         }
   10016             : 
   10017             :         /* Read the super block */
   10018           5 :         bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
   10019           5 :                              bs_byte_to_lba(bs, sizeof(*ctx->super)),
   10020             :                              bs_grow_load_super_cpl, ctx);
   10021             : }
   10022             : /*
                     :  * Look up the external snapshot id stored on an esnap clone blob.
                     :  *
                     :  * Returns -EINVAL when blob is not an esnap clone. Otherwise returns the result of
                     :  * blob_get_xattr_value() for the BLOB_EXTERNAL_SNAPSHOT_ID xattr, which fills in
                     :  * *id and *len. The final "true" argument presumably selects the internal xattr
                     :  * list - confirm against blob_get_xattr_value().
                     :  */
   10023             : int
   10024          30 : spdk_blob_get_esnap_id(struct spdk_blob *blob, const void **id, size_t *len)
   10025             : {
   10026          30 :         if (!blob_is_esnap_clone(blob)) {
   10027          15 :                 return -EINVAL;
   10028             :         }
   10029             : 
   10030          15 :         return blob_get_xattr_value(blob, BLOB_EXTERNAL_SNAPSHOT_ID, id, len, true);
   10031             : }
   10032             : /*
                     :  * Return the IO channel used to reach blob->back_bs_dev from blobstore channel ch.
                     :  *
                     :  * Channels are kept in the esnap_channels tree of the spdk_bs_channel behind ch,
                     :  * keyed by blob id. A cached channel is returned when one exists; otherwise one is
                     :  * created with bs_dev->create_channel(), inserted into the tree and returned.
                     :  *
                     :  * Returns NULL when the tree node cannot be allocated or the back device fails to
                     :  * create a channel.
                     :  */
   10033             : struct spdk_io_channel *
   10034       17482 : blob_esnap_get_io_channel(struct spdk_io_channel *ch, struct spdk_blob *blob)
   10035             : {
   10036       17482 :         struct spdk_bs_channel          *bs_channel = spdk_io_channel_get_ctx(ch);
   10037       17482 :         struct spdk_bs_dev              *bs_dev = blob->back_bs_dev;
   10038       17482 :         struct blob_esnap_channel       find = {};
   10039             :         struct blob_esnap_channel       *esnap_channel, *existing;
   10040             :         /* "find" is a stack-allocated key: only blob_id is set for the lookup. */
   10041       17482 :         find.blob_id = blob->id;
   10042       17482 :         esnap_channel = RB_FIND(blob_esnap_channel_tree, &bs_channel->esnap_channels, &find);
   10043       17482 :         if (spdk_likely(esnap_channel != NULL)) {
   10044       17427 :                 SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": using cached channel on thread %s\n",
   10045             :                               blob->id, spdk_thread_get_name(spdk_get_thread()));
   10046       17427 :                 return esnap_channel->channel;
   10047             :         }
   10048             : 
   10049          55 :         SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": allocating channel on thread %s\n",
   10050             :                       blob->id, spdk_thread_get_name(spdk_get_thread()));
   10051             : 
   10052          55 :         esnap_channel = calloc(1, sizeof(*esnap_channel));
   10053          55 :         if (esnap_channel == NULL) {
   10054           0 :                 SPDK_NOTICELOG("blob 0x%" PRIx64 " channel allocation failed: no memory\n",
   10055             :                                find.blob_id);
   10056           0 :                 return NULL;
   10057             :         }
   10058          55 :         esnap_channel->channel = bs_dev->create_channel(bs_dev);
   10059          55 :         if (esnap_channel->channel == NULL) {
   10060           0 :                 SPDK_NOTICELOG("blob 0x%" PRIx64 " back channel allocation failed\n", blob->id);
   10061           0 :                 free(esnap_channel);
   10062           0 :                 return NULL;
   10063             :         }
   10064          55 :         esnap_channel->blob_id = find.blob_id;
   10065          55 :         existing = RB_INSERT(blob_esnap_channel_tree, &bs_channel->esnap_channels, esnap_channel);
   10066          55 :         if (spdk_unlikely(existing != NULL)) {
   10067             :                 /*
   10068             :                  * This should be unreachable: all modifications to this tree happen on this thread.
                     :                  * If it is reached in a build where assert() is compiled out, the channel
                     :                  * that was just created is destroyed and the one already in the tree is
                     :                  * returned instead.
                     :                  * NOTE(review): the log format below has no space between the blob id and
                     :                  * the word "lost".
   10069             :                  */
   10070           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 "lost race to allocate a channel\n", find.blob_id);
   10071           0 :                 assert(false);
   10072             : 
   10073             :                 bs_dev->destroy_channel(bs_dev, esnap_channel->channel);
   10074             :                 free(esnap_channel);
   10075             : 
   10076             :                 return existing->channel;
   10077             :         }
   10078             : 
   10079          55 :         return esnap_channel->channel;
   10080             : }
   10081             : /*
                     :  * Three-way comparison of two esnap channel nodes by blob_id: returns -1, 0 or 1
                     :  * when c1's id is lower than, equal to or higher than c2's id. Presumably the
                     :  * ordering function of blob_esnap_channel_tree (the tree is generated elsewhere).
                     :  */
   10082             : static int
   10083       17452 : blob_esnap_channel_compare(struct blob_esnap_channel *c1, struct blob_esnap_channel *c2)
   10084             : {
   10085       17452 :         return (c1->blob_id < c2->blob_id ? -1 : c1->blob_id > c2->blob_id);
   10086             : }
   10087             : /*
                     :  * State shared by the callbacks of blob_esnap_destroy_bs_dev_channels().
                     :  *
                     :  * cb_fn/cb_arg  - optional completion (cb_fn may be NULL) called with the blob and
                     :  *                 the final status.
                     :  * blob          - blob whose esnap channels are being destroyed.
                     :  * back_bs_dev   - copy of blob->back_bs_dev taken when the operation started; the
                     :  *                 per-channel callback destroys channels through this device.
                     :  * abort_io      - when true, queued user operations using a destroyed channel are
                     :  *                 aborted.
                     :  */
   10088             : struct blob_esnap_destroy_ctx {
   10089             :         spdk_blob_op_with_handle_complete       cb_fn;
   10090             :         void                                    *cb_arg;
   10091             :         struct spdk_blob                        *blob;
   10092             :         struct spdk_bs_dev                      *back_bs_dev;
   10093             :         bool                                    abort_io;
   10094             : };
   10095             : /*
                     :  * Final callback of the channel iteration started by
                     :  * blob_esnap_destroy_bs_dev_channels().
                     :  *
                     :  * Calls the optional user callback with the blob and the iteration status, frees
                     :  * the context and decrements bs->esnap_channels_unloading. When that counter
                     :  * reaches zero and bs->esnap_unload_cb_fn is set, spdk_bs_unload() is called with
                     :  * the saved callback and argument - presumably resuming an unload that was waiting
                     :  * for these channels; confirm against spdk_bs_unload().
                     :  *
                     :  * The blobstore pointer is read before the user callback runs, and neither blob
                     :  * nor ctx is dereferenced after it.
                     :  */
   10096             : static void
   10097         170 : blob_esnap_destroy_channels_done(struct spdk_io_channel_iter *i, int status)
   10098             : {
   10099         170 :         struct blob_esnap_destroy_ctx   *ctx = spdk_io_channel_iter_get_ctx(i);
   10100         170 :         struct spdk_blob                *blob = ctx->blob;
   10101         170 :         struct spdk_blob_store          *bs = blob->bs;
   10102             : 
   10103         170 :         SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": done destroying channels for this blob\n",
   10104             :                       blob->id);
   10105             : 
   10106         170 :         if (ctx->cb_fn != NULL) {
   10107         155 :                 ctx->cb_fn(ctx->cb_arg, blob, status);
   10108             :         }
   10109         170 :         free(ctx);
   10110             : 
   10111         170 :         bs->esnap_channels_unloading--;
   10112         170 :         if (bs->esnap_channels_unloading == 0 && bs->esnap_unload_cb_fn != NULL) {
   10113           5 :                 spdk_bs_unload(bs, bs->esnap_unload_cb_fn, bs->esnap_unload_cb_arg);
   10114             :         }
   10115         170 : }
   10116             : /*
                     :  * Per-channel step of blob_esnap_destroy_bs_dev_channels().
                     :  *
                     :  * Runs on the thread that owns the blobstore channel (asserted). Looks up the
                     :  * esnap channel for the blob in this blobstore channel's tree; if one exists it is
                     :  * removed from the tree, destroyed through the back_bs_dev saved in the context
                     :  * and freed. When ctx->abort_io is set, queued user operations whose back_channel
                     :  * is the channel being destroyed are first removed from the queue and aborted with
                     :  * -EIO. The iteration always continues with status 0.
                     :  */
   10117             : static void
   10118         180 : blob_esnap_destroy_one_channel(struct spdk_io_channel_iter *i)
   10119             : {
   10120         180 :         struct blob_esnap_destroy_ctx   *ctx = spdk_io_channel_iter_get_ctx(i);
   10121         180 :         struct spdk_blob                *blob = ctx->blob;
   10122         180 :         struct spdk_bs_dev              *bs_dev = ctx->back_bs_dev;
   10123         180 :         struct spdk_io_channel          *channel = spdk_io_channel_iter_get_channel(i);
   10124         180 :         struct spdk_bs_channel          *bs_channel = spdk_io_channel_get_ctx(channel);
   10125             :         struct blob_esnap_channel       *esnap_channel;
   10126         180 :         struct blob_esnap_channel       find = {};
   10127             : 
   10128         180 :         assert(spdk_get_thread() == spdk_io_channel_get_thread(channel));
   10129             : 
   10130         180 :         find.blob_id = blob->id;
   10131         180 :         esnap_channel = RB_FIND(blob_esnap_channel_tree, &bs_channel->esnap_channels, &find);
   10132         180 :         if (esnap_channel != NULL) {
   10133          15 :                 SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": destroying channel on thread %s\n",
   10134             :                               blob->id, spdk_thread_get_name(spdk_get_thread()));
   10135          15 :                 RB_REMOVE(blob_esnap_channel_tree, &bs_channel->esnap_channels, esnap_channel);
   10136             : 
   10137          15 :                 if (ctx->abort_io) {
   10138             :                         spdk_bs_user_op_t *op, *tmp;
   10139             :                         /* The _SAFE variant is needed because entries are removed while iterating. */
   10140          10 :                         TAILQ_FOREACH_SAFE(op, &bs_channel->queued_io, link, tmp) {
   10141           0 :                                 if (op->back_channel == esnap_channel->channel) {
   10142           0 :                                         TAILQ_REMOVE(&bs_channel->queued_io, op, link);
   10143           0 :                                         bs_user_op_abort(op, -EIO);
   10144             :                                 }
   10145             :                         }
   10146             :                 }
   10147             : 
   10148          15 :                 bs_dev->destroy_channel(bs_dev, esnap_channel->channel);
   10149          15 :                 free(esnap_channel);
   10150             :         }
   10151             : 
   10152         180 :         spdk_for_each_channel_continue(i, 0);
   10153         180 : }
   10154             : 
   10155             : /*
   10156             :  * Destroy the channels for a specific blob on each thread with a blobstore channel. This should be
   10157             :  * used when closing an esnap clone blob and after decoupling from the parent.
                     :  *
                     :  * If the blob is not an esnap clone or has no back_bs_dev there is nothing to destroy and cb_fn
                     :  * (when not NULL) is called immediately with status 0. If the context cannot be allocated,
                     :  * cb_fn (when not NULL) is called with -ENOMEM. Otherwise bs->esnap_channels_unloading is
                     :  * incremented, blob_esnap_destroy_one_channel() is run over the blobstore's channels and
                     :  * blob_esnap_destroy_channels_done() finishes the operation and calls cb_fn.
                     :  *
                     :  * abort_io: when true, queued user operations that use a channel being destroyed are aborted
                     :  * with -EIO.
   10158             :  */
   10159             : static void
   10160         606 : blob_esnap_destroy_bs_dev_channels(struct spdk_blob *blob, bool abort_io,
   10161             :                                    spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
   10162             : {
   10163             :         struct blob_esnap_destroy_ctx   *ctx;
   10164             : 
   10165         606 :         if (!blob_is_esnap_clone(blob) || blob->back_bs_dev == NULL) {
   10166         436 :                 if (cb_fn != NULL) {
   10167         436 :                         cb_fn(cb_arg, blob, 0);
   10168             :                 }
   10169         436 :                 return;
   10170             :         }
   10171             : 
   10172         170 :         ctx = calloc(1, sizeof(*ctx));
   10173         170 :         if (ctx == NULL) {
   10174           0 :                 if (cb_fn != NULL) {
   10175           0 :                         cb_fn(cb_arg, blob, -ENOMEM);
   10176             :                 }
   10177           0 :                 return;
   10178             :         }
   10179         170 :         ctx->cb_fn = cb_fn;
   10180         170 :         ctx->cb_arg = cb_arg;
   10181         170 :         ctx->blob = blob;
   10182         170 :         ctx->back_bs_dev = blob->back_bs_dev;
   10183         170 :         ctx->abort_io = abort_io;
   10184             :         /* The per-channel callback uses the back_bs_dev saved above, not blob->back_bs_dev. */
   10185         170 :         SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": destroying channels for this blob\n",
   10186             :                       blob->id);
   10187             : 
   10188         170 :         blob->bs->esnap_channels_unloading++;
   10189         170 :         spdk_for_each_channel(blob->bs, blob_esnap_destroy_one_channel, ctx,
   10190             :                               blob_esnap_destroy_channels_done);
   10191             : }
   10192             : 
   10193             : /*
   10194             :  * Destroy all bs_dev channels on a specific blobstore channel. This should be used when a
   10195             :  * bs_channel is destroyed.
                     :  *
                     :  * Must run on the thread that owns ch (asserted). Every node of ch->esnap_channels is removed
                     :  * from the tree, its channel is released and the node is freed.
                     :  *
                     :  * NOTE(review): channels are released here with spdk_put_io_channel(), whereas
                     :  * blob_esnap_destroy_one_channel() goes through bs_dev->destroy_channel(); confirm the two are
                     :  * equivalent for every bs_dev implementation.
   10196             :  */
   10197             : static void
   10198        1269 : blob_esnap_destroy_bs_channel(struct spdk_bs_channel *ch)
   10199             : {
   10200             :         struct blob_esnap_channel *esnap_channel, *esnap_channel_tmp;
   10201             : 
   10202        1269 :         assert(spdk_get_thread() == spdk_io_channel_get_thread(spdk_io_channel_from_ctx(ch)));
   10203             : 
   10204        1269 :         SPDK_DEBUGLOG(blob_esnap, "destroying channels on thread %s\n",
   10205             :                       spdk_thread_get_name(spdk_get_thread()));
   10206        1309 :         RB_FOREACH_SAFE(esnap_channel, blob_esnap_channel_tree, &ch->esnap_channels,
   10207             :                         esnap_channel_tmp) {
   10208          40 :                 SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64
   10209             :                               ": destroying one channel in thread %s\n",
   10210             :                               esnap_channel->blob_id, spdk_thread_get_name(spdk_get_thread()));
   10211          40 :                 RB_REMOVE(blob_esnap_channel_tree, &ch->esnap_channels, esnap_channel);
   10212          40 :                 spdk_put_io_channel(esnap_channel->channel);
   10213          40 :                 free(esnap_channel);
   10214             :         }
   10215        1269 :         SPDK_DEBUGLOG(blob_esnap, "done destroying channels on thread %s\n",
   10216             :                       spdk_thread_get_name(spdk_get_thread()));
   10217        1269 : }
   10218             : /*
                     :  * Completion of the blob_unfreeze_io() call made by blob_frozen_set_back_bs_dev().
                     :  *
                     :  * An unfreeze failure is only logged. The status reported to the user callback is
                     :  * ctx->bserrno, i.e. the result recorded before the unfreeze was requested, not
                     :  * the bserrno passed to this function. The context is freed afterwards.
                     :  */
   10219             : static void
   10220          35 : blob_set_back_bs_dev_done(void *_ctx, int bserrno)
   10221             : {
   10222          35 :         struct set_bs_dev_ctx   *ctx = _ctx;
   10223             : 
   10224          35 :         if (bserrno != 0) {
   10225             :                 /* Even though the unfreeze failed, the update may have succeeded. */
   10226           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 ": unfreeze failed with error %d\n", ctx->blob->id,
   10227             :                             bserrno);
   10228             :         }
   10229          35 :         ctx->cb_fn(ctx->cb_arg, ctx->bserrno);
   10230          35 :         free(ctx);
   10231          35 : }
   10232             : /*
                     :  * Callback run after blob_esnap_destroy_bs_dev_channels() has finished for the
                     :  * blob; it is passed to that function by blob_set_back_bs_dev_frozen().
                     :  *
                     :  * On error the status is saved in ctx->bserrno and IO is unfrozen. Otherwise the
                     :  * current back_bs_dev (if any) is released with blob_unref_back_bs_dev(), the
                     :  * optional parent_refs_cb_fn is called, and ctx->back_bs_dev is installed as the
                     :  * blob's back_bs_dev. In every case the function ends by calling
                     :  * blob_unfreeze_io(), which completes in blob_set_back_bs_dev_done().
                     :  */
   10233             : static void
   10234          35 : blob_frozen_set_back_bs_dev(void *_ctx, struct spdk_blob *blob, int bserrno)
   10235             : {
   10236          35 :         struct set_bs_dev_ctx   *ctx = _ctx;
   10237             :         int rc;
   10238             : 
   10239          35 :         if (bserrno != 0) {
   10240           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 ": failed to release old back_bs_dev with error %d\n",
   10241             :                             blob->id, bserrno);
   10242           0 :                 ctx->bserrno = bserrno;
   10243           0 :                 blob_unfreeze_io(blob, blob_set_back_bs_dev_done, ctx);
   10244           0 :                 return;
   10245             :         }
   10246             : 
   10247          35 :         if (blob->back_bs_dev != NULL) {
   10248          35 :                 blob_unref_back_bs_dev(blob);
   10249             :         }
   10250             :         /* A failing parent_refs_cb_fn stops the update: the new device is not installed. */
   10251          35 :         if (ctx->parent_refs_cb_fn) {
   10252          25 :                 rc = ctx->parent_refs_cb_fn(blob, ctx->parent_refs_cb_arg);
   10253          25 :                 if (rc != 0) {
   10254           0 :                         ctx->bserrno = rc;
   10255           0 :                         blob_unfreeze_io(blob, blob_set_back_bs_dev_done, ctx);
   10256           0 :                         return;
   10257             :                 }
   10258             :         }
   10259             : 
   10260          35 :         SPDK_NOTICELOG("blob 0x%" PRIx64 ": hotplugged back_bs_dev\n", blob->id);
   10261          35 :         blob->back_bs_dev = ctx->back_bs_dev;
   10262          35 :         ctx->bserrno = 0;
   10263             : 
   10264          35 :         blob_unfreeze_io(blob, blob_set_back_bs_dev_done, ctx);
   10265             : }
   10266             : /*
                     :  * Step of the back_bs_dev replacement that runs once IO on the blob has been
                     :  * frozen - presumably the completion of a freeze request made by
                     :  * blob_set_back_bs_dev(), which is not part of this section.
                     :  *
                     :  * On error the user callback is called with bserrno and the context is freed.
                     :  * Otherwise the blob's esnap channels are destroyed with abort_io set to true and
                     :  * the operation continues in blob_frozen_set_back_bs_dev().
                     :  */
   10267             : static void
   10268          35 : blob_set_back_bs_dev_frozen(void *_ctx, int bserrno)
   10269             : {
   10270          35 :         struct set_bs_dev_ctx   *ctx = _ctx;
   10271          35 :         struct spdk_blob        *blob = ctx->blob;
   10272             : 
   10273          35 :         if (bserrno != 0) {
   10274           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 ": failed to freeze with error %d\n", blob->id,
   10275             :                             bserrno);
   10276           0 :                 ctx->cb_fn(ctx->cb_arg, bserrno);
   10277           0 :                 free(ctx);
   10278           0 :                 return;
   10279             :         }
   10280             : 
   10281             :         /*
   10282             :          * This does not prevent future reads from the esnap device because any future IO will
   10283             :          * lazily create a new esnap IO channel.
   10284             :          */
   10285          35 :         blob_esnap_destroy_bs_dev_channels(blob, true, blob_frozen_set_back_bs_dev, ctx);
   10286             : }
   10287             : /*
                     :  * Replace the external snapshot device of an esnap clone blob.
                     :  *
                     :  * Calls cb_fn with -EINVAL when blob is not an esnap clone. Otherwise the request
                     :  * is handed to blob_set_back_bs_dev() with no parent_refs callback (both related
                     :  * arguments are NULL), and cb_fn/cb_arg are passed along as the completion.
                     :  */
   10288             : void
   10289          10 : spdk_blob_set_esnap_bs_dev(struct spdk_blob *blob, struct spdk_bs_dev *back_bs_dev,
   10290             :                            spdk_blob_op_complete cb_fn, void *cb_arg)
   10291             : {
   10292          10 :         if (!blob_is_esnap_clone(blob)) {
   10293           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 ": not an esnap clone\n", blob->id);
   10294           0 :                 cb_fn(cb_arg, -EINVAL);
   10295           0 :                 return;
   10296             :         }
   10297             : 
   10298          10 :         blob_set_back_bs_dev(blob, back_bs_dev, NULL, NULL, cb_fn, cb_arg);
   10299             : }
   10300             : /*
                     :  * Return the back_bs_dev of an esnap clone blob. Logs an error and returns NULL
                     :  * when blob is not an esnap clone. The returned pointer is blob->back_bs_dev as it
                     :  * is at the time of the call; no reference is taken here.
                     :  */
   10301             : struct spdk_bs_dev *
   10302           5 : spdk_blob_get_esnap_bs_dev(const struct spdk_blob *blob)
   10303             : {
   10304           5 :         if (!blob_is_esnap_clone(blob)) {
   10305           0 :                 SPDK_ERRLOG("blob 0x%" PRIx64 ": not an esnap clone\n", blob->id);
   10306           0 :                 return NULL;
   10307             :         }
   10308             : 
   10309           5 :         return blob->back_bs_dev;
   10310             : }
   10311             : /*
                     :  * Report whether a blob is degraded.
                     :  *
                     :  * Returns true when the blobstore's device provides an is_degraded callback and
                     :  * that callback returns true. Otherwise the blob's back_bs_dev is consulted:
                     :  * false when there is no back_bs_dev or it has no is_degraded callback, else the
                     :  * value returned by that callback.
                     :  */
   10312             : bool
   10313          35 : spdk_blob_is_degraded(const struct spdk_blob *blob)
   10314             : {
   10315          35 :         if (blob->bs->dev->is_degraded != NULL && blob->bs->dev->is_degraded(blob->bs->dev)) {
   10316           5 :                 return true;
   10317             :         }
   10318          30 :         if (blob->back_bs_dev == NULL || blob->back_bs_dev->is_degraded == NULL) {
   10319          15 :                 return false;
   10320             :         }
   10321             : 
   10322          15 :         return blob->back_bs_dev->is_degraded(blob->back_bs_dev);
   10323             : }
   10324             : /* Register the "blob" and "blob_esnap" log components named in this file's SPDK_DEBUGLOG() calls. */
   10325           3 : SPDK_LOG_REGISTER_COMPONENT(blob)
   10326           3 : SPDK_LOG_REGISTER_COMPONENT(blob_esnap)
   10327             : /*
                     :  * Trace registration for the blobstore, registered at the end of this block via
                     :  * SPDK_TRACE_REGISTER_FN() under the name "blob" for TRACE_GROUP_BLOB.
                     :  *
                     :  * Describes two tracepoints, BLOB_REQ_SET_START and BLOB_REQ_SET_COMPLETE. Both
                     :  * use OWNER_TYPE_NONE and the object type OBJECT_BLOB_CB_ARG and take one
                     :  * pointer-typed argument named "ctx". The fifth field is 1 for the start point and
                     :  * 0 for the complete point - presumably the "new object" flag; confirm against
                     :  * struct spdk_trace_tpoint_opts.
                     :  */
   10328             : static void
   10329           0 : blob_trace(void)
   10330             : {
   10331           0 :         struct spdk_trace_tpoint_opts opts[] = {
   10332             :                 {
   10333             :                         "BLOB_REQ_SET_START", TRACE_BLOB_REQ_SET_START,
   10334             :                         OWNER_TYPE_NONE, OBJECT_BLOB_CB_ARG, 1,
   10335             :                         {
   10336             :                                 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }
   10337             :                         }
   10338             :                 },
   10339             :                 {
   10340             :                         "BLOB_REQ_SET_COMPLETE", TRACE_BLOB_REQ_SET_COMPLETE,
   10341             :                         OWNER_TYPE_NONE, OBJECT_BLOB_CB_ARG, 0,
   10342             :                         {
   10343             :                                 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }
   10344             :                         }
   10345             :                 },
   10346             :         };
   10347             :         /*
                     :          * Register the object type, then the tracepoint descriptions, then relate the
                     :          * bdev IO start/done tracepoints to OBJECT_BLOB_CB_ARG.
                     :          */
   10348           0 :         spdk_trace_register_object(OBJECT_BLOB_CB_ARG, 'a');
   10349           0 :         spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
   10350           0 :         spdk_trace_tpoint_register_relation(TRACE_BDEV_IO_START, OBJECT_BLOB_CB_ARG, 1);
   10351           0 :         spdk_trace_tpoint_register_relation(TRACE_BDEV_IO_DONE, OBJECT_BLOB_CB_ARG, 0);
   10352           0 : }
   10353           3 : SPDK_TRACE_REGISTER_FN(blob_trace, "blob", TRACE_GROUP_BLOB)

Generated by: LCOV version 1.15