From 0e9703628cbb5ec5cf6223c0b00e4f777f84b5f9 Mon Sep 17 00:00:00 2001 Message-Id: <0e9703628cbb5ec5cf6223c0b00e4f777f84b5f9.1369658547.git.minovotn@redhat.com> In-Reply-To: <07146f8b79923c529fd93fa528e6fcbd6f571a02.1369658547.git.minovotn@redhat.com> References: <07146f8b79923c529fd93fa528e6fcbd6f571a02.1369658547.git.minovotn@redhat.com> From: Fam Zheng Date: Mon, 20 May 2013 03:36:19 +0200 Subject: [PATCH 04/47] VMDK: introduce VmdkExtent RH-Author: Fam Zheng Message-id: <1369021022-22728-5-git-send-email-famz@redhat.com> Patchwork-id: 51440 O-Subject: [PATCH RHEL-6.5 qemu-kvm v3 04/47] VMDK: introduce VmdkExtent Bugzilla: 960685 RH-Acked-by: Stefan Hajnoczi RH-Acked-by: Jeffrey Cody RH-Acked-by: Kevin Wolf From: Fam Zheng Introduce a VmdkExtent array in BDRVVmdkState so that it can hold multiple image extents, in preparation for multi-file image support. Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf (cherry picked from commit b3976d3c8b895e8929d133b80ef5e373b60cf357) Signed-off-by: Fam Zheng Conflicts: block/vmdk.c Resolved conflicts with the coroutine changes, which were developed upstream later but backported earlier. 
--- block/vmdk.c | 351 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 247 insertions(+), 104 deletions(-) Signed-off-by: Michal Novotny --- block/vmdk.c | 351 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 247 insertions(+), 104 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index c447216..bbd2ca6 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -60,7 +60,11 @@ typedef struct { #define L2_CACHE_SIZE 16 -typedef struct BDRVVmdkState { +typedef struct VmdkExtent { + BlockDriverState *file; + bool flat; + int64_t sectors; + int64_t end_sector; int64_t l1_table_offset; int64_t l1_backup_table_offset; uint32_t *l1_table; @@ -74,9 +78,14 @@ typedef struct BDRVVmdkState { uint32_t l2_cache_counts[L2_CACHE_SIZE]; unsigned int cluster_sectors; +} VmdkExtent; +typedef struct BDRVVmdkState { CoMutex lock; uint32_t parent_cid; + int num_extents; + /* Extent array with num_extents entries, ascend ordered by address */ + VmdkExtent *extents; } BDRVVmdkState; typedef struct VmdkMetaData { @@ -107,6 +116,19 @@ static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) #define DESC_SIZE 20*SECTOR_SIZE // 20 sectors of 512 bytes each #define HEADER_SIZE 512 // first sector of 512 bytes +static void vmdk_free_extents(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + + for (i = 0; i < s->num_extents; i++) { + g_free(s->extents[i].l1_table); + g_free(s->extents[i].l2_cache); + g_free(s->extents[i].l1_backup_table); + } + g_free(s->extents); +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char desc[DESC_SIZE]; @@ -360,11 +382,50 @@ static int vmdk_parent_open(BlockDriverState *bs) return 0; } +/* Create and append extent to the extent array. Return the added VmdkExtent + * address. return NULL if allocation failed. 
*/ +static VmdkExtent *vmdk_add_extent(BlockDriverState *bs, + BlockDriverState *file, bool flat, int64_t sectors, + int64_t l1_offset, int64_t l1_backup_offset, + uint32_t l1_size, + int l2_size, unsigned int cluster_sectors) +{ + VmdkExtent *extent; + BDRVVmdkState *s = bs->opaque; + + s->extents = g_realloc(s->extents, + (s->num_extents + 1) * sizeof(VmdkExtent)); + extent = &s->extents[s->num_extents]; + s->num_extents++; + + memset(extent, 0, sizeof(VmdkExtent)); + extent->file = file; + extent->flat = flat; + extent->sectors = sectors; + extent->l1_table_offset = l1_offset; + extent->l1_backup_table_offset = l1_backup_offset; + extent->l1_size = l1_size; + extent->l1_entry_sectors = l2_size * cluster_sectors; + extent->l2_size = l2_size; + extent->cluster_sectors = cluster_sectors; + + if (s->num_extents > 1) { + extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; + } else { + extent->end_sector = extent->sectors; + } + bs->total_sectors = extent->end_sector; + return extent; +} + + static int vmdk_open(BlockDriverState *bs, int flags) { BDRVVmdkState *s = bs->opaque; uint32_t magic; - int l1_size, i; + int i; + uint32_t l1_size, l1_entry_sectors; + VmdkExtent *extent = NULL; if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic)) goto fail; @@ -372,32 +433,34 @@ static int vmdk_open(BlockDriverState *bs, int flags) magic = be32_to_cpu(magic); if (magic == VMDK3_MAGIC) { VMDK3Header header; - - if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header)) != sizeof(header)) + if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header)) + != sizeof(header)) { goto fail; - s->cluster_sectors = le32_to_cpu(header.granularity); - s->l2_size = 1 << 9; - s->l1_size = 1 << 6; - bs->total_sectors = le32_to_cpu(header.disk_sectors); - s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9; - s->l1_backup_table_offset = 0; - s->l1_entry_sectors = s->l2_size * s->cluster_sectors; + } + extent = vmdk_add_extent(bs, bs->file, false, 
+ le32_to_cpu(header.disk_sectors), + le32_to_cpu(header.l1dir_offset) << 9, 0, + 1 << 6, 1 << 9, le32_to_cpu(header.granularity)); } else if (magic == VMDK4_MAGIC) { VMDK4Header header; - - if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header)) != sizeof(header)) + if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header)) + != sizeof(header)) { goto fail; - bs->total_sectors = le64_to_cpu(header.capacity); - s->cluster_sectors = le64_to_cpu(header.granularity); - s->l2_size = le32_to_cpu(header.num_gtes_per_gte); - s->l1_entry_sectors = s->l2_size * s->cluster_sectors; - if (s->l1_entry_sectors <= 0) + } + l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte) + * le64_to_cpu(header.granularity); + l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) + / l1_entry_sectors; + extent = vmdk_add_extent(bs, bs->file, false, + le64_to_cpu(header.capacity), + le64_to_cpu(header.gd_offset) << 9, + le64_to_cpu(header.rgd_offset) << 9, + l1_size, + le32_to_cpu(header.num_gtes_per_gte), + le64_to_cpu(header.granularity)); + if (extent->l1_entry_sectors <= 0) { goto fail; - s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1) - / s->l1_entry_sectors; - s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9; - s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9; - + } // try to open parent images, if exist if (vmdk_parent_open(bs) != 0) goto fail; @@ -408,41 +471,50 @@ static int vmdk_open(BlockDriverState *bs, int flags) } /* read the L1 table */ - l1_size = s->l1_size * sizeof(uint32_t); - s->l1_table = g_malloc(l1_size); - if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, l1_size) != l1_size) + l1_size = extent->l1_size * sizeof(uint32_t); + extent->l1_table = g_malloc(l1_size); + if (bdrv_pread(bs->file, + extent->l1_table_offset, + extent->l1_table, + l1_size) + != l1_size) { goto fail; - for(i = 0; i < s->l1_size; i++) { - le32_to_cpus(&s->l1_table[i]); + } + for (i = 0; i < extent->l1_size; i++) { + 
le32_to_cpus(&extent->l1_table[i]); } - if (s->l1_backup_table_offset) { - s->l1_backup_table = g_malloc(l1_size); - if (bdrv_pread(bs->file, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size) + if (extent->l1_backup_table_offset) { + extent->l1_backup_table = g_malloc(l1_size); + if (bdrv_pread(bs->file, + extent->l1_backup_table_offset, + extent->l1_backup_table, + l1_size) + != l1_size) { goto fail; - for(i = 0; i < s->l1_size; i++) { - le32_to_cpus(&s->l1_backup_table[i]); + } + for (i = 0; i < extent->l1_size; i++) { + le32_to_cpus(&extent->l1_backup_table[i]); } } - s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t)); + extent->l2_cache = + g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t)); qemu_co_mutex_init(&s->lock); return 0; fail: - g_free(s->l1_backup_table); - g_free(s->l1_table); - g_free(s->l2_cache); + vmdk_free_extents(bs); return -1; } -static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, - uint64_t offset, int allocate); - -static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset, - uint64_t offset, int allocate) +static int get_whole_cluster(BlockDriverState *bs, + VmdkExtent *extent, + uint64_t cluster_offset, + uint64_t offset, + bool allocate) { - BDRVVmdkState *s = bs->opaque; - uint8_t whole_grain[s->cluster_sectors*512]; // 128 sectors * 512 bytes each = grain size 64KB + /* 128 sectors * 512 bytes each = grain size 64KB */ + uint8_t whole_grain[extent->cluster_sectors * 512]; // we will be here if it's first write on non-exist grain(cluster). 
// try to read from parent image, if exist @@ -453,14 +525,14 @@ static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset, return -1; ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain, - s->cluster_sectors); + extent->cluster_sectors); if (ret < 0) { return -1; } //Write grain only into the active image - ret = bdrv_write(bs->file, cluster_offset, whole_grain, - s->cluster_sectors); + ret = bdrv_write(extent->file, cluster_offset, whole_grain, + extent->cluster_sectors); if (ret < 0) { return -1; } @@ -468,29 +540,39 @@ static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset, return 0; } -static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data) +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) { - BDRVVmdkState *s = bs->opaque; - /* update L2 table */ - if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)), - &(m_data->offset), sizeof(m_data->offset)) < 0) + if (bdrv_pwrite_sync( + extent->file, + ((int64_t)m_data->l2_offset * 512) + + (m_data->l2_index * sizeof(m_data->offset)), + &(m_data->offset), + sizeof(m_data->offset) + ) < 0) { return -1; + } /* update backup L2 table */ - if (s->l1_backup_table_offset != 0) { - m_data->l2_offset = s->l1_backup_table[m_data->l1_index]; - if (bdrv_pwrite_sync(bs->file, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)), - &(m_data->offset), sizeof(m_data->offset)) < 0) + if (extent->l1_backup_table_offset != 0) { + m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; + if (bdrv_pwrite_sync( + extent->file, + ((int64_t)m_data->l2_offset * 512) + + (m_data->l2_index * sizeof(m_data->offset)), + &(m_data->offset), sizeof(m_data->offset) + ) < 0) { return -1; + } } return 0; } -static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, - uint64_t offset, int allocate) +static uint64_t get_cluster_offset(BlockDriverState *bs, + VmdkExtent 
*extent, + VmdkMetaData *m_data, + uint64_t offset, int allocate) { - BDRVVmdkState *s = bs->opaque; unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; uint32_t min_count, *l2_table, tmp = 0; @@ -499,21 +581,23 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, if (m_data) m_data->valid = 0; - l1_index = (offset >> 9) / s->l1_entry_sectors; - if (l1_index >= s->l1_size) + l1_index = (offset >> 9) / extent->l1_entry_sectors; + if (l1_index >= extent->l1_size) { return 0; - l2_offset = s->l1_table[l1_index]; - if (!l2_offset) + } + l2_offset = extent->l1_table[l1_index]; + if (!l2_offset) { return 0; + } for(i = 0; i < L2_CACHE_SIZE; i++) { - if (l2_offset == s->l2_cache_offsets[i]) { + if (l2_offset == extent->l2_cache_offsets[i]) { /* increment the hit count */ - if (++s->l2_cache_counts[i] == 0xffffffff) { + if (++extent->l2_cache_counts[i] == 0xffffffff) { for(j = 0; j < L2_CACHE_SIZE; j++) { - s->l2_cache_counts[j] >>= 1; + extent->l2_cache_counts[j] >>= 1; } } - l2_table = s->l2_cache + (i * s->l2_size); + l2_table = extent->l2_cache + (i * extent->l2_size); goto found; } } @@ -521,20 +605,25 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, min_index = 0; min_count = 0xffffffff; for(i = 0; i < L2_CACHE_SIZE; i++) { - if (s->l2_cache_counts[i] < min_count) { - min_count = s->l2_cache_counts[i]; + if (extent->l2_cache_counts[i] < min_count) { + min_count = extent->l2_cache_counts[i]; min_index = i; } } - l2_table = s->l2_cache + (min_index * s->l2_size); - if (bdrv_pread(bs->file, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) != - s->l2_size * sizeof(uint32_t)) + l2_table = extent->l2_cache + (min_index * extent->l2_size); + if (bdrv_pread( + extent->file, + (int64_t)l2_offset * 512, + l2_table, + extent->l2_size * sizeof(uint32_t) + ) != extent->l2_size * sizeof(uint32_t)) { return 0; + } - s->l2_cache_offsets[min_index] = l2_offset; - s->l2_cache_counts[min_index] 
= 1; + extent->l2_cache_offsets[min_index] = l2_offset; + extent->l2_cache_counts[min_index] = 1; found: - l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size; + l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; cluster_offset = le32_to_cpu(l2_table[l2_index]); if (!cluster_offset) { @@ -542,8 +631,11 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, return 0; // Avoid the L2 tables update for the images that have snapshots. - cluster_offset = bdrv_getlength(bs->file); - bdrv_truncate(bs->file, cluster_offset + (s->cluster_sectors << 9)); + cluster_offset = bdrv_getlength(extent->file); + bdrv_truncate( + extent->file, + cluster_offset + (extent->cluster_sectors << 9) + ); cluster_offset >>= 9; tmp = cpu_to_le32(cluster_offset); @@ -554,7 +646,8 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ - if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1) + if (get_whole_cluster( + bs, extent, cluster_offset, offset, allocate) == -1) return 0; if (m_data) { @@ -569,35 +662,71 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, return cluster_offset; } +static VmdkExtent *find_extent(BDRVVmdkState *s, + int64_t sector_num, VmdkExtent *start_hint) +{ + VmdkExtent *extent = start_hint; + + if (!extent) { + extent = &s->extents[0]; + } + while (extent < &s->extents[s->num_extents]) { + if (sector_num < extent->end_sector) { + return extent; + } + extent++; + } + return NULL; +} + static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVVmdkState *s = bs->opaque; - int index_in_cluster, n; - uint64_t cluster_offset; - qemu_co_mutex_lock(&s->lock); - cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0); - qemu_co_mutex_unlock(&s->lock); - index_in_cluster = 
sector_num % s->cluster_sectors; - n = s->cluster_sectors - index_in_cluster; + int64_t index_in_cluster, n, ret; + uint64_t offset; + VmdkExtent *extent; + + extent = find_extent(s, sector_num, NULL); + if (!extent) { + return 0; + } + if (extent->flat) { + n = extent->end_sector - sector_num; + ret = 1; + } else { + qemu_co_mutex_lock(&s->lock); + offset = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0); + qemu_co_mutex_unlock(&s->lock); + index_in_cluster = sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; + ret = offset ? 1 : 0; + } if (n > nb_sectors) n = nb_sectors; *pnum = n; - return (cluster_offset != 0); + return ret; } static int vmdk_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { BDRVVmdkState *s = bs->opaque; - int index_in_cluster, n, ret; + int ret; + uint64_t n, index_in_cluster; + VmdkExtent *extent = NULL; uint64_t cluster_offset; while (nb_sectors > 0) { - cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0); - index_in_cluster = sector_num % s->cluster_sectors; - n = s->cluster_sectors - index_in_cluster; + extent = find_extent(s, sector_num, extent); + if (!extent) { + return -EIO; + } + cluster_offset = get_cluster_offset( + bs, extent, NULL, sector_num << 9, 0); + index_in_cluster = sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; if (n > nb_sectors) n = nb_sectors; if (!cluster_offset) { @@ -637,10 +766,12 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { BDRVVmdkState *s = bs->opaque; - VmdkMetaData m_data; - int index_in_cluster, n; + VmdkExtent *extent = NULL; + int n; + int64_t index_in_cluster; uint64_t cluster_offset; static int cid_update = 0; + VmdkMetaData m_data; if (sector_num > bs->total_sectors) { fprintf(stderr, @@ -651,20 +782,35 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, } while (nb_sectors > 0) { - index_in_cluster = 
sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) - n = nb_sectors; - cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1); - if (!cluster_offset) + extent = find_extent(s, sector_num, extent); + if (!extent) { + return -EIO; + } + cluster_offset = get_cluster_offset( + bs, + extent, + &m_data, + sector_num << 9, 1); + if (!cluster_offset) { return -1; + } + index_in_cluster = sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } - if (bdrv_pwrite(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512) + if (bdrv_pwrite(bs->file, + cluster_offset + index_in_cluster * 512, + buf, n * 512) + != n * 512) { return -1; + } if (m_data.valid) { /* update L2 tables */ - if (vmdk_L2update(bs, &m_data) == -1) + if (vmdk_L2update(extent, &m_data) == -1) { return -1; + } } nb_sectors -= n; sector_num += n; @@ -850,10 +996,7 @@ exit: static void vmdk_close(BlockDriverState *bs) { - BDRVVmdkState *s = bs->opaque; - - g_free(s->l1_table); - g_free(s->l2_cache); + vmdk_free_extents(bs); } static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) -- 1.7.11.7