diff options
Diffstat (limited to 'fs')
277 files changed, 5004 insertions, 2269 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index b3c2cc79c20d..082d227fa56b 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -277,6 +277,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, case ACL_TYPE_ACCESS: if (acl) { struct iattr iattr; + struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); if (retval) @@ -287,6 +288,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, * by the mode bits. So don't * update ACL. */ + posix_acl_release(old_acl); value = NULL; size = 0; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 30ca770c5e0b..f8ab4a66acaf 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -483,6 +483,9 @@ static int v9fs_test_inode(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index afaa4b6de801..c3dd0d42bb3a 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -87,6 +87,9 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 81dd075356b9..d4fb0afc0097 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* try and do the mount */ _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); _debug("--- mount result %p ---", mnt); free_page((unsigned long) devname); @@ -1085,7 +1085,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) * Tell lockdep we inherited freeze protection from submission * thread. */ - __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); file_end_write(file); } @@ -1492,7 +1493,8 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, * by telling it the lock got released so that it doesn't * complain about held lock when we return to userspace. */ - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } kfree(iovec); return ret; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index fc09eb77ddf3..ffc69dd28928 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -345,7 +345,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, int status; token = (autofs_wqt_t) param->fail.token; - status = param->fail.status ? param->fail.status : -ENOENT; + status = param->fail.status < 0 ? param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e44271dfceb6..4c71dba90120 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -87,7 +87,8 @@ static int autofs4_write(struct autofs_sb_info *sbi, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - return (bytes > 0); + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } static void autofs4_notify_daemon(struct autofs_sb_info *sbi, @@ -101,6 +102,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } pkt; struct file *pipe = NULL; size_t pktsz; + int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, @@ -175,7 +177,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); if (autofs4_write(sbi, pipe, &pkt, pktsz)) + switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs4_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: autofs4_catatonic_mode(sbi); + break; + } fput(pipe); } @@ -431,8 +444,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_real_cred()->uid; - wq->gid = current_real_cred()->gid; + wq->uid = current_cred()->uid; + wq->gid = current_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2472af2798c7..1fdf4e5bf8c6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -911,17 +911,60 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; vaddr = elf_ppnt->p_vaddr; + /* + * If we are loading ET_EXEC or we have already performed + * the ET_DYN load_addr calculations, proceed normally. + */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { elf_flags |= MAP_FIXED; } else if (loc->elf_ex.e_type == ET_DYN) { - /* Try and get dynamic programs out of the way of the - * default mmap base, as well as whatever program they - * might try to exec. This is because the brk will - * follow the loader, and is not movable. */ - load_bias = ELF_ET_DYN_BASE - vaddr; - if (current->flags & PF_RANDOMIZE) - load_bias += arch_mmap_rnd(); - load_bias = ELF_PAGESTART(load_bias); + /* + * This logic is run once for the first LOAD Program + * Header for ET_DYN binaries to calculate the + * randomization (load_bias) for all the LOAD + * Program Headers, and to calculate the entire + * size of the ELF mapping (total_size). (Note that + * load_addr_set is set to true later once the + * initial mapping is performed.) + * + * There are effectively two types of ET_DYN + * binaries: programs (i.e. PIE: ET_DYN with INTERP) + * and loaders (ET_DYN without INTERP, since they + * _are_ the ELF interpreter). The loaders must + * be loaded away from programs since the program + * may otherwise collide with the loader (especially + * for ET_EXEC which does not have a randomized + * position). For example to handle invocations of + * "./ld.so someprog" to test out a new version of + * the loader, the subsequent program that the + * loader loads must avoid the loader itself, so + * they cannot share the same load range. Sufficient + * room for the brk must be allocated with the + * loader as well, since brk must be available with + * the loader. + * + * Therefore, programs are loaded offset from + * ELF_ET_DYN_BASE and loaders are loaded into the + * independently randomized mmap region (0 load_bias + * without MAP_FIXED). + */ + if (elf_interpreter) { + load_bias = ELF_ET_DYN_BASE; + if (current->flags & PF_RANDOMIZE) + load_bias += arch_mmap_rnd(); + elf_flags |= MAP_FIXED; + } else + load_bias = 0; + + /* + * Since load_bias is used for all subsequent loading + * calculations, we must lower it by the first vaddr + * so that the remaining calculations based on the + * ELF vaddrs will be correctly offset. The result + * is then page aligned. + */ + load_bias = ELF_PAGESTART(load_bias - vaddr); + total_size = total_mapping_size(elf_phdata, loc->elf_ex.e_phnum); if (!total_size) { @@ -2296,6 +2339,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; } } + dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git a/fs/block_dev.c b/fs/block_dev.c index 092a2eed1628..cb936c90ae82 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -102,12 +102,11 @@ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0) - return; - - invalidate_bh_lrus(); - lru_add_drain_all(); /* make sure all lru add caches are flushed */ - invalidate_mapping_pages(mapping, 0, -1); + if (mapping->nrpages) { + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + } /* 99% of the time, we don't need to flush the cleancache on the bdev. * But, for the strange corners, lets be cautious */ @@ -451,10 +450,12 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true); - if (result) + if (result) { end_page_writeback(page); - else + } else { + clean_page_buffers(page); unlock_page(page); + } blk_queue_exit(bdev->bd_queue); return result; } @@ -714,7 +715,7 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; - bdev->bd_block_size = (1 << inode->i_blkbits); + bdev->bd_block_size = i_blocksize(inode); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; @@ -1165,7 +1166,6 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 247b8dfaf6e5..8d8370ddb6b2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -78,12 +78,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (ret) - return ret; - } - ret = 0; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) @@ -119,6 +113,13 @@ out: int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + int ret; + + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) + return ret; + } return __btrfs_set_acl(NULL, inode, acl, type); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5909ae8c6731..c4cff5cc9c93 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3397,13 +3397,6 @@ again: goto again; } - /* We've already setup this transaction, go ahead and exit */ - if (block_group->cache_generation == trans->transid && - i_size_read(inode)) { - dcs = BTRFS_DC_SETUP; - goto out_put; - } - /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -3427,6 +3420,13 @@ again: } WARN_ON(ret); + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + if (i_size_read(inode) > 0) { ret = btrfs_check_trunc_cache_free_space(root, &root->fs_info->global_block_rsv); @@ -3984,6 +3984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, info->space_info_kobj, "%s", alloc_name(found->flags)); if (ret) { + percpu_counter_destroy(&found->total_bytes_pinned); kfree(found); return ret; } @@ -4758,10 +4759,6 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, flush)) { - spin_unlock(&space_info->lock); - break; - } if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { spin_unlock(&space_info->lock); @@ -7400,7 +7397,8 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); + /* We should only have one-level nested. */ + down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); spin_lock(&cluster->refill_lock); if (used_bg == cluster->block_group) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3a14c87d9c92..3286a6e47ff0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2842,7 +2842,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, 1 << inode->i_blkbits, + range->len, i_blocksize(inode), offset + len, &alloc_hint); else btrfs_free_reserved_data_space(inode, range->start, diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index a97fdc156a03..baacc1866861 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) { SHASH_DESC_ON_STACK(shash, tfm); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = tfm; @@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index be4da91d880f..f089d7d8afe7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4480,8 +4480,19 @@ search_again: if (found_type > min_type) { del_item = 1; } else { - if (item_end < new_size) + if (item_end < new_size) { + /* + * With NO_HOLES mode, for the following mapping + * + * [0-4k][hole][8k-12k] + * + * if truncating isize down to 6k, it ends up + * isize being 8k. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + last_size = new_size; break; + } if (found_key.offset >= new_size) del_item = 1; else @@ -7235,7 +7246,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, struct extent_map *em = NULL; int ret; - down_read(&BTRFS_I(inode)->dio_sem); if (type != BTRFS_ORDERED_NOCOW) { em = create_pinned_em(inode, start, len, orig_start, block_start, block_len, orig_block_len, @@ -7254,7 +7264,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, em = ERR_PTR(ret); } out: - up_read(&BTRFS_I(inode)->dio_sem); return em; } @@ -7435,8 +7444,8 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) int found = false; void **pagep = NULL; struct page *page = NULL; - int start_idx; - int end_idx; + unsigned long start_idx; + unsigned long end_idx; start_idx = start >> PAGE_SHIFT; @@ -7639,11 +7648,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode, * within our reservation, otherwise we need to adjust our inode * counter appropriately. */ - if (dio_data->outstanding_extents) { + if (dio_data->outstanding_extents >= num_extents) { dio_data->outstanding_extents -= num_extents; } else { + /* + * If dio write length has been split due to no large enough + * contiguous space, we need to compensate our inode counter + * appropriately. + */ + u64 num_needed = num_extents - dio_data->outstanding_extents; + spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_extents; + BTRFS_I(inode)->outstanding_extents += num_needed; spin_unlock(&BTRFS_I(inode)->lock); } } @@ -8034,8 +8050,10 @@ next_block_or_try_again: start += sectorsize; - if (nr_sectors--) { + nr_sectors--; + if (nr_sectors) { pgoff += sectorsize; + ASSERT(pgoff < PAGE_SIZE); goto next_block_or_try_again; } } @@ -8141,8 +8159,10 @@ next: ASSERT(nr_sectors); - if (--nr_sectors) { + nr_sectors--; + if (nr_sectors) { pgoff += sectorsize; + ASSERT(pgoff < PAGE_SIZE); goto next_block; } } @@ -8707,6 +8727,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.unsubmitted_oe_range_start = (u64)offset; dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; + down_read(&BTRFS_I(inode)->dio_sem); } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_end(inode); @@ -8719,6 +8740,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) iter, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { + up_read(&BTRFS_I(inode)->dio_sem); current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1782804f6c26..0fe346c4bd28 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3052,7 +3052,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, out: if (ret) btrfs_cmp_data_free(cmp); - return 0; + return ret; } static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, @@ -4082,6 +4082,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } + if (!is_fstree(new_root->objectid)) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); if (!path) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2cf5e142675e..04c61bcf62e5 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2367,11 +2367,11 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->node = NULL; reloc_root->commit_root = NULL; - __del_reloc_root(reloc_root); } } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 71261b459863..77f9efc1f7aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1680,6 +1680,9 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) { int ret; + if (ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + ret = get_cur_inode_state(sctx, ino, gen); if (ret < 0) goto out; @@ -1865,7 +1868,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. */ - if (sctx->parent_root) { + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 74ed5aae6cea..f6e111984ce2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1834,6 +1834,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_qgroup_rescan_resume(fs_info); + if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b89004513c09..309313b71617 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -37,6 +37,7 @@ */ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 +#define LOG_OTHER_INODE 2 /* * directory trouble cases @@ -4623,7 +4624,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode) || (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags) && - inode_only == LOG_INODE_EXISTS)) + inode_only >= LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; @@ -4647,7 +4648,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return ret; } - mutex_lock(&BTRFS_I(inode)->log_mutex); + if (inode_only == LOG_OTHER_INODE) { + inode_only = LOG_INODE_EXISTS; + mutex_lock_nested(&BTRFS_I(inode)->log_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&BTRFS_I(inode)->log_mutex); + } /* * a brute force approach to making sure we get the most uptodate @@ -4799,7 +4806,7 @@ again: * unpin it. */ err = btrfs_log_inode(trans, root, other_inode, - LOG_INODE_EXISTS, + LOG_OTHER_INODE, 0, LLONG_MAX, ctx); iput(other_inode); if (err) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 7fc89e4adb41..83bb2f2aa83c 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -351,7 +351,5 @@ skip: out: btrfs_free_path(path); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); - return 0; + return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 71a60cc01451..06a77e47957d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6226,7 +6226,7 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || - (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { + (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); continue; } diff --git a/fs/buffer.c b/fs/buffer.c index b205a629001d..5d8f496d624e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2353,7 +2353,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; + unsigned int blocksize = i_blocksize(inode); struct page *page; void *fsdata; pgoff_t index, curidx; @@ -2433,8 +2433,8 @@ int cont_write_begin(struct file *file, struct address_space *mapping, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; - unsigned zerofrom; + unsigned int blocksize = i_blocksize(inode); + unsigned int zerofrom; int err; err = cont_expand_zero(file, mapping, pos, bytes); @@ -2796,7 +2796,7 @@ int nobh_truncate_page(struct address_space *mapping, struct buffer_head map_bh; int err; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); length = offset & (blocksize - 1); /* Block boundary? Nothing to do */ @@ -2874,7 +2874,7 @@ int block_truncate_page(struct address_space *mapping, struct buffer_head *bh; int err; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); length = offset & (blocksize - 1); /* Block boundary? Nothing to do */ @@ -2986,7 +2986,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, struct inode *inode = mapping->host; tmp.b_state = 0; tmp.b_blocknr = 0; - tmp.b_size = 1 << inode->i_blkbits; + tmp.b_size = i_blocksize(inode); get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ef3ebd780aff..7b79a54a2789 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -188,7 +188,7 @@ static int ceph_releasepage(struct page *page, gfp_t g) /* * read a single page, without unlocking it. */ -static int readpage_nounlock(struct file *filp, struct page *page) +static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); @@ -218,7 +218,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_readpage_from_fscache(inode, page); if (err == 0) - goto out; + return -EINPROGRESS; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); @@ -248,8 +248,11 @@ out: static int ceph_readpage(struct file *filp, struct page *page) { - int r = readpage_nounlock(filp, page); - unlock_page(page); + int r = ceph_do_readpage(filp, page); + if (r != -EINPROGRESS) + unlock_page(page); + else + r = 0; return r; } @@ -315,7 +318,32 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) struct page **pages; pgoff_t next_index; int nr_pages = 0; - int ret; + int got = 0; + int ret = 0; + + if (!current->journal_info) { + /* caller of readpages does not hold buffer and read caps + * (fadvise, madvise and readahead cases) */ + int want = CEPH_CAP_FILE_CACHE; + ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got); + if (ret < 0) { + dout("start_read %p, error getting cap\n", inode); + } else if (!(got & want)) { + dout("start_read %p, no cache cap\n", inode); + ret = 0; + } + if (ret <= 0) { + if (got) + ceph_put_cap_refs(ci, got); + while (!list_empty(page_list)) { + page = list_entry(page_list->prev, + struct page, lru); + list_del(&page->lru); + put_page(page); + } + return ret; + } + } off = (u64) page_offset(page); @@ -338,15 +366,18 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } /* build page vector */ nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); - ret = -ENOMEM; - if (!pages) - goto out; + if (!pages) { + ret = -ENOMEM; + goto out_put; + } for (i = 0; i < nr_pages; ++i) { page = list_entry(page_list->prev, struct page, lru); BUG_ON(PageLocked(page)); @@ -363,6 +394,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) nr_pages = i; if (nr_pages > 0) { len = nr_pages << PAGE_SHIFT; + osd_req_op_extent_update(req, 0, len); break; } goto out_pages; @@ -378,6 +410,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) if (ret < 0) goto out_pages; ceph_osdc_put_request(req); + + /* After adding locked pages to page cache, the inode holds cache cap. + * So we can drop our cap refs. */ + if (got) + ceph_put_cap_refs(ci, got); + return nr_pages; out_pages: @@ -386,8 +424,11 @@ out_pages: unlock_page(pages[i]); } ceph_put_page_vector(pages, nr_pages, false); -out: +out_put: ceph_osdc_put_request(req); +out: + if (got) + ceph_put_cap_refs(ci, got); return ret; } @@ -424,7 +465,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, rc = start_read(inode, page_list, max); if (rc < 0) goto out; - BUG_ON(rc == 0); } out: ceph_fscache_readpages_cancel(inode, page_list); @@ -708,7 +748,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct pagevec pvec; int done = 0; int rc = 0; - unsigned wsize = 1 << inode->i_blkbits; + unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; int do_sync = 0; loff_t snap_size, i_size; @@ -1198,7 +1238,7 @@ retry_locked: goto retry_locked; r = writepage_nounlock(page, NULL); if (r < 0) - goto fail_nosnap; + goto fail_unlock; goto retry_locked; } @@ -1226,11 +1266,14 @@ retry_locked: } /* we need to read it. */ - r = readpage_nounlock(file, page); - if (r < 0) - goto fail_nosnap; + r = ceph_do_readpage(file, page); + if (r < 0) { + if (r == -EINPROGRESS) + return -EAGAIN; + goto fail_unlock; + } goto retry_locked; -fail_nosnap: +fail_unlock: unlock_page(page); return r; } @@ -1371,9 +1414,11 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || - ci->i_inline_version == CEPH_INLINE_NONE) + ci->i_inline_version == CEPH_INLINE_NONE) { + current->journal_info = vma->vm_file; ret = filemap_fault(vma, vmf); - else + current->journal_info = NULL; + } else ret = -EAGAIN; dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 5bc5d37b1217..a2d7997afd94 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -240,13 +240,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) } } -static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); -} - -static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) { if (!error) SetPageUptodate(page); @@ -274,7 +268,7 @@ int ceph_readpage_from_fscache(struct inode *inode, struct page *page) return -ENOBUFS; ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_vfs_readpage_complete, NULL, + ceph_readpage_from_fscache_complete, NULL, GFP_KERNEL); switch (ret) { @@ -303,7 +297,7 @@ int ceph_readpages_from_fscache(struct inode *inode, return -ENOBUFS; ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_vfs_readpage_complete_unlock, + ceph_readpage_from_fscache_complete, NULL, mapping_gfp_mask(mapping)); switch (ret) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f3f21105b860..3e1c136aadb7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1900,6 +1900,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) retry: spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + spin_unlock(&ci->i_ceph_lock); dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); goto out; } @@ -1917,8 +1918,10 @@ retry: mutex_lock(&session->s_mutex); goto retry; } - if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) + if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) { + spin_unlock(&ci->i_ceph_lock); goto out; + } flushing = __mark_caps_flushing(inode, session, true, &flush_tid, &oldest_flush_tid); @@ -2479,6 +2482,27 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } +int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) +{ + int ret, err = 0; + + BUG_ON(need & ~CEPH_CAP_FILE_RD); + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); + ret = ceph_pool_perm_check(ci, need); + if (ret < 0) + return ret; + + ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); + if (ret) { + if (err == -EAGAIN) { + ret = 0; + } else if (err < 0) { + ret = err; + } + } + return ret; +} + /* * Wait for caps, and take cap references. If we can't get a WR cap * due to a small max_size, make sure we check_max_size (and possibly diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1afa11191000..cec25691cbae 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -292,6 +292,11 @@ out: if (ret < 0) err = ret; dput(last); + /* last_name no longer match cache index */ + if (fi->readdir_cache_idx >= 0) { + fi->readdir_cache_idx = -1; + fi->dir_release_count = 0; + } } return err; } @@ -315,7 +320,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_mds_client *mdsc = fsc->mdsc; int i; int err; - u32 ftype; + unsigned frag = -1; struct ceph_mds_reply_info_parsed *rinfo; dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); @@ -362,7 +367,6 @@ more: /* do we have the correct frag content buffered? */ if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; - unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -373,8 +377,11 @@ more: } if (is_hash_order(ctx->pos)) { - frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), - NULL, NULL); + /* fragtree isn't always accurate. choose frag + * based on previous reply when possible. */ + if (frag == (unsigned)-1) + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); } else { frag = fpos_frag(ctx->pos); } @@ -497,6 +504,7 @@ more: struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; + u32 ftype; BUG_ON(rde->offset < ctx->pos); @@ -519,15 +527,17 @@ more: ctx->pos++; } + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + if (fi->next_offset > 2) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + frag = fi->frag; goto more; } /* more frags? */ if (!ceph_frag_is_rightmost(fi->frag)) { - unsigned frag = ceph_frag_next(fi->frag); + frag = ceph_frag_next(fi->frag); if (is_hash_order(ctx->pos)) { loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), fi->next_offset, true); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f995e3528a33..ca3f630db90f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1249,8 +1249,9 @@ again: dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - + current->journal_info = filp; ret = generic_file_read_iter(iocb, to); + current->journal_info = NULL; } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 12f2252f6c98..4a6df2ce0f76 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1323,8 +1323,8 @@ retry_lookup: ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); - } else { - if (have_lease && d_unhashed(dn)) + } else if (have_lease) { + if (d_unhashed(dn)) d_add(dn, NULL); update_dentry_lease(dn, rinfo->dlease, session, @@ -2080,11 +2080,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); - if (ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, attr->ia_mode); - if (err) - goto out_put; - } if (mask) { req->r_inode = inode; @@ -2098,13 +2093,11 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - if (mask & CEPH_SETATTR_SIZE) - __ceph_do_pending_vmtruncate(inode); - ceph_free_cap_flush(prealloc_cf); - return err; -out_put: - ceph_mdsc_put_request(req); ceph_free_cap_flush(prealloc_cf); + + if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) + __ceph_do_pending_vmtruncate(inode); + return err; } @@ -2123,7 +2116,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; - return __ceph_setattr(inode, attr); + err = __ceph_setattr(inode, attr); + + if (err >= 0 && (attr->ia_valid & ATTR_MODE)) + err = posix_acl_chmod(inode, attr->ia_mode); + + return err; } /* diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 7d752d53353a..4c9c72f26eb9 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) l.stripe_count = ci->i_layout.stripe_count; l.object_size = ci->i_layout.object_size; l.data_pool = ci->i_layout.pool_id; - l.preferred_osd = (s32)-1; + l.preferred_osd = -1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; } @@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) nl.data_pool = ci->i_layout.pool_id; /* this is obsolete, and always -1 */ - nl.preferred_osd = le64_to_cpu(-1); + nl.preferred_osd = -1; err = __validate_layout(mdsc, &nl); if (err) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6a26c7bd1286..c0f52c443c34 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + /* Never leave an unregistered request on an unsafe list! */ + list_del_init(&req->r_unsafe_item); + if (req->r_tid == mdsc->oldest_tid) { struct rb_node *p = rb_next(&req->r_node); mdsc->oldest_tid = 0; @@ -1036,7 +1039,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); - list_del_init(&req->r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); __unregister_request(mdsc, req); @@ -1780,13 +1782,18 @@ static int build_dentry_path(struct dentry *dentry, int *pfreepath) { char *path; + struct inode *dir; - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { - *pino = ceph_ino(d_inode(dentry->d_parent)); + rcu_read_lock(); + dir = d_inode_rcu(dentry->d_parent); + if (dir && ceph_snap(dir) == CEPH_NOSNAP) { + *pino = ceph_ino(dir); + rcu_read_unlock(); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; } + rcu_read_unlock(); path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); if (IS_ERR(path)) return PTR_ERR(path); @@ -2423,7 +2430,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * useful we could do with a revised return value. */ dout("got safe reply %llu, mds%d\n", tid, mds); - list_del_init(&req->r_unsafe_item); /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3e3fa9163059..622d5dd9f616 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -905,6 +905,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); +extern int ceph_try_get_caps(struct ceph_inode_info *ci, + int need, int want, int *got); /* for counting open files by mode */ extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index febc28f9e2c2..75267cdd5dfd 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -392,6 +392,7 @@ static int __set_xattr(struct ceph_inode_info *ci, if (update_xattr) { int err = 0; + if (xattr && (flags & XATTR_CREATE)) err = -EEXIST; else if (!xattr && (flags & XATTR_REPLACE)) @@ -399,12 +400,14 @@ static int __set_xattr(struct ceph_inode_info *ci, if (err) { kfree(name); kfree(val); + kfree(*newxattr); return err; } if (update_xattr < 0) { if (xattr) __remove_xattr(ci, xattr); kfree(name); + kfree(*newxattr); return 0; } } diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ec9dbbcca3b9..9156be545b0f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -245,7 +245,8 @@ compose_mount_options_err: * @fullpath: full path in UNC format * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, +static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, const char *fullpath, const struct dfs_info3_param *ref) { struct vfsmount *mnt; @@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; - mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata); kfree(mountdata); kfree(devname); return mnt; @@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-EINVAL); break; } - mnt = cifs_dfs_do_refmount(cifs_sb, + mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, referrals + i); cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, referrals[i].node_name, mnt); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 02b071bf3732..a0b3e7d1be48 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_COLON: *target = ':'; break; + case SFM_DOUBLEQUOTE: + *target = '"'; + break; case SFM_ASTERISK: *target = '*'; break; @@ -418,6 +421,9 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string) case ':': dest_char = cpu_to_le16(SFM_COLON); break; + case '"': + dest_char = cpu_to_le16(SFM_DOUBLEQUOTE); + break; case '*': dest_char = cpu_to_le16(SFM_ASTERISK); break; diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 479bc0a941f3..07ade707fa60 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -57,6 +57,7 @@ * not conflict (although almost does) with the mapping above. */ +#define SFM_DOUBLEQUOTE ((__u16) 0xF020) #define SFM_ASTERISK ((__u16) 0xF021) #define SFM_QUESTION ((__u16) 0xF025) #define SFM_COLON ((__u16) 0xF022) @@ -64,8 +65,8 @@ #define SFM_LESSTHAN ((__u16) 0xF023) #define SFM_PIPE ((__u16) 0xF027) #define SFM_SLASH ((__u16) 0xF026) -#define SFM_PERIOD ((__u16) 0xF028) -#define SFM_SPACE ((__u16) 0xF029) +#define SFM_SPACE ((__u16) 0xF028) +#define SFM_PERIOD ((__u16) 0xF029) /* * Mapping mechanism to use when one of the seven reserved characters is diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 15261ba464c5..87658f63b374 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -87,6 +87,7 @@ extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; +struct workqueue_struct *cifsoplockd_wq; __u32 cifs_lock_secret; /* @@ -1283,9 +1284,16 @@ init_cifs(void) goto out_clean_proc; } + cifsoplockd_wq = alloc_workqueue("cifsoplockd", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!cifsoplockd_wq) { + rc = -ENOMEM; + goto out_destroy_cifsiod_wq; + } + rc = cifs_fscache_register(); if (rc) - goto out_destroy_wq; + goto out_destroy_cifsoplockd_wq; rc = cifs_init_inodecache(); if (rc) @@ -1333,7 +1341,9 @@ out_destroy_inodecache: cifs_destroy_inodecache(); out_unreg_fscache: cifs_fscache_unregister(); -out_destroy_wq: +out_destroy_cifsoplockd_wq: + destroy_workqueue(cifsoplockd_wq); +out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); out_clean_proc: cifs_proc_clean(); @@ -1350,12 +1360,13 @@ exit_cifs(void) exit_cifs_idmap(); #endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); + exit_cifs_spnego(); #endif cifs_destroy_request_bufs(); cifs_destroy_mids(); cifs_destroy_inodecache(); cifs_fscache_unregister(); + destroy_workqueue(cifsoplockd_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 203287f86525..7b496a4e650e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -241,6 +241,7 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *, struct cifsInodeInfo *, bool); /* process transaction2 response */ @@ -364,6 +365,8 @@ struct smb_version_operations { unsigned int (*calc_smb_size)(void *); /* check for STATUS_PENDING and process it in a positive case */ bool (*is_status_pending)(char *, struct TCP_Server_Info *, int); + /* check for STATUS_NETWORK_SESSION_EXPIRED */ + bool (*is_session_expired)(char *); /* send oplock break response */ int (*oplock_response)(struct cifs_tcon *, struct cifs_fid *, struct cifsInodeInfo *); @@ -930,7 +933,6 @@ struct cifs_tcon { bool use_persistent:1; /* use persistent instead of durable handles */ #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ - bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1315,12 +1317,19 @@ struct mid_q_entry { void *callback_data; /* general purpose pointer for callback */ void *resp_buf; /* pointer to received SMB header */ int mid_state; /* wish this were enum but can not pass to wait_event */ + unsigned int mid_flags; __le16 command; /* smb command code */ bool large_buf:1; /* if valid response, is pointer to large buf */ bool multiRsp:1; /* multiple trans2 responses for one request */ bool multiEnd:1; /* both received */ }; +struct close_cancelled_open { + struct cifs_fid fid; + struct cifs_tcon *tcon; + struct work_struct work; +}; + /* Make code in transport.c a little cleaner by moving update of optional stats into function below */ #ifdef CONFIG_CIFS_STATS2 @@ -1452,6 +1461,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define MID_RESPONSE_MALFORMED 0x10 #define MID_SHUTDOWN 0x20 +/* Flags */ +#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ + /* Types of response buffer returned from SendReceive2 */ #define CIFS_NO_BUFFER 0 /* Response buffer not returned */ #define CIFS_SMALL_BUFFER 1 @@ -1641,6 +1653,7 @@ void cifs_oplock_break(struct work_struct *work); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; +extern struct workqueue_struct *cifsoplockd_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e3fed9249a04..cc420d6b71f7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -717,6 +717,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server) if (rc) return rc; + if (server->capabilities & CAP_UNICODE) + smb->hdr.Flags2 |= SMBFLG2_UNICODE; + /* set up echo request */ smb->hdr.Tid = 0xffff; smb->hdr.WordCount = 1; @@ -1423,6 +1426,8 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) length = discard_remaining_data(server); dequeue_mid(mid, rdata->result); + mid->resp_buf = server->smallbuf; + server->smallbuf = NULL; return length; } @@ -1452,6 +1457,13 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return length; server->total_read += length; + if (server->ops->is_session_expired && + server->ops->is_session_expired(buf)) { + cifs_reconnect(server); + wake_up(&server->response_q); + return -1; + } + if (server->ops->is_status_pending && server->ops->is_status_pending(buf, server, 0)) { discard_remaining_data(server); @@ -1534,6 +1546,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return cifs_readv_discard(server, mid); dequeue_mid(mid, false); + mid->resp_buf = server->smallbuf; + server->smallbuf = NULL; return length; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 893be0722643..580b3a4ca53a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -796,6 +796,13 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); + if (server->ops->is_session_expired && + server->ops->is_session_expired(buf)) { + cifs_reconnect(server); + wake_up(&server->response_q); + return -1; + } + if (server->ops->is_status_pending && server->ops->is_status_pending(buf, server, length)) return -1; @@ -882,10 +889,19 @@ cifs_demultiplex_thread(void *p) server->lstrp = jiffies; if (mid_entry != NULL) { + if ((mid_entry->mid_flags & MID_WAIT_CANCELLED) && + mid_entry->mid_state == MID_RESPONSE_RECEIVED && + server->ops->handle_cancelled_mid) + server->ops->handle_cancelled_mid( + mid_entry->resp_buf, + server); + if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); - } else if (!server->ops->is_oplock_break || - !server->ops->is_oplock_break(buf, server)) { + } else if (server->ops->is_oplock_break && + server->ops->is_oplock_break(buf, server)) { + cifs_dbg(FYI, "Received oplock break\n"); + } else { cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, @@ -2830,16 +2846,14 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; + bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; + bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; - if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { - if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) - return 0; - /* The prepath should be null terminated strings */ - if (strcmp(new->prepath, old->prepath)) - return 0; - + if (old_set && new_set && !strcmp(new->prepath, old->prepath)) return 1; - } + else if (!old_set && !new_set) + return 1; + return 0; } @@ -4064,6 +4078,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); + if (ses->auth_key.response) { + cifs_dbg(VFS, "Free previous auth_key.response = %p\n", + ses->auth_key.response); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + } + if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 789ff1df2d8d..d9cbda269462 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -183,15 +183,21 @@ cifs_bp_rename_retry: } /* + * Don't allow path components longer than the server max. * Don't allow the separator character in a path component. * The VFS will not allow "/", but "\" is allowed by posix. */ static int -check_name(struct dentry *direntry) +check_name(struct dentry *direntry, struct cifs_tcon *tcon) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); int i; + if (unlikely(tcon->fsAttrInfo.MaxPathNameComponentLength && + direntry->d_name.len > + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength))) + return -ENAMETOOLONG; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { for (i = 0; i < direntry->d_name.len; i++) { if (direntry->d_name.name[i] == '\\') { @@ -489,10 +495,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, return finish_no_open(file, res); } - rc = check_name(direntry); - if (rc) - return rc; - xid = get_xid(); cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", @@ -505,6 +507,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, } tcon = tlink_tcon(tlink); + + rc = check_name(direntry, tcon); + if (rc) + goto out; + server = tcon->ses->server; if (server->ops->new_lease_key) @@ -765,7 +772,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } pTcon = tlink_tcon(tlink); - rc = check_name(direntry); + rc = check_name(direntry, pTcon); if (rc) goto lookup_out; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 18a1e1d6671f..cf192f9ce254 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -224,6 +224,13 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; + /* O_SYNC also has bit for O_DSYNC so following check picks up either */ + if (f_flags & O_SYNC) + create_options |= CREATE_WRITE_THROUGH; + + if (f_flags & O_DIRECT) + create_options |= CREATE_NO_BUFFER; + oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = desired_access; @@ -2597,7 +2604,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->credits = credits; if (!wdata->cfile->invalidHandle || - !cifs_reopen_file(wdata->cfile, false)) + !(rc = cifs_reopen_file(wdata->cfile, false))) rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); if (rc) { @@ -2884,7 +2891,15 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; size_t copy = min_t(size_t, remaining, PAGE_SIZE); - size_t written = copy_page_to_iter(page, 0, copy, iter); + size_t written; + + if (unlikely(iter->type & ITER_PIPE)) { + void *addr = kmap_atomic(page); + + written = copy_to_iter(addr, copy, iter); + kunmap_atomic(addr); + } else + written = copy_page_to_iter(page, 0, copy, iter); remaining -= written; if (written < copy && iov_iter_count(iter) > 0) break; @@ -2994,7 +3009,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->credits = credits; if (!rdata->cfile->invalidHandle || - !cifs_reopen_file(rdata->cfile, true)) + !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); error: if (rc) { @@ -3569,7 +3584,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } if (!rdata->cfile->invalidHandle || - !cifs_reopen_file(rdata->cfile, true)) + !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); if (rc) { add_credits_and_wake_if(server, rdata->credits, 0); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 001528781b6b..bdba9e7a9438 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -264,10 +264,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_IOC_GET_MNT_INFO: + if (pSMBFile == NULL) + break; tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; case CIFS_ENUMERATE_SNAPSHOTS: + if (pSMBFile == NULL) + break; if (arg == 0) { rc = -EINVAL; goto cifs_ioc_exit; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c6729156f9a0..5419afea0a36 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -492,7 +492,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsiod_wq, + queue_work(cifsoplockd_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index fc537c29044e..efd72e1fae74 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid, __u16 search_flags, struct cifs_search_info *srch_inf) { - return CIFSFindFirst(xid, tcon, path, cifs_sb, - &fid->netfid, search_flags, srch_inf, true); + int rc; + + rc = CIFSFindFirst(xid, tcon, path, cifs_sb, + &fid->netfid, search_flags, srch_inf, true); + if (rc) + cifs_dbg(FYI, "find first failed=%d\n", rc); + return rc; } static int @@ -1015,6 +1020,15 @@ cifs_dir_needs_close(struct cifsFileInfo *cfile) return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; } +static bool +cifs_can_echo(struct TCP_Server_Info *server) +{ + if (server->tcpStatus == CifsGood) + return true; + + return false; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1049,6 +1063,7 @@ struct smb_version_operations smb1_operations = { .get_dfs_refer = CIFSGetDFSRefer, .qfs_tcon = cifs_qfs_tcon, .is_path_accessible = cifs_is_path_accessible, + .can_echo = cifs_can_echo, .query_path_info = cifs_query_path_info, .query_file_info = cifs_query_file_info, .get_srv_inum = cifs_get_srv_inum, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 3d383489b9cf..967dfe656ced 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -494,7 +494,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, else cfile->oplock_break_cancelled = true; - queue_work(cifsiod_wq, &cfile->oplock_break); + queue_work(cifsoplockd_wq, &cfile->oplock_break); kfree(lw); return true; } @@ -638,7 +638,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsiod_wq, &cfile->oplock_break); + queue_work(cifsoplockd_wq, + &cfile->oplock_break); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); @@ -654,3 +655,47 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); return false; } + +void +smb2_cancelled_close_fid(struct work_struct *work) +{ + struct close_cancelled_open *cancelled = container_of(work, + struct close_cancelled_open, work); + + cifs_dbg(VFS, "Close unmatched open\n"); + + SMB2_close(0, cancelled->tcon, cancelled->fid.persistent_fid, + cancelled->fid.volatile_fid); + cifs_put_tcon(cancelled->tcon); + kfree(cancelled); +} + +int +smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buffer; + struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer; + struct cifs_tcon *tcon; + struct close_cancelled_open *cancelled; + + if (hdr->Command != SMB2_CREATE || hdr->Status != STATUS_SUCCESS) + return 0; + + cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL); + if (!cancelled) + return -ENOMEM; + + tcon = smb2_find_smb_tcon(server, hdr->SessionId, hdr->TreeId); + if (!tcon) { + kfree(cancelled); + return -ENOENT; + } + + cancelled->fid.persistent_fid = rsp->PersistentFileId; + cancelled->fid.volatile_fid = rsp->VolatileFileId; + cancelled->tcon = tcon; + INIT_WORK(&cancelled->work, smb2_cancelled_close_fid); + queue_work(cifsiod_wq, &cancelled->work); + + return 0; +} diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 5d456ebb3813..812e4884c392 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -924,6 +924,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, } if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) { rc = -ERANGE; + kfree(retbuf); return rc; } @@ -963,7 +964,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); kfree(utf16_path); if (rc) { - cifs_dbg(VFS, "open dir failed\n"); + cifs_dbg(FYI, "open dir failed rc=%d\n", rc); return rc; } @@ -973,7 +974,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); if (rc) { - cifs_dbg(VFS, "query directory failed\n"); + cifs_dbg(FYI, "query directory failed rc=%d\n", rc); SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } return rc; @@ -1017,6 +1018,18 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) return true; } +static bool +smb2_is_session_expired(char *buf) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + + if (hdr->Status != STATUS_NETWORK_SESSION_EXPIRED) + return false; + + cifs_dbg(FYI, "Session expired\n"); + return true; +} + static int smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, struct cifsInodeInfo *cinode) @@ -1565,6 +1578,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .handle_cancelled_mid = smb2_handle_cancelled_mid, .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -1607,6 +1621,7 @@ struct smb_version_operations smb20_operations = { .close_dir = smb2_close_dir, .calc_smb_size = smb2_calc_size, .is_status_pending = smb2_is_status_pending, + .is_session_expired = smb2_is_session_expired, .oplock_response = smb2_oplock_response, .queryfs = smb2_queryfs, .mand_lock = smb2_mand_lock, @@ -1645,6 +1660,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .handle_cancelled_mid = smb2_handle_cancelled_mid, .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -1687,6 +1703,7 @@ struct smb_version_operations smb21_operations = { .close_dir = smb2_close_dir, .calc_smb_size = smb2_calc_size, .is_status_pending = smb2_is_status_pending, + .is_session_expired = smb2_is_session_expired, .oplock_response = smb2_oplock_response, .queryfs = smb2_queryfs, .mand_lock = smb2_mand_lock, @@ -1727,6 +1744,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .handle_cancelled_mid = smb2_handle_cancelled_mid, .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -1769,6 +1787,7 @@ struct smb_version_operations smb30_operations = { .close_dir = smb2_close_dir, .calc_smb_size = smb2_calc_size, .is_status_pending = smb2_is_status_pending, + .is_session_expired = smb2_is_session_expired, .oplock_response = smb2_oplock_response, .queryfs = smb2_queryfs, .mand_lock = smb2_mand_lock, @@ -1815,6 +1834,7 @@ struct smb_version_operations smb311_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .handle_cancelled_mid = smb2_handle_cancelled_mid, .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -1857,6 +1877,7 @@ struct smb_version_operations smb311_operations = { .close_dir = smb2_close_dir, .calc_smb_size = smb2_calc_size, .is_status_pending = smb2_is_status_pending, + .is_session_expired = smb2_is_session_expired, .oplock_response = smb2_oplock_response, .queryfs = smb2_queryfs, .mand_lock = smb2_mand_lock, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 87457227812c..69b610ad3fdc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -366,7 +366,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req) build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); req->NegotiateContextCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ } #else @@ -531,15 +531,22 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) /* * validation ioctl must be signed, so no point sending this if we - * can not sign it. We could eventually change this to selectively + * can not sign it (ie are not known user). Even if signing is not + * required (enabled but not negotiated), in those cases we selectively * sign just this, the first and only signed request on a connection. - * This is good enough for now since a user who wants better security - * would also enable signing on the mount. Having validation of - * negotiate info for signed connections helps reduce attack vectors + * Having validation of negotiate info helps reduce attack vectors. */ - if (tcon->ses->server->sign == false) + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) return 0; /* validation requires signing */ + if (tcon->ses->user_name == NULL) { + cifs_dbg(FYI, "Can't validate negotiate: null user mount\n"); + return 0; /* validation requires signing */ + } + + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) + cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); + vneg_inbuf.Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, @@ -569,8 +576,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { - cifs_dbg(VFS, "invalid size of protocol negotiate response\n"); - return -EIO; + cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", + rsplen); + + /* relax check since Mac returns max bufsize allowed on ioctl */ + if (rsplen > CIFSMaxBufSize) + return -EIO; } /* check validate negotiate info response matches what we got earlier */ @@ -1006,6 +1017,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, while (sess_data->func) sess_data->func(sess_data); + if ((ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) && (ses->sign)) + cifs_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: kfree(sess_data); @@ -1084,9 +1097,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, else return -EIO; - if (tcon && tcon->bad_network_name) - return -ENOENT; - if ((tcon && tcon->seal) && ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) { cifs_dbg(VFS, "encryption requested but no server support"); @@ -1104,6 +1114,10 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, return -EINVAL; } + /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ + if (tcon) + tcon->tid = 0; + rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req); if (rc) { kfree(unc_path); @@ -1184,8 +1198,6 @@ tcon_exit: tcon_error_exit: if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - if (tcon) - tcon->bad_network_name = true; } goto tcon_exit; } @@ -1671,8 +1683,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, * than one credit. Windows typically sets this smaller, but for some * ioctls it may be useful to allow server to send more. No point * limiting what the server can send as long as fits in one credit + * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE + * (by default, note that it can be overridden to make max larger) + * in responses (except for read responses which can be bigger. + * We may want to bump this limit up */ - req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */ + req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize); if (is_fsctl) req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); @@ -1983,6 +1999,9 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list; int tcon_exist = false; + int rc; + int resched = false; + /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ mutex_lock(&server->reconnect_mutex); @@ -2010,13 +2029,18 @@ void smb2_reconnect_server(struct work_struct *work) spin_unlock(&cifs_tcp_ses_lock); list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { - if (!smb2_reconnect(SMB2_INTERNAL_CMD, tcon)) + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon); + if (!rc) cifs_reopen_persistent_handles(tcon); + else + resched = true; list_del_init(&tcon->rlist); cifs_put_tcon(tcon); } cifs_dbg(FYI, "Reconnecting tcons finished\n"); + if (resched) + queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ); mutex_unlock(&server->reconnect_mutex); /* now we can safely release srv struct */ @@ -2915,8 +2939,8 @@ copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf, kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) * le32_to_cpu(pfs_inf->SectorsPerAllocationUnit); kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits); - kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits); - kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); + kst->f_bfree = kst->f_bavail = + le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); return; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index dc0d141f33e2..1e1449ad00e8 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* BB FIXME - analyze following length BB */ -#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ +/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ +#define MAX_SMB2_HDR_SIZE 0x00b0 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index f2d511a6971b..04ef6e914597 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -48,6 +48,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst); extern struct mid_q_entry *smb2_setup_async_request( struct TCP_Server_Info *server, struct smb_rqst *rqst); +extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, + __u64 ses_id); +extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server, + __u64 ses_id, __u32 tid); extern int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server); extern int smb3_calc_signature(struct smb_rqst *rqst, @@ -158,6 +162,9 @@ extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, const __u8 oplock_level); +extern int smb2_handle_cancelled_mid(char *buffer, + struct TCP_Server_Info *server); +void smb2_cancelled_close_fid(struct work_struct *work); extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct kstatfs *FSData); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index bc9a7b634643..390b0d0198f8 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -115,22 +115,68 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) } static struct cifs_ses * -smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server) +smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) { struct cifs_ses *ses; - spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->Suid != smb2hdr->SessionId) + if (ses->Suid != ses_id) continue; - spin_unlock(&cifs_tcp_ses_lock); return ses; } + + return NULL; +} + +struct cifs_ses * +smb2_find_smb_ses(struct TCP_Server_Info *server, __u64 ses_id) +{ + struct cifs_ses *ses; + + spin_lock(&cifs_tcp_ses_lock); + ses = smb2_find_smb_ses_unlocked(server, ses_id); spin_unlock(&cifs_tcp_ses_lock); + return ses; +} + +static struct cifs_tcon * +smb2_find_smb_sess_tcon_unlocked(struct cifs_ses *ses, __u32 tid) +{ + struct cifs_tcon *tcon; + + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->tid != tid) + continue; + ++tcon->tc_count; + return tcon; + } + return NULL; } +/* + * Obtain tcon corresponding to the tid in the given + * cifs_ses + */ + +struct cifs_tcon * +smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) +{ + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + spin_lock(&cifs_tcp_ses_lock); + ses = smb2_find_smb_ses_unlocked(server, ses_id); + if (!ses) { + spin_unlock(&cifs_tcp_ses_lock); + return NULL; + } + tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); + spin_unlock(&cifs_tcp_ses_lock); + + return tcon; +} int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) @@ -142,7 +188,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; - ses = smb2_find_smb_ses(smb2_pdu, server); + ses = smb2_find_smb_ses(server, smb2_pdu->SessionId); if (!ses) { cifs_dbg(VFS, "%s: Could not find session\n", __func__); return 0; @@ -359,7 +405,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; - ses = smb2_find_smb_ses(smb2_pdu, server); + ses = smb2_find_smb_ses(server, smb2_pdu->SessionId); if (!ses) { cifs_dbg(VFS, "%s: Could not find session\n", __func__); return 0; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 206a597b2293..cc26d4138d70 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -727,9 +727,11 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, rc = wait_for_response(ses->server, midQ); if (rc != 0) { + cifs_dbg(FYI, "Cancelling wait for mid %llu\n", midQ->mid); send_cancel(ses->server, buf, midQ); spin_lock(&GlobalMid_Lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + midQ->mid_flags |= MID_WAIT_CANCELLED; midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); cifs_small_buf_release(buf); diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index f6c6c8adbc01..7289f0a7670b 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -446,8 +446,7 @@ int venus_fsync(struct super_block *sb, struct CodaFid *fid) UPARG(CODA_FSYNC); inp->coda_fsync.VFid = *fid; - error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs), - &outsize, inp); + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); CODA_FREE(inp, insize); return error; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index db6d69289608..314b4edac72b 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -83,14 +83,13 @@ static int create_link(struct config_item *parent_item, ret = -ENOMEM; sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); if (sl) { - sl->sl_target = config_item_get(item); spin_lock(&configfs_dirent_lock); if (target_sd->s_type & CONFIGFS_USET_DROPPING) { spin_unlock(&configfs_dirent_lock); - config_item_put(item); kfree(sl); return -ENOENT; } + sl->sl_target = config_item_get(item); list_add(&sl->sl_list, &target_sd->s_links); spin_unlock(&configfs_dirent_lock); ret = configfs_create_link(sl, parent_item->ci_dentry, diff --git a/fs/coredump.c b/fs/coredump.c index eb9c92c9b20f..4407e27beca9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align) return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); + +/* + * Ensures that file size is big enough to contain the current file + * postion. This prevents gdb from complaining about a truncated file + * if the last "write" to the file was dump_skip. + */ +void dump_truncate(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + loff_t offset; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + offset = file->f_op->llseek(file, 0, SEEK_CUR); + if (i_size_read(file->f_mapping->host) < offset) + do_truncate(file->f_path.dentry, offset, 0, file); + } +} +EXPORT_SYMBOL(dump_truncate); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 98f87fe8f186..73de1446c8d4 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -352,7 +352,6 @@ EXPORT_SYMBOL(fscrypt_zeroout_range); static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; - struct fscrypt_info *ci; int dir_has_key, cached_with_key; if (flags & LOOKUP_RCU) @@ -364,18 +363,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - ci = d_inode(dir)->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); - dir_has_key = (ci != NULL); + dir_has_key = (d_inode(dir)->i_crypt_info != NULL); dput(dir); /* @@ -492,9 +484,6 @@ int fscrypt_initialize(void) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) - return 0; - mutex_lock(&fscrypt_init_mutex); if (fscrypt_bounce_page_pool) goto already_initialized; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..e14bb7b67e9c 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -300,7 +300,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } else { memset(buf, 0, 8); } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); oname->name[0] = '_'; oname->len = 1 + digest_encode(buf, 24, oname->name + 1); return 0; @@ -332,7 +332,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_encryption_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; @@ -367,7 +367,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..a755fa1a0017 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -99,6 +99,7 @@ static int validate_user_key(struct fscrypt_info *crypt_info, kfree(full_key_descriptor); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); + down_read(&keyring_key->sem); if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING @@ -106,11 +107,14 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); + if (!ukp) { + /* key was revoked before we acquired its semaphore */ + res = -EKEYREVOKED; + goto out; + } if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; - up_read(&keyring_key->sem); goto out; } master_key = (struct fscrypt_key *)ukp->data; @@ -121,17 +125,11 @@ static int validate_user_key(struct fscrypt_info *crypt_info, "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; - up_read(&keyring_key->sem); goto out; } res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; - - crypt_info->ci_keyring_key = keyring_key; - return 0; out: + up_read(&keyring_key->sem); key_put(keyring_key); return res; } @@ -173,12 +171,11 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_skcipher(ci->ci_ctfm); kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,21 +185,15 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; + if (inode->i_crypt_info) + return 0; + res = fscrypt_initialize(); if (res) return res; if (!inode->i_sb->s_cop->get_context) return -EOPNOTSUPP; -retry: - crypt_info = ACCESS_ONCE(inode->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - fscrypt_put_encryption_info(inode, crypt_info); - goto retry; - } res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { @@ -230,7 +221,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -285,14 +275,8 @@ got_key: if (res) goto out; - kzfree(raw_key); - raw_key = NULL; - if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { - put_crypt_info(crypt_info); - goto retry; - } - return 0; - + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY) res = 0; @@ -300,6 +284,7 @@ out: kzfree(raw_key); return res; } +EXPORT_SYMBOL(fscrypt_get_encryption_info); void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) { @@ -317,17 +302,3 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) put_crypt_info(ci); } EXPORT_SYMBOL(fscrypt_put_encryption_info); - -int fscrypt_get_encryption_info(struct inode *inode) -{ - struct fscrypt_info *ci = inode->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); - return 0; -} -EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index abc18847b98d..c160d2d0e18d 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -113,7 +113,7 @@ int fscrypt_process_policy(struct file *filp, if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir) ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) @@ -161,27 +161,61 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) } EXPORT_SYMBOL(fscrypt_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. + */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && !S_ISLNK(child->i_mode)) return 1; - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -190,17 +224,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); diff --git a/fs/dcache.c b/fs/dcache.c index 4485a48f4091..67957f5b325c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -277,6 +277,33 @@ static inline int dname_external(const struct dentry *dentry) return dentry->d_name.name != dentry->d_iname; } +void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (unlikely(dname_external(dentry))) { + struct external_name *p = external_name(dentry); + atomic_inc(&p->u.count); + spin_unlock(&dentry->d_lock); + name->name = p->name; + } else { + memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + spin_unlock(&dentry->d_lock); + name->name = name->inline_name; + } +} +EXPORT_SYMBOL(take_dentry_name_snapshot); + +void release_dentry_name_snapshot(struct name_snapshot *name) +{ + if (unlikely(name->name != name->inline_name)) { + struct external_name *p; + p = container_of(name->name, struct external_name, name[0]); + if (unlikely(atomic_dec_and_test(&p->u.count))) + kfree_rcu(p, u.head); + } +} +EXPORT_SYMBOL(release_dentry_name_snapshot); + static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) @@ -1133,11 +1160,12 @@ void shrink_dcache_sb(struct super_block *sb) LIST_HEAD(dispose); freed = list_lru_walk(&sb->s_dentry_lru, - dentry_lru_isolate_shrink, &dispose, UINT_MAX); + dentry_lru_isolate_shrink, &dispose, 1024); this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); - } while (freed > 0); + cond_resched(); + } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f17fcf89e18e..3d7de9f4f545 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = { static struct vfsmount *debugfs_automount(struct path *path) { - struct vfsmount *(*f)(void *); - f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(d_inode(path->dentry)->i_private); + debugfs_automount_t f; + f = (debugfs_automount_t)path->dentry->d_fsdata; + return f(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -504,7 +504,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); @@ -730,7 +730,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, { int error; struct dentry *dentry = NULL, *trap; - const char *old_name; + struct name_snapshot old_name; trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ @@ -745,19 +745,19 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + take_dentry_name_snapshot(&old_name, old_dentry); error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), dentry, 0); if (error) { - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); goto exit; } d_move(old_dentry, dentry); - fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name, + fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name.name, d_is_dir(old_dentry), NULL, old_dentry); - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); unlock_rename(new_dir, old_dir); dput(dentry); return old_dentry; diff --git a/fs/direct-io.c b/fs/direct-io.c index fb9aa16a7727..c6220a2daefd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -587,7 +587,7 @@ static int dio_set_defer_completion(struct dio *dio) /* * Call into the fs to map some more disk blocks. We record the current number * of available blocks at sdio->blocks_available. These are in units of the - * fs blocksize, (1 << inode->i_blkbits). + * fs blocksize, i_blocksize(inode). * * The fs is allowed to map lots of blocks at once. If it wants to do that, * it uses the passed inode-relative block number as the file offset, as usual. @@ -835,7 +835,8 @@ out: */ if (sdio->boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); - dio_bio_submit(dio, sdio); + if (sdio->bio) + dio_bio_submit(dio, sdio); put_page(sdio->cur_page); sdio->cur_page = NULL; } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 58c2f4a21b7f..9ac65914ab5b 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -355,6 +355,10 @@ static int dlm_device_register(struct dlm_ls *ls, char *name) error = misc_register(&ls->ls_device); if (error) { kfree(ls->ls_device.name); + /* this has to be set to NULL + * to avoid a double-free in dlm_device_deregister + */ + ls->ls_device.name = NULL; } fail: return error; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 599a29237cfe..a896e46671ea 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -84,11 +84,16 @@ struct ecryptfs_page_crypt_context { static inline struct ecryptfs_auth_tok * ecryptfs_get_encrypted_key_payload_data(struct key *key) { - if (key->type == &key_type_encrypted) - return (struct ecryptfs_auth_tok *) - (&((struct encrypted_key_payload *)key->payload.data[0])->payload_data); - else + struct encrypted_key_payload *payload; + + if (key->type != &key_type_encrypted) return NULL; + + payload = key->payload.data[0]; + if (!payload) + return ERR_PTR(-EKEYREVOKED); + + return (struct ecryptfs_auth_tok *)payload->payload_data; } static inline struct key *ecryptfs_get_encrypted_key(char *sig) @@ -114,12 +119,17 @@ static inline struct ecryptfs_auth_tok * ecryptfs_get_key_payload_data(struct key *key) { struct ecryptfs_auth_tok *auth_tok; + const struct user_key_payload *ukp; auth_tok = ecryptfs_get_encrypted_key_payload_data(key); - if (!auth_tok) - return (struct ecryptfs_auth_tok *)user_key_payload(key)->data; - else + if (auth_tok) return auth_tok; + + ukp = user_key_payload(key); + if (!ukp) + return ERR_PTR(-EKEYREVOKED); + + return (struct ecryptfs_auth_tok *)ukp->data; } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 3cf1546dca82..fa218cd64f74 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -459,7 +459,8 @@ out: * @auth_tok_key: key containing the authentication token * @auth_tok: authentication token * - * Returns zero on valid auth tok; -EINVAL otherwise + * Returns zero on valid auth tok; -EINVAL if the payload is invalid; or + * -EKEYREVOKED if the key was revoked before we acquired its semaphore. */ static int ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key, @@ -468,6 +469,12 @@ ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key, int rc = 0; (*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key); + if (IS_ERR(*auth_tok)) { + rc = PTR_ERR(*auth_tok); + *auth_tok = NULL; + goto out; + } + if (ecryptfs_verify_version((*auth_tok)->version)) { printk(KERN_ERR "Data structure version mismatch. Userspace " "tools must match eCryptfs kernel module with major " diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 286f10b0363b..4f457d5c4933 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -442,15 +442,16 @@ void ecryptfs_release_messaging(void) } if (ecryptfs_daemon_hash) { struct ecryptfs_daemon *daemon; + struct hlist_node *n; int i; mutex_lock(&ecryptfs_daemon_hash_mux); for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; - hlist_for_each_entry(daemon, - &ecryptfs_daemon_hash[i], - euid_chain) { + hlist_for_each_entry_safe(daemon, n, + &ecryptfs_daemon_hash[i], + euid_chain) { rc = ecryptfs_exorcise_daemon(daemon); if (rc) printk(KERN_ERR "%s: Error whilst " diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 10db91218933..3cbc30413add 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -523,8 +523,13 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) wait_queue_head_t *whead; rcu_read_lock(); - /* If it is cleared by POLLFREE, it should be rcu-safe */ - whead = rcu_dereference(pwq->whead); + /* + * If it is cleared by POLLFREE, it should be rcu-safe. + * If we read NULL we need a barrier paired with + * smp_store_release() in ep_poll_callback(), otherwise + * we rely on whead->lock. + */ + whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); @@ -1009,17 +1014,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct eventpoll *ep = epi->ep; int ewake = 0; - if ((unsigned long)key & POLLFREE) { - ep_pwq_from_wait(wait)->whead = NULL; - /* - * whead = NULL above can race with ep_remove_wait_queue() - * which can do another remove_wait_queue() after us, so we - * can't use __remove_wait_queue(). whead->lock is held by - * the caller. - */ - list_del_init(&wait->task_list); - } - spin_lock_irqsave(&ep->lock, flags); /* @@ -1101,10 +1095,26 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); - if (epi->event.events & EPOLLEXCLUSIVE) - return ewake; + if (!(epi->event.events & EPOLLEXCLUSIVE)) + ewake = 1; + + if ((unsigned long)key & POLLFREE) { + /* + * If we race with ep_remove_wait_queue() it can miss + * ->whead = NULL and do another remove_wait_queue() after + * us, so we can't use __remove_wait_queue(). + */ + list_del_init(&wait->task_list); + /* + * ->whead != NULL protects us from the race with ep_free() + * or ep_remove(), ep_remove_wait_queue() takes whead->lock + * held by the caller. Once we nullify it, nothing protects + * ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } - return 1; + return ewake; } /* diff --git a/fs/exec.c b/fs/exec.c index 67e86571685a..b8c43be24751 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -215,7 +215,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (write) { unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; - struct rlimit *rlim; + unsigned long ptr_size, limit; + + /* + * Since the stack will hold pointers to the strings, we + * must account for them as well. + * + * The size calculation is the entire vma while each arg page is + * built, so each time we get here it's calculating how far it + * is currently (rather than each call being just the newly + * added size from the arg page). As a result, we need to + * always add the entire size of the pointers, so that on the + * last call to get_arg_page() we'll actually have the entire + * correct size. + */ + ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + if (ptr_size > ULONG_MAX - size) + goto fail; + size += ptr_size; acct_arg_size(bprm, size / PAGE_SIZE); @@ -227,20 +244,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return page; /* - * Limit to 1/4-th the stack size for the argv+env strings. + * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM + * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ - rlim = current->signal->rlim; - if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { - put_page(page); - return NULL; - } + limit = _STK_LIM / 4 * 3; + limit = min(limit, rlimit(RLIMIT_STACK) / 4); + if (size > limit) + goto fail; } return page; + +fail: + put_page(page); + return NULL; } static void put_arg_page(struct page *page) diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 79dafa71effd..069c0dceda01 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -175,11 +175,8 @@ ext2_get_acl(struct inode *inode, int type) return acl; } -/* - * inode->i_mutex: down - */ -int -ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int +__ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int name_index; void *value = NULL; @@ -189,13 +186,6 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch(type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); - } break; case ACL_TYPE_DEFAULT: @@ -222,6 +212,24 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) } /* + * inode->i_mutex: down + */ +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS && acl) { + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) + return error; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } + return __ext2_set_acl(inode, acl, type); +} + +/* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * * dir->i_mutex: down @@ -238,12 +246,12 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return error; if (default_acl) { - error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!error) - error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } return error; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dfa519979038..dfd01ca1a60a 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -192,13 +192,6 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - } break; case ACL_TYPE_DEFAULT: @@ -231,6 +224,8 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; int error, retries = 0; + umode_t mode = inode->i_mode; + int update_mode = 0; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, @@ -238,7 +233,20 @@ retry: if (IS_ERR(handle)) return PTR_ERR(handle); + if ((type == ACL_TYPE_ACCESS) && acl) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + goto out_stop; + update_mode = 1; + } + error = __ext4_set_acl(handle, inode, type, acl); + if (!error && update_mode) { + inode->i_mode = mode; + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + } +out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c930a0110fb4..a77cbc5b657b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3413,13 +3413,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; - struct ext4_extent zero_ex; + struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; - int split_flag = 0; + int split_flag = EXT4_EXT_DATA_VALID2; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -3436,7 +3436,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - zero_ex.ee_len = 0; + zero_ex1.ee_len = 0; + zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3576,62 +3577,52 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (ext4_encrypted_inode(inode)) max_zeroout = 0; - /* If extent is less than s_max_zeroout_kb, zeroout directly */ - if (max_zeroout && (ee_len <= max_zeroout)) { - err = ext4_ext_zeroout(inode, ex); - if (err) - goto out; - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); - ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - goto out; - } - /* - * four cases: + * five cases: * 1. split the extent into three extents. - * 2. split the extent into two extents, zeroout the first half. - * 3. split the extent into two extents, zeroout the second half. + * 2. split the extent into two extents, zeroout the head of the first + * extent. + * 3. split the extent into two extents, zeroout the tail of the second + * extent. * 4. split the extent into two extents with out zeroout. + * 5. no splitting needed, just possibly zeroout the head and / or the + * tail of the extent. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (max_zeroout && (allocated > map->m_len)) { + if (max_zeroout && (allocated > split_map.m_len)) { if (allocated <= max_zeroout) { - /* case 3 */ - zero_ex.ee_block = - cpu_to_le32(map->m_lblk); - zero_ex.ee_len = cpu_to_le16(allocated); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex) + map->m_lblk - ee_block); - err = ext4_ext_zeroout(inode, &zero_ex); + /* case 3 or 5 */ + zero_ex1.ee_block = + cpu_to_le32(split_map.m_lblk + + split_map.m_len); + zero_ex1.ee_len = + cpu_to_le16(allocated - split_map.m_len); + ext4_ext_store_pblock(&zero_ex1, + ext4_ext_pblock(ex) + split_map.m_lblk + + split_map.m_len - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto out; - split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { - /* case 2 */ - if (map->m_lblk != ee_block) { - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(map->m_lblk - + } + if (split_map.m_lblk - ee_block + split_map.m_len < + max_zeroout) { + /* case 2 or 5 */ + if (split_map.m_lblk != ee_block) { + zero_ex2.ee_block = ex->ee_block; + zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); - ext4_ext_store_pblock(&zero_ex, + ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); - err = ext4_ext_zeroout(inode, &zero_ex); + err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto out; } + split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; - split_map.m_len = map->m_lblk - ee_block + map->m_len; allocated = map->m_len; } } @@ -3642,8 +3633,11 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = 0; out: /* If we have gotten a failure, don't zero out status tree */ - if (!err) - err = ext4_zeroout_es(inode, &zero_ex); + if (!err) { + err = ext4_zeroout_es(inode, &zero_ex1); + if (!err) + err = ext4_zeroout_es(inode, &zero_ex2); + } return err ? err : allocated; } @@ -4809,7 +4803,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { + (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) @@ -4893,6 +4888,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); @@ -4978,7 +4975,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { + (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) @@ -5344,7 +5342,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ - path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5353,15 +5352,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) goto out; - stop = le32_to_cpu(extent->ee_block) + - ext4_ext_get_actual_len(extent); + stop = le32_to_cpu(extent->ee_block); /* * In case of left shift, Don't start shifting extents until we make * sure the hole is big enough to accommodate the shift. */ if (SHIFT == SHIFT_LEFT) { - path = ext4_find_extent(inode, start - 1, &path, 0); + path = ext4_find_extent(inode, start - 1, &path, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5393,9 +5392,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, else iterator = &stop; - /* Its safe to start updating extents */ - while (start < stop) { - path = ext4_find_extent(inode, *iterator, &path, 0); + /* + * Its safe to start updating extents. Start and stop are unsigned, so + * in case of right shift if extent with 0 block is reached, iterator + * becomes NULL to indicate the end of the loop. + */ + while (iterator && start <= stop) { + path = ext4_find_extent(inode, *iterator, &path, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5422,8 +5426,11 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); - *iterator = le32_to_cpu(extent->ee_block) > 0 ? - le32_to_cpu(extent->ee_block) - 1 : 0; + if (le32_to_cpu(extent->ee_block) > 0) + *iterator = le32_to_cpu(extent->ee_block) - 1; + else + /* Beginning is reached, end of the loop */ + iterator = NULL; /* Update path extent in case we need to stop */ while (le32_to_cpu(extent->ee_block) < start) extent++; @@ -5570,6 +5577,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); @@ -5743,6 +5751,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2a822d30e73f..510e66422f04 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -432,47 +432,27 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); - if (nr_pages == 0) { - if (whence == SEEK_DATA) - break; - - BUG_ON(whence != SEEK_HOLE); - /* - * If this is the first time to go into the loop and - * offset is not beyond the end offset, it will be a - * hole at this offset - */ - if (lastoff == startoff || lastoff < endoff) - found = 1; - break; - } - - /* - * If this is the first time to go into the loop and - * offset is smaller than the first page offset, it will be a - * hole at this offset. - */ - if (lastoff == startoff && whence == SEEK_HOLE && - lastoff < page_offset(pvec.pages[0])) { - found = 1; + if (nr_pages == 0) break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; struct buffer_head *bh, *head; /* - * If the current offset is not beyond the end of given - * range, it will be a hole. + * If current offset is smaller than the page offset, + * there is a hole at this offset. */ - if (lastoff < endoff && whence == SEEK_HOLE && - page->index > end) { + if (whence == SEEK_HOLE && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { found = 1; *offset = lastoff; goto out; } + if (page->index > end) + goto out; + lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -489,6 +469,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, lastoff = page_offset(page); bh = head = page_buffers(page); do { + if (lastoff + bh->b_size <= startoff) + goto next; if (buffer_uptodate(bh) || buffer_unwritten(bh)) { if (whence == SEEK_DATA) @@ -503,6 +485,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); goto out; } +next: lastoff += bh->b_size; bh = bh->b_this_page; } while (bh != head); @@ -512,20 +495,18 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); } - /* - * The no. of pages is less than our desired, that would be a - * hole in there. - */ - if (nr_pages < num && whence == SEEK_HOLE) { - found = 1; - *offset = lastoff; + /* The no. of pages is less than our desired, we are done. */ + if (nr_pages < num) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + if (whence == SEEK_HOLE && lastoff < endoff) { + found = 1; + *offset = lastoff; + } out: pagevec_release(&pvec); return found; @@ -546,7 +527,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } @@ -609,7 +590,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 170421edfdfe..2d94e8524839 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -771,7 +771,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (err) return ERR_PTR(err); if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-EPERM); + return ERR_PTR(-ENOKEY); if (!handle) nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); encrypt = 1; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d8ca4b9f9dd6..73cbc01ef5ad 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -376,7 +376,7 @@ out: static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { - int ret, size; + int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) @@ -386,15 +386,14 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, if (size < len) return -ENOSPC; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); - up_write(&EXT4_I(inode)->xattr_sem); - + ext4_write_unlock_xattr(inode, &no_expand); return ret; } @@ -523,7 +522,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, unsigned flags) { - int ret, needed_blocks; + int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct page *page = NULL; @@ -563,7 +562,7 @@ retry: goto out; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { @@ -600,7 +599,7 @@ retry: put_page(page); page = NULL; ext4_orphan_add(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; @@ -626,7 +625,7 @@ out: put_page(page); } if (sem_held) - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); @@ -719,7 +718,7 @@ convert: int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret; + int ret, no_expand; void *kaddr; struct ext4_iloc iloc; @@ -737,7 +736,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, goto out; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); kaddr = kmap_atomic(page); @@ -747,7 +746,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, /* clear page dirty so that writepages wouldn't work for us. */ ClearPageDirty(page); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); out: return copied; @@ -758,7 +757,7 @@ ext4_journalled_write_inline_data(struct inode *inode, unsigned len, struct page *page) { - int ret; + int ret, no_expand; void *kaddr; struct ext4_iloc iloc; @@ -768,11 +767,11 @@ ext4_journalled_write_inline_data(struct inode *inode, return NULL; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, 0, len); kunmap_atomic(kaddr); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return iloc.bh; } @@ -934,8 +933,15 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page) { int i_size_changed = 0; + int ret; - copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + ret = ext4_write_inline_data_end(inode, pos, len, copied, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ret; + } + copied = ret; /* * No need to use i_size_read() here, the i_size @@ -1151,10 +1157,9 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, set_buffer_uptodate(dir_block); err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); if (err) - goto out; + return err; set_buffer_verified(dir_block); -out: - return err; + return ext4_mark_inode_dirty(handle, inode); } static int ext4_convert_inline_data_nolock(handle_t *handle, @@ -1249,7 +1254,7 @@ out: int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - int ret, inline_size; + int ret, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; @@ -1257,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (ret) return ret; - down_write(&EXT4_I(dir)->xattr_sem); + ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; @@ -1303,7 +1308,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, out: ext4_mark_inode_dirty(handle, dir); - up_write(&EXT4_I(dir)->xattr_sem); + ext4_write_unlock_xattr(dir, &no_expand); brelse(iloc.bh); return ret; } @@ -1663,7 +1668,7 @@ int ext4_delete_inline_entry(handle_t *handle, struct buffer_head *bh, int *has_inline_data) { - int err, inline_size; + int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; @@ -1671,7 +1676,7 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) return err; - down_write(&EXT4_I(dir)->xattr_sem); + ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; @@ -1705,7 +1710,7 @@ int ext4_delete_inline_entry(handle_t *handle, ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: - up_write(&EXT4_I(dir)->xattr_sem); + ext4_write_unlock_xattr(dir, &no_expand); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); @@ -1804,11 +1809,11 @@ out: int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { - int ret; + int ret, no_expand; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return ret; } @@ -1893,7 +1898,7 @@ out: void ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; - int inline_size, value_len, needed_blocks; + int inline_size, value_len, needed_blocks, no_expand; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { @@ -1910,7 +1915,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) if (IS_ERR(handle)) return; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { *has_inline = 0; ext4_journal_stop(handle); @@ -1968,7 +1973,7 @@ out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); @@ -1984,7 +1989,7 @@ out: int ext4_convert_inline_data(struct inode *inode) { - int error, needed_blocks; + int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; @@ -2006,15 +2011,10 @@ int ext4_convert_inline_data(struct inode *inode) goto out_free; } - down_write(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_write(&EXT4_I(inode)->xattr_sem); - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); - up_write(&EXT4_I(inode)->xattr_sem); -out: + ext4_write_lock_xattr(inode, &no_expand); + if (ext4_has_inline_data(inode)) + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 33a509c876ee..ec28e8ebb984 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -71,10 +71,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, - EXT4_INODE_SIZE(inode->i_sb) - - offset); } + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; @@ -1324,8 +1323,11 @@ static int ext4_write_end(struct file *file, if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto errout; + } copied = ret; } else copied = block_write_end(file, mapping, pos, @@ -1379,7 +1381,9 @@ errout: * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_handle_dirty_metadata() instead. */ -static void zero_new_buffers(struct page *page, unsigned from, unsigned to) +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct page *page, + unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; @@ -1396,7 +1400,7 @@ static void zero_new_buffers(struct page *page, unsigned from, unsigned to) size = min(to, block_end) - start; zero_user(page, start, size); - set_buffer_uptodate(bh); + write_end_fn(handle, bh); } clear_buffer_new(bh); } @@ -1425,18 +1429,25 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else { - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - zero_new_buffers(page, from+copied, to); + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + goto errout; } - + copied = ret; + } else if (unlikely(copied < len) && !PageUptodate(page)) { + copied = 0; + ext4_journalled_zero_new_buffers(handle, page, from, to); + } else { + if (unlikely(copied < len)) + ext4_journalled_zero_new_buffers(handle, page, + from + copied, to); ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); + from + copied, &partial, + write_end_fn); if (!partial) SetPageUptodate(page); } @@ -1462,6 +1473,7 @@ static int ext4_journalled_write_end(struct file *file, */ ext4_orphan_add(handle, inode); +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -2095,15 +2107,29 @@ static int ext4_writepage(struct page *page, static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; - loff_t size = i_size_read(mpd->inode); + loff_t size; int err; BUG_ON(page->index != mpd->first_page); + clear_page_dirty_for_io(page); + /* + * We have to be very careful here! Nothing protects writeback path + * against i_size changes and the page can be writeably mapped into + * page tables. So an application can be growing i_size and writing + * data through mmap while writeback runs. clear_page_dirty_for_io() + * write-protects our page in page tables and the page cannot get + * written to again until we release page lock. So only after + * clear_page_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_page() to zero-out tail of the written page. We rely + * on the barrier provided by TestClearPageDirty in + * clear_page_dirty_for_io() to make sure i_size is really sampled only + * after page tables are updated. + */ + size = i_size_read(mpd->inode); if (page->index == size >> PAGE_SHIFT) len = size & ~PAGE_MASK; else len = PAGE_SIZE; - clear_page_dirty_for_io(page); err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; @@ -2193,7 +2219,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, { struct inode *inode = mpd->inode; int err; - ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; do { @@ -3442,14 +3468,14 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) * writes need zeroing either because they can race with page * faults or because they use partial blocks. */ - if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && + if (round_down(offset, i_blocksize(inode)) >= inode->i_size && ext4_aligned_io(inode, offset, count)) get_block_func = ext4_dio_get_block; else get_block_func = ext4_dax_get_block; dio_flags = DIO_LOCKING; } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { + round_down(offset, i_blocksize(inode)) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; } else if (is_sync_kiocb(iocb)) { @@ -3811,6 +3837,10 @@ static int ext4_block_truncate_page(handle_t *handle, unsigned blocksize; struct inode *inode = mapping->host; + /* If we are processing an encrypted inode during orphan list handling */ + if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode)) + return 0; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); @@ -4028,6 +4058,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: @@ -5030,7 +5062,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * do. We do the check mainly to optimize the common PAGE_SIZE == * blocksize case */ - if (offset > PAGE_SIZE - (1 << inode->i_blkbits)) + if (offset > PAGE_SIZE - i_blocksize(inode)) return; while (1) { page = find_lock_page(inode->i_mapping, @@ -5425,8 +5457,9 @@ static int ext4_expand_extra_isize(struct inode *inode, /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { - memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, - new_extra_isize); + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize, 0, + new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } @@ -5670,6 +5703,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) file_update_time(vma->vm_file); down_read(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_convert_inline_data(inode); + if (ret) + goto out_ret; + /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7ae43c59bc79..64056c6eb857 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -838,7 +838,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) inode = page->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; groups_per_page = blocks_per_page >> 1; @@ -2136,8 +2136,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req + * We also support searching for power-of-two requests only for + * requests upto maximum buddy size we have constructed. */ - if (i >= sbi->s_mb_order2_reqs) { + if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { /* * This should tell if fe_len is exactly power of 2 */ @@ -2207,7 +2209,7 @@ repeat: } ac->ac_groups_scanned++; - if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) + if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % sbi->s_stripe)) @@ -3123,6 +3125,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; + /* + * Trim allocation request for filesystems with artificially small + * groups. + */ + if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); + end = start + size; /* check we don't cross already preallocated blocks */ diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 6fc14def0c70..578f8c33fb44 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -187,7 +187,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (PageUptodate(page)) return 0; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 104f8bfba718..4438b93f6fd6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1255,9 +1255,9 @@ static inline int ext4_match(struct ext4_filename *fname, if (unlikely(!name)) { if (fname->usr_fname->name[0] == '_') { int ret; - if (de->name_len < 16) + if (de->name_len <= 32) return 0; - ret = memcmp(de->name + de->name_len - 16, + ret = memcmp(de->name + ((de->name_len - 17) & ~15), fname->crypto_buf.name + 8, 16); return (ret == 0) ? 1 : 0; } @@ -1378,6 +1378,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, return NULL; retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (retval == -ENOENT) + return NULL; if (retval) return ERR_PTR(retval); @@ -1616,13 +1618,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi !fscrypt_has_permitted_context(dir, inode)) { int nokey = ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); - iput(inode); - if (nokey) + if (nokey) { + iput(inode); return ERR_PTR(-ENOKEY); + } ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", (unsigned long) dir->i_ino, (unsigned long) inode->i_ino); + iput(inode); return ERR_PTR(-EPERM); } } @@ -3088,7 +3092,7 @@ static int ext4_symlink(struct inode *dir, if (err) return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); sd = kzalloc(disk_link.len, GFP_KERNEL); @@ -3525,6 +3529,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; + if ((ext4_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (ext4_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3724,6 +3734,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, u8 new_file_type; int retval; + if ((ext4_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (ext4_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && (old_dir != new_dir) && diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index cf681004b196..95bf46654153 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1926,7 +1926,8 @@ retry: n_desc_blocks = o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); - n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_blocks_count = (ext4_fsblk_t)n_group * + EXT4_BLOCKS_PER_GROUP(sb); n_group--; /* set to last group number */ } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bbc316db9495..1f581791b39d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -825,6 +825,7 @@ static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + int aborted = 0; int i, err; ext4_unregister_li_request(sb); @@ -834,9 +835,10 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { + aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if (err < 0) + if ((err < 0) && !aborted) ext4_abort(sb, "Couldn't clean up the journal"); } @@ -847,7 +849,7 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!(sb->s_flags & MS_RDONLY)) { + if (!(sb->s_flags & MS_RDONLY) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } @@ -2332,6 +2334,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA + int quota_update = 0; int i; #endif if (!es->s_last_orphan) { @@ -2370,14 +2373,32 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); - if (ret < 0) + + if (!ret) + quota_update = 1; + else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " - "quota: error %d", ret); + "quota: type %d: error %d", i, ret); } } #endif @@ -2436,10 +2457,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn quotas off */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -2605,9 +2628,9 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; - else if (stripe_width <= sbi->s_blocks_per_group) + else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; - else if (stride <= sbi->s_blocks_per_group) + else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; @@ -3828,7 +3851,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { - if (le32_to_cpu(es->s_first_meta_bg) >= db_count) { + if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", @@ -3911,7 +3934,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - if (ext4_load_journal(sb, es, journal_devnum)) + err = ext4_load_journal(sb, es, journal_devnum); + if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && ext4_has_feature_journal_needs_recovery(sb)) { @@ -5362,6 +5386,9 @@ static int ext4_enable_quotas(struct super_block *sb) DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d). Please run " diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 42145be5c6b4..5dc655e410b4 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -100,7 +100,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (!ret || val >= clusters) + if (ret || val >= clusters) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d77be9e9f535..3eeed8f0aa06 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -131,31 +131,26 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, } static int ext4_xattr_block_csum_verify(struct inode *inode, - sector_t block_nr, - struct ext4_xattr_header *hdr) + struct buffer_head *bh) { - if (ext4_has_metadata_csum(inode->i_sb) && - (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) - return 0; - return 1; -} - -static void ext4_xattr_block_csum_set(struct inode *inode, - sector_t block_nr, - struct ext4_xattr_header *hdr) -{ - if (!ext4_has_metadata_csum(inode->i_sb)) - return; + struct ext4_xattr_header *hdr = BHDR(bh); + int ret = 1; - hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); + if (ext4_has_metadata_csum(inode->i_sb)) { + lock_buffer(bh); + ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, + bh->b_blocknr, hdr)); + unlock_buffer(bh); + } + return ret; } -static inline int ext4_handle_dirty_xattr_block(handle_t *handle, - struct inode *inode, - struct buffer_head *bh) +static void ext4_xattr_block_csum_set(struct inode *inode, + struct buffer_head *bh) { - ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh)); - return ext4_handle_dirty_metadata(handle, inode, bh); + if (ext4_has_metadata_csum(inode->i_sb)) + BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, + bh->b_blocknr, BHDR(bh)); } static inline const struct xattr_handler * @@ -218,7 +213,7 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EFSCORRUPTED; - if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) + if (!ext4_xattr_block_csum_verify(inode, bh)) return -EFSBADCRC; error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, bh->b_data); @@ -601,23 +596,22 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, } } + ext4_xattr_block_csum_set(inode, bh); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect * from a race where someone else frees the block (and releases * its journal_head) before we are done dirtying the buffer. In * nojournal mode this race is harmless and we actually cannot - * call ext4_handle_dirty_xattr_block() with locked buffer as + * call ext4_handle_dirty_metadata() with locked buffer as * that function can call sync_dirty_buffer() so for that case * we handle the dirtying after unlocking the buffer. */ if (ext4_handle_valid(handle)) - error = ext4_handle_dirty_xattr_block(handle, inode, - bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); unlock_buffer(bh); if (!ext4_handle_valid(handle)) - error = ext4_handle_dirty_xattr_block(handle, inode, - bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); @@ -846,13 +840,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ext4_xattr_cache_insert(ext4_mb_cache, bs->bh); } + ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) goto bad_block; if (!error) - error = ext4_handle_dirty_xattr_block(handle, - inode, - bs->bh); + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -950,10 +945,11 @@ inserted: ce->e_reusable = 0; ea_bdebug(new_bh, "reusing; refcount now=%d", ref); + ext4_xattr_block_csum_set(inode, new_bh); unlock_buffer(new_bh); - error = ext4_handle_dirty_xattr_block(handle, - inode, - new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -1003,11 +999,12 @@ getblk_failed: goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); + ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_cache_insert(ext4_mb_cache, new_bh); - error = ext4_handle_dirty_xattr_block(handle, - inode, new_bh); + error = ext4_handle_dirty_metadata(handle, inode, + new_bh); if (error) goto cleanup; } @@ -1174,16 +1171,14 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; - unsigned long no_expand; + int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; - down_write(&EXT4_I(inode)->xattr_sem); - no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_write_lock_xattr(inode, &no_expand); error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) @@ -1251,7 +1246,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = ext4_current_time(inode); if (!value) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1265,9 +1260,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, cleanup: brelse(is.iloc.bh); brelse(bs.bh); - if (no_expand == 0) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return error; } @@ -1484,12 +1477,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ + int no_expand; + + if (ext4_write_trylock_xattr(inode, &no_expand) == 0) + return 0; - down_write(&EXT4_I(inode)->xattr_sem); - /* - * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty - */ - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) @@ -1571,17 +1563,16 @@ shift: EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return 0; cleanup: brelse(bh); /* - * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode - * size expansion failed. + * Inode size expansion failed; don't try again */ - up_write(&EXT4_I(inode)->xattr_sem); + no_expand = 1; + ext4_write_unlock_xattr(inode, &no_expand); return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index a92e783fa057..099c8b670ef5 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -102,6 +102,38 @@ extern const struct xattr_handler ext4_xattr_security_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" +/* + * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes. + * The first is to signal that there the inline xattrs and data are + * taking up so much space that we might as well not keep trying to + * expand it. The second is that xattr_sem is taken for writing, so + * we shouldn't try to recurse into the inode expansion. For this + * second case, we need to make sure that we take save and restore the + * NO_EXPAND state flag appropriately. + */ +static inline void ext4_write_lock_xattr(struct inode *inode, int *save) +{ + down_write(&EXT4_I(inode)->xattr_sem); + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); +} + +static inline int ext4_write_trylock_xattr(struct inode *inode, int *save) +{ + if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0) + return 0; + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + return 1; +} + +static inline void ext4_write_unlock_xattr(struct inode *inode, int *save) +{ + if (*save == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); +} + extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 6fe23af509e1..55aa29c0c78d 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -211,7 +211,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { + if (acl && !ipage) { error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 14db4b712021..99432b59c5cb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1619,7 +1619,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. + */ + page = pagecache_get_page(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; goto fail; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 369f4513be37..8add4e8bab99 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -130,19 +130,29 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ + if (de->hash_code != namehash) + goto not_match; + de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == fname->hash) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (unlikely(!name->name)) { + if (fname->usr_fname->name[0] == '_') { + if (de_name.len > 32 && + !memcmp(de_name.name + ((de_name.len - 17) & ~15), + fname->crypto_buf.name + 8, 16)) + goto found; + goto not_match; + } + name->name = fname->crypto_buf.name; + name->len = fname->crypto_buf.len; + } +#endif + if (de_name.len == name->len && + !memcmp(de_name.name, name->name, name->len)) goto found; - +not_match: if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; @@ -170,12 +180,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -207,9 +212,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - if (!de && room && F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; + /* This is to increase the speed of f2fs_create */ + if (!de && room) { + F2FS_I(dir)->task = current; + if (F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; + } } return de; @@ -268,7 +277,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, &fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } @@ -535,7 +547,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -643,14 +655,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stakable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. + */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..7b32ce979fe1 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -352,11 +352,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) - __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + en = next_ex; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 506af456412f..88e111ab068b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -431,6 +431,7 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ @@ -833,6 +834,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -944,6 +948,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, { SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = sbi->s_chksum_driver; @@ -953,7 +958,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, @@ -2012,7 +2019,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6f14ee923acd..34a69e7ed90b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -166,7 +166,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; @@ -199,6 +200,10 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } @@ -322,13 +327,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -972,5 +979,16 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, ovp_count, blocks_per_sec; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; + + sbi->fggc_threshold = div_u64((main_count - ovp_count) * blocks_per_sec, + (main_count - resv_count)); } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5f1a67f756af..a21faa1c6817 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -294,7 +294,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); inline_dentry = inline_data_addr(ipage); @@ -531,7 +531,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); + name_hash = f2fs_dentry_hash(new_name, NULL); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 489fa0d5f914..8556fe1ccb8a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -403,7 +403,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -447,7 +447,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } @@ -663,6 +663,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; @@ -843,6 +849,12 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && (old_dir != new_dir) && (!fscrypt_has_permitted_context(new_dir, old_inode) || diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..98c1a63a4614 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -316,7 +316,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; @@ -626,8 +626,6 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fc886f008449..e10f61684ea4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -813,6 +813,8 @@ next: start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); @@ -1261,7 +1263,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) + if (IS_NODESEG(type)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); @@ -1803,6 +1805,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -1829,6 +1833,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..b164f8339281 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -688,6 +688,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 013c6a541d6b..eb20b8767f3c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1405,6 +1405,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -1417,6 +1424,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1428,6 +1437,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 338d2f73eb29..a2c05f2ada6d 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1359,6 +1359,16 @@ out: return 0; } +static void fat_dummy_inode_init(struct inode *inode) +{ + /* Initialize this dummy inode to work as no-op. */ + MSDOS_I(inode)->mmu_private = 0; + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + MSDOS_I(inode)->i_attrs = 0; + MSDOS_I(inode)->i_pos = 0; +} + static int fat_read_root(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); @@ -1803,12 +1813,13 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, fat_inode = new_inode(sb); if (!fat_inode) goto out_fail; - MSDOS_I(fat_inode)->i_pos = 0; + fat_dummy_inode_init(fat_inode); sbi->fat_inode = fat_inode; fsinfo_inode = new_inode(sb); if (!fsinfo_inode) goto out_fail; + fat_dummy_inode_init(fsinfo_inode); fsinfo_inode->i_ino = MSDOS_FSINFO_INO; sbi->fsinfo_inode = fsinfo_inode; insert_inode_hash(fsinfo_inode); diff --git a/fs/fcntl.c b/fs/fcntl.c index 350a2c8cfd28..1493ceb0477d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -741,16 +741,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( - O_RDONLY | O_WRONLY | O_RDWR | - O_CREAT | O_EXCL | O_NOCTTY | - O_TRUNC | O_APPEND | /* O_NONBLOCK | */ - __O_SYNC | O_DSYNC | FASYNC | - O_DIRECT | O_LARGEFILE | O_DIRECTORY | - O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH | __O_TMPFILE | - __FMODE_NONOTIFY - )); + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + HWEIGHT32( + (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | + __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 4304072161aa..40d61077bead 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -542,6 +542,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { if (invalidate) set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); } } else { @@ -560,6 +561,10 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, TASK_UNINTERRUPTIBLE); + /* Make sure any pending writes are cancelled. */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + fscache_invalidate_writes(cookie); + /* Reset the cookie state if it wasn't relinquished */ if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { atomic_inc(&cookie->n_active); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index 9b28649df3a1..a8aa00be4444 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -48,6 +48,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) cookie->flags = 1 << FSCACHE_COOKIE_ENABLED; spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); INIT_HLIST_HEAD(&cookie->backing_objects); /* check the netfs type is not already present */ diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 5d5ddaa84b21..37e0c31d284f 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -330,6 +330,13 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) rcu_read_lock(); confkey = user_key_payload(key); + if (!confkey) { + /* key was revoked */ + rcu_read_unlock(); + key_put(key); + goto no_config; + } + buf = confkey->data; for (len = confkey->datalen - 1; len >= 0; len--) { diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 9e792e30f4db..7a182c87f378 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -30,6 +30,7 @@ static const struct fscache_state *fscache_look_up_object(struct fscache_object static const struct fscache_state *fscache_object_available(struct fscache_object *, int); static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); static const struct fscache_state *fscache_update_object(struct fscache_object *, int); +static const struct fscache_state *fscache_object_dead(struct fscache_object *, int); #define __STATE_NAME(n) fscache_osm_##n #define STATE(n) (&__STATE_NAME(n)) @@ -91,7 +92,7 @@ static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); -static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); +static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead); static WAIT_STATE(WAIT_FOR_INIT, "?INI", TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); @@ -229,6 +230,10 @@ execute_work_state: event = -1; if (new_state == NO_TRANSIT) { _debug("{OBJ%x} %s notrans", object->debug_id, state->name); + if (unlikely(state == STATE(OBJECT_DEAD))) { + _leave(" [dead]"); + return; + } fscache_enqueue_object(object); event_mask = object->oob_event_mask; goto unmask_events; @@ -239,7 +244,7 @@ execute_work_state: object->state = state = new_state; if (state->work) { - if (unlikely(state->work == ((void *)2UL))) { + if (unlikely(state == STATE(OBJECT_DEAD))) { _leave(" [dead]"); return; } @@ -645,6 +650,12 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob fscache_mark_object_dead(object); object->oob_event_mask = 0; + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) { + /* Reject any new read/write ops and abort any that are pending. */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + fscache_cancel_all_ops(object); + } + if (list_empty(&object->dependents) && object->n_ops == 0 && object->n_children == 0) @@ -1077,3 +1088,20 @@ void fscache_object_mark_killed(struct fscache_object *object, } } EXPORT_SYMBOL(fscache_object_mark_killed); + +/* + * The object is dead. We can get here if an object gets queued by an event + * that would lead to its death (such as EV_KILL) when the dispatcher is + * already running (and so can be requeued) but hasn't yet cleared the event + * mask. + */ +static const struct fscache_state *fscache_object_dead(struct fscache_object *object, + int event) +{ + if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD, + &object->flags)) + return NO_TRANSIT; + + WARN(true, "FS-Cache object redispatched after death"); + return NO_TRANSIT; +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4e06a27ed7f8..f11792672977 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -399,6 +399,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->waitq.lock); + if (test_bit(FR_FINISHED, &req->flags)) { + spin_unlock(&fiq->waitq.lock); + return; + } if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); wake_up_locked(&fiq->waitq); @@ -1372,6 +1376,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, * code can Oops if the buffer persists after module unload. */ bufs[page_nr].ops = &nosteal_pipe_buf_ops; + bufs[page_nr].flags = 0; ret = add_to_pipe(pipe, &bufs[page_nr++]); if (unlikely(ret < 0)) break; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 642c57b8de7b..4bbad745415a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1312,7 +1312,8 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, */ over = !dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, dirent->type); - ctx->pos = dirent->off; + if (!over) + ctx->pos = dirent->off; } buf += reclen; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2401c5dabb2a..996aa23c409e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -46,7 +46,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); if (unlikely(!ff)) return NULL; @@ -100,6 +100,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { + __set_bit(FR_FORCE, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(ff->fc, req); iput(req->misc.release.inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 14cbf60167a7..7a8b1d72e3d9 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -80,9 +80,9 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; -void gfs2_glock_free(struct gfs2_glock *gl) +static void gfs2_glock_dealloc(struct rcu_head *rcu) { - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); @@ -90,6 +90,13 @@ void gfs2_glock_free(struct gfs2_glock *gl) kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(gfs2_glock_cachep, gl); } +} + +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); } @@ -658,9 +665,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret, tries = 0; + rcu_read_lock(); gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); if (gl && !lockref_get_not_dead(&gl->gl_lockref)) gl = NULL; + rcu_read_unlock(); *glp = gl; if (gl) @@ -728,15 +737,18 @@ again: if (ret == -EEXIST) { ret = 0; + rcu_read_lock(); tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) { if (++tries < 100) { + rcu_read_unlock(); cond_resched(); goto again; } tmp = NULL; ret = -ENOMEM; } + rcu_read_unlock(); } else { WARN_ON_ONCE(ret); } @@ -1420,26 +1432,32 @@ static struct shrinker glock_shrinker = { * @sdp: the filesystem * @bucket: the bucket * + * Note that the function can be called multiple times on the same + * object. So the user must ensure that the function can cope with + * that. */ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhash_head *pos; - const struct bucket_table *tbl; - int i; + struct rhashtable_iter iter; - rcu_read_lock(); - tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { + rhashtable_walk_enter(&gl_hash_table, &iter); + + do { + gl = ERR_PTR(rhashtable_walk_start(&iter)); + if (gl) + continue; + + while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); - } - } - rcu_read_unlock(); - cond_resched(); + + rhashtable_walk_stop(&iter); + } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); + + rhashtable_walk_exit(&iter); } /** @@ -1802,29 +1820,27 @@ void gfs2_glock_exit(void) static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { - do { - gi->gl = rhashtable_walk_next(&gi->hti); + while ((gi->gl = rhashtable_walk_next(&gi->hti))) { if (IS_ERR(gi->gl)) { if (PTR_ERR(gi->gl) == -EAGAIN) continue; gi->gl = NULL; + return; } - /* Skip entries for other sb and dead entries */ - } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) || - __lockref_is_dead(&gi->gl->gl_lockref))); + /* Skip entries for other sb and dead entries */ + if (gi->sdp == gi->gl->gl_name.ln_sbd && + !__lockref_is_dead(&gi->gl->gl_lockref)) + return; + } } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - int ret; - if (gi->last_pos <= *pos) - n = (*pos - gi->last_pos); - - ret = rhashtable_walk_start(&gi->hti); - if (ret) + rhashtable_walk_enter(&gl_hash_table, &gi->hti); + if (rhashtable_walk_start(&gi->hti) != 0) return NULL; do { @@ -1832,6 +1848,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) } while (gi->gl && n--); gi->last_pos = *pos; + return gi->gl; } @@ -1843,6 +1860,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi); + return gi->gl; } @@ -1852,6 +1870,7 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) gi->gl = NULL; rhashtable_walk_stop(&gi->hti); + rhashtable_walk_exit(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1914,12 +1933,10 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); } return ret; } @@ -1930,7 +1947,6 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->gl = NULL; - rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } @@ -1942,12 +1958,10 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); } return ret; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a6a3389a07fc..a04bf9533d9e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -207,7 +207,7 @@ struct lm_lockname { struct gfs2_sbd *ln_sbd; u64 ln_number; unsigned int ln_type; -}; +} __packed __aligned(sizeof(int)); #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ @@ -370,6 +370,7 @@ struct gfs2_glock { loff_t end; } gl_vm; }; + struct rcu_head gl_rcu; struct rhash_head gl_node; }; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index 9b92058a1240..6bb5d7c42888 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -51,8 +51,8 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) return acl; } -int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, - int type) +static int __hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type) { int err; char *xattr_name; @@ -64,12 +64,6 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, switch (type) { case ACL_TYPE_ACCESS: xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - err = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (err) - return err; - } - err = 0; break; case ACL_TYPE_DEFAULT: @@ -105,6 +99,18 @@ end_set_acl: return err; } +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int err; + + if (type == ACL_TYPE_ACCESS && acl) { + err = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (err) + return err; + } + return __hfsplus_set_posix_acl(inode, acl, type); +} + int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { int err = 0; @@ -122,15 +128,15 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) return err; if (default_acl) { - err = hfsplus_set_posix_acl(inode, default_acl, - ACL_TYPE_DEFAULT); + err = __hfsplus_set_posix_acl(inode, default_acl, + ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!err) - err = hfsplus_set_posix_acl(inode, acl, - ACL_TYPE_ACCESS); + err = __hfsplus_set_posix_acl(inode, acl, + ACL_TYPE_ACCESS); posix_acl_release(acl); } return err; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4fb7b10f3a05..2c2f182cde03 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -191,7 +191,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -695,14 +695,11 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode->i_mode = S_IFDIR | config->mode; inode->i_uid = config->uid; inode->i_gid = config->gid; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - info = HUGETLBFS_I(inode); - mpol_shared_policy_init(&info->policy, NULL); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -733,7 +730,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, @@ -741,15 +737,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; - info = HUGETLBFS_I(inode); - /* - * The policy is initialized here even if we are creating a - * private inode because initialization simply creates an - * an empty rb tree and calls rwlock_init(), later when we - * call mpol_free_shared_policy() it will just return because - * the rb tree will still be empty. - */ - mpol_shared_policy_init(&info->policy, NULL); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -937,6 +924,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + + /* + * Any time after allocation, hugetlbfs_destroy_inode can be called + * for the inode. mpol_free_shared_policy is unconditionally called + * as part of hugetlbfs_destroy_inode. So, initialize policy here + * in case of a quick call to destroy. + * + * Note that the policy is initialized even if we are creating a + * private inode. This simplifies hugetlbfs_destroy_inode. + */ + mpol_shared_policy_init(&p->policy, NULL); + return &p->vfs_inode; } diff --git a/fs/inode.c b/fs/inode.c index 88110fd0b282..920aa0b1c6b0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -637,6 +637,7 @@ again: dispose_list(&dispose); } +EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock diff --git a/fs/internal.h b/fs/internal.h index f4da3341b4a3..8b7143b0211c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const struct path *, struct inode *); extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); -extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); /* diff --git a/fs/iomap.c b/fs/iomap.c index 814ae8f9587d..a49db8806a3a 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned long bytes; /* Bytes to write to page */ offset = (pos & (PAGE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + bytes = min_t(loff_t, PAGE_SIZE - offset, length); rpage = __iomap_read_page(inode, pos); if (IS_ERR(rpage)) @@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, unsigned offset, bytes; offset = pos & (PAGE_SIZE - 1); /* Within page */ - bytes = min_t(unsigned, PAGE_SIZE - offset, count); + bytes = min_t(loff_t, PAGE_SIZE - offset, count); if (IS_DAX(inode)) status = iomap_dax_zero(pos, offset, bytes, iomap); @@ -419,8 +419,8 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, struct iomap_ops *ops) { - unsigned blocksize = (1 << inode->i_blkbits); - unsigned off = pos & (blocksize - 1); + unsigned int blocksize = i_blocksize(inode); + unsigned int off = pos & (blocksize - 1); /* Block boundary? Nothing to do */ if (!off) diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 0ac4c1f73fbd..25177e6bd603 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -103,7 +103,7 @@ static inline unsigned int isonum_733(char *p) /* Ignore bigendian datum due to broken mastering programs */ return get_unaligned_le32(p); } -extern int iso_date(char *, int); +extern int iso_date(u8 *, int); struct inode; /* To make gcc happy */ diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index ed09e2b08637..f835976ce033 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -65,7 +65,7 @@ struct RR_PL_s { }; struct stamp { - char time[7]; + __u8 time[7]; /* actually 6 unsigned, 1 signed */ } __attribute__ ((packed)); struct RR_TF_s { diff --git a/fs/isofs/util.c b/fs/isofs/util.c index 005a15cfd30a..37860fea364d 100644 --- a/fs/isofs/util.c +++ b/fs/isofs/util.c @@ -15,7 +15,7 @@ * to GMT. Thus we should always be correct. */ -int iso_date(char * p, int flag) +int iso_date(u8 *p, int flag) { int year, month, day, hour, minute, second, tz; int crtime; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 927da4956a89..7d4b557f1962 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1125,10 +1125,8 @@ static journal_t *journal_init_common(struct block_device *bdev, /* Set up a default-sized revoke table for the new mount. */ err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); - if (err) { - kfree(journal); - return NULL; - } + if (err) + goto err_cleanup; spin_lock_init(&journal->j_history_lock); @@ -1145,23 +1143,25 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_wbufsize = n; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); - if (!journal->j_wbuf) { - kfree(journal); - return NULL; - } + if (!journal->j_wbuf) + goto err_cleanup; bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); if (!bh) { pr_err("%s: Cannot get buffer for journal superblock\n", __func__); - kfree(journal->j_wbuf); - kfree(journal); - return NULL; + goto err_cleanup; } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; + +err_cleanup: + kfree(journal->j_wbuf); + jbd2_journal_destroy_revoke(journal); + kfree(journal); + return NULL; } /* jbd2_journal_init_dev and jbd2_journal_init_inode: diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 91171dc352cb..3cd73059da9a 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -280,6 +280,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) fail1: jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + journal->j_revoke_table[0] = NULL; fail0: return -ENOMEM; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e1652665bd93..5e659ee08d6a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1863,7 +1863,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) __blist_del_buffer(list, jh); jh->b_jlist = BJ_None; - if (test_clear_buffer_jbddirty(bh)) + if (transaction && is_journal_aborted(transaction->t_journal)) + clear_buffer_jbddirty(bh); + else if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh); /* Expose it to the VM */ } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 7bc186f4ed4d..1be45c8d460d 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -77,13 +77,6 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (rc) - return rc; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); - } break; case ACL_TYPE_DEFAULT: ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; @@ -118,9 +111,17 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); + if (type == ACL_TYPE_ACCESS && acl) { + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) + goto end_tx; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } rc = __jfs_set_acl(tid, inode, type, acl); if (!rc) rc = txCommit(tid, 1, &inode, 0); +end_tx: txEnd(tid); mutex_unlock(&JFS_IP(inode)->commit_mutex); return rc; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 85671f7f8518..14be95bc0bcd 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -758,7 +758,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, sb->s_blocksize - offset : toread; tmp_bh.b_state = 0; - tmp_bh.b_size = 1 << inode->i_blkbits; + tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 0); if (err) return err; @@ -798,7 +798,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; - tmp_bh.b_size = 1 << inode->i_blkbits; + tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 1); if (err) goto out; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index fc4084ef4736..9d373247222c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -365,6 +365,7 @@ static int lockd_start_svc(struct svc_serv *serv) printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); + lockd_unregister_notifiers(); goto out_rqst; } @@ -455,13 +456,16 @@ int lockd_up(struct net *net) } error = lockd_up_net(serv, net); - if (error < 0) - goto err_net; + if (error < 0) { + lockd_unregister_notifiers(); + goto err_put; + } error = lockd_start_svc(serv); - if (error < 0) - goto err_start; - + if (error < 0) { + lockd_down_net(serv, net); + goto err_put; + } nlmsvc_users++; /* * Note: svc_serv structures have an initial use count of 1, @@ -472,12 +476,6 @@ err_put: err_create: mutex_unlock(&nlmsvc_mutex); return error; - -err_start: - lockd_down_net(serv, net); -err_net: - lockd_unregister_notifiers(); - goto err_put; } EXPORT_SYMBOL_GPL(lockd_up); diff --git a/fs/mount.h b/fs/mount.h index d2e25d7b64b3..3603884a63dd 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -58,6 +58,7 @@ struct mount { struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ + struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; @@ -89,7 +90,6 @@ static inline int is_mounted(struct vfsmount *mnt) } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); diff --git a/fs/mpage.c b/fs/mpage.c index d2413af0823a..e2ea442bb9e1 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -115,7 +115,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) SetPageUptodate(page); return; } - create_empty_buffers(page, 1 << inode->i_blkbits, 0); + create_empty_buffers(page, i_blocksize(inode), 0); } head = page_buffers(page); page_bh = head; @@ -466,6 +466,16 @@ static void clean_buffers(struct page *page, unsigned first_unmapped) try_to_free_buffers(page); } +/* + * For situations where we want to clean all buffers attached to a page. + * We don't need to calculate how many buffers are attached to the page, + * we just need to specify a number larger than the maximum number of buffers. + */ +void clean_page_buffers(struct page *page) +{ + clean_buffers(page, ~0U); +} + static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -604,10 +614,8 @@ alloc_new: if (bio == NULL) { if (first_unmapped == blocks_per_page) { if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), - page, wbc)) { - clean_buffers(page, first_unmapped); + page, wbc)) goto out; - } } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); diff --git a/fs/namei.c b/fs/namei.c index 5b4eed221530..e7d125c23aa6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; - const struct cred *old_cred; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, if (nd->total_link_count >= 40) return -ELOOP; - old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); - revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate @@ -2974,10 +2971,16 @@ static inline int open_to_namei_flags(int flag) static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) { + struct user_namespace *s_user_ns; int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; + s_user_ns = dir->dentry->d_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) + return -EOVERFLOW; + error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -4339,11 +4342,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, { int error; bool is_dir = d_is_dir(old_dentry); - const unsigned char *old_name; struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; + struct name_snapshot old_name; /* * Check source == target. @@ -4394,7 +4397,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); @@ -4449,14 +4452,14 @@ out: inode_unlock(target); dput(new_dentry); if (!error) { - fsnotify_move(old_dir, new_dir, old_name, is_dir, + fsnotify_move(old_dir, new_dir, old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, new_is_dir, NULL, new_dentry); } } - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); return error; } diff --git a/fs/namespace.c b/fs/namespace.c index 7cea503ae06d..d7360f9897b4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); + INIT_LIST_HEAD(&mnt->mnt_umounting); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif @@ -641,28 +642,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) } /* - * find the last mount at @dentry on vfsmount @mnt. - * mount_lock must be held. - */ -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) -{ - struct mount *p, *res = NULL; - p = __lookup_mnt(mnt, dentry); - if (!p) - goto out; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - hlist_for_each_entry_continue(p, mnt_hash) { - if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) - break; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - } -out: - return res; -} - -/* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. If you create the @@ -882,6 +861,13 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +static void __attach_mnt(struct mount *mnt, struct mount *parent) +{ + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); +} + /* * vfsmount lock must be held for write */ @@ -890,28 +876,45 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + __attach_mnt(mnt, parent); } -static void attach_shadowed(struct mount *mnt, - struct mount *parent, - struct mount *shadows) +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { - if (shadows) { - hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); - list_add(&mnt->mnt_child, &shadows->mnt_child); - } else { - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); - } + struct mountpoint *old_mp = mnt->mnt_mp; + struct dentry *old_mountpoint = mnt->mnt_mountpoint; + struct mount *old_parent = mnt->mnt_parent; + + list_del_init(&mnt->mnt_child); + hlist_del_init(&mnt->mnt_mp_list); + hlist_del_init_rcu(&mnt->mnt_hash); + + attach_mnt(mnt, parent, mp); + + put_mountpoint(old_mp); + + /* + * Safely avoid even the suggestion this code might sleep or + * lock the mount hash by taking advantage of the knowledge that + * mnt_change_mountpoint will not release the final reference + * to a mountpoint. + * + * During mounting, the mount passed in as the parent mount will + * continue to use the old mountpoint and during unmounting, the + * old mountpoint will continue to exist until namespace_unlock, + * which happens well after mnt_change_mountpoint. + */ + spin_lock(&old_mountpoint->d_lock); + old_mountpoint->d_lockref.count--; + spin_unlock(&old_mountpoint->d_lock); + + mnt_add_count(old_parent, -1); } /* * vfsmount lock must be held for write */ -static void commit_tree(struct mount *mnt, struct mount *shadows) +static void commit_tree(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m; @@ -929,7 +932,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) n->mounts += n->pending_mounts; n->pending_mounts = 0; - attach_shadowed(mnt, parent, shadows); + __attach_mnt(mnt, parent); touch_mnt_namespace(n); } @@ -993,6 +996,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void } EXPORT_SYMBOL_GPL(vfs_kern_mount); +struct vfsmount * +vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, + const char *name, void *data) +{ + /* Until it is worked out how to pass the user namespace + * through from the parent mount to the submount don't support + * unprivileged mounts with submounts. + */ + if (mountpoint->d_sb->s_user_ns != &init_user_ns) + return ERR_PTR(-EPERM); + + return vfs_kern_mount(type, MS_SUBMOUNT, name, data); +} +EXPORT_SYMBOL_GPL(vfs_submount); + static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -1737,7 +1755,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { - struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1759,14 +1776,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - mnt_set_mountpoint(parent, p->mnt_mp, q); - if (!list_empty(&parent->mnt_mounts)) { - t = list_last_entry(&parent->mnt_mounts, - struct mount, mnt_child); - if (t->mnt_mp != p->mnt_mp) - t = NULL; - } - attach_shadowed(q, parent, t); + attach_mnt(q, parent, p->mnt_mp); unlock_mount_hash(); } } @@ -1967,10 +1977,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, { HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mountpoint *smp; struct mount *child, *p; struct hlist_node *n; int err; + /* Preallocate a mountpoint in case the new mounts need + * to be tucked under other mounts. + */ + smp = get_mountpoint(source_mnt->mnt.mnt_root); + if (IS_ERR(smp)) + return PTR_ERR(smp); + /* Is there space to add these mounts to the mount namespace? */ if (!parent_path) { err = count_mounts(ns, source_mnt); @@ -1997,16 +2015,19 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt, NULL); + commit_tree(source_mnt); } hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt_last(&child->mnt_parent->mnt, - child->mnt_mountpoint); - commit_tree(child, q); + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); + commit_tree(child); } + put_mountpoint(smp); unlock_mount_hash(); return 0; @@ -2021,6 +2042,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; + + read_seqlock_excl(&mount_lock); + put_mountpoint(smp); + read_sequnlock_excl(&mount_lock); + return err; } @@ -2769,7 +2795,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME | MS_NOREMOTELOCK); + MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f31fd0dd92c6..b1daeafbea92 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -121,6 +121,7 @@ config PNFS_FILE_LAYOUT config PNFS_BLOCK tristate depends on NFS_V4_1 && BLK_DEV_DM + depends on 64BIT || LBDAF default NFS_V4 config PNFS_OBJLAYOUT diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 484bebc20bca..af84a92cb142 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -75,7 +75,10 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); /* * Listen for a request on the socket */ @@ -84,6 +87,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -102,9 +107,10 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { - if (try_to_freeze()) - continue; + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -120,11 +126,13 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - schedule(); + if (!kthread_should_stop()) + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } - flush_signals(current); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -220,23 +228,23 @@ err_bind: static struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; -struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = &nfs41_cb_sv_ops, }; #else -struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = NULL, }; @@ -279,7 +287,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 53e02b8bd9bd..65566d5fcf39 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1167,11 +1167,13 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) /* Force a full look up iff the parent directory has changed */ if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { - - if (nfs_lookup_verify_inode(inode, flags)) { + error = nfs_lookup_verify_inode(inode, flags); + if (error) { if (flags & LOOKUP_RCU) return -ECHILD; - goto out_zap_parent; + if (error == -ESTALE) + goto out_zap_parent; + goto out_error; } goto out_valid; } @@ -1195,8 +1197,10 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) trace_nfs_lookup_revalidate_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); - if (error) + if (error == -ESTALE || error == -ENOENT) goto out_bad; + if (error) + goto out_error; if (nfs_compare_fh(NFS_FH(inode), fhandle)) goto out_bad; if ((error = nfs_refresh_inode(inode, fattr)) != 0) @@ -1288,7 +1292,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - error = nfs_revalidate_inode(NFS_SERVER(inode), inode); + error = nfs_lookup_verify_inode(inode, flags); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ? "invalid" : "valid"); return !error; @@ -1439,6 +1443,7 @@ static int nfs4_lookup_revalidate(struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, + .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a1de8ef63e56..1eec947c562d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -636,11 +636,11 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result <= 0) goto out; - result = generic_write_sync(iocb, result); - if (result < 0) - goto out; written = result; iocb->ki_pos += written; + result = generic_write_sync(iocb, written); + if (result < 0) + goto out; /* Return error values */ if (nfs_need_check_write(file, inode)) { @@ -757,7 +757,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) */ nfs_sync_mapping(filp->f_mapping); if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - nfs_zap_mapping(inode, filp->f_mapping); + nfs_zap_caches(inode); out: return status; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index a5c38889e7ae..13abd608af0f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1073,9 +1073,6 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs_client *mds_client = mds_server->nfs_client; struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - if (task->tk_status >= 0) - return 0; - switch (task->tk_status) { /* MDS state errors */ case -NFS4ERR_DELEG_REVOKED: @@ -1176,9 +1173,6 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); - if (task->tk_status >= 0) - return 0; - switch (task->tk_status) { /* File access problems. Don't mark the device as unavailable */ case -EACCES: @@ -1213,6 +1207,13 @@ static int ff_layout_async_handle_error(struct rpc_task *task, { int vers = clp->cl_nfs_mod->rpc_vers->number; + if (task->tk_status >= 0) + return 0; + + /* Handle the case of an invalid layout segment */ + if (!pnfs_is_valid_lseg(lseg)) + return -NFS4ERR_RESET_TO_PNFS; + switch (vers) { case 3: return ff_layout_async_handle_error_v3(task, lseg, idx); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index f7a3f6b05369..90099896b838 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -30,6 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); + kfree(mirror_ds->ds_versions); kfree_rcu(mirror_ds, id_node.rcu); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bf4ec5ecc97e..76ae25661d3f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1278,9 +1278,9 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; /* Has the inode gone and changed behind our back? */ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) - return -EIO; + return -ESTALE; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) - return -EIO; + return -ESTALE; if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 80bcc0befb07..52ea41bce038 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -248,7 +248,6 @@ int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); -void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 5551e8ef67fd..e49d831c4e85 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { - return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata); } /** diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 608501971fe0..5cda392028ce 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -128,30 +128,26 @@ out_unlock: return err; } -static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, +static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_lock_context *src_lock, - struct file *dst, loff_t pos_dst, + struct file *dst, struct nfs_lock_context *dst_lock, - size_t count) + struct nfs42_copy_args *args, + struct nfs42_copy_res *res) { - struct nfs42_copy_args args = { - .src_fh = NFS_FH(file_inode(src)), - .src_pos = pos_src, - .dst_fh = NFS_FH(file_inode(dst)), - .dst_pos = pos_dst, - .count = count, - }; - struct nfs42_copy_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], - .rpc_argp = &args, - .rpc_resp = &res, + .rpc_argp = args, + .rpc_resp = res, }; struct inode *dst_inode = file_inode(dst); struct nfs_server *server = NFS_SERVER(dst_inode); + loff_t pos_src = args->src_pos; + loff_t pos_dst = args->dst_pos; + size_t count = args->count; int status; - status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + status = nfs4_set_rw_stateid(&args->src_stateid, src_lock->open_context, src_lock, FMODE_READ); if (status) return status; @@ -161,7 +157,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, if (status) return status; - status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + status = nfs4_set_rw_stateid(&args->dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); if (status) return status; @@ -171,22 +167,22 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, return status; status = nfs4_call_sync(server->client, server, &msg, - &args.seq_args, &res.seq_res, 0); + &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) return status; - if (res.write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (res->write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res->write_res.verifier.verifier); if (status) return status; } truncate_pagecache_range(dst_inode, pos_dst, - pos_dst + res.write_res.count); + pos_dst + res->write_res.count); - return res.write_res.count; + return res->write_res.count; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -196,8 +192,22 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, struct nfs_server *server = NFS_SERVER(file_inode(dst)); struct nfs_lock_context *src_lock; struct nfs_lock_context *dst_lock; - struct nfs4_exception src_exception = { }; - struct nfs4_exception dst_exception = { }; + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct nfs4_exception src_exception = { + .inode = file_inode(src), + .stateid = &args.src_stateid, + }; + struct nfs4_exception dst_exception = { + .inode = file_inode(dst), + .stateid = &args.dst_stateid, + }; ssize_t err, err2; if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) @@ -207,7 +217,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (IS_ERR(src_lock)) return PTR_ERR(src_lock); - src_exception.inode = file_inode(src); src_exception.state = src_lock->open_context->state; dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); @@ -216,15 +225,17 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, goto out_put_src_lock; } - dst_exception.inode = file_inode(dst); dst_exception.state = dst_lock->open_context->state; do { inode_lock(file_inode(dst)); - err = _nfs42_proc_copy(src, pos_src, src_lock, - dst, pos_dst, dst_lock, count); + err = _nfs42_proc_copy(src, src_lock, + dst, dst_lock, + &args, &res); inode_unlock(file_inode(dst)); + if (err >= 0) + break; if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d21104912676..d8b040bd9814 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mountdata->hostname, mountdata->mnt_path); - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata); if (!IS_ERR(mnt)) break; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 78ff8b63d5f7..67845220fc27 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -256,15 +256,12 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE }; const u32 nfs4_fs_locations_bitmap[3] = { - FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE + FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID | FATTR4_WORD0_FILEID | FATTR4_WORD0_FS_LOCATIONS, - FATTR4_WORD1_MODE - | FATTR4_WORD1_NUMLINKS - | FATTR4_WORD1_OWNER + FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED @@ -2343,8 +2340,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred, if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) return 0; - /* even though OPEN succeeded, access is denied. Close the file */ - nfs4_close_state(state, fmode); return -EACCES; } @@ -2385,8 +2380,10 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) if (status != 0) return status; } - if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) + if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + nfs4_sequence_free_slot(&o_res->seq_res); nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + } return 0; } @@ -2532,17 +2529,14 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) } nfs4_stateid_copy(&stateid, &delegation->stateid); - if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) || + !test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags)) { rcu_read_unlock(); nfs_finish_clear_delegation_stateid(state, &stateid); return; } - if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { - rcu_read_unlock(); - return; - } - cred = get_rpccred(delegation->cred); rcu_read_unlock(); status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); @@ -2708,6 +2702,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ret = PTR_ERR(state); if (IS_ERR(state)) goto out; + ctx->state = state; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) @@ -2733,7 +2728,6 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - ctx->state = state; if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) @@ -4990,7 +4984,7 @@ out: */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; + struct page *pages[NFS4ACL_MAXPAGES + 1] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, @@ -5004,13 +4998,9 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; int ret = -ENOMEM, i; - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. */ - if (npages == 0) - npages = 1; if (npages > ARRAY_SIZE(pages)) return -ERANGE; @@ -6426,7 +6416,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irqrestore(&q->lock, flags); - freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT); } finish_wait(q, &wait); @@ -6685,9 +6675,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[3] = { - [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - }; + u32 bitmask[3]; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -6706,12 +6694,15 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, dprintk("%s: start\n", __func__); + bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* Ask for the fileid of the absent filesystem if mounted_on_fileid * is not supported */ if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + bitmask[0] &= ~FATTR4_WORD0_FILEID; else - bitmask[0] |= FATTR4_WORD0_FILEID; + bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; @@ -7417,7 +7408,7 @@ static void nfs4_exchange_id_done(struct rpc_task *task, void *data) cdata->res.server_scope = NULL; } /* Save the EXCHANGE_ID verifier session trunk tests */ - memcpy(clp->cl_confirm.data, cdata->args.verifier->data, + memcpy(clp->cl_confirm.data, cdata->args.verifier.data, sizeof(clp->cl_confirm.data)); } out: @@ -7430,11 +7421,11 @@ static void nfs4_exchange_id_release(void *data) struct nfs41_exchange_id_data *cdata = (struct nfs41_exchange_id_data *)data; - nfs_put_client(cdata->args.client); if (cdata->xprt) { xprt_put(cdata->xprt); rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); } + nfs_put_client(cdata->args.client); kfree(cdata->res.impl_id); kfree(cdata->res.server_scope); kfree(cdata->res.server_owner); @@ -7454,7 +7445,6 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = { static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, u32 sp4_how, struct rpc_xprt *xprt) { - nfs4_verifier verifier; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], .rpc_cred = cred, @@ -7477,8 +7467,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (!calldata) goto out; - if (!xprt) - nfs4_init_boot_verifier(clp, &verifier); + nfs4_init_boot_verifier(clp, &calldata->args.verifier); status = nfs4_init_uniform_client_string(clp); if (status) @@ -7523,9 +7512,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, task_setup_data.rpc_xprt = xprt; task_setup_data.flags = RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; - calldata->args.verifier = &clp->cl_confirm; - } else { - calldata->args.verifier = &verifier; + memcpy(calldata->args.verifier.data, clp->cl_confirm.data, + sizeof(calldata->args.verifier.data)); } calldata->args.client = clp; #ifdef CONFIG_NFS_V4_1_MIGRATION @@ -7541,10 +7529,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out_impl_id; - } + if (IS_ERR(task)) + return PTR_ERR(task); if (!xprt) { status = rpc_wait_for_completion_task(task); @@ -7572,6 +7558,7 @@ out_server_owner: kfree(calldata->res.server_owner); out_calldata: kfree(calldata); + nfs_put_client(clp); goto out; } @@ -8437,6 +8424,7 @@ static void nfs4_layoutget_release(void *calldata) size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); + nfs4_sequence_free_slot(&lgp->res.seq_res); nfs4_free_pages(lgp->args.layout.pages, max_pages); pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); @@ -8511,7 +8499,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); - nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0959c9661662..92671914067f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1079,6 +1079,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) case -NFS4ERR_BADXDR: case -NFS4ERR_RESOURCE: case -NFS4ERR_NOFILEHANDLE: + case -NFS4ERR_MOVED: /* Non-seqid mutating errors */ return; }; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index cfb8f7ce5cf6..20cd8500452a 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -201,17 +201,13 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_ARGS(clp, error), TP_STRUCT__entry( - __string(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)) + __string(dstaddr, clp->cl_hostname) __field(int, error) ), TP_fast_assign( __entry->error = error; - __assign_str(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); + __assign_str(dstaddr, clp->cl_hostname); ), TP_printk( @@ -1103,9 +1099,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_fast_assign( @@ -1118,9 +1112,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_printk( @@ -1162,9 +1154,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -1179,9 +1169,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fc89e5ed07ee..5e2724a928ed 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1761,7 +1761,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, int len = 0; encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); - encode_nfs4_verifier(xdr, args->verifier); + encode_nfs4_verifier(xdr, &args->verifier); encode_string(xdr, strlen(args->client->cl_owner_id), args->client->cl_owner_id); @@ -2492,7 +2492,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz; encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 965db474f4b0..3d17fc82b9fe 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,13 +29,14 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; -static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount, + gfp_t gfp_flags) { p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + p->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); if (!p->pagevec) p->npages = 0; } @@ -496,16 +497,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) } EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); -/* - * nfs_pgio_header_free - Free a read or write header - * @hdr: The header to free - */ -void nfs_pgio_header_free(struct nfs_pgio_header *hdr) -{ - hdr->rw_ops->rw_free_header(hdr); -} -EXPORT_SYMBOL_GPL(nfs_pgio_header_free); - /** * nfs_pgio_data_destroy - make @hdr suitable for reuse * @@ -514,14 +505,24 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * * @hdr: A header that has had nfs_generic_pgio called */ -void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) +static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) { if (hdr->args.context) put_nfs_open_context(hdr->args.context); if (hdr->page_array.pagevec != hdr->page_array.page_array) kfree(hdr->page_array.pagevec); } -EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); + +/* + * nfs_pgio_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_pgio_header_free(struct nfs_pgio_header *hdr) +{ + nfs_pgio_data_destroy(hdr); + hdr->rw_ops->rw_free_header(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call @@ -635,7 +636,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); static void nfs_pgio_error(struct nfs_pgio_header *hdr) { set_bit(NFS_IOHDR_REDO, &hdr->flags); - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -646,7 +646,6 @@ static void nfs_pgio_error(struct nfs_pgio_header *hdr) static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -681,6 +680,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, { struct nfs_pgio_mirror *new; int i; + gfp_t gfp_flags = GFP_KERNEL; desc->pg_moreio = 0; desc->pg_inode = inode; @@ -700,8 +700,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, if (pg_ops->pg_get_mirror_count) { /* until we have a request, we don't have an lseg and no * idea how many mirrors there will be */ + if (desc->pg_rw_ops->rw_mode == FMODE_WRITE) + gfp_flags = GFP_NOIO; new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, - sizeof(struct nfs_pgio_mirror), GFP_KERNEL); + sizeof(struct nfs_pgio_mirror), gfp_flags); desc->pg_mirrors_dynamic = new; desc->pg_mirrors = new; @@ -755,9 +757,12 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct list_head *head = &mirror->pg_list; struct nfs_commit_info cinfo; unsigned int pagecount, pageused; + gfp_t gfp_flags = GFP_KERNEL; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { + if (desc->pg_rw_ops->rw_mode == FMODE_WRITE) + gfp_flags = GFP_NOIO; + if (!nfs_pgarray_set(&hdr->page_array, pagecount, gfp_flags)) { nfs_pgio_error(hdr); desc->pg_error = -ENOMEM; return desc->pg_error; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 415d7e69bc5e..b7a07ba8783a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2145,7 +2145,6 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } @@ -2257,7 +2256,6 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ddce94ce8142..51bf1f9ab287 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1339,7 +1339,7 @@ static int nfs_parse_mount_options(char *raw, mnt->options |= NFS_OPTION_MIGRATION; break; case Opt_nomigration: - mnt->options &= NFS_OPTION_MIGRATION; + mnt->options &= ~NFS_OPTION_MIGRATION; break; /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53211838f72a..e4772a8340f8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -548,9 +548,9 @@ static void nfs_write_error_remove_page(struct nfs_page *req) { nfs_unlock_request(req); nfs_end_page_writeback(req); - nfs_release_request(req); generic_error_remove_page(page_file_mapping(req->wb_page), req->wb_page); + nfs_release_request(req); } /* diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 0780ff864539..3e396dbb1eb9 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -23,7 +23,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, { struct nfsd4_layout_seg *seg = &args->lg_seg; struct super_block *sb = inode->i_sb; - u32 block_size = (1 << inode->i_blkbits); + u32 block_size = i_blocksize(inode); struct pnfs_block_extent *bex; struct iomap iomap; u32 device_generation = 0; @@ -180,7 +180,7 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, int nr_iomaps; nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + lcp->lc_up_len, &iomaps, i_blocksize(inode)); if (nr_iomaps < 0) return nfserrno(nr_iomaps); @@ -372,7 +372,7 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, int nr_iomaps; nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + lcp->lc_up_len, &iomaps, i_blocksize(inode)); if (nr_iomaps < 0) return nfserrno(nr_iomaps); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index dba2ff8eaa68..452334694a5d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -358,6 +358,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, { unsigned int len, v, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); + struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; p = decode_fh(p, &args->fh); if (!p) @@ -367,6 +369,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, args->count = ntohl(*p++); args->stable = ntohl(*p++); len = args->len = ntohl(*p++); + if ((void *)p > head->iov_base + head->iov_len) + return 0; /* * The count must equal the amount of data passed. */ @@ -377,9 +381,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. */ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - + rqstp->rq_arg.tail[0].iov_len - hdr; + hdr = (void*)p - head->iov_base; + dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; /* * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that @@ -396,7 +399,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, len = args->len = max_blocksize; } rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; @@ -471,6 +474,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, /* first copy and check from the first page */ old = (char*)p; vec = &rqstp->rq_arg.head[0]; + if ((void *)old > vec->iov_base + vec->iov_len) + return 0; avail = vec->iov_len - (old - (char*)vec->iov_base); while (len && avail && *old) { *new++ = *old++; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 211dc2aed8e1..3069cd46ea66 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -753,6 +753,14 @@ int set_callback_cred(void) return 0; } +void cleanup_callback_cred(void) +{ + if (callback_cred) { + put_rpccred(callback_cred); + callback_cred = NULL; + } +} + static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index abb09b580389..022d95886d66 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1273,7 +1273,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) return NULL; } - if (!(exp->ex_layout_types & (1 << layout_type))) { + if (layout_type >= LAYOUT_TYPE_MAX || + !(exp->ex_layout_types & (1 << layout_type))) { dprintk("%s: layout type %d not supported\n", __func__, layout_type); return NULL; @@ -1782,6 +1783,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, opdesc->op_get_currentstateid(cstate, &op->u); op->status = opdesc->op_func(rqstp, cstate, &op->u); + /* Only from SEQUENCE */ + if (cstate->status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } if (!op->status) { if (opdesc->op_set_currentstateid) opdesc->op_set_currentstateid(cstate, &op->u); @@ -1792,14 +1799,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (need_wrongsec_check(rqstp)) op->status = check_nfsd_access(current_fh->fh_export, rqstp); } - encode_op: - /* Only from SEQUENCE */ - if (cstate->status == nfserr_replay_cache) { - dprintk("%s NFS4.1 replay from cache\n", __func__); - status = op->status; - goto out; - } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(&resp->xdr, op); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a0dee8ae9f97..9ebb2d7c8182 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3513,7 +3513,9 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; - if (local->st_stateowner == &oo->oo_owner) { + if (local->st_stateowner != &oo->oo_owner) + continue; + if (local->st_stid.sc_type == NFS4_OPEN_STID) { ret = local; atomic_inc(&ret->st_stid.sc_count); break; @@ -3522,6 +3524,52 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) return ret; } +static __be32 +nfsd4_verify_open_stid(struct nfs4_stid *s) +{ + __be32 ret = nfs_ok; + + switch (s->sc_type) { + default: + break; + case NFS4_CLOSED_STID: + case NFS4_CLOSED_DELEG_STID: + ret = nfserr_bad_stateid; + break; + case NFS4_REVOKED_DELEG_STID: + ret = nfserr_deleg_revoked; + } + return ret; +} + +/* Lock the stateid st_mutex, and deal with races with CLOSE */ +static __be32 +nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) +{ + __be32 ret; + + mutex_lock(&stp->st_mutex); + ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret != nfs_ok) + mutex_unlock(&stp->st_mutex); + return ret; +} + +static struct nfs4_ol_stateid * +nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *stp; + for (;;) { + spin_lock(&fp->fi_lock); + stp = nfsd4_find_existing_open(fp, open); + spin_unlock(&fp->fi_lock); + if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok) + break; + nfs4_put_stid(&stp->st_stid); + } + return stp; +} + static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) @@ -3566,6 +3614,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) mutex_init(&stp->st_mutex); mutex_lock(&stp->st_mutex); +retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); @@ -3590,7 +3639,11 @@ out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); if (retstp) { - mutex_lock(&retstp->st_mutex); + /* Handle races with CLOSE */ + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); stp = retstp; @@ -3967,7 +4020,8 @@ static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, statei { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + ret = find_stateid_by_type(cl, s, + NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); if (!ret) return NULL; return delegstateid(ret); @@ -3990,6 +4044,12 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; + if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + nfs4_put_stid(&deleg->dl_stid); + if (cl->cl_minorversion) + status = nfserr_deleg_revoked; + goto out; + } flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(deleg, flags); if (status) { @@ -4393,6 +4453,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; + bool new_stp = false; /* * Lookup file; if found, lookup stateid and check open request, @@ -4404,9 +4465,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; - spin_lock(&fp->fi_lock); - stp = nfsd4_find_existing_open(fp, open); - spin_unlock(&fp->fi_lock); + stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; status = nfserr_bad_stateid; @@ -4414,35 +4473,31 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } + if (!stp) { + stp = init_open_stateid(fp, open); + if (!open->op_stp) + new_stp = true; + } + /* * OPEN the file, or upgrade an existing OPEN. * If truncate fails, the OPEN fails. + * + * stp is already locked. */ - if (stp) { + if (!new_stp) { /* Stateid was found, this is an OPEN upgrade */ - mutex_lock(&stp->st_mutex); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { mutex_unlock(&stp->st_mutex); goto out; } } else { - /* stp is returned locked. */ - stp = init_open_stateid(fp, open); - /* See if we lost the race to some other thread */ - if (stp->st_access_bmap != 0) { - status = nfs4_upgrade_open(rqstp, fp, current_fh, - stp, open); - if (status) { - mutex_unlock(&stp->st_mutex); - goto out; - } - goto upgrade_out; - } status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { - mutex_unlock(&stp->st_mutex); + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); + mutex_unlock(&stp->st_mutex); goto out; } @@ -4451,7 +4506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } -upgrade_out: + nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); @@ -4677,7 +4732,7 @@ nfs4_laundromat(struct nfsd_net *nn) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); @@ -4858,6 +4913,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; + bool return_revoked = false; + + /* + * only return revoked delegations if explicitly asked. + * otherwise we report revoked or bad_stateid status. + */ + if (typemask & NFS4_REVOKED_DELEG_STID) + return_revoked = true; + else if (typemask & NFS4_DELEG_STID) + typemask |= NFS4_REVOKED_DELEG_STID; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; @@ -4872,6 +4937,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, *s = find_stateid_by_type(cstate->clp, stateid, typemask); if (!*s) return nfserr_bad_stateid; + if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + nfs4_put_stid(*s); + if (cstate->minorversion) + return nfserr_deleg_revoked; + return nfserr_bad_stateid; + } return nfs_ok; } @@ -5291,7 +5362,6 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) bool unhashed; LIST_HEAD(reaplist); - s->st_stid.sc_type = NFS4_CLOSED_STID; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); @@ -5330,10 +5400,12 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; + + stp->st_stid.sc_type = NFS4_CLOSED_STID; nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - mutex_unlock(&stp->st_mutex); nfsd4_close_open_stateid(stp); + mutex_unlock(&stp->st_mutex); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); @@ -7012,23 +7084,24 @@ nfs4_state_start(void) ret = set_callback_cred(); if (ret) - return -ENOMEM; + return ret; + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) { ret = -ENOMEM; - goto out_recovery; + goto out_cleanup_cred; } ret = nfsd4_create_callback_queue(); if (ret) goto out_free_laundry; set_max_delegations(); - return 0; out_free_laundry: destroy_workqueue(laundry_wq); -out_recovery: +out_cleanup_cred: + cleanup_callback_cred(); return ret; } @@ -7070,7 +7143,7 @@ nfs4_state_shutdown_net(struct net *net) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); @@ -7086,6 +7159,7 @@ nfs4_state_shutdown(void) { destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); + cleanup_callback_cred(); } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c2d2895a1ec1..2c4f7a22e128 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -129,7 +129,7 @@ static void next_decode_page(struct nfsd4_compoundargs *argp) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + (argp->pagelen>>2); + argp->end = argp->p + XDR_QUADLEN(argp->pagelen); argp->pagelen = 0; } else { argp->end = argp->p + (PAGE_SIZE>>2); @@ -1246,9 +1246,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) argp->pagelen -= pages * PAGE_SIZE; len -= pages * PAGE_SIZE; - argp->p = (__be32 *)page_address(argp->pagelist[0]); - argp->pagelist++; - argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); + next_decode_page(argp); } argp->p += XDR_QUADLEN(len); @@ -2793,9 +2791,16 @@ out_acl: } #endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, - NFSD_SUPPATTR_EXCLCREAT_WORD1, - NFSD_SUPPATTR_EXCLCREAT_WORD2); + u32 supp[3]; + + supp[0] = nfsd_suppattrs0(minorversion); + supp[1] = nfsd_suppattrs1(minorversion); + supp[2] = nfsd_suppattrs2(minorversion); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); if (status) goto out; } @@ -4081,8 +4086,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getdeviceinfo *gdev) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[gdev->gd_layout_type]; + const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; __be32 *p; @@ -4099,6 +4103,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, /* If maxcount is 0 then just update notifications */ if (gdev->gd_maxcount != 0) { + ops = nfsd4_layout_ops[gdev->gd_layout_type]; nfserr = ops->encode_getdeviceinfo(xdr, gdev); if (nfserr) { /* @@ -4151,8 +4156,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutget *lgp) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[lgp->lg_layout_type]; + const struct nfsd4_layout_ops *ops; __be32 *p; dprintk("%s: err %d\n", __func__, nfserr); @@ -4175,6 +4179,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(lgp->lg_seg.iomode); *p++ = cpu_to_be32(lgp->lg_layout_type); + ops = nfsd4_layout_ops[lgp->lg_layout_type]; nfserr = ops->encode_layoutget(xdr, lgp); out: kfree(lgp->lg_content); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 010aff5c5a79..536009e50387 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -790,6 +790,7 @@ nfserrno (int errno) { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, }; int i; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a2b65fc56dd6..1645b977c9c6 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -733,6 +733,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr) return nfserr; } +/* + * A write procedure can have a large argument, and a read procedure can + * have a large reply, but no NFSv2 or NFSv3 procedure has argument and + * reply that can both be larger than a page. The xdr code has taken + * advantage of this assumption to be a sloppy about bounds checking in + * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that + * problem, we enforce these assumptions here: + */ +static bool nfs_request_too_big(struct svc_rqst *rqstp, + struct svc_procedure *proc) +{ + /* + * The ACL code has more careful bounds-checking and is not + * susceptible to this problem: + */ + if (rqstp->rq_prog != NFS_PROGRAM) + return false; + /* + * Ditto NFSv4 (which can in theory have argument and reply both + * more than a page): + */ + if (rqstp->rq_vers >= 4) + return false; + /* The reply will be small, we're OK: */ + if (proc->pc_xdrressize > 0 && + proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) + return false; + + return rqstp->rq_arg.len > PAGE_SIZE; +} + int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -745,6 +776,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) rqstp->rq_vers, rqstp->rq_proc); proc = rqstp->rq_procinfo; + if (nfs_request_too_big(rqstp, proc)) { + dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); + *statp = rpc_garbage_args; + return 1; + } /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 41b468a6a90f..de07ff625777 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_writeargs *args) { unsigned int len, hdr, dlen; + struct kvec *head = rqstp->rq_arg.head; int v; p = decode_fh(p, &args->fh); @@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. */ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - - hdr; + hdr = (void*)p - head->iov_base; + if (hdr > head->iov_len) + return 0; + dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; /* * Round the length of the data which was specified up to @@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, return 0; rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4516e8b7d776..005c911b34ac 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -615,6 +615,7 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, extern __be32 nfs4_check_open_reclaim(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn); extern int set_callback_cred(void); +extern void cleanup_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8ca642fe9b21..b829cc9a9b39 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -377,7 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, __be32 err; int host_err; bool get_write_count; - int size_change = 0; + bool size_change = (iap->ia_valid & ATTR_SIZE); if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -390,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Get inode */ err = fh_verify(rqstp, fhp, ftype, accmode); if (err) - goto out; + return err; if (get_write_count) { host_err = fh_want_write(fhp); if (host_err) - return nfserrno(host_err); + goto out; } dentry = fhp->fh_dentry; @@ -405,20 +405,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~ATTR_MODE; if (!iap->ia_valid) - goto out; + return 0; nfsd_sanitize_attrs(inode, iap); + if (check_guard && guardtime != inode->i_ctime.tv_sec) + return nfserr_notsync; + /* * The size case is special, it changes the file in addition to the - * attributes. + * attributes, and file systems don't expect it to be mixed with + * "random" attribute changes. We thus split out the size change + * into a separate call to ->setattr, and do the rest as a separate + * setattr call. */ - if (iap->ia_valid & ATTR_SIZE) { + if (size_change) { err = nfsd_get_write_access(rqstp, fhp, iap); if (err) - goto out; - size_change = 1; + return err; + } + fh_lock(fhp); + if (size_change) { /* * RFC5661, Section 18.30.4: * Changing the size of a file with SETATTR indirectly @@ -426,29 +434,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * * (and similar for the older RFCs) */ - if (iap->ia_size != i_size_read(inode)) - iap->ia_valid |= ATTR_MTIME; - } + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; - iap->ia_valid |= ATTR_CTIME; + host_err = notify_change(dentry, &size_attr, NULL); + if (host_err) + goto out_unlock; + iap->ia_valid &= ~ATTR_SIZE; - if (check_guard && guardtime != inode->i_ctime.tv_sec) { - err = nfserr_notsync; - goto out_put_write_access; + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + goto out_unlock; } - fh_lock(fhp); + iap->ia_valid |= ATTR_CTIME; host_err = notify_change(dentry, iap, NULL); - fh_unlock(fhp); - err = nfserrno(host_err); -out_put_write_access: +out_unlock: + fh_unlock(fhp); if (size_change) put_write_access(inode); - if (!err) - err = nfserrno(commit_metadata(fhp)); out: - return err; + if (!host_err) + host_err = commit_metadata(fhp); + return nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index d5c23da43513..c21e0b4454a6 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -50,7 +50,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) brelse(bh); BUG(); } - memset(bh->b_data, 0, 1 << inode->i_blkbits); + memset(bh->b_data, 0, i_blocksize(inode)); bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index c7f4fef9ebf5..7ffe71a8dfb9 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -51,7 +51,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; - inode_add_bytes(inode, (1 << inode->i_blkbits) * n); + inode_add_bytes(inode, i_blocksize(inode) * n); if (root) atomic64_add(n, &root->blocks_count); } @@ -60,7 +60,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; - inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); + inode_sub_bytes(inode, i_blocksize(inode) * n); if (root) atomic64_sub(n, &root->blocks_count); } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index d56d3a5bea88..98835ed6bef4 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -57,7 +57,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_mapped(bh); kaddr = kmap_atomic(bh->b_page); - memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + memset(kaddr + bh_offset(bh), 0, i_blocksize(inode)); if (init_block) init_block(inode, bh, kaddr); flush_dcache_page(bh->b_page); @@ -501,7 +501,7 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, struct nilfs_mdt_info *mi = NILFS_MDT(inode); mi->mi_entry_size = entry_size; - mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_entries_per_block = i_blocksize(inode) / entry_size; mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index bedcae2c28e6..36362d4bc344 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -723,7 +723,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, lock_page(page); if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); + create_empty_buffers(page, i_blocksize(inode), 0); unlock_page(page); bh = head = page_buffers(page); @@ -1956,8 +1956,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err, ii->vfs_inode.i_ino); return err; } - mark_buffer_dirty(ibh); - nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; @@ -1966,6 +1964,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, goto retry; } + // Always redirty the buffer to avoid race condition + mark_buffer_dirty(ii->i_bh); + nilfs_mdt_mark_dirty(ifile); + clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 7ebfca6a1427..258e8f635148 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -294,27 +294,37 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, } ret = copy_event_to_user(group, kevent, buf); + if (unlikely(ret == -EOPENSTALE)) { + /* + * We cannot report events with stale fd so drop it. + * Setting ret to 0 will continue the event loop and + * do the right thing if there are no more events to + * read (i.e. return bytes read, -EAGAIN or wait). + */ + ret = 0; + } + /* * Permission events get queued to wait for response. Other * events can be destroyed now. */ if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { fsnotify_destroy_event(group, kevent); - if (ret < 0) - break; } else { #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (ret < 0) { + if (ret <= 0) { FANOTIFY_PE(kevent)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); - break; + } else { + spin_lock(&group->notification_lock); + list_add_tail(&kevent->list, + &group->fanotify_data.access_list); + spin_unlock(&group->notification_lock); } - spin_lock(&group->notification_lock); - list_add_tail(&kevent->list, - &group->fanotify_data.access_list); - spin_unlock(&group->notification_lock); #endif } + if (ret < 0) + break; buf += ret; count -= ret; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index db39de2dd4cb..a64adc2fced9 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -104,16 +104,20 @@ int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) if (unlikely(!fsnotify_inode_watches_children(p_inode))) __fsnotify_update_child_dentry_flags(p_inode); else if (p_inode->i_fsnotify_mask & mask) { + struct name_snapshot name; + /* we are notifying a parent so come up with the new mask which * specifies these are events which came from a child. */ mask |= FS_EVENT_ON_CHILD; + take_dentry_name_snapshot(&name, dentry); if (path) ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, - dentry->d_name.name, 0); + name.name, 0); else ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name, 0); + name.name, 0); + release_dentry_name_snapshot(&name); } dput(parent); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f72712f6c28d..06089becca60 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7310,13 +7310,24 @@ out: static int ocfs2_trim_extent(struct super_block *sb, struct ocfs2_group_desc *gd, - u32 start, u32 count) + u64 group, u32 start, u32 count) { u64 discard, bcount; + struct ocfs2_super *osb = OCFS2_SB(sb); bcount = ocfs2_clusters_to_blocks(sb, count); - discard = le64_to_cpu(gd->bg_blkno) + - ocfs2_clusters_to_blocks(sb, start); + discard = ocfs2_clusters_to_blocks(sb, start); + + /* + * For the first cluster group, the gd->bg_blkno is not at the start + * of the group, but at an offset from the start. If we add it while + * calculating discard for first group, we will wrongly start fstrim a + * few blocks after the desried start block and the range can cross + * over into the next cluster group. So, add it only if this is not + * the first cluster group. + */ + if (group != osb->first_cluster_group_blkno) + discard += le64_to_cpu(gd->bg_blkno); trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); @@ -7324,7 +7335,7 @@ static int ocfs2_trim_extent(struct super_block *sb, } static int ocfs2_trim_group(struct super_block *sb, - struct ocfs2_group_desc *gd, + struct ocfs2_group_desc *gd, u64 group, u32 start, u32 max, u32 minbits) { int ret = 0, count = 0, next; @@ -7343,7 +7354,7 @@ static int ocfs2_trim_group(struct super_block *sb, next = ocfs2_find_next_bit(bitmap, max, start); if ((next - start) >= minbits) { - ret = ocfs2_trim_extent(sb, gd, + ret = ocfs2_trim_extent(sb, gd, group, start, next - start); if (ret < 0) { mlog_errno(ret); @@ -7441,7 +7452,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) } gd = (struct ocfs2_group_desc *)gd_bh->b_data; - cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); + cnt = ocfs2_trim_group(sb, gd, group, + first_bit, last_bit, minlen); brelse(gd_bh); gd_bh = NULL; if (cnt < 0) { diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..f2961b13e8c5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -599,7 +599,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; - unsigned int bsize = 1 << inode->i_blkbits; + unsigned int bsize = i_blocksize(inode); if (!page_has_buffers(page)) create_empty_buffers(page, bsize, 0); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 636abcbd4650..5e8709aa1e7e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2242,13 +2242,13 @@ unlock: spin_unlock(&o2hb_live_lock); } -static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, const char *page, size_t count) { unsigned long tmp; @@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, } -CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_dead_threshold, &o2hb_heartbeat_group_attr_mode, NULL, }; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index dd5cb8bcefd1..eef324823311 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2419,6 +2419,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_lockres_put(res); continue; } + dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 77d1632e905d..8dce4099a6ca 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) init_waitqueue_head(&res->l_event); INIT_LIST_HEAD(&res->l_blocked_list); INIT_LIST_HEAD(&res->l_mask_waiters); + INIT_LIST_HEAD(&res->l_holders); } void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, @@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) res->l_flags = 0UL; } +/* + * Keep a list of processes who have interest in a lockres. + * Note: this is now only uesed for check recursive cluster locking. + */ +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + INIT_LIST_HEAD(&oh->oh_list); + oh->oh_owner_pid = get_pid(task_pid(current)); + + spin_lock(&lockres->l_lock); + list_add_tail(&oh->oh_list, &lockres->l_holders); + spin_unlock(&lockres->l_lock); +} + +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + spin_lock(&lockres->l_lock); + list_del(&oh->oh_list); + spin_unlock(&lockres->l_lock); + + put_pid(oh->oh_owner_pid); +} + +static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_lock_holder *oh; + struct pid *pid; + + /* look in the list of holders for one with the current task as owner */ + spin_lock(&lockres->l_lock); + pid = task_pid(current); + list_for_each_entry(oh, &lockres->l_holders, oh_list) { + if (oh->oh_owner_pid == pid) { + spin_unlock(&lockres->l_lock); + return 1; + } + } + spin_unlock(&lockres->l_lock); + + return 0; +} + static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { @@ -2333,8 +2378,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, goto getbh; } - if (ocfs2_mount_local(osb)) - goto local; + if ((arg_flags & OCFS2_META_LOCK_GETBH) || + ocfs2_mount_local(osb)) + goto update; if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); @@ -2363,7 +2409,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); -local: +update: /* * We only see this flag if we're being called from * ocfs2_read_locked_inode(). It means we're locking an inode @@ -2497,6 +2543,59 @@ void ocfs2_inode_unlock(struct inode *inode, ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); } +/* + * This _tracker variantes are introduced to deal with the recursive cluster + * locking issue. The idea is to keep track of a lock holder on the stack of + * the current process. If there's a lock holder on the stack, we know the + * task context is already protected by cluster locking. Currently, they're + * used in some VFS entry routines. + * + * return < 0 on error, return == 0 if there's no lock holder on the stack + * before this call, return == 1 if this call would be a recursive locking. + */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh) +{ + int status; + int arg_flags = 0, has_locked; + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + has_locked = ocfs2_is_locked_by_me(lockres); + /* Just get buffer head if the cluster lock has been taken */ + if (has_locked) + arg_flags = OCFS2_META_LOCK_GETBH; + + if (likely(!has_locked || ret_bh)) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + } + if (!has_locked) + ocfs2_add_holder(lockres, oh); + + return has_locked; +} + +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock) +{ + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + if (!had_lock) { + ocfs2_remove_holder(lockres, oh); + ocfs2_inode_unlock(inode, ex); + } +} + int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) { struct ocfs2_lock_res *lockres; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index d293a22c32c5..a7fc18ba0dc1 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -70,6 +70,11 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +struct ocfs2_lock_holder { + struct list_head oh_list; + struct pid *oh_owner_pid; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -77,6 +82,8 @@ struct ocfs2_orphan_scan_lvb { #define OCFS2_META_LOCK_NOQUEUE (0x02) /* don't block waiting for the downconvert thread, instead return -EAGAIN */ #define OCFS2_LOCK_NONBLOCK (0x04) +/* just get back disk inode bh if we've got cluster lock. */ +#define OCFS2_META_LOCK_GETBH (0x08) /* Locking subclasses of inode cluster lock */ enum { @@ -170,4 +177,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); /* To set the locking protocol on module initialization */ void ocfs2_set_locking_protocol(void); + +/* The _tracker pair is used to avoid cluster recursive locking */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh); +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock); + #endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 000c234d7bbd..05a0fb9854f9 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -808,7 +808,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* We know that zero_from is block aligned */ for (block_start = zero_from; block_start < zero_to; block_start = block_end) { - block_end = block_start + (1 << inode->i_blkbits); + block_end = block_start + i_blocksize(inode); /* * block_start is block-aligned. Bump it by one to force @@ -1166,6 +1166,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + /* + * Here we should wait dio to finish before inode lock + * to avoid a deadlock between ocfs2_setattr() and + * ocfs2_dio_end_io_write() + */ + inode_dio_wait(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1186,8 +1193,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) goto bail_unlock; - inode_dio_wait(inode); - if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index e63af7ddfe68..594575e380e8 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -172,6 +172,7 @@ struct ocfs2_lock_res { struct list_head l_blocked_list; struct list_head l_mask_waiters; + struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; diff --git a/fs/open.c b/fs/open.c index d3ed8171e8e0..a6c6244f4993 100644 --- a/fs/open.c +++ b/fs/open.c @@ -902,6 +902,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode = ACC_MODE(flags); + /* + * Clear out all open flags we don't know about so that we don't report + * them in fcntl(F_GETFD) or similar interfaces. + */ + flags &= VALID_OPEN_FLAGS; + if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; else diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 7a3754488312..9409aac232f7 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -61,9 +61,9 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type) return acl; } -int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, + int type) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); int error = 0; void *value = NULL; size_t size = 0; @@ -72,22 +72,6 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - umode_t mode; - - error = posix_acl_update_mode(inode, &mode, &acl); - if (error) { - gossip_err("%s: posix_acl_update_mode err: %d\n", - __func__, - error); - return error; - } - - if (inode->i_mode != mode) - SetModeFlag(orangefs_inode); - inode->i_mode = mode; - mark_inode_dirty_sync(inode); - } break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; @@ -132,6 +116,29 @@ out: return error; } +int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS && acl) { + umode_t mode; + + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) { + gossip_err("%s: posix_acl_update_mode err: %d\n", + __func__, + error); + return error; + } + + if (inode->i_mode != mode) + SetModeFlag(ORANGEFS_I(inode)); + inode->i_mode = mode; + mark_inode_dirty_sync(inode); + } + return __orangefs_set_acl(inode, acl, type); +} + int orangefs_init_acl(struct inode *inode, struct inode *dir) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); @@ -146,13 +153,14 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir) return error; if (default_acl) { - error = orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __orangefs_set_acl(inode, default_acl, + ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!error) - error = orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index 516ffb4dc9a0..fe2cbeb90772 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -208,14 +208,19 @@ restart: continue; /* * Skip ops whose filesystem we don't know about unless - * it is being mounted. + * it is being mounted or unmounted. It is possible for + * a filesystem we don't know about to be unmounted if + * it fails to mount in the kernel after userspace has + * been sent the mount request. */ /* XXX: is there a better way to detect this? */ } else if (ret == -1 && !(op->upcall.type == ORANGEFS_VFS_OP_FS_MOUNT || op->upcall.type == - ORANGEFS_VFS_OP_GETATTR)) { + ORANGEFS_VFS_OP_GETATTR || + op->upcall.type == + ORANGEFS_VFS_OP_FS_UMOUNT)) { gossip_debug(GOSSIP_DEV_DEBUG, "orangefs: skipping op tag %llu %s\n", llu(op->tag), get_opname_string(op)); @@ -402,8 +407,9 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, /* remove the op from the in progress hash table */ op = orangefs_devreq_remove_op(head.tag); if (!op) { - gossip_err("WARNING: No one's waiting for tag %llu\n", - llu(head.tag)); + gossip_debug(GOSSIP_DEV_DEBUG, + "%s: No one's waiting for tag %llu\n", + __func__, llu(head.tag)); return ret; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index ef3b4eb54cf2..08ecdeebd6f7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -223,8 +223,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) if (ret) goto out; - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { + if (iattr->ia_valid & ATTR_SIZE) { ret = orangefs_setattr_size(inode, iattr); if (ret) goto out; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index a290ff6ec756..7c315938e9c2 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -193,8 +193,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ORANGEFS_I(inode)->getattr_time = jiffies - 1; - gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d " "Found good inode [%lu] with count [%d]\n", diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 38887cc5577f..0748a26598fc 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -671,8 +671,10 @@ int orangefs_prepare_debugfs_help_string(int at_boot) */ cdm_element_count = orangefs_prepare_cdm_array(client_debug_array_string); - if (cdm_element_count <= 0) + if (cdm_element_count <= 0) { + kfree(new); goto out; + } for (i = 0; i < cdm_element_count; i++) { strlcat(new, "\t", string_size); @@ -963,13 +965,13 @@ int orangefs_debugfs_new_client_string(void __user *arg) int ret; ret = copy_from_user(&client_debug_array_string, - (void __user *)arg, - ORANGEFS_MAX_DEBUG_STRING_LEN); + (void __user *)arg, + ORANGEFS_MAX_DEBUG_STRING_LEN); if (ret != 0) { pr_info("%s: CLIENT_STRING: copy_from_user failed\n", __func__); - return -EIO; + return -EFAULT; } /* @@ -984,17 +986,18 @@ int orangefs_debugfs_new_client_string(void __user *arg) */ client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] = '\0'; - + pr_info("%s: client debug array string has been received.\n", __func__); if (!help_string_initialized) { /* Build a proper debug help string. */ - if (orangefs_prepare_debugfs_help_string(0)) { + ret = orangefs_prepare_debugfs_help_string(0); + if (ret) { gossip_err("%s: no debug help string \n", __func__); - return -EIO; + return ret; } } @@ -1007,7 +1010,7 @@ int orangefs_debugfs_new_client_string(void __user *arg) help_string_initialized++; - return ret; + return 0; } int orangefs_debugfs_new_debug(void __user *arg) diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index a3d84ffee905..f380f9ed1b28 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -50,8 +50,7 @@ * Misc constants. Please retain them as multiples of 8! * Otherwise 32-64 bit interactions will be messed up :) */ -#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000400 -#define ORANGEFS_MAX_DEBUG_ARRAY_LEN 0x00000800 +#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800 /* * The maximum number of directory entries in a single request is 96. diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3bf803d732c5..45dd8f27b2ac 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -249,6 +249,7 @@ struct orangefs_sb_info_s { char devname[ORANGEFS_MAX_SERVER_ADDR_LEN]; struct super_block *sb; int mount_pending; + int no_list; struct list_head list; }; diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 06af81f71e10..9b96b99539d6 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -306,7 +306,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) break; case S_IFDIR: inode->i_size = PAGE_SIZE; - orangefs_inode->blksize = (1 << inode->i_blkbits); + orangefs_inode->blksize = i_blocksize(inode); spin_lock(&inode->i_lock); inode_set_bytes(inode, inode->i_size); spin_unlock(&inode->i_lock); @@ -316,7 +316,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) if (new) { inode->i_size = (loff_t)strlen(new_op-> downcall.resp.getattr.link_target); - orangefs_inode->blksize = (1 << inode->i_blkbits); + orangefs_inode->blksize = i_blocksize(inode); ret = strscpy(orangefs_inode->link_target, new_op->downcall.resp.getattr.link_target, ORANGEFS_NAME_MAX); diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index c48859f16e7b..629d8c917fa6 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -115,6 +115,13 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) return &orangefs_inode->vfs_inode; } +static void orangefs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + kmem_cache_free(orangefs_inode_cache, orangefs_inode); +} + static void orangefs_destroy_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); @@ -123,7 +130,7 @@ static void orangefs_destroy_inode(struct inode *inode) "%s: deallocated %p destroying inode %pU\n", __func__, orangefs_inode, get_khandle_from_ino(inode)); - kmem_cache_free(orangefs_inode_cache, orangefs_inode); + call_rcu(&inode->i_rcu, orangefs_i_callback); } /* @@ -256,8 +263,13 @@ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb) if (!new_op) return -ENOMEM; new_op->upcall.req.features.features = 0; - ret = service_operation(new_op, "orangefs_features", 0); - orangefs_features = new_op->downcall.resp.features.features; + ret = service_operation(new_op, "orangefs_features", + ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); + if (!ret) + orangefs_features = + new_op->downcall.resp.features.features; + else + orangefs_features = 0; op_release(new_op); } else { orangefs_features = 0; @@ -481,7 +493,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (ret) { d = ERR_PTR(ret); - goto free_op; + goto free_sb_and_op; } /* @@ -507,6 +519,9 @@ struct dentry *orangefs_mount(struct file_system_type *fst, spin_unlock(&orangefs_superblocks_lock); op_release(new_op); + /* Must be removed from the list now. */ + ORANGEFS_SB(sb)->no_list = 0; + if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) @@ -521,6 +536,10 @@ struct dentry *orangefs_mount(struct file_system_type *fst, return dget(sb->s_root); +free_sb_and_op: + /* Will call orangefs_kill_sb with sb not in list. */ + ORANGEFS_SB(sb)->no_list = 1; + deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); if (ret == -EINVAL) { @@ -546,12 +565,14 @@ void orangefs_kill_sb(struct super_block *sb) */ orangefs_unmount_sb(sb); - /* remove the sb from our list of orangefs specific sb's */ - - spin_lock(&orangefs_superblocks_lock); - __list_del_entry(&ORANGEFS_SB(sb)->list); /* not list_del_init */ - ORANGEFS_SB(sb)->list.prev = NULL; - spin_unlock(&orangefs_superblocks_lock); + if (!ORANGEFS_SB(sb)->no_list) { + /* remove the sb from our list of orangefs specific sb's */ + spin_lock(&orangefs_superblocks_lock); + /* not list_del_init */ + __list_del_entry(&ORANGEFS_SB(sb)->list); + ORANGEFS_SB(sb)->list.prev = NULL; + spin_unlock(&orangefs_superblocks_lock); + } /* * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 74a81b1daaac..237c9c04dc3b 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err("Invalid key length (%d)\n", - (int)strlen(name)); + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } fsuid = from_kuid(&init_user_ns, current_fsuid()); fsgid = from_kgid(&init_user_ns, current_fsgid()); @@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name, struct orangefs_kernel_op_s *new_op = NULL; int ret = -ENOMEM; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; + down_write(&orangefs_inode->xattr_sem); new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR); if (!new_op) @@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name, "%s: name %s, buffer_size %zd\n", __func__, name, size); - if (size >= ORANGEFS_MAX_XATTR_VALUELEN || - flags < 0) { - gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n", - (int)size, - flags); + if (size > ORANGEFS_MAX_XATTR_VALUELEN) + return -EINVAL; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } internal_flag = convert_to_internal_xattr_flags(flags); - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name))); - return -EINVAL; - } - /* This is equivalent to a removexattr */ if (size == 0 && value == NULL) { gossip_debug(GOSSIP_XATTR_DEBUG, @@ -358,7 +348,7 @@ try_again: returned_count = new_op->downcall.resp.listxattr.returned_count; if (returned_count < 0 || - returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) { + returned_count > ORANGEFS_MAX_XATTR_LISTLEN) { gossip_err("%s: impossible value for returned_count:%d:\n", __func__, returned_count); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 0e100856c7b8..e7c8ac41e288 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1146,6 +1146,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) unsigned int stacklen = 0; unsigned int i; bool remote = false; + struct cred *cred; int err; err = -ENOMEM; @@ -1309,10 +1310,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) else sb->s_d_op = &ovl_dentry_operations; - ufs->creator_cred = prepare_creds(); - if (!ufs->creator_cred) + err = -ENOMEM; + ufs->creator_cred = cred = prepare_creds(); + if (!cred) goto out_put_lower_mnt; + /* Never override disk quota limits or use reserved space */ + cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); + err = -ENOMEM; oe = ovl_alloc_entry(numlower); if (!oe) diff --git a/fs/pnode.c b/fs/pnode.c index 234a9ac49958..d15c63e97ef1 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p) return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } +static inline struct mount *last_slave(struct mount *p) +{ + return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); +} + static inline struct mount *next_slave(struct mount *p) { return list_entry(p->mnt_slave.next, struct mount, mnt_slave); @@ -164,6 +169,19 @@ static struct mount *propagation_next(struct mount *m, } } +static struct mount *skip_propagation_subtree(struct mount *m, + struct mount *origin) +{ + /* + * Advance m such that propagation_next will not return + * the slaves of m. + */ + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + m = last_slave(m); + + return m; +} + static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { @@ -324,6 +342,21 @@ out: return ret; } +static struct mount *find_topper(struct mount *mnt) +{ + /* If there is exactly one mount covering mnt completely return it. */ + struct mount *child; + + if (!list_is_singular(&mnt->mnt_mounts)) + return NULL; + + child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); + if (child->mnt_mountpoint != mnt->mnt.mnt_root) + return NULL; + + return child; +} + /* * return true if the refcount is greater than count */ @@ -344,9 +377,8 @@ static inline int do_refcount_check(struct mount *mnt, int count) */ int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct mount *m, *child; + struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; - int ret = 0; if (mnt == parent) return do_refcount_check(mnt, refcnt); @@ -361,12 +393,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); - if (child && list_empty(&child->mnt_mounts) && - (ret = do_refcount_check(child, 1))) - break; + int count = 1; + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); + if (!child) + continue; + + /* Is there exactly one mount on the child that covers + * it completely whose reference should be ignored? + */ + topper = find_topper(child); + if (topper) + count += 1; + else if (!list_empty(&child->mnt_mounts)) + continue; + + if (do_refcount_check(child, count)) + return 1; } - return ret; + return 0; } /* @@ -383,60 +427,110 @@ void propagate_mount_unlock(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } } -/* - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. - */ -static void mark_umount_candidates(struct mount *mnt) +static void umount_one(struct mount *mnt, struct list_head *to_umount) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; - - BUG_ON(parent == mnt); - - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt_last(&m->mnt, - mnt->mnt_mountpoint); - if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { - SET_MNT_MARK(child); - } - } + CLEAR_MNT_MARK(mnt); + mnt->mnt.mnt_flags |= MNT_UMOUNT; + list_del_init(&mnt->mnt_child); + list_del_init(&mnt->mnt_umounting); + list_move_tail(&mnt->mnt_list, to_umount); } /* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ -static void __propagate_umount(struct mount *mnt) +static bool __propagate_umount(struct mount *mnt, + struct list_head *to_umount, + struct list_head *to_restore) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; - - BUG_ON(parent == mnt); + bool progress = false; + struct mount *child; - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { + /* + * The state of the parent won't change if this mount is + * already unmounted or marked as without children. + */ + if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) + goto out; - struct mount *child = __lookup_mnt_last(&m->mnt, - mnt->mnt_mountpoint); - /* - * umount the child only if the child has no children - * and the child is marked safe to unmount. - */ - if (!child || !IS_MNT_MARKED(child)) + /* Verify topper is the only grandchild that has not been + * speculatively unmounted. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (child->mnt_mountpoint == mnt->mnt.mnt_root) + continue; + if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) continue; - CLEAR_MNT_MARK(child); - if (list_empty(&child->mnt_mounts)) { - list_del_init(&child->mnt_child); - child->mnt.mnt_flags |= MNT_UMOUNT; - list_move_tail(&child->mnt_list, &mnt->mnt_list); + /* Found a mounted child */ + goto children; + } + + /* Mark mounts that can be unmounted if not locked */ + SET_MNT_MARK(mnt); + progress = true; + + /* If a mount is without children and not locked umount it. */ + if (!IS_MNT_LOCKED(mnt)) { + umount_one(mnt, to_umount); + } else { +children: + list_move_tail(&mnt->mnt_umounting, to_restore); + } +out: + return progress; +} + +static void umount_list(struct list_head *to_umount, + struct list_head *to_restore) +{ + struct mount *mnt, *child, *tmp; + list_for_each_entry(mnt, to_umount, mnt_list) { + list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { + /* topper? */ + if (child->mnt_mountpoint == mnt->mnt.mnt_root) + list_move_tail(&child->mnt_umounting, to_restore); + else + umount_one(child, to_umount); + } + } +} + +static void restore_mounts(struct list_head *to_restore) +{ + /* Restore mounts to a clean working state */ + while (!list_empty(to_restore)) { + struct mount *mnt, *parent; + struct mountpoint *mp; + + mnt = list_first_entry(to_restore, struct mount, mnt_umounting); + CLEAR_MNT_MARK(mnt); + list_del_init(&mnt->mnt_umounting); + + /* Should this mount be reparented? */ + mp = mnt->mnt_mp; + parent = mnt->mnt_parent; + while (parent->mnt.mnt_flags & MNT_UMOUNT) { + mp = parent->mnt_mp; + parent = parent->mnt_parent; } + if (parent != mnt->mnt_parent) + mnt_change_mountpoint(parent, mp, mnt); + } +} + +static void cleanup_umount_visitations(struct list_head *visited) +{ + while (!list_empty(visited)) { + struct mount *mnt = + list_first_entry(visited, struct mount, mnt_umounting); + list_del_init(&mnt->mnt_umounting); } } @@ -450,11 +544,68 @@ static void __propagate_umount(struct mount *mnt) int propagate_umount(struct list_head *list) { struct mount *mnt; + LIST_HEAD(to_restore); + LIST_HEAD(to_umount); + LIST_HEAD(visited); + + /* Find candidates for unmounting */ + list_for_each_entry_reverse(mnt, list, mnt_list) { + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + /* + * If this mount has already been visited it is known that it's + * entire peer group and all of their slaves in the propagation + * tree for the mountpoint has already been visited and there is + * no need to visit them again. + */ + if (!list_empty(&mnt->mnt_umounting)) + continue; + + list_add_tail(&mnt->mnt_umounting, &visited); + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt(&m->mnt, + mnt->mnt_mountpoint); + if (!child) + continue; + + if (!list_empty(&child->mnt_umounting)) { + /* + * If the child has already been visited it is + * know that it's entire peer group and all of + * their slaves in the propgation tree for the + * mountpoint has already been visited and there + * is no need to visit this subtree again. + */ + m = skip_propagation_subtree(m, parent); + continue; + } else if (child->mnt.mnt_flags & MNT_UMOUNT) { + /* + * We have come accross an partially unmounted + * mount in list that has not been visited yet. + * Remember it has been visited and continue + * about our merry way. + */ + list_add_tail(&child->mnt_umounting, &visited); + continue; + } + + /* Check the child and parents while progress is made */ + while (__propagate_umount(child, + &to_umount, &to_restore)) { + /* Is the parent a umount candidate? */ + child = child->mnt_parent; + if (list_empty(&child->mnt_umounting)) + break; + } + } + } - list_for_each_entry_reverse(mnt, list, mnt_list) - mark_umount_candidates(mnt); + umount_list(&to_umount, &to_restore); + restore_mounts(&to_restore); + cleanup_umount_visitations(&visited); + list_splice_tail(&to_umount, list); - list_for_each_entry(mnt, list, mnt_list) - __propagate_umount(mnt); return 0; } diff --git a/fs/pnode.h b/fs/pnode.h index 550f5a8b4fcf..dc87e65becd2 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, + struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/proc/array.c b/fs/proc/array.c index 81818adb8e9e..c932ec454625 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -60,6 +60,7 @@ #include <linux/tty.h> #include <linux/string.h> #include <linux/mman.h> +#include <linux/sched.h> #include <linux/proc_fs.h> #include <linux/ioport.h> #include <linux/uaccess.h> @@ -416,7 +417,15 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * esp and eip are intentionally zeroed out. There is no * non-racy way to read them without freezing the task. * Programs that need reliable values can use ptrace(2). + * + * The only exception is if the task is core dumping because + * a program is not able to use ptrace(2) in that case. It is + * safe because the task has stopped executing permanently. */ + if (permitted && (task->flags & PF_DUMPCORE)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } } get_task_comm(tcomm, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index ca651ac00660..e67fec3c9856 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3181,6 +3181,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[PROC_NUMBUF]; int len; + + cond_resched(); if (!has_pid_permissions(ns, iter.task, 2)) continue; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5f2dc2032c79..6047471575bb 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -471,6 +471,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->data = NULL; ent->proc_fops = NULL; ent->proc_iops = NULL; + parent->nlink++; if (proc_register(parent, ent) < 0) { kfree(ent); parent->nlink--; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 35b92d81692f..5138e781737a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -299,11 +299,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; - if (stack_guard_page_start(vma, start)) - start += PAGE_SIZE; end = vma->vm_end; - if (stack_guard_page_end(vma, end)) - end -= PAGE_SIZE; seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", @@ -899,7 +895,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + pmd_t pmd = *pmdp; + + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 14984d902a99..43033a3d66d5 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -704,6 +704,7 @@ int pstore_register(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_PMSG) pstore_register_pmsg(); + /* Start watching for new records, if desired. */ if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + msecs_to_jiffies(pstore_update_ms); @@ -726,6 +727,11 @@ EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { + /* Stop timer and make sure all work has finished. */ + pstore_update_ms = -1; + del_timer_sync(&pstore_timer); + flush_work(&pstore_work); + if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -825,7 +831,9 @@ static void pstore_timefunc(unsigned long dummy) schedule_work(&pstore_work); } - mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); + if (pstore_update_ms >= 0) + mod_timer(&pstore_timer, + jiffies + msecs_to_jiffies(pstore_update_ms)); } module_param(backend, charp, 0444); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 6ad831b9d1b8..8b09271e5d66 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -434,7 +434,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, for (i = 0; i < cxt->max_dump_cnt; i++) { cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0, &cxt->ecc_info, - cxt->memtype); + cxt->memtype, 0); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", @@ -471,7 +471,8 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, + cxt->memtype, 0); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 3975deec02f8..e11672aa4575 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -48,16 +48,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz) return atomic_read(&prz->buffer->start); } -static DEFINE_RAW_SPINLOCK(buffer_lock); - /* increase and wrap the start pointer, returning the old value */ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) { int old; int new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->start); new = old + a; @@ -65,7 +64,8 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) new -= prz->buffer_size; atomic_set(&prz->buffer->start, new); - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); return old; } @@ -75,9 +75,10 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) { size_t old; size_t new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->size); if (old == prz->buffer_size) @@ -89,7 +90,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) atomic_set(&prz->buffer->size, new); exit: - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); } static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, @@ -491,6 +493,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, prz->buffer->sig); } + /* Rewind missing or invalid memory area. */ prz->buffer->sig = sig; persistent_ram_zap(prz); @@ -517,7 +520,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, u32 sig, struct persistent_ram_ecc_info *ecc_info, - unsigned int memtype) + unsigned int memtype, u32 flags) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -528,6 +531,10 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, goto err; } + /* Initialize general buffer state. */ + raw_spin_lock_init(&prz->buffer_lock); + prz->flags = flags; + ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) goto err; diff --git a/fs/read_write.c b/fs/read_write.c index 190e0d362581..ba280596ec78 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -114,7 +114,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: @@ -122,7 +122,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; @@ -1232,7 +1232,7 @@ static size_t compat_writev(struct file *file, if (!(file->f_mode & FMODE_CAN_WRITE)) goto out; - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0); + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags); out: if (ret > 0) @@ -1518,6 +1518,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (flags != 0) return -EINVAL; + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 2f8c5c9bdaf6..b396eb09f288 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -189,7 +189,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, int ret = 0; th.t_trans_id = 0; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); if (logit) { reiserfs_write_lock(s); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 58b2dedb2a3a..bd4c727f4610 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -524,7 +524,7 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, * referenced in convert_tail_for_hole() that may be called from * reiserfs_get_block() */ - bh_result->b_size = (1 << inode->i_blkbits); + bh_result->b_size = i_blocksize(inode); ret = reiserfs_get_block(inode, iblock, bh_result, create | GET_BLOCK_NO_DANGLE); diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 3d2256a425ee..d92a1dc6ee70 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -37,7 +37,14 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { + if (type == ACL_TYPE_ACCESS && acl) { + error = posix_acl_update_mode(inode, &inode->i_mode, + &acl); + if (error) + goto unlock; + } error = __reiserfs_set_acl(&th, inode, type, acl); +unlock: reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); @@ -241,11 +248,6 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - } break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index d0f8a38dfafa..0186fe6d39f3 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -74,6 +74,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/major.h> #include "internal.h" static struct kmem_cache *romfs_inode_cachep; @@ -416,7 +417,22 @@ static void romfs_destroy_inode(struct inode *inode) static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + /* When calling huge_encode_dev(), + * use sb->s_bdev->bd_dev when, + * - CONFIG_ROMFS_ON_BLOCK defined + * use sb->s_dev when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD defined + * leave id as 0 when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD undefined + */ + if (sb->s_bdev) + id = huge_encode_dev(sb->s_bdev->bd_dev); + else if (sb->s_dev) + id = huge_encode_dev(sb->s_dev); buf->f_type = ROMFS_MAGIC; buf->f_namelen = ROMFS_MAXFN; @@ -489,6 +505,11 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_RDONLY | MS_NOATIME; sb->s_op = &romfs_super_ops; +#ifdef CONFIG_ROMFS_ON_MTD + /* Use same dev ID from the underlying mtdblock device */ + if (sb->s_mtd) + sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); +#endif /* read the image superblock and check it */ rsb = kmalloc(512, GFP_KERNEL); if (!rsb) diff --git a/fs/splice.c b/fs/splice.c index 63b8f54485dc..8dd79ecfd383 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -203,6 +203,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; + buf->flags = 0; pipe->nrbufs++; page_nr++; diff --git a/fs/stat.c b/fs/stat.c index bc045c7994e1..068fdbcc9e26 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -31,7 +31,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; stat->ctime = inode->i_ctime; - stat->blksize = (1 << inode->i_blkbits); + stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; } @@ -454,6 +454,7 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_bytes -= 512; } } +EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/super.c b/fs/super.c index c183835566c1..1058bf3e8724 100644 --- a/fs/super.c +++ b/fs/super.c @@ -470,7 +470,7 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; - if (!(flags & MS_KERNMOUNT) && + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -500,7 +500,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags, user_ns); + s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -541,8 +541,15 @@ struct super_block *sget(struct file_system_type *type, { struct user_namespace *user_ns = current_user_ns(); + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. + */ + if (flags & MS_SUBMOUNT) + user_ns = &init_user_ns; + /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index b803213d1307..39c75a86c67f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -108,7 +108,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = of->kn->parent->priv; - size_t len; + ssize_t len; /* * If buf != of->prealloc_buf, we don't know how @@ -117,13 +117,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); + if (len < 0) + return len; if (pos) { if (len <= pos) return 0; len -= pos; memmove(buf, buf + pos, len); } - return min(count, len); + return min_t(ssize_t, count, len); } /* kernfs write callback for regular sysfs files */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 9ae4abb4110b..ab8dd1538381 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -40,6 +40,7 @@ struct timerfd_ctx { short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; + spinlock_t cancel_lock; bool might_cancel; }; @@ -112,7 +113,7 @@ void timerfd_clock_was_set(void) rcu_read_unlock(); } -static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { ctx->might_cancel = false; @@ -122,6 +123,13 @@ static void timerfd_remove_cancel(struct timerfd_ctx *ctx) } } +static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +{ + spin_lock(&ctx->cancel_lock); + __timerfd_remove_cancel(ctx); + spin_unlock(&ctx->cancel_lock); +} + static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) @@ -132,6 +140,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx) static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { + spin_lock(&ctx->cancel_lock); if ((ctx->clockid == CLOCK_REALTIME || ctx->clockid == CLOCK_REALTIME_ALARM) && (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { @@ -141,9 +150,10 @@ static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) list_add_rcu(&ctx->clist, &cancel_list); spin_unlock(&cancel_lock); } - } else if (ctx->might_cancel) { - timerfd_remove_cancel(ctx); + } else { + __timerfd_remove_cancel(ctx); } + spin_unlock(&ctx->cancel_lock); } static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) @@ -400,6 +410,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return -ENOMEM; init_waitqueue_head(&ctx->wqh); + spin_lock_init(&ctx->cancel_lock); ctx->clockid = clockid; if (isalarm(ctx)) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ca16c5d7bab1..87ab02e2d666 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -622,6 +622,11 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, return err; lock_2_inodes(dir, inode); + + /* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */ + if (inode->i_nlink == 0) + ubifs_delete_orphan(c, inode->i_ino); + inc_nlink(inode); ihold(inode); inode->i_ctime = ubifs_current_time(inode); @@ -641,6 +646,8 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; drop_nlink(inode); + if (inode->i_nlink == 0) + ubifs_add_orphan(c, inode->i_ino); unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); @@ -1088,9 +1095,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, struct timespec time; unsigned int uninitialized_var(saved_nlink); - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - /* * Budget request settings: deletion direntry, new direntry, removing * the old inode, and changing old and new parent directory inodes. diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 91bc76dc559e..7d764e3b6c79 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -576,7 +576,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, /* Make sure to also account for extended attributes */ len += host_ui->data_len; - dent = kmalloc(len, GFP_NOFS); + dent = kzalloc(len, GFP_NOFS); if (!dent) return -ENOMEM; @@ -952,7 +952,7 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, if (twoparents) len += plen; - dent1 = kmalloc(len, GFP_NOFS); + dent1 = kzalloc(len, GFP_NOFS); if (!dent1) return -ENOMEM; @@ -1102,7 +1102,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); if (move) len += plen; - dent = kmalloc(len, GFP_NOFS); + dent = kzalloc(len, GFP_NOFS); if (!dent) return -ENOMEM; @@ -1466,7 +1466,7 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); - xent = kmalloc(len, GFP_NOFS); + xent = kzalloc(len, GFP_NOFS); if (!xent) return -ENOMEM; @@ -1573,7 +1573,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, aligned_len1 = ALIGN(len1, 8); aligned_len = aligned_len1 + ALIGN(len2, 8); - ino = kmalloc(aligned_len, GFP_NOFS); + ino = kzalloc(aligned_len, GFP_NOFS); if (!ino) return -ENOMEM; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index aad46401ede5..035943501b9f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1214,7 +1214,7 @@ int udf_setsize(struct inode *inode, loff_t newsize) { int err; struct udf_inode_info *iinfo; - int bsize = 1 << inode->i_blkbits; + int bsize = i_blocksize(inode); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -1243,8 +1243,8 @@ int udf_setsize(struct inode *inode, loff_t newsize) return err; } set_size: - truncate_setsize(inode, newsize); up_write(&iinfo->i_data_sem); + truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); @@ -1261,9 +1261,9 @@ set_size: udf_get_block); if (err) return err; + truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); - truncate_setsize(inode, newsize); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 67e085d591d8..a81b97013021 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -81,7 +81,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ufs_error (sb, "ufs_free_fragments", "bit already cleared for fragment %u", i); } - + + inode_sub_bytes(inode, count << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -183,6 +184,7 @@ do_more: ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); @@ -494,6 +496,20 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return 0; } +static bool try_add_frags(struct inode *inode, unsigned frags) +{ + unsigned size = frags * i_blocksize(inode); + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, size); + if (unlikely((u32)inode->i_blocks != inode->i_blocks)) { + __inode_sub_bytes(inode, size); + spin_unlock(&inode->i_lock); + return false; + } + spin_unlock(&inode->i_lock); + return true; +} + static u64 ufs_add_fragments(struct inode *inode, u64 fragment, unsigned oldcount, unsigned newcount) { @@ -530,6 +546,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, for (i = oldcount; i < newcount; i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) return 0; + + if (!try_add_frags(inode, count)) + return 0; /* * Block can be extended */ @@ -647,6 +666,7 @@ cg_found: ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; + inode_sub_bytes(inode, i << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); @@ -657,6 +677,8 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; + if (!try_add_frags(inode, count)) + return 0; for (i = 0; i < count; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); @@ -716,6 +738,8 @@ norot: return INVBLOCK; ucpi->c_rotor = result; gotit: + if (!try_add_frags(inode, uspi->s_fpb)) + return 0; blkno = ufs_fragstoblks(result); ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 190d64be22ed..a2760a2869f4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -235,7 +235,8 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, p = ufs_get_direct_data_ptr(uspi, ufsi, block); tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), - new_size, err, locked_page); + new_size - (lastfrag & uspi->s_fpbmask), err, + locked_page); return tmp != 0; } @@ -284,7 +285,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, uspi->s_fpb, err, locked_page); + goal, nfrags, err, locked_page); if (!tmp) { *err = -ENOSPC; @@ -402,7 +403,9 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff if (!create) { phys64 = ufs_frag_map(inode, offsets, depth); - goto out; + if (phys64) + map_bh(bh_result, sb, phys64 + frag); + return 0; } /* This code entered only while writing ....? */ diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f04ab232d08d..351162ff1bfd 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -746,6 +746,23 @@ static void ufs_put_super(struct super_block *sb) return; } +static u64 ufs_max_bytes(struct super_block *sb) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int bits = uspi->s_apbshift; + u64 res; + + if (bits > 21) + res = ~0ULL; + else + res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) + + (1LL << (3*bits)); + + if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift)) + return MAX_LFS_FILESIZE; + return res << uspi->s_bshift; +} + static int ufs_fill_super(struct super_block *sb, void *data, int silent) { struct ufs_sb_info * sbi; @@ -812,9 +829,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; - /* Keep 2Gig file limit. Some UFS variants need to override - this but as I don't know which I'll let those in the know loosen - the rules */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); @@ -1212,6 +1228,7 @@ magic_found: "fast symlink size (%u)\n", uspi->s_maxsymlinklen); uspi->s_maxsymlinklen = maxsymlen; } + sb->s_maxbytes = ufs_max_bytes(sb); sb->s_max_links = UFS_LINK_MAX; inode = ufs_iget(sb, UFS_ROOTINO); diff --git a/fs/ufs/util.h b/fs/ufs/util.h index b7fbf53dbc81..398019fb1448 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -473,15 +473,19 @@ static inline unsigned _ubh_find_last_zero_bit_( static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned block) { + u8 mask; switch (uspi->s_fpb) { case 8: return (*ubh_get_addr (ubh, begin + block) == 0xff); case 4: - return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + mask = 0x0f << ((block & 0x01) << 2); + return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask; case 2: - return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + mask = 0x03 << ((block & 0x03) << 1); + return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask; case 1: - return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + mask = 0x01 << (block & 0x07); + return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask; } return 0; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..b86054cc41db 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -63,6 +63,7 @@ struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; struct userfaultfd_ctx *ctx; + bool waken; }; struct userfaultfd_wake_range { @@ -86,6 +87,12 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, if (len && (start > uwq->msg.arg.pagefault.address || start + len <= uwq->msg.arg.pagefault.address)) goto out; + WRITE_ONCE(uwq->waken, true); + /* + * The implicit smp_mb__before_spinlock in try_to_wake_up() + * renders uwq->waken visible to other CPUs before the task is + * waken. + */ ret = wake_up_state(wq->private, mode); if (ret) /* @@ -264,6 +271,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) struct userfaultfd_wait_queue uwq; int ret; bool must_wait, return_to_userland; + long blocking_state; BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); @@ -333,10 +341,13 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) uwq.wq.private = current; uwq.msg = userfault_msg(fe->address, fe->flags, reason); uwq.ctx = ctx; + uwq.waken = false; return_to_userland = (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); + blocking_state = return_to_userland ? TASK_INTERRUPTIBLE : + TASK_KILLABLE; spin_lock(&ctx->fault_pending_wqh.lock); /* @@ -349,8 +360,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * following the spin_unlock to happen before the list_add in * __add_wait_queue. */ - set_current_state(return_to_userland ? TASK_INTERRUPTIBLE : - TASK_KILLABLE); + set_current_state(blocking_state); spin_unlock(&ctx->fault_pending_wqh.lock); must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); @@ -362,6 +372,29 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); ret |= VM_FAULT_MAJOR; + + /* + * False wakeups can orginate even from rwsem before + * up_read() however userfaults will wait either for a + * targeted wakeup on the specific uwq waitqueue from + * wake_userfault() or for signals or for uffd + * release. + */ + while (!READ_ONCE(uwq.waken)) { + /* + * This needs the full smp_store_mb() + * guarantee as the state write must be + * visible to other CPUs before reading + * uwq.waken from other CPUs. + */ + set_current_state(blocking_state); + if (READ_ONCE(uwq.waken) || + READ_ONCE(ctx->released) || + (return_to_userland ? signal_pending(current) : + fatal_signal_pending(current))) + break; + schedule(); + } } __set_current_state(TASK_RUNNING); diff --git a/fs/xattr.c b/fs/xattr.c index 2d13b4e62fae..932b9061a3a2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -249,7 +249,7 @@ xattr_getsecurity(struct inode *inode, const char *name, void *value, } memcpy(value, buffer, len); out: - security_release_secctx(buffer, len); + kfree(buffer); out_noalloc: return len; } @@ -530,7 +530,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, size = XATTR_SIZE_MAX; kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!kvalue) { - kvalue = vmalloc(size); + kvalue = vzalloc(size); if (!kvalue) return -ENOMEM; } diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 339c696bbc01..bb2beaef531a 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -24,24 +24,6 @@ #include "kmem.h" #include "xfs_message.h" -/* - * Greedy allocation. May fail and may return vmalloced memory. - */ -void * -kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) -{ - void *ptr; - size_t kmsize = maxsize; - - while (!(ptr = vzalloc(kmsize))) { - if ((kmsize >>= 1) <= minsize) - kmsize = minsize; - } - if (ptr) - *size = kmsize; - return ptr; -} - void * kmem_alloc(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 689f746224e7..f0fc84fcaac2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -69,8 +69,6 @@ static inline void kmem_free(const void *ptr) } -extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); - static inline void * kmem_zalloc(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index d346d42c54d1..eed8f5867c46 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -39,6 +39,7 @@ #include "xfs_rmap_btree.h" #include "xfs_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_ialloc_btree.h" /* * Per-AG Block Reservations @@ -156,7 +157,8 @@ __xfs_ag_resv_free( trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag->pag_agno == 0) + pag->pag_mount->m_ag_max_usable += resv->ar_asked; /* * AGFL blocks are always considered "free", so whatever * was reserved at mount time must be given back at umount. @@ -200,22 +202,37 @@ __xfs_ag_resv_init( struct xfs_mount *mp = pag->pag_mount; struct xfs_ag_resv *resv; int error; + xfs_extlen_t reserved; - resv = xfs_perag_resv(pag, type); if (used > ask) ask = used; - resv->ar_asked = ask; - resv->ar_reserved = resv->ar_orig_reserved = ask - used; - mp->m_ag_max_usable -= ask; - - trace_xfs_ag_resv_init(pag, type, ask); + reserved = ask - used; - error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); - if (error) + error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); + if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); + xfs_warn(mp, +"Per-AG reservation for AG %u failed. Filesystem may run out of space.", + pag->pag_agno); + return error; + } - return error; + /* + * Reduce the maximum per-AG allocation length by however much we're + * trying to reserve for an AG. Since this is a filesystem-wide + * counter, we only make the adjustment for AG 0. This assumes that + * there aren't any AGs hungrier for per-AG reservation than AG 0. + */ + if (pag->pag_agno == 0) + mp->m_ag_max_usable -= ask; + + resv = xfs_perag_resv(pag, type); + resv->ar_asked = ask; + resv->ar_reserved = resv->ar_orig_reserved = reserved; + + trace_xfs_ag_resv_init(pag, type, ask); + return 0; } /* Create a per-AG block reservation. */ @@ -223,6 +240,8 @@ int xfs_ag_resv_init( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; int error = 0; @@ -231,23 +250,45 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; - error = xfs_refcountbt_calc_reserves(pag->pag_mount, - pag->pag_agno, &ask, &used); + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, - ask, used); + error = xfs_finobt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) { + /* + * Because we didn't have per-AG reservations when the + * finobt feature was added we might not be able to + * reserve all needed blocks. Warn and fall back to the + * old and potentially buggy code in that case, but + * ensure we do have the reservation for the refcountbt. + */ + ask = used = 0; + + mp->m_inotbt_nores = true; + + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, + &used); + if (error) + goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + } } /* Create the AGFL metadata reservation */ if (pag->pag_agfl_resv.ar_asked == 0) { ask = used = 0; - error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, - &ask, &used); + error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; @@ -256,9 +297,16 @@ xfs_ag_resv_init( goto out; } +#ifdef DEBUG + /* need to read in the AGF for the ASSERT below to work */ + error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0); + if (error) + return error; + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= pag->pagf_freeblks + pag->pagf_flcount); +#endif out: return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 9f06a211e157..c3702cda010a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1579,6 +1579,10 @@ xfs_alloc_ag_vextent_small( bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno, 0); + if (!bp) { + error = -EFSCORRUPTED; + goto error0; + } xfs_trans_binval(args->tp, bp); } args->len = 1; @@ -2136,6 +2140,10 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + if (!bp) { + error = -EFSCORRUPTED; + goto out_agbp_relse; + } xfs_trans_binval(tp, bp); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2852521fc8ec..c6c15e5717e4 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -351,7 +351,7 @@ xfs_attr3_leaf_read( err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f52fd63fce19..7eb99701054f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -579,7 +579,7 @@ xfs_bmap_validate_ret( #else #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) #endif /* DEBUG */ /* @@ -769,8 +769,8 @@ xfs_bmap_extents_to_btree( args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { -try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; +try_another_ag: args.fsbno = *firstblock; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -796,17 +796,19 @@ try_another_ag: if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && args.fsbno == NULLFSBLOCK && args.type == XFS_ALLOCTYPE_NEAR_BNO) { - dfops->dop_low = true; + args.type = XFS_ALLOCTYPE_FIRST_AG; goto try_another_ag; } + if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return -ENOSPC; + } /* * Allocation can't fail, the space was reserved. */ - ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (dfops->dop_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); *firstblock = cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; ip->i_d.di_nblocks++; @@ -1278,7 +1280,6 @@ xfs_bmap_read_extents( /* REFERENCED */ xfs_extnum_t room; /* number of entries there's room for */ - bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : @@ -1291,9 +1292,7 @@ xfs_bmap_read_extents( ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. @@ -1955,6 +1954,7 @@ xfs_bmap_add_extent_delay_real( */ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_state(ep, new->br_state); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); (*nextents)++; @@ -2208,8 +2208,10 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(bma->ip, temp); temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + diff = (int)(temp + temp2 - + (startblockval(PREV.br_startblock) - + (bma->cur ? + bma->cur->bc_private.b.allocated : 0))); if (diff > 0) { error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false); @@ -2266,7 +2268,6 @@ xfs_bmap_add_extent_delay_real( temp = da_new; if (bma->cur) temp += bma->cur->bc_private.b.allocated; - ASSERT(temp <= da_old); if (temp < da_old) xfs_mod_fdblocks(bma->ip->i_mount, (int64_t)(da_old - temp), false); @@ -2293,6 +2294,7 @@ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ @@ -2312,12 +2314,14 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - struct xfs_mount *mp = tp->t_mountp; + struct xfs_mount *mp = ip->i_mount; *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(*idx >= 0); ASSERT(*idx <= xfs_iext_count(ifp)); @@ -2376,7 +2380,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < xfs_iext_count(&ip->i_df) - 1) { + if (*idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2416,7 +2420,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents -= 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2459,7 +2464,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2494,7 +2500,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_set_state(ep, newext); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2606,7 +2613,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2684,7 +2692,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2732,7 +2741,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 2, &r[0], state); - ip->i_d.di_nextents += 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2786,17 +2796,17 @@ xfs_bmap_add_extent_unwritten_real( } /* update reverse mappings */ - error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); + error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new); if (error) goto done; /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, - 0, &tmp_logflags, XFS_DATA_FORK); + 0, &tmp_logflags, whichfork); *logflagsp |= tmp_logflags; if (error) goto done; @@ -2808,7 +2818,7 @@ xfs_bmap_add_extent_unwritten_real( *curp = cur; } - xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); done: *logflagsp |= rval; return error; @@ -2900,7 +2910,8 @@ xfs_bmap_add_extent_hole_delay( oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2921,7 +2932,8 @@ xfs_bmap_add_extent_hole_delay( xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2937,7 +2949,8 @@ xfs_bmap_add_extent_hole_delay( temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); @@ -3913,17 +3926,13 @@ xfs_bmap_btalloc( * the first block that was allocated. */ ASSERT(*ap->firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *ap->firstblock) == - XFS_FSB_TO_AGNO(mp, args.fsbno) || - (ap->dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *ap->firstblock) < - XFS_FSB_TO_AGNO(mp, args.fsbno))); + XFS_FSB_TO_AGNO(mp, *ap->firstblock) <= + XFS_FSB_TO_AGNO(mp, args.fsbno)); ap->blkno = args.fsbno; if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; - ASSERT(nullfb || fb_agno == args.agno || - (ap->dfops->dop_low && fb_agno < args.agno)); + ASSERT(nullfb || fb_agno <= args.agno); ap->length = args.len; if (!(ap->flags & XFS_BMAPI_COWFORK)) ap->ip->i_d.di_nblocks += args.len; @@ -3956,7 +3965,7 @@ xfs_bmap_remap_alloc( { struct xfs_trans *tp = ap->tp; struct xfs_mount *mp = tp->t_mountp; - xfs_agblock_t bno; + xfs_fsblock_t bno; struct xfs_alloc_arg args; int error; @@ -4048,6 +4057,17 @@ xfs_trim_extent( } } +/* trim extent to within eof */ +void +xfs_trim_extent_eof( + struct xfs_bmbt_irec *irec, + struct xfs_inode *ip) + +{ + xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, + i_size_read(VFS_I(ip)))); +} + /* * Trim the returned map to the required bounds */ @@ -4249,6 +4269,19 @@ xfs_bmapi_read( return 0; } +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, @@ -4335,13 +4368,8 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); - /* - * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay - * might have merged it into one of the neighbouring ones. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); /* * Tag the inode if blocks were preallocated. Note that COW fork @@ -4353,10 +4381,6 @@ xfs_bmapi_reserve_delalloc( if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) xfs_inode_set_cowblocks_tag(ip); - ASSERT(got->br_startoff <= aoff); - ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); - ASSERT(isnullstartblock(got->br_startblock)); - ASSERT(got->br_state == XFS_EXT_NORM); return 0; out_unreserve_blocks: @@ -4461,10 +4485,16 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_NORM; /* - * A wasdelay extent has been initialized, so shouldn't be flagged - * as unwritten. + * In the data fork, a wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. + * + * For the cow fork, however, we convert delalloc reservations + * (extents allocated for speculative preallocation) to + * allocated unwritten extents, and only convert the unwritten + * extents to real extents when we're about to write the data. */ - if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && + if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && + (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4515,8 +4545,6 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; - ASSERT(whichfork != XFS_COW_FORK); - /* * Modify (by adding) the state flag, if writing. */ @@ -4541,8 +4569,8 @@ xfs_bmapi_convert_unwritten( return error; } - error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->dfops, + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, + &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion @@ -4551,8 +4579,12 @@ xfs_bmapi_convert_unwritten( * in the transaction for the sake of fsync(), even if nothing has * changed, because fsync() will not force the log for this transaction * unless it sees the inode pinned. + * + * Note: If we're only converting cow fork extents, there aren't + * any on-disk updates to make, so we don't need to log anything. */ - bma->logflags |= tmp_logflags | XFS_ILOG_CORE; + if (whichfork != XFS_COW_FORK) + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -4626,15 +4658,15 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & XFS_BMAPI_IGSTATE)); - ASSERT(tp != NULL); + ASSERT(tp != NULL || + (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == + (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); - ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); - ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4840,13 +4872,9 @@ error0: if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, *firstblock) <= XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock) || - (dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock))); + bma.cur->bc_private.b.firstblock)); *firstblock = bma.cur->bc_private.b.firstblock; } xfs_btree_del_cursor(bma.cur, @@ -4881,34 +4909,59 @@ xfs_bmap_split_indlen( xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ xfs_filblks_t stolen = 0; + xfs_filblks_t resfactor; /* * Steal as many blocks as we can to try and satisfy the worst case * indlen for both new extents. */ - while (nres > ores && avail) { - nres--; - avail--; - stolen++; - } + if (ores < nres && avail) + stolen = XFS_FILBLKS_MIN(nres - ores, avail); + ores += stolen; + + /* nothing else to do if we've satisfied the new reservation */ + if (ores >= nres) + return stolen; /* - * The only blocks available are those reserved for the original - * extent and what we can steal from the extent being removed. - * If this still isn't enough to satisfy the combined - * requirements for the two new extents, skim blocks off of each - * of the new reservations until they match what is available. + * We can't meet the total required reservation for the two extents. + * Calculate the percent of the overall shortage between both extents + * and apply this percentage to each of the requested indlen values. + * This distributes the shortage fairly and reduces the chances that one + * of the two extents is left with nothing when extents are repeatedly + * split. */ - while (nres > ores) { - if (len1) { - len1--; - nres--; + resfactor = (ores * 100); + do_div(resfactor, nres); + len1 *= resfactor; + do_div(len1, 100); + len2 *= resfactor; + do_div(len2, 100); + ASSERT(len1 + len2 <= ores); + ASSERT(len1 < *indlen1 && len2 < *indlen2); + + /* + * Hand out the remainder to each extent. If one of the two reservations + * is zero, we want to make sure that one gets a block first. The loop + * below starts with len1, so hand len2 a block right off the bat if it + * is zero. + */ + ores -= (len1 + len2); + ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores); + if (ores && !len2 && *indlen2) { + len2++; + ores--; + } + while (ores) { + if (len1 < *indlen1) { + len1++; + ores--; } - if (nres == ores) + if (!ores) break; - if (len2) { - len2--; - nres--; + if (len2 < *indlen2) { + len2++; + ores--; } } @@ -5513,6 +5566,8 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ + xfs_fileoff_t max_len; + xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5534,6 +5589,16 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); + /* + * Guesstimate how many blocks we can unmap without running the risk of + * blowing out the transaction with a mix of EFIs and reflink + * adjustments. + */ + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); + else + max_len = len; + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5579,7 +5644,7 @@ __xfs_bunmapi( extno = 0; while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && - (nexts == 0 || extno < nexts)) { + (nexts == 0 || extno < nexts) && max_len > 0) { /* * Is the found extent after a hole in which bno lives? * Just back up to the previous extent, if so. @@ -5605,6 +5670,17 @@ __xfs_bunmapi( ASSERT(ep != NULL); del = got; wasdel = isnullstartblock(del.br_startblock); + + /* + * Make sure we don't touch multiple AGF headers out of order + * in a single transaction, as that could cause AB-BA deadlocks. + */ + if (!wasdel) { + agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); + if (prev_agno != NULLAGNUMBER && prev_agno > agno) + break; + prev_agno = agno; + } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; @@ -5613,6 +5689,15 @@ __xfs_bunmapi( } if (del.br_startoff + del.br_blockcount > bno + 1) del.br_blockcount = bno + 1 - del.br_startoff; + + /* How much can we safely unmap? */ + if (max_len < del.br_blockcount) { + del.br_startoff += del.br_blockcount - max_len; + if (!wasdel) + del.br_startblock += del.br_blockcount - max_len; + del.br_blockcount = max_len; + } + sum = del.br_startblock + del.br_blockcount; if (isrt && (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { @@ -5656,8 +5741,8 @@ __xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - &lastx, &cur, &del, firstblock, dfops, - &logflags); + whichfork, &lastx, &cur, &del, + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5714,8 +5799,9 @@ __xfs_bunmapi( prev.br_state = XFS_EXT_UNWRITTEN; lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &prev, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &prev, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; @@ -5723,8 +5809,9 @@ __xfs_bunmapi( ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &del, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &del, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; @@ -5791,6 +5878,7 @@ __xfs_bunmapi( if (!isrt && wasdel) xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + max_len -= del.br_blockcount; bno = del.br_startoff - 1; nodelete: /* @@ -6560,25 +6648,33 @@ xfs_bmap_finish_one( int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, + xfs_filblks_t *blockcount, xfs_exntst_t state) { struct xfs_bmbt_irec bmap; int nimaps = 1; xfs_fsblock_t firstfsb; int flags = XFS_BMAPI_REMAP; - int done; int error = 0; bmap.br_startblock = startblock; bmap.br_startoff = startoff; - bmap.br_blockcount = blockcount; + bmap.br_blockcount = *blockcount; bmap.br_state = state; + /* + * firstfsb is tied to the transaction lifetime and is used to + * ensure correct AG locking order and schedule work item + * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us + * to only making one bmap call per transaction, so it should + * be safe to have it as a local variable here. + */ + firstfsb = NULLFSBLOCK; + trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - ip->i_ino, whichfork, startoff, blockcount, state); + ip->i_ino, whichfork, startoff, *blockcount, state); if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) return -EFSCORRUPTED; @@ -6597,12 +6693,11 @@ xfs_bmap_finish_one( bmap.br_blockcount, flags, &firstfsb, bmap.br_blockcount, &bmap, &nimaps, dfops); + *blockcount = 0; break; case XFS_BMAP_UNMAP: - error = xfs_bunmapi(tp, ip, bmap.br_startoff, - bmap.br_blockcount, flags, 1, &firstfsb, - dfops, &done); - ASSERT(done); + error = __xfs_bunmapi(tp, ip, startoff, blockcount, + XFS_BMAPI_REMAP, 1, &firstfsb, dfops); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e7d40b39f18f..f1446d127120 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -196,6 +196,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); +void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, @@ -265,7 +266,7 @@ struct xfs_bmap_intent { int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, enum xfs_bmap_intent_type type, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, xfs_exntst_t state); + xfs_filblks_t *blockcount, xfs_exntst_t state); int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index f76c1693ff01..9968a746c649 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -453,8 +453,8 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); -try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; +try_another_ag: /* * Make sure there is sufficient room left in the AG to * complete a full tree split for an extent insert. If @@ -494,8 +494,8 @@ try_another_ag: if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && args.fsbno == NULLFSBLOCK && args.type == XFS_ALLOCTYPE_NEAR_BNO) { - cur->bc_private.b.dfops->dop_low = true; args.fsbno = cur->bc_private.b.firstblock; + args.type = XFS_ALLOCTYPE_FIRST_AG; goto try_another_ag; } @@ -512,7 +512,7 @@ try_another_ag: goto error0; cur->bc_private.b.dfops->dop_low = true; } - if (args.fsbno == NULLFSBLOCK) { + if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; @@ -888,6 +888,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 21e6a6ab6b9a..4ad1e214b1b2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -714,7 +714,8 @@ xfs_btree_firstrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ @@ -743,7 +744,8 @@ xfs_btree_lastrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ @@ -810,7 +812,8 @@ xfs_btree_read_bufl( xfs_daddr_t d; /* real disk block address */ int error; - ASSERT(fsbno != NULLFSBLOCK); + if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) + return -EFSCORRUPTED; d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, mp->m_bsize, lock, &bp, ops); @@ -1771,6 +1774,7 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != cur->bc_private.b.ip->i_ino) @@ -4375,7 +4379,7 @@ xfs_btree_visit_blocks( xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ - lptr = *ptr; + xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); } /* for each buffer in the level */ @@ -4431,10 +4435,15 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) + return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); - else + } else { + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) + return 0; block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); + } /* * If the block is a root block hosted in an inode, we might not have a @@ -4443,16 +4452,19 @@ xfs_btree_block_change_owner( * block is formatted into the on-disk inode fork. We still change it, * though, so everything is consistent in memory. */ - if (bp) { - if (cur->bc_tp) { - xfs_trans_ordered_buf(cur->bc_tp, bp); + if (!bp) { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + return 0; + } + + if (cur->bc_tp) { + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { xfs_btree_log_block(cur, bp, XFS_BB_OWNER); - } else { - xfs_buf_delwri_queue(bp, bbcoi->buffer_list); + return -EAGAIN; } } else { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); - ASSERT(level == cur->bc_nlevels - 1); + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } return 0; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee..33c7be2357b9 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -268,7 +268,8 @@ typedef struct xfs_btree_cur short forksize; /* fork's inode space */ char whichfork; /* data or attr fork */ char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ } b; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; @@ -491,7 +492,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) #define XFS_FSB_SANITY_CHECK(mp,fsb) \ - (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index f2dc1a950c85..b305dbfd81c4 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -263,7 +263,7 @@ xfs_da3_node_read( err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, which_fork, &xfs_da3_node_buf_ops); - if (!err && tp) { + if (!err && tp && *bpp) { struct xfs_da_blkinfo *info = (*bpp)->b_addr; int type; @@ -2633,7 +2633,7 @@ out_free: /* * Readahead the dir/attr block. */ -xfs_daddr_t +int xfs_da_reada_buf( struct xfs_inode *dp, xfs_dablk_t bno, @@ -2664,7 +2664,5 @@ out_free: if (mapp != &map) kmem_free(mapp); - if (error) - return -1; - return mappedbno; + return error; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 98c75cbe6ac2..4e29cb6a3627 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops); -xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, +int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, int whichfork, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index aa17cb788946..43c902f7a68d 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -139,7 +139,7 @@ xfs_dir3_block_read( err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index b887fb2a2bcf..f2e342e05365 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -268,7 +268,7 @@ xfs_dir3_leaf_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); return err; } @@ -285,7 +285,7 @@ xfs_dir3_leafn_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 75a557432d0f..bbd1238852b3 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .verify_write = xfs_dir3_free_write_verify, }; +/* Everything ok in the free block header? */ +static bool +xfs_dir3_free_header_check( + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + unsigned int firstdb; + int maxbests; + + maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo); + firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - + xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * + maxbests; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + if (be32_to_cpu(hdr3->firstdb) != firstdb) + return false; + if (be32_to_cpu(hdr3->nvalid) > maxbests) + return false; + if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) + return false; + } else { + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + + if (be32_to_cpu(hdr->firstdb) != firstdb) + return false; + if (be32_to_cpu(hdr->nvalid) > maxbests) + return false; + if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused)) + return false; + } + return true; +} static int __xfs_dir3_free_read( @@ -168,11 +204,22 @@ __xfs_dir3_free_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) { + xfs_buf_ioerror(*bpp, -EFSCORRUPTED); + xfs_verifier_error(*bpp); + xfs_trans_brelse(tp, *bpp); + return -EFSCORRUPTED; + } /* try read returns without an error or *bpp if it lands in a hole */ - if (!err && tp && *bpp) + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); - return err; + + return 0; } int diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index ef9f6ead96a4..699a51bb2cbe 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -126,6 +126,7 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +extern int xfs_dir2_sf_verify(struct xfs_inode *ip); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index c6809ff41197..e84af093b2ab 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -629,6 +629,112 @@ xfs_dir2_sf_check( } #endif /* DEBUG */ +/* Verify the consistency of an inline directory. */ +int +xfs_dir2_sf_verify( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next_sfep; + char *endp; + const struct xfs_dir_ops *dops; + struct xfs_ifork *ifp; + xfs_ino_t ino; + int i; + int i8count; + int offset; + int size; + int error; + __uint8_t filetype; + + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); + /* + * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops, + * so we can only trust the mountpoint to have the right pointer. + */ + dops = xfs_dir_get_ops(mp, NULL); + + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; + size = ifp->if_bytes; + + /* + * Give up if the directory is way too short. + */ + if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) || + size < xfs_dir2_sf_hdr_size(sfp->i8count)) + return -EFSCORRUPTED; + + endp = (char *)sfp + size; + + /* Check .. entry */ + ino = dops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + error = xfs_dir_ino_validate(mp, ino); + if (error) + return error; + offset = dops->data_first_offset; + + /* Check all reported entries */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + /* + * struct xfs_dir2_sf_entry has a variable length. + * Check the fixed-offset parts of the structure are + * within the data buffer. + */ + if (((char *)sfep + sizeof(*sfep)) >= endp) + return -EFSCORRUPTED; + + /* Don't allow names with known bad length. */ + if (sfep->namelen == 0) + return -EFSCORRUPTED; + + /* + * Check that the variable-length part of the structure is + * within the data buffer. The next entry starts after the + * name component, so nextentry is an acceptable test. + */ + next_sfep = dops->sf_nextentry(sfp, sfep); + if (endp < (char *)next_sfep) + return -EFSCORRUPTED; + + /* Check that the offsets always increase. */ + if (xfs_dir2_sf_get_offset(sfep) < offset) + return -EFSCORRUPTED; + + /* Check the inode number. */ + ino = dops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + error = xfs_dir_ino_validate(mp, ino); + if (error) + return error; + + /* Check the file type. */ + filetype = dops->sf_get_ftype(sfep); + if (filetype >= XFS_DIR3_FT_MAX) + return -EFSCORRUPTED; + + offset = xfs_dir2_sf_get_offset(sfep) + + dops->data_entsize(sfep->namelen); + + sfep = next_sfep; + } + if (i8count != sfp->i8count) + return -EFSCORRUPTED; + if ((void *)sfep != (void *)endp) + return -EFSCORRUPTED; + + /* Make sure this whole thing ought to be in local format. */ + if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize) + return -EFSCORRUPTED; + + return 0; +} + /* * Create a new (shortform) directory. */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d45c03779dae..42fef0731e2a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment( struct xfs_mount *mp) { if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) return mp->m_sb.sb_inoalignmt; return 1; } @@ -369,8 +368,6 @@ xfs_ialloc_inode_init( * transaction and pin the log appropriately. */ xfs_trans_ordered_buf(tp, fbuf); - xfs_trans_log_buf(tp, fbuf, 0, - BBTOB(fbuf->b_length) - 1); } } else { fbuf->b_flags |= XBF_DONE; @@ -1124,6 +1121,7 @@ xfs_dialloc_ag_inobt( int error; int offset; int i, j; + int searchdistance = 10; pag = xfs_perag_get(mp, agno); @@ -1150,7 +1148,6 @@ xfs_dialloc_ag_inobt( if (pagno == agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ - int searchdistance = 10; error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) @@ -1211,21 +1208,9 @@ xfs_dialloc_ag_inobt( /* * Loop until we find an inode chunk with a free inode. */ - while (!doneleft || !doneright) { + while (--searchdistance > 0 && (!doneleft || !doneright)) { int useleft; /* using left inode chunk this time */ - if (!--searchdistance) { - /* - * Not in range - save last search - * location and allocate a new inode - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - pag->pagl_leftrec = trec.ir_startino; - pag->pagl_rightrec = rec.ir_startino; - pag->pagl_pagino = pagino; - goto newino; - } - /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - @@ -1237,13 +1222,13 @@ xfs_dialloc_ag_inobt( /* free inodes to the left? */ if (useleft && trec.ir_freecount) { - rec = trec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); cur = tcur; pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; + rec = trec; goto alloc_inode; } @@ -1269,26 +1254,37 @@ xfs_dialloc_ag_inobt( goto error1; } - /* - * We've reached the end of the btree. because - * we are only searching a small chunk of the - * btree each search, there is obviously free - * inodes closer to the parent inode than we - * are now. restart the search again. - */ - pag->pagl_pagino = NULLAGINO; - pag->pagl_leftrec = NULLAGINO; - pag->pagl_rightrec = NULLAGINO; - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - goto restart_pagno; + if (searchdistance <= 0) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + + } else { + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ -newino: if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 6c6b95947e71..b9c351ff0422 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -82,11 +82,12 @@ xfs_finobt_set_root( } STATIC int -xfs_inobt_alloc_block( +__xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int *stat) + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( args.maxlen = 1; args.prod = 1; args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.resv = resv; error = xfs_alloc_vextent(&args); if (error) { @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( } STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); +} + +STATIC int +xfs_finobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, + XFS_AG_RESV_METADATA); +} + +STATIC int xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, - .alloc_block = xfs_inobt_alloc_block, + .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_inobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, @@ -478,3 +501,64 @@ xfs_inobt_rec_check_count( return 0; } #endif /* DEBUG */ + +static xfs_extlen_t +xfs_inobt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_inobt_mxr[0] == 0) + return 0; + + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, + (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); +} + +static int +xfs_inobt_count_blocks( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_btnum_t btnum, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + return error; + + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); + error = xfs_btree_count_blocks(cur, tree_blocks); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + return error; +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_finobt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + xfs_extlen_t tree_len = 0; + int error; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); + if (error) + return error; + + *ask += xfs_inobt_max_size(mp); + *used += tree_len; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index bd88453217ce..aa81e2e63f3f 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #define xfs_inobt_rec_check_count(mp, rec) 0 #endif /* DEBUG */ +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 222e103356c6..4e30448c4465 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,12 +26,15 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_btree.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_attr_sf.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" kmem_zone_t *xfs_ifork_zone; @@ -209,6 +212,16 @@ xfs_iformat_fork( if (error) return error; + /* Check inline dir contents. */ + if (S_ISDIR(VFS_I(ip)->i_mode) && + dip->di_format == XFS_DINODE_FMT_LOCAL) { + error = xfs_dir2_sf_verify(ip); + if (error) { + xfs_idestroy_fork(ip, XFS_DATA_FORK); + return error; + } + } + if (xfs_is_reflink_inode(ip)) { ASSERT(ip->i_cowfp == NULL); xfs_ifork_init_cow(ip); @@ -319,7 +332,6 @@ xfs_iformat_local( int whichfork, int size) { - /* * If the size is unreasonable, then something * is wrong and we just bail out rather than crash in @@ -429,11 +441,13 @@ xfs_iformat_btree( /* REFERENCED */ int nrecs; int size; + int level; ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); /* * blow out if -- fork has less extents than can fit in @@ -446,7 +460,8 @@ xfs_iformat_btree( XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || + level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -497,15 +512,14 @@ xfs_iread_extents( * We know that the size is valid (it's checked in iformat_btree) */ ifp->if_bytes = ifp->if_real_bytes = 0; - ifp->if_flags |= XFS_IFEXTENTS; xfs_iext_add(ifp, 0, nextents); error = xfs_bmap_read_extents(tp, ip, whichfork); if (error) { xfs_iext_destroy(ifp); - ifp->if_flags &= ~XFS_IFEXTENTS; return error; } xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + ifp->if_flags |= XFS_IFEXTENTS; return 0; } /* @@ -1525,14 +1539,11 @@ xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ { - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); + ASSERT((new_size >= 0) && + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * + sizeof(xfs_ext_irec_t)))); if (new_size == 0) { xfs_iext_destroy(ifp); } else { diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 083cdd6d6c28..ce6958b1385c 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -270,6 +270,7 @@ typedef struct xfs_inode_log_format { __uint32_t ilf_fields; /* flags for fields logged */ __uint16_t ilf_asize; /* size of attr d/ext/root */ __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ __uint64_t ilf_ino; /* inode number */ union { __uint32_t ilfu_rdev; /* rdev value for dev inode*/ @@ -280,29 +281,17 @@ typedef struct xfs_inode_log_format { __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; -typedef struct xfs_inode_log_format_32 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ - union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ -} __attribute__((packed)) xfs_inode_log_format_32_t; - -typedef struct xfs_inode_log_format_64 { +/* + * Old 32 bit systems will log in this format without the 64 bit + * alignment padding. Recovery will detect this and convert it to the + * correct format. + */ +struct xfs_inode_log_format_32 { __uint16_t ilf_type; /* inode log item type */ __uint16_t ilf_size; /* size of this item */ __uint32_t ilf_fields; /* flags for fields logged */ __uint16_t ilf_asize; /* size of attr d/ext/root */ __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint32_t ilf_pad; /* pad for 64 bit boundary */ __uint64_t ilf_ino; /* inode number */ union { __uint32_t ilfu_rdev; /* rdev value for dev inode*/ @@ -311,7 +300,7 @@ typedef struct xfs_inode_log_format_64 { __int64_t ilf_blkno; /* blkno of inode buffer */ __int32_t ilf_len; /* len of inode buffer */ __int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_64_t; +} __attribute__((packed)); /* diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b177ef33cd4c..d71cb63cdea3 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -784,14 +784,6 @@ xfs_refcount_merge_extents( } /* - * While we're adjusting the refcounts records of an extent, we have - * to keep an eye on the number of extents we're dirtying -- run too - * many in a single transaction and we'll exceed the transaction's - * reservation and crash the fs. Each record adds 12 bytes to the - * log (plus any key updates) so we'll conservatively assume 24 bytes - * per record. We must also leave space for btree splits on both ends - * of the range and space for the CUD and a new CUI. - * * XXX: This is a pretty hand-wavy estimate. The penalty for guessing * true incorrectly is a shutdown FS; the penalty for guessing false * incorrectly is more transaction rolls than might be necessary. @@ -822,7 +814,7 @@ xfs_refcount_still_have_space( else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * 32; + cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -1629,13 +1621,32 @@ xfs_refcount_recover_cow_leftovers( if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) return -EOPNOTSUPP; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + INIT_LIST_HEAD(&debris); + + /* + * In this first part, we use an empty transaction to gather up + * all the leftover CoW extents so that we can subsequently + * delete them. The empty transaction is used to avoid + * a buffer lock deadlock if there happens to be a loop in the + * refcountbt because we're allowed to re-grab a buffer that is + * already attached to our transaction. When we're done + * recording the CoW debris we cancel the (empty) transaction + * and everything goes away cleanly. + */ + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; + if (!agbp) { + error = -ENOMEM; + goto out_trans; + } + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ - INIT_LIST_HEAD(&debris); memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); low.rc.rc_startblock = XFS_REFC_COW_START; @@ -1645,10 +1656,11 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_cursor; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); + xfs_trans_cancel(tp); /* Now iterate the list to free the leftovers */ - list_for_each_entry(rr, &debris, rr_list) { + list_for_each_entry_safe(rr, n, &debris, rr_list) { /* Set up transaction. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) @@ -1676,8 +1688,16 @@ xfs_refcount_recover_cow_leftovers( error = xfs_trans_commit(tp); if (error) goto out_free; + + list_del(&rr->rr_list); + kmem_free(rr); } + return error; +out_defer: + xfs_defer_cancel(&dfops); +out_trans: + xfs_trans_cancel(tp); out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { @@ -1688,11 +1708,6 @@ out_free: out_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_buf_relse(agbp); - goto out_free; - -out_defer: - xfs_defer_cancel(&dfops); - xfs_trans_cancel(tp); - goto out_free; + xfs_trans_brelse(tp, agbp); + goto out_trans; } diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 098dc668ab2c..eafb9d1f3b37 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, xfs_agnumber_t agno); +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs. Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 32 bytes + * per record. We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. + */ +#define XFS_REFCOUNT_ITEM_OVERHEAD 32 + +static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) +{ + return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; +} + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 7917f6e44286..d787c677d2a3 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,8 +21,20 @@ /* * Components of space reservations. */ + +/* Worst case number of rmaps that can be held in a block. */ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) + +/* Adding one rmap could split every level up to the top of the tree. */ +#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) + +/* Blocks we might need to add "b" rmaps to a tree. */ +#define XFS_NRMAPADD_SPACE_RES(mp, b)\ + (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ + XFS_RMAPADD_SPACE_RES(mp)) + #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) @@ -30,13 +42,12 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) + +/* Blocks we might need to add "b" mappings & rmappings to a file. */ #define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ - (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ - XFS_EXTENTADD_SPACE_RES(mp,w) + \ - ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ - (mp)->m_rmap_maxlevels) + (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \ + XFS_NRMAPADD_SPACE_RES((mp), (b))) + #define XFS_DAENTER_1B(mp,w) \ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b468e041f207..3354140de07e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -170,8 +170,8 @@ xfs_get_acl(struct inode *inode, int type) return acl; } -STATIC int -__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int +__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct xfs_inode *ip = XFS_I(inode); unsigned char *ea_name; @@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode) int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + umode_t mode; + bool set_mode = false; int error = 0; if (!acl) @@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - umode_t mode; - error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - error = xfs_set_mode(inode, mode); - if (error) - return error; + set_mode = true; } set_acl: - return __xfs_set_acl(inode, type, acl); + error = __xfs_set_acl(inode, acl, type); + if (error) + return error; + + /* + * We set the mode after successfully updating the ACL xattr because the + * xattr update can fail at ENOSPC and we don't want to change the mode + * if the ACL update hasn't been applied. + */ + if (set_mode) + error = xfs_set_mode(inode, mode); + + return error; } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 286fa89217f5..04327318ef67 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -24,6 +24,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 06763f5cc701..d31cd1ebd8e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -90,11 +90,11 @@ xfs_find_bdev_for_inode( * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. * - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or - * the page at all, as we may be racing with memory reclaim and it can free both - * the bufferhead chain and the page as it will see the page as clean and - * unused. + * Note that we open code the action in end_buffer_async_write here so that we + * only have to iterate over the buffers attached to the page once. This is not + * only more efficient, but also ensures that we only calls end_page_writeback + * at the end of the iteration, and thus avoids the pitfall of having the page + * and buffers potentially freed after every call to end_buffer_async_write. */ static void xfs_finish_page_writeback( @@ -102,29 +102,45 @@ xfs_finish_page_writeback( struct bio_vec *bvec, int error) { - unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh, *next; + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; + bool busy = false; unsigned int off = 0; - unsigned int bsize; + unsigned long flags; ASSERT(bvec->bv_offset < PAGE_SIZE); - ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); - ASSERT(end < PAGE_SIZE); - ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0); + ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); + ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); - bh = head = page_buffers(bvec->bv_page); - - bsize = bh->b_size; + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); do { - next = bh->b_this_page; - if (off < bvec->bv_offset) - goto next_bh; - if (off > end) - break; - bh->b_end_io(bh, !error); -next_bh: - off += bsize; - } while ((bh = next) != head); + if (off >= bvec->bv_offset && + off < bvec->bv_offset + bvec->bv_len) { + ASSERT(buffer_async_write(bh)); + ASSERT(bh->b_end_io == NULL); + + if (error) { + mapping_set_error(bvec->bv_page->mapping, -EIO); + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(bvec->bv_page); + } else { + set_buffer_uptodate(bh); + } + clear_buffer_async_write(bh); + unlock_buffer(bh); + } else if (buffer_async_write(bh)) { + ASSERT(buffer_locked(bh)); + busy = true; + } + off += bh->b_size; + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + + if (!busy) + end_page_writeback(bvec->bv_page); } /* @@ -138,8 +154,10 @@ xfs_destroy_ioend( int error) { struct inode *inode = ioend->io_inode; - struct bio *last = ioend->io_bio; - struct bio *bio, *next; + struct bio *bio = &ioend->io_inline_bio; + struct bio *last = ioend->io_bio, *next; + u64 start = bio->bi_iter.bi_sector; + bool quiet = bio_flagged(bio, BIO_QUIET); for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; @@ -160,6 +178,11 @@ xfs_destroy_ioend( bio_put(bio); } + + if (unlikely(error && !quiet)) { + xfs_err_ratelimited(XFS_I(inode)->i_mount, + "writeback error on sector %llu", start); + } } /* @@ -279,54 +302,50 @@ xfs_end_io( struct xfs_ioend *ioend = container_of(work, struct xfs_ioend, io_work); struct xfs_inode *ip = XFS_I(ioend->io_inode); + xfs_off_t offset = ioend->io_offset; + size_t size = ioend->io_size; int error = ioend->io_bio->bi_error; /* - * Set an error if the mount has shut down and proceed with end I/O - * processing so it can perform whatever cleanups are necessary. + * Just clean up the in-memory strutures if the fs has been shut down. */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { error = -EIO; + goto done; + } /* - * For a CoW extent, we need to move the mapping from the CoW fork - * to the data fork. If instead an error happened, just dump the - * new blocks. + * Clean up any COW blocks on an I/O error. */ - if (ioend->io_type == XFS_IO_COW) { - if (error) - goto done; - if (ioend->io_bio->bi_error) { - error = xfs_reflink_cancel_cow_range(ip, - ioend->io_offset, ioend->io_size); - goto done; + if (unlikely(error)) { + switch (ioend->io_type) { + case XFS_IO_COW: + xfs_reflink_cancel_cow_range(ip, offset, size, true); + break; } - error = xfs_reflink_end_cow(ip, ioend->io_offset, - ioend->io_size); - if (error) - goto done; + + goto done; } /* - * For unwritten extents we need to issue transactions to convert a - * range to normal written extens after the data I/O has finished. - * Detecting and handling completion IO errors is done individually - * for each case as different cleanup operations need to be performed - * on error. + * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_type == XFS_IO_UNWRITTEN) { - if (error) - goto done; - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - } else if (ioend->io_append_trans) { - error = xfs_setfilesize_ioend(ioend, error); - } else { - ASSERT(!xfs_ioend_is_append(ioend) || - ioend->io_type == XFS_IO_COW); + switch (ioend->io_type) { + case XFS_IO_COW: + error = xfs_reflink_end_cow(ip, offset, size); + break; + case XFS_IO_UNWRITTEN: + /* writeback should never update isize */ + error = xfs_iomap_write_unwritten(ip, offset, size, false); + break; + default: + ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); + break; } done: + if (ioend->io_append_trans) + error = xfs_setfilesize_ioend(ioend, error); xfs_destroy_ioend(ioend, error); } @@ -354,7 +373,7 @@ xfs_map_blocks( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - ssize_t count = 1 << inode->i_blkbits; + ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb, end_fsb; int error = 0; int bmapi_flags = XFS_BMAPI_ENTIRE; @@ -419,6 +438,19 @@ xfs_imap_valid( { offset >>= inode->i_blkbits; + /* + * We have to make sure the cached mapping is within EOF to protect + * against eofblocks trimming on file release leaving us with a stale + * mapping. Otherwise, a page for a subsequent file extending buffered + * write could get picked up by this writeback cycle and written to the + * wrong blocks. + * + * Note that what we really want here is a generic mapping invalidation + * mechanism to protect us from arbitrary extent modifying contexts, not + * just eofblocks. + */ + xfs_trim_extent_eof(imap, XFS_I(inode)); + return offset >= imap->br_startoff && offset < imap->br_startoff + imap->br_blockcount; } @@ -432,7 +464,8 @@ xfs_start_buffer_writeback( ASSERT(!buffer_delay(bh)); ASSERT(!buffer_unwritten(bh)); - mark_buffer_async_write(bh); + bh->b_end_io = NULL; + set_buffer_async_write(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); } @@ -486,6 +519,12 @@ xfs_submit_ioend( struct xfs_ioend *ioend, int status) { + /* Convert CoW extents to regular */ + if (!status && ioend->io_type == XFS_IO_COW) { + status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), + ioend->io_offset, ioend->io_size); + } + /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && ioend->io_type != XFS_IO_UNWRITTEN && @@ -700,6 +739,14 @@ xfs_vm_invalidatepage( { trace_xfs_invalidatepage(page->mapping->host, page, offset, length); + + /* + * If we are invalidating the entire page, clear the dirty state from it + * so that we can check for attempts to release dirty cached pages in + * xfs_vm_releasepage(). + */ + if (offset == 0 && length >= PAGE_SIZE) + cancel_dirty_page(page); block_invalidatepage(page, offset, length); } @@ -758,7 +805,7 @@ xfs_aops_discard_page( break; } next_buffer: - offset += 1 << inode->i_blkbits; + offset += i_blocksize(inode); } while ((bh = bh->b_this_page) != head); @@ -846,7 +893,7 @@ xfs_writepage_map( LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; struct buffer_head *bh, *head; - ssize_t len = 1 << inode->i_blkbits; + ssize_t len = i_blocksize(inode); int error = 0; int count = 0; int uptodate = 1; @@ -1155,25 +1202,27 @@ xfs_vm_releasepage( * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to * ->releasepage() via shrink_active_list(). Conversely, - * block_invalidatepage() can send pages that are still marked dirty - * but otherwise have invalidated buffers. + * block_invalidatepage() can send pages that are still marked dirty but + * otherwise have invalidated buffers. * * We want to release the latter to avoid unnecessary buildup of the - * LRU, skip the former and warn if we've left any lingering - * delalloc/unwritten buffers on clean pages. Skip pages with delalloc - * or unwritten buffers and warn if the page is not dirty. Otherwise - * try to release the buffers. + * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages + * that are entirely invalidated and need to be released. Hence the + * only time we should get dirty pages here is through + * shrink_active_list() and so we can simply skip those now. + * + * warn if we've left any lingering delalloc/unwritten buffers on clean + * or invalidated pages we are about to release. */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); - if (delalloc) { - WARN_ON_ONCE(!PageDirty(page)); + if (WARN_ON_ONCE(delalloc)) return 0; - } - if (unwritten) { - WARN_ON_ONCE(!PageDirty(page)); + if (WARN_ON_ONCE(unwritten)) return 0; - } return try_to_free_buffers(page); } @@ -1249,7 +1298,7 @@ xfs_map_trim_size( offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, - 1 << inode->i_blkbits); + i_blocksize(inode)); } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1257,44 +1306,6 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } -/* Bounce unaligned directio writes to the page cache. */ -static int -xfs_bounce_unaligned_dio_write( - struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap) -{ - struct xfs_bmbt_irec irec; - xfs_fileoff_t delta; - bool shared; - bool x; - int error; - - irec = *imap; - if (offset_fsb > irec.br_startoff) { - delta = offset_fsb - irec.br_startoff; - irec.br_blockcount -= delta; - irec.br_startblock += delta; - irec.br_startoff = offset_fsb; - } - error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); - if (error) - return error; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If any part - * of the extent is shared, fall back to buffered mode to handle - * the RMW. This is done by returning -EREMCHG ("remote addr - * changed"), which is caught further up the call stack. - */ - if (shared) { - trace_xfs_reflink_bounce_dio_write(ip, imap); - return -EREMCHG; - } - return 0; -} - STATIC int __xfs_get_blocks( struct inode *inode, @@ -1323,7 +1334,7 @@ __xfs_get_blocks( return -EIO; offset = (xfs_off_t)iblock << inode->i_blkbits; - ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); + ASSERT(bh_result->b_size >= i_blocksize(inode)); size = bh_result->b_size; if (!create && offset >= i_size_read(inode)) @@ -1432,13 +1443,6 @@ __xfs_get_blocks( if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && (create || !ISUNWRITTEN(&imap))) { - if (create && direct && !is_cow) { - error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, - &imap); - if (error) - return error; - } - xfs_map_buffer(inode, bh_result, &imap, offset); if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); @@ -1552,6 +1556,21 @@ xfs_end_io_direct_write( return 0; } + if (flags & XFS_DIO_FLAG_COW) + error = xfs_reflink_end_cow(ip, offset, size); + + /* + * Unwritten conversion updates the in-core isize after extent + * conversion but before updating the on-disk size. Updating isize any + * earlier allows a racing dio read to find unwritten extents before + * they are converted. + */ + if (flags & XFS_DIO_FLAG_UNWRITTEN) { + trace_xfs_end_io_direct_write_unwritten(ip, offset, size); + + return xfs_iomap_write_unwritten(ip, offset, size, true); + } + /* * We need to update the in-core inode size here so that we don't end up * with the on-disk inode size being outside the in-core inode size. We @@ -1568,13 +1587,6 @@ xfs_end_io_direct_write( i_size_write(inode, offset + size); spin_unlock(&ip->i_flags_lock); - if (flags & XFS_DIO_FLAG_COW) - error = xfs_reflink_end_cow(ip, offset, size); - if (flags & XFS_DIO_FLAG_UNWRITTEN) { - trace_xfs_end_io_direct_write_unwritten(ip, offset, size); - - error = xfs_iomap_write_unwritten(ip, offset, size); - } if (flags & XFS_DIO_FLAG_APPEND) { trace_xfs_end_io_direct_write_append(ip, offset, size); @@ -1610,9 +1622,12 @@ xfs_vm_bmap( * The swap code (ab-)uses ->bmap to get a block mapping and then * bypasseÑ• the file system for actual I/O. We really can't allow * that on reflinks inodes, so we have to skip out here. And yes, - * 0 is the magic code for a bmap error.. + * 0 is the magic code for a bmap error. + * + * Since we don't pass back blockdev info, we can't return bmap + * information for rt files either. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); return 0; } @@ -1678,7 +1693,7 @@ xfs_vm_set_page_dirty( if (offset < end_offset) set_buffer_dirty(bh); bh = bh->b_this_page; - offset += 1 << inode->i_blkbits; + offset += i_blocksize(inode); } while (bh != head); } /* diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index be0b79d8900f..c6643004e583 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -302,6 +302,8 @@ xfs_attr3_node_inactive( &bp, XFS_ATTR_FORK); if (error) return error; + node = bp->b_addr; + btree = dp->d_ops->node_tree_p(node); child_fsb = be32_to_cpu(btree[i + 1].before); xfs_trans_brelse(*trans, bp); } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 9bf57c76623b..5a54dcd7e7b1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -34,6 +34,8 @@ #include "xfs_bmap.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" kmem_zone_t *xfs_bui_zone; @@ -393,6 +395,7 @@ xfs_bui_recover( struct xfs_map_extent *bmap; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; + xfs_filblks_t count; bool op_ok; struct xfs_bud_log_item *budp; enum xfs_bmap_intent_type type; @@ -401,6 +404,7 @@ xfs_bui_recover( struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_defer_ops dfops; + struct xfs_bmbt_irec irec; xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -446,7 +450,8 @@ xfs_bui_recover( return -EIO; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) return error; budp = xfs_trans_get_bud(tp, buip); @@ -477,13 +482,24 @@ xfs_bui_recover( } xfs_trans_ijoin(tp, ip, 0); + count = bmap->me_len; error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, ip, whichfork, bmap->me_startoff, - bmap->me_startblock, bmap->me_len, - state); + bmap->me_startblock, &count, state); if (error) goto err_dfops; + if (count > 0) { + ASSERT(type == XFS_BMAP_UNMAP); + irec.br_startblock = bmap->me_startblock; + irec.br_blockcount = count; + irec.br_startoff = bmap->me_startoff; + irec.br_state = state; + error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + if (error) + goto err_dfops; + } + /* Finish transaction, free inodes. */ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index efb8ccd6bbf2..cb62871fb91d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -84,6 +84,7 @@ xfs_zero_extent( GFP_NOFS, true); } +#ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -195,6 +196,7 @@ xfs_bmap_rtalloc( } return 0; } +#endif /* CONFIG_XFS_RT */ /* * Check if the endoff is outside the last extent. If so the caller will grow @@ -588,9 +590,13 @@ xfs_getbmap( } break; default: + /* Local format data forks report no extents. */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + bmv->bmv_entries = 0; + return 0; + } if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) return -EINVAL; if (xfs_get_extsz_hint(ip) || @@ -718,7 +724,7 @@ xfs_getbmap( * extents. */ if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); if (map[i].br_startblock == HOLESTARTBLOCK && @@ -911,23 +917,22 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) } /* - * This is called by xfs_inactive to free any blocks beyond eof - * when the link count isn't zero and by xfs_dm_punch_hole() when - * punching a hole to EOF. + * This is called to free any blocks beyond eof. The caller must hold + * IOLOCK_EXCL unless we are in the inode reclaim path and have the only + * reference to the inode. */ int xfs_free_eofblocks( - xfs_mount_t *mp, - xfs_inode_t *ip, - bool need_iolock) + struct xfs_inode *ip) { - xfs_trans_t *tp; - int error; - xfs_fileoff_t end_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t map_len; - int nimaps; - xfs_bmbt_irec_t imap; + struct xfs_trans *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + struct xfs_bmbt_irec imap; + struct xfs_mount *mp = ip->i_mount; /* * Figure out if there are any blocks beyond the end @@ -944,6 +949,10 @@ xfs_free_eofblocks( error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); + /* + * If there are blocks after the end of file, truncate the file to its + * current size to free them up. + */ if (!error && (nimaps != 0) && (imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks)) { @@ -954,22 +963,13 @@ xfs_free_eofblocks( if (error) return error; - /* - * There are blocks after the end of file. - * Free them up now by truncating the file to - * its current size. - */ - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) - return -EAGAIN; - } + /* wait on dio to ensure i_size has settled */ + inode_dio_wait(VFS_I(ip)); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -997,8 +997,6 @@ xfs_free_eofblocks( } xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); } return error; } @@ -1324,8 +1322,16 @@ xfs_free_file_space( /* * Now that we've unmap all full blocks we'll have to zero out any * partial block at the beginning and/or end. xfs_zero_range is - * smart enough to skip any holes, including those we just created. + * smart enough to skip any holes, including those we just created, + * but we must take care not to zero beyond EOF and enlarge i_size. */ + + if (offset >= XFS_ISIZE(ip)) + return 0; + + if (offset + len > XFS_ISIZE(ip)) + len = XFS_ISIZE(ip) - offset; + return xfs_zero_range(ip, offset, len, NULL); } @@ -1393,10 +1399,16 @@ xfs_shift_file_space( xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; + uint resblks; ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); if (direction == SHIFT_LEFT) { + /* + * Reserve blocks to cover potential extent merges after left + * shift operations. + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); next_fsb = XFS_B_TO_FSB(mp, offset + len); stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); } else { @@ -1404,6 +1416,7 @@ xfs_shift_file_space( * If right shift, delegate the work of initialization of * next_fsb to xfs_bmap_shift_extent as it has ilock held. */ + resblks = 0; next_fsb = NULLFSBLOCK; stop_fsb = XFS_B_TO_FSB(mp, offset); } @@ -1415,7 +1428,7 @@ xfs_shift_file_space( * into the accessible region of the file. */ if (xfs_can_free_eofblocks(ip, true)) { - error = xfs_free_eofblocks(mp, ip, false); + error = xfs_free_eofblocks(ip); if (error) return error; } @@ -1434,7 +1447,19 @@ xfs_shift_file_space( return error; /* - * The extent shiting code works on extent granularity. So, if + * Clean out anything hanging around in the cow fork now that + * we've flushed all the dirty data out to disk to avoid having + * CoW extents at the wrong offsets. + */ + if (xfs_is_reflink_inode(ip)) { + error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF, + true); + if (error) + return error; + } + + /* + * The extent shifting code works on extent granularity. So, if * stop_fsb is not the starting block of extent, we need to split * the extent at stop_fsb. */ @@ -1445,21 +1470,14 @@ xfs_shift_file_space( } while (!error && !done) { - /* - * We would need to reserve permanent block for transaction. - * This will come into picture when after shifting extent into - * hole we found that adjacent extents can be merged which - * may lead to freeing of a block during record update. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, + &tp); if (error) break; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + ip->i_gdquot, ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; @@ -1821,29 +1839,18 @@ xfs_swap_extent_forks( } /* - * Before we've swapped the forks, lets set the owners of the forks - * appropriately. We have to do this as we are demand paging the btree - * buffers, and so the validation done on read will expect the owner - * field to be correctly set. Once we change the owners, we can swap the - * inode forks. + * Btree format (v3) inodes have the inode number stamped in the bmbt + * block headers. We can't start changing the bmbt blocks until the + * inode owner change is logged so recovery does the right thing in the + * event of a crash. Set the owner change log flags now and leave the + * bmbt scan as the last step. */ if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, - tip->i_ino, NULL); - if (error) - return error; - } - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, - ip->i_ino, NULL); - if (error) - return error; - } /* * Swap the data forks of the inodes @@ -1921,6 +1928,48 @@ xfs_swap_extent_forks( return 0; } +/* + * Fix up the owners of the bmbt blocks to refer to the current inode. The + * change owner scan attempts to order all modified buffers in the current + * transaction. In the event of ordered buffer failure, the offending buffer is + * physically logged as a fallback and the scan returns -EAGAIN. We must roll + * the transaction in this case to replenish the fallback log reservation and + * restart the scan. This process repeats until the scan completes. + */ +static int +xfs_swap_change_owner( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tmpip) +{ + int error; + struct xfs_trans *tp = *tpp; + + do { + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, + NULL); + /* success or fatal error */ + if (error != -EAGAIN) + break; + + error = xfs_trans_roll(tpp, NULL); + if (error) + break; + tp = *tpp; + + /* + * Redirty both inodes so they can relog and keep the log tail + * moving forward. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tmpip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); + } while (true); + + return error; +} + int xfs_swap_extents( struct xfs_inode *ip, /* target inode */ @@ -1934,8 +1983,8 @@ xfs_swap_extents( int error = 0; int lock_flags; struct xfs_ifork *cowfp; - __uint64_t f; - int resblks; + uint64_t f; + int resblks = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -1983,11 +2032,8 @@ xfs_swap_extents( XFS_SWAP_RMAP_SPACE_RES(mp, XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, - 0, 0, &tp); - } else - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, - 0, 0, &tp); + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out_unlock; @@ -2062,17 +2108,54 @@ xfs_swap_extents( ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + } + + /* Swap the cow forks. */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + xfs_extnum_t extnum; + + ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS); + + extnum = ip->i_cnextents; + ip->i_cnextents = tip->i_cnextents; + tip->i_cnextents = extnum; + cowfp = ip->i_cowfp; ip->i_cowfp = tip->i_cowfp; tip->i_cowfp = cowfp; - xfs_inode_set_cowblocks_tag(ip); - xfs_inode_set_cowblocks_tag(tip); + + if (ip->i_cowfp && ip->i_cnextents) + xfs_inode_set_cowblocks_tag(ip); + else + xfs_inode_clear_cowblocks_tag(ip); + if (tip->i_cowfp && tip->i_cnextents) + xfs_inode_set_cowblocks_tag(tip); + else + xfs_inode_clear_cowblocks_tag(tip); } xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); /* + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems + * have inode number owner values in the bmbt blocks that still refer to + * the old inode. Scan each bmbt to fix up the owner values with the + * inode number of the current inode. + */ + if (src_log_flags & XFS_ILOG_DOWNER) { + error = xfs_swap_change_owner(&tp, ip, tip); + if (error) + goto out_trans_cancel; + } + if (target_log_flags & XFS_ILOG_DOWNER) { + error = xfs_swap_change_owner(&tp, tip, ip); + if (error) + goto out_trans_cancel; + } + + /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 68a621a8e0c0..ce330f06563e 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -28,7 +28,20 @@ struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +#ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +#else /* !CONFIG_XFS_RT */ +/* + * Attempts to allocate RT extents when RT is disable indicates corruption and + * should trigger a shutdown. + */ +static inline int +xfs_bmap_rtalloc(struct xfs_bmalloca *ap) +{ + return -EFSCORRUPTED; +} +#endif /* CONFIG_XFS_RT */ + int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, @@ -63,8 +76,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); -int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, - bool need_iolock); +int xfs_free_eofblocks(struct xfs_inode *ip); int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, struct xfs_swapext *sx); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d7a67d7fbc7f..eca7baecc9f0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -96,12 +96,16 @@ static inline void xfs_buf_ioacct_inc( struct xfs_buf *bp) { - if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + if (bp->b_flags & XBF_NO_IOACCT) return; ASSERT(bp->b_flags & XBF_ASYNC); - bp->b_flags |= _XBF_IN_FLIGHT; - percpu_counter_inc(&bp->b_target->bt_io_count); + spin_lock(&bp->b_lock); + if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { + bp->b_state |= XFS_BSTATE_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); + } + spin_unlock(&bp->b_lock); } /* @@ -109,14 +113,24 @@ xfs_buf_ioacct_inc( * freed and unaccount from the buftarg. */ static inline void -xfs_buf_ioacct_dec( +__xfs_buf_ioacct_dec( struct xfs_buf *bp) { - if (!(bp->b_flags & _XBF_IN_FLIGHT)) - return; + lockdep_assert_held(&bp->b_lock); + + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); + } +} - bp->b_flags &= ~_XBF_IN_FLIGHT; - percpu_counter_dec(&bp->b_target->bt_io_count); +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + spin_unlock(&bp->b_lock); } /* @@ -148,9 +162,9 @@ xfs_buf_stale( * unaccounted (released to LRU) before that occurs. Drop in-flight * status now to preserve accounting consistency. */ - xfs_buf_ioacct_dec(bp); - spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) @@ -953,12 +967,12 @@ xfs_buf_rele( * ensures the decrement occurs only once per-buf. */ if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); goto out_unlock; } /* the last reference has been dropped ... */ - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* * If the buffer is added to the LRU take a new reference to the @@ -1052,6 +1066,8 @@ void xfs_buf_unlock( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1790,6 +1806,28 @@ error: } /* + * Cancel a delayed write list. + * + * Remove each buffer from the list, clear the delwri queue flag and drop the + * associated buffer reference. + */ +void +xfs_buf_delwri_cancel( + struct list_head *list) +{ + struct xfs_buf *bp; + + while (!list_empty(list)) { + bp = list_first_entry(list, struct xfs_buf, b_list); + + xfs_buf_lock(bp); + bp->b_flags &= ~_XBF_DELWRI_Q; + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } +} + +/* * Add a buffer to the delayed write list. * * This queues a buffer for writeout if it hasn't already been. Note that @@ -1984,6 +2022,66 @@ xfs_buf_delwri_submit( return error; } +/* + * Push a single buffer on a delwri queue. + * + * The purpose of this function is to submit a single buffer of a delwri queue + * and return with the buffer still on the original queue. The waiting delwri + * buffer submission infrastructure guarantees transfer of the delwri queue + * buffer reference to a temporary wait list. We reuse this infrastructure to + * transfer the buffer back to the original queue. + * + * Note the buffer transitions from the queued state, to the submitted and wait + * listed state and back to the queued state during this call. The buffer + * locking and queue management logic between _delwri_pushbuf() and + * _delwri_queue() guarantee that the buffer cannot be queued to another list + * before returning. + */ +int +xfs_buf_delwri_pushbuf( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + LIST_HEAD (submit_list); + int error; + + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + + trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); + + /* + * Isolate the buffer to a new local list so we can submit it for I/O + * independently from the rest of the original list. + */ + xfs_buf_lock(bp); + list_move(&bp->b_list, &submit_list); + xfs_buf_unlock(bp); + + /* + * Delwri submission clears the DELWRI_Q buffer flag and returns with + * the buffer on the wait list with an associated reference. Rather than + * bounce the buffer from a local wait list back to the original list + * after I/O completion, reuse the original list as the wait list. + */ + xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); + + /* + * The buffer is now under I/O and wait listed as during typical delwri + * submission. Lock the buffer to wait for I/O completion. Rather than + * remove the buffer from the wait list and release the reference, we + * want to return with the buffer queued to the original list. The + * buffer already sits on the original list with a wait list reference, + * however. If we let the queue inherit that wait list reference, all we + * need to do is reset the DELWRI_Q flag. + */ + xfs_buf_lock(bp); + error = bp->b_error; + bp->b_flags |= _XBF_DELWRI_Q; + xfs_buf_unlock(bp); + + return error; +} + int __init xfs_buf_init(void) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1c2e52b2d926..f961b19b9cc2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -63,7 +63,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -83,14 +82,14 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_IN_FLIGHT, "IN_FLIGHT" } + { _XBF_COMPOUND, "COMPOUND" } /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ /* * The xfs_buftarg contains 2 notions of "sector size" - @@ -330,9 +329,11 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t); extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ +extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); +extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2975cb2319f4..e0a0af0946f2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -29,6 +29,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_inode.h" kmem_zone_t *xfs_buf_item_zone; @@ -322,6 +323,8 @@ xfs_buf_item_format( ASSERT((bip->bli_flags & XFS_BLI_STALE) || (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || + (bip->bli_flags & XFS_BLI_STALE)); /* @@ -346,16 +349,6 @@ xfs_buf_item_format( bip->bli_flags &= ~XFS_BLI_INODE_BUF; } - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == - XFS_BLI_ORDERED) { - /* - * The buffer has been logged just to order it. It is not being - * included in the transaction commit, so don't format it. - */ - trace_xfs_buf_item_format_ordered(bip); - return; - } - for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); @@ -574,26 +567,20 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool clean; - bool aborted; - int flags; + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); +#if defined(DEBUG) || defined(XFS_WARN) + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); +#endif /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; /* - * If this is a transaction abort, don't return early. Instead, allow - * the brelse to happen. Normally it would be done for stale - * (cancelled) buffers at unpin time, but we'll never go through the - * pin/unpin cycle if we abort inside commit. - */ - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; - /* - * Before possibly freeing the buf item, copy the per-transaction state - * so we can reference it safely later after clearing it from the - * buffer log item. + * The per-transaction state has been copied above so clear it from the + * bli. */ - flags = bip->bli_flags; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* @@ -601,7 +588,7 @@ xfs_buf_item_unlock( * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (flags & XFS_BLI_STALE) { + if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -619,40 +606,34 @@ xfs_buf_item_unlock( * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. * - * Ordered buffers are dirty but may have no recorded changes, so ensure - * we only release clean items here. + * The bli dirty state should match whether the blf has logged segments + * except for ordered buffers, where only the bli should be dirty. */ - clean = (flags & XFS_BLI_DIRTY) ? false : true; - if (clean) { - int i; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = false; - break; - } - } - } + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || + (ordered && dirty && !xfs_buf_item_dirty_format(bip))); /* * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be dirty and hence in the AIL. Therefore if we are - * aborting a buffer and we've just taken the last refernce away, we - * have to check if it is in the AIL before freeing it. We need to free - * it in this case, because an aborted transaction has already shut the - * filesystem down and this is the last chance we will have to do so. + * buffers may be in the AIL regardless of dirty state. An aborted + * transaction that invalidates a buffer already in the AIL may have + * marked it stale and cleared the dirty state, for example. + * + * Therefore if we are aborting a buffer and we've just taken the last + * reference away, we have to check if it is in the AIL before freeing + * it. We need to free it in this case, because an aborted transaction + * has already shut the filesystem down and this is the last chance we + * will have to do so. */ if (atomic_dec_and_test(&bip->bli_refcount)) { - if (clean) - xfs_buf_item_relse(bp); - else if (aborted) { + if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } + } else if (!dirty) + xfs_buf_item_relse(bp); } - if (!(flags & XFS_BLI_HOLD)) + if (!hold) xfs_buf_relse(bp); } @@ -942,14 +923,22 @@ xfs_buf_item_log( /* - * Return 1 if the buffer has been logged or ordered in a transaction (at any - * point, not just the current transaction) and 0 if not. + * Return true if the buffer has any ranges logged/dirtied by a transaction, + * false otherwise. */ -uint -xfs_buf_item_dirty( - xfs_buf_log_item_t *bip) +bool +xfs_buf_item_dirty_format( + struct xfs_buf_log_item *bip) { - return (bip->bli_flags & XFS_BLI_DIRTY); + int i; + + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) + return true; + } + + return false; } STATIC void @@ -1051,6 +1040,31 @@ xfs_buf_do_callbacks( } } +/* + * Invoke the error state callback for each log item affected by the failed I/O. + * + * If a metadata buffer write fails with a non-permanent error, the buffer is + * eventually resubmitted and so the completion callbacks are not run. The error + * state may need to be propagated to the log items attached to the buffer, + * however, so the next AIL push of the item knows hot to handle it correctly. + */ +STATIC void +xfs_buf_do_callbacks_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *next; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + for (; lip; lip = next) { + next = lip->li_bio_list; + if (lip->li_ops->iop_error) + lip->li_ops->iop_error(lip, bp); + } + spin_unlock(&ailp->xa_lock); +} + static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) @@ -1120,7 +1134,11 @@ xfs_buf_iodone_callback_error( if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) goto permanent_error; - /* still a transient error, higher layers will retry */ + /* + * Still a transient error, run IO completion failure callbacks and let + * the higher layers retry the buffer. + */ + xfs_buf_do_callbacks_fail(bp); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; @@ -1162,6 +1180,7 @@ xfs_buf_iodone_callbacks( */ bp->b_last_error = 0; bp->b_retries = 0; + bp->b_first_retry_time = 0; xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; @@ -1200,3 +1219,31 @@ xfs_buf_iodone( xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } + +/* + * Requeue a failed buffer for writeback + * + * Return true if the buffer has been re-queued properly, false otherwise + */ +bool +xfs_buf_resubmit_failed_buffers( + struct xfs_buf *bp, + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_log_item *next; + + /* + * Clear XFS_LI_FAILED flag from all items before resubmit + * + * XFS_LI_FAILED set/clear is protected by xa_lock, caller this + * function already have it acquired + */ + for (; lip; lip = next) { + next = lip->li_bio_list; + xfs_clear_li_failed(lip); + } + + /* Add this buffer back to the delayed write list */ + return xfs_buf_delwri_queue(bp, buffer_list); +} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index f7eba99d19dd..9690ce62c9a7 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); -uint xfs_buf_item_dirty(xfs_buf_log_item_t *); +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, + struct xfs_log_item *, + struct list_head *); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 29816981b50a..eba63160be6c 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -71,22 +71,11 @@ xfs_dir2_sf_getdents( struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) - return -EFSCORRUPTED; - /* * If the block number in the offset is out of range, we're done. */ @@ -405,6 +394,7 @@ xfs_dir2_leaf_readbuf( /* * Do we need more readahead? + * Each loop tries to process 1 full dir blk; last may be partial. */ blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; @@ -415,7 +405,8 @@ xfs_dir2_leaf_readbuf( * Read-ahead a contiguous directory block. */ if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= geo->fsbcount) { + (map[mip->ra_index].br_blockcount - mip->ra_offset) >= + geo->fsbcount) { xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(dp->i_mount, @@ -436,14 +427,19 @@ xfs_dir2_leaf_readbuf( } /* - * Advance offset through the mapping table. + * Advance offset through the mapping table, processing a full + * dir block even if it is fragmented into several extents. + * But stop if we have consumed all valid mappings, even if + * it's not yet a full directory block. */ - for (j = 0; j < geo->fsbcount; j += length ) { + for (j = 0; + j < geo->fsbcount && mip->ra_index < mip->map_valid; + j += length ) { /* * The rest of this extent but not more than a dir * block. */ - length = min_t(int, geo->fsbcount, + length = min_t(int, geo->fsbcount - j, map[mip->ra_index].br_blockcount - mip->ra_offset); mip->ra_offset += length; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed7ee4e8af73..bcf72970ca6b 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -167,7 +167,7 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", __return_address, bp->b_ops->name, bp->b_bn); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 162dc186cf04..29c2f997aedf 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -45,18 +45,7 @@ xfs_extent_busy_insert( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); - if (!new) { - /* - * No Memory! Since it is now not possible to track the free - * block, make this a synchronous transaction to insure that - * the block is not reused before this transaction commits. - */ - trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); - xfs_trans_set_sync(tp); - return; - } - + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); new->agno = agno; new->bno = bno; new->length = len; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9a5d64b5f35a..362c6b4c1186 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -92,7 +92,7 @@ xfs_zero_range( xfs_off_t count, bool *did_zero) { - return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); + return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops); } int @@ -554,6 +554,15 @@ xfs_file_dio_aio_write( if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) { unaligned_io = 1; + + /* + * We can't properly handle unaligned direct I/O to reflink + * files yet, as we can't unshare a partial block. + */ + if (xfs_is_reflink_inode(ip)) { + trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); + return -EREMCHG; + } iolock = XFS_IOLOCK_EXCL; } else { iolock = XFS_IOLOCK_SHARED; @@ -675,8 +684,10 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; - int iolock = XFS_IOLOCK_EXCL; + int iolock; +write_retry: + iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); @@ -686,7 +697,6 @@ xfs_file_buffered_aio_write( /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); -write_retry: trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) @@ -702,26 +712,31 @@ write_retry: * running at the same time. */ if (ret == -EDQUOT && !enospc) { + xfs_rw_iunlock(ip, iolock); enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; enospc = xfs_inode_free_quota_cowblocks(ip); if (enospc) goto write_retry; + iolock = 0; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; enospc = 1; xfs_flush_inodes(ip->i_mount); - eofb.eof_scan_owner = ip->i_ino; /* for locking */ + + xfs_rw_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); + xfs_icache_free_cowblocks(ip->i_mount, &eofb); goto write_retry; } current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + if (iolock) + xfs_rw_iunlock(ip, iolock); return ret; } @@ -809,7 +824,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; if (offset & blksize_mask || len & blksize_mask) { error = -EINVAL; @@ -831,7 +846,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; new_size = i_size_read(inode) + len; if (offset & blksize_mask || len & blksize_mask) { @@ -987,9 +1002,9 @@ xfs_dir_open( */ mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir3_data_readahead(ip, 0, -1); + error = xfs_dir3_data_readahead(ip, 0, -1); xfs_iunlock(ip, mode); - return 0; + return error; } STATIC int @@ -1116,49 +1131,17 @@ xfs_find_get_desired_pgoff( index = startoff >> PAGE_SHIFT; endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; do { int want; unsigned nr_pages; unsigned int i; - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); - /* - * No page mapped into given range. If we are searching holes - * and if this is the first time we got into the loop, it means - * that the given offset is landed in a hole, return it. - * - * If we have already stepped through some block buffers to find - * holes but they all contains data. In this case, the last - * offset is already updated and pointed to the end of the last - * mapped page, if it does not reach the endpoint to search, - * that means there should be a hole between them. - */ - if (nr_pages == 0) { - /* Data search found nothing */ - if (type == DATA_OFF) - break; - - ASSERT(type == HOLE_OFF); - if (lastoff == startoff || lastoff < endoff) { - found = true; - *offset = lastoff; - } + if (nr_pages == 0) break; - } - - /* - * At lease we found one page. If this is the first time we - * step into the loop, and if the first page index offset is - * greater than the given search offset, a hole was found. - */ - if (type == HOLE_OFF && lastoff == startoff && - lastoff < page_offset(pvec.pages[0])) { - found = true; - break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1171,18 +1154,18 @@ xfs_find_get_desired_pgoff( * file mapping. However, page->index will not change * because we have a reference on the page. * - * Searching done if the page index is out of range. - * If the current offset is not reaches the end of - * the specified search range, there should be a hole - * between them. + * If current page offset is beyond where we've ended, + * we've found a hole. */ - if (page->index > end) { - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } + if (type == HOLE_OFF && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { + found = true; + *offset = lastoff; goto out; } + /* Searching done if the page index is out of range. */ + if (page->index > end) + goto out; lock_page(page); /* @@ -1224,21 +1207,20 @@ xfs_find_get_desired_pgoff( /* * The number of returned pages less than our desired, search - * done. In this case, nothing was found for searching data, - * but we found a hole behind the last offset. + * done. */ - if (nr_pages < want) { - if (type == HOLE_OFF) { - *offset = lastoff; - found = true; - } + if (nr_pages < want) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* No page at lastoff and we are not done - we found a hole. */ + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } out: pagevec_release(&pvec); return found; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 29cc9886a3cb..86a4911520cc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -66,7 +66,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); @@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (pag->pag_ici_reclaimable++) return; @@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (--pag->pag_ici_reclaimable) return; @@ -264,6 +263,22 @@ xfs_inode_clear_reclaim_tag( xfs_perag_clear_reclaim_tag(pag); } +static void +xfs_inew_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (!xfs_iflags_test(ip, XFS_INEW)) + break; + schedule(); + } while (true); + finish_wait(wq, &wait.wait); +} + /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store @@ -368,14 +383,17 @@ xfs_iget_cache_hit( error = xfs_reinit_inode(mp, inode); if (error) { + bool wake; /* * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ rcu_read_lock(); spin_lock(&ip->i_flags_lock); - + wake = !!__xfs_iflags_test(ip, XFS_INEW); ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); ASSERT(ip->i_flags & XFS_IRECLAIMABLE); trace_xfs_iget_reclaim_fail(ip); goto out_error; @@ -625,9 +643,11 @@ out_error_or_again: STATIC int xfs_inode_ag_walk_grab( - struct xfs_inode *ip) + struct xfs_inode *ip, + int flags) { struct inode *inode = VFS_I(ip); + bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -645,7 +665,8 @@ xfs_inode_ag_walk_grab( goto out_unlock_noent; /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || + __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock); @@ -673,7 +694,8 @@ xfs_inode_ag_walk( void *args), int flags, void *args, - int tag) + int tag, + int iter_flags) { uint32_t first_index; int last_error = 0; @@ -715,7 +737,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip)) + if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -743,6 +765,9 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; + if ((iter_flags & XFS_AGITER_INEW_WAIT) && + xfs_iflags_test(batch[i], XFS_INEW)) + xfs_inew_wait(batch[i]); error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == -EAGAIN) { @@ -822,12 +847,13 @@ xfs_cowblocks_worker( } int -xfs_inode_ag_iterator( +xfs_inode_ag_iterator_flags( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, - void *args) + void *args, + int iter_flags) { struct xfs_perag *pag; int error = 0; @@ -837,7 +863,8 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, + iter_flags); xfs_perag_put(pag); if (error) { last_error = error; @@ -849,6 +876,17 @@ xfs_inode_ag_iterator( } int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); +} + +int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, @@ -865,7 +903,8 @@ xfs_inode_ag_iterator_tag( ag = 0; while ((pag = xfs_perag_get_tag(mp, ag, tag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, + 0); xfs_perag_put(pag); if (error) { last_error = error; @@ -1039,11 +1078,11 @@ reclaim: * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. * We do this as early as possible under the ILOCK so that - * xfs_iflush_cluster() can be guaranteed to detect races with us here. - * By doing this, we guarantee that once xfs_iflush_cluster has locked - * XFS_ILOCK that it will see either a valid, flushable inode that will - * serialise correctly, or it will see a clean (and invalid) inode that - * it can skip. + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to + * detect races with us here. By doing this, we guarantee that once + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that + * it will see either a valid inode that will serialise correctly, or it + * will see an invalid inode that it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; @@ -1324,13 +1363,10 @@ xfs_inode_free_eofblocks( int flags, void *args) { - int ret; + int ret = 0; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ trace_xfs_inode_free_eofblocks_invalid(ip); @@ -1358,21 +1394,19 @@ xfs_inode_free_eofblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } - ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); - - /* don't revisit the inode if we're not waiting */ - if (ret == -EAGAIN && !(flags & SYNC_WAIT)) - ret = 0; + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. + */ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (flags & SYNC_WAIT) + ret = -EAGAIN; + return ret; + } + ret = xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -1419,15 +1453,10 @@ __xfs_inode_free_quota_eofblocks( struct xfs_eofblocks eofb = {0}; struct xfs_dquot *dq; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* - * Set the scan owner to avoid a potential livelock. Otherwise, the scan - * can repeatedly trylock on the inode we're currently processing. We - * run a sync scan to increase effectiveness and use the union filter to + * Run a sync scan to increase effectiveness and use the union filter to * cover all applicable quotas in a single scan. */ - eofb.eof_scan_owner = ip->i_ino; eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { @@ -1579,12 +1608,9 @@ xfs_inode_free_cowblocks( { int ret; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1617,28 +1643,16 @@ xfs_inode_free_cowblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } /* Free the CoW blocks */ - if (need_iolock) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - } + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); - if (need_iolock) { - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index a1e02f4708ab..9183f77958ef 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -27,7 +27,6 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; - xfs_ino_t eof_scan_owner; }; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ @@ -49,6 +48,11 @@ struct xfs_eofblocks { #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 +/* + * flags for AG inode iterator + */ +#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -80,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); +int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int iter_flags); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); @@ -102,7 +109,6 @@ xfs_fs_eofblocks_from_user( dst->eof_flags = src->eof_flags; dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size; - dst->eof_scan_owner = NULLFSINO; dst->eof_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 512ff13ed66a..fe9a9a183b2d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -50,6 +50,7 @@ #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_reflink.h" +#include "xfs_dir2_priv.h" kmem_zone_t *xfs_inode_zone; @@ -880,7 +881,6 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint64_t di_flags2 = 0; uint di_flags = 0; if (S_ISDIR(mode)) { @@ -917,20 +917,23 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; ip->i_d.di_flags |= di_flags; - ip->i_d.di_flags2 |= di_flags2; } if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && pip->i_d.di_version == 3 && ip->i_d.di_version == 3) { + uint64_t di_flags2 = 0; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: @@ -1624,15 +1627,17 @@ xfs_itruncate_extents( /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, - last_block); + last_block, true); if (error) goto out; /* - * Clear the reflink flag if we truncated everything. + * Clear the reflink flag if there are no data fork blocks and + * there are no extents staged in the cow fork. */ - if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { + if (ip->i_d.di_nblocks == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); } @@ -1701,32 +1706,34 @@ xfs_release( if (xfs_can_free_eofblocks(ip, false)) { /* + * Check if the inode is being opened, written and closed + * frequently and we have delayed allocation blocks outstanding + * (e.g. streaming writes from the NFS server), truncating the + * blocks past EOF will cause fragmentation to occur. + * + * In this case don't do the truncation, but we have to be + * careful how we detect this case. Blocks beyond EOF show up as + * i_delayed_blks even when the inode is clean, so we need to + * truncate them away first before checking for a dirty release. + * Hence on the first dirty close we will still remove the + * speculative allocation, but after that we will leave it in + * place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + /* * If we can't get the iolock just skip truncating the blocks * past EOF because we could deadlock with the mmap_sem - * otherwise. We'll get another chance to drop them once the + * otherwise. We'll get another chance to drop them once the * last reference to the inode is dropped, so we'll never leak * blocks permanently. - * - * Further, check if the inode is being opened, written and - * closed frequently and we have delayed allocation blocks - * outstanding (e.g. streaming writes from the NFS server), - * truncating the blocks past EOF will cause fragmentation to - * occur. - * - * In this case don't do the truncation, either, but we have to - * be careful how we detect this case. Blocks beyond EOF show - * up as i_delayed_blks even when the inode is clean, so we - * need to truncate them away first before checking for a dirty - * release. Hence on the first dirty close we will still remove - * the speculative allocation, but after that we will leave it - * in place. */ - if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - return 0; - - error = xfs_free_eofblocks(mp, ip, true); - if (error && error != -EAGAIN) - return error; + if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + error = xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + if (error) + return error; + } /* delalloc blocks after truncation means it really is dirty */ if (ip->i_delayed_blks) @@ -1801,22 +1808,23 @@ xfs_inactive_ifree( int error; /* - * The ifree transaction might need to allocate blocks for record - * insertion to the finobt. We don't want to fail here at ENOSPC, so - * allow ifree to dip into the reserved block pool if necessary. - * - * Freeing large sets of inodes generally means freeing inode chunks, - * directory and file data blocks, so this should be relatively safe. - * Only under severe circumstances should it be possible to free enough - * inodes to exhaust the reserve block pool via finobt expansion while - * at the same time not creating free space in the filesystem. + * We try to use a per-AG reservation for any block needed by the finobt + * tree, but as the finobt feature predates the per-AG reservation + * support a degraded file system might not have enough space for the + * reservation at mount time. In that case try to dip into the reserved + * pool and pray. * * Send a warning if the reservation does happen to fail, as the inode * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (unlikely(mp->m_inotbt_nores)) { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, + &tp); + } else { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); + } if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, @@ -1911,9 +1919,13 @@ xfs_inactive( * force is true because we are evicting an inode from the * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. + * + * Note: don't bother with iolock here since lockdep complains + * about acquiring it in reclaim context. We have the only + * reference to the inode at this point anyways. */ if (xfs_can_free_eofblocks(ip, true)) - xfs_free_eofblocks(mp, ip, false); + xfs_free_eofblocks(ip); return; } @@ -2358,11 +2370,24 @@ retry: * already marked stale. If we can't lock it, back off * and retry. */ - if (ip != free_ip && - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're + * racing with freeing in xfs_reclaim_inode(). + * See the comments in that function for more + * information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum + i) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } } rcu_read_unlock(); @@ -3556,6 +3581,12 @@ xfs_iflush_int( if (ip->i_d.di_version < 3) ip->i_d.di_flushiter++; + /* Check the inline directory data. */ + if (S_ISDIR(VFS_I(ip)->i_mode) && + ip->i_d.di_format == XFS_DINODE_FMT_LOCAL && + xfs_dir2_sf_verify(ip)) + goto corrupt_out; + /* * Copy the dirty parts of the inode into the on-disk inode. We always * copy out the core of the inode, because if the inode is dirty at all diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 71e8a81c91a3..c038f6eecc28 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -217,7 +217,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define __XFS_INEW_BIT 3 /* inode has just been allocated */ +#define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -467,6 +468,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d90e7811ccdd..d0a3c4bd2c38 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -27,6 +27,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_log.h" @@ -363,6 +364,9 @@ xfs_inode_to_log_dinode( to->di_dmstate = from->di_dmstate; to->di_flags = from->di_flags; + /* log a dummy value to ensure log structure is fully initialised */ + to->di_next_unlinked = NULLAGINO; + if (from->di_version == 3) { to->di_changecount = inode->i_version; to->di_crtime.t_sec = from->di_crtime.t_sec; @@ -403,6 +407,11 @@ xfs_inode_item_format_core( * the second with the on-disk inode structure, and a possible third and/or * fourth with the inode data/extents/b-tree root and inode attributes * data/extents/b-tree root. + * + * Note: Always use the 64 bit inode log format structure so we don't + * leave an uninitialised hole in the format item on 64 bit systems. Log + * recovery on 32 bit systems handles this just fine, so there's no reason + * for not using an initialising the properly padded structure all the time. */ STATIC void xfs_inode_item_format( @@ -411,8 +420,8 @@ xfs_inode_item_format( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_inode_log_format *ilf; struct xfs_log_iovec *vecp = NULL; + struct xfs_inode_log_format *ilf; ASSERT(ip->i_d.di_version > 1); @@ -424,7 +433,17 @@ xfs_inode_item_format( ilf->ilf_boffset = ip->i_imap.im_boffset; ilf->ilf_fields = XFS_ILOG_CORE; ilf->ilf_size = 2; /* format + core */ - xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + /* + * make sure we don't leak uninitialised data into the log in the case + * when we don't log every field in the inode. + */ + ilf->ilf_dsize = 0; + ilf->ilf_asize = 0; + ilf->ilf_pad = 0; + memset(&ilf->ilf_u.ilfu_uuid, 0, sizeof(ilf->ilf_u.ilfu_uuid)); + + xlog_finish_iovec(lv, vecp, sizeof(*ilf)); xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); @@ -475,6 +494,23 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * This informs the AIL that the inode is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_inode_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -484,13 +520,28 @@ xfs_inode_item_push( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO. + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; @@ -622,7 +673,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_committing = xfs_inode_item_committing + .iop_committing = xfs_inode_item_committing, + .iop_error = xfs_inode_item_error }; @@ -710,7 +762,8 @@ xfs_iflush_done( * the AIL lock. */ iip = INODE_ITEM(blip); - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || + (blip->li_flags & XFS_LI_FAILED)) need_ail++; blip = next; @@ -718,7 +771,8 @@ xfs_iflush_done( /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; /* @@ -731,22 +785,30 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. */ if (need_ail) { - struct xfs_log_item *log_items[need_ail]; - int i = 0; + bool mlip_changed = false; + + /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->xa_lock); for (blip = lip; blip; blip = blip->li_bio_list) { - iip = INODE_ITEM(blip); - if (iip->ili_logged && - blip->li_lsn == iip->ili_flush_lsn) { - log_items[i++] = blip; + if (INODE_ITEM(blip)->ili_logged && + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) + mlip_changed |= xfs_ail_delete_one(ailp, blip); + else { + xfs_clear_li_failed(blip); } - ASSERT(i <= need_ail); } - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i, - SHUTDOWN_CORRUPT_INCORE); - } + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + } + spin_unlock(&ailp->xa_lock); + + if (mlip_changed) + xfs_log_space_wake(ailp->xa_mount); + } /* * clean up and unlock the flush lock now we are done. We can clear the @@ -811,48 +873,30 @@ xfs_istale_done( } /* - * convert an xfs_inode_log_format struct from either 32 or 64 bit versions - * (which can have different field alignments) to the native version + * convert an xfs_inode_log_format struct from the old 32 bit version + * (which can have different field alignments) to the native 64 bit version */ int xfs_inode_item_format_convert( - xfs_log_iovec_t *buf, - xfs_inode_log_format_t *in_f) + struct xfs_log_iovec *buf, + struct xfs_inode_log_format *in_f) { - if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { - xfs_inode_log_format_32_t *in_f32 = buf->i_addr; - - in_f->ilf_type = in_f32->ilf_type; - in_f->ilf_size = in_f32->ilf_size; - in_f->ilf_fields = in_f32->ilf_fields; - in_f->ilf_asize = in_f32->ilf_asize; - in_f->ilf_dsize = in_f32->ilf_dsize; - in_f->ilf_ino = in_f32->ilf_ino; - /* copy biggest field of ilf_u */ - memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, - in_f32->ilf_u.ilfu_uuid.__u_bits, - sizeof(uuid_t)); - in_f->ilf_blkno = in_f32->ilf_blkno; - in_f->ilf_len = in_f32->ilf_len; - in_f->ilf_boffset = in_f32->ilf_boffset; - return 0; - } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ - xfs_inode_log_format_64_t *in_f64 = buf->i_addr; - - in_f->ilf_type = in_f64->ilf_type; - in_f->ilf_size = in_f64->ilf_size; - in_f->ilf_fields = in_f64->ilf_fields; - in_f->ilf_asize = in_f64->ilf_asize; - in_f->ilf_dsize = in_f64->ilf_dsize; - in_f->ilf_ino = in_f64->ilf_ino; - /* copy biggest field of ilf_u */ - memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, - in_f64->ilf_u.ilfu_uuid.__u_bits, - sizeof(uuid_t)); - in_f->ilf_blkno = in_f64->ilf_blkno; - in_f->ilf_len = in_f64->ilf_len; - in_f->ilf_boffset = in_f64->ilf_boffset; - return 0; - } - return -EFSCORRUPTED; + struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; + + if (buf->i_len != sizeof(*in_f32)) + return -EFSCORRUPTED; + + in_f->ilf_type = in_f32->ilf_type; + in_f->ilf_size = in_f32->ilf_size; + in_f->ilf_fields = in_f32->ilf_fields; + in_f->ilf_asize = in_f32->ilf_asize; + in_f->ilf_dsize = in_f32->ilf_dsize; + in_f->ilf_ino = in_f32->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f32->ilf_u.ilfu_uuid.__u_bits, sizeof(uuid_t)); + in_f->ilf_blkno = in_f32->ilf_blkno; + in_f->ilf_len = in_f32->ilf_len; + in_f->ilf_boffset = in_f32->ilf_boffset; + return 0; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a39197501a7c..6c95812120eb 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr( return 0; } -STATIC void -xfs_set_diflags( +STATIC uint16_t +xfs_flags2diflags( struct xfs_inode *ip, unsigned int xflags) { - unsigned int di_flags; - uint64_t di_flags2; - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + uint16_t di_flags = + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) @@ -967,19 +966,24 @@ xfs_set_diflags( if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; - /* diflags2 only valid for v3 inodes. */ - if (ip->i_d.di_version < 3) - return; + return di_flags; +} + +STATIC uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_flags2 = di_flags2; + return di_flags2; } STATIC void @@ -1005,11 +1009,12 @@ xfs_diflags_to_linux( inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; +#if 0 /* disabled until the flag switching races are sorted out */ if (xflags & FS_XFLAG_DAX) inode->i_flags |= S_DAX; else inode->i_flags &= ~S_DAX; - +#endif } static int @@ -1019,6 +1024,7 @@ xfs_ioctl_setattr_xflags( struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; + uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && @@ -1049,7 +1055,14 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - xfs_set_diflags(ip, fa->fsx_xflags); + /* diflags2 only valid for v3 inodes. */ + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (di_flags2 && ip->i_d.di_version < 3) + return -EINVAL; + + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); + ip->i_d.di_flags2 = di_flags2; + xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1072,6 +1085,7 @@ xfs_ioctl_setattr_dax_invalidate( int *join_flags) { struct inode *inode = VFS_I(ip); + struct super_block *sb = inode->i_sb; int error; *join_flags = 0; @@ -1084,7 +1098,7 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + if (bdev_dax_supported(sb, sb->s_blocksize) < 0) return -EINVAL; } @@ -1542,10 +1556,11 @@ xfs_ioc_getbmap( unsigned int cmd, void __user *arg) { - struct getbmapx bmx; + struct getbmapx bmx = { 0 }; int error; - if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + /* struct getbmap is a strict subset of struct getbmapx. */ + if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) return -EFAULT; if (bmx.bmv_count < 2) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index cdc6bdd495be..f286f63c430c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -637,6 +637,11 @@ retry: goto out_unlock; } + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ + iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -685,7 +690,7 @@ xfs_iomap_write_allocate( int nres; if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK; + flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; /* * Make sure that the dquots are there. @@ -831,7 +836,8 @@ int xfs_iomap_write_unwritten( xfs_inode_t *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool update_isize) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -842,6 +848,7 @@ xfs_iomap_write_unwritten( xfs_trans_t *tp; xfs_bmbt_irec_t imap; struct xfs_defer_ops dfops; + struct inode *inode = VFS_I(ip); xfs_fsize_t i_size; uint resblks; int error; @@ -901,7 +908,8 @@ xfs_iomap_write_unwritten( i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); if (i_size > offset + count) i_size = offset + count; - + if (update_isize && i_size > i_size_read(inode)) + i_size_write(inode, i_size); i_size = xfs_new_eof(ip, i_size); if (i_size) { ip->i_d.di_size = i_size; @@ -1061,25 +1069,37 @@ xfs_file_iomap_end_delalloc( struct xfs_inode *ip, loff_t offset, loff_t length, - ssize_t written) + ssize_t written, + struct iomap *iomap) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_fsb; xfs_fileoff_t end_fsb; int error = 0; - start_fsb = XFS_B_TO_FSB(mp, offset + written); + /* + * start_fsb refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in + * the range. + */ + if (unlikely(!written)) + start_fsb = XFS_B_TO_FSBT(mp, offset); + else + start_fsb = XFS_B_TO_FSB(mp, offset + written); end_fsb = XFS_B_TO_FSB(mp, offset + length); /* - * Trim back delalloc blocks if we didn't manage to write the whole - * range reserved. + * Trim delalloc blocks if they were allocated by this write and we + * didn't manage to write the whole range. * * We don't need to care about racing delalloc as we hold i_mutex * across the reserve/allocate/unreserve calls. If there are delalloc * blocks in the range, they are ours. */ - if (start_fsb < end_fsb) { + if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { + truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), + XFS_FSB_TO_B(mp, end_fsb) - 1); + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, end_fsb - start_fsb); @@ -1106,7 +1126,7 @@ xfs_file_iomap_end( { if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, - length, written); + length, written, iomap); return 0; } @@ -1134,10 +1154,10 @@ xfs_xattr_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - lockmode = xfs_ilock_data_map_shared(ip); + lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { error = -ENOENT; goto out_unlock; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 6d45cf01fcff..d71703af5c76 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); -int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f5e0f608e245..5b81f7f41b80 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -190,12 +190,12 @@ xfs_generic_create( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } @@ -802,7 +802,7 @@ xfs_vn_setattr_nonsize( * Caution: The caller of this function is responsible for calling * setattr_prepare() or otherwise verifying the change is fine. */ -int +STATIC int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 66e881790c17..26d67ce3c18d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -361,7 +361,6 @@ xfs_bulkstat( xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ - size_t irbsize; /* size of irec buffer in bytes */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ int nirbuf; /* size of irbuf */ int ubcount; /* size of user's buffer */ @@ -388,11 +387,10 @@ xfs_bulkstat( *ubcountp = 0; *done = 0; - irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP); if (!irbuf) return -ENOMEM; - - nirbuf = irbsize / sizeof(*irbuf); + nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf); /* * Loop over the allocation groups, starting from the last @@ -585,7 +583,7 @@ xfs_inumbers( return error; bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); - buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); + buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP); do { struct xfs_inobt_rec_incore r; int stat; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 1455b25205a8..3ebed168e508 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -363,7 +363,14 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #endif /* DEBUG */ #ifdef CONFIG_XFS_RT -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) + +/* + * make sure we ignore the inode flag if the filesystem doesn't have a + * configured realtime device. + */ +#define XFS_IS_REALTIME_INODE(ip) \ + (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ + (ip)->i_mount->m_rtdev_targp) #else #define XFS_IS_REALTIME_INODE(ip) (0) #endif diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4017aa967331..33c9a3aae948 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -743,15 +743,45 @@ xfs_log_mount_finish( struct xfs_mount *mp) { int error = 0; + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } else if (readonly) { + /* Allow unlinked processing to proceed */ + mp->m_flags &= ~XFS_MOUNT_RDONLY; } + /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. Turn it off immediately after recovery finishes + * so that we don't leak the quota inodes if subsequent mount + * activities fail. + * + * We let all inodes involved in redo item processing end up on + * the LRU instead of being evicted immediately so that if we do + * something to an unlinked inode, the irele won't cause + * premature truncation and freeing of the inode, which results + * in log recovery failure. We have to evict the unreferenced + * lru inodes after clearing MS_ACTIVE because we don't + * otherwise clean up the lru if there's a subsequent failure in + * xfs_mountfs, which leads to us leaking the inodes if nothing + * else (e.g. quotacheck) references the inodes before the + * mount failure occurs. + */ + mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); + mp->m_super->s_flags &= ~MS_ACTIVE; + evict_inodes(mp->m_super); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; return error; } @@ -801,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) int error; /* - * Don't write out unmount record on read-only mounts. + * Don't write out unmount record on norecovery mounts or ro devices. * Or, if we are doing a forced umount (typically because of IO errors). */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (mp->m_flags & XFS_MOUNT_NORECOVERY || + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); @@ -1293,7 +1326,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } @@ -3304,8 +3337,6 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - if (log_flushed) - *log_flushed = 1; } else { no_sleep: @@ -3409,8 +3440,6 @@ try_again: xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); - if (log_flushed) - *log_flushed = 1; already_slept = 1; goto try_again; } @@ -3444,9 +3473,6 @@ try_again: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - - if (log_flushed) - *log_flushed = 1; } else { /* just return */ spin_unlock(&log->l_icloglock); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d..05909269f973 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1029,61 +1029,106 @@ out_error: } /* - * Check the log tail for torn writes. This is required when torn writes are - * detected at the head and the head had to be walked back to a previous record. - * The tail of the previous record must now be verified to ensure the torn - * writes didn't corrupt the previous tail. + * Calculate distance from head to tail (i.e., unused space in the log). + */ +static inline int +xlog_tail_distance( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + if (head_blk < tail_blk) + return tail_blk - head_blk; + + return tail_blk + (log->l_logBBsize - head_blk); +} + +/* + * Verify the log tail. This is particularly important when torn or incomplete + * writes have been detected near the front of the log and the head has been + * walked back accordingly. * - * Return an error if CRC verification fails as recovery cannot proceed. + * We also have to handle the case where the tail was pinned and the head + * blocked behind the tail right before a crash. If the tail had been pushed + * immediately prior to the crash and the subsequent checkpoint was only + * partially written, it's possible it overwrote the last referenced tail in the + * log with garbage. This is not a coherency problem because the tail must have + * been pushed before it can be overwritten, but appears as log corruption to + * recovery because we have no way to know the tail was updated if the + * subsequent checkpoint didn't write successfully. + * + * Therefore, CRC check the log from tail to head. If a failure occurs and the + * offending record is within max iclog bufs from the head, walk the tail + * forward and retry until a valid tail is found or corruption is detected out + * of the range of a possible overwrite. */ STATIC int xlog_verify_tail( struct xlog *log, xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + xfs_daddr_t *tail_blk, + int hsize) { struct xlog_rec_header *thead; struct xfs_buf *bp; xfs_daddr_t first_bad; - int count; int error = 0; bool wrapped; - xfs_daddr_t tmp_head; + xfs_daddr_t tmp_tail; + xfs_daddr_t orig_tail = *tail_blk; bp = xlog_get_bp(log, 1); if (!bp) return -ENOMEM; /* - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get - * a temporary head block that points after the last possible - * concurrently written record of the tail. + * Make sure the tail points to a record (returns positive count on + * success). */ - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, - &wrapped); - if (count < 0) { - error = count; + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) goto out; - } - - /* - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran - * into the actual log head. tmp_head points to the start of the record - * so update it to the actual head block. - */ - if (count < XLOG_MAX_ICLOGS + 1) - tmp_head = head_blk; + if (*tail_blk != tmp_tail) + *tail_blk = tmp_tail; /* - * We now have a tail and temporary head block that covers at least - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these - * records were completely written. Run a CRC verification pass from - * tail to head and return the result. + * Run a CRC check from the tail to the head. We can't just check + * MAX_ICLOGS records past the tail because the tail may point to stale + * blocks cleared during the search for the head/tail. These blocks are + * overwritten with zero-length records and thus record count is not a + * reliable indicator of the iclog state before a crash. */ - error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { + int tail_distance; + + /* + * Is corruption within range of the head? If so, retry from + * the next record. Otherwise return an error. + */ + tail_distance = xlog_tail_distance(log, head_blk, first_bad); + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) + break; + + /* skip to the next record; returns positive count on success */ + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) + goto out; + + *tail_blk = tmp_tail; + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + } + if (!error && *tail_blk != orig_tail) + xfs_warn(log->l_mp, + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", + orig_tail, *tail_blk); out: xlog_put_bp(bp); return error; @@ -1143,7 +1188,7 @@ xlog_verify_head( */ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, XLOG_RECOVER_CRCPASS, &first_bad); - if (error == -EFSBADCRC) { + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { /* * We've hit a potential torn write. Reset the error and warn * about it. @@ -1183,31 +1228,12 @@ xlog_verify_head( ASSERT(0); return 0; } - - /* - * Now verify the tail based on the updated head. This is - * required because the torn writes trimmed from the head could - * have been written over the tail of a previous record. Return - * any errors since recovery cannot proceed if the tail is - * corrupt. - * - * XXX: This leaves a gap in truly robust protection from torn - * writes in the log. If the head is behind the tail, the tail - * pushes forward to create some space and then a crash occurs - * causing the writes into the previous record's tail region to - * tear, log recovery isn't able to recover. - * - * How likely is this to occur? If possible, can we do something - * more intelligent here? Is it safe to push the tail forward if - * we can determine that the tail is within the range of the - * torn write (e.g., the kernel can only overwrite the tail if - * it has actually been pushed forward)? Alternatively, could we - * somehow prevent this condition at runtime? - */ - error = xlog_verify_tail(log, *head_blk, *tail_blk); } + if (error) + return error; - return error; + return xlog_verify_tail(log, *head_blk, tail_blk, + be32_to_cpu((*rhead)->h_size)); } /* @@ -4152,7 +4178,7 @@ xlog_recover_commit_trans( #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 - hlist_del(&trans->r_list); + hlist_del_init(&trans->r_list); error = xlog_recover_reorder_trans(log, trans, pass); if (error) @@ -4354,6 +4380,8 @@ xlog_recover_free_trans( xlog_recover_item_t *item, *n; int i; + hlist_del_init(&trans->r_list); + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { /* Free the regions in the item. */ list_del(&item->ri_list); @@ -4799,12 +4827,16 @@ xlog_recover_process_intents( int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; +#if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; +#endif ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); +#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); +#endif while (lip != NULL) { /* * We're done when we see something other than an intent. @@ -5214,7 +5246,7 @@ xlog_do_recovery_pass( xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; - xfs_daddr_t blk_no; + xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; @@ -5222,11 +5254,15 @@ xlog_do_recovery_pass( int error2 = 0; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; + int i; struct hlist_head rhash[XLOG_RHASH_SIZE]; LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); - rhead_blk = 0; + blk_no = rhead_blk = tail_blk; + + for (i = 0; i < XLOG_RHASH_SIZE; i++) + INIT_HLIST_HEAD(&rhash[i]); /* * Read the header of the tail block and get the iclog buffer size from @@ -5301,7 +5337,6 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. @@ -5363,9 +5398,19 @@ xlog_do_recovery_pass( bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; - /* Read in data for log record */ - if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp, + /* + * Read the log record data in multiple reads if it + * wraps around the end of the log. Note that if the + * header already wrapped, blk_no could point past the + * end of the log. The record data is contiguous in + * that case. + */ + if (blk_no + bblks <= log->l_logBBsize || + blk_no >= log->l_logBBsize) { + /* mod blk_no in case the header wrapped and + * pushed it beyond the end of the log */ + rblk_no = do_mod(blk_no, log->l_logBBsize); + error = xlog_bread(log, rblk_no, bblks, dbp, &offset); if (error) goto bread_err2; @@ -5464,6 +5509,19 @@ xlog_do_recovery_pass( if (error && first_bad) *first_bad = rhead_blk; + /* + * Transactions are freed at commit time but transactions without commit + * records on disk are never committed. Free any that may be left in the + * hash table. + */ + for (i = 0; i < XLOG_RHASH_SIZE; i++) { + struct hlist_node *tmp; + struct xlog_recover *trans; + + hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) + xlog_recover_free_trans(trans); + } + return error ? error : error2; } @@ -5542,6 +5600,8 @@ xlog_do_recover( xfs_buf_t *bp; xfs_sb_t *sbp; + trace_xfs_log_recover(log, head_blk, tail_blk); + /* * First replay the images in the log. */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b341f10cf481..d4ce8d277992 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -502,8 +502,7 @@ STATIC void xfs_set_inoalignment(xfs_mount_t *mp) { if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; else mp->m_inoalign_mask = 0; @@ -926,15 +925,6 @@ xfs_mountfs( } /* - * During the second phase of log recovery, we need iget and - * iput to behave like they do for an active filesystem. - * xfs_fs_drop_inode needs to be able to prevent the deletion - * of inodes before we're done replaying log items on those - * inodes. - */ - mp->m_super->s_flags |= MS_ACTIVE; - - /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently * read in. @@ -1009,12 +999,13 @@ xfs_mountfs( out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: - mp->m_super->s_flags &= ~MS_ACTIVE; xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + /* Clean out dquots that might be in memory after quotacheck. */ + xfs_qm_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 819b80b15bfb..5415f9031ef8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -140,6 +140,7 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ + bool m_inotbt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode @@ -182,6 +183,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_sync_workqueue; /* * Generation of the filesysyem layout. This is incremented by each diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0c381d71b242..0492436a053f 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); - XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); } diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 93a7aafa56d6..cecd37569ddb 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -279,7 +279,7 @@ xfs_fs_commit_blocks( (end - 1) >> PAGE_SHIFT); WARN_ON_ONCE(error); - error = xfs_iomap_write_unwritten(ip, start, length); + error = xfs_iomap_write_unwritten(ip, start, length, false); if (error) goto out_drop_iolock; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index b669b123287b..1fdd3face2d9 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -111,6 +111,9 @@ restart: skipped = 0; break; } + /* we're done if id overflows back to zero */ + if (!next_index) + break; } if (skipped) { @@ -1247,6 +1250,7 @@ xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { + struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; @@ -1257,7 +1261,32 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - xfs_dqflock(dqp); + /* + * The only way the dquot is already flush locked by the time quotacheck + * gets here is if reclaim flushed it before the dqadjust walk dirtied + * it for the final time. Quotacheck collects all dquot bufs in the + * local delwri queue before dquots are dirtied, so reclaim can't have + * possibly queued it for I/O. The only way out is to push the buffer to + * cycle the flush lock. + */ + if (!xfs_dqflock_nowait(dqp)) { + /* buf is pinned in-core by delwri list */ + DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen); + bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); + if (!bp) { + error = -EINVAL; + goto out_unlock; + } + xfs_buf_unlock(bp); + + xfs_buf_delwri_pushbuf(bp, buffer_list); + xfs_buf_rele(bp); + + error = -EAGAIN; + goto out_unlock; + } + error = xfs_qm_dqflush(dqp, &bp); if (error) goto out_unlock; @@ -1384,12 +1413,7 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: - while (!list_empty(&buffer_list)) { - struct xfs_buf *bp = - list_first_entry(&buffer_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); - xfs_buf_relse(bp); - } + xfs_buf_delwri_cancel(&buffer_list); if (error) { xfs_warn(mp, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 475a3882a81f..9cb5c381b01c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); + xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, + XFS_AGITER_INEW_WAIT); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4d3f74e3c5e1..17d3c964a2a2 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -82,11 +82,22 @@ * mappings are a reservation against the free space in the filesystem; * adjacent mappings can also be combined into fewer larger mappings. * + * As an optimization, the CoW extent size hint (cowextsz) creates + * outsized aligned delalloc reservations in the hope of landing out of + * order nearby CoW writes in a single extent on disk, thereby reducing + * fragmentation and improving future performance. + * + * D: --RRRRRRSSSRRRRRRRR--- (data fork) + * C: ------DDDDDDD--------- (CoW fork) + * * When dirty pages are being written out (typically in writepage), the - * delalloc reservations are converted into real mappings by allocating - * blocks and replacing the delalloc mapping with real ones. A delalloc - * mapping can be replaced by several real ones if the free space is - * fragmented. + * delalloc reservations are converted into unwritten mappings by + * allocating blocks and replacing the delalloc mapping with real ones. + * A delalloc mapping can be replaced by several unwritten ones if the + * free space is fragmented. + * + * D: --RRRRRRSSSRRRRRRRR--- + * C: ------UUUUUUU--------- * * We want to adapt the delalloc mechanism for copy-on-write, since the * write paths are similar. The first two steps (creating the reservation @@ -101,13 +112,29 @@ * Block-aligned directio writes will use the same mechanism as buffered * writes. * + * Just prior to submitting the actual disk write requests, we convert + * the extents representing the range of the file actually being written + * (as opposed to extra pieces created for the cowextsize hint) to real + * extents. This will become important in the next step: + * + * D: --RRRRRRSSSRRRRRRRR--- + * C: ------UUrrUUU--------- + * * CoW remapping must be done after the data block write completes, * because we don't want to destroy the old data fork map until we're sure * the new block has been written. Since the new mappings are kept in a * separate fork, we can simply iterate these mappings to find the ones * that cover the file blocks that we just CoW'd. For each extent, simply * unmap the corresponding range in the data fork, map the new range into - * the data fork, and remove the extent from the CoW fork. + * the data fork, and remove the extent from the CoW fork. Because of + * the presence of the cowextsize hint, however, we must be careful + * only to remap the blocks that we've actually written out -- we must + * never remap delalloc reservations nor CoW staging blocks that have + * yet to be written. This corresponds exactly to the real extents in + * the CoW fork: + * + * D: --RRRRRRrrSRRRRRRRR--- + * C: ------UU--UUU--------- * * Since the remapping operation can be applied to an arbitrary file * range, we record the need for the remap step as a flag in the ioend @@ -142,6 +169,8 @@ xfs_reflink_find_shared( error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) return error; + if (!agbp) + return -ENOMEM; cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); @@ -296,6 +325,65 @@ xfs_reflink_reserve_cow( return 0; } +/* Convert part of an unwritten CoW extent to a real one. */ +STATIC int +xfs_reflink_convert_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + struct xfs_defer_ops *dfops) +{ + struct xfs_bmbt_irec irec = *imap; + xfs_fsblock_t first_block = NULLFSBLOCK; + int nimaps = 1; + + if (imap->br_state == XFS_EXT_NORM) + return 0; + + xfs_trim_extent(&irec, offset_fsb, count_fsb); + trace_xfs_reflink_convert_cow(ip, &irec); + if (irec.br_blockcount == 0) + return 0; + return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block, + 0, &irec, &nimaps, dfops); +} + +/* Convert all of the unwritten CoW extents in a file's range to real ones. */ +int +xfs_reflink_convert_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_bmbt_irec got; + struct xfs_defer_ops dfops; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + xfs_extnum_t idx; + bool found; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* Convert all the extents to real from unwritten. */ + for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); + found && got.br_startoff < end_fsb; + found = xfs_iext_get_extent(ifp, ++idx, &got)) { + error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb, + end_fsb - offset_fsb, &dfops); + if (error) + break; + } + + /* Finish up. */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + /* Allocate all CoW reservations covering a range of blocks in a file. */ static int __xfs_reflink_allocate_cow( @@ -328,6 +416,7 @@ __xfs_reflink_allocate_cow( goto out_unlock; ASSERT(nimaps == 1); + /* Make sure there's a CoW reservation for it. */ error = xfs_reflink_reserve_cow(ip, &imap, &shared); if (error) goto out_trans_cancel; @@ -337,14 +426,16 @@ __xfs_reflink_allocate_cow( goto out_trans_cancel; } + /* Allocate the entire reservation as unwritten blocks. */ xfs_trans_ijoin(tp, ip, 0); error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, - XFS_BMAPI_COWFORK, &first_block, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), &imap, &nimaps, &dfops); if (error) goto out_trans_cancel; + /* Finish up. */ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_trans_cancel; @@ -389,11 +480,12 @@ xfs_reflink_allocate_cow_range( if (error) { trace_xfs_reflink_allocate_cow_range_error(ip, error, _RET_IP_); - break; + return error; } } - return error; + /* Convert the CoW extents to regular. */ + return xfs_reflink_convert_cow(ip, offset, count); } /* @@ -481,14 +573,18 @@ xfs_reflink_trim_irec_to_next_cow( } /* - * Cancel all pending CoW reservations for some block range of an inode. + * Cancel CoW reservations for some block range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_blocks( struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb) + xfs_fileoff_t end_fsb, + bool cancel_real) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, prev, del; @@ -515,7 +611,7 @@ xfs_reflink_cancel_cow_blocks( &idx, &got, &del); if (error) break; - } else { + } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); @@ -558,13 +654,17 @@ xfs_reflink_cancel_cow_blocks( } /* - * Cancel all pending CoW reservations for some byte range of an inode. + * Cancel CoW reservations for some byte range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_range( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool cancel_real) { struct xfs_trans *tp; xfs_fileoff_t offset_fsb; @@ -590,7 +690,8 @@ xfs_reflink_cancel_cow_range( xfs_trans_ijoin(tp, ip, 0); /* Scrape out the old CoW reservations */ - error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, + cancel_real); if (error) goto out_cancel; @@ -637,8 +738,22 @@ xfs_reflink_end_cow( offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + /* + * Start a rolling transaction to switch the mappings. We're + * unlikely ever to have to remap 16T worth of single-block + * extents, so just cap the worst case extent count to 2^32-1. + * Stick a warning in just in case, and avoid 64-bit division. + */ + BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); + if (end_fsb - offset_fsb > UINT_MAX) { + error = -EFSCORRUPTED; + xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); + ASSERT(0); + goto out; + } + resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, + (unsigned int)(end_fsb - offset_fsb), + XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, resblks, 0, 0, &tp); if (error) @@ -652,7 +767,13 @@ xfs_reflink_end_cow( /* If there is a hole at end_fsb - 1 go to the previous extent */ if (eof || got.br_startoff > end_fsb) { - ASSERT(idx > 0); + /* + * In case of racing, overlapping AIO writes no COW extents + * might be left by the time I/O completes for the loser of + * the race. In that case we are done. + */ + if (idx <= 0) + goto out_cancel; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); } @@ -669,6 +790,16 @@ xfs_reflink_end_cow( ASSERT(!isnullstartblock(got.br_startblock)); + /* + * Don't remap unwritten extents; these are + * speculatively preallocated CoW extents that have been + * allocated but have not yet been involved in a write. + */ + if (got.br_state == XFS_EXT_UNWRITTEN) { + idx--; + goto next_extent; + } + /* Unmap the old blocks in the data fork. */ xfs_defer_init(&dfops, &firstfsb); rlen = del.br_blockcount; @@ -716,6 +847,7 @@ next_extent: out_defer: xfs_defer_cancel(&dfops); +out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: @@ -885,13 +1017,14 @@ STATIC int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, - xfs_extlen_t cowextsize) + xfs_extlen_t cowextsize, + bool is_dedupe) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -912,6 +1045,10 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } + if (!is_dedupe) { + xfs_trans_ichgtime(tp, dest, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); @@ -1428,7 +1565,8 @@ xfs_reflink_remap_range( !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) cowextsize = src->i_d.di_cowextsize; - ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + is_dedupe); out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); @@ -1580,7 +1718,7 @@ next: * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 97ea9b487884..a57966fc7ddd 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, struct xfs_bmbt_irec *imap, bool *need_alloc); extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, @@ -37,9 +39,9 @@ extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb); + xfs_fileoff_t end_fsb, bool cancel_real); extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count); + xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ade4691e3f74..67d589e0a49f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -872,8 +872,15 @@ xfs_init_mount_workqueues( if (!mp->m_eofblocks_workqueue) goto out_destroy_log; + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, + mp->m_fsname); + if (!mp->m_sync_workqueue) + goto out_destroy_eofb; + return 0; +out_destroy_eofb: + destroy_workqueue(mp->m_eofblocks_workqueue); out_destroy_log: destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: @@ -894,6 +901,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_sync_workqueue); destroy_workqueue(mp->m_eofblocks_workqueue); destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); @@ -948,7 +956,7 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_remove); if (xfs_is_reflink_inode(ip)) { - error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) xfs_warn(ip->i_mount, "Error %d while evicting CoW blocks for inode %llu.", @@ -1206,7 +1214,7 @@ xfs_test_remount_options( tmp_mp->m_super = sb; error = xfs_parseargs(tmp_mp, options); xfs_free_fsname(tmp_mp); - kfree(tmp_mp); + kmem_free(tmp_mp); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0907752be62d..bdf69e1c7410 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); @@ -519,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); @@ -1990,6 +1990,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover, + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), + TP_ARGS(log, headblk, tailblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, headblk) + __field(xfs_daddr_t, tailblk) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->headblk = headblk; + __entry->tailblk = tailblk; + ), + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, + __entry->tailblk) +) + TRACE_EVENT(xfs_log_recover_record, TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), TP_ARGS(log, rhead, pass), @@ -3183,6 +3201,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __field(xfs_fileoff_t, lblk) __field(xfs_extlen_t, len) __field(xfs_fsblock_t, pblk) + __field(int, state) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -3190,13 +3209,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __entry->lblk = irec->br_startoff; __entry->len = irec->br_blockcount; __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, - __entry->pblk) + __entry->pblk, + __entry->state) ); #define DEFINE_INODE_IREC_EVENT(name) \ DEFINE_EVENT(xfs_inode_irec_class, name, \ @@ -3345,11 +3366,12 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_RW_EVENT(xfs_reflink_reserve_cow); DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); -DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfb..a280e126491f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -263,6 +263,28 @@ xfs_trans_alloc( } /* + * Create an empty transaction with no reservation. This is a defensive + * mechanism for routines that query metadata without actually modifying + * them -- if the metadata being queried is somehow cross-linked (think a + * btree block pointer that points higher in the tree), we risk deadlock. + * However, blocks grabbed as part of a transaction can be re-grabbed. + * The verifiers will notice the corrupt block and the operation will fail + * back to userspace without deadlocking. + * + * Note the zero-length reservation; this transaction MUST be cancelled + * without any dirty data. + */ +int +xfs_trans_alloc_empty( + struct xfs_mount *mp, + struct xfs_trans **tpp) +{ + struct xfs_trans_res resv = {0}; + + return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp); +} + +/* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. * For now, just store the change in the transaction structure. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 61b7fbdd3ebd..5669cf00bae0 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -50,6 +50,7 @@ typedef struct xfs_log_item { struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ uint li_flags; /* misc flags */ + struct xfs_buf *li_buf; /* real buffer pointer */ struct xfs_log_item *li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, struct xfs_log_item *); @@ -65,11 +66,13 @@ typedef struct xfs_log_item { } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 +#define XFS_LI_ABORTED 0x2 +#define XFS_LI_FAILED 0x4 #define XFS_LI_FLAGS \ { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" } + { XFS_LI_ABORTED, "ABORTED" }, \ + { XFS_LI_FAILED, "FAILED" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -80,6 +83,7 @@ struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); }; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, @@ -159,6 +163,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_alloc_empty(struct xfs_mount *mp, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, @@ -211,12 +217,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, + uint); +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); @@ -275,6 +283,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, enum xfs_bmap_intent_type type, struct xfs_inode *ip, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, xfs_exntst_t state); + xfs_filblks_t *blockcount, xfs_exntst_t state); #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d6c9c3e9e02b..70f5ab017323 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -684,8 +684,24 @@ xfs_trans_ail_update_bulk( } } -/* - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL +bool +xfs_ail_delete_one( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_log_item *mlip = xfs_ail_min(ailp); + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); + xfs_clear_li_failed(lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + + return mlip == lip; +} + +/** + * Remove a log items from the AIL * * @xfs_trans_ail_delete_bulk takes an array of log items that all need to * removed from the AIL. The caller is already holding the AIL lock, and done @@ -706,52 +722,36 @@ xfs_trans_ail_update_bulk( * before returning. */ void -xfs_trans_ail_delete_bulk( +xfs_trans_ail_delete( struct xfs_ail *ailp, - struct xfs_log_item **log_items, - int nr_items, + struct xfs_log_item *lip, int shutdown_type) __releases(ailp->xa_lock) { - xfs_log_item_t *mlip; - int mlip_changed = 0; - int i; - - mlip = xfs_ail_min(ailp); + struct xfs_mount *mp = ailp->xa_mount; + bool mlip_changed; - for (i = 0; i < nr_items; i++) { - struct xfs_log_item *lip = log_items[i]; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { - struct xfs_mount *mp = ailp->xa_mount; - - spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, shutdown_type); - } - return; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, shutdown_type); } - - trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); - xfs_ail_delete(ailp, lip); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; - if (mlip == lip) - mlip_changed = 1; + return; } + mlip_changed = xfs_ail_delete_one(ailp, lip); if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); if (list_empty(&ailp->xa_ail)) wake_up_all(&ailp->xa_empty); - spin_unlock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + if (mlip_changed) xfs_log_space_wake(ailp->xa_mount); - } else { - spin_unlock(&ailp->xa_lock); - } } int diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 6408e7d7c08c..14543d93cd4b 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, + xfs_filblks_t *blockcount, xfs_exntst_t state) { int error; @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( void **state) { struct xfs_bmap_intent *bmap; + xfs_filblks_t count; int error; bmap = container_of(item, struct xfs_bmap_intent, bi_list); + count = bmap->bi_bmap.br_blockcount; error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, bmap->bi_type, bmap->bi_owner, bmap->bi_whichfork, bmap->bi_bmap.br_startoff, bmap->bi_bmap.br_startblock, - bmap->bi_bmap.br_blockcount, + &count, bmap->bi_bmap.br_state); + if (!error && count > 0) { + ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); + bmap->bi_bmap.br_blockcount = count; + return -EAGAIN; + } kmem_free(bmap); return error; } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8ee29ca132dc..3ba7a96a8abd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, xfs_buf_t *bp) { xfs_buf_log_item_t *bip; + int freed; /* * Default to a normal brelse() call if the tp is NULL. @@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, /* * Drop our reference to the buf log item. */ - atomic_dec(&bip->bli_refcount); + freed = atomic_dec_and_test(&bip->bli_refcount); /* - * If the buf item is not tracking data in the log, then - * we must free it before releasing the buffer back to the - * free pool. Before releasing the buffer to the free pool, - * clear the transaction pointer in b_fsprivate2 to dissolve - * its relation to this transaction. + * If the buf item is not tracking data in the log, then we must free it + * before releasing the buffer back to the free pool. + * + * If the fs has shutdown and we dropped the last reference, it may fall + * on us to release a (possibly dirty) bli if it never made it to the + * AIL (e.g., the aborted unpin already happened and didn't release it + * due to our reference). Since we're already shutdown and need xa_lock, + * just force remove from the AIL and release the bli here. */ - if (!xfs_buf_item_dirty(bip)) { + if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { /*** ASSERT(bp->b_pincount == 0); ***/ @@ -486,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, } /* - * This is called to mark bytes first through last inclusive of the given - * buffer as needing to be logged when the transaction is committed. - * The buffer must already be associated with the given transaction. - * - * First and last are numbers relative to the beginning of this buffer, - * so the first byte in the buffer is numbered 0 regardless of the - * value of b_blkno. + * Mark a buffer dirty in the transaction. */ void -xfs_trans_log_buf(xfs_trans_t *tp, - xfs_buf_t *bp, - uint first, - uint last) +xfs_trans_dirty_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -524,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - trace_xfs_trans_log_buf(bip); - /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer @@ -538,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} - /* - * If we have an ordered buffer we are not logging any dirty range but - * it still needs to be marked dirty and that it has been logged. - */ - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; - if (!(bip->bli_flags & XFS_BLI_ORDERED)) - xfs_buf_item_log(bip, first, last); +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf( + struct xfs_trans *tp, + struct xfs_buf *bp, + uint first, + uint last) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); + + xfs_trans_dirty_buf(tp, bp); + + trace_xfs_trans_log_buf(bip); + xfs_buf_item_log(bip, first, last); } @@ -701,14 +718,13 @@ xfs_trans_inode_alloc_buf( } /* - * Mark the buffer as ordered for this transaction. This means - * that the contents of the buffer are not recorded in the transaction - * but it is tracked in the AIL as though it was. This allows us - * to record logical changes in transactions rather than the physical - * changes we make to the buffer without changing writeback ordering - * constraints of metadata buffers. + * Mark the buffer as ordered for this transaction. This means that the contents + * of the buffer are not recorded in the transaction but it is tracked in the + * AIL as though it was. This allows us to record logical changes in + * transactions rather than the physical changes we make to the buffer without + * changing writeback ordering constraints of metadata buffers. */ -void +bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) @@ -719,8 +735,18 @@ xfs_trans_ordered_buf( ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (xfs_buf_item_dirty_format(bip)) + return false; + bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); + + /* + * We don't log a dirty range of an ordered buffer but it still needs + * to be marked dirty and that it has been logged. + */ + xfs_trans_dirty_buf(tp, bp); + return true; } /* diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 49931b72da8a..b317a3644c00 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -106,18 +106,9 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items, - int shutdown_type) - __releases(ailp->xa_lock); -static inline void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip, - int shutdown_type) __releases(ailp->xa_lock) -{ - xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); -} +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock); static inline void xfs_trans_ail_remove( @@ -173,4 +164,35 @@ xfs_trans_ail_copy_lsn( *dst = *src; } #endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (lip->li_flags & XFS_LI_FAILED) { + lip->li_flags &= ~XFS_LI_FAILED; + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (!(lip->li_flags & XFS_LI_FAILED)) { + xfs_buf_hold(bp); + lip->li_flags |= XFS_LI_FAILED; + lip->li_buf = bp; + } +} + #endif /* __XFS_TRANS_PRIV_H__ */ |