diff options
Diffstat (limited to 'fs')
120 files changed, 1602 insertions, 522 deletions
@@ -183,8 +183,9 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool done; bool cancelled; + bool work_scheduled; + bool work_need_resched; struct wait_queue_entry wait; struct work_struct work; }; @@ -1626,6 +1627,51 @@ static void aio_poll_put_work(struct work_struct *work) iocb_put(iocb); } +/* + * Safely lock the waitqueue which the request is on, synchronizing with the + * case where the ->poll() provider decides to free its waitqueue early. + * + * Returns true on success, meaning that req->head->lock was locked, req->wait + * is on req->head, and an RCU read lock was taken. Returns false if the + * request was already removed from its waitqueue (which might no longer exist). + */ +static bool poll_iocb_lock_wq(struct poll_iocb *req) +{ + wait_queue_head_t *head; + + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us, then check whether the request is still on the queue. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + head = smp_load_acquire(&req->head); + if (head) { + spin_lock(&head->lock); + if (!list_empty(&req->wait.entry)) + return true; + spin_unlock(&head->lock); + } + rcu_read_unlock(); + return false; +} + +static void poll_iocb_unlock_wq(struct poll_iocb *req) +{ + spin_unlock(&req->head->lock); + rcu_read_unlock(); +} + static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1645,14 +1691,27 @@ static void aio_poll_complete_work(struct work_struct *work) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); - if (!mask && !READ_ONCE(req->cancelled)) { - add_wait_queue(req->head, &req->wait); - spin_unlock_irq(&ctx->ctx_lock); - return; - } + if (poll_iocb_lock_wq(req)) { + if (!mask && !READ_ONCE(req->cancelled)) { + /* + * The request isn't actually ready to be completed yet. + * Reschedule completion if another wakeup came in. + */ + if (req->work_need_resched) { + schedule_work(&req->work); + req->work_need_resched = false; + } else { + req->work_scheduled = false; + } + poll_iocb_unlock_wq(req); + spin_unlock_irq(&ctx->ctx_lock); + return; + } + list_del_init(&req->wait.entry); + poll_iocb_unlock_wq(req); + } /* else, POLLFREE has freed the waitqueue, so we must complete */ list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - req->done = true; spin_unlock_irq(&ctx->ctx_lock); iocb_put(iocb); @@ -1664,13 +1723,14 @@ static int aio_poll_cancel(struct kiocb *iocb) struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll; - spin_lock(&req->head->lock); - WRITE_ONCE(req->cancelled, true); - if (!list_empty(&req->wait.entry)) { - list_del_init(&req->wait.entry); - schedule_work(&aiocb->poll.work); - } - spin_unlock(&req->head->lock); + if (poll_iocb_lock_wq(req)) { + WRITE_ONCE(req->cancelled, true); + if (!req->work_scheduled) { + schedule_work(&aiocb->poll.work); + req->work_scheduled = true; + } + poll_iocb_unlock_wq(req); + } /* else, the request was force-cancelled by POLLFREE already */ return 0; } @@ -1687,20 +1747,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & req->events)) return 0; - list_del_init(&req->wait.entry); - - if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Complete the request inline if possible. This requires that three + * conditions be met: + * 1. An event mask must have been passed. If a plain wakeup was done + * instead, then mask == 0 and we have to call vfs_poll() to get + * the events, so inline completion isn't possible. + * 2. The completion work must not have already been scheduled. + * 3. ctx_lock must not be busy. We have to use trylock because we + * already hold the waitqueue lock, so this inverts the normal + * locking order. Use irqsave/irqrestore because not all + * filesystems (e.g. fuse) call this function with IRQs disabled, + * yet IRQs have to be disabled before ctx_lock is obtained. + */ + if (mask && !req->work_scheduled && + spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx; - /* - * Try to complete the iocb inline if we can. Use - * irqsave/irqrestore because not all filesystems (e.g. fuse) - * call this function with IRQs disabled and because IRQs - * have to be disabled before ctx_lock is obtained. - */ + list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - req->done = true; if (iocb->ki_eventfd && eventfd_signal_count()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); @@ -1710,7 +1776,43 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (iocb) iocb_put(iocb); } else { - schedule_work(&req->work); + /* + * Schedule the completion work if needed. If it was already + * scheduled, record that another wakeup came in. + * + * Don't remove the request from the waitqueue here, as it might + * not actually be complete yet (we won't know until vfs_poll() + * is called), and we must not miss any wakeups. POLLFREE is an + * exception to this; see below. + */ + if (req->work_scheduled) { + req->work_need_resched = true; + } else { + schedule_work(&req->work); + req->work_scheduled = true; + } + + /* + * If the waitqueue is being freed early but we can't complete + * the request inline, we have to tear down the request as best + * we can. That means immediately removing the request from its + * waitqueue and preventing all further accesses to the + * waitqueue via the request. We also need to schedule the + * completion work (done above). Also mark the request as + * cancelled, to potentially skip an unneeded call to ->poll(). + */ + if (mask & POLLFREE) { + WRITE_ONCE(req->cancelled, true); + list_del_init(&req->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&req->head, NULL); + } } return 1; } @@ -1718,6 +1820,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct aio_poll_table { struct poll_table_struct pt; struct aio_kiocb *iocb; + bool queued; int error; }; @@ -1728,11 +1831,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); /* multiple wait queues per file are not supported */ - if (unlikely(pt->iocb->poll.head)) { + if (unlikely(pt->queued)) { pt->error = -EINVAL; return; } + pt->queued = true; pt->error = 0; pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); @@ -1757,12 +1861,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; - req->done = false; req->cancelled = false; + req->work_scheduled = false; + req->work_need_resched = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; + apt.queued = false; apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ @@ -1771,23 +1877,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) mask = vfs_poll(req->file, &apt.pt) & req->events; spin_lock_irq(&ctx->ctx_lock); - if (likely(req->head)) { - spin_lock(&req->head->lock); - if (unlikely(list_empty(&req->wait.entry))) { - if (apt.error) + if (likely(apt.queued)) { + bool on_queue = poll_iocb_lock_wq(req); + + if (!on_queue || req->work_scheduled) { + /* + * aio_poll_wake() already either scheduled the async + * completion work, or completed the request inline. + */ + if (apt.error) /* unsupported case: multiple queues */ cancel = true; apt.error = 0; mask = 0; } if (mask || apt.error) { + /* Steal to complete synchronously. */ list_del_init(&req->wait.entry); } else if (cancel) { + /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); - } else if (!req->done) { /* actually waiting for an event */ + } else if (on_queue) { + /* + * Actually waiting for an event, so add the request to + * active_reqs so that it can be cancelled if needed. + */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); + if (on_queue) + poll_iocb_unlock_wq(req); } if (mask) { /* no async, we'd stolen it */ aiocb->ki_res.res = mangle_poll(mask); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 11be02459b87..eb592b92aa9c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -237,6 +237,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; + /* + * Orders all subsequent loads after reading WORK_DONE_BIT, + * paired with the smp_mb__before_atomic in btrfs_work_helper + * this guarantees that the ordered function will see all + * updates from ordinary work function. + */ + smp_rmb(); /* * we are going to call the ordered done function, but @@ -325,6 +332,13 @@ static void btrfs_work_helper(struct work_struct *normal_work) thresh_exec_hook(wq); work->func(work); if (need_order) { + /* + * Ensures all memory accesses done in the work function are + * ordered before setting the WORK_DONE_BIT. Ensuring the thread + * which is going to executed the ordered work sees them. + * Pairs with the smp_rmb in run_ordered_work. + */ + smp_mb__before_atomic(); set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq, work); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7f644a58db51..c701a19fac53 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1208,7 +1208,12 @@ again: ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* This shouldn't happen, indicates a bug or fs corruption. */ + ASSERT(ret != 0); + ret = -EUCLEAN; + goto out; + } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && @@ -1356,10 +1361,18 @@ again: goto out; if (!ret && extent_item_pos) { /* - * we've recorded that parent, so we must extend - * its inode list here + * We've recorded that parent, so we must extend + * its inode list here. + * + * However if there was corruption we may not + * have found an eie, return an error in this + * case. */ - BUG_ON(!eie); + ASSERT(eie); + if (!eie) { + ret = -EUCLEAN; + goto out; + } while (eie->next) eie = eie->next; eie->next = ref->inode_list; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e98d6ea35ea8..bcf19dfb0af3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2388,7 +2388,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -2455,7 +2454,6 @@ again: cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -2556,7 +2554,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -2614,7 +2611,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dac30b00d14b..822c615840e8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2658,12 +2658,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; - int root_lock; + int root_lock = 0; int level = 0; - /* We try very hard to do read locks on the root */ - root_lock = BTRFS_READ_LOCK; - if (p->search_commit_root) { /* * The commit roots are read only so we always do read locks, @@ -2701,6 +2698,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, goto out; } + /* We try very hard to do read locks on the root */ + root_lock = BTRFS_READ_LOCK; + /* * If the level is set to maximum, we can skip trying to get the read * lock. @@ -2727,6 +2727,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, level = btrfs_header_level(b); out: + /* + * The root may have failed to write out at some point, and thus is no + * longer valid, return an error in this case. + */ + if (!extent_buffer_uptodate(b)) { + if (root_lock) + btrfs_tree_unlock_rw(b, root_lock); + free_extent_buffer(b); + return ERR_PTR(-EIO); + } + p->nodes[level] = b; if (!p->skip_locking) p->locks[level] = root_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1499531bc151..f18c6d97932e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3636,11 +3636,23 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio = device->flush_bio; +#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio + * with flush and sync requests before writing the superblock, since + * it's not needed. However when the integrity checker is enabled, this + * results in reports that there are metadata blocks referred by a + * superblock that were not properly flushed. So don't skip the bio + * submission only when the integrity checker is enabled for the sake + * of simplicity, since this is a debug tool and not meant for use in + * non-debug builds. + */ + struct request_queue *q = bdev_get_queue(device->bdev); if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return; +#endif bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9108a73423f7..95ddeb477797 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3755,6 +3755,12 @@ static void set_btree_ioerr(struct page *page) return; /* + * A read may stumble upon this buffer later, make sure that it gets an + * error and knows there was an error. + */ + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + /* * If we error out, we should add back the dirty_metadata_bytes * to make it consistent. */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index fcf1807cc8dd..c8def2bdf247 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -202,7 +202,7 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b859ed50cf46..7755a0362a3a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10808,9 +10808,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { unsigned long nr_pages; + unsigned long max_pages; u64 first_ppage, first_ppage_reported, next_ppage; int ret; + /* + * Our swapfile may have had its size extended after the swap header was + * written. In that case activating the swapfile should not go beyond + * the max size set in the swap header. + */ + if (bsi->nr_pages >= sis->max) + return 0; + + max_pages = sis->max - bsi->nr_pages; first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, PAGE_SIZE) >> PAGE_SHIFT; @@ -10818,6 +10828,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); first_ppage_reported = first_ppage; if (bsi->start == 0) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e9d3eb7f0e2b..675112aa998f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3027,10 +3027,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, inode_lock(inode); err = btrfs_delete_subvolume(dir, dentry); inode_unlock(inode); - if (!err) { - fsnotify_rmdir(dir, dentry); - d_delete(dentry); - } + if (!err) + d_delete_notify(dir, dentry); out_dput: dput(dentry); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index bb034e19a2a8..5a3006c75d63 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -890,6 +890,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) int ret = 0; int slot; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to enable quotas, because we will unlock + * and relock qgroup_ioctl_lock before setting fs_info->quota_root + * and before setting BTRFS_FS_QUOTA_ENABLED. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; @@ -1035,8 +1043,19 @@ out_add_root: goto out_free_path; } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* + * Commit the transaction while not holding qgroup_ioctl_lock, to avoid + * a deadlock with tasks concurrently doing other qgroup operations, such + * adding/removing qgroups or adding/deleting qgroup relations for example, + * because all qgroup operations first start or join a transaction and then + * lock the qgroup_ioctl_lock mutex. + * We are safe from a concurrent task trying to enable quotas, by calling + * this function, since we are serialized by fs_info->subvol_sem. + */ ret = btrfs_commit_transaction(trans); trans = NULL; + mutex_lock(&fs_info->qgroup_ioctl_lock); if (ret) goto out_free_path; @@ -1086,12 +1105,34 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans = NULL; int ret = 0; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to disable quotas, because we will unlock + * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; + + /* + * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to + * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs + * to lock that mutex while holding a transaction handle and the rescan + * worker needs to commit a transaction. + */ mutex_unlock(&fs_info->qgroup_ioctl_lock); /* + * Request qgroup rescan worker to complete and wait for it. This wait + * must be done before transaction start for quota disable since it may + * deadlock with transaction by the qgroup rescan worker. + */ + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + btrfs_qgroup_wait_for_completion(fs_info, false); + + /* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in @@ -1106,14 +1147,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out; } if (!fs_info->quota_root) goto out; - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -3285,6 +3325,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; + } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* Quota disable is in progress */ + ret = -EBUSY; } if (ret) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 612411c74550..0d07ebe511e7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -371,7 +371,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - BUG_ON(ret < 0); + if (ret < 0) + goto out; if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fc688af57c23..e258fc484cea 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5005,6 +5005,10 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) lock_page(page); if (!PageUptodate(page)) { unlock_page(page); + btrfs_err(fs_info, + "send: IO error at offset %llu for inode %llu root %llu", + page_offset(page), sctx->cur_ino, + sctx->send_root->root_key.objectid); put_page(page); ret = -EIO; break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7bc4477d7ee7..b7bfecfc2ea3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1308,6 +1308,15 @@ again: inode, name, namelen); kfree(name); iput(dir); + /* + * Whenever we need to check if a name exists or not, we + * check the subvolume tree. So after an unlink we must + * run delayed items, so that future checks for a name + * during log replay see that the name does not exists + * anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); if (ret) goto out; goto again; @@ -1559,6 +1568,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ if (!ret && inode->i_nlink == 0) inc_nlink(inode); + /* + * Whenever we need to check if a name exists or + * not, we check the subvolume tree. So after an + * unlink we must run delayed items, so that future + * checks for a name during log replay see that the + * name does not exists anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); } if (ret < 0) goto out; @@ -4249,7 +4267,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, /* * Log all prealloc extents beyond the inode's i_size to make sure we do not - * lose them after doing a fast fsync and replaying the log. We scan the + * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. * That can happen due to the fact that extent maps are merged when they @@ -5042,6 +5060,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { + const u64 i_size = i_size_read(&inode->vfs_inode); struct btrfs_root *root = inode->root; int ins_start_slot = 0; int ins_nr = 0; @@ -5062,13 +5081,21 @@ again: if (min_key->type > max_key->type) break; - if (min_key->type == BTRFS_INODE_ITEM_KEY) + if (min_key->type == BTRFS_INODE_ITEM_KEY) { *need_log_inode_item = false; - - if ((min_key->type == BTRFS_INODE_REF_KEY || - min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. + */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { u64 other_ino = 0; u64 other_parent = 0; @@ -5099,10 +5126,8 @@ again: btrfs_release_path(path); goto next_key; } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, @@ -5155,9 +5180,21 @@ next_key: break; } } - if (ins_nr) + if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. + */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } return ret; } @@ -5258,6 +5295,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * For symlinks, we must always log their content, which is stored in an + * inline extent, otherwise we could end up with an empty symlink after + * log replay, which is invalid on linux (symlink(2) returns -ENOENT if + * one attempts to create an empty symlink). + * We don't need to worry about flushing delalloc, because when we create + * the inline extent when the symlink is created (we never have delalloc + * for symlinks). + */ + if (S_ISLNK(inode->vfs_inode.i_mode)) + inode_only = LOG_INODE_ALL; + + /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ @@ -5670,7 +5719,7 @@ process_leaf: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) + if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), log_mode, 0, LLONG_MAX, ctx); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 344d18de1f08..8898682c9103 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4320,10 +4320,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 41b3c5fc958c..79a18692b84c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -855,6 +855,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, out_super: deactivate_locked_super(sb); + return root; out: cifs_cleanup_volume_info(volume_info); return root; @@ -888,7 +889,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a9746af5a44d..03c85beecec1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2577,12 +2577,23 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, tcon = tlink_tcon(smbfile->tlink); if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { server = tcon->ses->server; - if (server->ops->flush) - rc = server->ops->flush(xid, tcon, &smbfile->fid); - else + if (server->ops->flush == NULL) { rc = -ENOSYS; + goto strict_fsync_exit; + } + + if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) { + smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY); + if (smbfile) { + rc = server->ops->flush(xid, tcon, &smbfile->fid); + cifsFileInfo_put(smbfile); + } else + cifs_dbg(FYI, "ignore fsync for file not open for write\n"); + } else + rc = server->ops->flush(xid, tcon, &smbfile->fid); } +strict_fsync_exit: free_xid(xid); return rc; } @@ -2594,6 +2605,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct cifsFileInfo *smbfile = file->private_data; + struct inode *inode = file_inode(file); struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); rc = file_write_and_wait_range(file, start, end); @@ -2608,12 +2620,23 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) tcon = tlink_tcon(smbfile->tlink); if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { server = tcon->ses->server; - if (server->ops->flush) - rc = server->ops->flush(xid, tcon, &smbfile->fid); - else + if (server->ops->flush == NULL) { rc = -ENOSYS; + goto fsync_exit; + } + + if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) { + smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY); + if (smbfile) { + rc = server->ops->flush(xid, tcon, &smbfile->fid); + cifsFileInfo_put(smbfile); + } else + cifs_dbg(FYI, "ignore fsync for file not open for write\n"); + } else + rc = server->ops->flush(xid, tcon, &smbfile->fid); } +fsync_exit: free_xid(xid); return rc; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index b736acd3917b..a24bcbbb5033 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -97,6 +97,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, if (rc != 1) return -EINVAL; + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -EINVAL; + rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index defee1d208d2..7985fe25850b 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1643,9 +1643,17 @@ smb2_copychunk_range(const unsigned int xid, int chunks_copied = 0; bool chunk_sizes_updated = false; ssize_t bytes_written, total_bytes_written = 0; + struct inode *inode; pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); + /* + * We need to flush all unwritten data before we can send the + * copychunk ioctl to the server. + */ + inode = d_inode(trgtfile->dentry); + filemap_write_and_wait(inode->i_mapping); + if (pcchunk == NULL) return -ENOMEM; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cb733652ecca..d73d88d9c259 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -36,6 +36,14 @@ */ DEFINE_SPINLOCK(configfs_dirent_lock); +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_mutex is held. + * But parent configfs_subsystem is NULL when config_item is root. + * Use this mutex when config_item is root. + */ +static DEFINE_MUTEX(configfs_subsystem_mutex); + static void configfs_d_iput(struct dentry * dentry, struct inode * inode) { @@ -1805,8 +1813,8 @@ void configfs_unregister_group(struct config_group *group) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); + d_drop(dentry); fsnotify_rmdir(d_inode(parent), dentry); - d_delete(dentry); inode_unlock(d_inode(parent)); dput(dentry); @@ -1884,7 +1892,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) group->cg_item.ci_name = group->cg_item.ci_namebuf; sd = root->d_fsdata; + mutex_lock(&configfs_subsystem_mutex); link_group(to_config_group(sd->s_element), group); + mutex_unlock(&configfs_subsystem_mutex); inode_lock_nested(d_inode(root), I_MUTEX_PARENT); @@ -1909,7 +1919,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) inode_unlock(d_inode(root)); if (err) { + mutex_lock(&configfs_subsystem_mutex); unlink_group(group); + mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } put_fragment(frag); @@ -1947,16 +1959,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - fsnotify_rmdir(d_inode(root), dentry); inode_unlock(d_inode(dentry)); - d_delete(dentry); + d_drop(dentry); + fsnotify_rmdir(d_inode(root), dentry); inode_unlock(d_inode(root)); dput(dentry); + mutex_lock(&configfs_subsystem_mutex); unlink_group(group); + mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index a32c5c7dcfd8..da87615ad69a 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -146,7 +146,7 @@ static int debugfs_locked_down(struct inode *inode, struct file *filp, const struct file_operations *real_fops) { - if ((inode->i_mode & 07777) == 0444 && + if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 42e5a766d33c..4f25015aa534 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -621,8 +621,8 @@ void devpts_pty_kill(struct dentry *dentry) dentry->d_fsdata = NULL; drop_nlink(dentry->d_inode); - fsnotify_unlink(d_inode(dentry->d_parent), dentry); d_drop(dentry); + fsnotify_unlink(d_inode(dentry->d_parent), dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ } diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 18d81599522f..53500b555bfa 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3975,6 +3975,14 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) int from = ms->m_header.h_nodeid; int error = 0; + /* currently mixing of user/kernel locks are not supported */ + if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) { + log_error(lkb->lkb_resource->res_ls, + "got user dlm message for a kernel lock"); + error = -EINVAL; + goto out; + } + switch (ms->m_type) { case DLM_MSG_CONVERT: case DLM_MSG_UNLOCK: @@ -4003,6 +4011,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) error = -EINVAL; } +out: if (error) log_error(lkb->lkb_resource->res_ls, "ignore invalid message %d from %d %x %x %x %d", diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 065cd2d1bdc6..db403c01d4d5 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -770,8 +770,12 @@ static loff_t ext2_max_size(int bits) res += 1LL << (bits-2); res += 1LL << (2*(bits-2)); res += 1LL << (3*(bits-2)); + /* Compute how many metadata blocks are needed */ + meta_blocks = 1; + meta_blocks += 1 + ppb; + meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ - if (res < upper_limit) + if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ae2cb15d9540..e932e8482371 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1966,6 +1966,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) * Structure of a directory entry */ #define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ae73e6793683..f1bbce4350c4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -133,14 +133,25 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, static int ext4_ext_get_access(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { + int err = 0; + if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); - return ext4_journal_get_write_access(handle, path->p_bh); + err = ext4_journal_get_write_access(handle, path->p_bh); + + /* + * The extent buffer's verified bit will be set again in + * __ext4_ext_dirty(). We could leave an inconsistent + * buffer if the extents updating procudure break off du + * to some error happens, force to check it again. + */ + if (!err) + clear_buffer_verified(path->p_bh); } /* path points to leaf/index in inode body */ /* we use in-core data, no need to protect them */ - return 0; + return err; } /* @@ -160,6 +171,9 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, /* path points to block */ err = __ext4_handle_dirty_metadata(where, line, handle, inode, path->p_bh); + /* Extents updating done, re-set verified flag */ + if (!err) + set_buffer_verified(path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); @@ -390,9 +404,13 @@ static int ext4_valid_extent_idx(struct inode *inode, static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, + ext4_lblk_t lblk, ext4_fsblk_t *pblk, int depth) { unsigned short entries; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + if (eh->eh_entries == 0) return 1; @@ -403,32 +421,52 @@ static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; ext4_fsblk_t pblock = 0; - ext4_lblk_t lblock = 0; - ext4_lblk_t prev = 0; - int len = 0; + + /* + * The logical block in the first entry should equal to + * the number in the index block. + */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext->ee_block)) + return 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; /* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); - len = ext4_ext_get_actual_len(ext); if ((lblock <= prev) && prev) { pblock = ext4_ext_pblock(ext); es->s_last_error_block = cpu_to_le64(pblock); return 0; } + prev = lblock + ext4_ext_get_actual_len(ext) - 1; ext++; entries--; - prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); + + /* + * The logical block in the first entry should equal to + * the number in the parent index block. + */ + if (depth != ext_depth(inode) && + lblk != le32_to_cpu(ext_idx->ei_block)) + return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; + + /* Check for overlapping index extents */ + lblock = le32_to_cpu(ext_idx->ei_block); + if ((lblock <= prev) && prev) { + *pblk = ext4_idx_pblock(ext_idx); + return 0; + } ext_idx++; entries--; + prev = lblock; } } return 1; @@ -436,7 +474,7 @@ static int ext4_valid_extent_entries(struct inode *inode, static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth, ext4_fsblk_t pblk) + int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; @@ -462,7 +500,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } - if (!ext4_valid_extent_entries(inode, eh, depth)) { + if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } @@ -491,7 +529,7 @@ corrupted: } #define ext4_ext_check(inode, eh, depth, pblk) \ - __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { @@ -524,12 +562,14 @@ static void ext4_cache_extents(struct inode *inode, static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, - struct inode *inode, ext4_fsblk_t pblk, int depth, - int flags) + struct inode *inode, struct ext4_extent_idx *idx, + int depth, int flags) { struct buffer_head *bh; int err; + ext4_fsblk_t pblk; + pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -545,8 +585,8 @@ __read_extent_tree_block(const char *function, unsigned int line, if (!ext4_has_feature_journal(inode->i_sb) || (inode->i_ino != le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) { - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); + err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), + depth, pblk, le32_to_cpu(idx->ei_block)); if (err) goto errout; } @@ -565,8 +605,8 @@ errout: } -#define read_extent_tree_block(inode, pblk, depth, flags) \ - __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ +#define read_extent_tree_block(inode, idx, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ (depth), (flags)) /* @@ -613,8 +653,7 @@ int ext4_ext_precache(struct inode *inode) i--; continue; } - bh = read_extent_tree_block(inode, - ext4_idx_pblock(path[i].p_idx++), + bh = read_extent_tree_block(inode, path[i].p_idx++, depth - i - 1, EXT4_EX_FORCE_CACHE); if (IS_ERR(bh)) { @@ -917,8 +956,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = read_extent_tree_block(inode, path[ppos].p_block, --i, - flags); + bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; @@ -1517,7 +1555,6 @@ static int ext4_ext_search_right(struct inode *inode, struct ext4_extent_header *eh; struct ext4_extent_idx *ix; struct ext4_extent *ex; - ext4_fsblk_t block; int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; @@ -1584,20 +1621,17 @@ got_index: * follow it and find the closest allocated * block to the right */ ix++; - block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ - bh = read_extent_tree_block(inode, block, - path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); - block = ext4_idx_pblock(ix); put_bh(bh); } - bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -3119,9 +3153,9 @@ again: ext_debug("move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = read_extent_tree_block(inode, - ext4_idx_pblock(path[i].p_idx), depth - i - 1, - EXT4_EX_NOCACHE); + bh = read_extent_tree_block(inode, path[i].p_idx, + depth - i - 1, + EXT4_EX_NOCACHE); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cdb10e9fded6..8f665aa1d706 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1120,7 +1120,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { - ext4_create_inline_data(handle, inode, inline_size); + int ret; + + ret = ext4_create_inline_data(handle, inode, inline_size); + if (ret) { + ext4_msg(inode->i_sb, KERN_EMERG, + "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", + inode->i_ino, ret); + return; + } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dcbd8ac8d471..00686fbe3c27 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2161,6 +2161,15 @@ static int ext4_writepage(struct page *page, else len = PAGE_SIZE; + /* Should never happen but for bugs in other kernel subsystems */ + if (!page_has_buffers(page)) { + ext4_warning_inode(inode, + "page %lu does not have buffers attached", page->index); + ClearPageDirty(page); + unlock_page(page); + return 0; + } + page_bufs = page_buffers(page); /* * We cannot do block allocation or other extent handling in this @@ -2710,6 +2719,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); + /* + * Should never happen but for buggy code in + * other subsystems that call + * set_page_dirty() without properly warning + * the file system first. See [1] for more + * information. + * + * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz + */ + if (!page_has_buffers(page)) { + ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index); + ClearPageDirty(page); + unlock_page(page); + continue; + } + if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; @@ -4286,7 +4311,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset; + loff_t first_block_offset, last_block_offset, max_length; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0; @@ -4332,6 +4358,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) offset; } + /* + * For punch hole the length + offset needs to be within one block + * before last range. Adjust the length if it goes beyond that limit. + */ + max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; + if (offset + length > max_length) + length = max_length - offset; + if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ba13fbb443d5..9fa20f9ba52b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1120,8 +1120,6 @@ resizefs_out: sizeof(range))) return -EFAULT; - range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b67ea979f0cf..0307702d114d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5270,6 +5270,7 @@ out: */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; @@ -5288,6 +5289,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; + /* No point to try to trim less than discard granularity */ + if (range->minlen < q->limits.discard_granularity) { + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + q->limits.discard_granularity >> sb->s_blocksize_bits); + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) + goto out; + } if (end >= max_blks) end = max_blks - 1; if (end <= first_data_blk) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index be4ee3dcc5cf..c5b2ea1a9372 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -455,12 +455,12 @@ int ext4_ext_migrate(struct inode *inode) percpu_down_write(&sbi->s_writepages_rwsem); /* - * Worst case we can touch the allocation bitmaps, a bgd - * block, and a block to link in the orphan list. We do need - * need to worry about credits for modifying the quota inode. + * Worst case we can touch the allocation bitmaps and a block + * group descriptor block. We do need need to worry about + * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, - 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); @@ -477,6 +477,13 @@ int ext4_ext_migrate(struct inode *inode) ext4_journal_stop(handle); goto out_unlock; } + /* + * Use the correct seed for checksum (i.e. the seed from 'inode'). This + * is so that the metadata blocks will have the correct checksum after + * the migration. + */ + ei = EXT4_I(inode); + EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later @@ -485,7 +492,6 @@ int ext4_ext_migrate(struct inode *inode) clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); - ext4_orphan_add(handle, tmp_inode); ext4_journal_stop(handle); /* @@ -510,17 +516,10 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { - /* - * It is impossible to update on-disk structures without - * a handle, so just rollback in-core changes and live other - * work to orphan_list_cleanup() - */ - ext4_orphan_del(NULL, tmp_inode); retval = PTR_ERR(handle); goto out_tmp_inode; } - ei = EXT4_I(inode); i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9905720df924..f10307215d58 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1385,10 +1385,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; - while ((char *) de < dlimit) { + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit && + if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2cc9f2168b9e..b66b335a0ca6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -100,8 +100,10 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_status) + if (bio->bi_status) { + set_buffer_write_io_error(bh); buffer_io_error(bh); + } } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); local_irq_restore(flags); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ad1d4c8faf44..b7f20196439a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -74,6 +74,11 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; } + if (ext4_has_feature_sparse_super2(sb)) { + ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); + return -EOPNOTSUPP; + } + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags)) ret = -EBUSY; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f68dfef5939f..c13879bd2168 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3485,9 +3485,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + @@ -4512,9 +4514,18 @@ no_journal: * Get the # of file system overhead blocks from the * superblock if present. */ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; @@ -5912,10 +5923,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); - if (err) { - lockdep_set_quota_inode(path->dentry->d_inode, - I_DATA_SEM_NORMAL); - } else { + if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; @@ -5935,7 +5943,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); + if (err) + dquot_quota_off(sb, type); } + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); return err; } @@ -5998,8 +6011,19 @@ static int ext4_enable_quotas(struct super_block *sb) "Failed to enable quota tracking " "(type=%d, err=%d). Please run " "e2fsck to fix.", type, err); - for (type--; type >= 0; type--) + for (type--; type >= 0; type--) { + struct inode *inode; + + inode = sb_dqopt(sb)->files[type]; + if (inode) + inode = igrab(inode); dquot_quota_off(sb, type); + if (inode) { + lockdep_set_quota_inode(inode, + I_DATA_SEM_NORMAL); + iput(inode); + } + } return err; } @@ -6101,7 +6125,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle(); - if (EXT4_SB(sb)->s_journal && !handle) { + if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f7d27cbbeb86..54f0d2c4c7d8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -848,6 +848,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, struct page *cp_page_1 = NULL, *cp_page_2 = NULL; struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; + unsigned int cp_blocks; int err; err = get_checkpoint_version(sbi, cp_addr, &cp_block, @@ -855,15 +856,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (err) return NULL; - if (le32_to_cpu(cp_block->cp_pack_total_block_count) > - sbi->blocks_per_seg) { + cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); + + if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) { f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u", le32_to_cpu(cp_block->cp_pack_total_block_count)); goto invalid_cp; } pre_version = *version; - cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + cp_addr += cp_blocks - 1; err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) @@ -1144,7 +1146,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) if (!is_journalled_quota(sbi)) return false; - down_write(&sbi->quota_sem); + if (!down_write_trylock(&sbi->quota_sem)) + return true; if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { ret = false; } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1679f9c0b63b..773028921c48 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2467,8 +2467,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping, /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[DATA]); - else if (atomic_read(&sbi->wb_sync_req[DATA])) + else if (atomic_read(&sbi->wb_sync_req[DATA])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 031a17bf52a2..5645502c156d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -931,6 +931,7 @@ struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ + unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ @@ -1800,6 +1801,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + avail_user_block_count -= sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (avail_user_block_count > sbi->unusable_block_count) avail_user_block_count -= sbi->unusable_block_count; @@ -2045,6 +2051,11 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + valid_block_count += sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + user_block_count = sbi->user_block_count; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) user_block_count -= sbi->unusable_block_count; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4b6c36208f55..68d5c73c5ed1 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -633,6 +633,11 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } + if (f2fs_check_nid_range(sbi, dni->ino)) { + f2fs_put_page(node_page, 1); + return false; + } + *nofs = ofs_of_node(node_page); source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2383d52b1f42..264c19e17779 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -777,6 +777,7 @@ void f2fs_handle_failed_inode(struct inode *inode) err = f2fs_get_node_info(sbi, inode->i_ino, &ni); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); + set_inode_flag(inode, FI_FREE_NID); f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); goto out; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4cb182c20eed..3dc7cc3d6ac6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1385,6 +1385,7 @@ page_hit: nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EINVAL; out_err: ClearPageUptodate(page); @@ -1994,8 +1995,12 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[NODE]); - else if (atomic_read(&sbi->wb_sync_req[NODE])) + else if (atomic_read(&sbi->wb_sync_req[NODE])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } trace_f2fs_writepages(mapping->host, wbc, NODE); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 2034b9a07d63..15b343f65609 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -508,7 +508,8 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) static inline int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments; + return SM_I(sbi)->reserved_segments + + SM_I(sbi)->additional_reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 41bf656658ba..6bd8a944902e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -277,6 +277,46 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).s_resgid)); } +static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) +{ + unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec; + unsigned int avg_vblocks; + unsigned int wanted_reserved_segments; + block_t avail_user_block_count; + + if (!F2FS_IO_ALIGNED(sbi)) + return 0; + + /* average valid block count in section in worst case */ + avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); + + /* + * we need enough free space when migrating one section in worst case + */ + wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * + reserved_segments(sbi); + wanted_reserved_segments -= reserved_segments(sbi); + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks - + F2FS_OPTION(sbi).root_reserved_blocks; + + if (wanted_reserved_segments * sbi->blocks_per_seg > + avail_user_block_count) { + f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", + wanted_reserved_segments, + avail_user_block_count >> sbi->log_blocks_per_seg); + return -ENOSPC; + } + + SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; + + f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", + wanted_reserved_segments); + + return 0; +} + static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) { if (!F2FS_OPTION(sbi).unusable_cap_perc) @@ -2026,7 +2066,7 @@ int f2fs_quota_sync(struct super_block *sb, int type) struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); int cnt; - int ret; + int ret = 0; /* * Now when everything is written we can discard the pagecache so @@ -2037,8 +2077,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (type != -1 && cnt != type) continue; - if (!sb_has_quota_active(sb, type)) - return 0; + if (!sb_has_quota_active(sb, cnt)) + continue; inode_lock(dqopt->files[cnt]); @@ -3450,6 +3490,10 @@ try_onemore: goto free_nm; } + err = adjust_reserved_segment(sbi); + if (err) + goto free_nm; + /* For write statistics */ if (sb->s_bdev->bd_part) sbi->sectors_written_start = diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3a5360e045cf..84bb37665093 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -262,7 +262,9 @@ out: if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks - + sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 296b3189448a..cf1bfed1e662 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -661,8 +661,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, } last = here; - while (!IS_XATTR_LAST_ENTRY(last)) + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + inode->i_ino, ENTRY_SIZE(last)); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto exit; + } last = XATTR_NEXT_ENTRY(last); + } newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); diff --git a/fs/file.c b/fs/file.c index e5d328335f88..51f53a7dc221 100644 --- a/fs/file.c +++ b/fs/file.c @@ -706,24 +706,69 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } +static inline struct file *__fget_files_rcu(struct files_struct *files, + unsigned int fd, fmode_t mask, unsigned int refs) +{ + for (;;) { + struct file *file; + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + struct file __rcu **fdentry; + + if (unlikely(fd >= fdt->max_fds)) + return NULL; + + fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + file = rcu_dereference_raw(*fdentry); + if (unlikely(!file)) + return NULL; + + if (unlikely(file->f_mode & mask)) + return NULL; + + /* + * Ok, we have a file pointer. However, because we do + * this all locklessly under RCU, we may be racing with + * that file being closed. + * + * Such a race can take two forms: + * + * (a) the file ref already went down to zero, + * and get_file_rcu_many() fails. Just try + * again: + */ + if (unlikely(!get_file_rcu_many(file, refs))) + continue; + + /* + * (b) the file table entry has changed under us. + * Note that we don't need to re-check the 'fdt->fd' + * pointer having changed, because it always goes + * hand-in-hand with 'fdt'. + * + * If so, we need to put our refs and try again. + */ + if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || + unlikely(rcu_dereference_raw(*fdentry) != file)) { + fput_many(file, refs); + continue; + } + + /* + * Ok, we have a ref to the file, and checked that it + * still exists. + */ + return file; + } +} + + static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) { struct files_struct *files = current->files; struct file *file; rcu_read_lock(); -loop: - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken. - * dup2() atomicity guarantee is the reason - * we loop to catch the new file (or NULL pointer) - */ - if (file->f_mode & mask) - file = NULL; - else if (!get_file_rcu_many(file, refs)) - goto loop; - } + file = __fget_files_rcu(files, fd, mask, refs); rcu_read_unlock(); return file; diff --git a/fs/fs_context.c b/fs/fs_context.c index 138b5b4d621d..e492a83fa100 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -258,7 +258,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct fs_context *fc; int ret = -ENOMEM; - fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); @@ -585,7 +585,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) param->key); } - if (len > PAGE_SIZE - 2 - size) + if (size + len + 2 > PAGE_SIZE) return invalf(fc, "VFS: Legacy: Cumulative options too large"); if (strchr(param->key, ',') || (param->type == fs_value_is_string && @@ -686,7 +686,7 @@ const struct fs_context_operations legacy_fs_context_ops = { */ static int legacy_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fa4d2aba5a70..ac6a8da34013 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -839,17 +839,17 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) goto out_put_old; } + get_page(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + /* * Release while we have extra ref on stolen page. Otherwise * anon_pipe_buf_release() might think the page can be reused. */ pipe_buf_release(cs->pipe, buf); - get_page(newpage); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add_file(newpage); - err = 0; spin_lock(&cs->req->waitq.lock); if (test_bit(FR_ABORTED, &cs->req->flags)) @@ -933,7 +933,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. + */ + if (cs->req->args->user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 60378f3baaae..34487bf1d791 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1032,7 +1032,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!parent) return -ENOENT; - inode_lock(parent); + inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) goto unlock; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1de59998e0e7..efb2a4871291 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1433,6 +1433,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.user_pages = true; if (write) ap->args.in_pages = 1; else @@ -3188,7 +3189,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); + int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); if (!err) fuse_sync_writes(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d87892648545..83c2855bc740 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -248,6 +248,7 @@ struct fuse_args { bool nocreds:1; bool in_pages:1; bool out_pages:1; + bool user_pages:1; bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index aaec3c5b0202..dec5285a02e9 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -940,7 +940,7 @@ do_alloc: else if (height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else - iomap->length = size - pos; + iomap->length = size - iomap->offset; } else if (flags & IOMAP_WRITE) { u64 alloc_size; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c056ed5c6df3..8153a3eac540 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -925,15 +925,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -950,6 +950,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: @@ -1429,7 +1430,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) start = r.start >> bs_shift; end = start + (r.len >> bs_shift); - minlen = max_t(u64, r.minlen, + minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize); + minlen = max_t(u64, minlen, q->limits.discard_granularity) >> bs_shift; if (end <= start || minlen > sdp->sd_max_rg_data) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 358398b1fe0c..7d039ba5ae28 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -38,6 +38,7 @@ #include <linux/uio.h> #include <linux/uaccess.h> +#include <linux/sched/mm.h> static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; @@ -201,13 +202,61 @@ out: #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA static unsigned long +hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_end(addr); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} + +static unsigned long +hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (unlikely(offset_in_page(addr))) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_end(addr); + addr = vm_unmapped_area(&info); + } + + return addr; +} + +static unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -223,18 +272,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } - info.flags = 0; - info.length = len; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - return vm_unmapped_area(&info); + /* + * Use mm->get_unmapped_area value as a hint to use topdown routine. + * If architectures have special needs, they should define their own + * version of hugetlb_get_unmapped_area. + */ + if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) + return hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); + return hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); } #endif diff --git a/fs/io_uring.c b/fs/io_uring.c index 478df7e10767..e73969fa96bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -438,6 +438,22 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; } +static void io_req_put_fs(struct io_kiocb *req) +{ + struct fs_struct *fs = req->fs; + + if (!fs) + return; + + spin_lock(&req->fs->lock); + if (--fs->users) + fs = NULL; + spin_unlock(&req->fs->lock); + if (fs) + free_fs_struct(fs); + req->fs = NULL; +} + static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { @@ -695,6 +711,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) static void __io_free_req(struct io_kiocb *req) { + io_req_put_fs(req); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); percpu_ref_put(&req->ctx->refs); @@ -1701,16 +1718,7 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINTR; } - if (req->fs) { - struct fs_struct *fs = req->fs; - - spin_lock(&req->fs->lock); - if (--fs->users) - fs = NULL; - spin_unlock(&req->fs->lock); - if (fs) - free_fs_struct(fs); - } + io_req_put_fs(req); io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 88146008b3e3..d45ceb2e2149 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -451,7 +451,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) } spin_unlock(&commit_transaction->t_handle_lock); commit_transaction->t_state = T_SWITCH; - write_unlock(&journal->j_state_lock); J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); @@ -471,6 +470,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) * has reserved. This is consistent with the existing behaviour * that multiple jbd2_journal_get_write_access() calls to the same * buffer are perfectly permissible. + * We use journal->j_state_lock here to serialize processing of + * t_reserved_list with eviction of buffers from journal_unmap_buffer(). */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; @@ -490,6 +491,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_refile_buffer(journal, jh); } + write_unlock(&journal->j_state_lock); /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index b288c8ae1236..837cd55fd4c5 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -415,13 +415,15 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); ret = -EIO; - goto out_free; + goto out_sum_exit; } jffs2_calc_trigger_levels(c); return 0; + out_sum_exit: + jffs2_sum_exit(c); out_free: kvfree(c->blocks); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f8fb89b10227..34880a4c2173 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -135,20 +135,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); pgoff_t index = pos >> PAGE_SHIFT; uint32_t pageofs = index << PAGE_SHIFT; int ret = 0; - pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) - return -ENOMEM; - *pagep = pg; - jffs2_dbg(1, "%s()\n", __func__); if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; uint32_t alloc_len; @@ -159,7 +154,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) - goto out_page; + goto out_err; mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); @@ -189,7 +184,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = PTR_ERR(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); if (f->metadata) { @@ -204,7 +199,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } jffs2_complete_reservation(c); inode->i_size = pageofs; @@ -212,6 +207,19 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, } /* + * While getting a page and reading data in, lock c->alloc_sem until + * the page is Uptodate. Otherwise GC task may attempt to read the same + * page in read_cache_page(), which causes a deadlock. + */ + mutex_lock(&c->alloc_sem); + pg = grab_cache_page_write_begin(mapping, index, flags); + if (!pg) { + ret = -ENOMEM; + goto release_sem; + } + *pagep = pg; + + /* * Read in the page if it wasn't already present. Cannot optimize away * the whole page write case until jffs2_write_end can handle the * case of a short-copy. @@ -220,15 +228,17 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); mutex_unlock(&f->sem); - if (ret) - goto out_page; + if (ret) { + unlock_page(pg); + put_page(pg); + goto release_sem; + } } jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); - return ret; -out_page: - unlock_page(pg); - put_page(pg); +release_sem: + mutex_unlock(&c->alloc_sem); +out_err: return ret; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index ab8cdd9e9325..ad1eba809e7e 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -602,8 +602,8 @@ out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); kvfree(c->blocks); - out_inohash: jffs2_clear_xattr_subsystem(c); + out_inohash: kfree(c->inocache_list); out_wbuf: jffs2_flash_cleanup(c); diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 0b1a7f68b712..f73904c08b39 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) if (!s) { JFFS2_WARNING("Can't allocate memory for summary\n"); ret = -ENOMEM; - goto out; + goto out_buf; } } @@ -274,13 +274,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) } ret = 0; out: + jffs2_sum_reset_collected(s); + kfree(s); + out_buf: if (buf_size) kfree(flashbuf); #ifndef __ECOS else mtd_unpoint(c->mtd, 0, c->mtd->size); #endif - kfree(s); return ret; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index d862cfc3d3a8..62c4a5450cda 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode) dquot_initialize(inode); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap; truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); - if (JFS_SBI(inode->i_sb)->ipimap) + if (ipimap && JFS_IP(ipimap)->i_imap) diFree(inode); /* diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 6fe82ce8663e..79f3440e204b 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -148,6 +148,7 @@ static const s8 budtab[256] = { * 0 - success * -ENOMEM - insufficient memory * -EIO - i/o error + * -EINVAL - wrong bmap data */ int dbMount(struct inode *ipbmap) { @@ -179,6 +180,12 @@ int dbMount(struct inode *ipbmap) bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + if (!bmp->db_numag) { + release_metapage(mp); + kfree(bmp); + return -EINVAL; + } + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 7b09a9158e40..3fffc709afd4 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -447,7 +447,8 @@ static const struct address_space_operations minix_aops = { .writepage = minix_writepage, .write_begin = minix_write_begin, .write_end = generic_write_end, - .bmap = minix_bmap + .bmap = minix_bmap, + .direct_IO = noop_direct_IO }; static const struct inode_operations minix_symlink_inode_operations = { diff --git a/fs/namei.c b/fs/namei.c index 5b5759d70822..b952ecbd49c2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3878,13 +3878,12 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); - fsnotify_rmdir(dir, dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) - d_delete(dentry); + d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); @@ -3995,7 +3994,6 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate if (!error) { dont_mount(dentry); detach_mounts(dentry); - fsnotify_unlink(dir, dentry); } } } @@ -4003,9 +4001,11 @@ out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ - if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { + if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { + fsnotify_unlink(dir, dentry); + } else if (!error) { fsnotify_link_count(target); - d_delete(dentry); + d_delete_notify(dir, dentry); } return error; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 8f34daf85f70..5d5227ce4d91 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -168,7 +168,7 @@ struct cb_devicenotifyitem { }; struct cb_devicenotifyargs { - int ndevs; + uint32_t ndevs; struct cb_devicenotifyitem *devs; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index fc775b0b5194..31922657e836 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -364,12 +364,11 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; - int i; + const struct pnfs_layoutdriver_type *ld = NULL; + uint32_t i; __be32 res = 0; - struct nfs_client *clp = cps->clp; - struct nfs_server *server = NULL; - if (!clp) { + if (!cps->clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; } @@ -377,23 +376,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, for (i = 0; i < args->ndevs; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - if (!server || - server->pnfs_curr_ld->id != dev->cbd_layout_type) { - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) - if (server->pnfs_curr_ld && - server->pnfs_curr_ld->id == dev->cbd_layout_type) { - rcu_read_unlock(); - goto found; - } - rcu_read_unlock(); - continue; + if (!ld || ld->id != dev->cbd_layout_type) { + pnfs_put_layoutdriver(ld); + ld = pnfs_find_layoutdriver(dev->cbd_layout_type); + if (!ld) + continue; } - - found: - nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id); } - + pnfs_put_layoutdriver(ld); out: kfree(args->devs); return res; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 73a5a5ea2976..04d27f0ed39a 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -258,11 +258,9 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, void *argp) { struct cb_devicenotifyargs *args = argp; + uint32_t tmp, n, i; __be32 *p; __be32 status = 0; - u32 tmp; - int n, i; - args->ndevs = 0; /* Num of device notifications */ p = xdr_inline_decode(xdr, sizeof(uint32_t)); @@ -271,12 +269,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, goto out; } n = ntohl(*p++); - if (n <= 0) - goto out; - if (n > ULONG_MAX / sizeof(*args->devs)) { - status = htonl(NFS4ERR_BADXDR); + if (n == 0) goto out; - } args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { @@ -330,19 +324,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, dev->cbd_immediate = 0; } - args->ndevs++; - dprintk("%s: type %d layout 0x%x immediate %d\n", __func__, dev->cbd_notify_type, dev->cbd_layout_type, dev->cbd_immediate); } + args->ndevs = n; + dprintk("%s: ndevs %d\n", __func__, args->ndevs); + return 0; +err: + kfree(args->devs); out: + args->devs = NULL; + args->ndevs = 0; dprintk("%s: status %d ndevs %d\n", __func__, ntohl(status), args->ndevs); return status; -err: - kfree(args->devs); - goto out; } static __be32 decode_sessionid(struct xdr_stream *xdr, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index af838d1ed281..35abe63655a9 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -176,6 +176,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); + clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_net = get_net(cl_init->net); @@ -419,7 +420,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); - new->cl_flags = cl_init->init_flags; return rpc_ops->init_client(new, cl_init); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e7c0790308fe..28ceee102d0b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1638,6 +1638,24 @@ out: no_open: res = nfs_lookup(dir, dentry, lookup_flags); + if (!res) { + inode = d_inode(dentry); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) + res = ERR_PTR(-ENOTDIR); + else if (inode && S_ISREG(inode->i_mode)) + res = ERR_PTR(-EOPENSTALE); + } else if (!IS_ERR(res)) { + inode = d_inode(res); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { + dput(res); + res = ERR_PTR(-ENOTDIR); + } else if (inode && S_ISREG(inode->i_mode)) { + dput(res); + res = ERR_PTR(-EOPENSTALE); + } + } if (switched) { d_lookup_done(dentry); if (!res) @@ -2035,6 +2053,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) trace_nfs_link_enter(inode, dir, dentry); d_drop(dentry); + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { ihold(inode); @@ -2123,6 +2143,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + if (S_ISREG(old_inode->i_mode)) + nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6b0bf4ebd812..0682037f972b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -272,8 +272,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter); - return nfs_file_direct_write(iocb, iter); + return nfs_file_direct_read(iocb, iter, true); + return nfs_file_direct_write(iocb, iter, true); } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -524,6 +524,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -539,7 +540,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -581,12 +583,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (iter_is_iovec(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - nfs_start_io_direct(inode); + if (!swap) + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - nfs_end_io_direct(inode); + if (!swap) + nfs_end_io_direct(inode); if (requested > 0) { result = nfs_direct_wait(dreq); @@ -851,7 +855,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct iov_iter *iter, - loff_t pos) + loff_t pos, int ioflags) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; @@ -859,7 +863,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); - nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, + nfs_pageio_init_write(&desc, inode, ioflags, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); @@ -937,6 +941,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -953,7 +958,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { ssize_t result = -EINVAL, requested; size_t count; @@ -967,7 +973,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(iocb, iter); + if (swap) + /* bypass generic checks */ + result = iov_iter_count(iter); + else + result = generic_write_checks(iocb, iter); if (result <= 0) return result; count = result; @@ -997,16 +1007,22 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - nfs_start_io_direct(inode); + if (swap) { + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_STABLE); + } else { + nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_COND_STABLE); - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } - nfs_end_io_direct(inode); + nfs_end_io_direct(inode); + } if (requested > 0) { result = nfs_direct_wait(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 387a2cfa7e17..73415970af38 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -161,7 +161,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to); + return nfs_file_direct_read(iocb, to, false); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -609,7 +609,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_write(iocb, from); + return nfs_file_direct_write(iocb, from, false); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, iov_iter_count(from), (long long) iocb->ki_pos); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 209263c0c537..3bddf5332b6d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -791,12 +791,9 @@ int nfs_getattr(const struct path *path, struct kstat *stat, goto out_no_update; /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME|STATX_MTIME)) && - S_ISREG(inode->i_mode)) { - err = filemap_write_and_wait(inode->i_mapping); - if (err) - goto out; - } + if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); /* * We may force a getattr if the user cares about atime. diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 887f9136a9db..af557dc2cfe1 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -953,7 +953,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 23d75cddbb2e..84369d51353a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1968,7 +1968,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, bool plus) { struct user_namespace *userns = rpc_userns(entry->server->client); - struct nfs_entry old = *entry; __be32 *p; int error; u64 new_cookie; @@ -1988,15 +1987,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_fileid3(xdr, &entry->ino); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) - return error; + return -EAGAIN; entry->d_type = DT_UNKNOWN; @@ -2004,7 +2003,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; error = decode_post_op_attr(xdr, entry->fattr, userns); if (unlikely(error)) - return error; + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); @@ -2019,11 +2018,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, return -EAGAIN; if (*p != xdr_zero) { error = decode_nfs_fh3(xdr, entry->fh); - if (unlikely(error)) { - if (error == -E2BIG) - goto out_truncated; - return error; - } + if (unlikely(error)) + return -EAGAIN; } else zero_nfs_fh3(entry->fh); } @@ -2032,11 +2028,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->cookie = new_cookie; return 0; - -out_truncated: - dprintk("NFS: directory entry contains invalid file handle\n"); - *entry = old; - return -EAGAIN; } /* diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6b7c926824ae..504812ea4bc2 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -295,8 +295,9 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - truncate_pagecache_range(dst_inode, pos_dst, - pos_dst + res->write_res.count); + WARN_ON_ONCE(invalidate_inode_pages2_range(dst_inode->i_mapping, + pos_dst >> PAGE_SHIFT, + (pos_dst + res->write_res.count - 1) >> PAGE_SHIFT)); status = res->write_res.count; out: diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index aed865a84629..2b78f7b8d546 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -769,8 +769,7 @@ static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp, status = decode_clone(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->dst_fattr, res->server); - + decode_getfattr(xdr, res->dst_fattr, res->server); out: res->rpc_status = status; return status; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5708b5a636f1..ebd77a301057 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -279,7 +279,8 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); - +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_async_handle_error(struct rpc_task *task, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8cace8350fa3..3671a51fe5eb 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1293,8 +1293,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, } nfs_put_client(clp); - if (server->nfs_client->cl_hostname == NULL) + if (server->nfs_client->cl_hostname == NULL) { server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + if (server->nfs_client->cl_hostname == NULL) + return -ENOMEM; + } nfs_server_insert_lists(server); return nfs_probe_destination(server); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2e460c33ae48..768258848a68 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,8 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct net *net) +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net) { ssize_t ret; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fb3d1532f11d..cf3b00751ff6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -359,6 +359,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) { + fattr->pre_change_attr = version; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } +} + static void nfs4_test_and_free_stateid(struct nfs_server *server, nfs4_stateid *stateid, const struct cred *cred) @@ -6307,7 +6315,9 @@ static void nfs4_delegreturn_release(void *calldata) pnfs_roc_release(&data->lr.arg, &data->lr.res, data->res.lr_ret); if (inode) { - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + nfs4_fattr_set_prechange(&data->fattr, + inode_peek_iversion_raw(inode)); + nfs_refresh_inode(inode, &data->fattr); nfs_iput_and_deactive(inode); } kfree(calldata); @@ -7918,6 +7928,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) case -NFS4ERR_DEADSESSION: nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + return; } if (args->dir == NFS4_CDFC4_FORE_OR_BOTH && res->dir != NFS4_CDFS4_BOTH) { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ea680f619438..1d2b81a233bb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/jiffies.h> +#include <linux/sched/mm.h> #include <linux/sunrpc/clnt.h> @@ -2070,6 +2071,9 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } result = -NFS4ERR_NXIO; + if (!locations->nlocations) + goto out; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); @@ -2501,9 +2505,17 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) static void nfs4_state_manager(struct nfs_client *clp) { + unsigned int memflags; int status = 0; const char *section = "", *section_sep = ""; + /* + * State recovery can deadlock if the direct reclaim code tries + * start NFS writeback. So ensure memory allocations are all + * GFP_NOFS. + */ + memflags = memalloc_nofs_save(); + /* Ensure exclusive access to NFSv4 state */ do { clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); @@ -2597,6 +2609,7 @@ static void nfs4_state_manager(struct nfs_client *clp) clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); } + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); @@ -2613,6 +2626,7 @@ static void nfs4_state_manager(struct nfs_client *clp) return; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; + memflags = memalloc_nofs_save(); } while (refcount_read(&clp->cl_count) > 1 && !signalled()); goto out_drain; @@ -2624,6 +2638,7 @@ out_error: clp->cl_hostname, -status); ssleep(1); out_drain: + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9a022a4fb964..2b7741fe42ea 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3683,8 +3683,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(!p)) goto out_eio; n = be32_to_cpup(p); - if (n <= 0) - goto out_eio; for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; struct nfs4_fs_location *loc; @@ -4187,10 +4185,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); + if (label && label->label) + dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n", + __func__, label->len, (char *)label->label, + label->len, label->pi, label->lfs); } - if (label && label->label) - dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, - (char *)label->label, label->len, label->pi, label->lfs); return status; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 1b512df1003f..0471b6e0da16 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -92,6 +92,17 @@ find_pnfs_driver(u32 id) return local; } +const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id) +{ + return find_pnfs_driver(id); +} + +void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld) +{ + if (ld) + module_put(ld->owner); +} + void unset_pnfs_layoutdriver(struct nfs_server *nfss) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 3d55edd6b25a..68339680bb7d 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -226,6 +226,8 @@ struct pnfs_devicelist { extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); +extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id); +extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld); /* nfs4proc.c */ extern size_t max_response_pages(struct nfs_server *server); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 662937472e9b..79ad6b2c9608 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -44,6 +44,17 @@ struct nfsd_fcache_bucket { static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); +struct nfsd_fcache_disposal { + struct list_head list; + struct work_struct work; + struct net *net; + spinlock_t lock; + struct list_head freeme; + struct rcu_head rcu; +}; + +struct workqueue_struct *nfsd_filecache_wq __read_mostly; + static struct kmem_cache *nfsd_file_slab; static struct kmem_cache *nfsd_file_mark_slab; static struct nfsd_fcache_bucket *nfsd_file_hashtbl; @@ -52,32 +63,21 @@ static long nfsd_file_lru_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; static atomic_long_t nfsd_filecache_count; static struct delayed_work nfsd_filecache_laundrette; +static DEFINE_SPINLOCK(laundrette_lock); +static LIST_HEAD(laundrettes); -enum nfsd_file_laundrette_ctl { - NFSD_FILE_LAUNDRETTE_NOFLUSH = 0, - NFSD_FILE_LAUNDRETTE_MAY_FLUSH -}; +static void nfsd_file_gc(void); static void -nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl) +nfsd_file_schedule_laundrette(void) { long count = atomic_long_read(&nfsd_filecache_count); if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags)) return; - /* Be more aggressive about scanning if over the threshold */ - if (count > NFSD_FILE_LRU_THRESHOLD) - mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0); - else - schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); - - if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH) - return; - - /* ...and don't delay flushing if we're out of control */ - if (count >= NFSD_FILE_LRU_LIMIT) - flush_delayed_work(&nfsd_filecache_laundrette); + queue_delayed_work(system_wq, &nfsd_filecache_laundrette, + NFSD_LAUNDRETTE_DELAY); } static void @@ -260,8 +260,6 @@ nfsd_file_do_unhash(struct nfsd_file *nf) nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; hlist_del_rcu(&nf->nf_node); - if (!list_empty(&nf->nf_lru)) - list_lru_del(&nfsd_file_lru, &nf->nf_lru); atomic_long_dec(&nfsd_filecache_count); } @@ -270,6 +268,8 @@ nfsd_file_unhash(struct nfsd_file *nf) { if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { nfsd_file_do_unhash(nf); + if (!list_empty(&nf->nf_lru)) + list_lru_del(&nfsd_file_lru, &nf->nf_lru); return true; } return false; @@ -316,7 +316,9 @@ nfsd_file_put(struct nfsd_file *nf) set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused) - nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH); + nfsd_file_schedule_laundrette(); + if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) + nfsd_file_gc(); } struct nfsd_file * @@ -357,6 +359,58 @@ nfsd_file_dispose_list_sync(struct list_head *dispose) flush_delayed_fput(); } +static void +nfsd_file_list_remove_disposal(struct list_head *dst, + struct nfsd_fcache_disposal *l) +{ + spin_lock(&l->lock); + list_splice_init(&l->freeme, dst); + spin_unlock(&l->lock); +} + +static void +nfsd_file_list_add_disposal(struct list_head *files, struct net *net) +{ + struct nfsd_fcache_disposal *l; + + rcu_read_lock(); + list_for_each_entry_rcu(l, &laundrettes, list) { + if (l->net == net) { + spin_lock(&l->lock); + list_splice_tail_init(files, &l->freeme); + spin_unlock(&l->lock); + queue_work(nfsd_filecache_wq, &l->work); + break; + } + } + rcu_read_unlock(); +} + +static void +nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src, + struct net *net) +{ + struct nfsd_file *nf, *tmp; + + list_for_each_entry_safe(nf, tmp, src, nf_lru) { + if (nf->nf_net == net) + list_move_tail(&nf->nf_lru, dst); + } +} + +static void +nfsd_file_dispose_list_delayed(struct list_head *dispose) +{ + LIST_HEAD(list); + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + nfsd_file_list_add_pernet(&list, dispose, nf->nf_net); + nfsd_file_list_add_disposal(&list, nf->nf_net); + } +} + /* * Note this can deadlock with nfsd_file_cache_purge. */ @@ -403,18 +457,40 @@ out_skip: return LRU_SKIP; } -static void -nfsd_file_lru_dispose(struct list_head *head) +static unsigned long +nfsd_file_lru_walk_list(struct shrink_control *sc) { - while(!list_empty(head)) { - struct nfsd_file *nf = list_first_entry(head, - struct nfsd_file, nf_lru); - list_del_init(&nf->nf_lru); + LIST_HEAD(head); + struct nfsd_file *nf; + unsigned long ret; + + if (sc) + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, + nfsd_file_lru_cb, &head); + else + ret = list_lru_walk(&nfsd_file_lru, + nfsd_file_lru_cb, + &head, LONG_MAX); + list_for_each_entry(nf, &head, nf_lru) { spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); nfsd_file_do_unhash(nf); spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); - nfsd_file_put_noref(nf); } + nfsd_file_dispose_list_delayed(&head); + return ret; +} + +static void +nfsd_file_gc(void) +{ + nfsd_file_lru_walk_list(NULL); +} + +static void +nfsd_file_gc_worker(struct work_struct *work) +{ + nfsd_file_gc(); + nfsd_file_schedule_laundrette(); } static unsigned long @@ -426,12 +502,7 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) static unsigned long nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) { - LIST_HEAD(head); - unsigned long ret; - - ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head); - nfsd_file_lru_dispose(&head); - return ret; + return nfsd_file_lru_walk_list(sc); } static struct shrinker nfsd_file_shrinker = { @@ -493,7 +564,7 @@ nfsd_file_close_inode(struct inode *inode) __nfsd_file_close_inode(inode, hashval, &dispose); trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose)); - nfsd_file_dispose_list(&dispose); + nfsd_file_dispose_list_delayed(&dispose); } /** @@ -509,16 +580,11 @@ static void nfsd_file_delayed_close(struct work_struct *work) { LIST_HEAD(head); + struct nfsd_fcache_disposal *l = container_of(work, + struct nfsd_fcache_disposal, work); - list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX); - - if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags)) - nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH); - - if (!list_empty(&head)) { - nfsd_file_lru_dispose(&head); - flush_delayed_fput(); - } + nfsd_file_list_remove_disposal(&head, l); + nfsd_file_dispose_list(&head); } static int @@ -579,6 +645,10 @@ nfsd_file_cache_init(void) if (nfsd_file_hashtbl) return 0; + nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0); + if (!nfsd_filecache_wq) + goto out; + nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE, sizeof(*nfsd_file_hashtbl), GFP_KERNEL); if (!nfsd_file_hashtbl) { @@ -632,7 +702,7 @@ nfsd_file_cache_init(void) spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock); } - INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close); + INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker); out: return ret; out_notifier: @@ -648,6 +718,8 @@ out_err: nfsd_file_mark_slab = NULL; kfree(nfsd_file_hashtbl); nfsd_file_hashtbl = NULL; + destroy_workqueue(nfsd_filecache_wq); + nfsd_filecache_wq = NULL; goto out; } @@ -686,6 +758,88 @@ nfsd_file_cache_purge(struct net *net) } } +static struct nfsd_fcache_disposal * +nfsd_alloc_fcache_disposal(struct net *net) +{ + struct nfsd_fcache_disposal *l; + + l = kmalloc(sizeof(*l), GFP_KERNEL); + if (!l) + return NULL; + INIT_WORK(&l->work, nfsd_file_delayed_close); + l->net = net; + spin_lock_init(&l->lock); + INIT_LIST_HEAD(&l->freeme); + return l; +} + +static void +nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) +{ + rcu_assign_pointer(l->net, NULL); + cancel_work_sync(&l->work); + nfsd_file_dispose_list(&l->freeme); + kfree_rcu(l, rcu); +} + +static void +nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l) +{ + spin_lock(&laundrette_lock); + list_add_tail_rcu(&l->list, &laundrettes); + spin_unlock(&laundrette_lock); +} + +static void +nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l) +{ + spin_lock(&laundrette_lock); + list_del_rcu(&l->list); + spin_unlock(&laundrette_lock); +} + +static int +nfsd_alloc_fcache_disposal_net(struct net *net) +{ + struct nfsd_fcache_disposal *l; + + l = nfsd_alloc_fcache_disposal(net); + if (!l) + return -ENOMEM; + nfsd_add_fcache_disposal(l); + return 0; +} + +static void +nfsd_free_fcache_disposal_net(struct net *net) +{ + struct nfsd_fcache_disposal *l; + + rcu_read_lock(); + list_for_each_entry_rcu(l, &laundrettes, list) { + if (l->net != net) + continue; + nfsd_del_fcache_disposal(l); + rcu_read_unlock(); + nfsd_free_fcache_disposal(l); + return; + } + rcu_read_unlock(); +} + +int +nfsd_file_cache_start_net(struct net *net) +{ + return nfsd_alloc_fcache_disposal_net(net); +} + +void +nfsd_file_cache_shutdown_net(struct net *net) +{ + nfsd_file_cache_purge(net); + nfsd_free_fcache_disposal_net(net); +} + void nfsd_file_cache_shutdown(void) { @@ -712,6 +866,8 @@ nfsd_file_cache_shutdown(void) nfsd_file_mark_slab = NULL; kfree(nfsd_file_hashtbl); nfsd_file_hashtbl = NULL; + destroy_workqueue(nfsd_filecache_wq); + nfsd_filecache_wq = NULL; } static bool @@ -881,7 +1037,8 @@ open_file: nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, nfsd_file_hashtbl[hashval].nfb_count); spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); - atomic_long_inc(&nfsd_filecache_count); + if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD) + nfsd_file_gc(); nf->nf_mark = nfsd_file_mark_find_or_create(nf); if (nf->nf_mark) diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 851d9abf54c2..79a7d6808d97 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -51,6 +51,8 @@ struct nfsd_file { int nfsd_file_cache_init(void); void nfsd_file_cache_purge(struct net *); void nfsd_file_cache_shutdown(void); +int nfsd_file_cache_start_net(struct net *net); +void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index cea68d8411ac..bb97c020c96b 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -195,6 +195,11 @@ nfsd3_proc_write(struct svc_rqst *rqstp) (unsigned long long) argp->offset, argp->stable? " stable" : ""); + resp->status = nfserr_fbig; + if (argp->offset > (u64)OFFSET_MAX || + argp->offset + argp->len > (u64)OFFSET_MAX) + return rpc_success; + fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4798667af647..452ed633a2c7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -992,8 +992,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unsigned long cnt; int nvecs; - if (write->wr_offset >= OFFSET_MAX) - return nfserr_inval; + if (write->wr_offset > (u64)OFFSET_MAX || + write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX) + return nfserr_fbig; cnt = write->wr_buflen; trace_nfsd_write_start(rqstp, &cstate->current_fh, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index c35c0ebaf722..7d408957ed62 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -2177,6 +2177,7 @@ static struct notifier_block nfsd4_cld_block = { int register_cld_notifier(void) { + WARN_ON(!nfsd_net_id); return rpc_pipefs_notifier_register(&nfsd4_cld_block); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3283cc2a4e42..62eb78ac7437 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1041,6 +1041,11 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) return 0; } +static bool delegation_hashed(struct nfs4_delegation *dp) +{ + return !(list_empty(&dp->dl_perfile)); +} + static bool unhash_delegation_locked(struct nfs4_delegation *dp) { @@ -1048,7 +1053,7 @@ unhash_delegation_locked(struct nfs4_delegation *dp) lockdep_assert_held(&state_lock); - if (list_empty(&dp->dl_perfile)) + if (!delegation_hashed(dp)) return false; dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; @@ -3936,8 +3941,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfserr_clid_inuse; if (client_has_state(old) && !same_creds(&unconf->cl_cred, - &old->cl_cred)) + &old->cl_cred)) { + old = NULL; goto out; + } status = mark_client_expired_locked(old); if (status) { old = NULL; @@ -4406,7 +4413,7 @@ static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) * queued for a lease break. Don't queue it again. */ spin_lock(&state_lock); - if (dp->dl_time == 0) { + if (delegation_hashed(dp) && dp->dl_time == 0) { dp->dl_time = get_seconds(); list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 730386c130e0..055cc0458f27 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1247,7 +1247,8 @@ static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry) clear_ncl(d_inode(dentry)); dget(dentry); ret = simple_unlink(dir, dentry); - d_delete(dentry); + d_drop(dentry); + fsnotify_unlink(dir, dentry); dput(dentry); WARN_ON_ONCE(ret); } @@ -1336,8 +1337,8 @@ void nfsd_client_rmdir(struct dentry *dentry) dget(dentry); ret = simple_rmdir(dir, dentry); WARN_ON_ONCE(ret); + d_drop(dentry); fsnotify_rmdir(dir, dentry); - d_delete(dentry); dput(dentry); inode_unlock(dir); } @@ -1526,12 +1527,9 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = register_cld_notifier(); - if (retval) - return retval; retval = nfsd4_init_slabs(); if (retval) - goto out_unregister_notifier; + return retval; retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; @@ -1549,9 +1547,14 @@ static int __init init_nfsd(void) goto out_free_exports; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) + goto out_free_filesystem; + retval = register_cld_notifier(); + if (retval) goto out_free_all; return 0; out_free_all: + unregister_pernet_subsys(&nfsd_net_ops); +out_free_filesystem: unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); @@ -1565,13 +1568,12 @@ out_free_stat: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); -out_unregister_notifier: - unregister_cld_notifier(); return retval; } static void __exit exit_nfsd(void) { + unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); @@ -1582,7 +1584,6 @@ static void __exit exit_nfsd(void) nfsd4_exit_pnfs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); - unregister_cld_notifier(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 754c763374dd..4aca93e11af7 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -230,7 +230,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) unsigned long cnt = argp->len; unsigned int nvecs; - dprintk("nfsd: WRITE %s %d bytes at %d\n", + dprintk("nfsd: WRITE %s %u bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 155a4e43b24e..d63cdda1782d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -394,13 +394,18 @@ static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cre nn->lockd_up = 1; } - ret = nfs4_state_start_net(net); + ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; + ret = nfs4_state_start_net(net); + if (ret) + goto out_filecache; nn->nfsd_net_up = true; return 0; +out_filecache: + nfsd_file_cache_shutdown_net(net); out_lockd: if (nn->lockd_up) { lockd_down(net); @@ -415,7 +420,7 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nfsd_file_cache_purge(net); + nfsd_file_cache_shutdown_net(net); nfs4_state_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index b073bdc2e6e8..127db5351d01 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -53,14 +53,14 @@ TRACE_EVENT(nfsd_compound_status, DECLARE_EVENT_CLASS(nfsd_io_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, - unsigned long len), + u64 offset, + u32 len), TP_ARGS(rqstp, fhp, offset, len), TP_STRUCT__entry( __field(u32, xid) __field(u32, fh_hash) - __field(loff_t, offset) - __field(unsigned long, len) + __field(u64, offset) + __field(u32, len) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqstp->rq_xid); @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(nfsd_io_class, __entry->offset = offset; __entry->len = len; ), - TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu", + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u", __entry->xid, __entry->fh_hash, __entry->offset, __entry->len) ) @@ -77,8 +77,8 @@ DECLARE_EVENT_CLASS(nfsd_io_class, DEFINE_EVENT(nfsd_io_class, nfsd_##name, \ TP_PROTO(struct svc_rqst *rqstp, \ struct svc_fh *fhp, \ - loff_t offset, \ - unsigned long len), \ + u64 offset, \ + u32 len), \ TP_ARGS(rqstp, fhp, offset, len)) DEFINE_NFSD_IO_EVENT(read_start); diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index ea7cca3a64b7..6251d8754c82 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -33,7 +33,7 @@ struct nfsd_readargs { struct nfsd_writeargs { svc_fh fh; __u32 offset; - int len; + __u32 len; struct kvec first; }; diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 3c4811469ae8..e278bfc5ee7f 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1503,7 +1503,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, na.type = AT_BITMAP; na.name = I30; na.name_len = 4; - bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na); + bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na); if (bmp_vi) { write_inode_now(bmp_vi, !datasync); iput(bmp_vi); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 46dc16e01fe2..cf222c9225d6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -30,10 +30,10 @@ /** * ntfs_test_inode - compare two (possibly fake) inodes for equality * @vi: vfs inode which to test - * @na: ntfs attribute which is being tested with + * @data: data which is being tested with * * Compare the ntfs attribute embedded in the ntfs specific part of the vfs - * inode @vi for equality with the ntfs attribute @na. + * inode @vi for equality with the ntfs attribute @data. * * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. * @na->name and @na->name_len are then ignored. @@ -43,8 +43,9 @@ * NOTE: This function runs with the inode_hash_lock spin lock held so it is not * allowed to sleep. */ -int ntfs_test_inode(struct inode *vi, ntfs_attr *na) +int ntfs_test_inode(struct inode *vi, void *data) { + ntfs_attr *na = (ntfs_attr *)data; ntfs_inode *ni; if (vi->i_ino != na->mft_no) @@ -72,9 +73,9 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) /** * ntfs_init_locked_inode - initialize an inode * @vi: vfs inode to initialize - * @na: ntfs attribute which to initialize @vi to + * @data: data which to initialize @vi to * - * Initialize the vfs inode @vi with the values from the ntfs attribute @na in + * Initialize the vfs inode @vi with the values from the ntfs attribute @data in * order to enable ntfs_test_inode() to do its work. * * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. @@ -87,8 +88,9 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) * NOTE: This function runs with the inode->i_lock spin lock held so it is not * allowed to sleep. (Hence the GFP_ATOMIC allocation.) */ -static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) +static int ntfs_init_locked_inode(struct inode *vi, void *data) { + ntfs_attr *na = (ntfs_attr *)data; ntfs_inode *ni = NTFS_I(vi); vi->i_ino = na->mft_no; @@ -131,7 +133,6 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) return 0; } -typedef int (*set_t)(struct inode *, void *); static int ntfs_read_locked_inode(struct inode *vi); static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); static int ntfs_read_locked_index_inode(struct inode *base_vi, @@ -164,8 +165,8 @@ struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) na.name = NULL; na.name_len = 0; - vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode, - (set_t)ntfs_init_locked_inode, &na); + vi = iget5_locked(sb, mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); if (unlikely(!vi)) return ERR_PTR(-ENOMEM); @@ -225,8 +226,8 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, na.name = name; na.name_len = name_len; - vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, - (set_t)ntfs_init_locked_inode, &na); + vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); if (unlikely(!vi)) return ERR_PTR(-ENOMEM); @@ -280,8 +281,8 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, na.name = name; na.name_len = name_len; - vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, - (set_t)ntfs_init_locked_inode, &na); + vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); if (unlikely(!vi)) return ERR_PTR(-ENOMEM); @@ -1880,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(sb, "Not enough memory to allocate buffer " diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 98e670fbdd31..363e4e820673 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -253,9 +253,7 @@ typedef struct { ATTR_TYPE type; } ntfs_attr; -typedef int (*test_t)(struct inode *, void *); - -extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na); +extern int ntfs_test_inode(struct inode *vi, void *data); extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3aac5c917afe..58234b42d68f 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -958,7 +958,7 @@ bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, * dirty code path of the inode dirty code path when writing * $MFT occurs. */ - vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na); + vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na); } if (vi) { ntfs_debug("Base inode 0x%lx is in icache.", mft_no); @@ -1019,7 +1019,7 @@ bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, vi = igrab(mft_vi); BUG_ON(vi != mft_vi); } else - vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode, + vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode, &na); if (!vi) { /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index eaec97892dce..c1cf67b24c19 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1100,17 +1100,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } - root = d_make_root(inode); - if (!root) { - status = -ENOMEM; - mlog_errno(status); - goto read_super_error; - } - - sb->s_root = root; - - ocfs2_complete_mount_recovery(osb); - osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); if (!osb->osb_dev_kset) { @@ -1128,6 +1117,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } + root = d_make_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 2bb916d68576..023b9bc54b7c 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -179,7 +179,7 @@ orangefs_bufmap_free(struct orangefs_bufmap *bufmap) { kfree(bufmap->page_array); kfree(bufmap->desc_array); - kfree(bufmap->buffer_index_array); + bitmap_free(bufmap->buffer_index_array); kfree(bufmap); } @@ -229,8 +229,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) bufmap->desc_size = user_desc->size; bufmap->desc_shift = ilog2(bufmap->desc_size); - bufmap->buffer_index_array = - kzalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG), GFP_KERNEL); + bufmap->buffer_index_array = bitmap_zalloc(bufmap->desc_count, GFP_KERNEL); if (!bufmap->buffer_index_array) goto out_free_bufmap; @@ -253,7 +252,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) out_free_desc_array: kfree(bufmap->desc_array); out_free_index_array: - kfree(bufmap->buffer_index_array); + bitmap_free(bufmap->buffer_index_array); out_free_bufmap: kfree(bufmap); out: diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 876de87f604c..8c89eaea1583 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -113,8 +113,7 @@ kill_whiteout: goto out; } -static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, - umode_t mode) +int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode) { int err; struct dentry *d, *dentry = *newdentry; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6934bcf030f0..a8e9da5f01eb 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -409,6 +409,7 @@ struct ovl_cattr { #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) +int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode); struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr); int ovl_cleanup(struct inode *dir, struct dentry *dentry); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f036d7544d4a..f5cf0938f298 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -650,10 +650,14 @@ retry: goto retry; } - work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode)); - err = PTR_ERR(work); - if (IS_ERR(work)) - goto out_err; + err = ovl_mkdir_real(dir, &work, attr.ia_mode); + if (err) + goto out_dput; + + /* Weird filesystem returning with hashed negative (kernfs)? */ + err = -EINVAL; + if (d_really_is_negative(work)) + goto out_dput; /* * Try to remove POSIX ACL xattrs from workdir. We are good if: diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 080ca9d5eccb..b1102a31a108 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -125,9 +125,13 @@ ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (pfn_is_ram(pfn) == 0) - memset(buf, 0, nr_bytes); - else { + if (pfn_is_ram(pfn) == 0) { + tmp = 0; + if (!userbuf) + memset(buf, 0, nr_bytes); + else if (clear_user(buf, nr_bytes)) + tmp = -EFAULT; + } else { if (encrypted) tmp = copy_oldmem_page_encrypted(pfn, buf, nr_bytes, @@ -136,10 +140,10 @@ ssize_t read_from_oldmem(char *buf, size_t count, else tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - - if (tmp < 0) - return tmp; } + if (tmp < 0) + return tmp; + *ppos += nr_bytes; count -= nr_bytes; buf += nr_bytes; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 7abc3230c21a..dc5f8654b277 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -693,9 +693,14 @@ int dquot_quota_sync(struct super_block *sb, int type) /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } + ret = sync_blockdev(sb->s_bdev); + if (ret) + return ret; /* * Now when everything is written we can discard the pagecache so diff --git a/fs/select.c b/fs/select.c index e51796063cb6..7716d9d5be1e 100644 --- a/fs/select.c +++ b/fs/select.c @@ -458,9 +458,11 @@ get_max: return max; } -#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR) -#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR) -#define POLLEX_SET (EPOLLPRI) +#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ + EPOLLNVAL) +#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ + EPOLLNVAL) +#define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, @@ -527,6 +529,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) break; if (!(bit & all_bits)) continue; + mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, @@ -534,34 +537,34 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) mask = vfs_poll(f.file, wait); fdput(f); - if ((mask & POLLIN_SET) && (in & bit)) { - res_in |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLOUT_SET) && (out & bit)) { - res_out |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLEX_SET) && (ex & bit)) { - res_ex |= bit; - retval++; - wait->_qproc = NULL; - } - /* got something, stop busy polling */ - if (retval) { - can_busy_loop = false; - busy_flag = 0; - - /* - * only remember a returned - * POLL_BUSY_LOOP if we asked for it - */ - } else if (busy_flag & mask) - can_busy_loop = true; - } + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait->_qproc = NULL; + } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } if (res_in) *rinp = res_in; diff --git a/fs/signalfd.c b/fs/signalfd.c index 5b78719be445..3e94d181930f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -35,17 +35,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) { - wait_queue_head_t *wqh = &sighand->signalfd_wqh; - /* - * The lockless check can race with remove_wait_queue() in progress, - * but in this case its caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. - */ - if (likely(!waitqueue_active(wqh))) - return; - - /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ - wake_up_poll(wqh, EPOLLHUP | POLLFREE); + wake_up_pollfree(&sighand->signalfd_wqh); } struct signalfd_ctx { diff --git a/fs/stat.c b/fs/stat.c index c38e4c2e1221..268c9eb89656 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -290,9 +290,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -301,7 +298,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -309,7 +308,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -319,7 +318,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -593,11 +592,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -607,7 +608,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; diff --git a/fs/super.c b/fs/super.c index 877532baf513..e255c18fa2c8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1470,8 +1470,8 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -static int reconfigure_single(struct super_block *s, - int flags, void *data) +int reconfigure_single(struct super_block *s, + int flags, void *data) { struct fs_context *fc; int ret; @@ -1691,11 +1691,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +static void sb_freeze_unlock(struct super_block *sb, int level) { - int level; - - for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } @@ -1766,7 +1764,14 @@ int freeze_super(struct super_block *sb) sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ - sync_filesystem(sb); + ret = sync_filesystem(sb); + if (ret) { + sb->s_writers.frozen = SB_UNFROZEN; + sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; @@ -1778,7 +1783,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1829,7 +1834,7 @@ static int thaw_super_locked(struct super_block *sb) } sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index efe078fe5d4a..7878f145bf1b 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -159,6 +159,77 @@ struct tracefs_fs_info { struct tracefs_mount_opts mount_opts; }; +static void change_gid(struct dentry *dentry, kgid_t gid) +{ + if (!dentry->d_inode) + return; + dentry->d_inode->i_gid = gid; +} + +/* + * Taken from d_walk, but without he need for handling renames. + * Nothing can be renamed while walking the list, as tracefs + * does not support renames. This is only called when mounting + * or remounting the file system, to set all the files to + * the given gid. + */ +static void set_gid(struct dentry *parent, kgid_t gid) +{ + struct dentry *this_parent; + struct list_head *next; + + this_parent = parent; + spin_lock(&this_parent->d_lock); + + change_gid(this_parent, gid); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + change_gid(dentry, gid); + + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); + goto repeat; + } + spin_unlock(&dentry->d_lock); + } + /* + * All done at this level ... ascend and resume the search. + */ + rcu_read_lock(); +ascend: + if (this_parent != parent) { + struct dentry *child = this_parent; + this_parent = child->d_parent; + + spin_unlock(&child->d_lock); + spin_lock(&this_parent->d_lock); + + /* go into the first sibling still alive */ + do { + next = child->d_child.next; + if (next == &this_parent->d_subdirs) + goto ascend; + child = list_entry(next, struct dentry, d_child); + } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); + rcu_read_unlock(); + goto resume; + } + rcu_read_unlock(); + spin_unlock(&this_parent->d_lock); + return; +} + static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) { substring_t args[MAX_OPT_ARGS]; @@ -217,7 +288,9 @@ static int tracefs_apply_options(struct super_block *sb) inode->i_mode |= opts->mode; inode->i_uid = opts->uid; - inode->i_gid = opts->gid; + + /* Set all the group ids to the mount option */ + set_gid(sb->s_root, opts->gid); return 0; } @@ -409,6 +482,8 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_mode = mode; inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; + inode->i_uid = d_inode(dentry->d_parent)->i_uid; + inode->i_gid = d_inode(dentry->d_parent)->i_gid; d_instantiate(dentry, inode); fsnotify_create(dentry->d_parent->d_inode, dentry); return end_creating(dentry); @@ -431,6 +506,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; inode->i_op = ops; inode->i_fop = &simple_dir_operations; + inode->i_uid = d_inode(dentry->d_parent)->i_uid; + inode->i_gid = d_inode(dentry->d_parent)->i_gid; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index eeb93f009b28..83a173feb698 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -361,15 +361,18 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); int err, instantiated = 0; struct fscrypt_name nm; /* - * Budget request settings: new dirty inode, new direntry, - * budget for dirtied inode will be released via writeback. + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + * Allocate budget separately for new dirtied inode, the budget will + * be released via writeback. */ dbg_gen("dent '%pd', mode %#hx in dir ino %lu", @@ -439,6 +442,8 @@ out_inode: make_bad_inode(inode); if (!instantiated) iput(inode); + else if (whiteout) + iput(*whiteout); out_budg: ubifs_release_budget(c, &req); if (!instantiated) @@ -955,7 +960,8 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct fscrypt_name nm; /* @@ -1330,6 +1336,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; + struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1351,6 +1358,23 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); + + memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); + wht_req.dirtied_ino = 1; + wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + /* + * To avoid deadlock between space budget (holds ui_mutex and + * waits wb work) and writeback work(waits ui_mutex), do space + * budget before ubifs inodes locked. + */ + err = ubifs_budget_space(c, &wht_req); + if (err) { + iput(whiteout); + goto out_release; + } + + /* Add the old_dentry size to the old_dir size. */ + old_sz -= CALC_DENT_SIZE(fname_len(&old_nm)); } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); @@ -1425,18 +1449,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, } if (whiteout) { - struct ubifs_budget_req wht_req = { .dirtied_ino = 1, - .dirtied_ino_d = \ - ALIGN(ubifs_inode(whiteout)->data_len, 8) }; - - err = ubifs_budget_space(c, &wht_req); - if (err) { - kfree(whiteout_ui->data); - whiteout_ui->data_len = 0; - iput(whiteout); - goto out_release; - } - inc_nlink(whiteout); mark_inode_dirty(whiteout); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index eae9cf5a57b0..89b671ad0f9a 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -846,16 +846,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->max_write_shift; + int m = n - 1; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); - err = ubifs_leb_write(c, wbuf->lnum, buf + written, - wbuf->offs, n); + + if (m) { + /* '(n-1)<<c->max_write_shift < len' is always true. */ + m <<= c->max_write_shift; + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, m); + if (err) + goto out; + wbuf->offs += m; + aligned_len -= m; + len -= m; + written += m; + } + + /* + * The non-written len of buf may be less than 'n' because + * parameter 'len' is not 8 bytes aligned, so here we read + * min(len, n) bytes from buf. + */ + n = 1 << c->max_write_shift; + memcpy(wbuf->buf, buf + written, min(len, n)); + if (n > len) { + ubifs_assert(c, n - len < 8); + ubifs_pad(c, wbuf->buf + len, n - len); + } + + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n); if (err) goto out; wbuf->offs += n; aligned_len -= n; - len -= n; + len -= min(len, n); written += n; } diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index eeb1be259888..2923d5a6a7c0 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -101,7 +101,7 @@ static int setflags(struct inode *inode, int flags) struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 2cbc3c36f3a8..b37c6b1e3325 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1835,7 +1835,6 @@ out: kthread_stop(c->bgt); c->bgt = NULL; } - free_wbufs(c); kfree(c->write_reserve_buf); c->write_reserve_buf = NULL; vfree(c->ileb_buf); diff --git a/fs/udf/dir.c b/fs/udf/dir.c index c19dba45aa20..d0f92a52e3ba 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -31,6 +31,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/bio.h> +#include <linux/iversion.h> #include "udf_i.h" #include "udf_sb.h" @@ -44,7 +45,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; udf_pblk_t block, iblock; - loff_t nf_pos; + loff_t nf_pos, emit_pos = 0; int flen; unsigned char *fname = NULL, *copy_name = NULL; unsigned char *nameptr; @@ -58,6 +59,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) int i, num, ret = 0; struct extent_position epos = { NULL, 0, {0, 0} }; struct super_block *sb = dir->i_sb; + bool pos_valid = false; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -68,6 +70,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) if (nf_pos >= size) goto out; + /* + * Something changed since last readdir (either lseek was called or dir + * changed)? We need to verify the position correctly points at the + * beginning of some dir entry so that the directory parsing code does + * not get confused. Since UDF does not have any reliable way of + * identifying beginning of dir entry (names are under user control), + * we need to scan the directory from the beginning. + */ + if (!inode_eq_iversion(dir, file->f_version)) { + emit_pos = nf_pos; + nf_pos = 0; + } else { + pos_valid = true; + } + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!fname) { ret = -ENOMEM; @@ -123,13 +140,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) while (nf_pos < size) { struct kernel_lb_addr tloc; + loff_t cur_pos = nf_pos; - ctx->pos = (nf_pos >> 2) + 1; + /* Update file position only if we got past the current one */ + if (nf_pos >= emit_pos) { + ctx->pos = (nf_pos >> 2) + 1; + pos_valid = true; + } fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset); if (!fi) goto out; + /* Still not at offset where user asked us to read from? */ + if (cur_pos < emit_pos) + continue; liu = le16_to_cpu(cfi.lengthOfImpUse); lfi = cfi.lengthFileIdent; @@ -187,8 +212,11 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } /* end while */ ctx->pos = (nf_pos >> 2) + 1; + pos_valid = true; out: + if (pos_valid) + file->f_version = inode_query_iversion(dir); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 507f8f910327..639aabf30eaf 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -258,10 +258,6 @@ int udf_expand_file_adinicb(struct inode *inode) char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); int err; - struct writeback_control udf_wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = 1, - }; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { @@ -305,8 +301,10 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + set_page_dirty(page); + unlock_page(page); up_write(&iinfo->i_data_sem); - err = inode->i_data.a_ops->writepage(page, &udf_wbc); + err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); @@ -318,6 +316,7 @@ int udf_expand_file_adinicb(struct inode *inode) unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; + iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } put_page(page); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 3c3d3b20889c..3c009562375d 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/crc-itu-t.h> #include <linux/exportfs.h> +#include <linux/iversion.h> static inline int udf_match(int len1, const unsigned char *name1, int len2, const unsigned char *name2) @@ -135,6 +136,8 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, mark_buffer_dirty_inode(fibh->ebh, inode); mark_buffer_dirty_inode(fibh->sbh, inode); } + inode_inc_iversion(inode); + return 0; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 5663bae95700..5193b94c0683 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -57,6 +57,7 @@ #include <linux/crc-itu-t.h> #include <linux/log2.h> #include <asm/byteorder.h> +#include <linux/iversion.h> #include "udf_sb.h" #include "udf_i.h" @@ -149,6 +150,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb) init_rwsem(&ei->i_data_sem); ei->cached_extent.lstart = -1; spin_lock_init(&ei->i_extent_cache_lock); + inode_set_iversion(&ei->vfs_inode, 1); return &ei->vfs_inode; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index b3021d9b34a5..7b7a009425e2 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -714,7 +714,8 @@ xfs_ioc_space( flags |= XFS_PREALLOC_CLEAR; if (bf->l_start > XFS_ISIZE(ip)) { error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), 0); + bf->l_start - XFS_ISIZE(ip), + XFS_BMAPI_PREALLOC); if (error) goto out_unlock; } |