summaryrefslogtreecommitdiff
path: root/fs/fuse/dir.c
diff options
context:
space:
mode:
authorMiklos Szeredi <mszeredi@suse.cz>2008-04-30 00:54:41 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2008-04-30 08:29:50 -0700
commit3be5a52b30aa5cf9d795b7634f728f612197b1c4 (patch)
tree5a78251a351e273cf2061a527a381c7ba256fc15 /fs/fuse/dir.c
parentb88473f73e6d7b6af9cfc4ecc349d82c75d9a6af (diff)
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/fuse/dir.c')
-rw-r--r--fs/fuse/dir.c84
1 files changed, 81 insertions, 3 deletions
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c4807b3fc8a3..48b9971ecd97 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
}
/*
+ * Prevent concurrent writepages on inode
+ *
+ * This is done by adding a negative bias to the inode write counter
+ * and waiting for all pending writes to finish.
+ */
+void fuse_set_nowrite(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ BUG_ON(!mutex_is_locked(&inode->i_mutex));
+
+ spin_lock(&fc->lock);
+ BUG_ON(fi->writectr < 0);
+ fi->writectr += FUSE_NOWRITE;
+ spin_unlock(&fc->lock);
+ wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
+}
+
+/*
+ * Allow writepages on inode
+ *
+ * Remove the bias from the writecounter and send any queued
+ * writepages.
+ */
+static void __fuse_release_nowrite(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ BUG_ON(fi->writectr != FUSE_NOWRITE);
+ fi->writectr = 0;
+ fuse_flush_writepages(inode);
+}
+
+void fuse_release_nowrite(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ spin_lock(&fc->lock);
+ __fuse_release_nowrite(inode);
+ spin_unlock(&fc->lock);
+}
+
+/*
* Set attributes, and at the same time refresh them.
*
* Truncation is slightly complicated, because the 'truncate' request
@@ -1122,6 +1166,8 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
struct fuse_req *req;
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
+ bool is_truncate = false;
+ loff_t oldsize;
int err;
if (!fuse_allow_task(fc, current))
@@ -1145,12 +1191,16 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
send_sig(SIGXFSZ, current, 0);
return -EFBIG;
}
+ is_truncate = true;
}
req = fuse_get_req(fc);
if (IS_ERR(req))
return PTR_ERR(req);
+ if (is_truncate)
+ fuse_set_nowrite(inode);
+
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
iattr_to_fattr(attr, &inarg);
@@ -1181,16 +1231,44 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
if (err) {
if (err == -EINTR)
fuse_invalidate_attr(inode);
- return err;
+ goto error;
}
if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
make_bad_inode(inode);
- return -EIO;
+ err = -EIO;
+ goto error;
+ }
+
+ spin_lock(&fc->lock);
+ fuse_change_attributes_common(inode, &outarg.attr,
+ attr_timeout(&outarg));
+ oldsize = inode->i_size;
+ i_size_write(inode, outarg.attr.size);
+
+ if (is_truncate) {
+ /* NOTE: this may release/reacquire fc->lock */
+ __fuse_release_nowrite(inode);
+ }
+ spin_unlock(&fc->lock);
+
+ /*
+ * Only call invalidate_inode_pages2() after removing
+ * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
+ */
+ if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+ if (outarg.attr.size < oldsize)
+ fuse_truncate(inode->i_mapping, outarg.attr.size);
+ invalidate_inode_pages2(inode->i_mapping);
}
- fuse_change_attributes(inode, &outarg.attr, attr_timeout(&outarg), 0);
return 0;
+
+error:
+ if (is_truncate)
+ fuse_release_nowrite(inode);
+
+ return err;
}
static int fuse_setattr(struct dentry *entry, struct iattr *attr)