summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorMarcel Ziswiler <marcel.ziswiler@toradex.com>2019-12-18 22:52:20 +0100
committerMarcel Ziswiler <marcel.ziswiler@toradex.com>2019-12-18 22:52:20 +0100
commit1ddf624b0b268fdc0b80b1de618b98f8d117afea (patch)
tree3d3218332bcb34cb0afa01d6ad996058a3dbcb77 /fs
parent6b774eec1f9d3064e9b33634dfa99d5666d0a73a (diff)
parentbfb9e5c03076a446b1f4f6a523ddc8d723c907a6 (diff)
Merge tag 'v4.14.159' into 4.14-2.0.x-imx
This is the 4.14.159 stable release Conflicts: arch/arm/Kconfig.debug arch/arm/boot/dts/imx7s.dtsi arch/arm/mach-imx/cpuidle-imx6sx.c drivers/crypto/caam/caamalg.c drivers/crypto/mxs-dcp.c drivers/dma/imx-sdma.c drivers/input/keyboard/imx_keypad.c drivers/net/can/flexcan.c drivers/net/can/rx-offload.c drivers/net/wireless/ath/ath10k/pci.c drivers/pci/dwc/pci-imx6.c drivers/spi/spi-fsl-lpspi.c drivers/usb/dwc3/gadget.c
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c2
-rw-r--r--fs/9p/vfs_addr.c6
-rw-r--r--fs/9p/vfs_file.c3
-rw-r--r--fs/adfs/super.c5
-rw-r--r--fs/autofs4/expire.c5
-rw-r--r--fs/binfmt_elf.c12
-rw-r--r--fs/binfmt_flat.c23
-rw-r--r--fs/binfmt_script.c57
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/compression.c31
-rw-r--r--fs/btrfs/compression.h3
-rw-r--r--fs/btrfs/ctree.c9
-rw-r--r--fs/btrfs/delayed-inode.c13
-rw-r--r--fs/btrfs/delayed-ref.c3
-rw-r--r--fs/btrfs/dev-replace.c29
-rw-r--r--fs/btrfs/extent-tree.c9
-rw-r--r--fs/btrfs/file.c7
-rw-r--r--fs/btrfs/free-space-cache.c6
-rw-r--r--fs/btrfs/inode.c27
-rw-r--r--fs/btrfs/props.c6
-rw-r--r--fs/btrfs/qgroup.c45
-rw-r--r--fs/btrfs/reada.c5
-rw-r--r--fs/btrfs/send.c102
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/transaction.c10
-rw-r--r--fs/btrfs/tree-log.c84
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/btrfs/volumes.h6
-rw-r--r--fs/ceph/caps.c22
-rw-r--r--fs/ceph/inode.c16
-rw-r--r--fs/ceph/locks.c3
-rw-r--r--fs/ceph/mds_client.c4
-rw-r--r--fs/ceph/snap.c4
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/ceph/super.h9
-rw-r--r--fs/ceph/xattr.c33
-rw-r--r--fs/cifs/cifsglob.h5
-rw-r--r--fs/cifs/cifsproto.h1
-rw-r--r--fs/cifs/connect.c30
-rw-r--r--fs/cifs/dir.c8
-rw-r--r--fs/cifs/file.c34
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/netmisc.c4
-rw-r--r--fs/cifs/smb1ops.c3
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2misc.c7
-rw-r--r--fs/cifs/smb2ops.c15
-rw-r--r--fs/cifs/smb2pdu.c7
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/coda/file.c70
-rw-r--r--fs/coda/psdev.c5
-rw-r--r--fs/compat_ioctl.c13
-rw-r--r--fs/configfs/configfs_internal.h15
-rw-r--r--fs/configfs/dir.c151
-rw-r--r--fs/configfs/file.c294
-rw-r--r--fs/configfs/symlink.c33
-rw-r--r--fs/crypto/crypto.c15
-rw-r--r--fs/crypto/policy.c2
-rw-r--r--fs/dlm/lockspace.c1
-rw-r--r--fs/dlm/member.c7
-rw-r--r--fs/dlm/memory.c9
-rw-r--r--fs/dlm/user.c5
-rw-r--r--fs/ecryptfs/crypto.c12
-rw-r--r--fs/ecryptfs/inode.c19
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exofs/super.c37
-rw-r--r--fs/exportfs/expfs.c31
-rw-r--r--fs/ext2/inode.c7
-rw-r--r--fs/ext4/dir.c19
-rw-r--r--fs/ext4/ext4_jbd2.h12
-rw-r--r--fs/ext4/extents.c4
-rw-r--r--fs/ext4/file.c4
-rw-r--r--fs/ext4/inode.c73
-rw-r--r--fs/ext4/ioctl.c46
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c58
-rw-r--r--fs/ext4/super.c21
-rw-r--r--fs/f2fs/checkpoint.c8
-rw-r--r--fs/f2fs/data.c16
-rw-r--r--fs/f2fs/dir.c1
-rw-r--r--fs/f2fs/f2fs.h4
-rw-r--r--fs/f2fs/file.c4
-rw-r--r--fs/f2fs/gc.c12
-rw-r--r--fs/f2fs/inline.c4
-rw-r--r--fs/f2fs/inode.c4
-rw-r--r--fs/f2fs/node.c4
-rw-r--r--fs/f2fs/recovery.c19
-rw-r--r--fs/f2fs/segment.c48
-rw-r--r--fs/f2fs/segment.h4
-rw-r--r--fs/f2fs/super.c52
-rw-r--r--fs/fat/dir.c13
-rw-r--r--fs/fat/fatent.c3
-rw-r--r--fs/fs-writeback.c17
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/dir.c40
-rw-r--r--fs/fuse/file.c11
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/log.c8
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/lops.c5
-rw-r--r--fs/gfs2/rgrp.c15
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/trans.c2
-rw-r--r--fs/hfs/brec.c1
-rw-r--r--fs/hfs/btree.c41
-rw-r--r--fs/hfs/btree.h1
-rw-r--r--fs/hfs/catalog.c16
-rw-r--r--fs/hfs/extent.c10
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/attributes.c10
-rw-r--r--fs/hfsplus/brec.c1
-rw-r--r--fs/hfsplus/btree.c44
-rw-r--r--fs/hfsplus/catalog.c24
-rw-r--r--fs/hfsplus/extents.c8
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c1
-rw-r--r--fs/inode.c9
-rw-r--r--fs/iomap.c18
-rw-r--r--fs/jbd2/commit.c23
-rw-r--r--fs/jbd2/journal.c4
-rw-r--r--fs/jbd2/transaction.c49
-rw-r--r--fs/kernfs/dir.c5
-rw-r--r--fs/kernfs/symlink.c5
-rw-r--r--fs/libfs.c134
-rw-r--r--fs/lockd/clnt4xdr.c22
-rw-r--r--fs/lockd/clntxdr.c22
-rw-r--r--fs/nfs/client.c4
-rw-r--r--fs/nfs/delegation.c16
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c297
-rw-r--r--fs/nfs/direct.c94
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c4
-rw-r--r--fs/nfs/inode.c1
-rw-r--r--fs/nfs/nfs4_fs.h3
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c14
-rw-r--r--fs/nfs/nfs4proc.c76
-rw-r--r--fs/nfs/nfs4state.c27
-rw-r--r--fs/nfs/nfs4xdr.c2
-rw-r--r--fs/nfs/pagelist.c34
-rw-r--r--fs/nfs/pnfs.c9
-rw-r--r--fs/nfs/proc.c7
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/write.c16
-rw-r--r--fs/nfsd/nfs4recover.c17
-rw-r--r--fs/nfsd/nfs4state.c11
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/vfs.c17
-rw-r--r--fs/ocfs2/aops.c25
-rw-r--r--fs/ocfs2/buffer_head_io.c77
-rw-r--r--fs/ocfs2/dcache.c12
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c23
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/journal.c9
-rw-r--r--fs/ocfs2/localalloc.c3
-rw-r--r--fs/ocfs2/move_extents.c17
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/stackglue.c6
-rw-r--r--fs/ocfs2/stackglue.h3
-rw-r--r--fs/ocfs2/xattr.c3
-rw-r--r--fs/open.c19
-rw-r--r--fs/orangefs/orangefs-sysfs.c2
-rw-r--r--fs/overlayfs/dir.c2
-rw-r--r--fs/overlayfs/inode.c3
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/page.c28
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/vmcore.c10
-rw-r--r--fs/pstore/ram.c2
-rw-r--r--fs/quota/dquot.c15
-rw-r--r--fs/read_write.c82
-rw-r--r--fs/reiserfs/inode.c12
-rw-r--r--fs/reiserfs/namei.c7
-rw-r--r--fs/reiserfs/reiserfs.h2
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c19
-rw-r--r--fs/reiserfs/xattr_acl.c4
-rw-r--r--fs/statfs.c17
-rw-r--r--fs/ubifs/tnc.c16
-rw-r--r--fs/udf/inode.c93
-rw-r--r--fs/userfaultfd.c25
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c29
-rw-r--r--fs/xfs/xfs_buf.c40
-rw-r--r--fs/xfs/xfs_ioctl32.c40
-rw-r--r--fs/xfs/xfs_iops.c1
-rw-r--r--fs/xfs/xfs_rtalloc.c4
-rw-r--r--fs/xfs/xfs_super.c10
192 files changed, 2604 insertions, 1207 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 082d227fa56b..6261719f6f2a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
switch (handler->flags) {
case ACL_TYPE_ACCESS:
if (acl) {
- struct iattr iattr;
+ struct iattr iattr = { 0 };
struct posix_acl *old_acl = acl;
retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index e1cbdfdb7c68..197069303510 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -50,8 +50,9 @@
* @page: structure to page
*
*/
-static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
+static int v9fs_fid_readpage(void *data, struct page *page)
{
+ struct p9_fid *fid = data;
struct inode *inode = page->mapping->host;
struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE};
struct iov_iter to;
@@ -122,7 +123,8 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
return ret;
- ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+ ret = read_cache_pages(mapping, pages, v9fs_fid_readpage,
+ filp->private_data);
p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
return ret;
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 89e69904976a..2651192f0166 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -528,6 +528,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
v9inode = V9FS_I(inode);
mutex_lock(&v9inode->v_mutex);
if (!v9inode->writeback_fid &&
+ (vma->vm_flags & VM_SHARED) &&
(vma->vm_flags & VM_WRITE)) {
/*
* clone a fid and add it to writeback_fid
@@ -629,6 +630,8 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
(vma->vm_end - vma->vm_start - 1),
};
+ if (!(vma->vm_flags & VM_SHARED))
+ return;
p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index c9fdfb112933..e42c30001509 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -368,6 +368,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
struct buffer_head *bh;
struct object_info root_obj;
unsigned char *b_data;
+ unsigned int blocksize;
struct adfs_sb_info *asb;
struct inode *root;
int ret = -EINVAL;
@@ -419,8 +420,10 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
goto error_free_bh;
}
+ blocksize = 1 << dr->log2secsize;
brelse(bh);
- if (sb_set_blocksize(sb, 1 << dr->log2secsize)) {
+
+ if (sb_set_blocksize(sb, blocksize)) {
bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize);
if (!bh) {
adfs_error(sb, "couldn't read superblock on "
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 141f9bc213a3..94a0017c923b 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -472,9 +472,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
*/
flags &= ~AUTOFS_EXP_LEAVES;
found = should_expire(expired, mnt, timeout, how);
- if (!found || found != expired)
- /* Something has changed, continue */
+ if (found != expired) { // something has changed, continue
+ dput(found);
goto next;
+ }
if (expired != dentry)
dput(dentry);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 469666df91da..166846a40078 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1116,6 +1116,18 @@ static int load_elf_binary(struct linux_binprm *bprm)
current->mm->start_stack = bprm->p;
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+ /*
+ * For architectures with ELF randomization, when executing
+ * a loader directly (i.e. no interpreter listed in ELF
+ * headers), move the brk area out of the mmap region
+ * (since it grows up, and may collide early with the stack
+ * growing down), and into the unused ELF_ET_DYN_BASE region.
+ */
+ if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
+ loc->elf_ex.e_type == ET_DYN && !interpreter)
+ current->mm->brk = current->mm->start_brk =
+ ELF_ET_DYN_BASE;
+
current->mm->brk = current->mm->start_brk =
arch_randomize_brk(current->mm);
#ifdef compat_brk_randomized
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5d6b94475f27..78b5bac82559 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -856,9 +856,14 @@ err:
static int load_flat_shared_library(int id, struct lib_info *libs)
{
+ /*
+ * This is a fake bprm struct; only the members "buf", "file" and
+ * "filename" are actually used.
+ */
struct linux_binprm bprm;
int res;
char buf[16];
+ loff_t pos = 0;
memset(&bprm, 0, sizeof(bprm));
@@ -872,25 +877,11 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
if (IS_ERR(bprm.file))
return res;
- bprm.cred = prepare_exec_creds();
- res = -ENOMEM;
- if (!bprm.cred)
- goto out;
-
- /* We don't really care about recalculating credentials at this point
- * as we're past the point of no return and are dealing with shared
- * libraries.
- */
- bprm.called_set_creds = 1;
+ res = kernel_read(bprm.file, bprm.buf, BINPRM_BUF_SIZE, &pos);
- res = prepare_binprm(&bprm);
-
- if (!res)
+ if (res >= 0)
res = load_flat_file(&bprm, libs, id, NULL);
- abort_creds(bprm.cred);
-
-out:
allow_write_access(bprm.file);
fput(bprm.file);
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 7cde3f46ad26..e996174cbfc0 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,13 +14,30 @@
#include <linux/err.h>
#include <linux/fs.h>
+static inline bool spacetab(char c) { return c == ' ' || c == '\t'; }
+static inline char *next_non_spacetab(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (!spacetab(*first))
+ return first;
+ return NULL;
+}
+static inline char *next_terminator(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (spacetab(*first) || !*first)
+ return first;
+ return NULL;
+}
+
static int load_script(struct linux_binprm *bprm)
{
const char *i_arg, *i_name;
- char *cp;
+ char *cp, *buf_end;
struct file *file;
int retval;
+ /* Not ours to exec if we don't start with "#!". */
if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
@@ -33,18 +50,40 @@ static int load_script(struct linux_binprm *bprm)
if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
return -ENOENT;
- /*
- * This section does the #! interpretation.
- * Sorta complicated, but hopefully it will work. -TYT
- */
-
+ /* Release since we are not mapping a binary into memory. */
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
- bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
- if ((cp = strchr(bprm->buf, '\n')) == NULL)
- cp = bprm->buf+BINPRM_BUF_SIZE-1;
+ /*
+ * This section handles parsing the #! line into separate
+ * interpreter path and argument strings. We must be careful
+ * because bprm->buf is not yet guaranteed to be NUL-terminated
+ * (though the buffer will have trailing NUL padding when the
+ * file size was smaller than the buffer size).
+ *
+ * We do not want to exec a truncated interpreter path, so either
+ * we find a newline (which indicates nothing is truncated), or
+ * we find a space/tab/NUL after the interpreter path (which
+ * itself may be preceded by spaces/tabs). Truncating the
+ * arguments is fine: the interpreter can re-read the script to
+ * parse them on its own.
+ */
+ buf_end = bprm->buf + sizeof(bprm->buf) - 1;
+ cp = strnchr(bprm->buf, sizeof(bprm->buf), '\n');
+ if (!cp) {
+ cp = next_non_spacetab(bprm->buf + 2, buf_end);
+ if (!cp)
+ return -ENOEXEC; /* Entire buf is spaces/tabs */
+ /*
+ * If there is no later space/tab/NUL we must assume the
+ * interpreter path is truncated.
+ */
+ if (!next_terminator(cp, buf_end))
+ return -ENOEXEC;
+ cp = buf_end;
+ }
+ /* NUL-terminate the buffer and any trailing spaces/tabs. */
*cp = '\0';
while (cp > bprm->buf) {
cp--;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d826fbaf7d50..e4d5e6eae409 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1290,8 +1290,6 @@ again:
ret = -EIO;
goto out;
}
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
if (!path->skip_locking) {
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 280384bf34f1..ccd9c709375e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -43,6 +43,37 @@
#include "extent_io.h"
#include "extent_map.h"
+static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
+
+const char* btrfs_compress_type2str(enum btrfs_compression_type type)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ case BTRFS_COMPRESS_LZO:
+ case BTRFS_COMPRESS_ZSTD:
+ case BTRFS_COMPRESS_NONE:
+ return btrfs_compress_types[type];
+ }
+
+ return NULL;
+}
+
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+ int i;
+
+ for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+ size_t comp_len = strlen(btrfs_compress_types[i]);
+
+ if (len < comp_len)
+ continue;
+
+ if (!strncmp(btrfs_compress_types[i], str, comp_len))
+ return true;
+ }
+ return false;
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d2781ff8f994..0b185e277df4 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -130,6 +130,9 @@ extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
+const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
+
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d1b9900ebc9b..d2263caff307 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1414,6 +1414,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_elem *tm;
struct extent_buffer *eb = NULL;
struct extent_buffer *eb_root;
+ u64 eb_root_owner = 0;
struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
@@ -1448,6 +1449,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(old);
}
} else if (old_root) {
+ eb_root_owner = btrfs_header_owner(eb_root);
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
@@ -1465,7 +1467,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+ btrfs_set_header_owner(eb, eb_root_owner);
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
@@ -2986,6 +2988,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = get_old_root(root, time_seq);
+ if (!b) {
+ ret = -EIO;
+ goto done;
+ }
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
@@ -5492,6 +5498,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
advance_left = advance_right = 0;
while (1) {
+ cond_resched();
if (advance_left && !left_end_reached) {
ret = tree_advance(fs_info, left_path, &left_level,
left_root_level,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 04f39111fafb..87414fc9e268 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1975,12 +1975,19 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
inode_id = delayed_nodes[n - 1]->inode_id + 1;
-
- for (i = 0; i < n; i++)
- refcount_inc(&delayed_nodes[i]->refs);
+ for (i = 0; i < n; i++) {
+ /*
+ * Don't increase refs in case the node is dead and
+ * about to be removed from the tree in the loop below
+ */
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
+ }
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 93ffa898df6d..d56bd3625468 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -195,8 +195,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
- if (trans->delayed_ref_updates)
- trans->delayed_ref_updates--;
}
static bool merge_ref(struct btrfs_trans_handle *trans,
@@ -458,7 +456,6 @@ add_tail:
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
- trans->delayed_ref_updates++;
spin_unlock(&href->lock);
return ret;
}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f86457713e60..f1e9dd246ab0 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -512,18 +512,27 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+ mutex_lock(&uuid_mutex);
+ /* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (src_device->has_pending_chunks) {
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
+ } else {
+ break;
+ }
}
- ret = btrfs_commit_transaction(trans);
- WARN_ON(ret);
- mutex_lock(&uuid_mutex);
- /* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace, 1);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49766721b2b1..fd15f396b3a0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7706,6 +7706,14 @@ search:
*/
if ((flags & extra) && !(block_group->flags & extra))
goto loop;
+
+ /*
+ * This block group has different flags than we want.
+ * It's possible that we have MIXED_GROUP flag but no
+ * block group is mixed. Just skip such block group.
+ */
+ btrfs_release_block_group(block_group, delalloc);
+ continue;
}
have_block_group:
@@ -10247,6 +10255,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
cache->key.objectid);
+ btrfs_put_block_group(cache);
ret = -EINVAL;
goto error;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97958ecaeed9..bf654d48eb46 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1625,6 +1625,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
+ only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
@@ -1778,7 +1779,6 @@ again:
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
NULL, GFP_NOFS);
- only_release_metadata = false;
}
btrfs_drop_pages(pages, num_pages);
@@ -2784,6 +2784,11 @@ out_only_mutex:
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
+ struct timespec now = current_time(inode);
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = now;
+ inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9f31b81a5e27..abeb26d48d0a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -398,6 +398,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "free space cache page truncated");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
if (!PageUptodate(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ea7b65c025c2..739f45b04b52 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -405,10 +405,31 @@ static noinline int add_async_extent(struct async_cow *cow,
return 0;
}
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool inode_can_compress(struct inode *inode)
+{
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
+ BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
+
+/*
+ * Check if the inode needs to be submitted to compression, based on mount
+ * options, defragmentation, properties or heuristics.
+ */
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ if (!inode_can_compress(inode)) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
+ btrfs_ino(BTRFS_I(inode)));
+ return 0;
+ }
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
@@ -1626,7 +1647,8 @@ static int run_delalloc_range(void *private_data, struct page *locked_page,
} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- } else if (!inode_need_compress(inode, start, end)) {
+ } else if (!inode_can_compress(inode) ||
+ !inode_need_compress(inode, start, end)) {
ret = cow_file_range(inode, locked_page, start, end, end,
page_started, nr_written, 1, NULL);
} else {
@@ -9817,6 +9839,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
/*
* We need to find a free sequence number both in the source and
* in the destination directory for the exchange.
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 266f9069307b..b9c7d8508e35 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -386,11 +386,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
static int prop_compression_validate(const char *value, size_t len)
{
- if (!strncmp("lzo", value, 3))
- return 0;
- else if (!strncmp("zlib", value, 4))
- return 0;
- else if (!strncmp("zstd", value, 4))
+ if (btrfs_compress_is_valid_type(value, len))
return 0;
return -EINVAL;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d6d6e9593e39..cb6e8cb0de94 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -722,10 +722,10 @@ out:
return ret;
}
-static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_root *root)
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *l;
@@ -741,7 +741,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -2110,7 +2110,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
spin_unlock(&fs_info->qgroup_lock);
- ret = update_qgroup_status_item(trans, fs_info, quota_root);
+ ret = update_qgroup_status_item(trans);
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2645,9 +2645,6 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!btrfs_fs_closing(fs_info))
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
-
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2663,16 +2660,30 @@ out:
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d",
err);
- goto done;
}
- ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
- if (ret < 0) {
- err = ret;
- btrfs_err(fs_info, "fail to update qgroup status: %d", err);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (trans) {
+ ret = update_qgroup_status_item(trans);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "fail to update qgroup status: %d",
+ err);
+ }
}
+ fs_info->qgroup_rescan_running = false;
+ complete_all(&fs_info->qgroup_rescan_completion);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (!trans)
+ return;
+
btrfs_end_transaction(trans);
if (btrfs_fs_closing(fs_info)) {
@@ -2683,12 +2694,6 @@ out:
} else {
btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
-
-done:
- mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = false;
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- complete_all(&fs_info->qgroup_rescan_completion);
}
/*
@@ -2951,7 +2956,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
free_start, free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ab852b8e3e37..3a4e15b39cc1 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -759,6 +759,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
u64 total = 0;
int i;
+again:
do {
enqueued = 0;
mutex_lock(&fs_devices->device_list_mutex);
@@ -770,6 +771,10 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
+ if (fs_devices->seed) {
+ fs_devices = fs_devices->seed;
+ goto again;
+ }
if (enqueued == 0)
return;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 14c4062a6e58..1211fdcd425d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -37,6 +37,14 @@
#include "compression.h"
/*
+ * Maximum number of references an extent can have in order for us to attempt to
+ * issue clone operations instead of write operations. This currently exists to
+ * avoid hitting limitations of the backreference walking code (taking a lot of
+ * time and using too much memory for extents with large number of references).
+ */
+#define SEND_MAX_EXTENT_REFS 64
+
+/*
* A fs_path is a helper to dynamically build path names with unknown size.
* It reallocates the internal buffer on demand.
* It allows fast adding of path elements on the right side (normal path) and
@@ -1324,6 +1332,7 @@ static int find_extent_clone(struct send_ctx *sctx,
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
+ struct btrfs_extent_item *ei;
int compressed;
u32 i;
@@ -1373,7 +1382,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
&found_key, &flags);
up_read(&fs_info->commit_root_sem);
- btrfs_release_path(tmp_path);
if (ret < 0)
goto out;
@@ -1382,6 +1390,21 @@ static int find_extent_clone(struct send_ctx *sctx,
goto out;
}
+ ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
+ struct btrfs_extent_item);
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fallback to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
+ ret = -ENOENT;
+ goto out;
+ }
+ btrfs_release_path(tmp_path);
+
/*
* Setup the clone roots.
*/
@@ -6130,68 +6153,21 @@ static int changed_extent(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
-
- if (result == BTRFS_COMPARE_TREE_CHANGED) {
- struct extent_buffer *leaf_l;
- struct extent_buffer *leaf_r;
- struct btrfs_file_extent_item *ei_l;
- struct btrfs_file_extent_item *ei_r;
-
- leaf_l = sctx->left_path->nodes[0];
- leaf_r = sctx->right_path->nodes[0];
- ei_l = btrfs_item_ptr(leaf_l,
- sctx->left_path->slots[0],
- struct btrfs_file_extent_item);
- ei_r = btrfs_item_ptr(leaf_r,
- sctx->right_path->slots[0],
- struct btrfs_file_extent_item);
-
- /*
- * We may have found an extent item that has changed
- * only its disk_bytenr field and the corresponding
- * inode item was not updated. This case happens due to
- * very specific timings during relocation when a leaf
- * that contains file extent items is COWed while
- * relocation is ongoing and its in the stage where it
- * updates data pointers. So when this happens we can
- * safely ignore it since we know it's the same extent,
- * but just at different logical and physical locations
- * (when an extent is fully replaced with a new one, we
- * know the generation number must have changed too,
- * since snapshot creation implies committing the current
- * transaction, and the inode item must have been updated
- * as well).
- * This replacement of the disk_bytenr happens at
- * relocation.c:replace_file_extents() through
- * relocation.c:btrfs_reloc_cow_block().
- */
- if (btrfs_file_extent_generation(leaf_l, ei_l) ==
- btrfs_file_extent_generation(leaf_r, ei_r) &&
- btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
- btrfs_file_extent_compression(leaf_l, ei_l) ==
- btrfs_file_extent_compression(leaf_r, ei_r) &&
- btrfs_file_extent_encryption(leaf_l, ei_l) ==
- btrfs_file_extent_encryption(leaf_r, ei_r) &&
- btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
- btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
- btrfs_file_extent_type(leaf_l, ei_l) ==
- btrfs_file_extent_type(leaf_r, ei_r) &&
- btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
- btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
- btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
- btrfs_file_extent_offset(leaf_l, ei_l) ==
- btrfs_file_extent_offset(leaf_r, ei_r) &&
- btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_num_bytes(leaf_r, ei_r))
- return 0;
- }
-
- inconsistent_snapshot_error(sctx, result, "extent");
- return -EIO;
- }
+ /*
+ * We have found an extent item that changed without the inode item
+ * having changed. This can happen either after relocation (where the
+ * disk_bytenr of an extent item is replaced at
+ * relocation.c:replace_file_extents()) or after deduplication into a
+ * file in both the parent and send snapshots (where an extent item can
+ * get modified or replaced with a new one). Note that deduplication
+ * updates the inode item, but it only changes the iversion (sequence
+ * field in the inode item) of the inode, so if a file is deduplicated
+ * the same amount of times in both the parent and send snapshots, its
+ * iversion becames the same in both snapshots, whence the inode item is
+ * the same on both snapshots.
+ */
+ if (sctx->cur_ino != sctx->cmp_key->objectid)
+ return 0;
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result != BTRFS_COMPARE_TREE_DELETED)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 49a02bf091ae..204d585e012a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1863,7 +1863,7 @@ restore:
}
/* Used to sort the devices by max_avail(descending sort) */
-static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
const void *dev_info2)
{
if (((struct btrfs_device_info *)dev_info1)->max_avail >
@@ -1892,8 +1892,8 @@ static inline void btrfs_descending_sort_devices(
* The helper to calc the free space on the devices that can be used to store
* file data.
*/
-static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
- u64 *free_bytes)
+static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
+ u64 *free_bytes)
{
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 73c1fbca0c35..fa8f56e6f665 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2052,6 +2052,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
} else {
spin_unlock(&fs_info->trans_lock);
+ /*
+ * The previous transaction was aborted and was already removed
+ * from the list of transactions at fs_info->trans_list. So we
+ * abort to prevent writing a new superblock that reflects a
+ * corrupt state (pointing to trees with unwritten nodes/leafs).
+ */
+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ ret = -EROFS;
+ goto cleanup_transaction;
+ }
}
extwriter_counter_dec(cur_trans, trans->type);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8ac6a64d0422..e35301e5fe8e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2729,7 +2729,8 @@ out:
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
@@ -2737,10 +2738,10 @@ static int update_log_root(struct btrfs_trans_handle *trans,
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
}
return ret;
}
@@ -2836,6 +2837,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -2901,18 +2903,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
- * Update or create log root item under the root's log_mutex to prevent
- * races with concurrent log syncs that can lead to failure to update
- * log root item because it was not created yet.
- */
- ret = update_log_root(trans, log);
- /*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
@@ -2932,6 +2942,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
mutex_lock(&log_root_tree->log_mutex);
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/*
* Implicit memory barrier after atomic_dec_and_test
@@ -3153,6 +3171,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}
/*
+ * Check if an inode was logged in the current transaction. We can't always rely
+ * on an inode's logged_trans value, because it's an in-memory only field and
+ * therefore not persisted. This means that its value is lost if the inode gets
+ * evicted and loaded again from disk (in which case it has a value of 0, and
+ * certainly it is smaller then any possible transaction ID), when that happens
+ * the full_sync flag is set in the inode's runtime flags, so on that case we
+ * assume eviction happened and ignore the logged_trans value, assuming the
+ * worst case, that the inode was logged before in the current transaction.
+ */
+static bool inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ if (inode->logged_trans == trans->transid)
+ return true;
+
+ if (inode->last_trans == trans->transid &&
+ test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
+ return true;
+
+ return false;
+}
+
+/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
*
@@ -3186,7 +3228,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
- if (dir->logged_trans < trans->transid)
+ if (!inode_logged(trans, dir))
return 0;
ret = join_running_log_trans(root);
@@ -3291,7 +3333,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
u64 index;
int ret;
- if (inode->logged_trans < trans->transid)
+ if (!inode_logged(trans, inode))
return 0;
ret = join_running_log_trans(root);
@@ -5098,7 +5140,7 @@ again:
BTRFS_I(other_inode),
LOG_OTHER_INODE, 0, LLONG_MAX,
ctx);
- iput(other_inode);
+ btrfs_add_delayed_iput(other_inode);
if (err)
goto out_unlock;
else
@@ -5266,9 +5308,19 @@ log_extents:
}
}
+ /*
+ * Don't update last_log_commit if we logged that an inode exists after
+ * it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode, then
+ * the inode gets evicted after all delalloc was flushed, then we log
+ * it exists (due to a rename for example) and then fsync it. This last
+ * fsync would do nothing (not logging the extents previously written).
+ */
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
- inode->last_log_commit = inode->last_sub_trans;
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
if (unlikely(err))
@@ -5505,7 +5557,7 @@ process_leaf:
}
if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
break;
}
@@ -5517,7 +5569,7 @@ process_leaf:
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
if (ctx->log_new_dentries) {
@@ -5664,7 +5716,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (!ret && ctx && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
- iput(dir_inode);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 38ed8e259e00..358e930df4ac 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4851,6 +4851,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
for (i = 0; i < map->num_stripes; i++) {
num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+ map->stripes[i].dev->has_pending_chunks = true;
}
atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
@@ -5018,8 +5019,7 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_DUP)) {
+ BTRFS_BLOCK_GROUP_RAID5)) {
max_errors = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
max_errors = 2;
@@ -7310,6 +7310,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
for (i = 0; i < map->num_stripes; i++) {
dev = map->stripes[i].dev;
dev->commit_bytes_used = dev->bytes_used;
+ dev->has_pending_chunks = false;
}
}
mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 76fb6e84f201..07b805d08e55 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -61,6 +61,11 @@ struct btrfs_device {
spinlock_t io_lock ____cacheline_aligned;
int running_pending;
+ /* When true means this device has pending chunk alloc in
+ * current transaction. Protected by chunk_mutex.
+ */
+ bool has_pending_chunks;
+
/* regular prio bios */
struct btrfs_pending_bios pending_bios;
/* sync bios */
@@ -312,7 +317,6 @@ struct btrfs_bio {
u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
- unsigned long flags;
void *private;
atomic_t error;
int max_errors;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 92eb9c3052ee..c3a3ee74e2d8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -935,6 +935,11 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ /* remove from inode's cap rbtree, and clear auth cap */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
/* remove from session list */
spin_lock(&session->s_cap_lock);
if (session->s_cap_iterator == cap) {
@@ -970,11 +975,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
spin_unlock(&session->s_cap_lock);
- /* remove from inode list */
- rb_erase(&cap->ci_node, &ci->i_caps);
- if (ci->i_auth_cap == cap)
- ci->i_auth_cap = NULL;
-
if (removed)
ceph_put_cap(mdsc, cap);
@@ -1119,20 +1119,23 @@ static int send_cap_msg(struct cap_msg_args *arg)
}
/*
- * Queue cap releases when an inode is dropped from our cache. Since
- * inode is about to be destroyed, there is no need for i_ceph_lock.
+ * Queue cap releases when an inode is dropped from our cache.
*/
void ceph_queue_caps_release(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct rb_node *p;
+ /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
+ * may call __ceph_caps_issued_mask() on a freeing inode. */
+ spin_lock(&ci->i_ceph_lock);
p = rb_first(&ci->i_caps);
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
p = rb_next(p);
__ceph_remove_cap(cap, true);
}
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -1159,6 +1162,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->vfs_inode;
+ struct ceph_buffer *old_blob = NULL;
struct cap_msg_args arg;
int held, revoking, dropping;
int wake = 0;
@@ -1224,7 +1228,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ci->i_requested_max_size = arg.max_size;
if (flushing & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
+ old_blob = __ceph_build_xattrs_blob(ci);
arg.xattr_version = ci->i_xattrs.version;
arg.xattr_buf = ci->i_xattrs.blob;
} else {
@@ -1259,6 +1263,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
+
ret = send_cap_msg(&arg);
if (ret < 0) {
dout("error sending cap msg, must requeue %p\n", inode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f2b722f0df5d..5999d806de78 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -730,6 +730,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_buffer *old_blob = NULL;
struct ceph_string *pool_ns = NULL;
struct ceph_cap *new_cap = NULL;
int err = 0;
@@ -788,7 +789,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ci->i_version = le64_to_cpu(info->version);
inode->i_version++;
inode->i_rdev = le32_to_cpu(info->rdev);
- inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ /* directories have fl_stripe_unit set to zero */
+ if (le32_to_cpu(info->layout.fl_stripe_unit))
+ inode->i_blkbits =
+ fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ else
+ inode->i_blkbits = CEPH_BLOCK_SHIFT;
if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
(issued & CEPH_CAP_AUTH_EXCL) == 0) {
@@ -847,7 +853,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
+ old_blob = ci->i_xattrs.blob;
ci->i_xattrs.blob = xattr_blob;
if (xattr_blob)
memcpy(ci->i_xattrs.blob->vec.iov_base,
@@ -993,8 +999,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
out:
if (new_cap)
ceph_put_cap(mdsc, new_cap);
- if (xattr_blob)
- ceph_buffer_put(xattr_blob);
+ ceph_buffer_put(old_blob);
+ ceph_buffer_put(xattr_blob);
ceph_put_string(pool_ns);
return err;
}
@@ -1341,6 +1347,7 @@ retry_lookup:
dout(" final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct dentry *dn = req->r_dentry;
struct inode *dir = req->r_parent;
@@ -1624,7 +1631,6 @@ retry_lookup:
if (IS_ERR(realdn)) {
err = PTR_ERR(realdn);
d_drop(dn);
- dn = NULL;
goto next_item;
}
dn = realdn;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index e7cce412f2cf..cb647c8c7b68 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -78,8 +78,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
req->r_wait_for_completion = ceph_lock_wait_for_completion;
err = ceph_mdsc_do_request(mdsc, inode, req);
-
- if (operation == CEPH_MDS_OP_GETFILELOCK) {
+ if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
fl->fl_type = F_RDLCK;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e1ded4bd6115..b968334f841e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3543,7 +3543,9 @@ static void delayed_work(struct work_struct *work)
pr_info("mds%d hung\n", s->s_mds);
}
}
- if (s->s_state < CEPH_MDS_SESSION_OPEN) {
+ if (s->s_state == CEPH_MDS_SESSION_NEW ||
+ s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+ s->s_state == CEPH_MDS_SESSION_REJECTED) {
/* this mds is failed or recovering, just wait */
ceph_put_mds_session(s);
continue;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index a7e763dac038..29ed1688a1d3 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -460,6 +460,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap;
struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_buffer *old_blob = NULL;
int used, dirty;
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
@@ -536,7 +537,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
capsnap->gid = inode->i_gid;
if (dirty & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
+ old_blob = __ceph_build_xattrs_blob(ci);
capsnap->xattr_blob =
ceph_buffer_get(ci->i_xattrs.blob);
capsnap->xattr_version = ci->i_xattrs.version;
@@ -579,6 +580,7 @@ update_snapc:
}
spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
kfree(capsnap);
ceph_put_snap_context(old_snapc);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f0694293b31a..088c4488b449 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -232,6 +232,7 @@ static int parse_fsopt_token(char *c, void *private)
return -ENOMEM;
break;
case Opt_fscache_uniq:
+#ifdef CONFIG_CEPH_FSCACHE
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = kstrndup(argstr[0].from,
argstr[0].to-argstr[0].from,
@@ -240,7 +241,10 @@ static int parse_fsopt_token(char *c, void *private)
return -ENOMEM;
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
break;
- /* misc */
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
case Opt_wsize:
if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
return -EINVAL;
@@ -312,8 +316,13 @@ static int parse_fsopt_token(char *c, void *private)
fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
break;
case Opt_fscache:
+#ifdef CONFIG_CEPH_FSCACHE
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
break;
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
break;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3e27a28aa44a..46f600107cb5 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -517,7 +517,12 @@ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
long long ordered_count)
{
- smp_mb__before_atomic();
+ /*
+ * Makes sure operations that setup readdir cache (update page
+ * cache and i_size) are strongly ordered w.r.t. the following
+ * atomic64_set() operations.
+ */
+ smp_mb();
atomic64_set(&ci->i_complete_seq[0], release_count);
atomic64_set(&ci->i_complete_seq[1], ordered_count);
}
@@ -830,7 +835,7 @@ extern int ceph_getattr(const struct path *path, struct kstat *stat,
int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
-extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
extern void __init ceph_xattr_init(void);
extern void ceph_xattr_exit(void);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index e1c4e0b12b4c..3a166f860b6c 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -75,7 +75,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
const char *ns_field = " pool_namespace=";
char buf[128];
size_t len, total_len = 0;
- int ret;
+ ssize_t ret;
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
@@ -99,11 +99,8 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
if (pool_ns)
total_len += strlen(ns_field) + pool_ns->len;
- if (!size) {
- ret = total_len;
- } else if (total_len > size) {
- ret = -ERANGE;
- } else {
+ ret = total_len;
+ if (size >= total_len) {
memcpy(val, buf, len);
ret = len;
if (pool_name) {
@@ -684,12 +681,15 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
/*
* If there are dirty xattrs, reencode xattrs into the prealloc_blob
- * and swap into place.
+ * and swap into place. It returns the old i_xattrs.blob (or NULL) so
+ * that it can be freed by the caller as the i_ceph_lock is likely to be
+ * held.
*/
-void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
{
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
+ struct ceph_buffer *old_blob = NULL;
void *dest;
dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
@@ -720,12 +720,14 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
+ old_blob = ci->i_xattrs.blob;
ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
ci->i_xattrs.prealloc_blob = NULL;
ci->i_xattrs.dirty = false;
ci->i_xattrs.version++;
}
+
+ return old_blob;
}
static inline int __get_request_mask(struct inode *in) {
@@ -761,8 +763,11 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
if (err)
return err;
err = -ENODATA;
- if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
+ if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
err = vxattr->getxattr_cb(ci, value, size);
+ if (size && size < err)
+ err = -ERANGE;
+ }
return err;
}
@@ -955,6 +960,7 @@ int __ceph_setxattr(struct inode *inode, const char *name,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
+ struct ceph_buffer *old_blob = NULL;
int issued;
int err;
int dirty = 0;
@@ -1023,13 +1029,15 @@ retry:
struct ceph_buffer *blob;
spin_unlock(&ci->i_ceph_lock);
- dout(" preaallocating new blob size=%d\n", required_blob_size);
+ ceph_buffer_put(old_blob); /* Shouldn't be required */
+ dout(" pre-allocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock);
+ /* prealloc_blob can't be released while holding i_ceph_lock */
if (ci->i_xattrs.prealloc_blob)
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ old_blob = ci->i_xattrs.prealloc_blob;
ci->i_xattrs.prealloc_blob = blob;
goto retry;
}
@@ -1045,6 +1053,7 @@ retry:
}
spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (dirty)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7b7ab10a9db1..600bb838c15b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1210,6 +1210,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
struct cifsInodeInfo {
bool can_cache_brlcks;
struct list_head llist; /* locks helb by this inode */
+ /*
+ * NOTE: Some code paths call down_read(lock_sem) twice, so
+ * we must always use use cifs_down_write() instead of down_write()
+ * for this semaphore to avoid deadlocks.
+ */
struct rw_semaphore lock_sem; /* protect the fields above */
/* BB add in lists for dirty pages i.e. write caching info for oplock */
struct list_head openFileList;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ccdb42f71b2e..3a7fb8e750e9 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -149,6 +149,7 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
+extern void cifs_down_write(struct rw_semaphore *sem);
extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
struct file *file,
struct tcon_link *tlink,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 33cd844579ae..f523a9ca9574 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -554,10 +554,10 @@ static bool
server_unresponsive(struct TCP_Server_Info *server)
{
/*
- * We need to wait 2 echo intervals to make sure we handle such
+ * We need to wait 3 echo intervals to make sure we handle such
* situations right:
* 1s client sends a normal SMB request
- * 2s client gets a response
+ * 3s client gets a response
* 30s echo workqueue job pops, and decides we got a response recently
* and don't need to send another
* ...
@@ -566,9 +566,9 @@ server_unresponsive(struct TCP_Server_Info *server)
*/
if ((server->tcpStatus == CifsGood ||
server->tcpStatus == CifsNeedNegotiate) &&
- time_after(jiffies, server->lstrp + 2 * server->echo_interval)) {
+ time_after(jiffies, server->lstrp + 3 * server->echo_interval)) {
cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n",
- server->hostname, (2 * server->echo_interval) / HZ);
+ server->hostname, (3 * server->echo_interval) / HZ);
cifs_reconnect(server);
wake_up(&server->response_q);
return true;
@@ -2542,6 +2542,7 @@ static int
cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
{
int rc = 0;
+ int is_domain = 0;
const char *delim, *payload;
char *desc;
ssize_t len;
@@ -2589,6 +2590,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
rc = PTR_ERR(key);
goto out_err;
}
+ is_domain = 1;
}
down_read(&key->sem);
@@ -2646,6 +2648,26 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
goto out_key_put;
}
+ /*
+ * If we have a domain key then we must set the domainName in the
+ * for the request.
+ */
+ if (is_domain && ses->domainName) {
+ vol->domainname = kstrndup(ses->domainName,
+ strlen(ses->domainName),
+ GFP_KERNEL);
+ if (!vol->domainname) {
+ cifs_dbg(FYI, "Unable to allocate %zd bytes for "
+ "domain\n", len);
+ rc = -ENOMEM;
+ kfree(vol->username);
+ vol->username = NULL;
+ kzfree(vol->password);
+ vol->password = NULL;
+ goto out_key_put;
+ }
+ }
+
out_key_put:
up_read(&key->sem);
key_put(key);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ca98afda3cdb..f00a7ce3eb6e 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -841,10 +841,16 @@ lookup_out:
static int
cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
+ struct inode *inode;
+
if (flags & LOOKUP_RCU)
return -ECHILD;
if (d_really_is_positive(direntry)) {
+ inode = d_inode(direntry);
+ if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
+ CIFS_I(inode)->time = 0; /* force reval */
+
if (cifs_revalidate_dentry(direntry))
return 0;
else {
@@ -855,7 +861,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
* attributes will have been updated by
* cifs_revalidate_dentry().
*/
- if (IS_AUTOMOUNT(d_inode(direntry)) &&
+ if (IS_AUTOMOUNT(inode) &&
!(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
spin_lock(&direntry->d_lock);
direntry->d_flags |= DCACHE_NEED_AUTOMOUNT;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6ee8f9270892..6dc0e092b0fc 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -252,6 +252,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
xid, fid);
+ if (rc) {
+ server->ops->close(xid, tcon, fid);
+ if (rc == -ESTALE)
+ rc = -EOPENSTALE;
+ }
+
out:
kfree(buf);
return rc;
@@ -274,6 +280,13 @@ cifs_has_mand_locks(struct cifsInodeInfo *cinode)
return has_locks;
}
+void
+cifs_down_write(struct rw_semaphore *sem)
+{
+ while (!down_write_trylock(sem))
+ msleep(10);
+}
+
struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
@@ -299,9 +312,6 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
INIT_LIST_HEAD(&fdlocks->locks);
fdlocks->cfile = cfile;
cfile->llist = fdlocks;
- down_write(&cinode->lock_sem);
- list_add(&fdlocks->llist, &cinode->llist);
- up_write(&cinode->lock_sem);
cfile->count = 1;
cfile->pid = current->tgid;
@@ -325,6 +335,10 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
oplock = 0;
}
+ cifs_down_write(&cinode->lock_sem);
+ list_add(&fdlocks->llist, &cinode->llist);
+ up_write(&cinode->lock_sem);
+
spin_lock(&tcon->open_file_lock);
if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
oplock = fid->pending_open->oplock;
@@ -451,7 +465,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
*/
- down_write(&cifsi->lock_sem);
+ cifs_down_write(&cifsi->lock_sem);
list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
list_del(&li->llist);
cifs_del_lock_waiters(li);
@@ -1005,7 +1019,7 @@ static void
cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
{
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_add_tail(&lock->llist, &cfile->llist->locks);
up_write(&cinode->lock_sem);
}
@@ -1027,7 +1041,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
try_again:
exist = false;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
lock->type, &conf_lock, CIFS_LOCK_OP);
@@ -1049,7 +1063,7 @@ try_again:
(lock->blist.next == &lock->blist));
if (!rc)
goto try_again;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_del_init(&lock->blist);
}
@@ -1102,7 +1116,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
return rc;
try_again:
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
return rc;
@@ -1308,7 +1322,7 @@ cifs_push_locks(struct cifsFileInfo *cfile)
int rc = 0;
/* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
return rc;
@@ -1499,7 +1513,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
if (!buf)
return -ENOMEM;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
for (i = 0; i < 2; i++) {
cur = buf;
num = 0;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e7192ee7a89c..a35c14105906 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -410,6 +410,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* if uniqueid is different, return error */
if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*pinode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgiiu_exit;
}
@@ -417,6 +418,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* if filetype is different, return error */
if (unlikely(((*pinode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*pinode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgiiu_exit;
}
@@ -925,6 +927,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* if uniqueid is different, return error */
if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgii_exit;
}
@@ -932,6 +935,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* if filetype is different, return error */
if (unlikely(((*inode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgii_exit;
}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index cc88f4f0325e..bed973330227 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -130,10 +130,6 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
{0, 0}
};
-static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
- {0, 0}
-};
-
/*
* Convert a string containing text IPv4 or IPv6 address to binary form.
*
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index f50d3d0b9b87..483458340b10 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -181,6 +181,9 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
/* we do not want to loop forever */
last_mid = cur_mid;
cur_mid++;
+ /* avoid 0xFFFF MID */
+ if (cur_mid == 0xffff)
+ cur_mid++;
/*
* This nested loop looks more expensive than it is.
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 1add404618f0..2c809233084b 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -139,7 +139,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
cur = buf;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
if (flock->fl_start > li->offset ||
(flock->fl_start + length) <
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 92c9cdf4704d..49911bdc17ec 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -456,7 +456,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_FILE_INVALID, -EIO, "STATUS_FILE_INVALID"},
{STATUS_ALLOTTED_SPACE_EXCEEDED, -EIO,
"STATUS_ALLOTTED_SPACE_EXCEEDED"},
- {STATUS_INSUFFICIENT_RESOURCES, -EREMOTEIO,
+ {STATUS_INSUFFICIENT_RESOURCES, -EAGAIN,
"STATUS_INSUFFICIENT_RESOURCES"},
{STATUS_DFS_EXIT_PATH_FOUND, -EIO, "STATUS_DFS_EXIT_PATH_FOUND"},
{STATUS_DEVICE_DATA_ERROR, -EIO, "STATUS_DEVICE_DATA_ERROR"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 31f01f09d25a..ff2ad15f67d6 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -622,10 +622,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
- cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&tcon->open_file_lock);
list_for_each(tmp2, &tcon->openFileList) {
cfile = list_entry(tmp2, struct cifsFileInfo,
@@ -637,6 +637,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
continue;
cifs_dbg(FYI, "file id match, oplock break\n");
+ cifs_stats_inc(
+ &tcon->stats.cifs_stats.num_oplock_brks);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_lock(&cfile->file_info_lock);
if (!CIFS_CACHE_WRITE(cinode) &&
@@ -669,9 +671,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
return true;
}
spin_unlock(&tcon->open_file_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- cifs_dbg(FYI, "No matching file for oplock break\n");
- return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 23326b0cd562..951c444d83e7 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1975,6 +1975,11 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
return;
+ /* Check if the server granted an oplock rather than a lease */
+ if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE)
+ return smb2_set_oplock_level(cinode, oplock, epoch,
+ purge_cache);
+
if (oplock & SMB2_LEASE_READ_CACHING_HE) {
new_oplock |= CIFS_CACHE_READ_FLG;
strcat(message, "R");
@@ -2168,7 +2173,15 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq)
static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
unsigned int buflen)
{
- sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
+ void *addr;
+ /*
+ * VMAP_STACK (at least) puts stack into the vmalloc address space
+ */
+ if (is_vmalloc_addr(buf))
+ addr = vmalloc_to_page(buf);
+ else
+ addr = virt_to_page(buf);
+ sg_set_page(sg, addr, buflen, offset_in_page(buf));
}
static struct scatterlist *
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index fd2d199dd413..0e1c36c92f60 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -166,7 +166,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if (tcon == NULL)
return 0;
- if (smb2_command == SMB2_TREE_CONNECT)
+ if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL)
return 0;
if (tcon->tidStatus == CifsExiting) {
@@ -834,7 +834,12 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
else
req->SecurityMode = 0;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ req->Capabilities = cpu_to_le32(SMB2_GLOBAL_CAP_DFS);
+#else
req->Capabilities = 0;
+#endif /* DFS_UPCALL */
+
req->Channel = 0; /* MBZ */
sess_data->iov[0].iov_base = (char *)req;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 52f975d848a0..26219d9db575 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -31,7 +31,7 @@
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
-#define MAX_EA_VALUE_SIZE 65535
+#define MAX_EA_VALUE_SIZE CIFSMaxBufSize
#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
#define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */
#define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 1cbc1f2298ee..43d371551d2b 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -27,6 +27,13 @@
#include "coda_linux.h"
#include "coda_int.h"
+struct coda_vm_ops {
+ atomic_t refcnt;
+ struct file *coda_file;
+ const struct vm_operations_struct *host_vm_ops;
+ struct vm_operations_struct vm_ops;
+};
+
static ssize_t
coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -61,6 +68,34 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
+static void
+coda_vm_open(struct vm_area_struct *vma)
+{
+ struct coda_vm_ops *cvm_ops =
+ container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
+
+ atomic_inc(&cvm_ops->refcnt);
+
+ if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open)
+ cvm_ops->host_vm_ops->open(vma);
+}
+
+static void
+coda_vm_close(struct vm_area_struct *vma)
+{
+ struct coda_vm_ops *cvm_ops =
+ container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
+
+ if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close)
+ cvm_ops->host_vm_ops->close(vma);
+
+ if (atomic_dec_and_test(&cvm_ops->refcnt)) {
+ vma->vm_ops = cvm_ops->host_vm_ops;
+ fput(cvm_ops->coda_file);
+ kfree(cvm_ops);
+ }
+}
+
static int
coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
{
@@ -68,6 +103,8 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
struct coda_inode_info *cii;
struct file *host_file;
struct inode *coda_inode, *host_inode;
+ struct coda_vm_ops *cvm_ops;
+ int ret;
cfi = CODA_FTOC(coda_file);
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -76,6 +113,13 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
if (!host_file->f_op->mmap)
return -ENODEV;
+ if (WARN_ON(coda_file != vma->vm_file))
+ return -EIO;
+
+ cvm_ops = kmalloc(sizeof(struct coda_vm_ops), GFP_KERNEL);
+ if (!cvm_ops)
+ return -ENOMEM;
+
coda_inode = file_inode(coda_file);
host_inode = file_inode(host_file);
@@ -89,6 +133,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
* the container file on us! */
else if (coda_inode->i_mapping != host_inode->i_mapping) {
spin_unlock(&cii->c_lock);
+ kfree(cvm_ops);
return -EBUSY;
}
@@ -97,7 +142,29 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
cfi->cfi_mapcount++;
spin_unlock(&cii->c_lock);
- return call_mmap(host_file, vma);
+ vma->vm_file = get_file(host_file);
+ ret = call_mmap(vma->vm_file, vma);
+
+ if (ret) {
+ /* if call_mmap fails, our caller will put coda_file so we
+ * should drop the reference to the host_file that we got.
+ */
+ fput(host_file);
+ kfree(cvm_ops);
+ } else {
+ /* here we add redirects for the open/close vm_operations */
+ cvm_ops->host_vm_ops = vma->vm_ops;
+ if (vma->vm_ops)
+ cvm_ops->vm_ops = *vma->vm_ops;
+
+ cvm_ops->vm_ops.open = coda_vm_open;
+ cvm_ops->vm_ops.close = coda_vm_close;
+ cvm_ops->coda_file = coda_file;
+ atomic_set(&cvm_ops->refcnt, 1);
+
+ vma->vm_ops = &cvm_ops->vm_ops;
+ }
+ return ret;
}
int coda_open(struct inode *coda_inode, struct file *coda_file)
@@ -207,4 +274,3 @@ const struct file_operations coda_file_operations = {
.fsync = coda_fsync,
.splice_read = generic_file_splice_read,
};
-
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index f40e3953e7fe..a6d9e841a375 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -187,8 +187,11 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
if (req->uc_opcode == CODA_OPEN_BY_FD) {
struct coda_open_by_fd_out *outp =
(struct coda_open_by_fd_out *)req->uc_data;
- if (!outp->oh.result)
+ if (!outp->oh.result) {
outp->fh = fget(outp->fd);
+ if (!outp->fh)
+ return -EBADF;
+ }
}
wake_up(&req->uc_sleep);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index bd5d91e119ca..033e8e6aabb7 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -161,6 +161,7 @@ struct compat_video_event {
unsigned int frame_rate;
} u;
};
+#define VIDEO_GET_EVENT32 _IOR('o', 28, struct compat_video_event)
static int do_video_get_event(struct file *file,
unsigned int cmd, struct compat_video_event __user *up)
@@ -172,7 +173,7 @@ static int do_video_get_event(struct file *file,
if (kevent == NULL)
return -EFAULT;
- err = do_ioctl(file, cmd, (unsigned long)kevent);
+ err = do_ioctl(file, VIDEO_GET_EVENT, (unsigned long)kevent);
if (!err) {
err = convert_in_user(&kevent->type, &up->type);
err |= convert_in_user(&kevent->timestamp, &up->timestamp);
@@ -191,6 +192,7 @@ struct compat_video_still_picture {
compat_uptr_t iFrame;
int32_t size;
};
+#define VIDEO_STILLPICTURE32 _IOW('o', 30, struct compat_video_still_picture)
static int do_video_stillpicture(struct file *file,
unsigned int cmd, struct compat_video_still_picture __user *up)
@@ -213,7 +215,7 @@ static int do_video_stillpicture(struct file *file,
if (err)
return -EFAULT;
- err = do_ioctl(file, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, VIDEO_STILLPICTURE, (unsigned long) up_native);
return err;
}
@@ -1032,9 +1034,6 @@ COMPATIBLE_IOCTL(PPPIOCDISCONN)
COMPATIBLE_IOCTL(PPPIOCATTCHAN)
COMPATIBLE_IOCTL(PPPIOCGCHAN)
COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
-/* PPPOX */
-COMPATIBLE_IOCTL(PPPOEIOCSFWD)
-COMPATIBLE_IOCTL(PPPOEIOCDFWD)
/* Big A */
/* sparc only */
/* Big Q for sound/OSS */
@@ -1479,9 +1478,9 @@ static long do_ioctl_trans(unsigned int cmd,
return rtc_ioctl(file, cmd, argp);
/* dvb */
- case VIDEO_GET_EVENT:
+ case VIDEO_GET_EVENT32:
return do_video_get_event(file, cmd, argp);
- case VIDEO_STILLPICTURE:
+ case VIDEO_STILLPICTURE32:
return do_video_stillpicture(file, cmd, argp);
case VIDEO_SET_SPU_PALETTE:
return do_video_set_spu_palette(file, cmd, argp);
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index ccc31fa6f1a7..16eb59adf5aa 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -34,6 +34,15 @@
#include <linux/list.h>
#include <linux/spinlock.h>
+struct configfs_fragment {
+ atomic_t frag_count;
+ struct rw_semaphore frag_sem;
+ bool frag_dead;
+};
+
+void put_fragment(struct configfs_fragment *);
+struct configfs_fragment *get_fragment(struct configfs_fragment *);
+
struct configfs_dirent {
atomic_t s_count;
int s_dependent_count;
@@ -48,6 +57,7 @@ struct configfs_dirent {
#ifdef CONFIG_LOCKDEP
int s_depth;
#endif
+ struct configfs_fragment *s_frag;
};
#define CONFIGFS_ROOT 0x0001
@@ -75,8 +85,8 @@ extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct in
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_create_bin_file(struct config_item *,
const struct configfs_bin_attribute *);
-extern int configfs_make_dirent(struct configfs_dirent *,
- struct dentry *, void *, umode_t, int);
+extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *,
+ void *, umode_t, int, struct configfs_fragment *);
extern int configfs_dirent_is_ready(struct configfs_dirent *);
extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
@@ -151,6 +161,7 @@ static inline void release_configfs_dirent(struct configfs_dirent * sd)
{
if (!(sd->s_type & CONFIGFS_ROOT)) {
kfree(sd->s_iattr);
+ put_fragment(sd->s_frag);
kmem_cache_free(configfs_dir_cachep, sd);
}
}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d7955dc56737..c2ef617d2f97 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -58,15 +58,13 @@ static void configfs_d_iput(struct dentry * dentry,
if (sd) {
/* Coordinate with configfs_readdir */
spin_lock(&configfs_dirent_lock);
- /* Coordinate with configfs_attach_attr where will increase
- * sd->s_count and update sd->s_dentry to new allocated one.
- * Only set sd->dentry to null when this dentry is the only
- * sd owner.
- * If not do so, configfs_d_iput may run just after
- * configfs_attach_attr and set sd->s_dentry to null
- * even it's still in use.
+ /*
+ * Set sd->s_dentry to null only when this dentry is the one
+ * that is going to be killed. Otherwise configfs_d_iput may
+ * run just after configfs_attach_attr and set sd->s_dentry to
+ * NULL even it's still in use.
*/
- if (atomic_read(&sd->s_count) <= 2)
+ if (sd->s_dentry == dentry)
sd->s_dentry = NULL;
spin_unlock(&configfs_dirent_lock);
@@ -166,11 +164,38 @@ configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
#endif /* CONFIG_LOCKDEP */
+static struct configfs_fragment *new_fragment(void)
+{
+ struct configfs_fragment *p;
+
+ p = kmalloc(sizeof(struct configfs_fragment), GFP_KERNEL);
+ if (p) {
+ atomic_set(&p->frag_count, 1);
+ init_rwsem(&p->frag_sem);
+ p->frag_dead = false;
+ }
+ return p;
+}
+
+void put_fragment(struct configfs_fragment *frag)
+{
+ if (frag && atomic_dec_and_test(&frag->frag_count))
+ kfree(frag);
+}
+
+struct configfs_fragment *get_fragment(struct configfs_fragment *frag)
+{
+ if (likely(frag))
+ atomic_inc(&frag->frag_count);
+ return frag;
+}
+
/*
* Allocates a new configfs_dirent and links it to the parent configfs_dirent
*/
static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
- void *element, int type)
+ void *element, int type,
+ struct configfs_fragment *frag)
{
struct configfs_dirent * sd;
@@ -190,6 +215,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren
kmem_cache_free(configfs_dir_cachep, sd);
return ERR_PTR(-ENOENT);
}
+ sd->s_frag = get_fragment(frag);
list_add(&sd->s_sibling, &parent_sd->s_children);
spin_unlock(&configfs_dirent_lock);
@@ -224,11 +250,11 @@ static int configfs_dirent_exists(struct configfs_dirent *parent_sd,
int configfs_make_dirent(struct configfs_dirent * parent_sd,
struct dentry * dentry, void * element,
- umode_t mode, int type)
+ umode_t mode, int type, struct configfs_fragment *frag)
{
struct configfs_dirent * sd;
- sd = configfs_new_dirent(parent_sd, element, type);
+ sd = configfs_new_dirent(parent_sd, element, type, frag);
if (IS_ERR(sd))
return PTR_ERR(sd);
@@ -275,7 +301,8 @@ static void init_symlink(struct inode * inode)
* until it is validated by configfs_dir_set_ready()
*/
-static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
+static int configfs_create_dir(struct config_item *item, struct dentry *dentry,
+ struct configfs_fragment *frag)
{
int error;
umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
@@ -288,7 +315,8 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
return error;
error = configfs_make_dirent(p->d_fsdata, dentry, item, mode,
- CONFIGFS_DIR | CONFIGFS_USET_CREATING);
+ CONFIGFS_DIR | CONFIGFS_USET_CREATING,
+ frag);
if (unlikely(error))
return error;
@@ -353,9 +381,10 @@ int configfs_create_link(struct configfs_symlink *sl,
{
int err = 0;
umode_t mode = S_IFLNK | S_IRWXUGO;
+ struct configfs_dirent *p = parent->d_fsdata;
- err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
- CONFIGFS_ITEM_LINK);
+ err = configfs_make_dirent(p, dentry, sl, mode,
+ CONFIGFS_ITEM_LINK, p->s_frag);
if (!err) {
err = configfs_create(dentry, mode, init_symlink);
if (err) {
@@ -614,7 +643,8 @@ static int populate_attrs(struct config_item *item)
static int configfs_attach_group(struct config_item *parent_item,
struct config_item *item,
- struct dentry *dentry);
+ struct dentry *dentry,
+ struct configfs_fragment *frag);
static void configfs_detach_group(struct config_item *item);
static void detach_groups(struct config_group *group)
@@ -662,7 +692,8 @@ static void detach_groups(struct config_group *group)
* try using vfs_mkdir. Just a thought.
*/
static int create_default_group(struct config_group *parent_group,
- struct config_group *group)
+ struct config_group *group,
+ struct configfs_fragment *frag)
{
int ret;
struct configfs_dirent *sd;
@@ -678,7 +709,7 @@ static int create_default_group(struct config_group *parent_group,
d_add(child, NULL);
ret = configfs_attach_group(&parent_group->cg_item,
- &group->cg_item, child);
+ &group->cg_item, child, frag);
if (!ret) {
sd = child->d_fsdata;
sd->s_type |= CONFIGFS_USET_DEFAULT;
@@ -692,13 +723,14 @@ static int create_default_group(struct config_group *parent_group,
return ret;
}
-static int populate_groups(struct config_group *group)
+static int populate_groups(struct config_group *group,
+ struct configfs_fragment *frag)
{
struct config_group *new_group;
int ret = 0;
list_for_each_entry(new_group, &group->default_groups, group_entry) {
- ret = create_default_group(group, new_group);
+ ret = create_default_group(group, new_group, frag);
if (ret) {
detach_groups(group);
break;
@@ -812,11 +844,12 @@ static void link_group(struct config_group *parent_group, struct config_group *g
*/
static int configfs_attach_item(struct config_item *parent_item,
struct config_item *item,
- struct dentry *dentry)
+ struct dentry *dentry,
+ struct configfs_fragment *frag)
{
int ret;
- ret = configfs_create_dir(item, dentry);
+ ret = configfs_create_dir(item, dentry, frag);
if (!ret) {
ret = populate_attrs(item);
if (ret) {
@@ -846,12 +879,13 @@ static void configfs_detach_item(struct config_item *item)
static int configfs_attach_group(struct config_item *parent_item,
struct config_item *item,
- struct dentry *dentry)
+ struct dentry *dentry,
+ struct configfs_fragment *frag)
{
int ret;
struct configfs_dirent *sd;
- ret = configfs_attach_item(parent_item, item, dentry);
+ ret = configfs_attach_item(parent_item, item, dentry, frag);
if (!ret) {
sd = dentry->d_fsdata;
sd->s_type |= CONFIGFS_USET_DIR;
@@ -867,7 +901,7 @@ static int configfs_attach_group(struct config_item *parent_item,
*/
inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
configfs_adjust_dir_dirent_depth_before_populate(sd);
- ret = populate_groups(to_config_group(item));
+ ret = populate_groups(to_config_group(item), frag);
if (ret) {
configfs_detach_item(item);
d_inode(dentry)->i_flags |= S_DEAD;
@@ -1262,6 +1296,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct configfs_dirent *sd;
struct config_item_type *type;
struct module *subsys_owner = NULL, *new_item_owner = NULL;
+ struct configfs_fragment *frag;
char *name;
sd = dentry->d_parent->d_fsdata;
@@ -1280,6 +1315,12 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
goto out;
}
+ frag = new_fragment();
+ if (!frag) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/* Get a working ref for the duration of this function */
parent_item = configfs_get_config_item(dentry->d_parent);
type = parent_item->ci_type;
@@ -1382,9 +1423,9 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
spin_unlock(&configfs_dirent_lock);
if (group)
- ret = configfs_attach_group(parent_item, item, dentry);
+ ret = configfs_attach_group(parent_item, item, dentry, frag);
else
- ret = configfs_attach_item(parent_item, item, dentry);
+ ret = configfs_attach_item(parent_item, item, dentry, frag);
spin_lock(&configfs_dirent_lock);
sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
@@ -1421,6 +1462,7 @@ out_put:
* reference.
*/
config_item_put(parent_item);
+ put_fragment(frag);
out:
return ret;
@@ -1432,6 +1474,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct config_item *item;
struct configfs_subsystem *subsys;
struct configfs_dirent *sd;
+ struct configfs_fragment *frag;
struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
@@ -1489,6 +1532,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
}
} while (ret == -EAGAIN);
+ frag = sd->s_frag;
+ if (down_write_killable(&frag->frag_sem)) {
+ spin_lock(&configfs_dirent_lock);
+ configfs_detach_rollback(dentry);
+ spin_unlock(&configfs_dirent_lock);
+ return -EINTR;
+ }
+ frag->frag_dead = true;
+ up_write(&frag->frag_sem);
+
/* Get a working ref for the duration of this function */
item = configfs_get_config_item(dentry);
@@ -1589,7 +1642,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
*/
err = -ENOENT;
if (configfs_dirent_is_ready(parent_sd)) {
- file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
+ file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL);
if (IS_ERR(file->private_data))
err = PTR_ERR(file->private_data);
else
@@ -1745,8 +1798,13 @@ int configfs_register_group(struct config_group *parent_group,
{
struct configfs_subsystem *subsys = parent_group->cg_subsys;
struct dentry *parent;
+ struct configfs_fragment *frag;
int ret;
+ frag = new_fragment();
+ if (!frag)
+ return -ENOMEM;
+
mutex_lock(&subsys->su_mutex);
link_group(parent_group, group);
mutex_unlock(&subsys->su_mutex);
@@ -1754,7 +1812,7 @@ int configfs_register_group(struct config_group *parent_group,
parent = parent_group->cg_item.ci_dentry;
inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
- ret = create_default_group(parent_group, group);
+ ret = create_default_group(parent_group, group, frag);
if (ret)
goto err_out;
@@ -1762,12 +1820,14 @@ int configfs_register_group(struct config_group *parent_group,
configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
spin_unlock(&configfs_dirent_lock);
inode_unlock(d_inode(parent));
+ put_fragment(frag);
return 0;
err_out:
inode_unlock(d_inode(parent));
mutex_lock(&subsys->su_mutex);
unlink_group(group);
mutex_unlock(&subsys->su_mutex);
+ put_fragment(frag);
return ret;
}
EXPORT_SYMBOL(configfs_register_group);
@@ -1783,16 +1843,12 @@ void configfs_unregister_group(struct config_group *group)
struct configfs_subsystem *subsys = group->cg_subsys;
struct dentry *dentry = group->cg_item.ci_dentry;
struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
+ struct configfs_dirent *sd = dentry->d_fsdata;
+ struct configfs_fragment *frag = sd->s_frag;
- mutex_lock(&subsys->su_mutex);
- if (!group->cg_item.ci_parent->ci_group) {
- /*
- * The parent has already been unlinked and detached
- * due to a rmdir.
- */
- goto unlink_group;
- }
- mutex_unlock(&subsys->su_mutex);
+ down_write(&frag->frag_sem);
+ frag->frag_dead = true;
+ up_write(&frag->frag_sem);
inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
spin_lock(&configfs_dirent_lock);
@@ -1808,7 +1864,6 @@ void configfs_unregister_group(struct config_group *group)
dput(dentry);
mutex_lock(&subsys->su_mutex);
-unlink_group:
unlink_group(group);
mutex_unlock(&subsys->su_mutex);
}
@@ -1865,10 +1920,17 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
struct dentry *dentry;
struct dentry *root;
struct configfs_dirent *sd;
+ struct configfs_fragment *frag;
+
+ frag = new_fragment();
+ if (!frag)
+ return -ENOMEM;
root = configfs_pin_fs();
- if (IS_ERR(root))
+ if (IS_ERR(root)) {
+ put_fragment(frag);
return PTR_ERR(root);
+ }
if (!group->cg_item.ci_name)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
@@ -1884,7 +1946,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
d_add(dentry, NULL);
err = configfs_attach_group(sd->s_element, &group->cg_item,
- dentry);
+ dentry, frag);
if (err) {
BUG_ON(d_inode(dentry));
d_drop(dentry);
@@ -1902,6 +1964,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
unlink_group(group);
configfs_release_fs();
}
+ put_fragment(frag);
return err;
}
@@ -1911,12 +1974,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
struct config_group *group = &subsys->su_group;
struct dentry *dentry = group->cg_item.ci_dentry;
struct dentry *root = dentry->d_sb->s_root;
+ struct configfs_dirent *sd = dentry->d_fsdata;
+ struct configfs_fragment *frag = sd->s_frag;
if (dentry->d_parent != root) {
pr_err("Tried to unregister non-subsystem!\n");
return;
}
+ down_write(&frag->frag_sem);
+ frag->frag_dead = true;
+ up_write(&frag->frag_sem);
+
inode_lock_nested(d_inode(root),
I_MUTEX_PARENT);
inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 39da1103d341..bb0a427517e9 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -53,40 +53,44 @@ struct configfs_buffer {
bool write_in_progress;
char *bin_buffer;
int bin_buffer_size;
+ int cb_max_size;
+ struct config_item *item;
+ struct module *owner;
+ union {
+ struct configfs_attribute *attr;
+ struct configfs_bin_attribute *bin_attr;
+ };
};
+static inline struct configfs_fragment *to_frag(struct file *file)
+{
+ struct configfs_dirent *sd = file->f_path.dentry->d_fsdata;
-/**
- * fill_read_buffer - allocate and fill buffer from item.
- * @dentry: dentry pointer.
- * @buffer: data buffer for file.
- *
- * Allocate @buffer->page, if it hasn't been already, then call the
- * config_item's show() method to fill the buffer with this attribute's
- * data.
- * This is called only once, on the file's first read.
- */
-static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+ return sd->s_frag;
+}
+
+static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer)
{
- struct configfs_attribute * attr = to_attr(dentry);
- struct config_item * item = to_item(dentry->d_parent);
- int ret = 0;
- ssize_t count;
+ struct configfs_fragment *frag = to_frag(file);
+ ssize_t count = -ENOENT;
if (!buffer->page)
buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
if (!buffer->page)
return -ENOMEM;
- count = attr->show(item, buffer->page);
-
- BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
- if (count >= 0) {
- buffer->needs_read_fill = 0;
- buffer->count = count;
- } else
- ret = count;
- return ret;
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead)
+ count = buffer->attr->show(buffer->item, buffer->page);
+ up_read(&frag->frag_sem);
+
+ if (count < 0)
+ return count;
+ if (WARN_ON_ONCE(count > (ssize_t)SIMPLE_ATTR_SIZE))
+ return -EIO;
+ buffer->needs_read_fill = 0;
+ buffer->count = count;
+ return 0;
}
/**
@@ -111,12 +115,13 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
static ssize_t
configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- struct configfs_buffer * buffer = file->private_data;
+ struct configfs_buffer *buffer = file->private_data;
ssize_t retval = 0;
mutex_lock(&buffer->mutex);
if (buffer->needs_read_fill) {
- if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
+ retval = fill_read_buffer(file, buffer);
+ if (retval)
goto out;
}
pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
@@ -152,10 +157,8 @@ static ssize_t
configfs_read_bin_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ struct configfs_fragment *frag = to_frag(file);
struct configfs_buffer *buffer = file->private_data;
- struct dentry *dentry = file->f_path.dentry;
- struct config_item *item = to_item(dentry->d_parent);
- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
ssize_t retval = 0;
ssize_t len = min_t(size_t, count, PAGE_SIZE);
@@ -166,18 +169,23 @@ configfs_read_bin_file(struct file *file, char __user *buf,
retval = -ETXTBSY;
goto out;
}
- buffer->read_in_progress = 1;
+ buffer->read_in_progress = true;
if (buffer->needs_read_fill) {
/* perform first read with buf == NULL to get extent */
- len = bin_attr->read(item, NULL, 0);
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead)
+ len = buffer->bin_attr->read(buffer->item, NULL, 0);
+ else
+ len = -ENOENT;
+ up_read(&frag->frag_sem);
if (len <= 0) {
retval = len;
goto out;
}
/* do not exceed the maximum value */
- if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) {
+ if (buffer->cb_max_size && len > buffer->cb_max_size) {
retval = -EFBIG;
goto out;
}
@@ -190,7 +198,13 @@ configfs_read_bin_file(struct file *file, char __user *buf,
buffer->bin_buffer_size = len;
/* perform second read to fill buffer */
- len = bin_attr->read(item, buffer->bin_buffer, len);
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead)
+ len = buffer->bin_attr->read(buffer->item,
+ buffer->bin_buffer, len);
+ else
+ len = -ENOENT;
+ up_read(&frag->frag_sem);
if (len < 0) {
retval = len;
vfree(buffer->bin_buffer);
@@ -240,25 +254,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
return error ? -EFAULT : count;
}
-
-/**
- * flush_write_buffer - push buffer to config_item.
- * @dentry: dentry to the attribute
- * @buffer: data buffer for file.
- * @count: number of bytes
- *
- * Get the correct pointers for the config_item and the attribute we're
- * dealing with, then call the store() method for the attribute,
- * passing the buffer that we acquired in fill_write_buffer().
- */
-
static int
-flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t count)
{
- struct configfs_attribute * attr = to_attr(dentry);
- struct config_item * item = to_item(dentry->d_parent);
-
- return attr->store(item, buffer->page, count);
+ struct configfs_fragment *frag = to_frag(file);
+ int res = -ENOENT;
+
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead)
+ res = buffer->attr->store(buffer->item, buffer->page, count);
+ up_read(&frag->frag_sem);
+ return res;
}
@@ -282,13 +288,13 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size
static ssize_t
configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct configfs_buffer * buffer = file->private_data;
+ struct configfs_buffer *buffer = file->private_data;
ssize_t len;
mutex_lock(&buffer->mutex);
len = fill_write_buffer(buffer, buf, count);
if (len > 0)
- len = flush_write_buffer(file->f_path.dentry, buffer, len);
+ len = flush_write_buffer(file, buffer, len);
if (len > 0)
*ppos += len;
mutex_unlock(&buffer->mutex);
@@ -313,8 +319,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct configfs_buffer *buffer = file->private_data;
- struct dentry *dentry = file->f_path.dentry;
- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
void *tbuf = NULL;
ssize_t len;
@@ -325,13 +329,13 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
len = -ETXTBSY;
goto out;
}
- buffer->write_in_progress = 1;
+ buffer->write_in_progress = true;
/* buffer grows? */
if (*ppos + count > buffer->bin_buffer_size) {
- if (bin_attr->cb_max_size &&
- *ppos + count > bin_attr->cb_max_size) {
+ if (buffer->cb_max_size &&
+ *ppos + count > buffer->cb_max_size) {
len = -EFBIG;
goto out;
}
@@ -363,31 +367,51 @@ out:
return len;
}
-static int check_perm(struct inode * inode, struct file * file, int type)
+static int __configfs_open_file(struct inode *inode, struct file *file, int type)
{
- struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent);
- struct configfs_attribute * attr = to_attr(file->f_path.dentry);
- struct configfs_bin_attribute *bin_attr = NULL;
- struct configfs_buffer * buffer;
- struct configfs_item_operations * ops = NULL;
- int error = 0;
+ struct dentry *dentry = file->f_path.dentry;
+ struct configfs_fragment *frag = to_frag(file);
+ struct configfs_attribute *attr;
+ struct configfs_buffer *buffer;
+ int error;
- if (!item || !attr)
- goto Einval;
+ error = -ENOMEM;
+ buffer = kzalloc(sizeof(struct configfs_buffer), GFP_KERNEL);
+ if (!buffer)
+ goto out;
- if (type & CONFIGFS_ITEM_BIN_ATTR)
- bin_attr = to_bin_attr(file->f_path.dentry);
+ error = -ENOENT;
+ down_read(&frag->frag_sem);
+ if (unlikely(frag->frag_dead))
+ goto out_free_buffer;
- /* Grab the module reference for this attribute if we have one */
- if (!try_module_get(attr->ca_owner)) {
- error = -ENODEV;
- goto Done;
+ error = -EINVAL;
+ buffer->item = to_item(dentry->d_parent);
+ if (!buffer->item)
+ goto out_free_buffer;
+
+ attr = to_attr(dentry);
+ if (!attr)
+ goto out_put_item;
+
+ if (type & CONFIGFS_ITEM_BIN_ATTR) {
+ buffer->bin_attr = to_bin_attr(dentry);
+ buffer->cb_max_size = buffer->bin_attr->cb_max_size;
+ } else {
+ buffer->attr = attr;
}
- if (item->ci_type)
- ops = item->ci_type->ct_item_ops;
- else
- goto Eaccess;
+ buffer->owner = attr->ca_owner;
+ /* Grab the module reference for this attribute if we have one */
+ error = -ENODEV;
+ if (!try_module_get(buffer->owner))
+ goto out_put_item;
+
+ error = -EACCES;
+ if (!buffer->item->ci_type)
+ goto out_put_module;
+
+ buffer->ops = buffer->item->ci_type->ct_item_ops;
/* File needs write support.
* The inode's perms must say it's ok,
@@ -395,13 +419,11 @@ static int check_perm(struct inode * inode, struct file * file, int type)
*/
if (file->f_mode & FMODE_WRITE) {
if (!(inode->i_mode & S_IWUGO))
- goto Eaccess;
-
+ goto out_put_module;
if ((type & CONFIGFS_ITEM_ATTR) && !attr->store)
- goto Eaccess;
-
- if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write)
- goto Eaccess;
+ goto out_put_module;
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->write)
+ goto out_put_module;
}
/* File needs read support.
@@ -410,92 +432,72 @@ static int check_perm(struct inode * inode, struct file * file, int type)
*/
if (file->f_mode & FMODE_READ) {
if (!(inode->i_mode & S_IRUGO))
- goto Eaccess;
-
+ goto out_put_module;
if ((type & CONFIGFS_ITEM_ATTR) && !attr->show)
- goto Eaccess;
-
- if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read)
- goto Eaccess;
+ goto out_put_module;
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->read)
+ goto out_put_module;
}
- /* No error? Great, allocate a buffer for the file, and store it
- * it in file->private_data for easy access.
- */
- buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
- if (!buffer) {
- error = -ENOMEM;
- goto Enomem;
- }
mutex_init(&buffer->mutex);
buffer->needs_read_fill = 1;
- buffer->read_in_progress = 0;
- buffer->write_in_progress = 0;
- buffer->ops = ops;
+ buffer->read_in_progress = false;
+ buffer->write_in_progress = false;
file->private_data = buffer;
- goto Done;
+ up_read(&frag->frag_sem);
+ return 0;
- Einval:
- error = -EINVAL;
- goto Done;
- Eaccess:
- error = -EACCES;
- Enomem:
- module_put(attr->ca_owner);
- Done:
- if (error && item)
- config_item_put(item);
+out_put_module:
+ module_put(buffer->owner);
+out_put_item:
+ config_item_put(buffer->item);
+out_free_buffer:
+ up_read(&frag->frag_sem);
+ kfree(buffer);
+out:
return error;
}
static int configfs_release(struct inode *inode, struct file *filp)
{
- struct config_item * item = to_item(filp->f_path.dentry->d_parent);
- struct configfs_attribute * attr = to_attr(filp->f_path.dentry);
- struct module * owner = attr->ca_owner;
- struct configfs_buffer * buffer = filp->private_data;
-
- if (item)
- config_item_put(item);
- /* After this point, attr should not be accessed. */
- module_put(owner);
-
- if (buffer) {
- if (buffer->page)
- free_page((unsigned long)buffer->page);
- mutex_destroy(&buffer->mutex);
- kfree(buffer);
- }
+ struct configfs_buffer *buffer = filp->private_data;
+
+ module_put(buffer->owner);
+ if (buffer->page)
+ free_page((unsigned long)buffer->page);
+ mutex_destroy(&buffer->mutex);
+ kfree(buffer);
return 0;
}
static int configfs_open_file(struct inode *inode, struct file *filp)
{
- return check_perm(inode, filp, CONFIGFS_ITEM_ATTR);
+ return __configfs_open_file(inode, filp, CONFIGFS_ITEM_ATTR);
}
static int configfs_open_bin_file(struct inode *inode, struct file *filp)
{
- return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
+ return __configfs_open_file(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
}
-static int configfs_release_bin_file(struct inode *inode, struct file *filp)
+static int configfs_release_bin_file(struct inode *inode, struct file *file)
{
- struct configfs_buffer *buffer = filp->private_data;
- struct dentry *dentry = filp->f_path.dentry;
- struct config_item *item = to_item(dentry->d_parent);
- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
- ssize_t len = 0;
- int ret;
+ struct configfs_buffer *buffer = file->private_data;
- buffer->read_in_progress = 0;
+ buffer->read_in_progress = false;
if (buffer->write_in_progress) {
- buffer->write_in_progress = 0;
-
- len = bin_attr->write(item, buffer->bin_buffer,
- buffer->bin_buffer_size);
-
+ struct configfs_fragment *frag = to_frag(file);
+ buffer->write_in_progress = false;
+
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead) {
+ /* result of ->release() is ignored */
+ buffer->bin_attr->write(buffer->item,
+ buffer->bin_buffer,
+ buffer->bin_buffer_size);
+ }
+ up_read(&frag->frag_sem);
/* vfree on NULL is safe */
vfree(buffer->bin_buffer);
buffer->bin_buffer = NULL;
@@ -503,10 +505,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *filp)
buffer->needs_read_fill = 1;
}
- ret = configfs_release(inode, filp);
- if (len < 0)
- return len;
- return ret;
+ configfs_release(inode, file);
+ return 0;
}
@@ -541,7 +541,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
- CONFIGFS_ITEM_ATTR);
+ CONFIGFS_ITEM_ATTR, parent_sd->s_frag);
inode_unlock(d_inode(dir));
return error;
@@ -563,7 +563,7 @@ int configfs_create_bin_file(struct config_item *item,
inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
- CONFIGFS_ITEM_BIN_ATTR);
+ CONFIGFS_ITEM_BIN_ATTR, parent_sd->s_frag);
inode_unlock(dir->d_inode);
return error;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 9993cdb81e7d..147a6b779ab9 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -157,11 +157,42 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
!type->ct_item_ops->allow_link)
goto out_put;
+ /*
+ * This is really sick. What they wanted was a hybrid of
+ * link(2) and symlink(2) - they wanted the target resolved
+ * at syscall time (as link(2) would've done), be a directory
+ * (which link(2) would've refused to do) *AND* be a deep
+ * fucking magic, making the target busy from rmdir POV.
+ * symlink(2) is nothing of that sort, and the locking it
+ * gets matches the normal symlink(2) semantics. Without
+ * attempts to resolve the target (which might very well
+ * not even exist yet) done prior to locking the parent
+ * directory. This perversion, OTOH, needs to resolve
+ * the target, which would lead to obvious deadlocks if
+ * attempted with any directories locked.
+ *
+ * Unfortunately, that garbage is userland ABI and we should've
+ * said "no" back in 2005. Too late now, so we get to
+ * play very ugly games with locking.
+ *
+ * Try *ANYTHING* of that sort in new code, and you will
+ * really regret it. Just ask yourself - what could a BOFH
+ * do to me and do I want to find it out first-hand?
+ *
+ * AV, a thoroughly annoyed bastard.
+ */
+ inode_unlock(dir);
ret = get_target(symname, &path, &target_item, dentry->d_sb);
+ inode_lock(dir);
if (ret)
goto out_put;
- ret = type->ct_item_ops->allow_link(parent_item, target_item);
+ if (dentry->d_inode || d_unhashed(dentry))
+ ret = -EEXIST;
+ else
+ ret = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (!ret)
+ ret = type->ct_item_ops->allow_link(parent_item, target_item);
if (!ret) {
mutex_lock(&configfs_symlink_mutex);
ret = create_link(parent_item, target_item, dentry);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index daf2683f0655..f862ad19c714 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -157,7 +157,10 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
- BUG_ON(len == 0);
+ if (WARN_ON_ONCE(len <= 0))
+ return -EINVAL;
+ if (WARN_ON_ONCE(len % FS_CRYPTO_BLOCK_SIZE != 0))
+ return -EINVAL;
BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE);
BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE);
@@ -257,8 +260,6 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
struct page *ciphertext_page = page;
int err;
- BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0);
-
if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
/* with inplace-encryption we just encrypt the page */
err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page,
@@ -270,7 +271,8 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
return ciphertext_page;
}
- BUG_ON(!PageLocked(page));
+ if (WARN_ON_ONCE(!PageLocked(page)))
+ return ERR_PTR(-EINVAL);
ctx = fscrypt_get_ctx(inode, gfp_flags);
if (IS_ERR(ctx))
@@ -318,8 +320,9 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs, u64 lblk_num)
{
- if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
- BUG_ON(!PageLocked(page));
+ if (WARN_ON_ONCE(!PageLocked(page) &&
+ !(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)))
+ return -EINVAL;
return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page,
len, offs, GFP_NOFS);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index a120649beeca..d13a154c8424 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -81,6 +81,8 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
if (ret == -ENODATA) {
if (!S_ISDIR(inode->i_mode))
ret = -ENOTDIR;
+ else if (IS_DEADDIR(inode))
+ ret = -ENOENT;
else if (!inode->i_sb->s_cop->empty_dir(inode))
ret = -ENOTEMPTY;
else
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 610f72ae7ad6..9c8c9a09b4a6 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -807,6 +807,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_delete_debug_file(ls);
+ idr_destroy(&ls->ls_recover_idr);
kfree(ls->ls_recover_buf);
/*
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 3fda3832cf6a..0bc43b35d2c5 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -671,7 +671,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
int dlm_ls_start(struct dlm_ls *ls)
{
struct dlm_recover *rv, *rv_old;
- struct dlm_config_node *nodes;
+ struct dlm_config_node *nodes = NULL;
int error, count;
rv = kzalloc(sizeof(*rv), GFP_NOFS);
@@ -680,7 +680,7 @@ int dlm_ls_start(struct dlm_ls *ls)
error = dlm_config_nodes(ls->ls_name, &nodes, &count);
if (error < 0)
- goto fail;
+ goto fail_rv;
spin_lock(&ls->ls_recover_lock);
@@ -712,8 +712,9 @@ int dlm_ls_start(struct dlm_ls *ls)
return 0;
fail:
- kfree(rv);
kfree(nodes);
+ fail_rv:
+ kfree(rv);
return error;
}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 7cd24bccd4fe..37be29f21d04 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -38,10 +38,8 @@ int __init dlm_memory_init(void)
void dlm_memory_exit(void)
{
- if (lkb_cache)
- kmem_cache_destroy(lkb_cache);
- if (rsb_cache)
- kmem_cache_destroy(rsb_cache);
+ kmem_cache_destroy(lkb_cache);
+ kmem_cache_destroy(rsb_cache);
}
char *dlm_allocate_lvb(struct dlm_ls *ls)
@@ -86,8 +84,7 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
struct dlm_user_args *ua;
ua = lkb->lkb_ua;
if (ua) {
- if (ua->lksb.sb_lvbptr)
- kfree(ua->lksb.sb_lvbptr);
+ kfree(ua->lksb.sb_lvbptr);
kfree(ua);
}
}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d18e7a539f11..02de11695d0b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -25,6 +25,7 @@
#include "lvb_table.h"
#include "user.h"
#include "ast.h"
+#include "config.h"
static const char name_prefix[] = "dlm";
static const struct file_operations device_fops;
@@ -404,7 +405,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- error = dlm_new_lockspace(params->name, NULL, params->flags,
+ error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags,
DLM_USER_LVB_LEN, NULL, NULL, NULL,
&lockspace);
if (error)
@@ -702,7 +703,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat,
result.version[0] = DLM_DEVICE_VERSION_MAJOR;
result.version[1] = DLM_DEVICE_VERSION_MINOR;
result.version[2] = DLM_DEVICE_VERSION_PATCH;
- memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
+ memcpy(&result.lksb, &ua->lksb, offsetof(struct dlm_lksb, sb_lvbptr));
result.user_lksb = ua->user_lksb;
/* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 9d1823efff34..bd25ab837011 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1034,8 +1034,10 @@ int ecryptfs_read_and_validate_header_region(struct inode *inode)
rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
inode);
- if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
- return rc >= 0 ? -EINVAL : rc;
+ if (rc < 0)
+ return rc;
+ else if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+ return -EINVAL;
rc = ecryptfs_validate_marker(marker);
if (!rc)
ecryptfs_i_size_init(file_size, inode);
@@ -1397,8 +1399,10 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
ecryptfs_inode_to_lower(inode),
ECRYPTFS_XATTR_NAME, file_size,
ECRYPTFS_SIZE_AND_MARKER_BYTES);
- if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
- return rc >= 0 ? -EINVAL : rc;
+ if (rc < 0)
+ return rc;
+ else if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+ return -EINVAL;
rc = ecryptfs_validate_marker(marker);
if (!rc)
ecryptfs_i_size_init(file_size, inode);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bda65a730790..62d1dea85ef1 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -326,9 +326,9 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- struct inode *inode, *lower_inode = d_inode(lower_dentry);
+ struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ struct inode *inode, *lower_inode;
struct ecryptfs_dentry_info *dentry_info;
- struct vfsmount *lower_mnt;
int rc = 0;
dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
@@ -340,16 +340,23 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
return ERR_PTR(-ENOMEM);
}
- lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
fsstack_copy_attr_atime(d_inode(dentry->d_parent),
- d_inode(lower_dentry->d_parent));
+ d_inode(path->dentry));
BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
- dentry_info->lower_path.mnt = lower_mnt;
+ dentry_info->lower_path.mnt = mntget(path->mnt);
dentry_info->lower_path.dentry = lower_dentry;
- if (d_really_is_negative(lower_dentry)) {
+ /*
+ * negative dentry can go positive under us here - its parent is not
+ * locked. That's OK and that could happen just as we return from
+ * ecryptfs_lookup() anyway. Just need to be careful and fetch
+ * ->d_inode only once - it's not stable here.
+ */
+ lower_inode = READ_ONCE(lower_dentry->d_inode);
+
+ if (!lower_inode) {
/* We want to add because we couldn't find in lower */
d_add(dentry, NULL);
return NULL;
diff --git a/fs/exec.c b/fs/exec.c
index 0936b5a8199a..7def97f6aac2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1007,7 +1007,7 @@ static int exec_mmap(struct mm_struct *mm)
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
- mm_release(tsk, old_mm);
+ exec_mm_release(tsk, old_mm);
if (old_mm) {
sync_mm_rss(old_mm);
@@ -1808,7 +1808,7 @@ static int do_execveat_common(int fd, struct filename *filename,
current->in_execve = 0;
membarrier_execve(current);
acct_update_integrals(current);
- task_numa_free(current);
+ task_numa_free(current, false);
free_bprm(bprm);
kfree(pathbuf);
putname(filename);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index c9ec652e2fcd..881d5798a181 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -702,21 +702,18 @@ out:
/*
* Read the superblock from the OSD and fill in the fields
*/
-static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+static int exofs_fill_super(struct super_block *sb,
+ struct exofs_mountopt *opts,
+ struct exofs_sb_info *sbi,
+ int silent)
{
struct inode *root;
- struct exofs_mountopt *opts = data;
- struct exofs_sb_info *sbi; /*extended info */
struct osd_dev *od; /* Master device */
struct exofs_fscb fscb; /*on-disk superblock info */
struct ore_comp comp;
unsigned table_count;
int ret;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
/* use mount options to fill superblock */
if (opts->is_osdname) {
struct osd_dev_info odi = {.systemid_len = 0};
@@ -860,7 +857,9 @@ static struct dentry *exofs_mount(struct file_system_type *type,
int flags, const char *dev_name,
void *data)
{
+ struct super_block *s;
struct exofs_mountopt opts;
+ struct exofs_sb_info *sbi;
int ret;
ret = parse_options(data, &opts);
@@ -869,9 +868,31 @@ static struct dentry *exofs_mount(struct file_system_type *type,
return ERR_PTR(ret);
}
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi) {
+ kfree(opts.dev_name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ s = sget(type, NULL, set_anon_super, flags, NULL);
+
+ if (IS_ERR(s)) {
+ kfree(opts.dev_name);
+ kfree(sbi);
+ return ERR_CAST(s);
+ }
+
if (!opts.dev_name)
opts.dev_name = dev_name;
- return mount_nodev(type, flags, &opts, exofs_fill_super);
+
+
+ ret = exofs_fill_super(s, &opts, sbi, flags & SB_SILENT ? 1 : 0);
+ if (ret) {
+ deactivate_locked_super(s);
+ return ERR_PTR(ret);
+ }
+ s->s_flags |= SB_ACTIVE;
+ return dget(s->s_root);
}
/*
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c22cc9d2a5c9..a561ae17cf43 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -508,26 +508,33 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
* inode is actually connected to the parent.
*/
err = exportfs_get_name(mnt, target_dir, nbuf, result);
- if (!err) {
- inode_lock(target_dir->d_inode);
- nresult = lookup_one_len(nbuf, target_dir,
- strlen(nbuf));
- inode_unlock(target_dir->d_inode);
- if (!IS_ERR(nresult)) {
- if (nresult->d_inode) {
- dput(result);
- result = nresult;
- } else
- dput(nresult);
- }
+ if (err) {
+ dput(target_dir);
+ goto err_result;
}
+ inode_lock(target_dir->d_inode);
+ nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+ if (!IS_ERR(nresult)) {
+ if (unlikely(nresult->d_inode != result->d_inode)) {
+ dput(nresult);
+ nresult = ERR_PTR(-ESTALE);
+ }
+ }
+ inode_unlock(target_dir->d_inode);
/*
* At this point we are done with the parent, but it's pinned
* by the child dentry anyway.
*/
dput(target_dir);
+ if (IS_ERR(nresult)) {
+ err = PTR_ERR(nresult);
+ goto err_result;
+ }
+ dput(result);
+ result = nresult;
+
/*
* And finally make sure the dentry is actually acceptable
* to NFSD.
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a7c87d593083..31c5a7b5f1f3 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -699,10 +699,13 @@ static int ext2_get_blocks(struct inode *inode,
if (!partial) {
count++;
mutex_unlock(&ei->truncate_mutex);
- if (err)
- goto cleanup;
goto got_it;
}
+
+ if (err) {
+ mutex_unlock(&ei->truncate_mutex);
+ goto cleanup;
+ }
}
/*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index dc676714454a..446b6c375b6f 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -107,7 +107,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct buffer_head *bh = NULL;
- int dir_has_error = 0;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
if (ext4_encrypted_inode(inode)) {
@@ -143,8 +142,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return err;
}
- offset = ctx->pos & (sb->s_blocksize - 1);
-
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
@@ -153,9 +150,18 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
goto errout;
}
cond_resched();
+ offset = ctx->pos & (sb->s_blocksize - 1);
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err == 0) {
+ /* m_len should never be zero but let's avoid
+ * an infinite loop if it somehow is */
+ if (map.m_len == 0)
+ map.m_len = 1;
+ ctx->pos += map.m_len * sb->s_blocksize;
+ continue;
+ }
if (err > 0) {
pgoff_t index = map.m_pblk >>
(PAGE_SHIFT - inode->i_blkbits);
@@ -174,13 +180,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
}
if (!bh) {
- if (!dir_has_error) {
- EXT4_ERROR_FILE(file, 0,
- "directory contains a "
- "hole at offset %llu",
- (unsigned long long) ctx->pos);
- dir_has_error = 1;
- }
/* corrupt size? Maybe no more blocks to read */
if (ctx->pos > inode->i_blocks << 9)
break;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 1437f62d068c..116401578401 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -364,20 +364,20 @@ static inline int ext4_journal_force_commit(journal_t *journal)
}
static inline int ext4_jbd2_inode_add_write(handle_t *handle,
- struct inode *inode)
+ struct inode *inode, loff_t start_byte, loff_t length)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_inode_add_write(handle,
- EXT4_I(inode)->jinode);
+ return jbd2_journal_inode_ranged_write(handle,
+ EXT4_I(inode)->jinode, start_byte, length);
return 0;
}
static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
- struct inode *inode)
+ struct inode *inode, loff_t start_byte, loff_t length)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_inode_add_wait(handle,
- EXT4_I(inode)->jinode);
+ return jbd2_journal_inode_ranged_wait(handle,
+ EXT4_I(inode)->jinode, start_byte, length);
return 0;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 01f44364c547..20d68554680f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3756,8 +3756,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
* illegal.
*/
if (ee_block != map->m_lblk || ee_len > map->m_len) {
-#ifdef EXT4_DEBUG
- ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+#ifdef CONFIG_EXT4_DEBUG
+ ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
" len %u; IO logical block %llu, len %u",
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 44966b272216..4ede0af9d6fe 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -163,6 +163,10 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
+
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 862766a1b080..11bc4c69bf16 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -195,7 +195,12 @@ void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
int err;
- int extra_credits = 3;
+ /*
+ * Credits for final inode cleanup and freeing:
+ * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+ * (xattr block freeing), bitmap, group descriptor (inode freeing)
+ */
+ int extra_credits = 6;
struct ext4_xattr_inode_array *ea_inode_array = NULL;
trace_ext4_evict_inode(inode);
@@ -251,8 +256,12 @@ void ext4_evict_inode(struct inode *inode)
if (!IS_NOQUOTA(inode))
extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+ /*
+ * Block bitmap, group descriptor, and inode are accounted in both
+ * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+ */
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode)+extra_credits);
+ ext4_blocks_for_truncate(inode) + extra_credits - 3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -728,10 +737,16 @@ out_sem:
!(flags & EXT4_GET_BLOCKS_ZERO) &&
!ext4_is_quota_file(inode) &&
ext4_should_order_data(inode)) {
+ loff_t start_byte =
+ (loff_t)map->m_lblk << inode->i_blkbits;
+ loff_t length = (loff_t)map->m_len << inode->i_blkbits;
+
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
- ret = ext4_jbd2_inode_add_wait(handle, inode);
+ ret = ext4_jbd2_inode_add_wait(handle, inode,
+ start_byte, length);
else
- ret = ext4_jbd2_inode_add_write(handle, inode);
+ ret = ext4_jbd2_inode_add_write(handle, inode,
+ start_byte, length);
if (ret)
return ret;
}
@@ -4004,7 +4019,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
err = 0;
mark_buffer_dirty(bh);
if (ext4_should_order_data(inode))
- err = ext4_jbd2_inode_add_write(handle, inode);
+ err = ext4_jbd2_inode_add_write(handle, inode, from,
+ length);
}
unlock:
@@ -4170,6 +4186,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
trace_ext4_punch_hole(inode, offset, length, 0);
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ if (ext4_has_inline_data(inode)) {
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_convert_inline_data(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ if (ret)
+ return ret;
+ }
+
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -5280,11 +5305,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * All buffers in the last page remain valid? Then there's nothing to
- * do. We do the check mainly to optimize the common PAGE_SIZE ==
- * blocksize case
+ * If the page is fully truncated, we don't need to wait for any commit
+ * (and we even should not as __ext4_journalled_invalidatepage() may
+ * strip all buffers from the page but keep the page dirty which can then
+ * confuse e.g. concurrent ext4_writepage() seeing dirty page without
+ * buffers). Also we don't need to wait for any commit if all buffers in
+ * the page remain valid. This is most beneficial for the common case of
+ * blocksize == PAGESIZE.
*/
- if (offset > PAGE_SIZE - i_blocksize(inode))
+ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
page = find_lock_page(inode->i_mapping,
@@ -5341,6 +5370,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
+ if (unlikely(IS_APPEND(inode) &&
+ (ia_valid & (ATTR_MODE | ATTR_UID |
+ ATTR_GID | ATTR_TIMES_SET))))
+ return -EPERM;
+
error = setattr_prepare(dentry, attr);
if (error)
return error;
@@ -5727,8 +5764,23 @@ static int __ext4_expand_extra_isize(struct inode *inode,
{
struct ext4_inode *raw_inode;
struct ext4_xattr_ibody_header *header;
+ unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
int error;
+ /* this was checked at iget time, but double check for good measure */
+ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+ (ei->i_extra_isize & 3)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+ ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ return -EFSCORRUPTED;
+ }
+ if ((new_extra_isize < ei->i_extra_isize) ||
+ (new_extra_isize < 4) ||
+ (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+ return -EINVAL; /* Should never happen */
+
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
@@ -6045,6 +6097,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
get_block_t *get_block;
int retries = 0;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return VM_FAULT_SIGBUS;
+
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d2efc0cb8f31..82e118e9e50b 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -210,6 +210,29 @@ static int uuid_is_zero(__u8 u[16])
}
#endif
+/*
+ * If immutable is set and we are not clearing it, we're not allowed to change
+ * anything else in the inode. Don't error out if we're only trying to set
+ * immutable on an immutable file.
+ */
+static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
+ unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int oldflags = ei->i_flags;
+
+ if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL))
+ return 0;
+
+ if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL))
+ return -EPERM;
+ if (ext4_has_feature_project(inode->i_sb) &&
+ __kprojid_val(ei->i_projid) != new_projid)
+ return -EPERM;
+
+ return 0;
+}
+
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
{
@@ -263,6 +286,20 @@ static int ext4_ioctl_setflags(struct inode *inode,
goto flags_out;
}
+ /*
+ * Wait for all pending directio and then flush all the dirty pages
+ * for this file. The flush marks all the pages readonly, so any
+ * subsequent attempt to write to the file (particularly mmap pages)
+ * will come through the filesystem and fail.
+ */
+ if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) &&
+ (flags & EXT4_IMMUTABLE_FL)) {
+ inode_dio_wait(inode);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto flags_out;
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -653,7 +690,11 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return err;
inode_lock(inode);
- err = ext4_ioctl_setflags(inode, flags);
+ err = ext4_ioctl_check_immutable(inode,
+ from_kprojid(&init_user_ns, ei->i_projid),
+ flags);
+ if (!err)
+ err = ext4_ioctl_setflags(inode, flags);
inode_unlock(inode);
mnt_drop_write_file(filp);
return err;
@@ -1061,6 +1102,9 @@ resizefs_out:
goto out;
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
(flags & EXT4_FL_XFLAG_VISIBLE);
+ err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags);
+ if (err)
+ goto out;
err = ext4_ioctl_setflags(inode, flags);
if (err)
goto out;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index cd8d481e0c48..ef60f2e92da6 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -400,7 +400,8 @@ data_copy:
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
- *err = ext4_jbd2_inode_add_write(handle, orig_inode);
+ *err = ext4_jbd2_inode_add_write(handle, orig_inode,
+ (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
unlock_pages:
unlock_page(pagep[0]);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 070660cb5b91..b4e0c270def4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -80,8 +80,18 @@ static struct buffer_head *ext4_append(handle_t *handle,
static int ext4_dx_csum_verify(struct inode *inode,
struct ext4_dir_entry *dirent);
+/*
+ * Hints to ext4_read_dirblock regarding whether we expect a directory
+ * block being read to be an index block, or a block containing
+ * directory entries (and if the latter, whether it was found via a
+ * logical block in an htree index block). This is used to control
+ * what sort of sanity checkinig ext4_read_dirblock() will do on the
+ * directory block read from the storage device. EITHER will means
+ * the caller doesn't know what kind of directory block will be read,
+ * so no specific verification will be done.
+ */
typedef enum {
- EITHER, INDEX, DIRENT
+ EITHER, INDEX, DIRENT, DIRENT_HTREE
} dirblock_type_t;
#define ext4_read_dirblock(inode, block, type) \
@@ -107,11 +117,14 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
return bh;
}
- if (!bh) {
+ if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
ext4_error_inode(inode, func, line, block,
- "Directory hole found");
+ "Directory hole found for htree %s block",
+ (type == INDEX) ? "index" : "leaf");
return ERR_PTR(-EFSCORRUPTED);
}
+ if (!bh)
+ return NULL;
dirent = (struct ext4_dir_entry *) bh->b_data;
/* Determine whether or not we have an index block */
if (is_dx(inode)) {
@@ -978,7 +991,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
- bh = ext4_read_dirblock(dir, block, DIRENT);
+ bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -1508,7 +1521,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
return (struct buffer_head *) frame;
do {
block = dx_get_block(frame->at);
- bh = ext4_read_dirblock(dir, block, DIRENT);
+ bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
if (IS_ERR(bh))
goto errout;
@@ -2088,6 +2101,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (bh == NULL) {
+ bh = ext4_bread(handle, dir, block,
+ EXT4_GET_BLOCKS_CREATE);
+ goto add_to_new_block;
+ }
if (IS_ERR(bh)) {
retval = PTR_ERR(bh);
bh = NULL;
@@ -2108,6 +2126,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
brelse(bh);
}
bh = ext4_append(handle, dir, &block);
+add_to_new_block:
if (IS_ERR(bh)) {
retval = PTR_ERR(bh);
bh = NULL;
@@ -2152,7 +2171,7 @@ again:
return PTR_ERR(frame);
entries = frame->entries;
at = frame->at;
- bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+ bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
bh = NULL;
@@ -2274,7 +2293,7 @@ again:
dxroot->info.indirect_levels += 1;
dxtrace(printk(KERN_DEBUG
"Creating %d level index...\n",
- info->indirect_levels));
+ dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
@@ -2700,7 +2719,10 @@ bool ext4_empty_dir(struct inode *inode)
EXT4_ERROR_INODE(inode, "invalid size");
return true;
}
- bh = ext4_read_dirblock(inode, 0, EITHER);
+ /* The first directory block must not be a hole,
+ * so treat it as DIRENT_HTREE
+ */
+ bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh))
return true;
@@ -2722,6 +2744,10 @@ bool ext4_empty_dir(struct inode *inode)
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
bh = ext4_read_dirblock(inode, lblock, EITHER);
+ if (bh == NULL) {
+ offset += sb->s_blocksize;
+ continue;
+ }
if (IS_ERR(bh))
return true;
de = (struct ext4_dir_entry_2 *) bh->b_data;
@@ -3039,18 +3065,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- if (inode->i_nlink == 0) {
- ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
- set_nlink(inode, 1);
- }
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
- drop_nlink(inode);
+ if (inode->i_nlink == 0)
+ ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+ dentry->d_name.len, dentry->d_name.name);
+ else
+ drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
@@ -3292,7 +3317,10 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
- bh = ext4_read_dirblock(inode, 0, EITHER);
+ /* The first directory block must not be a hole, so
+ * treat it as DIRENT_HTREE
+ */
+ bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh)) {
*retval = PTR_ERR(bh);
return NULL;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61d07608577e..1a0a56647974 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3458,12 +3458,15 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ unsigned def_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
- /* determine the minimum size of new large inodes, if present */
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
- sbi->s_want_extra_isize == 0) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
+ if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
+ sbi->s_want_extra_isize = 0;
+ return;
+ }
+ if (sbi->s_want_extra_isize < 4) {
+ sbi->s_want_extra_isize = def_extra_isize;
if (ext4_has_feature_extra_isize(sb)) {
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_want_extra_isize))
@@ -3476,10 +3479,10 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb)
}
}
/* Check if enough inode space is available */
- if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
- sbi->s_inode_size) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
+ if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
+ (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+ sbi->s_inode_size)) {
+ sbi->s_want_extra_isize = def_extra_isize;
ext4_msg(sb, KERN_INFO,
"required extra inode space not available");
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 624817eeb25e..170423ff2721 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -793,6 +793,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
unsigned int cp_blks = 1 + __cp_payload(sbi);
block_t cp_blk_no;
int i;
+ int err;
sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL);
if (!sbi->ckpt)
@@ -819,6 +820,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
} else if (cp2) {
cur_page = cp2;
} else {
+ err = -EFSCORRUPTED;
goto fail_no_cp;
}
@@ -831,8 +833,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
sbi->cur_cp_pack = 2;
/* Sanity checking of checkpoint */
- if (sanity_check_ckpt(sbi))
+ if (sanity_check_ckpt(sbi)) {
+ err = -EFSCORRUPTED;
goto free_fail_no_cp;
+ }
if (cp_blks <= 1)
goto done;
@@ -860,7 +864,7 @@ free_fail_no_cp:
f2fs_put_page(cp2, 1);
fail_no_cp:
kfree(sbi->ckpt);
- return -EINVAL;
+ return err;
}
static void __add_dirty_inode(struct inode *inode, enum inode_type type)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 113d1cd55119..ac3fa4bbed2d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -376,7 +376,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
- return -EFAULT;
+ return -EFSCORRUPTED;
trace_f2fs_submit_page_bio(page, fio);
f2fs_trace_ios(fio, 0);
@@ -959,7 +959,7 @@ next_block:
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) {
- err = -EFAULT;
+ err = -EFSCORRUPTED;
goto sync_out;
}
@@ -1425,7 +1425,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC))
- return -EFAULT;
+ return -EFSCORRUPTED;
ipu_force = true;
fio->need_lock = LOCK_DONE;
@@ -1445,13 +1445,14 @@ int do_write_data_page(struct f2fs_io_info *fio)
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
ClearPageUptodate(page);
+ clear_cold_data(page);
goto out_writepage;
}
got_it:
if (__is_valid_data_blkaddr(fio->old_blkaddr) &&
!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC)) {
- err = -EFAULT;
+ err = -EFSCORRUPTED;
goto out_writepage;
}
/*
@@ -1597,8 +1598,10 @@ done:
out:
inode_dec_dirty_pages(inode);
- if (err)
+ if (err) {
ClearPageUptodate(page);
+ clear_cold_data(page);
+ }
if (wbc->for_reclaim) {
f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA);
@@ -2158,6 +2161,8 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
}
}
+ clear_cold_data(page);
+
/* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
return drop_inmem_page(inode, page);
@@ -2176,6 +2181,7 @@ int f2fs_release_page(struct page *page, gfp_t wait)
if (IS_ATOMIC_WRITTEN_PAGE(page))
return 0;
+ clear_cold_data(page);
set_page_private(page, 0);
ClearPagePrivate(page);
return 1;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index c0c933ad43c8..4abefd841b6c 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -745,6 +745,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
+ clear_cold_data(page);
inode_dec_dirty_pages(dir);
remove_dirty_inode(dir);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6caae471c1a4..268409cee1c3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3089,3 +3089,7 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
}
#endif
+
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1b1792199445..a90173b856f6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1593,7 +1593,7 @@ static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags)
inode->i_ctime = current_time(inode);
f2fs_set_inode_flags(inode);
- f2fs_mark_inode_dirty_sync(inode, false);
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -2029,7 +2029,7 @@ do_more:
}
ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start));
- range.start += sbi->blocks_per_seg;
+ range.start += BLKS_PER_SEC(sbi);
if (range.start <= end)
goto do_more;
out:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ceb6023786bd..c2e4c6ce2cf7 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -330,8 +330,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_cost = get_max_cost(sbi, &p);
if (*result != NULL_SEGNO) {
- if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
- get_valid_blocks(sbi, *result, false) &&
+ if (get_valid_blocks(sbi, *result, false) &&
!sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
p.min_segno = *result;
goto out;
@@ -952,9 +951,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
GET_SUM_BLOCK(sbi, segno));
f2fs_put_page(sum_page, 0);
- if (get_valid_blocks(sbi, segno, false) == 0 ||
- !PageUptodate(sum_page) ||
- unlikely(f2fs_cp_error(sbi)))
+ if (get_valid_blocks(sbi, segno, false) == 0)
+ goto freed;
+ if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
goto next;
sum = page_address(sum_page);
@@ -981,6 +980,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
stat_inc_seg_count(sbi, type, gc_type);
+freed:
if (gc_type == FG_GC &&
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
@@ -1091,7 +1091,7 @@ stop:
put_gc_inode(&gc_list);
- if (sync)
+ if (sync && !ret)
ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 506e365cf903..8906f6381b1a 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -135,7 +135,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
"%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, "
"run fsck to fix.",
__func__, dn->inode->i_ino, dn->data_blkaddr);
- return -EINVAL;
+ return -EFSCORRUPTED;
}
f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
@@ -382,7 +382,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
"%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, "
"run fsck to fix.",
__func__, dir->i_ino, dn.data_blkaddr);
- err = -EINVAL;
+ err = -EFSCORRUPTED;
goto out;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index e02ed16bc35c..c6d0687f00fe 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -70,7 +70,7 @@ static int __written_first_block(struct f2fs_sb_info *sbi,
if (!__is_valid_data_blkaddr(addr))
return 1;
if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC))
- return -EFAULT;
+ return -EFSCORRUPTED;
return 0;
}
@@ -300,7 +300,7 @@ static int do_read_inode(struct inode *inode)
if (!sanity_check_inode(inode, node_page)) {
f2fs_put_page(node_page, 1);
- return -EINVAL;
+ return -EFSCORRUPTED;
}
/* check data exist */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 12060fbfbb05..e7b8e2b35e22 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -39,7 +39,7 @@ int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_msg(sbi->sb, KERN_WARNING,
"%s: out-of-range nid=%x, run fsck to fix.",
__func__, nid);
- return -EINVAL;
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -1195,7 +1195,7 @@ repeat:
}
if (!f2fs_inode_chksum_verify(sbi, page)) {
- err = -EBADMSG;
+ err = -EFSBADCRC;
goto out_err;
}
page_hit:
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 65a82c5bafcb..2eef266b656b 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -201,6 +201,21 @@ static void recover_inode(struct inode *inode, struct page *page)
char *name;
inode->i_mode = le16_to_cpu(raw->i_mode);
+ i_uid_write(inode, le32_to_cpu(raw->i_uid));
+ i_gid_write(inode, le32_to_cpu(raw->i_gid));
+
+ if (raw->i_inline & F2FS_EXTRA_ATTR) {
+ if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) &&
+ F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize),
+ i_projid)) {
+ projid_t i_projid;
+
+ i_projid = (projid_t)le32_to_cpu(raw->i_projid);
+ F2FS_I(inode)->i_projid =
+ make_kprojid(&init_user_ns, i_projid);
+ }
+ }
+
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
@@ -212,6 +227,8 @@ static void recover_inode(struct inode *inode, struct page *page)
F2FS_I(inode)->i_advise = raw->i_advise;
F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
+ f2fs_mark_inode_dirty_sync(inode, true);
+
if (file_enc_name(inode))
name = "<encrypted>";
else
@@ -451,7 +468,7 @@ retry_dn:
"Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u",
inode->i_ino, ofs_of_node(dn.node_page),
ofs_of_node(page));
- err = -EFAULT;
+ err = -EFSCORRUPTED;
goto err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 70bd15cadb44..2cd0d126ef8f 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -251,8 +251,10 @@ retry:
}
next:
/* we don't need to invalidate this in the sccessful status */
- if (drop || recover)
+ if (drop || recover) {
ClearPageUptodate(page);
+ clear_cold_data(page);
+ }
set_page_private(page, 0);
ClearPagePrivate(page);
f2fs_put_page(page, 1);
@@ -2216,6 +2218,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
f2fs_msg(sbi->sb, KERN_WARNING,
"Found FS corruption, run fsck to fix.");
+ err = -EFSCORRUPTED;
goto out;
}
@@ -3309,7 +3312,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
"Wrong journal entry on segno %u",
start);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = -EINVAL;
+ err = -EFSCORRUPTED;
break;
}
@@ -3350,7 +3353,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
"SIT is corrupted node# %u vs %u",
total_node_blocks, valid_node_count(sbi));
set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = -EINVAL;
+ err = -EFSCORRUPTED;
}
return err;
@@ -3439,6 +3442,41 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
return init_victim_secmap(sbi);
}
+static int sanity_check_curseg(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ /*
+ * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr;
+ * In LFS curseg, all blkaddr after .next_blkoff should be unused.
+ */
+ for (i = 0; i < NO_CHECK_TYPE; i++) {
+ struct curseg_info *curseg = CURSEG_I(sbi, i);
+ struct seg_entry *se = get_seg_entry(sbi, curseg->segno);
+ unsigned int blkofs = curseg->next_blkoff;
+
+ if (f2fs_test_bit(blkofs, se->cur_valid_map))
+ goto out;
+
+ if (curseg->alloc_type == SSR)
+ continue;
+
+ for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) {
+ if (!f2fs_test_bit(blkofs, se->cur_valid_map))
+ continue;
+out:
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Current segment's next free block offset is "
+ "inconsistent with bitmap, logtype:%u, "
+ "segno:%u, type:%u, next_blkoff:%u, blkofs:%u",
+ i, curseg->segno, curseg->alloc_type,
+ curseg->next_blkoff, blkofs);
+ return -EFSCORRUPTED;
+ }
+ }
+ return 0;
+}
+
/*
* Update min, max modified time for cost-benefit GC algorithm
*/
@@ -3532,6 +3570,10 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
if (err)
return err;
+ err = sanity_check_curseg(sbi);
+ if (err)
+ return err;
+
init_min_max_mtime(sbi);
return 0;
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index e3d8826c5113..0d46e936d54e 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -665,7 +665,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
"Mismatch valid blocks %d vs. %d",
GET_SIT_VBLOCKS(raw_sit), valid_blocks);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- return -EINVAL;
+ return -EFSCORRUPTED;
}
/* check segment usage, and check boundary of a given segment number */
@@ -675,7 +675,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
"Wrong valid blocks %d or segno %u",
GET_SIT_VBLOCKS(raw_sit), segno);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- return -EINVAL;
+ return -EFSCORRUPTED;
}
return 0;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 4c169ba50c0f..e4aabfc21bd4 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1523,6 +1523,12 @@ void f2fs_quota_off_umount(struct super_block *sb)
set_sbi_flag(F2FS_SB(sb), SBI_NEED_FSCK);
}
}
+ /*
+ * In case of checkpoint=disable, we must flush quota blocks.
+ * This can cause NULL exception for node_inode in end_io, since
+ * put_super already dropped it.
+ */
+ sync_filesystem(sb);
}
int f2fs_get_projid(struct inode *inode, kprojid_t *projid)
@@ -1814,11 +1820,11 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
struct super_block *sb = sbi->sb;
unsigned int blocksize;
- if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
+ if (le32_to_cpu(raw_super->magic) != F2FS_SUPER_MAGIC) {
f2fs_msg(sb, KERN_INFO,
"Magic Mismatch, valid(0x%x) - read(0x%x)",
F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
- return 1;
+ return -EINVAL;
}
/* Currently, support only 4KB page cache size */
@@ -1826,7 +1832,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid page_cache_size (%lu), supports only 4KB\n",
PAGE_SIZE);
- return 1;
+ return -EFSCORRUPTED;
}
/* Currently, support only 4KB block size */
@@ -1835,7 +1841,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid blocksize (%u), supports only 4KB\n",
blocksize);
- return 1;
+ return -EFSCORRUPTED;
}
/* check log blocks per segment */
@@ -1843,7 +1849,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid log blocks per segment (%u)\n",
le32_to_cpu(raw_super->log_blocks_per_seg));
- return 1;
+ return -EFSCORRUPTED;
}
/* Currently, support 512/1024/2048/4096 bytes sector size */
@@ -1853,7 +1859,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
F2FS_MIN_LOG_SECTOR_SIZE) {
f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
le32_to_cpu(raw_super->log_sectorsize));
- return 1;
+ return -EFSCORRUPTED;
}
if (le32_to_cpu(raw_super->log_sectors_per_block) +
le32_to_cpu(raw_super->log_sectorsize) !=
@@ -1862,7 +1868,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
"Invalid log sectors per block(%u) log sectorsize(%u)",
le32_to_cpu(raw_super->log_sectors_per_block),
le32_to_cpu(raw_super->log_sectorsize));
- return 1;
+ return -EFSCORRUPTED;
}
segment_count = le32_to_cpu(raw_super->segment_count);
@@ -1878,7 +1884,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid segment count (%u)",
segment_count);
- return 1;
+ return -EFSCORRUPTED;
}
if (total_sections > segment_count ||
@@ -1887,35 +1893,35 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid segment/section count (%u, %u x %u)",
segment_count, total_sections, segs_per_sec);
- return 1;
+ return -EFSCORRUPTED;
}
if ((segment_count / segs_per_sec) < total_sections) {
f2fs_msg(sb, KERN_INFO,
"Small segment_count (%u < %u * %u)",
segment_count, segs_per_sec, total_sections);
- return 1;
+ return -EFSCORRUPTED;
}
if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) {
f2fs_msg(sb, KERN_INFO,
"Wrong segment_count / block_count (%u > %llu)",
segment_count, le64_to_cpu(raw_super->block_count));
- return 1;
+ return -EFSCORRUPTED;
}
if (secs_per_zone > total_sections || !secs_per_zone) {
f2fs_msg(sb, KERN_INFO,
"Wrong secs_per_zone / total_sections (%u, %u)",
secs_per_zone, total_sections);
- return 1;
+ return -EFSCORRUPTED;
}
if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION) {
f2fs_msg(sb, KERN_INFO,
"Corrupted extension count (%u > %u)",
le32_to_cpu(raw_super->extension_count),
F2FS_MAX_EXTENSION);
- return 1;
+ return -EFSCORRUPTED;
}
if (le32_to_cpu(raw_super->cp_payload) >
@@ -1924,7 +1930,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
"Insane cp_payload (%u > %u)",
le32_to_cpu(raw_super->cp_payload),
blocks_per_seg - F2FS_CP_PACKS);
- return 1;
+ return -EFSCORRUPTED;
}
/* check reserved ino info */
@@ -1936,12 +1942,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
le32_to_cpu(raw_super->node_ino),
le32_to_cpu(raw_super->meta_ino),
le32_to_cpu(raw_super->root_ino));
- return 1;
+ return -EFSCORRUPTED;
}
/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
if (sanity_check_area_boundary(sbi, bh))
- return 1;
+ return -EFSCORRUPTED;
return 0;
}
@@ -2027,11 +2033,11 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
}
}
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
- for (j = i; j < NR_CURSEG_DATA_TYPE; j++) {
+ for (j = 0; j < NR_CURSEG_DATA_TYPE; j++) {
if (le32_to_cpu(ckpt->cur_node_segno[i]) ==
le32_to_cpu(ckpt->cur_data_segno[j])) {
f2fs_msg(sbi->sb, KERN_ERR,
- "Data segment (%u) and Data segment (%u)"
+ "Node segment (%u) and Data segment (%u)"
" has the same segno: %u", i, j,
le32_to_cpu(ckpt->cur_node_segno[i]));
return 1;
@@ -2117,8 +2123,12 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
if (err)
return err;
- return percpu_counter_init(&sbi->total_valid_inode_count, 0,
+ err = percpu_counter_init(&sbi->total_valid_inode_count, 0,
GFP_KERNEL);
+ if (err)
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
+
+ return err;
}
#ifdef CONFIG_BLK_DEV_ZONED
@@ -2216,11 +2226,11 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
}
/* sanity checking of raw super */
- if (sanity_check_raw_super(sbi, bh)) {
+ err = sanity_check_raw_super(sbi, bh);
+ if (err) {
f2fs_msg(sb, KERN_ERR,
"Can't find valid F2FS filesystem in %dth superblock",
block + 1);
- err = -EINVAL;
brelse(bh);
continue;
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 81cecbe6d7cf..971e369517a7 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1097,8 +1097,11 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
err = -ENOMEM;
goto error;
}
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[n]);
memset(bhs[n]->b_data, 0, sb->s_blocksize);
set_buffer_uptodate(bhs[n]);
+ unlock_buffer(bhs[n]);
mark_buffer_dirty_inode(bhs[n], dir);
n++;
@@ -1155,6 +1158,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
de = (struct msdos_dir_entry *)bhs[0]->b_data;
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[0]);
/* filling the new directory slots ("." and ".." entries) */
memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
@@ -1177,6 +1182,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
de[0].size = de[1].size = 0;
memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
set_buffer_uptodate(bhs[0]);
+ unlock_buffer(bhs[0]);
mark_buffer_dirty_inode(bhs[0], dir);
err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
@@ -1234,11 +1240,14 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
/* fill the directory entry */
copy = min(size, sb->s_blocksize);
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[n]);
memcpy(bhs[n]->b_data, slots, copy);
- slots += copy;
- size -= copy;
set_buffer_uptodate(bhs[n]);
+ unlock_buffer(bhs[n]);
mark_buffer_dirty_inode(bhs[n], dir);
+ slots += copy;
+ size -= copy;
if (!size)
break;
n++;
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 9635df94db7d..24ed1f4e48ae 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -389,8 +389,11 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
err = -ENOMEM;
goto error;
}
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(c_bh);
memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
set_buffer_uptodate(c_bh);
+ unlock_buffer(c_bh);
mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
if (sb->s_flags & MS_SYNCHRONOUS)
err = sync_dirty_buffer(c_bh);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4d561ee08d05..6398bd8a066e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -582,10 +582,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
spin_unlock(&inode->i_lock);
/*
- * A dying wb indicates that the memcg-blkcg mapping has changed
- * and a new wb is already serving the memcg. Switch immediately.
+ * A dying wb indicates that either the blkcg associated with the
+ * memcg changed or the associated memcg is dying. In the first
+ * case, a replacement wb should already be available and we should
+ * refresh the wb immediately. In the second case, trying to
+ * refresh will keep failing.
*/
- if (unlikely(wb_dying(wbc->wb)))
+ if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
@@ -721,6 +724,7 @@ void wbc_detach_inode(struct writeback_control *wbc)
void wbc_account_io(struct writeback_control *wbc, struct page *page,
size_t bytes)
{
+ struct cgroup_subsys_state *css;
int id;
/*
@@ -732,7 +736,12 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
if (!wbc->wb)
return;
- id = mem_cgroup_css_from_page(page)->id;
+ css = mem_cgroup_css_from_page(page);
+ /* dead cgroups shouldn't contribute to inode ownership arbitration */
+ if (!(css->flags & CSS_ONLINE))
+ return;
+
+ id = css->id;
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 5be0339dcceb..42bed87dd5ea 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -107,7 +107,7 @@ static ssize_t fuse_conn_max_background_read(struct file *file,
if (!fc)
return 0;
- val = fc->max_background;
+ val = READ_ONCE(fc->max_background);
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -144,7 +144,7 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
if (!fc)
return 0;
- val = fc->congestion_threshold;
+ val = READ_ONCE(fc->congestion_threshold);
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e9e97803442a..55db06c7c587 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -513,6 +513,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
rc = cuse_send_init(cc);
if (rc) {
fuse_dev_free(fud);
+ fuse_conn_put(&cc->fc);
return rc;
}
file->private_data = fud;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d933ecb7a08c..4d95a416fc36 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -234,7 +234,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
kfree(forget);
if (ret == -ENOMEM)
goto out;
- if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+ if (ret || fuse_invalid_attr(&outarg.attr) ||
+ (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
goto invalid;
forget_all_cached_acls(inode);
@@ -297,6 +298,12 @@ int fuse_valid_type(int m)
S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}
+bool fuse_invalid_attr(struct fuse_attr *attr)
+{
+ return !fuse_valid_type(attr->mode) ||
+ attr->size > LLONG_MAX;
+}
+
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
@@ -328,7 +335,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
err = -EIO;
if (!outarg->nodeid)
goto out_put_forget;
- if (!fuse_valid_type(outarg->attr.mode))
+ if (fuse_invalid_attr(&outarg->attr))
goto out_put_forget;
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
@@ -451,7 +458,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
goto out_free_ff;
err = -EIO;
- if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
+ if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) ||
+ fuse_invalid_attr(&outentry.attr))
goto out_free_ff;
ff->fh = outopen.fh;
@@ -557,7 +565,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
goto out_put_forget_req;
err = -EIO;
- if (invalid_nodeid(outarg.nodeid))
+ if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr))
goto out_put_forget_req;
if ((outarg.attr.mode ^ mode) & S_IFMT)
@@ -830,7 +838,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
spin_lock(&fc->lock);
fi->attr_version = ++fc->attr_version;
- inc_nlink(inode);
+ if (likely(inode->i_nlink < UINT_MAX))
+ inc_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
fuse_update_ctime(inode);
@@ -910,7 +919,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
args.out.args[0].value = &outarg;
err = fuse_simple_request(fc, &args);
if (!err) {
- if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ if (fuse_invalid_attr(&outarg.attr) ||
+ (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
make_bad_inode(inode);
err = -EIO;
} else {
@@ -1214,7 +1224,7 @@ static int fuse_direntplus_link(struct file *file,
if (invalid_nodeid(o->nodeid))
return -EIO;
- if (!fuse_valid_type(o->attr.mode))
+ if (fuse_invalid_attr(&o->attr))
return -EIO;
fc = get_fuse_conn(dir);
@@ -1650,6 +1660,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (attr->ia_valid & ATTR_SIZE)
is_truncate = true;
+ /* Flush dirty data/metadata before non-truncate SETATTR */
+ if (is_wb && S_ISREG(inode->i_mode) &&
+ attr->ia_valid &
+ (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
+ ATTR_TIMES_SET)) {
+ err = write_inode_now(inode, true);
+ if (err)
+ return err;
+
+ fuse_set_nowrite(inode);
+ fuse_release_nowrite(inode);
+ }
+
if (is_truncate) {
fuse_set_nowrite(inode);
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -1678,7 +1701,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
goto error;
}
- if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ if (fuse_invalid_attr(&outarg.attr) ||
+ (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
make_bad_inode(inode);
err = -EIO;
goto error;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e340449ca862..969584c99c54 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -201,7 +201,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- bool lock_inode = (file->f_flags & O_TRUNC) &&
+ bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
@@ -209,16 +209,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
- if (lock_inode)
+ if (is_wb_truncate) {
inode_lock(inode);
+ fuse_set_nowrite(inode);
+ }
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
- if (lock_inode)
+ if (is_wb_truncate) {
+ fuse_release_nowrite(inode);
inode_unlock(inode);
+ }
return err;
}
@@ -1699,6 +1703,7 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
return 0;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e682f2eff6c0..338aa5e266d6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -896,6 +896,8 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc);
*/
int fuse_valid_type(int m);
+bool fuse_invalid_attr(struct fuse_attr *attr);
+
/**
* Is current process allowed to perform filesystem operation?
*/
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bc8787718feb..b3a1b16d4e3e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1078,6 +1078,7 @@ out_unlock:
gfs2_dinode_out(ip, dibh->b_data);
up_write(&ip->i_rw_mutex);
gfs2_trans_end(sdp);
+ buf_in_tr = false;
}
gfs2_glock_dq_uninit(rd_gh);
cond_resched();
@@ -1444,6 +1445,8 @@ static int do_grow(struct inode *inode, u64 size)
}
error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+ (unstuff &&
+ gfs2_is_jdata(ip) ? RES_JDATA : 0) +
(sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
0 : RES_QUOTA), 0);
if (error)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 483b82e2be92..a3208511f35a 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -594,6 +594,14 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
}
+void gfs2_glock_remove_revoke(struct gfs2_glock *gl)
+{
+ if (atomic_dec_return(&gl->gl_revokes) == 0) {
+ clear_bit(GLF_LFLUSH, &gl->gl_flags);
+ gfs2_glock_queue_put(gl);
+ }
+}
+
void gfs2_write_revokes(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 9499a6049212..3b7b7839ec6a 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -80,6 +80,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 049f8c6721b4..a5041e6d2c0d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -660,10 +660,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
gl = bd->bd_gl;
- if (atomic_dec_return(&gl->gl_revokes) == 0) {
- clear_bit(GLF_LFLUSH, &gl->gl_flags);
- gfs2_glock_queue_put(gl);
- }
+ gfs2_glock_remove_revoke(gl);
kmem_cache_free(gfs2_bufdata_cachep, bd);
}
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index b0eee90738ff..7cb0672294df 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -623,7 +623,10 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
RB_CLEAR_NODE(&rs->rs_node);
if (rs->rs_free) {
- struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
+ u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
+ rs->rs_free - 1;
+ struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
+ struct gfs2_bitmap *start, *last;
/* return reserved blocks to the rgrp */
BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
@@ -634,7 +637,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
it will force the number to be recalculated later. */
rgd->rd_extfail_pt += rs->rs_free;
rs->rs_free = 0;
- clear_bit(GBF_FULL, &bi->bi_flags);
+ if (gfs2_rbm_from_block(&last_rbm, last_block))
+ return;
+ start = rbm_bi(&rs->rs_rbm);
+ last = rbm_bi(&last_rbm);
+ do
+ clear_bit(GBF_FULL, &start->bi_flags);
+ while (start++ != last);
}
}
@@ -1201,7 +1210,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
rl_flags &= ~GFS2_RDF_MASK;
rgd->rd_flags &= GFS2_RDF_MASK;
- rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
+ rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
if (rgd->rd_rgl->rl_unlinked == 0)
rgd->rd_flags &= ~GFS2_RDF_CHECK;
rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 8e54f2e3a304..c3f3f1ae4e1b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -845,10 +845,10 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return error;
+ flush_workqueue(gfs2_delete_workqueue);
kthread_stop(sdp->sd_quotad_process);
kthread_stop(sdp->sd_logd_process);
- flush_workqueue(gfs2_delete_workqueue);
gfs2_quota_sync(sdp->sd_vfs, 0);
gfs2_statfs_sync(sdp->sd_vfs, 0);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index affef3c066e0..69e3402a3cc5 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -277,6 +277,8 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
list_del_init(&bd->bd_list);
gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
sdp->sd_log_num_revoke--;
+ if (bd->bd_gl)
+ gfs2_glock_remove_revoke(bd->bd_gl);
kmem_cache_free(gfs2_bufdata_cachep, bd);
tr->tr_num_revoke_rm++;
if (--n == 0)
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index da25c49203cc..896396554bcc 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -445,6 +445,7 @@ skip:
/* restore search_key */
hfs_bnode_read_key(node, fd->search_key, 14);
}
+ new_node = NULL;
}
if (!rec && node->parent)
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 9bdff5e40626..19017d296173 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -220,25 +220,17 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
return node;
}
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+/* Make sure @tree has enough space for the @rsvd_nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
{
- struct hfs_bnode *node, *next_node;
- struct page **pagep;
- u32 nidx, idx;
- unsigned off;
- u16 off16;
- u16 len;
- u8 *data, byte, m;
- int i;
-
- while (!tree->free_nodes) {
- struct inode *inode = tree->inode;
- u32 count;
- int res;
+ struct inode *inode = tree->inode;
+ u32 count;
+ int res;
+ while (tree->free_nodes < rsvd_nodes) {
res = hfs_extend_file(inode);
if (res)
- return ERR_PTR(res);
+ return res;
HFS_I(inode)->phys_size = inode->i_size =
(loff_t)HFS_I(inode)->alloc_blocks *
HFS_SB(tree->sb)->alloc_blksz;
@@ -246,9 +238,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
tree->sb->s_blocksize_bits;
inode_set_bytes(inode, inode->i_size);
count = inode->i_size >> tree->node_size_shift;
- tree->free_nodes = count - tree->node_count;
+ tree->free_nodes += count - tree->node_count;
tree->node_count = count;
}
+ return 0;
+}
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+ struct hfs_bnode *node, *next_node;
+ struct page **pagep;
+ u32 nidx, idx;
+ unsigned off;
+ u16 off16;
+ u16 len;
+ u8 *data, byte, m;
+ int i, res;
+
+ res = hfs_bmap_reserve(tree, 1);
+ if (res)
+ return ERR_PTR(res);
nidx = 0;
node = hfs_bnode_find(tree, nidx);
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index c8b252dbb26c..dcc2aab1b2c4 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -82,6 +82,7 @@ struct hfs_find_data {
extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp);
extern void hfs_btree_close(struct hfs_btree *);
extern void hfs_btree_write(struct hfs_btree *);
+extern int hfs_bmap_reserve(struct hfs_btree *, int);
extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *);
extern void hfs_bmap_free(struct hfs_bnode *node);
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 8a66405b0f8b..d365bf0b8c77 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -97,6 +97,14 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+ if (err)
+ goto err2;
+
hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
HFS_CDR_THD : HFS_CDR_FTH,
@@ -295,6 +303,14 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
return err;
dst_fd = src_fd;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth);
+ if (err)
+ goto out;
+
/* find the old dir entry and read the data */
hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
err = hfs_brec_find(&src_fd);
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 5d0182654580..263d5028d9d1 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -117,6 +117,10 @@ static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) {
if (res != -ENOENT)
return res;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+ if (res)
+ return res;
hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec));
HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
} else {
@@ -300,7 +304,7 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type)
return 0;
blocks = 0;
- for (i = 0; i < 3; extent++, i++)
+ for (i = 0; i < 3; i++)
blocks += be16_to_cpu(extent[i].count);
res = hfs_free_extents(sb, extent, blocks, blocks);
@@ -341,7 +345,9 @@ int hfs_get_block(struct inode *inode, sector_t block,
ablock = (u32)block / HFS_SB(sb)->fs_div;
if (block >= HFS_I(inode)->fs_blocks) {
- if (block > HFS_I(inode)->fs_blocks || !create)
+ if (!create)
+ return 0;
+ if (block > HFS_I(inode)->fs_blocks)
return -EIO;
if (ablock >= HFS_I(inode)->alloc_blocks) {
res = hfs_extend_file(inode);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2538b49cc349..350afd67bd69 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -642,6 +642,8 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
truncate_setsize(inode, attr->ia_size);
hfs_file_truncate(inode);
+ inode->i_atime = inode->i_mtime = inode->i_ctime =
+ current_time(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 2bab6b3cdba4..e6d554476db4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -217,6 +217,11 @@ int hfsplus_create_attr(struct inode *inode,
if (err)
goto failed_init_create_attr;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ err = hfs_bmap_reserve(fd.tree, fd.tree->depth + 1);
+ if (err)
+ goto failed_create_attr;
+
if (name) {
err = hfsplus_attr_build_key(sb, fd.search_key,
inode->i_ino, name);
@@ -313,6 +318,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name)
if (err)
return err;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ err = hfs_bmap_reserve(fd.tree, fd.tree->depth);
+ if (err)
+ goto out;
+
if (name) {
err = hfsplus_attr_build_key(sb, fd.search_key,
inode->i_ino, name);
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index d3f36982f685..0f53a486d2c1 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -448,6 +448,7 @@ skip:
/* restore search_key */
hfs_bnode_read_key(node, fd->search_key, 14);
}
+ new_node = NULL;
}
if (!rec && node->parent)
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 3de3bc4918b5..66774f4cb4fd 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -342,26 +342,21 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
return node;
}
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+/* Make sure @tree has enough space for the @rsvd_nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
{
- struct hfs_bnode *node, *next_node;
- struct page **pagep;
- u32 nidx, idx;
- unsigned off;
- u16 off16;
- u16 len;
- u8 *data, byte, m;
- int i;
+ struct inode *inode = tree->inode;
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ u32 count;
+ int res;
- while (!tree->free_nodes) {
- struct inode *inode = tree->inode;
- struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
- u32 count;
- int res;
+ if (rsvd_nodes <= 0)
+ return 0;
+ while (tree->free_nodes < rsvd_nodes) {
res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree));
if (res)
- return ERR_PTR(res);
+ return res;
hip->phys_size = inode->i_size =
(loff_t)hip->alloc_blocks <<
HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
@@ -369,9 +364,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
inode_set_bytes(inode, inode->i_size);
count = inode->i_size >> tree->node_size_shift;
- tree->free_nodes = count - tree->node_count;
+ tree->free_nodes += count - tree->node_count;
tree->node_count = count;
}
+ return 0;
+}
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+ struct hfs_bnode *node, *next_node;
+ struct page **pagep;
+ u32 nidx, idx;
+ unsigned off;
+ u16 off16;
+ u16 len;
+ u8 *data, byte, m;
+ int i, res;
+
+ res = hfs_bmap_reserve(tree, 1);
+ if (res)
+ return ERR_PTR(res);
nidx = 0;
node = hfs_bnode_find(tree, nidx);
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index a196369ba779..35472cba750e 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -265,6 +265,14 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+ if (err)
+ goto err2;
+
hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry,
S_ISDIR(inode->i_mode) ?
@@ -333,6 +341,14 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * (int)fd.tree->depth - 2);
+ if (err)
+ goto out;
+
if (!str) {
int len;
@@ -433,6 +449,14 @@ int hfsplus_rename_cat(u32 cnid,
return err;
dst_fd = src_fd;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most twice.
+ */
+ err = hfs_bmap_reserve(src_fd.tree, 4 * (int)src_fd.tree->depth - 1);
+ if (err)
+ goto out;
+
/* find the old dir entry and read the data */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
src_dir->i_ino, src_name);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index e8770935ce6d..58f296bfd438 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -100,6 +100,10 @@ static int __hfsplus_ext_write_extent(struct inode *inode,
if (hip->extent_state & HFSPLUS_EXT_NEW) {
if (res != -ENOENT)
return res;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+ if (res)
+ return res;
hfs_brec_insert(fd, hip->cached_extents,
sizeof(hfsplus_extent_rec));
hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
@@ -233,7 +237,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
ablock = iblock >> sbi->fs_shift;
if (iblock >= hip->fs_blocks) {
- if (iblock > hip->fs_blocks || !create)
+ if (!create)
+ return 0;
+ if (iblock > hip->fs_blocks)
return -EIO;
if (ablock >= hip->alloc_blocks) {
res = hfsplus_file_extend(inode, false);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a015044daa05..dbb55d823385 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -312,6 +312,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
#define hfs_btree_open hfsplus_btree_open
#define hfs_btree_close hfsplus_btree_close
#define hfs_btree_write hfsplus_btree_write
+#define hfs_bmap_reserve hfsplus_bmap_reserve
#define hfs_bmap_alloc hfsplus_bmap_alloc
#define hfs_bmap_free hfsplus_bmap_free
#define hfs_bnode_read hfsplus_bnode_read
@@ -396,6 +397,7 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors,
struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id);
void hfs_btree_close(struct hfs_btree *tree);
int hfs_btree_write(struct hfs_btree *tree);
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes);
struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree);
void hfs_bmap_free(struct hfs_bnode *node);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 190c60efbc99..5b31f4730ee9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -262,6 +262,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
}
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/inode.c b/fs/inode.c
index cfc36d11bcb3..76f7535fe754 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1817,8 +1817,13 @@ int file_remove_privs(struct file *file)
int kill;
int error = 0;
- /* Fast path for nothing security related */
- if (IS_NOSEC(inode))
+ /*
+ * Fast path for nothing security related.
+ * As well for non-regular files, e.g. blkdev inodes.
+ * For example, blkdev_write_iter() might get here
+ * trying to remove privs which it is not allowed to.
+ */
+ if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
kill = dentry_needs_remove_privs(dentry);
diff --git a/fs/iomap.c b/fs/iomap.c
index 4966abd956d4..64e07534ded3 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -941,7 +941,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
dio->submit.cookie = submit_bio(bio);
} while (nr_pages);
- if (need_zeroout) {
+ /*
+ * We need to zeroout the tail of a sub-block write if the extent type
+ * requires zeroing or the write extends beyond EOF. If we don't zero
+ * the block tail in the latter case, we can expose stale data via mmap
+ * reads of the EOF block.
+ */
+ if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
if (pad)
@@ -1046,8 +1053,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
pos += ret;
- if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+ if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
+ /*
+ * We only report that we've read data up to i_size.
+ * Revert iter to a state corresponding to that as
+ * some callers (such as splice code) rely on it.
+ */
+ iov_iter_revert(iter, pos - dio->i_size);
break;
+ }
} while ((count = iov_iter_count(iter)) > 0);
blk_finish_plug(&plug);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d11401afd52f..0567b17a970c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -189,14 +189,15 @@ static int journal_wait_on_commit_record(journal_t *journal,
* use writepages() because with dealyed allocation we may be doing
* block allocation in writepages().
*/
-static int journal_submit_inode_data_buffers(struct address_space *mapping)
+static int journal_submit_inode_data_buffers(struct address_space *mapping,
+ loff_t dirty_start, loff_t dirty_end)
{
int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = mapping->nrpages * 2,
- .range_start = 0,
- .range_end = i_size_read(mapping->host),
+ .range_start = dirty_start,
+ .range_end = dirty_end,
};
ret = generic_writepages(mapping, &wbc);
@@ -220,6 +221,9 @@ static int journal_submit_data_buffers(journal_t *journal,
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ loff_t dirty_start = jinode->i_dirty_start;
+ loff_t dirty_end = jinode->i_dirty_end;
+
if (!(jinode->i_flags & JI_WRITE_DATA))
continue;
mapping = jinode->i_vfs_inode->i_mapping;
@@ -232,7 +236,8 @@ static int journal_submit_data_buffers(journal_t *journal,
* only allocated blocks here.
*/
trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
- err = journal_submit_inode_data_buffers(mapping);
+ err = journal_submit_inode_data_buffers(mapping, dirty_start,
+ dirty_end);
if (!ret)
ret = err;
spin_lock(&journal->j_list_lock);
@@ -259,12 +264,16 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ loff_t dirty_start = jinode->i_dirty_start;
+ loff_t dirty_end = jinode->i_dirty_end;
+
if (!(jinode->i_flags & JI_WAIT_DATA))
continue;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- err = filemap_fdatawait_keep_errors(
- jinode->i_vfs_inode->i_mapping);
+ err = filemap_fdatawait_range_keep_errors(
+ jinode->i_vfs_inode->i_mapping, dirty_start,
+ dirty_end);
if (!ret)
ret = err;
spin_lock(&journal->j_list_lock);
@@ -284,6 +293,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
&jinode->i_transaction->t_inode_list);
} else {
jinode->i_transaction = NULL;
+ jinode->i_dirty_start = 0;
+ jinode->i_dirty_end = 0;
}
}
spin_unlock(&journal->j_list_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 593f3e31fb21..d3cce5c86fd9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -97,6 +97,8 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_add_write);
EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
+EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
+EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
@@ -2581,6 +2583,8 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
jinode->i_next_transaction = NULL;
jinode->i_vfs_inode = inode;
jinode->i_flags = 0;
+ jinode->i_dirty_start = 0;
+ jinode->i_dirty_end = 0;
INIT_LIST_HEAD(&jinode->i_list);
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 650927f0a2dc..7fe422eced89 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2503,7 +2503,7 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
* File inode in the inode list of the handle's transaction
*/
static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
- unsigned long flags)
+ unsigned long flags, loff_t start_byte, loff_t end_byte)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -2515,26 +2515,17 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);
- /*
- * First check whether inode isn't already on the transaction's
- * lists without taking the lock. Note that this check is safe
- * without the lock as we cannot race with somebody removing inode
- * from the transaction. The reason is that we remove inode from the
- * transaction only in journal_release_jbd_inode() and when we commit
- * the transaction. We are guarded from the first case by holding
- * a reference to the inode. We are safe against the second case
- * because if jinode->i_transaction == transaction, commit code
- * cannot touch the transaction because we hold reference to it,
- * and if jinode->i_next_transaction == transaction, commit code
- * will only file the inode where we want it.
- */
- if ((jinode->i_transaction == transaction ||
- jinode->i_next_transaction == transaction) &&
- (jinode->i_flags & flags) == flags)
- return 0;
-
spin_lock(&journal->j_list_lock);
jinode->i_flags |= flags;
+
+ if (jinode->i_dirty_end) {
+ jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
+ jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
+ } else {
+ jinode->i_dirty_start = start_byte;
+ jinode->i_dirty_end = end_byte;
+ }
+
/* Is inode already attached where we need it? */
if (jinode->i_transaction == transaction ||
jinode->i_next_transaction == transaction)
@@ -2569,12 +2560,28 @@ done:
int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
{
return jbd2_journal_file_inode(handle, jinode,
- JI_WRITE_DATA | JI_WAIT_DATA);
+ JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX);
}
int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
{
- return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+ return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0,
+ LLONG_MAX);
+}
+
+int jbd2_journal_inode_ranged_write(handle_t *handle,
+ struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
+{
+ return jbd2_journal_file_inode(handle, jinode,
+ JI_WRITE_DATA | JI_WAIT_DATA, start_byte,
+ start_byte + length - 1);
+}
+
+int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
+ loff_t start_byte, loff_t length)
+{
+ return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
+ start_byte, start_byte + length - 1);
}
/*
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 9e9117533fd7..8697b750b1c9 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -623,7 +623,6 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
{
struct kernfs_node *kn;
u32 gen;
- int cursor;
int ret;
name = kstrdup_const(name, GFP_KERNEL);
@@ -636,11 +635,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
idr_preload(GFP_KERNEL);
spin_lock(&kernfs_idr_lock);
- cursor = idr_get_cursor(&root->ino_idr);
ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
- if (ret >= 0 && ret < cursor)
+ if (ret >= 0 && ret < root->last_ino)
root->next_generation++;
gen = root->next_generation;
+ root->last_ino = ret;
spin_unlock(&kernfs_idr_lock);
idr_preload_end();
if (ret < 0)
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 5145ae2f0572..d273e3accade 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -63,6 +63,9 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
if (base == kn)
break;
+ if ((s - path) + 3 >= PATH_MAX)
+ return -ENAMETOOLONG;
+
strcpy(s, "../");
s += 3;
base = base->parent;
@@ -79,7 +82,7 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
if (len < 2)
return -EINVAL;
len--;
- if ((s - path) + len > PATH_MAX)
+ if ((s - path) + len >= PATH_MAX)
return -ENAMETOOLONG;
/* reverse fillup of target string from target to base */
diff --git a/fs/libfs.c b/fs/libfs.c
index 3aabe553fc45..49623301e5f0 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -86,58 +86,47 @@ int dcache_dir_close(struct inode *inode, struct file *file)
EXPORT_SYMBOL(dcache_dir_close);
/* parent is locked at least shared */
-static struct dentry *next_positive(struct dentry *parent,
- struct list_head *from,
- int count)
+/*
+ * Returns an element of siblings' list.
+ * We are looking for <count>th positive after <p>; if
+ * found, dentry is grabbed and passed to caller via *<res>.
+ * If no such element exists, the anchor of list is returned
+ * and *<res> is set to NULL.
+ */
+static struct list_head *scan_positives(struct dentry *cursor,
+ struct list_head *p,
+ loff_t count,
+ struct dentry **res)
{
- unsigned *seq = &parent->d_inode->i_dir_seq, n;
- struct dentry *res;
- struct list_head *p;
- bool skipped;
- int i;
+ struct dentry *dentry = cursor->d_parent, *found = NULL;
-retry:
- i = count;
- skipped = false;
- n = smp_load_acquire(seq) & ~1;
- res = NULL;
- rcu_read_lock();
- for (p = from->next; p != &parent->d_subdirs; p = p->next) {
+ spin_lock(&dentry->d_lock);
+ while ((p = p->next) != &dentry->d_subdirs) {
struct dentry *d = list_entry(p, struct dentry, d_child);
- if (!simple_positive(d)) {
- skipped = true;
- } else if (!--i) {
- res = d;
- break;
+ // we must at least skip cursors, to avoid livelocks
+ if (d->d_flags & DCACHE_DENTRY_CURSOR)
+ continue;
+ if (simple_positive(d) && !--count) {
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+ if (simple_positive(d))
+ found = dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ if (likely(found))
+ break;
+ count = 1;
+ }
+ if (need_resched()) {
+ list_move(&cursor->d_child, p);
+ p = &cursor->d_child;
+ spin_unlock(&dentry->d_lock);
+ cond_resched();
+ spin_lock(&dentry->d_lock);
}
}
- rcu_read_unlock();
- if (skipped) {
- smp_rmb();
- if (unlikely(*seq != n))
- goto retry;
- }
- return res;
-}
-
-static void move_cursor(struct dentry *cursor, struct list_head *after)
-{
- struct dentry *parent = cursor->d_parent;
- unsigned n, *seq = &parent->d_inode->i_dir_seq;
- spin_lock(&parent->d_lock);
- for (;;) {
- n = *seq;
- if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
- break;
- cpu_relax();
- }
- __list_del(cursor->d_child.prev, cursor->d_child.next);
- if (after)
- list_add(&cursor->d_child, after);
- else
- list_add_tail(&cursor->d_child, &parent->d_subdirs);
- smp_store_release(seq, n + 2);
- spin_unlock(&parent->d_lock);
+ spin_unlock(&dentry->d_lock);
+ dput(*res);
+ *res = found;
+ return p;
}
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
@@ -153,17 +142,28 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
if (offset != file->f_pos) {
+ struct dentry *cursor = file->private_data;
+ struct dentry *to = NULL;
+ struct list_head *p;
+
file->f_pos = offset;
- if (file->f_pos >= 2) {
- struct dentry *cursor = file->private_data;
- struct dentry *to;
- loff_t n = file->f_pos - 2;
-
- inode_lock_shared(dentry->d_inode);
- to = next_positive(dentry, &dentry->d_subdirs, n);
- move_cursor(cursor, to ? &to->d_child : NULL);
- inode_unlock_shared(dentry->d_inode);
+ inode_lock_shared(dentry->d_inode);
+
+ if (file->f_pos > 2) {
+ p = scan_positives(cursor, &dentry->d_subdirs,
+ file->f_pos - 2, &to);
+ spin_lock(&dentry->d_lock);
+ list_move(&cursor->d_child, p);
+ spin_unlock(&dentry->d_lock);
+ } else {
+ spin_lock(&dentry->d_lock);
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
}
+
+ dput(to);
+
+ inode_unlock_shared(dentry->d_inode);
}
return offset;
}
@@ -185,25 +185,29 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *p = &cursor->d_child;
- struct dentry *next;
- bool moved = false;
+ struct list_head *anchor = &dentry->d_subdirs;
+ struct dentry *next = NULL;
+ struct list_head *p;
if (!dir_emit_dots(file, ctx))
return 0;
if (ctx->pos == 2)
- p = &dentry->d_subdirs;
- while ((next = next_positive(dentry, p, 1)) != NULL) {
+ p = anchor;
+ else
+ p = &cursor->d_child;
+
+ while ((p = scan_positives(cursor, p, 1, &next)) != anchor) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
d_inode(next)->i_ino, dt_type(d_inode(next))))
break;
- moved = true;
- p = &next->d_child;
ctx->pos++;
}
- if (moved)
- move_cursor(cursor, p);
+ spin_lock(&dentry->d_lock);
+ list_move_tail(&cursor->d_child, p);
+ spin_unlock(&dentry->d_lock);
+ dput(next);
+
return 0;
}
EXPORT_SYMBOL(dcache_readdir);
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 00d5ef5f99f7..214a2fa1f1e3 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -128,24 +128,14 @@ static void encode_netobj(struct xdr_stream *xdr,
static int decode_netobj(struct xdr_stream *xdr,
struct xdr_netobj *obj)
{
- u32 length;
- __be32 *p;
+ ssize_t ret;
- p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
- length = be32_to_cpup(p++);
- if (unlikely(length > XDR_MAX_NETOBJ))
- goto out_size;
- obj->len = length;
- obj->data = (u8 *)p;
+ ret = xdr_stream_decode_opaque_inline(xdr, (void *)&obj->data,
+ XDR_MAX_NETOBJ);
+ if (unlikely(ret < 0))
+ return -EIO;
+ obj->len = ret;
return 0;
-out_size:
- dprintk("NFS: returned netobj was too long: %u\n", length);
- return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 2c6176387143..747b9c8c940a 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -125,24 +125,14 @@ static void encode_netobj(struct xdr_stream *xdr,
static int decode_netobj(struct xdr_stream *xdr,
struct xdr_netobj *obj)
{
- u32 length;
- __be32 *p;
+ ssize_t ret;
- p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
- length = be32_to_cpup(p++);
- if (unlikely(length > XDR_MAX_NETOBJ))
- goto out_size;
- obj->len = length;
- obj->data = (u8 *)p;
+ ret = xdr_stream_decode_opaque_inline(xdr, (void *)&obj->data,
+ XDR_MAX_NETOBJ);
+ if (unlikely(ret < 0))
+ return -EIO;
+ obj->len = ret;
return 0;
-out_size:
- dprintk("NFS: returned netobj was too long: %u\n", length);
- return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0c7008fb6d5a..9e7d49fac4e3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -416,10 +416,10 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
clp = nfs_match_client(cl_init);
if (clp) {
spin_unlock(&nn->nfs_client_lock);
- if (IS_ERR(clp))
- return clp;
if (new)
new->rpc_ops->free_client(new);
+ if (IS_ERR(clp))
+ return clp;
return nfs_found_client(cl_init, clp);
}
if (new) {
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 606dd3871f66..04d57e11577e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -52,6 +52,16 @@ nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
return false;
}
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (nfs4_is_valid_delegation(delegation, 0))
+ return delegation;
+ return NULL;
+}
+
static int
nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
{
@@ -91,7 +101,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags)
return nfs4_do_check_delegation(inode, flags, false);
}
-static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid)
{
struct inode *inode = state->inode;
struct file_lock *fl;
@@ -106,7 +116,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
spin_lock(&flctx->flc_lock);
restart:
list_for_each_entry(fl, list, fl_list) {
- if (nfs_file_open_context(fl->fl_file) != ctx)
+ if (nfs_file_open_context(fl->fl_file)->state != state)
continue;
spin_unlock(&flctx->flc_lock);
status = nfs4_lock_delegation_recall(fl, state, stateid);
@@ -153,7 +163,7 @@ again:
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
err = nfs4_open_delegation_recall(ctx, state, stateid, type);
if (!err)
- err = nfs_delegation_claim_locks(ctx, state, stateid);
+ err = nfs_delegation_claim_locks(state, stateid);
if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
err = -EAGAIN;
mutex_unlock(&sp->so_delegreturn_mutex);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index ddaf2644cf13..df41d16dc6ab 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -63,6 +63,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
int nfs4_check_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bf2c43635062..50c181fa0025 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1059,6 +1059,100 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
}
+static int
+nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
+ struct inode *inode, int error)
+{
+ switch (error) {
+ case 1:
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
+ __func__, dentry);
+ return 1;
+ case 0:
+ nfs_mark_for_revalidate(dir);
+ if (inode && S_ISDIR(inode->i_mode)) {
+ /* Purge readdir caches. */
+ nfs_zap_caches(inode);
+ /*
+ * We can't d_drop the root of a disconnected tree:
+ * its d_hash is on the s_anon list and d_drop() would hide
+ * it from shrink_dcache_for_unmount(), leading to busy
+ * inodes on unmount and further oopses.
+ */
+ if (IS_ROOT(dentry))
+ return 1;
+ }
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
+ __func__, dentry);
+ return 0;
+ }
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
+ __func__, dentry, error);
+ return error;
+}
+
+static int
+nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ int ret = 1;
+ if (nfs_neg_need_reval(dir, dentry, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ ret = 0;
+ }
+ return nfs_lookup_revalidate_done(dir, dentry, NULL, ret);
+}
+
+static int
+nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+}
+
+static int
+nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct nfs_fh *fhandle;
+ struct nfs_fattr *fattr;
+ struct nfs4_label *label;
+ int ret;
+
+ ret = -ENOMEM;
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (fhandle == NULL || fattr == NULL || IS_ERR(label))
+ goto out;
+
+ ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+ if (ret < 0) {
+ if (ret == -ESTALE || ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
+ ret = 0;
+ if (nfs_compare_fh(NFS_FH(inode), fhandle))
+ goto out;
+ if (nfs_refresh_inode(inode, fattr) < 0)
+ goto out;
+
+ nfs_setsecurity(inode, fattr, label);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+ /* set a readdirplus hint that we had a cache miss */
+ nfs_force_use_readdirplus(dir);
+ ret = 1;
+out:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
+ return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
+}
+
/*
* This is called every time the dcache has a lookup hit,
* and we should check whether we can really trust that
@@ -1070,58 +1164,36 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
* If the parent directory is seen to have changed, we throw out the
* cached dentry and do a new lookup.
*/
-static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
- struct inode *dir;
struct inode *inode;
- struct dentry *parent;
- struct nfs_fh *fhandle = NULL;
- struct nfs_fattr *fattr = NULL;
- struct nfs4_label *label = NULL;
int error;
- if (flags & LOOKUP_RCU) {
- parent = ACCESS_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- } else {
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- }
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = d_inode(dentry);
- if (!inode) {
- if (nfs_neg_need_reval(dir, dentry, flags)) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
- goto out_bad;
- }
- goto out_valid;
- }
+ if (!inode)
+ return nfs_lookup_revalidate_negative(dir, dentry, flags);
if (is_bad_inode(inode)) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
__func__, dentry);
goto out_bad;
}
if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
- goto out_set_verifier;
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* Force a full look up iff the parent directory has changed */
if (!nfs_is_exclusive_create(dir, flags) &&
nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
error = nfs_lookup_verify_inode(inode, flags);
if (error) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
if (error == -ESTALE)
- goto out_zap_parent;
- goto out_error;
+ nfs_zap_caches(dir);
+ goto out_bad;
}
nfs_advise_use_readdirplus(dir);
goto out_valid;
@@ -1133,81 +1205,45 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (NFS_STALE(inode))
goto out_bad;
- error = -ENOMEM;
- fhandle = nfs_alloc_fhandle();
- fattr = nfs_alloc_fattr();
- if (fhandle == NULL || fattr == NULL)
- goto out_error;
-
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
- if (IS_ERR(label))
- goto out_error;
-
trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+ error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
- if (error == -ESTALE || error == -ENOENT)
- goto out_bad;
- if (error)
- goto out_error;
- if (nfs_compare_fh(NFS_FH(inode), fhandle))
- goto out_bad;
- if ((error = nfs_refresh_inode(inode, fattr)) != 0)
- goto out_bad;
-
- nfs_setsecurity(inode, fattr, label);
-
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
+ return error;
+out_valid:
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+out_bad:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+}
- /* set a readdirplus hint that we had a cache miss */
- nfs_force_use_readdirplus(dir);
+static int
+__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
+ int (*reval)(struct inode *, struct dentry *, unsigned int))
+{
+ struct dentry *parent;
+ struct inode *dir;
+ int ret;
-out_set_verifier:
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- out_valid:
if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ ret = reval(dir, dentry, flags);
if (parent != ACCESS_ONCE(dentry->d_parent))
return -ECHILD;
- } else
+ } else {
+ parent = dget_parent(dentry);
+ ret = reval(d_inode(parent), dentry, flags);
dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
- __func__, dentry);
- return 1;
-out_zap_parent:
- nfs_zap_caches(dir);
- out_bad:
- WARN_ON(flags & LOOKUP_RCU);
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
- nfs_mark_for_revalidate(dir);
- if (inode && S_ISDIR(inode->i_mode)) {
- /* Purge readdir caches. */
- nfs_zap_caches(inode);
- /*
- * We can't d_drop the root of a disconnected tree:
- * its d_hash is on the s_anon list and d_drop() would hide
- * it from shrink_dcache_for_unmount(), leading to busy
- * inodes on unmount and further oopses.
- */
- if (IS_ROOT(dentry))
- goto out_valid;
}
- dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
- __func__, dentry);
- return 0;
-out_error:
- WARN_ON(flags & LOOKUP_RCU);
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
- dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
- __func__, dentry, error);
- return error;
+ return ret;
+}
+
+static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
}
/*
@@ -1434,7 +1470,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
if (S_ISREG(file->f_path.dentry->d_inode->i_mode))
nfs_file_set_open_context(file, ctx);
else
- err = -ESTALE;
+ err = -EOPENSTALE;
out:
return err;
}
@@ -1560,62 +1596,55 @@ no_open:
}
EXPORT_SYMBOL_GPL(nfs_atomic_open);
-static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
struct inode *inode;
- int ret = 0;
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
- goto no_open;
+ goto full_reval;
if (d_mountpoint(dentry))
- goto no_open;
- if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
- goto no_open;
+ goto full_reval;
inode = d_inode(dentry);
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
*/
- if (inode == NULL) {
- struct dentry *parent;
- struct inode *dir;
-
- if (flags & LOOKUP_RCU) {
- parent = ACCESS_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- } else {
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- }
- if (!nfs_neg_need_reval(dir, dentry, flags))
- ret = 1;
- else if (flags & LOOKUP_RCU)
- ret = -ECHILD;
- if (!(flags & LOOKUP_RCU))
- dput(parent);
- else if (parent != ACCESS_ONCE(dentry->d_parent))
- return -ECHILD;
- goto out;
- }
+ if (inode == NULL)
+ goto full_reval;
+
+ if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* NFS only supports OPEN on regular files */
if (!S_ISREG(inode->i_mode))
- goto no_open;
+ goto full_reval;
+
/* We cannot do exclusive creation on a positive dentry */
- if (flags & LOOKUP_EXCL)
- goto no_open;
+ if (flags & (LOOKUP_EXCL | LOOKUP_REVAL))
+ goto reval_dentry;
+
+ /* Check if the directory changed */
+ if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU))
+ goto reval_dentry;
/* Let f_op->open() actually open (and revalidate) the file */
- ret = 1;
+ return 1;
+reval_dentry:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode);;
-out:
- return ret;
+full_reval:
+ return nfs_do_lookup_revalidate(dir, dentry, flags);
+}
-no_open:
- return nfs_lookup_revalidate(dentry, flags);
+static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags,
+ nfs4_do_lookup_revalidate);
}
#endif /* CONFIG_NFSV4 */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 89c03a507dd9..9cdac9945483 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -122,32 +122,49 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
}
static void
-nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
+nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr,
+ ssize_t dreq_len)
{
- int i;
- ssize_t count;
+ struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx];
+
+ if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
+ test_bit(NFS_IOHDR_EOF, &hdr->flags)))
+ return;
+ if (dreq->max_count >= dreq_len) {
+ dreq->max_count = dreq_len;
+ if (dreq->count > dreq_len)
+ dreq->count = dreq_len;
+
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
+ dreq->error = hdr->error;
+ else /* Clear outstanding error if this is EOF */
+ dreq->error = 0;
+ }
+ if (mirror->count > dreq_len)
+ mirror->count = dreq_len;
+}
+
+static void
+nfs_direct_count_bytes(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr)
+{
+ struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx];
+ loff_t hdr_end = hdr->io_start + hdr->good_bytes;
+ ssize_t dreq_len = 0;
- WARN_ON_ONCE(dreq->count >= dreq->max_count);
+ if (hdr_end > dreq->io_start)
+ dreq_len = hdr_end - dreq->io_start;
- if (dreq->mirror_count == 1) {
- dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
- dreq->count += hdr->good_bytes;
- } else {
- /* mirrored writes */
- count = dreq->mirrors[hdr->pgio_mirror_idx].count;
- if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
- count = hdr->io_start + hdr->good_bytes - dreq->io_start;
- dreq->mirrors[hdr->pgio_mirror_idx].count = count;
- }
- /* update the dreq->count by finding the minimum agreed count from all
- * mirrors */
- count = dreq->mirrors[0].count;
+ nfs_direct_handle_truncated(dreq, hdr, dreq_len);
- for (i = 1; i < dreq->mirror_count; i++)
- count = min(count, dreq->mirrors[i].count);
+ if (dreq_len > dreq->max_count)
+ dreq_len = dreq->max_count;
- dreq->count = count;
- }
+ if (mirror->count < dreq_len)
+ mirror->count = dreq_len;
+ if (dreq->count < dreq_len)
+ dreq->count = dreq_len;
}
/*
@@ -400,15 +417,13 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
unsigned long bytes = 0;
struct nfs_direct_req *dreq = hdr->dreq;
- if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
- goto out_put;
-
spin_lock(&dreq->lock);
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
- dreq->error = hdr->error;
- else
- nfs_direct_good_bytes(dreq, hdr);
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ spin_unlock(&dreq->lock);
+ goto out_put;
+ }
+ nfs_direct_count_bytes(dreq, hdr);
spin_unlock(&dreq->lock);
while (!list_empty(&hdr->pages)) {
@@ -428,7 +443,7 @@ out_put:
hdr->release(hdr);
}
-static void nfs_read_sync_pgio_error(struct list_head *head)
+static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
struct nfs_page *req;
@@ -645,6 +660,9 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0;
+ dreq->max_count = 0;
+ list_for_each_entry(req, &reqs, wb_list)
+ dreq->max_count += req->wb_bytes;
dreq->verf.committed = NFS_INVALID_STABLE_HOW;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
for (i = 0; i < dreq->mirror_count; i++)
@@ -664,8 +682,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &failed);
+ nfs_list_move_request(req, &failed);
spin_lock(&cinfo.inode->i_lock);
dreq->flags = 0;
if (desc.pg_error < 0)
@@ -775,17 +792,16 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
bool request_commit = false;
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
- if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
- goto out_put;
-
nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ spin_unlock(&dreq->lock);
+ goto out_put;
+ }
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
- if (dreq->error == 0) {
- nfs_direct_good_bytes(dreq, hdr);
+ nfs_direct_count_bytes(dreq, hdr);
+ if (hdr->good_bytes != 0) {
if (nfs_write_need_commit(hdr)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
request_commit = true;
@@ -821,7 +837,7 @@ out_put:
hdr->release(hdr);
}
-static void nfs_write_sync_pgio_error(struct list_head *head)
+static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
struct nfs_page *req;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 9f69e83810ca..2464b9b80698 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -18,7 +18,7 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
+static unsigned int dataserver_timeo = NFS_DEF_TCP_TIMEO;
static unsigned int dataserver_retrans;
static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
@@ -306,7 +306,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
if (status == 0)
return 0;
- if (mirror->mirror_ds == NULL)
+ if (IS_ERR_OR_NULL(mirror->mirror_ds))
return -EINVAL;
dserr = kmalloc(sizeof(*dserr), gfp_flags);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 134d9f560240..71a399f6805a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1034,6 +1034,7 @@ int nfs_open(struct inode *inode, struct file *filp)
nfs_fscache_open_file(inode, filp);
return 0;
}
+EXPORT_SYMBOL_GPL(nfs_open);
/*
* This function is called whenever some part of NFS notices that
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a73144b3cb8c..22cff39cca29 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -433,7 +433,8 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session,
extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
-extern void nfs4_purge_state_owners(struct nfs_server *);
+extern void nfs4_purge_state_owners(struct nfs_server *, struct list_head *);
+extern void nfs4_free_state_owners(struct list_head *head);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
extern void nfs4_put_open_state(struct nfs4_state *);
extern void nfs4_close_state(struct nfs4_state *, fmode_t);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8f96f6548dc8..0924b68b5657 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -739,9 +739,12 @@ out:
static void nfs4_destroy_server(struct nfs_server *server)
{
+ LIST_HEAD(freeme);
+
nfs_server_return_all_delegations(server);
unset_pnfs_layoutdriver(server);
- nfs4_purge_state_owners(server);
+ nfs4_purge_state_owners(server, &freeme);
+ nfs4_free_state_owners(&freeme);
}
/*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 626d1382002e..b8d316a338bc 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -50,7 +50,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
return err;
if ((openflags & O_ACCMODE) == 3)
- openflags--;
+ return nfs_open(inode, filp);
/* We can't create new files here */
openflags &= ~(O_CREAT|O_EXCL);
@@ -74,13 +74,13 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
switch (err) {
- case -EPERM:
- case -EACCES:
- case -EDQUOT:
- case -ENOSPC:
- case -EROFS:
- goto out_put_ctx;
default:
+ goto out_put_ctx;
+ case -ENOENT:
+ case -ESTALE:
+ case -EISDIR:
+ case -ENOTDIR:
+ case -ELOOP:
goto out_drop;
}
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a225f98c9903..f1526f65cc58 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1099,6 +1099,12 @@ struct nfs4_opendata {
int rpc_status;
};
+struct nfs4_open_createattrs {
+ struct nfs4_label *label;
+ struct iattr *sattr;
+ const __u32 verf[2];
+};
+
static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
int err, struct nfs4_exception *exception)
{
@@ -1168,8 +1174,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
struct nfs4_state_owner *sp, fmode_t fmode, int flags,
- const struct iattr *attrs,
- struct nfs4_label *label,
+ const struct nfs4_open_createattrs *c,
enum open_claim_type4 claim,
gfp_t gfp_mask)
{
@@ -1177,6 +1182,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
struct inode *dir = d_inode(parent);
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ struct nfs4_label *label = (c != NULL) ? c->label : NULL;
struct nfs4_opendata *p;
p = kzalloc(sizeof(*p), gfp_mask);
@@ -1242,15 +1248,11 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
p->o_arg.fh = NFS_FH(d_inode(dentry));
}
- if (attrs != NULL && attrs->ia_valid != 0) {
- __u32 verf[2];
-
+ if (c != NULL && c->sattr != NULL && c->sattr->ia_valid != 0) {
p->o_arg.u.attrs = &p->attrs;
- memcpy(&p->attrs, attrs, sizeof(p->attrs));
+ memcpy(&p->attrs, c->sattr, sizeof(p->attrs));
- verf[0] = jiffies;
- verf[1] = current->pid;
- memcpy(p->o_arg.u.verifier.data, verf,
+ memcpy(p->o_arg.u.verifier.data, c->verf,
sizeof(p->o_arg.u.verifier.data));
}
p->c_arg.fh = &p->o_res.fh;
@@ -1315,12 +1317,20 @@ static bool nfs4_mode_match_open_stateid(struct nfs4_state *state,
return false;
}
-static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
+static int can_open_cached(struct nfs4_state *state, fmode_t mode,
+ int open_mode, enum open_claim_type4 claim)
{
int ret = 0;
if (open_mode & (O_EXCL|O_TRUNC))
goto out;
+ switch (claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ goto out;
+ default:
+ break;
+ }
switch (mode & (FMODE_READ|FMODE_WRITE)) {
case FMODE_READ:
ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
@@ -1345,8 +1355,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
return 0;
if ((delegation->type & fmode) != fmode)
return 0;
- if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
- return 0;
switch (claim) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_FH:
@@ -1605,7 +1613,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo
static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
{
struct nfs4_state *state = opendata->state;
- struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *delegation;
int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
@@ -1615,14 +1622,14 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
for (;;) {
spin_lock(&state->owner->so_lock);
- if (can_open_cached(state, fmode, open_mode)) {
+ if (can_open_cached(state, fmode, open_mode, claim)) {
update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
goto out_return_state;
}
spin_unlock(&state->owner->so_lock);
rcu_read_lock();
- delegation = rcu_dereference(nfsi->delegation);
+ delegation = nfs4_get_valid_delegation(state->inode);
if (!can_open_delegated(delegation, fmode, claim)) {
rcu_read_unlock();
break;
@@ -1816,7 +1823,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
struct nfs4_opendata *opendata;
opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
- NULL, NULL, claim, GFP_NOFS);
+ NULL, claim, GFP_NOFS);
if (opendata == NULL)
return ERR_PTR(-ENOMEM);
opendata->state = state;
@@ -2139,10 +2146,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
if (data->state != NULL) {
struct nfs_delegation *delegation;
- if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
+ if (can_open_cached(data->state, data->o_arg.fmode,
+ data->o_arg.open_flags, claim))
goto out_no_action;
rcu_read_lock();
- delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
+ delegation = nfs4_get_valid_delegation(data->state->inode);
if (can_open_delegated(delegation, data->o_arg.fmode, claim))
goto unlock_no_action;
rcu_read_unlock();
@@ -2757,8 +2765,7 @@ out:
static int _nfs4_do_open(struct inode *dir,
struct nfs_open_context *ctx,
int flags,
- struct iattr *sattr,
- struct nfs4_label *label,
+ const struct nfs4_open_createattrs *c,
int *opened)
{
struct nfs4_state_owner *sp;
@@ -2770,6 +2777,8 @@ static int _nfs4_do_open(struct inode *dir,
struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+ struct iattr *sattr = c->sattr;
+ struct nfs4_label *label = c->label;
struct nfs4_label *olabel = NULL;
int status;
@@ -2788,8 +2797,8 @@ static int _nfs4_do_open(struct inode *dir,
status = -ENOMEM;
if (d_really_is_positive(dentry))
claim = NFS4_OPEN_CLAIM_FH;
- opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
- label, claim, GFP_KERNEL);
+ opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags,
+ c, claim, GFP_KERNEL);
if (opendata == NULL)
goto err_put_state_owner;
@@ -2870,10 +2879,18 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_state *res;
+ struct nfs4_open_createattrs c = {
+ .label = label,
+ .sattr = sattr,
+ .verf = {
+ [0] = (__u32)jiffies,
+ [1] = (__u32)current->pid,
+ },
+ };
int status;
do {
- status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened);
+ status = _nfs4_do_open(dir, ctx, flags, &c, opened);
res = ctx->state;
trace_nfs4_open_file(ctx, flags, status);
if (status == 0)
@@ -2934,7 +2951,6 @@ static int _nfs4_do_setattr(struct inode *inode,
};
struct rpc_cred *delegation_cred = NULL;
unsigned long timestamp = jiffies;
- fmode_t fmode;
bool truncate;
int status;
@@ -2942,11 +2958,12 @@ static int _nfs4_do_setattr(struct inode *inode,
/* Servers should only apply open mode checks for file size changes */
truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
- fmode = truncate ? FMODE_WRITE : FMODE_READ;
+ if (!truncate)
+ goto zero_stateid;
- if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
+ if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
- } else if (truncate && ctx != NULL) {
+ } else if (ctx != NULL && ctx->state) {
struct nfs_lock_context *l_ctx;
if (!nfs4_valid_open_stateid(ctx->state))
return -EBADF;
@@ -2958,8 +2975,10 @@ static int _nfs4_do_setattr(struct inode *inode,
nfs_put_lock_context(l_ctx);
if (status == -EIO)
return -EBADF;
- } else
+ } else {
+zero_stateid:
nfs4_stateid_copy(&arg->stateid, &zero_stateid);
+ }
if (delegation_cred)
msg.rpc_cred = delegation_cred;
@@ -5633,6 +5652,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
}
status = task->tk_status;
if (setclientid.sc_cred) {
+ kfree(clp->cl_acceptor);
clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
put_rpccred(setclientid.sc_cred);
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 85ec07e4aa91..f92bfc787c5f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -614,24 +614,39 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
/**
* nfs4_purge_state_owners - Release all cached state owners
* @server: nfs_server with cached state owners to release
+ * @head: resulting list of state owners
*
* Called at umount time. Remaining state owners will be on
* the LRU with ref count of zero.
+ * Note that the state owners are not freed, but are added
+ * to the list @head, which can later be used as an argument
+ * to nfs4_free_state_owners.
*/
-void nfs4_purge_state_owners(struct nfs_server *server)
+void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state_owner *sp, *tmp;
- LIST_HEAD(doomed);
spin_lock(&clp->cl_lock);
list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
- list_move(&sp->so_lru, &doomed);
+ list_move(&sp->so_lru, head);
nfs4_remove_state_owner_locked(sp);
}
spin_unlock(&clp->cl_lock);
+}
- list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+/**
+ * nfs4_purge_state_owners - Release all cached state owners
+ * @head: resulting list of state owners
+ *
+ * Frees a list of state owners that was generated by
+ * nfs4_purge_state_owners
+ */
+void nfs4_free_state_owners(struct list_head *head)
+{
+ struct nfs4_state_owner *sp, *tmp;
+
+ list_for_each_entry_safe(sp, tmp, head, so_lru) {
list_del(&sp->so_lru);
nfs4_free_state_owner(sp);
}
@@ -1782,12 +1797,13 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
struct nfs4_state_owner *sp;
struct nfs_server *server;
struct rb_node *pos;
+ LIST_HEAD(freeme);
int status = 0;
restart:
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- nfs4_purge_state_owners(server);
+ nfs4_purge_state_owners(server, &freeme);
spin_lock(&clp->cl_lock);
for (pos = rb_first(&server->state_owners);
pos != NULL;
@@ -1816,6 +1832,7 @@ restart:
spin_unlock(&clp->cl_lock);
}
rcu_read_unlock();
+ nfs4_free_state_owners(&freeme);
return 0;
}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 549c916d2859..525684b0056f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1132,7 +1132,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
} else
*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
- if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) {
*p++ = cpu_to_be32(label->lfs);
*p++ = cpu_to_be32(label->pi);
*p++ = cpu_to_be32(label->len);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 28b013d1d44a..ceb6892d9bbd 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -566,7 +566,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
}
hdr->res.fattr = &hdr->fattr;
- hdr->res.count = count;
+ hdr->res.count = 0;
hdr->res.eof = 0;
hdr->res.verf = &hdr->verf;
nfs_fattr_init(&hdr->fattr);
@@ -768,8 +768,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
pageused = 0;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
+ nfs_list_move_request(req, &hdr->pages);
if (!last_page || last_page != req->wb_page) {
pageused++;
@@ -961,8 +960,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
}
if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &mirror->pg_list);
+ nfs_list_move_request(req, &mirror->pg_list);
mirror->pg_count += req->wb_bytes;
return 1;
}
@@ -994,9 +992,8 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
{
LIST_HEAD(head);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &head);
- desc->pg_completion_ops->error_cleanup(&head);
+ nfs_list_move_request(req, &head);
+ desc->pg_completion_ops->error_cleanup(&head, desc->pg_error);
}
/**
@@ -1132,7 +1129,8 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list,
+ desc->pg_error);
}
}
@@ -1234,21 +1232,23 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
- LIST_HEAD(failed);
+ LIST_HEAD(pages);
desc->pg_io_completion = hdr->io_completion;
desc->pg_dreq = hdr->dreq;
- while (!list_empty(&hdr->pages)) {
- struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ list_splice_init(&hdr->pages, &pages);
+ while (!list_empty(&pages)) {
+ struct nfs_page *req = nfs_list_entry(pages.next);
- nfs_list_remove_request(req);
if (!nfs_pageio_add_request(desc, req))
- nfs_list_add_request(req, &failed);
+ break;
}
nfs_pageio_complete(desc);
- if (!list_empty(&failed)) {
- list_move(&failed, &hdr->pages);
- return desc->pg_error < 0 ? desc->pg_error : -EIO;
+ if (!list_empty(&pages)) {
+ int err = desc->pg_error < 0 ? desc->pg_error : -EIO;
+ hdr->completion_ops->error_cleanup(&pages, err);
+ nfs_set_pgio_error(hdr, err, hdr->io_start);
+ return err;
}
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 96867fb159bf..ec04cce31814 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1319,10 +1319,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
const nfs4_stateid *res_stateid = NULL;
struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
- if (ret == 0) {
- arg_stateid = &args->stateid;
+ switch (ret) {
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ break;
+ case 0:
if (res->lrs_present)
res_stateid = &res->stateid;
+ /* Fallthrough */
+ default:
+ arg_stateid = &args->stateid;
}
pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
res_stateid);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index f7fd9192d4bc..eff93315572e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -589,7 +589,8 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
/* Emulate the eof flag, which isn't normally needed in NFSv2
* as it is guaranteed to always return the file attributes
*/
- if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+ if ((hdr->res.count == 0 && hdr->args.count > 0) ||
+ hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
hdr->res.eof = 1;
}
return 0;
@@ -610,8 +611,10 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- if (task->tk_status >= 0)
+ if (task->tk_status >= 0) {
+ hdr->res.count = hdr->args.count;
nfs_writeback_update_inode(hdr);
+ }
return 0;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 48d7277c60a9..09d5c282f50e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -205,7 +205,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
}
static void
-nfs_async_read_error(struct list_head *head)
+nfs_async_read_error(struct list_head *head, int error)
{
struct nfs_page *req;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 50ed3944d183..01b9d9341b54 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -783,7 +783,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *head;
- atomic_long_dec(&nfsi->nrequests);
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
head = req->wb_head;
@@ -796,8 +795,10 @@ static void nfs_inode_remove_request(struct nfs_page *req)
spin_unlock(&mapping->private_lock);
}
- if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+ if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
nfs_release_request(req);
+ atomic_long_dec(&nfsi->nrequests);
+ }
}
static void
@@ -1397,20 +1398,27 @@ static void nfs_redirty_request(struct nfs_page *req)
nfs_release_request(req);
}
-static void nfs_async_write_error(struct list_head *head)
+static void nfs_async_write_error(struct list_head *head, int error)
{
struct nfs_page *req;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
+ if (nfs_error_is_fatal(error)) {
+ nfs_context_set_write_error(req->wb_context, error);
+ if (nfs_error_is_fatal_on_server(error)) {
+ nfs_write_error_remove_page(req);
+ continue;
+ }
+ }
nfs_redirty_request(req);
}
}
static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
{
- nfs_async_write_error(&hdr->pages);
+ nfs_async_write_error(&hdr->pages, 0);
}
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 66eaeb1e8c2c..dc9586feab31 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -661,7 +661,7 @@ struct cld_net {
struct cld_upcall {
struct list_head cu_list;
struct cld_net *cu_net;
- struct task_struct *cu_task;
+ struct completion cu_done;
struct cld_msg cu_msg;
};
@@ -670,23 +670,18 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
{
int ret;
struct rpc_pipe_msg msg;
+ struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg);
memset(&msg, 0, sizeof(msg));
msg.data = cmsg;
msg.len = sizeof(*cmsg);
- /*
- * Set task state before we queue the upcall. That prevents
- * wake_up_process in the downcall from racing with schedule.
- */
- set_current_state(TASK_UNINTERRUPTIBLE);
ret = rpc_queue_upcall(pipe, &msg);
if (ret < 0) {
- set_current_state(TASK_RUNNING);
goto out;
}
- schedule();
+ wait_for_completion(&cup->cu_done);
if (msg.errno < 0)
ret = msg.errno;
@@ -753,7 +748,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
return -EFAULT;
- wake_up_process(cup->cu_task);
+ complete(&cup->cu_done);
return mlen;
}
@@ -768,7 +763,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg)
if (msg->errno >= 0)
return;
- wake_up_process(cup->cu_task);
+ complete(&cup->cu_done);
}
static const struct rpc_pipe_ops cld_upcall_ops = {
@@ -899,7 +894,7 @@ restart_search:
goto restart_search;
}
}
- new->cu_task = current;
+ init_completion(&new->cu_done);
new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid);
new->cu_net = cn;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 94128643ec1a..87ee9cbf7dcb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1502,11 +1502,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
{
u32 slotsize = slot_bytes(ca);
u32 num = ca->maxreqs;
- int avail;
+ unsigned long avail, total_avail;
spin_lock(&nfsd_drc_lock);
- avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
- nfsd_drc_max_mem - nfsd_drc_mem_used);
+ total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+ avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
+ /*
+ * Never use more than a third of the remaining memory,
+ * unless it's the only way to give this client a slot:
+ */
+ avail = clamp_t(unsigned long, avail, slotsize, total_avail/3);
num = min_t(int, num, avail / slotsize);
nfsd_drc_mem_used += num * slotsize;
spin_unlock(&nfsd_drc_lock);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e02bd2783124..4a9e0fb634b6 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -447,7 +447,7 @@ void nfsd_reset_versions(void)
*/
static void set_max_drc(void)
{
- #define NFSD_DRC_SIZE_SHIFT 10
+ #define NFSD_DRC_SIZE_SHIFT 7
nfsd_drc_max_mem = (nr_free_buffer_pages()
>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
nfsd_drc_mem_used = 0;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f55527ef21e8..06d1f2edf2ec 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -396,10 +396,23 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
bool get_write_count;
bool size_change = (iap->ia_valid & ATTR_SIZE);
- if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
+ if (iap->ia_valid & ATTR_SIZE) {
accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
- if (iap->ia_valid & ATTR_SIZE)
ftype = S_IFREG;
+ }
+
+ /*
+ * If utimes(2) and friends are called with times not NULL, we should
+ * not set NFSD_MAY_WRITE bit. Otherwise fh_verify->nfsd_permission
+ * will return EACCESS, when the caller's effective UID does not match
+ * the owner of the file, and the caller is not privileged. In this
+ * situation, we should return EPERM(notify_change will return this).
+ */
+ if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME)) {
+ accmode |= NFSD_MAY_OWNER_OVERRIDE;
+ if (!(iap->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET)))
+ accmode |= NFSD_MAY_WRITE;
+ }
/* Callers that do fh_verify should do the fh_want_write: */
get_write_count = !fhp->fh_dentry;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 99550f4bd159..7de0c9562b70 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2054,7 +2054,8 @@ out_write_size:
inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ if (handle)
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
if (handle)
ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2151,13 +2152,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is count in get_more_blocks according to write
+ * "pos" and "end", we need map twice to return different buffer state:
+ * 1. area in file size, not set NEW;
+ * 2. area out file size, set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2241,6 +2259,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 9f8250df99f1..f9b84f7a3e4b 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -99,25 +99,34 @@ out:
return ret;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[])
{
int status = 0;
unsigned int i;
struct buffer_head *bh;
+ int new_bh = 0;
trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
if (!nr)
goto bail;
+ /* Don't put buffer head and re-assign it to NULL if it is allocated
+ * outside since the caller can't be aware of this alternation!
+ */
+ new_bh = (bhs[0] == NULL);
+
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ break;
}
}
bh = bhs[i];
@@ -157,9 +166,26 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
submit_bh(REQ_OP_READ, 0, bh);
}
+read_failure:
for (i = nr; i > 0; i--) {
bh = bhs[i - 1];
+ if (unlikely(status)) {
+ if (new_bh && bh) {
+ /* If middle bh fails, let previous bh
+ * finish its read and then put it to
+ * aovoid bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i - 1] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
+ continue;
+ }
+
/* No need to wait on the buffer if it's managed by JBD. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -169,8 +195,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
* so we can safely record this and loop back
* to cleanup the other buffers. */
status = -EIO;
- put_bh(bh);
- bhs[i - 1] = NULL;
+ goto read_failure;
}
}
@@ -178,6 +203,9 @@ bail:
return status;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
@@ -187,6 +215,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
int i, ignore_cache = 0;
struct buffer_head *bh;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ int new_bh = 0;
trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
@@ -212,6 +241,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
goto bail;
}
+ /* Don't put buffer head and re-assign it to NULL if it is allocated
+ * outside since the caller can't be aware of this alternation!
+ */
+ new_bh = (bhs[0] == NULL);
+
ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
@@ -220,7 +254,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
ocfs2_metadata_cache_io_unlock(ci);
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ /* Don't forget to put previous bh! */
+ break;
}
}
bh = bhs[i];
@@ -314,16 +349,27 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
}
}
- status = 0;
-
+read_failure:
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
- if (status) {
- /* Clear the rest of the buffers on error */
- put_bh(bh);
- bhs[i] = NULL;
+ if (unlikely(status)) {
+ /* Clear the buffers on error including those
+ * ever succeeded in reading
+ */
+ if (new_bh && bh) {
+ /* If middle bh fails, let previous bh
+ * finish its read and then put it to
+ * aovoid bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
continue;
}
/* We know this can't have changed as we hold the
@@ -341,9 +387,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
* uptodate. */
status = -EIO;
clear_buffer_needs_validate(bh);
- put_bh(bh);
- bhs[i] = NULL;
- continue;
+ goto read_failure;
}
if (buffer_needs_validate(bh)) {
@@ -353,11 +397,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
status = validate(sb, bh);
- if (status) {
- put_bh(bh);
- bhs[i] = NULL;
- continue;
- }
+ if (status)
+ goto read_failure;
}
}
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 290373024d9d..e8ace3b54e9c 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -310,6 +310,18 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
out_attach:
spin_lock(&dentry_attach_lock);
+ if (unlikely(dentry->d_fsdata && !alias)) {
+ /* d_fsdata is set by a racing thread which is doing
+ * the same thing as this thread is doing. Leave the racing
+ * thread going ahead and we return here.
+ */
+ spin_unlock(&dentry_attach_lock);
+ iput(dl->dl_inode);
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ kfree(dl);
+ return 0;
+ }
+
dentry->d_fsdata = dl;
dl->dl_count++;
spin_unlock(&dentry_attach_lock);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 9b984cae4c4e..1d6dc8422899 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
{
char *buf;
- buf = (char *) get_zeroed_page(GFP_NOFS);
+ buf = (char *) get_zeroed_page(GFP_ATOMIC);
if (buf) {
dump_mle(mle, buf, PAGE_SIZE - 1);
free_page((unsigned long)buf);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 63d701cd1e2e..c8e9b7031d9a 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -105,7 +105,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
enum dlm_status status;
int actions = 0;
int in_use;
- u8 owner;
+ u8 owner;
+ int recovery_wait = 0;
mlog(0, "master_node = %d, valblk = %d\n", master_node,
flags & LKM_VALBLK);
@@ -208,9 +209,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
}
if (flags & LKM_CANCEL)
lock->cancel_pending = 0;
- else
- lock->unlock_pending = 0;
-
+ else {
+ if (!lock->unlock_pending)
+ recovery_wait = 1;
+ else
+ lock->unlock_pending = 0;
+ }
}
/* get an extra ref on lock. if we are just switching
@@ -244,6 +248,17 @@ leave:
spin_unlock(&res->spinlock);
wake_up(&res->wq);
+ if (recovery_wait) {
+ spin_lock(&res->spinlock);
+ /* Unlock request will directly succeed after owner dies,
+ * and the lock is already removed from grant list. We have to
+ * wait for RECOVERING done or we miss the chance to purge it
+ * since the removement is much faster than RECOVERING proc.
+ */
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
+ spin_unlock(&res->spinlock);
+ }
+
/* let the caller's final dlm_lock_put handle the actual kfree */
if (actions & DLM_UNLOCK_FREE_LOCK) {
/* this should always be coupled with list removal */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5193218f5889..e961015fb484 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3422,7 +3422,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
* we can recover correctly from node failure. Otherwise, we may get
* invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
*/
- if (!ocfs2_is_o2cb_active() &&
+ if (ocfs2_userspace_stack(osb) &&
lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
lvb = 1;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index ab30c005cc4b..9fa98abecfc6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -290,7 +290,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
if (inode_alloc)
inode_lock(inode_alloc);
- if (o2info_coherent(&fi->ifi_req)) {
+ if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index e5dcea6cee5f..2459ae9d2234 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,8 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(osb->ocfs2_wq);
+ if (osb->ocfs2_wq)
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -1017,7 +1018,8 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
mlog_errno(status);
}
- if (status == 0) {
+ /* Shutdown the kernel journal system */
+ if (!jbd2_journal_destroy(journal->j_journal) && !status) {
/*
* Do not toggle if flush was unsuccessful otherwise
* will leave dirty metadata in a "clean" journal
@@ -1026,9 +1028,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (status < 0)
mlog_errno(status);
}
-
- /* Shutdown the kernel journal system */
- jbd2_journal_destroy(journal->j_journal);
journal->j_journal = NULL;
OCFS2_I(inode)->ip_open_count--;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 5d53d0d63d19..ea38677daa06 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -391,7 +391,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(osb->ocfs2_wq);
+ if (osb->ocfs2_wq)
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index f55f82ca3425..1565dd8e8856 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -25,6 +25,7 @@
#include "ocfs2_ioctl.h"
#include "alloc.h"
+#include "localalloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -222,6 +223,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
struct ocfs2_refcount_tree *ref_tree = NULL;
u32 new_phys_cpos, new_len;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ int need_free = 0;
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
BUG_ON(!ocfs2_is_refcount_inode(inode));
@@ -312,6 +314,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
if (!partial) {
context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
ret = -ENOSPC;
+ need_free = 1;
goto out_commit;
}
}
@@ -336,6 +339,20 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
mlog_errno(ret);
out_commit:
+ if (need_free && context->data_ac) {
+ struct ocfs2_alloc_context *data_ac = context->data_ac;
+
+ if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ new_phys_cpos, new_len);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
+ new_len);
+ }
+
ocfs2_commit_trans(osb, handle);
out_unlock_mutex:
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b39d14cbfa34..d212d09c00b1 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -727,7 +727,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
goto out;
/* Running from downconvert thread? Postpone quota processing to wq */
if (current == osb->dc_task) {
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index d6c350ba25b9..c4b029c43464 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
*/
static struct ocfs2_stack_plugin *active_stack;
-inline int ocfs2_is_o2cb_active(void)
-{
- return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
-}
-EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
-
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
{
struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index e3036e1790e8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
-/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
-int ocfs2_is_o2cb_active(void);
-
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fb0a4eec310c..77740ef5a8e8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3832,7 +3832,6 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
int low_bucket = 0, bucket, high_bucket;
struct ocfs2_xattr_bucket *search;
- u32 last_hash;
u64 blkno, lower_blkno = 0;
search = ocfs2_xattr_bucket_new(inode);
@@ -3876,8 +3875,6 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
if (xh->xh_count)
xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
- last_hash = le32_to_cpu(xe->xe_name_hash);
-
/* record lower_blkno which may be the insert place. */
lower_blkno = blkno;
diff --git a/fs/open.c b/fs/open.c
index f4ea0dc88823..29a2cdcbcb17 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -379,6 +379,25 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
override_cred->cap_permitted;
}
+ /*
+ * The new set of credentials can *only* be used in
+ * task-synchronous circumstances, and does not need
+ * RCU freeing, unless somebody then takes a separate
+ * reference to it.
+ *
+ * NOTE! This is _only_ true because this credential
+ * is used purely for override_creds() that installs
+ * it as the subjective cred. Other threads will be
+ * accessing ->real_cred, not the subjective cred.
+ *
+ * If somebody _does_ make a copy of this (using the
+ * 'get_current_cred()' function), that will clear the
+ * non_rcu field, because now that other user may be
+ * expecting RCU freeing. But normal thread-synchronous
+ * cred accesses will keep things non-RCY.
+ */
+ override_cred->non_rcu = 1;
+
old_cred = override_creds(override_cred);
retry:
res = user_path_at(dfd, filename, lookup_flags, &path);
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 079a465796f3..bc56df2ae705 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -323,7 +323,7 @@ static ssize_t sysfs_service_op_show(struct kobject *kobj,
/* Can't do a service_operation if the client is not running... */
rc = is_daemon_in_service();
if (rc) {
- pr_info("%s: Client not running :%d:\n",
+ pr_info_ratelimited("%s: Client not running :%d:\n",
__func__,
is_daemon_in_service());
goto out;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index ef11fa7b869e..8c561703275a 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1042,7 +1042,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
if (newdentry == trap)
goto out_dput;
- if (WARN_ON(olddentry->d_inode == newdentry->d_inode))
+ if (olddentry->d_inode == newdentry->d_inode)
goto out_dput;
err = 0;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index efed50304b49..30a1c7fc8c75 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -265,7 +265,8 @@ static bool ovl_can_list(const char *s)
return true;
/* Never list trusted.overlay, list other trusted for superuser only */
- return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
+ return !ovl_is_private_xattr(s) &&
+ ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 4ac811e1a26c..37c7ed0dc820 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -448,7 +448,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
* a program is not able to use ptrace(2) in that case. It is
* safe because the task has stopped executing permanently.
*/
- if (permitted && (task->flags & PF_DUMPCORE)) {
+ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
if (try_get_task_stack(task)) {
eip = KSTK_EIP(task);
esp = KSTK_ESP(task);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1491918a33c3..0c952c217118 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -42,10 +42,12 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
+
if (!ppage || PageSlab(ppage))
pcount = 0;
else
@@ -214,10 +216,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
if (put_user(stable_page_flags(ppage), out)) {
ret = -EFAULT;
@@ -259,10 +262,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
if (ppage)
ino = page_cgroup_ino(ppage);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 555698ddb943..12bac452738d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -498,6 +498,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
if (root->set_ownership)
root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
+ else {
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ }
return inode;
}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 885d445afa0d..ce400f97370d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -165,6 +165,16 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
}
/*
+ * Architectures which support memory encryption override this.
+ */
+ssize_t __weak
+copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf)
+{
+ return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
+}
+
+/*
* Copy to either kernel or user space
*/
static int copy_to(void *target, void *src, size_t size, int userbuf)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 40bfc6c58374..1e675be10926 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -297,6 +297,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
GFP_KERNEL);
if (!tmp_prz)
return -ENOMEM;
+ prz = tmp_prz;
free_prz = true;
while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) {
@@ -319,7 +320,6 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
goto out;
}
record->id = 0;
- prz = tmp_prz;
}
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 4cd0c2336624..3254c90fd899 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -491,7 +491,7 @@ int dquot_release(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
goto out_dqlock;
if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
@@ -617,7 +617,7 @@ EXPORT_SYMBOL(dquot_scan_active);
/* Write all dquot structures to quota files */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
- struct list_head *dirty;
+ struct list_head dirty;
struct dquot *dquot;
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
@@ -631,9 +631,10 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
if (!sb_has_quota_active(sb, cnt))
continue;
spin_lock(&dq_list_lock);
- dirty = &dqopt->info[cnt].dqi_dirty_list;
- while (!list_empty(dirty)) {
- dquot = list_first_entry(dirty, struct dquot,
+ /* Move list away to avoid livelock. */
+ list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty);
+ while (!list_empty(&dirty)) {
+ dquot = list_first_entry(&dirty, struct dquot,
dq_dirty);
WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
@@ -1989,8 +1990,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
&warn_to[cnt]);
if (ret)
goto over_quota;
- ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0,
- &warn_to[cnt]);
+ ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space,
+ DQUOT_SPACE_WARN, &warn_to[cnt]);
if (ret) {
spin_lock(&transfer_to[cnt]->dq_dqb_lock);
dquot_decr_inodes(transfer_to[cnt], inode_usage);
diff --git a/fs/read_write.c b/fs/read_write.c
index d6f8bfb0f794..e8136a72c13f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1709,6 +1709,34 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
+/*
+ * Ensure that we don't remap a partial EOF block in the middle of something
+ * else. Assume that the offsets have already been checked for block
+ * alignment.
+ *
+ * For deduplication we always scale down to the previous block because we
+ * can't meaningfully compare post-EOF contents.
+ *
+ * For clone we only link a partial EOF block above the destination file's EOF.
+ */
+static int generic_remap_check_len(struct inode *inode_in,
+ struct inode *inode_out,
+ loff_t pos_out,
+ u64 *len,
+ bool is_dedupe)
+{
+ u64 blkmask = i_blocksize(inode_in) - 1;
+
+ if ((*len & blkmask) == 0)
+ return 0;
+
+ if (is_dedupe)
+ *len &= ~blkmask;
+ else if (pos_out + *len < i_size_read(inode_out))
+ return -EINVAL;
+
+ return 0;
+}
/*
* Check that the two inodes are eligible for cloning, the ranges make
@@ -1815,6 +1843,11 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
return -EBADE;
}
+ ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+ is_dedupe);
+ if (ret)
+ return ret;
+
return 1;
}
EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
@@ -1882,10 +1915,7 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
}
EXPORT_SYMBOL(vfs_clone_file_range);
-/*
- * Read a page's worth of file data into the page cache. Return the page
- * locked.
- */
+/* Read a page's worth of file data into the page cache. */
static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
{
struct address_space *mapping;
@@ -1901,11 +1931,33 @@ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
put_page(page);
return ERR_PTR(-EIO);
}
- lock_page(page);
return page;
}
/*
+ * Lock two pages, ensuring that we lock in offset order if the pages are from
+ * the same file.
+ */
+static void vfs_lock_two_pages(struct page *page1, struct page *page2)
+{
+ /* Always lock in order of increasing index. */
+ if (page1->index > page2->index)
+ swap(page1, page2);
+
+ lock_page(page1);
+ if (page1 != page2)
+ lock_page(page2);
+}
+
+/* Unlock two pages, being careful not to unlock the same page twice. */
+static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
+{
+ unlock_page(page1);
+ if (page1 != page2)
+ unlock_page(page2);
+}
+
+/*
* Compare extents of two files to see if they are the same.
* Caller must have locked both inodes to prevent write races.
*/
@@ -1942,10 +1994,24 @@ int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
dest_page = vfs_dedupe_get_page(dest, destoff);
if (IS_ERR(dest_page)) {
error = PTR_ERR(dest_page);
- unlock_page(src_page);
put_page(src_page);
goto out_error;
}
+
+ vfs_lock_two_pages(src_page, dest_page);
+
+ /*
+ * Now that we've locked both pages, make sure they're still
+ * mapped to the file data we're interested in. If not,
+ * someone is invalidating pages on us and we lose.
+ */
+ if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
+ src_page->mapping != src->i_mapping ||
+ dest_page->mapping != dest->i_mapping) {
+ same = false;
+ goto unlock;
+ }
+
src_addr = kmap_atomic(src_page);
dest_addr = kmap_atomic(dest_page);
@@ -1957,8 +2023,8 @@ int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
kunmap_atomic(dest_addr);
kunmap_atomic(src_addr);
- unlock_page(dest_page);
- unlock_page(src_page);
+unlock:
+ vfs_unlock_two_pages(src_page, dest_page);
put_page(dest_page);
put_page(src_page);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 11a48affa882..683496322aa8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2096,6 +2096,15 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
goto out_inserted_sd;
}
+ /*
+ * Mark it private if we're creating the privroot
+ * or something under it.
+ */
+ if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) {
+ inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
+
if (reiserfs_posixacl(inode->i_sb)) {
reiserfs_write_unlock(inode->i_sb);
retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
@@ -2110,8 +2119,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
reiserfs_warning(inode->i_sb, "jdm-13090",
"ACLs aren't enabled in the fs, "
"but vfs thinks they are!");
- } else if (IS_PRIVATE(dir))
- inode->i_flags |= S_PRIVATE;
+ }
if (security->name) {
reiserfs_write_unlock(inode->i_sb);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 5089dac02660..14ba7a12b89d 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -377,10 +377,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
/*
* Propagate the private flag so we know we're
- * in the priv tree
+ * in the priv tree. Also clear IOP_XATTR
+ * since we don't have xattrs on xattr files.
*/
- if (IS_PRIVATE(dir))
+ if (IS_PRIVATE(dir)) {
inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
}
reiserfs_write_unlock(dir->i_sb);
if (retval == IO_ERROR) {
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index eabf85371ece..0efe7c7c4124 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1168,6 +1168,8 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
+extern const struct xattr_handler *reiserfs_xattr_handlers[];
+
/*
* this says about version of key of all items (but stat data) the
* object consists of
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 4885c7b6e44f..cc0b22c72e83 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2052,6 +2052,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (replay_only(s))
goto error_unlocked;
+ s->s_xattr = reiserfs_xattr_handlers;
+
if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
SWARN(silent, s, "clm-7000",
"Detected readonly device, marking FS readonly");
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 83423192588c..29a0c0969e91 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -122,13 +122,13 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
struct dentry *xaroot;
if (d_really_is_negative(privroot))
- return ERR_PTR(-ENODATA);
+ return ERR_PTR(-EOPNOTSUPP);
inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
xaroot = dget(REISERFS_SB(sb)->xattr_root);
if (!xaroot)
- xaroot = ERR_PTR(-ENODATA);
+ xaroot = ERR_PTR(-EOPNOTSUPP);
else if (d_really_is_negative(xaroot)) {
int err = -ENODATA;
@@ -610,6 +610,10 @@ int reiserfs_xattr_set(struct inode *inode, const char *name,
int error, error2;
size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
+ /* Check before we start a transaction and then do nothing. */
+ if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
+ return -EOPNOTSUPP;
+
if (!(flags & XATTR_REPLACE))
jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
@@ -832,8 +836,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
if (d_really_is_negative(dentry))
return -EINVAL;
- if (!dentry->d_sb->s_xattr ||
- get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
+ if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
@@ -873,6 +876,7 @@ static int create_privroot(struct dentry *dentry)
}
d_inode(dentry)->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_opflags &= ~IOP_XATTR;
reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
"storage.\n", PRIVROOT_NAME);
@@ -886,7 +890,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-static const struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler *reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
@@ -957,8 +961,10 @@ int reiserfs_lookup_privroot(struct super_block *s)
if (!IS_ERR(dentry)) {
REISERFS_SB(s)->priv_root = dentry;
d_set_d_op(dentry, &xattr_lookup_poison_ops);
- if (d_really_is_positive(dentry))
+ if (d_really_is_positive(dentry)) {
d_inode(dentry)->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_opflags &= ~IOP_XATTR;
+ }
} else
err = PTR_ERR(dentry);
inode_unlock(d_inode(s->s_root));
@@ -987,7 +993,6 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
}
if (d_really_is_positive(privroot)) {
- s->s_xattr = reiserfs_xattr_handlers;
inode_lock(d_inode(privroot));
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index aa9380bac196..05f666794561 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -320,10 +320,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
* would be useless since permissions are ignored, and a pain because
* it introduces locking cycles
*/
- if (IS_PRIVATE(dir)) {
- inode->i_flags |= S_PRIVATE;
+ if (IS_PRIVATE(inode))
goto apply_umask;
- }
err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (err)
diff --git a/fs/statfs.c b/fs/statfs.c
index c25dd9a26cc1..ca1084cbe03c 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -304,19 +304,10 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *,
static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
{
struct compat_statfs64 buf;
- if (sizeof(ubuf->f_bsize) == 4) {
- if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
- kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
- return -EOVERFLOW;
- /* f_files and f_ffree may be -1; it's okay
- * to stuff that into 32 bits */
- if (kbuf->f_files != 0xffffffffffffffffULL
- && (kbuf->f_files & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- if (kbuf->f_ffree != 0xffffffffffffffffULL
- && (kbuf->f_ffree & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- }
+
+ if ((kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+ return -EOVERFLOW;
+
memset(&buf, 0, sizeof(struct compat_statfs64));
buf.f_type = kbuf->f_type;
buf.f_bsize = kbuf->f_bsize;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index ba3d0e0f8615..c7828db206bc 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1164,8 +1164,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
* o exact match, i.e. the found zero-level znode contains key @key, then %1
* is returned and slot number of the matched branch is stored in @n;
* o not exact match, which means that zero-level znode does not contain
- * @key, then %0 is returned and slot number of the closest branch is stored
- * in @n;
+ * @key, then %0 is returned and slot number of the closest branch or %-1
+ * is stored in @n; In this case calling tnc_next() is mandatory.
* o @key is so small that it is even less than the lowest key of the
* leftmost zero-level node, then %0 is returned and %0 is stored in @n.
*
@@ -1882,13 +1882,19 @@ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
static int search_dh_cookie(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_dent_node *dent, uint32_t cookie,
- struct ubifs_znode **zn, int *n)
+ struct ubifs_znode **zn, int *n, int exact)
{
int err;
struct ubifs_znode *znode = *zn;
struct ubifs_zbranch *zbr;
union ubifs_key *dkey;
+ if (!exact) {
+ err = tnc_next(c, &znode, n);
+ if (err)
+ return err;
+ }
+
for (;;) {
zbr = &znode->zbranch[*n];
dkey = &zbr->key;
@@ -1930,7 +1936,7 @@ static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
if (unlikely(err < 0))
goto out_unlock;
- err = search_dh_cookie(c, key, dent, cookie, &znode, &n);
+ err = search_dh_cookie(c, key, dent, cookie, &znode, &n, err);
out_unlock:
mutex_unlock(&c->tnc_mutex);
@@ -2716,7 +2722,7 @@ int ubifs_tnc_remove_dh(struct ubifs_info *c, const union ubifs_key *key,
if (unlikely(err < 0))
goto out_free;
- err = search_dh_cookie(c, key, dent, cookie, &znode, &n);
+ err = search_dh_cookie(c, key, dent, cookie, &znode, &n, err);
if (err)
goto out_free;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 28b9d7cca29b..3c1b54091d6c 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -470,13 +470,15 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
return NULL;
}
-/* Extend the file by 'blocks' blocks, return the number of extents added */
+/* Extend the file with new blocks totaling 'new_block_bytes',
+ * return the number of extents added
+ */
static int udf_do_extend_file(struct inode *inode,
struct extent_position *last_pos,
struct kernel_long_ad *last_ext,
- sector_t blocks)
+ loff_t new_block_bytes)
{
- sector_t add;
+ uint32_t add;
int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
struct super_block *sb = inode->i_sb;
struct kernel_lb_addr prealloc_loc = {};
@@ -486,7 +488,7 @@ static int udf_do_extend_file(struct inode *inode,
/* The previous extent is fake and we should not extend by anything
* - there's nothing to do... */
- if (!blocks && fake)
+ if (!new_block_bytes && fake)
return 0;
iinfo = UDF_I(inode);
@@ -517,13 +519,12 @@ static int udf_do_extend_file(struct inode *inode,
/* Can we merge with the previous extent? */
if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
EXT_NOT_RECORDED_NOT_ALLOCATED) {
- add = ((1 << 30) - sb->s_blocksize -
- (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) >>
- sb->s_blocksize_bits;
- if (add > blocks)
- add = blocks;
- blocks -= add;
- last_ext->extLength += add << sb->s_blocksize_bits;
+ add = (1 << 30) - sb->s_blocksize -
+ (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
+ if (add > new_block_bytes)
+ add = new_block_bytes;
+ new_block_bytes -= add;
+ last_ext->extLength += add;
}
if (fake) {
@@ -544,28 +545,27 @@ static int udf_do_extend_file(struct inode *inode,
}
/* Managed to do everything necessary? */
- if (!blocks)
+ if (!new_block_bytes)
goto out;
/* All further extents will be NOT_RECORDED_NOT_ALLOCATED */
last_ext->extLocation.logicalBlockNum = 0;
last_ext->extLocation.partitionReferenceNum = 0;
- add = (1 << (30-sb->s_blocksize_bits)) - 1;
- last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
- (add << sb->s_blocksize_bits);
+ add = (1 << 30) - sb->s_blocksize;
+ last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | add;
/* Create enough extents to cover the whole hole */
- while (blocks > add) {
- blocks -= add;
+ while (new_block_bytes > add) {
+ new_block_bytes -= add;
err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
if (err)
return err;
count++;
}
- if (blocks) {
+ if (new_block_bytes) {
last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
- (blocks << sb->s_blocksize_bits);
+ new_block_bytes;
err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
if (err)
@@ -596,6 +596,24 @@ out:
return count;
}
+/* Extend the final block of the file to final_block_len bytes */
+static void udf_do_extend_final_block(struct inode *inode,
+ struct extent_position *last_pos,
+ struct kernel_long_ad *last_ext,
+ uint32_t final_block_len)
+{
+ struct super_block *sb = inode->i_sb;
+ uint32_t added_bytes;
+
+ added_bytes = final_block_len -
+ (last_ext->extLength & (sb->s_blocksize - 1));
+ last_ext->extLength += added_bytes;
+ UDF_I(inode)->i_lenExtents += added_bytes;
+
+ udf_write_aext(inode, last_pos, &last_ext->extLocation,
+ last_ext->extLength, 1);
+}
+
static int udf_extend_file(struct inode *inode, loff_t newsize)
{
@@ -605,10 +623,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
int8_t etype;
struct super_block *sb = inode->i_sb;
sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
+ unsigned long partial_final_block;
int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
struct kernel_long_ad extent;
- int err;
+ int err = 0;
+ int within_final_block;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -618,18 +638,8 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
BUG();
etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
+ within_final_block = (etype != -1);
- /* File has extent covering the new size (could happen when extending
- * inside a block)? */
- if (etype != -1)
- return 0;
- if (newsize & (sb->s_blocksize - 1))
- offset++;
- /* Extended file just to the boundary of the last file block? */
- if (offset == 0)
- return 0;
-
- /* Truncate is extending the file by 'offset' blocks */
if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
(epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
/* File has no extents at all or has empty last
@@ -643,7 +653,22 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
&extent.extLength, 0);
extent.extLength |= etype << 30;
}
- err = udf_do_extend_file(inode, &epos, &extent, offset);
+
+ partial_final_block = newsize & (sb->s_blocksize - 1);
+
+ /* File has extent covering the new size (could happen when extending
+ * inside a block)?
+ */
+ if (within_final_block) {
+ /* Extending file within the last file block */
+ udf_do_extend_final_block(inode, &epos, &extent,
+ partial_final_block);
+ } else {
+ loff_t add = ((loff_t)offset << sb->s_blocksize_bits) |
+ partial_final_block;
+ err = udf_do_extend_file(inode, &epos, &extent, add);
+ }
+
if (err < 0)
goto out;
err = 0;
@@ -745,6 +770,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
/* Are we beyond EOF? */
if (etype == -1) {
int ret;
+ loff_t hole_len;
isBeyondEOF = true;
if (count) {
if (c)
@@ -760,7 +786,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
startnum = (offset > 0);
}
/* Create extents for the hole between EOF and offset */
- ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
+ hole_len = (loff_t)offset << inode->i_blkbits;
+ ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len);
if (ret < 0) {
*err = ret;
newblock = 0;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7a908d683258..a609d480606d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -854,6 +854,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
+ bool still_valid;
ACCESS_ONCE(ctx->released) = true;
@@ -869,8 +870,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* taking the mmap_sem for writing.
*/
down_write(&mm->mmap_sem);
- if (!mmget_still_valid(mm))
- goto skip_mm;
+ still_valid = mmget_still_valid(mm);
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
@@ -881,19 +881,20 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
continue;
}
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
- prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX);
- if (prev)
- vma = prev;
- else
- prev = vma;
+ if (still_valid) {
+ prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+ new_flags, vma->anon_vma,
+ vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev)
+ vma = prev;
+ else
+ prev = vma;
+ }
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
-skip_mm:
up_write(&mm->mmap_sem);
mmput(mm);
wakeup:
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a3cc8afed367..7b25a88569c9 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4006,15 +4006,28 @@ xfs_bmapi_read(
XFS_STATS_INC(mp, xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!ifp) {
+ /* No CoW fork? Return a hole. */
+ if (whichfork == XFS_COW_FORK) {
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount = len;
+ mval->br_state = XFS_EXT_NORM;
+ *nmap = 1;
+ return 0;
+ }
- /* No CoW fork? Return a hole. */
- if (whichfork == XFS_COW_FORK && !ifp) {
- mval->br_startoff = bno;
- mval->br_startblock = HOLESTARTBLOCK;
- mval->br_blockcount = len;
- mval->br_state = XFS_EXT_NORM;
- *nmap = 1;
- return 0;
+ /*
+ * A missing attr ifork implies that the inode says we're in
+ * extents or btree format but failed to pass the inode fork
+ * verifier while trying to load it. Treat that as a file
+ * corruption too.
+ */
+#ifdef DEBUG
+ xfs_alert(mp, "%s: inode %llu missing fork %d",
+ __func__, ip->i_ino, whichfork);
+#endif /* DEBUG */
+ return -EFSCORRUPTED;
}
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 16f93d7356b7..e5970ecdfd58 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -58,6 +58,32 @@ static kmem_zone_t *xfs_buf_zone;
#define xb_to_gfp(flags) \
((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
+/*
+ * Locking orders
+ *
+ * xfs_buf_ioacct_inc:
+ * xfs_buf_ioacct_dec:
+ * b_sema (caller holds)
+ * b_lock
+ *
+ * xfs_buf_stale:
+ * b_sema (caller holds)
+ * b_lock
+ * lru_lock
+ *
+ * xfs_buf_rele:
+ * b_lock
+ * pag_buf_lock
+ * lru_lock
+ *
+ * xfs_buftarg_wait_rele
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ *
+ * xfs_buftarg_isolate
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ */
static inline int
xfs_buf_is_vmapped(
@@ -983,8 +1009,18 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
- release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ /*
+ * We grab the b_lock here first to serialise racing xfs_buf_rele()
+ * calls. The pag_buf_lock being taken on the last reference only
+ * serialises against racing lookups in xfs_buf_find(). IOWs, the second
+ * to last reference we drop here is not serialised against the last
+ * reference until we take bp->b_lock. Hence if we don't grab b_lock
+ * first, the last "release" reference can win the race to the lock and
+ * free the buffer before the second-to-last reference is processed,
+ * leading to a use-after-free scenario.
+ */
spin_lock(&bp->b_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
@@ -1702,7 +1738,7 @@ xfs_buftarg_isolate(
* zero. If the value is already zero, we need to reclaim the
* buffer, otherwise it gets another trip through the LRU.
*/
- if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
spin_unlock(&bp->b_lock);
return LRU_ROTATE;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index fa0bc4d46065..5f616a6a5358 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -252,6 +252,32 @@ xfs_compat_ioc_bulkstat(
int done;
int error;
+ /*
+ * Output structure handling functions. Depending on the command,
+ * either the xfs_bstat and xfs_inogrp structures are written out
+ * to userpace memory via bulkreq.ubuffer. Normally the compat
+ * functions and structure size are the correct ones to use ...
+ */
+ inumbers_fmt_pf inumbers_func = xfs_inumbers_fmt_compat;
+ bulkstat_one_pf bs_one_func = xfs_bulkstat_one_compat;
+ size_t bs_one_size = sizeof(struct compat_xfs_bstat);
+
+#ifdef CONFIG_X86_X32
+ if (in_x32_syscall()) {
+ /*
+ * ... but on x32 the input xfs_fsop_bulkreq has pointers
+ * which must be handled in the "compat" (32-bit) way, while
+ * the xfs_bstat and xfs_inogrp structures follow native 64-
+ * bit layout convention. So adjust accordingly, otherwise
+ * the data written out in compat layout will not match what
+ * x32 userspace expects.
+ */
+ inumbers_func = xfs_inumbers_fmt;
+ bs_one_func = xfs_bulkstat_one;
+ bs_one_size = sizeof(struct xfs_bstat);
+ }
+#endif
+
/* done = 1 if there are more stats to get and if bulkstat */
/* should be called again (unused here, but used in dmapi) */
@@ -283,15 +309,15 @@ xfs_compat_ioc_bulkstat(
if (cmd == XFS_IOC_FSINUMBERS_32) {
error = xfs_inumbers(mp, &inlast, &count,
- bulkreq.ubuffer, xfs_inumbers_fmt_compat);
+ bulkreq.ubuffer, inumbers_func);
} else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
int res;
- error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
- sizeof(compat_xfs_bstat_t), NULL, &res);
+ error = bs_one_func(mp, inlast, bulkreq.ubuffer,
+ bs_one_size, NULL, &res);
} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
error = xfs_bulkstat(mp, &inlast, &count,
- xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
+ bs_one_func, bs_one_size,
bulkreq.ubuffer, &done);
} else
error = -EINVAL;
@@ -347,6 +373,7 @@ xfs_compat_attrlist_by_handle(
{
int error;
attrlist_cursor_kern_t *cursor;
+ compat_xfs_fsop_attrlist_handlereq_t __user *p = arg;
compat_xfs_fsop_attrlist_handlereq_t al_hreq;
struct dentry *dentry;
char *kbuf;
@@ -381,6 +408,11 @@ xfs_compat_attrlist_by_handle(
if (error)
goto out_kfree;
+ if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
+ error = -EFAULT;
+ goto out_kfree;
+ }
+
if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
error = -EFAULT;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1daa965f1e08..4e6f2c8574f7 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -789,6 +789,7 @@ xfs_setattr_nonsize(
out_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 488719d43ca8..cdcb7235e41a 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1214,13 +1214,11 @@ xfs_rtmount_inodes(
xfs_sb_t *sbp;
sbp = &mp->m_sb;
- if (sbp->sb_rbmino == NULLFSINO)
- return 0;
error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
if (error)
return error;
ASSERT(mp->m_rbmip != NULL);
- ASSERT(sbp->sb_rsumino != NULLFSINO);
+
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
if (error) {
IRELE(mp->m_rbmip);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0b0282d2f011..10151c9268e1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1715,6 +1715,7 @@ xfs_fs_fill_super(
out_close_devices:
xfs_close_devices(mp);
out_free_fsname:
+ sb->s_fs_info = NULL;
xfs_free_fsname(mp);
kfree(mp);
out:
@@ -1732,6 +1733,10 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
+ /* if ->fill_super failed, we have no mount to tear down */
+ if (!sb->s_fs_info)
+ return;
+
xfs_notice(mp, "Unmounting Filesystem");
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
@@ -1741,6 +1746,8 @@ xfs_fs_put_super(
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
+
+ sb->s_fs_info = NULL;
xfs_free_fsname(mp);
kfree(mp);
}
@@ -1760,6 +1767,9 @@ xfs_fs_nr_cached_objects(
struct super_block *sb,
struct shrink_control *sc)
{
+ /* Paranoia: catch incorrect calls during mount setup or teardown */
+ if (WARN_ON_ONCE(!sb->s_fs_info))
+ return 0;
return xfs_reclaim_inodes_count(XFS_M(sb));
}