summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/affs/namei.c4
-rw-r--r--fs/afs/cmservice.c25
-rw-r--r--fs/afs/dir.c47
-rw-r--r--fs/afs/file.c1
-rw-r--r--fs/afs/inode.c1
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/main.c10
-rw-r--r--fs/afs/mntpt.c1
-rw-r--r--fs/afs/xattr.c23
-rw-r--r--fs/aio.c184
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/binfmt_misc.c29
-rw-r--r--fs/block_dev.c24
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/async-thread.c14
-rw-r--r--fs/btrfs/backref.c178
-rw-r--r--fs/btrfs/block-group.c86
-rw-r--r--fs/btrfs/block-group.h3
-rw-r--r--fs/btrfs/btrfs_inode.h15
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c50
-rw-r--r--fs/btrfs/ctree.h16
-rw-r--r--fs/btrfs/delalloc-space.c2
-rw-r--r--fs/btrfs/delayed-inode.c23
-rw-r--r--fs/btrfs/disk-io.c94
-rw-r--r--fs/btrfs/extent-tree.c26
-rw-r--r--fs/btrfs/extent_io.c15
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file-item.c10
-rw-r--r--fs/btrfs/file.c26
-rw-r--r--fs/btrfs/free-space-cache.c8
-rw-r--r--fs/btrfs/free-space-tree.c10
-rw-r--r--fs/btrfs/inode.c167
-rw-r--r--fs/btrfs/ioctl.c43
-rw-r--r--fs/btrfs/qgroup.c377
-rw-r--r--fs/btrfs/qgroup.h5
-rw-r--r--fs/btrfs/raid56.c58
-rw-r--r--fs/btrfs/relocation.c12
-rw-r--r--fs/btrfs/root-tree.c3
-rw-r--r--fs/btrfs/scrub.c21
-rw-r--r--fs/btrfs/send.c30
-rw-r--r--fs/btrfs/space-info.c5
-rw-r--r--fs/btrfs/transaction.c33
-rw-r--r--fs/btrfs/transaction.h17
-rw-r--r--fs/btrfs/tree-checker.c16
-rw-r--r--fs/btrfs/tree-log.c285
-rw-r--r--fs/btrfs/tree-log.h14
-rw-r--r--fs/btrfs/volumes.c31
-rw-r--r--fs/btrfs/xattr.c31
-rw-r--r--fs/ceph/addr.c10
-rw-r--r--fs/ceph/caps.c32
-rw-r--r--fs/ceph/export.c4
-rw-r--r--fs/ceph/file.c33
-rw-r--r--fs/ceph/inode.c3
-rw-r--r--fs/ceph/mds_client.c42
-rw-r--r--fs/ceph/snap.c54
-rw-r--r--fs/ceph/super.h5
-rw-r--r--fs/cifs/cifs_dfs_ref.c3
-rw-r--r--fs/cifs/cifs_unicode.c9
-rw-r--r--fs/cifs/cifsfs.c5
-rw-r--r--fs/cifs/cifsglob.h3
-rw-r--r--fs/cifs/connect.c10
-rw-r--r--fs/cifs/dir.c22
-rw-r--r--fs/cifs/file.c38
-rw-r--r--fs/cifs/link.c3
-rw-r--r--fs/cifs/sess.c2
-rw-r--r--fs/cifs/smb2misc.c4
-rw-r--r--fs/cifs/smb2ops.c16
-rw-r--r--fs/cifs/smb2pdu.c26
-rw-r--r--fs/cifs/smb2pdu.h16
-rw-r--r--fs/cifs/transport.c31
-rw-r--r--fs/configfs/dir.c20
-rw-r--r--fs/configfs/file.c16
-rw-r--r--fs/crypto/fname.c9
-rw-r--r--fs/crypto/hooks.c44
-rw-r--r--fs/dax.c13
-rw-r--r--fs/debugfs/file.c10
-rw-r--r--fs/debugfs/inode.c7
-rw-r--r--fs/devpts/inode.c2
-rw-r--r--fs/direct-io.c5
-rw-r--r--fs/dlm/config.c9
-rw-r--r--fs/dlm/debug_fs.c1
-rw-r--r--fs/dlm/lock.c9
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/ecryptfs/crypto.c6
-rw-r--r--fs/ecryptfs/main.c6
-rw-r--r--fs/erofs/erofs_fs.h3
-rw-r--r--fs/erofs/inode.c7
-rw-r--r--fs/erofs/super.c4
-rw-r--r--fs/erofs/xattr.c10
-rw-r--r--fs/erofs/zdata.c15
-rw-r--r--fs/erofs/zmap.c10
-rw-r--r--fs/erofs/zpvec.h14
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext2/balloc.c14
-rw-r--r--fs/ext2/super.c6
-rw-r--r--fs/ext4/balloc.c38
-rw-r--r--fs/ext4/dir.c6
-rw-r--r--fs/ext4/ext4.h7
-rw-r--r--fs/ext4/extents.c138
-rw-r--r--fs/ext4/extents_status.c4
-rw-r--r--fs/ext4/ialloc.c59
-rw-r--r--fs/ext4/inline.c31
-rw-r--r--fs/ext4/inode.c64
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c17
-rw-r--r--fs/ext4/migrate.c23
-rw-r--r--fs/ext4/namei.c58
-rw-r--r--fs/ext4/page-io.c4
-rw-r--r--fs/ext4/resize.c5
-rw-r--r--fs/ext4/super.c93
-rw-r--r--fs/ext4/symlink.c11
-rw-r--r--fs/ext4/sysfs.c7
-rw-r--r--fs/ext4/verity.c89
-rw-r--r--fs/ext4/xattr.c6
-rw-r--r--fs/f2fs/checkpoint.c13
-rw-r--r--fs/f2fs/data.c31
-rw-r--r--fs/f2fs/dir.c15
-rw-r--r--fs/f2fs/f2fs.h35
-rw-r--r--fs/f2fs/file.c13
-rw-r--r--fs/f2fs/gc.c11
-rw-r--r--fs/f2fs/inline.c7
-rw-r--r--fs/f2fs/inode.c5
-rw-r--r--fs/f2fs/namei.c21
-rw-r--r--fs/f2fs/node.c12
-rw-r--r--fs/f2fs/segment.c9
-rw-r--r--fs/f2fs/segment.h7
-rw-r--r--fs/f2fs/super.c127
-rw-r--r--fs/f2fs/sysfs.c8
-rw-r--r--fs/f2fs/verity.c75
-rw-r--r--fs/f2fs/xattr.c11
-rw-r--r--fs/fcntl.c5
-rw-r--r--fs/file.c69
-rw-r--r--fs/fs-writeback.c75
-rw-r--r--fs/fs_context.c6
-rw-r--r--fs/fscache/cookie.c14
-rw-r--r--fs/fscache/internal.h2
-rw-r--r--fs/fscache/main.c39
-rw-r--r--fs/fuse/cuse.c2
-rw-r--r--fs/fuse/dev.c42
-rw-r--r--fs/fuse/dir.c2
-rw-r--r--fs/fuse/file.c51
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/virtio_fs.c1
-rw-r--r--fs/gfs2/bmap.c8
-rw-r--r--fs/gfs2/file.c5
-rw-r--r--fs/gfs2/glock.c2
-rw-r--r--fs/gfs2/lock_dlm.c13
-rw-r--r--fs/gfs2/rgrp.c12
-rw-r--r--fs/gfs2/super.c10
-rw-r--r--fs/hfs/bfind.c14
-rw-r--r--fs/hfs/bnode.c25
-rw-r--r--fs/hfs/btree.h7
-rw-r--r--fs/hfs/super.c10
-rw-r--r--fs/hfsplus/extents.c7
-rw-r--r--fs/hostfs/hostfs_kern.c19
-rw-r--r--fs/hugetlbfs/inode.c84
-rw-r--r--fs/internal.h1
-rw-r--r--fs/io_uring.c33
-rw-r--r--fs/iomap/buffered-io.c34
-rw-r--r--fs/iomap/seek.c25
-rw-r--r--fs/iomap/swapfile.c16
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/isofs/inode.c29
-rw-r--r--fs/isofs/isofs.h1
-rw-r--r--fs/isofs/joliet.c4
-rw-r--r--fs/isofs/namei.c1
-rw-r--r--fs/jbd2/commit.c4
-rw-r--r--fs/jffs2/build.c4
-rw-r--r--fs/jffs2/compr_rtime.c3
-rw-r--r--fs/jffs2/file.c40
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/scan.c8
-rw-r--r--fs/jffs2/summary.c3
-rw-r--r--fs/jfs/inode.c4
-rw-r--r--fs/jfs/jfs_dmap.c9
-rw-r--r--fs/jfs/jfs_filsys.h1
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/jfs_mount.c61
-rw-r--r--fs/lockd/svclock.c32
-rw-r--r--fs/minix/inode.c3
-rw-r--r--fs/namei.c10
-rw-r--r--fs/namespace.c48
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/callback.h2
-rw-r--r--fs/nfs/callback_proc.c29
-rw-r--r--fs/nfs/callback_xdr.c22
-rw-r--r--fs/nfs/client.c4
-rw-r--r--fs/nfs/dir.c62
-rw-r--r--fs/nfs/direct.c48
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c4
-rw-r--r--fs/nfs/inode.c21
-rw-r--r--fs/nfs/nfs2xdr.c2
-rw-r--r--fs/nfs/nfs3proc.c4
-rw-r--r--fs/nfs/nfs3xdr.c24
-rw-r--r--fs/nfs/nfs42proc.c26
-rw-r--r--fs/nfs/nfs42xdr.c3
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c89
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4namespace.c4
-rw-r--r--fs/nfs/nfs4proc.c66
-rw-r--r--fs/nfs/nfs4state.c15
-rw-r--r--fs/nfs/nfs4xdr.c9
-rw-r--r--fs/nfs/pagelist.c21
-rw-r--r--fs/nfs/pnfs.c40
-rw-r--r--fs/nfs/pnfs.h2
-rw-r--r--fs/nfs/pnfs_nfs.c56
-rw-r--r--fs/nfs/write.c17
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/filecache.c249
-rw-r--r--fs/nfsd/filecache.h2
-rw-r--r--fs/nfsd/nfs3proc.c5
-rw-r--r--fs/nfsd/nfs3xdr.c7
-rw-r--r--fs/nfsd/nfs4callback.c1
-rw-r--r--fs/nfsd/nfs4proc.c5
-rw-r--r--fs/nfsd/nfs4recover.c1
-rw-r--r--fs/nfsd/nfs4state.c17
-rw-r--r--fs/nfsd/nfs4xdr.c19
-rw-r--r--fs/nfsd/nfscache.c12
-rw-r--r--fs/nfsd/nfsctl.c36
-rw-r--r--fs/nfsd/nfsproc.c2
-rw-r--r--fs/nfsd/nfssvc.c9
-rw-r--r--fs/nfsd/trace.h14
-rw-r--r--fs/nfsd/xdr.h2
-rw-r--r--fs/nilfs2/sysfs.c27
-rw-r--r--fs/nilfs2/the_nilfs.c9
-rw-r--r--fs/ntfs/dir.c2
-rw-r--r--fs/ntfs/inode.c39
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ntfs/mft.c4
-rw-r--r--fs/ocfs2/alloc.c46
-rw-r--r--fs/ocfs2/aops.c11
-rw-r--r--fs/ocfs2/cluster/heartbeat.c8
-rw-r--r--fs/ocfs2/dlmglue.c3
-rw-r--r--fs/ocfs2/file.c90
-rw-r--r--fs/ocfs2/filecheck.c6
-rw-r--r--fs/ocfs2/stackglue.c8
-rw-r--r--fs/ocfs2/super.c36
-rw-r--r--fs/orangefs/dcache.c4
-rw-r--r--fs/orangefs/orangefs-bufmap.c7
-rw-r--r--fs/orangefs/super.c2
-rw-r--r--fs/overlayfs/copy_up.c18
-rw-r--r--fs/overlayfs/dir.c21
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/file.c50
-rw-r--r--fs/overlayfs/inode.c2
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/super.c37
-rw-r--r--fs/pipe.c19
-rw-r--r--fs/pnode.h2
-rw-r--r--fs/proc/base.c26
-rw-r--r--fs/proc/vmcore.c16
-rw-r--r--fs/pstore/platform.c4
-rw-r--r--fs/qnx4/dir.c69
-rw-r--r--fs/quota/dquot.c11
-rw-r--r--fs/quota/quota_tree.c15
-rw-r--r--fs/quota/quota_v2.c11
-rw-r--r--fs/readdir.c6
-rw-r--r--fs/reiserfs/journal.c14
-rw-r--r--fs/reiserfs/stree.c31
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/reiserfs/xattr.h2
-rw-r--r--fs/select.c73
-rw-r--r--fs/seq_file.c3
-rw-r--r--fs/signalfd.c12
-rw-r--r--fs/squashfs/export.c45
-rw-r--r--fs/squashfs/file.c6
-rw-r--r--fs/squashfs/id.c42
-rw-r--r--fs/squashfs/squashfs_fs.h1
-rw-r--r--fs/squashfs/squashfs_fs_sb.h1
-rw-r--r--fs/squashfs/super.c6
-rw-r--r--fs/squashfs/xattr.h10
-rw-r--r--fs/squashfs/xattr_id.c68
-rw-r--r--fs/stat.c19
-rw-r--r--fs/super.c23
-rw-r--r--fs/sysfs/file.c55
-rw-r--r--fs/tracefs/inode.c82
-rw-r--r--fs/ubifs/auth.c2
-rw-r--r--fs/ubifs/dir.c51
-rw-r--r--fs/ubifs/file.c12
-rw-r--r--fs/ubifs/io.c34
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/replay.c3
-rw-r--r--fs/ubifs/super.c6
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/ubifs/xattr.c44
-rw-r--r--fs/udf/dir.c32
-rw-r--r--fs/udf/inode.c18
-rw-r--r--fs/udf/misc.c13
-rw-r--r--fs/udf/namei.c7
-rw-r--r--fs/udf/super.c84
-rw-r--r--fs/udf/udf_sb.h2
-rw-r--r--fs/udf/unicode.c4
-rw-r--r--fs/userfaultfd.c115
-rw-r--r--fs/verity/enable.c2
-rw-r--r--fs/verity/open.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c4
-rw-r--r--fs/xfs/xfs_ioctl.c3
-rw-r--r--fs/xfs/xfs_iops.c2
303 files changed, 4815 insertions, 2378 deletions
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 41c5749f4db7..5400a876d73f 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -460,8 +460,10 @@ affs_xrename(struct inode *old_dir, struct dentry *old_dentry,
return -EIO;
bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino);
- if (!bh_new)
+ if (!bh_new) {
+ affs_brelse(bh_old);
return -EIO;
+ }
/* Remove old header from its parent directory. */
affs_lock_dir(old_dir);
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index fc5eb0f89304..c2e82b84c554 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -29,16 +29,11 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
static int afs_deliver_yfs_cb_callback(struct afs_call *);
-#define CM_NAME(name) \
- char afs_SRXCB##name##_name[] __tracepoint_string = \
- "CB." #name
-
/*
* CB.CallBack operation type
*/
-static CM_NAME(CallBack);
static const struct afs_call_type afs_SRXCBCallBack = {
- .name = afs_SRXCBCallBack_name,
+ .name = "CB.CallBack",
.deliver = afs_deliver_cb_callback,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_CallBack,
@@ -47,9 +42,8 @@ static const struct afs_call_type afs_SRXCBCallBack = {
/*
* CB.InitCallBackState operation type
*/
-static CM_NAME(InitCallBackState);
static const struct afs_call_type afs_SRXCBInitCallBackState = {
- .name = afs_SRXCBInitCallBackState_name,
+ .name = "CB.InitCallBackState",
.deliver = afs_deliver_cb_init_call_back_state,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_InitCallBackState,
@@ -58,9 +52,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState = {
/*
* CB.InitCallBackState3 operation type
*/
-static CM_NAME(InitCallBackState3);
static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
- .name = afs_SRXCBInitCallBackState3_name,
+ .name = "CB.InitCallBackState3",
.deliver = afs_deliver_cb_init_call_back_state3,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_InitCallBackState,
@@ -69,9 +62,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
/*
* CB.Probe operation type
*/
-static CM_NAME(Probe);
static const struct afs_call_type afs_SRXCBProbe = {
- .name = afs_SRXCBProbe_name,
+ .name = "CB.Probe",
.deliver = afs_deliver_cb_probe,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_Probe,
@@ -80,9 +72,8 @@ static const struct afs_call_type afs_SRXCBProbe = {
/*
* CB.ProbeUuid operation type
*/
-static CM_NAME(ProbeUuid);
static const struct afs_call_type afs_SRXCBProbeUuid = {
- .name = afs_SRXCBProbeUuid_name,
+ .name = "CB.ProbeUuid",
.deliver = afs_deliver_cb_probe_uuid,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_ProbeUuid,
@@ -91,9 +82,8 @@ static const struct afs_call_type afs_SRXCBProbeUuid = {
/*
* CB.TellMeAboutYourself operation type
*/
-static CM_NAME(TellMeAboutYourself);
static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
- .name = afs_SRXCBTellMeAboutYourself_name,
+ .name = "CB.TellMeAboutYourself",
.deliver = afs_deliver_cb_tell_me_about_yourself,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_TellMeAboutYourself,
@@ -102,9 +92,8 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
/*
* YFS CB.CallBack operation type
*/
-static CM_NAME(YFS_CallBack);
static const struct afs_call_type afs_SRXYFSCB_CallBack = {
- .name = afs_SRXCBYFS_CallBack_name,
+ .name = "YFSCB.CallBack",
.deliver = afs_deliver_yfs_cb_callback,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_CallBack,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 3c486340b220..8c39533d122a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -69,7 +69,6 @@ const struct inode_operations afs_dir_inode_operations = {
.permission = afs_permission,
.getattr = afs_getattr,
.setattr = afs_setattr,
- .listxattr = afs_listxattr,
};
const struct address_space_operations afs_dir_aops = {
@@ -978,9 +977,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
*/
static int afs_d_revalidate_rcu(struct dentry *dentry)
{
- struct afs_vnode *dvnode, *vnode;
+ struct afs_vnode *dvnode;
struct dentry *parent;
- struct inode *dir, *inode;
+ struct inode *dir;
long dir_version, de_version;
_enter("%p", dentry);
@@ -1010,18 +1009,6 @@ static int afs_d_revalidate_rcu(struct dentry *dentry)
return -ECHILD;
}
- /* Check to see if the vnode referred to by the dentry still
- * has a callback.
- */
- if (d_really_is_positive(dentry)) {
- inode = d_inode_rcu(dentry);
- if (inode) {
- vnode = AFS_FS_I(inode);
- if (!afs_check_validity(vnode))
- return -ECHILD;
- }
- }
-
return 1; /* Still valid */
}
@@ -1057,17 +1044,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (IS_ERR(key))
key = NULL;
- if (d_really_is_positive(dentry)) {
- inode = d_inode(dentry);
- if (inode) {
- vnode = AFS_FS_I(inode);
- afs_validate(vnode, key);
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
- goto out_bad;
- }
- }
-
- /* lock down the parent dentry so we can peer at it */
+ /* Hold the parent dentry so we can peer at it */
parent = dget_parent(dentry);
dir = AFS_FS_I(d_inode(parent));
@@ -1076,7 +1053,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
_debug("%pd: parent dir deleted", dentry);
- goto out_bad_parent;
+ goto not_found;
}
/* We only need to invalidate a dentry if the server's copy changed
@@ -1102,12 +1079,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
case 0:
/* the filename maps to something */
if (d_really_is_negative(dentry))
- goto out_bad_parent;
+ goto not_found;
inode = d_inode(dentry);
if (is_bad_inode(inode)) {
printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
dentry);
- goto out_bad_parent;
+ goto not_found;
}
vnode = AFS_FS_I(inode);
@@ -1129,9 +1106,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
dentry, fid.unique,
vnode->fid.unique,
vnode->vfs_inode.i_generation);
- write_seqlock(&vnode->cb_lock);
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- write_sequnlock(&vnode->cb_lock);
goto not_found;
}
goto out_valid;
@@ -1146,7 +1120,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
default:
_debug("failed to iterate dir %pd: %d",
parent, ret);
- goto out_bad_parent;
+ goto not_found;
}
out_valid:
@@ -1157,16 +1131,9 @@ out_valid_noupdate:
_leave(" = 1 [valid]");
return 1;
- /* the dirent, if it exists, now points to a different vnode */
not_found:
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_NFSFS_RENAMED;
- spin_unlock(&dentry->d_lock);
-
-out_bad_parent:
_debug("dropping dentry %pd2", dentry);
dput(parent);
-out_bad:
key_put(key);
_leave(" = 0 [bad]");
diff --git a/fs/afs/file.c b/fs/afs/file.c
index dd3c55c9101c..5d0b472e984b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -42,7 +42,6 @@ const struct inode_operations afs_file_inode_operations = {
.getattr = afs_getattr,
.setattr = afs_setattr,
.permission = afs_permission,
- .listxattr = afs_listxattr,
};
const struct address_space_operations afs_fs_aops = {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index a74e8e209454..4f58b28a1edd 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -27,7 +27,6 @@
static const struct inode_operations afs_symlink_inode_operations = {
.get_link = page_get_link,
- .listxattr = afs_listxattr,
};
static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7fe88d918b23..c3ad582f9fd0 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1354,7 +1354,6 @@ extern int afs_launder_page(struct page *);
* xattr.c
*/
extern const struct xattr_handler *afs_xattr_handlers[];
-extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
/*
* yfsclient.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index c9c45d7078bd..d129a1a49616 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -186,7 +186,7 @@ static int __init afs_init(void)
goto error_cache;
#endif
- ret = register_pernet_subsys(&afs_net_ops);
+ ret = register_pernet_device(&afs_net_ops);
if (ret < 0)
goto error_net;
@@ -196,8 +196,8 @@ static int __init afs_init(void)
goto error_fs;
afs_proc_symlink = proc_symlink("fs/afs", NULL, "../self/net/afs");
- if (IS_ERR(afs_proc_symlink)) {
- ret = PTR_ERR(afs_proc_symlink);
+ if (!afs_proc_symlink) {
+ ret = -ENOMEM;
goto error_proc;
}
@@ -206,7 +206,7 @@ static int __init afs_init(void)
error_proc:
afs_fs_exit();
error_fs:
- unregister_pernet_subsys(&afs_net_ops);
+ unregister_pernet_device(&afs_net_ops);
error_net:
#ifdef CONFIG_AFS_FSCACHE
fscache_unregister_netfs(&afs_cache_netfs);
@@ -237,7 +237,7 @@ static void __exit afs_exit(void)
proc_remove(afs_proc_symlink);
afs_fs_exit();
- unregister_pernet_subsys(&afs_net_ops);
+ unregister_pernet_device(&afs_net_ops);
#ifdef CONFIG_AFS_FSCACHE
fscache_unregister_netfs(&afs_cache_netfs);
#endif
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 79bc5f1338ed..f105f061e89b 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -32,7 +32,6 @@ const struct inode_operations afs_mntpt_inode_operations = {
.lookup = afs_mntpt_lookup,
.readlink = page_readlink,
.getattr = afs_getattr,
- .listxattr = afs_listxattr,
};
const struct inode_operations afs_autocell_inode_operations = {
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 5552d034090a..13292d4ca860 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -11,29 +11,6 @@
#include <linux/xattr.h>
#include "internal.h"
-static const char afs_xattr_list[] =
- "afs.acl\0"
- "afs.cell\0"
- "afs.fid\0"
- "afs.volume\0"
- "afs.yfs.acl\0"
- "afs.yfs.acl_inherited\0"
- "afs.yfs.acl_num_cleaned\0"
- "afs.yfs.vol_acl";
-
-/*
- * Retrieve a list of the supported xattrs.
- */
-ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- if (size == 0)
- return sizeof(afs_xattr_list);
- if (size < sizeof(afs_xattr_list))
- return -ERANGE;
- memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list));
- return sizeof(afs_xattr_list);
-}
-
/*
* Get a file's ACL.
*/
diff --git a/fs/aio.c b/fs/aio.c
index 47bb7b5685ba..fb92c32a6f1e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -183,8 +183,9 @@ struct poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool done;
bool cancelled;
+ bool work_scheduled;
+ bool work_need_resched;
struct wait_queue_entry wait;
struct work_struct work;
};
@@ -1626,6 +1627,51 @@ static void aio_poll_put_work(struct work_struct *work)
iocb_put(iocb);
}
+/*
+ * Safely lock the waitqueue which the request is on, synchronizing with the
+ * case where the ->poll() provider decides to free its waitqueue early.
+ *
+ * Returns true on success, meaning that req->head->lock was locked, req->wait
+ * is on req->head, and an RCU read lock was taken. Returns false if the
+ * request was already removed from its waitqueue (which might no longer exist).
+ */
+static bool poll_iocb_lock_wq(struct poll_iocb *req)
+{
+ wait_queue_head_t *head;
+
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us, then check whether the request is still on the queue.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ head = smp_load_acquire(&req->head);
+ if (head) {
+ spin_lock(&head->lock);
+ if (!list_empty(&req->wait.entry))
+ return true;
+ spin_unlock(&head->lock);
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+static void poll_iocb_unlock_wq(struct poll_iocb *req)
+{
+ spin_unlock(&req->head->lock);
+ rcu_read_unlock();
+}
+
static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1645,14 +1691,27 @@ static void aio_poll_complete_work(struct work_struct *work)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->ctx_lock);
- if (!mask && !READ_ONCE(req->cancelled)) {
- add_wait_queue(req->head, &req->wait);
- spin_unlock_irq(&ctx->ctx_lock);
- return;
- }
+ if (poll_iocb_lock_wq(req)) {
+ if (!mask && !READ_ONCE(req->cancelled)) {
+ /*
+ * The request isn't actually ready to be completed yet.
+ * Reschedule completion if another wakeup came in.
+ */
+ if (req->work_need_resched) {
+ schedule_work(&req->work);
+ req->work_need_resched = false;
+ } else {
+ req->work_scheduled = false;
+ }
+ poll_iocb_unlock_wq(req);
+ spin_unlock_irq(&ctx->ctx_lock);
+ return;
+ }
+ list_del_init(&req->wait.entry);
+ poll_iocb_unlock_wq(req);
+ } /* else, POLLFREE has freed the waitqueue, so we must complete */
list_del_init(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
spin_unlock_irq(&ctx->ctx_lock);
iocb_put(iocb);
@@ -1664,13 +1723,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
struct poll_iocb *req = &aiocb->poll;
- spin_lock(&req->head->lock);
- WRITE_ONCE(req->cancelled, true);
- if (!list_empty(&req->wait.entry)) {
- list_del_init(&req->wait.entry);
- schedule_work(&aiocb->poll.work);
- }
- spin_unlock(&req->head->lock);
+ if (poll_iocb_lock_wq(req)) {
+ WRITE_ONCE(req->cancelled, true);
+ if (!req->work_scheduled) {
+ schedule_work(&aiocb->poll.work);
+ req->work_scheduled = true;
+ }
+ poll_iocb_unlock_wq(req);
+ } /* else, the request was force-cancelled by POLLFREE already */
return 0;
}
@@ -1687,20 +1747,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & req->events))
return 0;
- list_del_init(&req->wait.entry);
-
- if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+ /*
+ * Complete the request inline if possible. This requires that three
+ * conditions be met:
+ * 1. An event mask must have been passed. If a plain wakeup was done
+ * instead, then mask == 0 and we have to call vfs_poll() to get
+ * the events, so inline completion isn't possible.
+ * 2. The completion work must not have already been scheduled.
+ * 3. ctx_lock must not be busy. We have to use trylock because we
+ * already hold the waitqueue lock, so this inverts the normal
+ * locking order. Use irqsave/irqrestore because not all
+ * filesystems (e.g. fuse) call this function with IRQs disabled,
+ * yet IRQs have to be disabled before ctx_lock is obtained.
+ */
+ if (mask && !req->work_scheduled &&
+ spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
struct kioctx *ctx = iocb->ki_ctx;
- /*
- * Try to complete the iocb inline if we can. Use
- * irqsave/irqrestore because not all filesystems (e.g. fuse)
- * call this function with IRQs disabled and because IRQs
- * have to be disabled before ctx_lock is obtained.
- */
+ list_del_init(&req->wait.entry);
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
if (iocb->ki_eventfd && eventfd_signal_count()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
@@ -1710,7 +1776,43 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (iocb)
iocb_put(iocb);
} else {
- schedule_work(&req->work);
+ /*
+ * Schedule the completion work if needed. If it was already
+ * scheduled, record that another wakeup came in.
+ *
+ * Don't remove the request from the waitqueue here, as it might
+ * not actually be complete yet (we won't know until vfs_poll()
+ * is called), and we must not miss any wakeups. POLLFREE is an
+ * exception to this; see below.
+ */
+ if (req->work_scheduled) {
+ req->work_need_resched = true;
+ } else {
+ schedule_work(&req->work);
+ req->work_scheduled = true;
+ }
+
+ /*
+ * If the waitqueue is being freed early but we can't complete
+ * the request inline, we have to tear down the request as best
+ * we can. That means immediately removing the request from its
+ * waitqueue and preventing all further accesses to the
+ * waitqueue via the request. We also need to schedule the
+ * completion work (done above). Also mark the request as
+ * cancelled, to potentially skip an unneeded call to ->poll().
+ */
+ if (mask & POLLFREE) {
+ WRITE_ONCE(req->cancelled, true);
+ list_del_init(&req->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&req->head, NULL);
+ }
}
return 1;
}
@@ -1718,6 +1820,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct aio_poll_table {
struct poll_table_struct pt;
struct aio_kiocb *iocb;
+ bool queued;
int error;
};
@@ -1728,11 +1831,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
/* multiple wait queues per file are not supported */
- if (unlikely(pt->iocb->poll.head)) {
+ if (unlikely(pt->queued)) {
pt->error = -EINVAL;
return;
}
+ pt->queued = true;
pt->error = 0;
pt->iocb->poll.head = head;
add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1757,12 +1861,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
req->head = NULL;
- req->done = false;
req->cancelled = false;
+ req->work_scheduled = false;
+ req->work_need_resched = false;
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
apt.iocb = aiocb;
+ apt.queued = false;
apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
@@ -1771,23 +1877,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
mask = vfs_poll(req->file, &apt.pt) & req->events;
spin_lock_irq(&ctx->ctx_lock);
- if (likely(req->head)) {
- spin_lock(&req->head->lock);
- if (unlikely(list_empty(&req->wait.entry))) {
- if (apt.error)
+ if (likely(apt.queued)) {
+ bool on_queue = poll_iocb_lock_wq(req);
+
+ if (!on_queue || req->work_scheduled) {
+ /*
+ * aio_poll_wake() already either scheduled the async
+ * completion work, or completed the request inline.
+ */
+ if (apt.error) /* unsupported case: multiple queues */
cancel = true;
apt.error = 0;
mask = 0;
}
if (mask || apt.error) {
+ /* Steal to complete synchronously. */
list_del_init(&req->wait.entry);
} else if (cancel) {
+ /* Cancel if possible (may be too late though). */
WRITE_ONCE(req->cancelled, true);
- } else if (!req->done) { /* actually waiting for an event */
+ } else if (on_queue) {
+ /*
+ * Actually waiting for an event, so add the request to
+ * active_reqs so that it can be cancelled if needed.
+ */
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
aiocb->ki_cancel = aio_poll_cancel;
}
- spin_unlock(&req->head->lock);
+ if (on_queue)
+ poll_iocb_unlock_wq(req);
}
if (mask) { /* no async, we'd stolen it */
aiocb->ki_res.res = mangle_poll(mask);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7ce3cfd965d2..72cd871544ac 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -583,7 +583,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
vaddr = eppnt->p_vaddr;
if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
- elf_type |= MAP_FIXED_NOREPLACE;
+ elf_type |= MAP_FIXED;
else if (no_base && interp_elf_ex->e_type == ET_DYN)
load_addr = -vaddr;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index cdb45829354d..056a68292e15 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -696,12 +696,24 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
struct super_block *sb = file_inode(file)->i_sb;
struct dentry *root = sb->s_root, *dentry;
int err = 0;
+ struct file *f = NULL;
e = create_entry(buffer, count);
if (IS_ERR(e))
return PTR_ERR(e);
+ if (e->flags & MISC_FMT_OPEN_FILE) {
+ f = open_exec(e->interpreter);
+ if (IS_ERR(f)) {
+ pr_notice("register: failed to install interpreter file %s\n",
+ e->interpreter);
+ kfree(e);
+ return PTR_ERR(f);
+ }
+ e->interp_file = f;
+ }
+
inode_lock(d_inode(root));
dentry = lookup_one_len(e->name, root, strlen(e->name));
err = PTR_ERR(dentry);
@@ -725,21 +737,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
goto out2;
}
- if (e->flags & MISC_FMT_OPEN_FILE) {
- struct file *f;
-
- f = open_exec(e->interpreter);
- if (IS_ERR(f)) {
- err = PTR_ERR(f);
- pr_notice("register: failed to install interpreter file %s\n", e->interpreter);
- simple_release_fs(&bm_mnt, &entry_count);
- iput(inode);
- inode = NULL;
- goto out2;
- }
- e->interp_file = f;
- }
-
e->dentry = dget(dentry);
inode->i_private = e;
inode->i_fop = &bm_entry_operations;
@@ -756,6 +753,8 @@ out:
inode_unlock(d_inode(root));
if (err) {
+ if (f)
+ filp_close(f, NULL);
kfree(e);
return err;
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 79272cdbe827..fa329c7eddf0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -246,6 +246,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio.bi_opf = dio_bio_write_op(iocb);
task_io_account_write(ret);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio.bi_opf |= REQ_NOWAIT;
if (iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(&bio, iocb);
@@ -399,6 +401,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio->bi_opf |= REQ_NOWAIT;
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
@@ -1983,6 +1987,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
+ size_t shorted = 0;
ssize_t ret;
if (bdev_read_only(I_BDEV(bd_inode)))
@@ -2001,12 +2006,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
- iov_iter_truncate(from, size - iocb->ki_pos);
+ size -= iocb->ki_pos;
+ if (iov_iter_count(from) > size) {
+ shorted = iov_iter_count(from) - size;
+ iov_iter_truncate(from, size);
+ }
blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
+ iov_iter_reexpand(from, iov_iter_count(from) + shorted);
blk_finish_plug(&plug);
return ret;
}
@@ -2018,13 +2028,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos;
+ size_t shorted = 0;
+ ssize_t ret;
if (pos >= size)
return 0;
size -= pos;
- iov_iter_truncate(to, size);
- return generic_file_read_iter(iocb, to);
+ if (iov_iter_count(to) > size) {
+ shorted = iov_iter_count(to) - size;
+ iov_iter_truncate(to, size);
+ }
+
+ ret = generic_file_read_iter(iocb, to);
+ iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+ return ret;
}
EXPORT_SYMBOL_GPL(blkdev_read_iter);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 38651fae7f21..0aa1bee24d80 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,8 @@ config BTRFS_FS
select RAID6_PQ
select XOR_BLOCKS
select SRCU
+ depends on !PPC_256K_PAGES # powerpc
+ depends on !PAGE_SIZE_256KB # hexagon
help
Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 11be02459b87..eb592b92aa9c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -237,6 +237,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq,
ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
+ /*
+ * Orders all subsequent loads after reading WORK_DONE_BIT,
+ * paired with the smp_mb__before_atomic in btrfs_work_helper
+ * this guarantees that the ordered function will see all
+ * updates from ordinary work function.
+ */
+ smp_rmb();
/*
* we are going to call the ordered done function, but
@@ -325,6 +332,13 @@ static void btrfs_work_helper(struct work_struct *normal_work)
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
+ /*
+ * Ensures all memory accesses done in the work function are
+ * ordered before setting the WORK_DONE_BIT. Ensuring the thread
+ * which is going to executed the ordered work sees them.
+ * Pairs with the smp_rmb in run_ordered_work.
+ */
+ smp_mb__before_atomic();
set_bit(WORK_DONE_BIT, &work->flags);
run_ordered_work(wq, work);
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 86e280edf804..c701a19fac53 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -347,33 +347,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
return -ENOMEM;
ref->root_id = root_id;
- if (key) {
+ if (key)
ref->key_for_search = *key;
- /*
- * We can often find data backrefs with an offset that is too
- * large (>= LLONG_MAX, maximum allowed file offset) due to
- * underflows when subtracting a file's offset with the data
- * offset of its corresponding extent data item. This can
- * happen for example in the clone ioctl.
- * So if we detect such case we set the search key's offset to
- * zero to make sure we will find the matching file extent item
- * at add_all_parents(), otherwise we will miss it because the
- * offset taken form the backref is much larger then the offset
- * of the file extent item. This can make us scan a very large
- * number of file extent items, but at least it will not make
- * us miss any.
- * This is an ugly workaround for a behaviour that should have
- * never existed, but it does and a fix for the clone ioctl
- * would touch a lot of places, cause backwards incompatibility
- * and would not fix the problem for extents cloned with older
- * kernels.
- */
- if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
- ref->key_for_search.offset >= LLONG_MAX)
- ref->key_for_search.offset = 0;
- } else {
+ else
memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
- }
ref->inode_list = NULL;
ref->level = level;
@@ -409,10 +386,36 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
wanted_disk_byte, count, sc, gfp_mask);
}
+static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
+{
+ struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct prelim_ref *ref = NULL;
+ struct prelim_ref target = {0};
+ int result;
+
+ target.parent = bytenr;
+
+ while (*p) {
+ parent = *p;
+ ref = rb_entry(parent, struct prelim_ref, rbnode);
+ result = prelim_ref_compare(ref, &target);
+
+ if (result < 0)
+ p = &(*p)->rb_left;
+ else if (result > 0)
+ p = &(*p)->rb_right;
+ else
+ return 1;
+ }
+ return 0;
+}
+
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
- struct ulist *parents, struct prelim_ref *ref,
+ struct ulist *parents,
+ struct preftrees *preftrees, struct prelim_ref *ref,
int level, u64 time_seq, const u64 *extent_item_pos,
- u64 total_refs, bool ignore_offset)
+ bool ignore_offset)
{
int ret = 0;
int slot;
@@ -424,6 +427,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
u64 disk_byte;
u64 wanted_disk_byte = ref->wanted_disk_byte;
u64 count = 0;
+ u64 data_offset;
if (level != 0) {
eb = path->nodes[level];
@@ -434,18 +438,26 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
/*
- * We normally enter this function with the path already pointing to
- * the first item to check. But sometimes, we may enter it with
- * slot==nritems. In that case, go to the next leaf before we continue.
+ * 1. We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot == nritems.
+ * 2. We are searching for normal backref but bytenr of this leaf
+ * matches shared data backref
+ * 3. The leaf owner is not equal to the root we are searching
+ *
+ * For these cases, go to the next leaf before we continue.
*/
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ eb = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(eb) ||
+ is_shared_data_backref(preftrees, eb->start) ||
+ ref->root_id != btrfs_header_owner(eb)) {
if (time_seq == SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
}
- while (!ret && count < total_refs) {
+ while (!ret && count < ref->count) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -455,13 +467,31 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
key.type != BTRFS_EXTENT_DATA_KEY)
break;
+ /*
+ * We are searching for normal backref but bytenr of this leaf
+ * matches shared data backref, OR
+ * the leaf owner is not equal to the root we are searching for
+ */
+ if (slot == 0 &&
+ (is_shared_data_backref(preftrees, eb->start) ||
+ ref->root_id != btrfs_header_owner(eb))) {
+ if (time_seq == SEQ_LAST)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ continue;
+ }
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ data_offset = btrfs_file_extent_offset(eb, fi);
if (disk_byte == wanted_disk_byte) {
eie = NULL;
old = NULL;
- count++;
+ if (ref->key_for_search.offset == key.offset - data_offset)
+ count++;
+ else
+ goto next;
if (extent_item_pos) {
ret = check_extent_in_eb(&key, eb, fi,
*extent_item_pos,
@@ -502,9 +532,9 @@ next:
*/
static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
+ struct preftrees *preftrees,
struct prelim_ref *ref, struct ulist *parents,
- const u64 *extent_item_pos, u64 total_refs,
- bool ignore_offset)
+ const u64 *extent_item_pos, bool ignore_offset)
{
struct btrfs_root *root;
struct btrfs_key root_key;
@@ -513,6 +543,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int root_level;
int level = ref->level;
int index;
+ struct btrfs_key search_key = ref->key_for_search;
root_key.objectid = ref->root_id;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -545,13 +576,33 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
+ /*
+ * We can often find data backrefs with an offset that is too large
+ * (>= LLONG_MAX, maximum allowed file offset) due to underflows when
+ * subtracting a file's offset with the data offset of its
+ * corresponding extent data item. This can happen for example in the
+ * clone ioctl.
+ *
+ * So if we detect such case we set the search key's offset to zero to
+ * make sure we will find the matching file extent item at
+ * add_all_parents(), otherwise we will miss it because the offset
+ * taken form the backref is much larger then the offset of the file
+ * extent item. This can make us scan a very large number of file
+ * extent items, but at least it will not make us miss any.
+ *
+ * This is an ugly workaround for a behaviour that should have never
+ * existed, but it does and a fix for the clone ioctl would touch a lot
+ * of places, cause backwards incompatibility and would not fix the
+ * problem for extents cloned with older kernels.
+ */
+ if (search_key.type == BTRFS_EXTENT_DATA_KEY &&
+ search_key.offset >= LLONG_MAX)
+ search_key.offset = 0;
path->lowest_level = level;
if (time_seq == SEQ_LAST)
- ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
- 0, 0);
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
- ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
- time_seq);
+ ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
/* root node has been locked, we can release @subvol_srcu safely here */
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -574,8 +625,8 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
eb = path->nodes[level];
}
- ret = add_all_parents(root, path, parents, ref, level, time_seq,
- extent_item_pos, total_refs, ignore_offset);
+ ret = add_all_parents(root, path, parents, preftrees, ref, level,
+ time_seq, extent_item_pos, ignore_offset);
out:
path->lowest_level = 0;
btrfs_release_path(path);
@@ -609,7 +660,7 @@ unode_aux_to_inode_list(struct ulist_node *node)
static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct preftrees *preftrees,
- const u64 *extent_item_pos, u64 total_refs,
+ const u64 *extent_item_pos,
struct share_check *sc, bool ignore_offset)
{
int err;
@@ -653,9 +704,9 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(fs_info, path, time_seq, ref,
- parents, extent_item_pos,
- total_refs, ignore_offset);
+ err = resolve_indirect_ref(fs_info, path, time_seq, preftrees,
+ ref, parents, extent_item_pos,
+ ignore_offset);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@@ -758,8 +809,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
*/
static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head, u64 seq,
- struct preftrees *preftrees, u64 *total_refs,
- struct share_check *sc)
+ struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
@@ -793,7 +843,6 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
default:
BUG();
}
- *total_refs += count;
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
@@ -876,7 +925,7 @@ out:
static int add_inline_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
int *info_level, struct preftrees *preftrees,
- u64 *total_refs, struct share_check *sc)
+ struct share_check *sc)
{
int ret = 0;
int slot;
@@ -900,7 +949,6 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
- *total_refs += btrfs_extent_refs(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
@@ -1125,8 +1173,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct prelim_ref *ref;
struct rb_node *node;
struct extent_inode_elem *eie = NULL;
- /* total of both direct AND indirect refs! */
- u64 total_refs = 0;
struct preftrees preftrees = {
.direct = PREFTREE_INIT,
.indirect = PREFTREE_INIT,
@@ -1162,7 +1208,12 @@ again:
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
@@ -1195,7 +1246,7 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = add_delayed_refs(fs_info, head, time_seq,
- &preftrees, &total_refs, sc);
+ &preftrees, sc);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -1216,8 +1267,7 @@ again:
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = add_inline_refs(fs_info, path, bytenr,
- &info_level, &preftrees,
- &total_refs, sc);
+ &info_level, &preftrees, sc);
if (ret)
goto out;
ret = add_keyed_refs(fs_info, path, bytenr, info_level,
@@ -1236,7 +1286,7 @@ again:
WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
- extent_item_pos, total_refs, sc, ignore_offset);
+ extent_item_pos, sc, ignore_offset);
if (ret)
goto out;
@@ -1311,10 +1361,18 @@ again:
goto out;
if (!ret && extent_item_pos) {
/*
- * we've recorded that parent, so we must extend
- * its inode list here
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However if there was corruption we may not
+ * have found an eie, return an error in this
+ * case.
*/
- BUG_ON(!eie);
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
while (eie->next)
eie = eie->next;
eie->next = ref->inode_list;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index ace49a999ece..bcf19dfb0af3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -640,7 +640,15 @@ static noinline void caching_thread(struct btrfs_work *work)
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ /*
+ * If we are in the transaction that populated the free space tree we
+ * can't actually cache from the free space tree as our commit root and
+ * real root are the same, so we could change the contents of the blocks
+ * while caching. Instead do the slow caching in this case, and after
+ * the transaction has committed we will be safe.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
ret = load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
@@ -2040,8 +2048,17 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
return flags;
}
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
-
+/*
+ * Mark one block group RO, can be called several times for the same block
+ * group.
+ *
+ * @cache: the destination block group
+ * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
+ * ensure we still have some free space after marking this
+ * block group RO.
+ */
+int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache,
+ bool do_chunk_alloc)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_trans_handle *trans;
@@ -2071,25 +2088,29 @@ again:
goto again;
}
- /*
- * if we are changing raid levels, try to allocate a corresponding
- * block group with the new raid level.
- */
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
- if (alloc_flags != cache->flags) {
- ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ if (do_chunk_alloc) {
/*
- * ENOSPC is allowed here, we may have enough space
- * already allocated at the new raid level to
- * carry on
+ * If we are changing raid levels, try to allocate a
+ * corresponding block group with the new raid level.
*/
- if (ret == -ENOSPC)
- ret = 0;
- if (ret < 0)
- goto out;
+ alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ if (alloc_flags != cache->flags) {
+ ret = btrfs_chunk_alloc(trans, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * ENOSPC is allowed here, we may have enough space
+ * already allocated at the new raid level to carry on
+ */
+ if (ret == -ENOSPC)
+ ret = 0;
+ if (ret < 0)
+ goto out;
+ }
}
- ret = inc_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, !do_chunk_alloc);
+ if (!do_chunk_alloc)
+ goto unlock_out;
if (!ret)
goto out;
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
@@ -2104,6 +2125,7 @@ out:
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
}
+unlock_out:
mutex_unlock(&fs_info->ro_block_group_mutex);
btrfs_end_transaction(trans);
@@ -2366,7 +2388,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2383,8 +2404,10 @@ again:
if (!path) {
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
/*
@@ -2431,7 +2454,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -2479,16 +2501,14 @@ again:
btrfs_put_block_group(cache);
if (drop_reserve)
btrfs_delayed_refs_rsv_release(fs_info, 1);
-
- if (ret)
- break;
-
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
* removed.
*/
mutex_unlock(&trans->transaction->cache_write_mutex);
+ if (ret)
+ goto out;
mutex_lock(&trans->transaction->cache_write_mutex);
}
mutex_unlock(&trans->transaction->cache_write_mutex);
@@ -2497,7 +2517,8 @@ again:
* Go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, 0);
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2511,7 +2532,12 @@ again:
goto again;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
- } else if (ret < 0) {
+ }
+out:
+ if (ret < 0) {
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&dirty, &cur_trans->dirty_bgs);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
@@ -2528,7 +2554,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -2586,7 +2611,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -2609,7 +2633,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c391800388dd..0758e6d52acb 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -205,7 +205,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
+int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache,
+ bool do_chunk_alloc);
void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f853835c409c..f3ff57b93158 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -268,6 +268,21 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
mod);
}
+/*
+ * Called every time after doing a buffered, direct IO or memory mapped write.
+ *
+ * This is to ensure that if we write to a file that was previously fsynced in
+ * the current transaction, then try to fsync it again in the same transaction,
+ * we will know that there were changes in the file and that it needs to be
+ * logged.
+ */
+static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
+{
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
+}
+
static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
int ret = 0;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b05b361e2062..28f78e4f2c87 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -273,7 +273,7 @@ static void end_compressed_bio_write(struct bio *bio)
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
cb->start, cb->start + cb->len - 1,
- bio->bi_status == BLK_STS_OK);
+ !cb->errors);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e25133a9e9df..822c615840e8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -260,9 +260,12 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
-
- if (ret)
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
@@ -1403,8 +1406,30 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
"failed to read tree block %llu from get_old_root",
logical);
} else {
+ struct tree_mod_elem *tm2;
+
+ btrfs_tree_read_lock(old);
eb = btrfs_clone_extent_buffer(old);
+ /*
+ * After the lookup for the most recent tree mod operation
+ * above and before we locked and cloned the extent buffer
+ * 'old', a new tree mod log operation may have been added.
+ * So lookup for a more recent one to make sure the number
+ * of mod log operations we replay is consistent with the
+ * number of items we have in the cloned extent buffer,
+ * otherwise we can hit a BUG_ON when rewinding the extent
+ * buffer.
+ */
+ tm2 = tree_mod_log_search(fs_info, logical, time_seq);
+ btrfs_tree_read_unlock(old);
free_extent_buffer(old);
+ ASSERT(tm2);
+ ASSERT(tm2 == tm || tm2->seq > tm->seq);
+ if (!tm2 || tm2->seq < tm->seq) {
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ tm = tm2;
}
} else if (old_root) {
eb_root_owner = btrfs_header_owner(eb_root);
@@ -2633,12 +2658,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
/*
* The commit roots are read only so we always do read locks,
@@ -2676,6 +2698,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -2702,6 +2727,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -5207,7 +5243,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 27128164fac9..cd77c0621a55 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -505,11 +505,6 @@ enum {
*/
BTRFS_FS_EXCL_OP,
/*
- * To info transaction_kthread we need an immediate commit so it
- * doesn't need to wait for commit_interval
- */
- BTRFS_FS_NEED_ASYNC_COMMIT,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -524,6 +519,9 @@ enum {
* so we don't need to offload checksums to workqueues.
*/
BTRFS_FS_CSUM_IMPL_FAST,
+
+ /* Indicate that we can't trust the free space tree for caching yet */
+ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
};
struct btrfs_fs_info {
@@ -829,7 +827,10 @@ struct btrfs_fs_info {
*/
struct ulist *qgroup_ulist;
- /* protect user change for quota operations */
+ /*
+ * Protect user change for quota operations. If a transaction is needed,
+ * it must be started before locking this lock.
+ */
struct mutex qgroup_ioctl_lock;
/* list of dirty qgroups to be written at next commit */
@@ -942,6 +943,8 @@ enum {
BTRFS_ROOT_DEAD_TREE,
/* The root has a log tree. Used only for subvolume roots. */
BTRFS_ROOT_HAS_LOG_TREE,
+ /* Qgroup flushing is in progress */
+ BTRFS_ROOT_QGROUP_FLUSHING,
};
/*
@@ -1094,6 +1097,7 @@ struct btrfs_root {
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+ wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */
atomic_t nr_swapfiles;
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index db9f2c58eb4a..f4f531c4aa96 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -151,7 +151,7 @@ int btrfs_check_data_free_space(struct inode *inode,
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
- ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len);
if (ret < 0)
btrfs_free_reserved_data_space_noquota(inode, start, len);
else
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bef62b01824d..e96890475bac 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,7 +627,8 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
@@ -649,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
btrfs_ino(inode),
num_bytes, 1);
} else {
- btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize);
+ btrfs_qgroup_free_meta_prealloc(root, num_bytes);
}
return ret;
}
@@ -1033,12 +1034,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
memalloc_nofs_restore(nofs_flag);
- if (ret > 0) {
- btrfs_release_path(path);
- return -ENOENT;
- } else if (ret < 0) {
- return ret;
- }
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -1076,6 +1075,14 @@ err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the improper
+ * counts behind.
+ */
+ if (ret && ret != -ENOENT)
+ btrfs_abort_transaction(trans, ret);
+
return ret;
search:
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cd65ef7c7c3f..f18c6d97932e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1154,6 +1154,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
mutex_init(&root->delalloc_mutex);
+ init_waitqueue_head(&root->qgroup_flush_wait);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1747,8 +1748,7 @@ static int transaction_kthread(void *arg)
}
now = ktime_get_seconds();
- if (cur->state < TRANS_STATE_BLOCKED &&
- !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
+ if (cur->state < TRANS_STATE_COMMIT_START &&
(now < cur->start_time ||
now - cur->start_time < fs_info->commit_interval)) {
spin_unlock(&fs_info->trans_lock);
@@ -2463,6 +2463,24 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
+ BTRFS_FSID_SIZE)) {
+ btrfs_err(fs_info,
+ "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
+ fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
+ ret = -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
+ memcmp(fs_info->fs_devices->metadata_uuid,
+ fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
+ btrfs_err(fs_info,
+"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
+ fs_info->super_copy->metadata_uuid,
+ fs_info->fs_devices->metadata_uuid);
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2798,7 +2816,7 @@ int open_ctree(struct super_block *sb,
}
/*
- * Verify the type first, if that or the the checksum value are
+ * Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
@@ -2837,14 +2855,6 @@ int open_ctree(struct super_block *sb,
disk_super = fs_info->super_copy;
- ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
- BTRFS_FSID_SIZE));
-
- if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
- ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
- fs_info->super_copy->metadata_uuid,
- BTRFS_FSID_SIZE));
- }
features = btrfs_super_flags(disk_super);
if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
@@ -2884,6 +2894,29 @@ int open_ctree(struct super_block *sb,
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+ /*
+ * Flag our filesystem as having big metadata blocks if they are bigger
+ * than the page size
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ btrfs_info(fs_info,
+ "flagging fs with big metadata feature");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ /* Set up fs_info before parsing mount options */
+ nodesize = btrfs_super_nodesize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = sectorsize;
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+ /* Cache block sizes */
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+ fs_info->stripesize = stripesize;
+
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
@@ -2911,28 +2944,6 @@ int open_ctree(struct super_block *sb,
btrfs_info(fs_info, "has skinny extents");
/*
- * flag our filesystem as having big metadata blocks if
- * they are bigger than the page size
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
-
- nodesize = btrfs_super_nodesize(disk_super);
- sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = sectorsize;
- fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
- fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
-
- /* Cache block sizes */
- fs_info->nodesize = nodesize;
- fs_info->sectorsize = sectorsize;
- fs_info->stripesize = stripesize;
-
- /*
* mixed block groups end up with duplicate but slightly offset
* extent buffers for the same range. It leads to corruptions
*/
@@ -3134,7 +3145,8 @@ retry_root_backup:
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
@@ -3624,11 +3636,23 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct request_queue *q = bdev_get_queue(device->bdev);
struct bio *bio = device->flush_bio;
+#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * When a disk has write caching disabled, we skip submission of a bio
+ * with flush and sync requests before writing the superblock, since
+ * it's not needed. However when the integrity checker is enabled, this
+ * results in reports that there are metadata blocks referred by a
+ * superblock that were not properly flushed. So don't skip the bio
+ * submission only when the integrity checker is enabled for the sake
+ * of simplicity, since this is a debug tool and not meant for use in
+ * non-debug builds.
+ */
+ struct request_queue *q = bdev_get_queue(device->bdev);
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return;
+#endif
bio_reset(bio);
bio->bi_end_io = btrfs_end_empty_barrier;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c0dd839e99b7..19d2104c0462 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1338,16 +1338,20 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
struct request_queue *req_q;
+ struct btrfs_device *device = stripe->dev;
- if (!stripe->dev->bdev) {
+ if (!device->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- req_q = bdev_get_queue(stripe->dev->bdev);
+ req_q = bdev_get_queue(device->bdev);
if (!blk_queue_discard(req_q))
continue;
- ret = btrfs_issue_discard(stripe->dev->bdev,
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ continue;
+
+ ret = btrfs_issue_discard(device->bdev,
stripe->physical,
stripe->length,
&bytes);
@@ -1879,7 +1883,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
- return 0;
+ return ret;
}
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
@@ -4592,6 +4596,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
out_free_delayed:
btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
+ btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
@@ -5390,7 +5395,15 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_free;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction
+ * start. See wait_reserve_ticket and the whole
+ * reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -5756,6 +5769,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
mutex_lock(&fs_info->fs_devices->device_list_mutex);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ continue;
+
ret = btrfs_trim_free_extents(device, &group_trimmed);
if (ret) {
dev_failed++;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 95205bde240f..95ddeb477797 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3152,7 +3152,7 @@ static int __do_readpage(struct extent_io_tree *tree,
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3755,6 +3755,12 @@ static void set_btree_ioerr(struct page *page)
return;
/*
+ * A read may stumble upon this buffer later, make sure that it gets an
+ * error and knows there was an error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
@@ -4648,7 +4654,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
int ret = 0;
- u64 off = start;
+ u64 off;
u64 max = start + len;
u32 flags = 0;
u32 found_type;
@@ -4684,6 +4690,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
goto out_free_ulist;
}
+ /*
+ * We can't initialize that to 'start' as this could miss extents due
+ * to extent item merging
+ */
+ off = 0;
start = round_down(start, btrfs_inode_sectorsize(inode));
len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index fcf1807cc8dd..c8def2bdf247 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -202,7 +202,7 @@ struct extent_buffer {
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 2b8f29c07668..61b82c69eed5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -599,7 +599,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
- int ret;
+ int ret = 0;
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int blocksize_bits = fs_info->sb->s_blocksize_bits;
@@ -618,6 +618,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -674,7 +675,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, root, path,
path->slots[0], del_nr);
if (ret)
- goto out;
+ break;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -718,8 +719,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ break;
}
+ ret = 0;
key.offset = end_byte - 1;
} else {
@@ -729,8 +731,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- ret = 0;
-out:
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4126513e2429..1279359ed172 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1163,7 +1163,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int del_nr = 0;
int del_slot = 0;
int recow;
- int ret;
+ int ret = 0;
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
@@ -1384,7 +1384,7 @@ again:
}
out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -2004,14 +2004,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
inode_unlock(inode);
- /*
- * We also have to set last_sub_trans to the current log transid,
- * otherwise subsequent syncs to a file that's been synced in this
- * transaction will appear to have already occurred.
- */
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
@@ -3149,10 +3143,13 @@ reserve_space:
&cached_state);
if (ret)
goto out;
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
- if (ret)
+ if (ret) {
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
+ }
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
@@ -3319,8 +3316,9 @@ static long btrfs_fallocate(struct file *file, int mode,
free_extent_map(em);
break;
}
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- cur_offset, last_byte - cur_offset);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
+ &data_reserved, cur_offset,
+ last_byte - cur_offset);
if (ret < 0) {
cur_offset = last_byte;
free_extent_map(em);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6e6be922b937..d2d32fed8f2e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -744,8 +744,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
while (num_entries) {
e = kmem_cache_zalloc(btrfs_free_space_cachep,
GFP_NOFS);
- if (!e)
+ if (!e) {
+ ret = -ENOMEM;
goto free_cache;
+ }
ret = io_ctl_read_entry(&io_ctl, e, &type);
if (ret) {
@@ -754,6 +756,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
}
if (!e->bytes) {
+ ret = -1;
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -774,6 +777,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
e->bitmap = kmem_cache_zalloc(
btrfs_free_space_bitmap_cachep, GFP_NOFS);
if (!e->bitmap) {
+ ret = -ENOMEM;
kmem_cache_free(
btrfs_free_space_cachep, e);
goto free_cache;
@@ -1335,7 +1339,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/*
* at this point the pages are under IO and we're happy,
- * The caller is responsible for waiting on them and updating the
+ * The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 48a03f5240f5..dfabbbfc94cc 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1149,6 +1149,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
return PTR_ERR(trans);
set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
free_space_root = btrfs_create_tree(trans,
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
@@ -1170,11 +1171,18 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ ret = btrfs_commit_transaction(trans);
- return btrfs_commit_transaction(trans);
+ /*
+ * Now that we've committed the transaction any reading of our commit
+ * root will be safe, so we can cache from the free space tree now.
+ */
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+ return ret;
abort:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67b49b94c9cd..7755a0362a3a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1200,11 +1200,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1213,6 +1208,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -3359,6 +3359,18 @@ out:
if (ret || truncated) {
u64 start, end;
+ /*
+ * If we failed to finish this ordered extent for any reason we
+ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
+ * extent, and mark the inode with the error if it wasn't
+ * already set. Any error during writeback would have already
+ * set the mapping error, so we need to set it if we're the ones
+ * marking this ordered extent as failed.
+ */
+ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
+ &ordered_extent->flags))
+ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+
if (truncated)
start = ordered_extent->file_offset + logical_len;
else
@@ -3571,6 +3583,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
run_delayed_iput_locked(fs_info, inode);
+ cond_resched_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
@@ -6362,7 +6375,7 @@ static int btrfs_dirty_inode(struct inode *inode)
return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, inode);
- if (ret && ret == -ENOSPC) {
+ if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
@@ -6979,7 +6992,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- int ret;
err = btrfs_update_inode(trans, root, inode);
if (err)
@@ -6994,12 +7006,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
- true, NULL);
- if (ret == BTRFS_NEED_TRANS_COMMIT) {
- err = btrfs_commit_transaction(trans);
- trans = NULL;
- }
+ btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
fail:
@@ -9243,9 +9250,7 @@ again:
set_page_dirty(page);
SetPageUptodate(page);
- BTRFS_I(inode)->last_trans = fs_info->generation;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
@@ -9628,7 +9633,7 @@ int __init btrfs_init_cachep(void)
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
- SLAB_RED_ZONE, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_bitmap_cachep)
goto fail;
@@ -9686,27 +9691,25 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
- struct dentry *parent;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
+ int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
- struct btrfs_log_ctx ctx_root;
- struct btrfs_log_ctx ctx_dest;
- bool sync_log_root = false;
- bool sync_log_dest = false;
- bool commit_transaction = false;
- /* we only allow rename subvolume link between subvolumes */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ /*
+ * For non-subvolumes allow exchange only within one subvolume, in the
+ * same inode namespace. Two subvolumes (represented as directory) can
+ * be exchanged as they're a logical link and have a fixed inode number.
+ */
+ if (root != dest &&
+ (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
- btrfs_init_log_ctx(&ctx_root, old_inode);
- btrfs_init_log_ctx(&ctx_dest, new_inode);
-
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9848,30 +9851,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
if (root_log_pinned) {
- parent = new_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx_root);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_root = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- if (!commit_transaction) {
- parent = old_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
- BTRFS_I(new_dir), parent,
- false, &ctx_dest);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_dest = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- }
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
+ old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -9904,46 +9891,13 @@ out_fail:
dest_log_pinned = false;
}
}
- if (!ret && sync_log_root && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
- &ctx_root);
- if (ret)
- commit_transaction = true;
- }
- if (!ret && sync_log_dest && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
- &ctx_dest);
- if (ret)
- commit_transaction = true;
- }
- if (commit_transaction) {
- /*
- * We may have set commit_transaction when logging the new name
- * in the destination root, in which case we left the source
- * root context in the list of log contextes. So make sure we
- * remove it to avoid invalid memory accesses, since the context
- * was allocated in our stack frame.
- */
- if (sync_log_root) {
- mutex_lock(&root->log_mutex);
- list_del_init(&ctx_root.list);
- mutex_unlock(&root->log_mutex);
- }
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
- ASSERT(list_empty(&ctx_root.list));
- ASSERT(list_empty(&ctx_dest.list));
-
return ret;
}
@@ -10011,11 +9965,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
int ret;
+ int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
- struct btrfs_log_ctx ctx;
- bool sync_log = false;
- bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -10165,17 +10117,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = index;
if (log_pinned) {
- struct dentry *parent = new_dentry->d_parent;
-
- btrfs_init_log_ctx(&ctx, old_inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -10212,23 +10155,8 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- if (!ret && sync_log) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
- if (ret)
- commit_transaction = true;
- } else if (sync_log) {
- mutex_lock(&root->log_mutex);
- list_del(&ctx.list);
- mutex_unlock(&root->log_mutex);
- }
- if (commit_transaction) {
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -10880,9 +10808,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
struct btrfs_swap_info *bsi)
{
unsigned long nr_pages;
+ unsigned long max_pages;
u64 first_ppage, first_ppage_reported, next_ppage;
int ret;
+ /*
+ * Our swapfile may have had its size extended after the swap header was
+ * written. In that case activating the swapfile should not go beyond
+ * the max size set in the swap header.
+ */
+ if (bsi->nr_pages >= sis->max)
+ return 0;
+
+ max_pages = sis->max - bsi->nr_pages;
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
PAGE_SIZE) >> PAGE_SHIFT;
@@ -10890,6 +10828,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
first_ppage_reported = first_ppage;
if (bsi->start == 0)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8ed71b3b2546..675112aa998f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -667,8 +667,6 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
- free_extent_buffer(leaf);
- leaf = NULL;
btrfs_set_root_dirid(root_item, new_dirid);
@@ -677,8 +675,22 @@ static noinline int create_subvol(struct inode *dir,
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
- if (ret)
+ if (ret) {
+ /*
+ * Since we don't abort the transaction in this case, free the
+ * tree block so that we don't leak space and leave the
+ * filesystem in an inconsistent state (an extent item in the
+ * extent tree without backreferences). Also no need to have
+ * the tree block locked since it is not in any tree at this
+ * point, so no other task can find it and use it.
+ */
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ free_extent_buffer(leaf);
goto fail;
+ }
+
+ free_extent_buffer(leaf);
+ leaf = NULL;
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(fs_info, &key);
@@ -1907,7 +1919,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- if (vol_args->size > PAGE_SIZE) {
+ u64 nums;
+
+ if (vol_args->size < sizeof(*inherit) ||
+ vol_args->size > PAGE_SIZE) {
ret = -EINVAL;
goto free_args;
}
@@ -1916,6 +1931,20 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ret = PTR_ERR(inherit);
goto free_args;
}
+
+ if (inherit->num_qgroups > PAGE_SIZE ||
+ inherit->num_ref_copies > PAGE_SIZE ||
+ inherit->num_excl_copies > PAGE_SIZE) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
+
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ if (vol_args->size != struct_size(inherit, qgroups, nums)) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
}
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
@@ -2998,10 +3027,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
inode_lock(inode);
err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
- if (!err) {
- fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
- }
+ if (!err)
+ d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cd8e81c02f63..5a3006c75d63 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,7 +11,6 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
-#include <linux/sizes.h>
#include "ctree.h"
#include "transaction.h"
@@ -887,20 +886,44 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_trans_handle *trans = NULL;
+ struct ulist *ulist = NULL;
int ret = 0;
int slot;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to enable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
+ ulist = ulist_alloc(GFP_KERNEL);
+ if (!ulist) {
ret = -ENOMEM;
goto out;
}
/*
+ * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+ * avoid lock acquisition inversion problems (reported by lockdep) between
+ * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+ * start a transaction.
+ * After we started the transaction lock qgroup_ioctl_lock again and
+ * check if someone else created the quota root in the meanwhile. If so,
+ * just return success and release the transaction handle.
+ *
+ * Also we don't need to worry about someone else calling
+ * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+ * that function returns 0 (success) when the sysfs entries already exist.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
*
@@ -909,12 +932,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
* would be a lot of overkill.
*/
trans = btrfs_start_transaction(tree_root, 2);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
+ if (fs_info->quota_root)
+ goto out;
+
+ fs_info->qgroup_ulist = ulist;
+ ulist = NULL;
+
/*
* initially create the quota tree
*/
@@ -1012,8 +1043,19 @@ out_add_root:
goto out_free_path;
}
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+ * a deadlock with tasks concurrently doing other qgroup operations, such
+ * adding/removing qgroups or adding/deleting qgroup relations for example,
+ * because all qgroup operations first start or join a transaction and then
+ * lock the qgroup_ioctl_lock mutex.
+ * We are safe from a concurrent task trying to enable quotas, by calling
+ * this function, since we are serialized by fs_info->subvol_sem.
+ */
ret = btrfs_commit_transaction(trans);
trans = NULL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (ret)
goto out_free_path;
@@ -1047,10 +1089,13 @@ out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
- if (trans)
- btrfs_end_transaction(trans);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+ ulist_free(ulist);
return ret;
}
@@ -1060,24 +1105,55 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to disable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
/*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
+ * Request qgroup rescan worker to complete and wait for it. This wait
+ * must be done before transaction start for quota disable since it may
+ * deadlock with transaction by the qgroup rescan worker.
+ */
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
* btrfs_clean_quota_tree but this is not done.
+ *
+ * Also, we must always start a transaction without holding the mutex
+ * qgroup_ioctl_lock, see btrfs_quota_enable().
*/
trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ trans = NULL;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
goto out;
}
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- btrfs_qgroup_wait_for_completion(fs_info, false);
+ if (!fs_info->quota_root)
+ goto out;
+
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
@@ -1089,13 +1165,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
ret = btrfs_clean_quota_tree(trans, quota_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
list_del(&quota_root->dirty_list);
@@ -1109,10 +1185,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
free_extent_buffer(quota_root->commit_root);
kfree(quota_root);
-end_trans:
- ret = btrfs_end_transaction(trans);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+
return ret;
}
@@ -2262,7 +2341,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
@@ -2840,20 +2919,8 @@ out:
return ret;
}
-/*
- * Two limits to commit transaction in advance.
- *
- * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
- * For SIZE, it will be in byte unit as threshold.
- */
-#define QGROUP_FREE_RATIO 32
-#define QGROUP_FREE_SIZE SZ_32M
-static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
- const struct btrfs_qgroup *qg, u64 num_bytes)
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 free;
- u64 threshold;
-
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
@@ -2862,32 +2929,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
- /*
- * Even if we passed the check, it's better to check if reservation
- * for meta_pertrans is pushing us near limit.
- * If there is too much pertrans reservation or it's near the limit,
- * let's try commit transaction to free some, using transaction_kthread
- */
- if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
- BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
- free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
- threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- } else {
- free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
- threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- }
-
- /*
- * Use transaction_kthread to commit transaction, so we no
- * longer need to bother nested transaction nor lock context.
- */
- if (free < threshold)
- btrfs_commit_transaction_locksafe(fs_info);
- }
-
return true;
}
@@ -2937,7 +2978,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(qg, num_bytes)) {
ret = -EDQUOT;
goto out;
}
@@ -3284,6 +3325,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* Quota disable is in progress */
+ ret = -EBUSY;
}
if (ret) {
@@ -3411,28 +3455,150 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
}
}
+#define rbtree_iterate_from_safe(node, next, start) \
+ for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
+
+static int qgroup_unreserve_range(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len)
+{
+ struct rb_node *node;
+ struct rb_node *next;
+ struct ulist_node *entry = NULL;
+ int ret = 0;
+
+ node = reserved->range_changed.root.rb_node;
+ while (node) {
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ if (entry->val < start)
+ node = node->rb_right;
+ else if (entry)
+ node = node->rb_left;
+ else
+ break;
+ }
+
+ /* Empty changeset */
+ if (!entry)
+ return 0;
+
+ if (entry->val > start && rb_prev(&entry->rb_node))
+ entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
+ rb_node);
+
+ rbtree_iterate_from_safe(node, next, &entry->rb_node) {
+ u64 entry_start;
+ u64 entry_end;
+ u64 entry_len;
+ int clear_ret;
+
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ entry_start = entry->val;
+ entry_end = entry->aux;
+ entry_len = entry_end - entry_start + 1;
+
+ if (entry_start >= start + len)
+ break;
+ if (entry_start + entry_len <= start)
+ continue;
+ /*
+ * Now the entry is in [start, start + len), revert the
+ * EXTENT_QGROUP_RESERVED bit.
+ */
+ clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
+ entry_end, EXTENT_QGROUP_RESERVED);
+ if (!ret && clear_ret < 0)
+ ret = clear_ret;
+
+ ulist_del(&reserved->range_changed, entry->val, entry->aux);
+ if (likely(reserved->bytes_changed >= entry_len)) {
+ reserved->bytes_changed -= entry_len;
+ } else {
+ WARN_ON(1);
+ reserved->bytes_changed = 0;
+ }
+ }
+
+ return ret;
+}
+
/*
- * Reserve qgroup space for range [start, start + len).
+ * Try to free some space for qgroup.
*
- * This function will either reserve space from related qgroups or doing
- * nothing if the range is already reserved.
+ * For qgroup, there are only 3 ways to free qgroup space:
+ * - Flush nodatacow write
+ * Any nodatacow write will free its reserved data space at run_delalloc_range().
+ * In theory, we should only flush nodatacow inodes, but it's not yet
+ * possible, so we need to flush the whole root.
*
- * Return 0 for successful reserve
- * Return <0 for error (including -EQUOT)
+ * - Wait for ordered extents
+ * When ordered extents are finished, their reserved metadata is finally
+ * converted to per_trans status, which can be freed by later commit
+ * transaction.
*
- * NOTE: this function may sleep for memory allocation.
- * if btrfs_qgroup_reserve_data() is called multiple times with
- * same @reserved, caller must ensure when error happens it's OK
- * to free *ALL* reserved space.
+ * - Commit transaction
+ * This would free the meta_per_trans space.
+ * In theory this shouldn't provide much space, but any more qgroup space
+ * is needed.
*/
-int btrfs_qgroup_reserve_data(struct inode *inode,
+static int try_flush_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+ bool can_commit = true;
+
+ /*
+ * We don't want to run flush again and again, so if there is a running
+ * one, we won't try to start a new flush, but exit directly.
+ */
+ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+ wait_event(root->qgroup_flush_wait,
+ !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+ return 0;
+ }
+
+ /*
+ * If current process holds a transaction, we shouldn't flush, as we
+ * assume all space reservation happens before a transaction handle is
+ * held.
+ *
+ * But there are cases like btrfs_delayed_item_reserve_metadata() where
+ * we try to reserve space with one transction handle already held.
+ * In that case we can't commit transaction, but at least try to end it
+ * and hope the started data writes can free some space.
+ */
+ if (current->journal_info &&
+ current->journal_info != BTRFS_SEND_TRANS_STUB)
+ can_commit = false;
+
+ ret = btrfs_start_delalloc_snapshot(root);
+ if (ret < 0)
+ goto out;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ if (can_commit)
+ ret = btrfs_commit_transaction(trans);
+ else
+ ret = btrfs_end_transaction(trans);
+out:
+ clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+ wake_up(&root->qgroup_flush_wait);
+ return ret;
+}
+
+static int qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ struct btrfs_root *root = inode->root;
struct extent_changeset *reserved;
+ bool new_reserved = false;
u64 orig_reserved;
u64 to_reserve;
int ret;
@@ -3445,6 +3611,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
if (WARN_ON(!reserved_ret))
return -EINVAL;
if (!*reserved_ret) {
+ new_reserved = true;
*reserved_ret = extent_changeset_alloc();
if (!*reserved_ret)
return -ENOMEM;
@@ -3452,15 +3619,15 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
- ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ ret = set_record_extent_bits(&inode->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
- trace_btrfs_qgroup_reserve_data(inode, start, len,
+ trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
- goto cleanup;
+ goto out;
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
@@ -3468,23 +3635,49 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
return ret;
cleanup:
- /* cleanup *ALL* already reserved ranges */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(&reserved->range_changed, &uiter)))
- clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
- unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
- /* Also free data bytes of already reserved one */
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
- orig_reserved, BTRFS_QGROUP_RSV_DATA);
- extent_changeset_release(reserved);
+ qgroup_unreserve_range(inode, reserved, start, len);
+out:
+ if (new_reserved) {
+ extent_changeset_release(reserved);
+ kfree(reserved);
+ *reserved_ret = NULL;
+ }
return ret;
}
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or do nothing
+ * if the range is already reserved.
+ *
+ * Return 0 for successful reservation
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: This function may sleep for memory allocation, dirty page flushing and
+ * commit transaction. So caller should not hold any dirty page locked.
+ */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
+{
+ int ret;
+
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(inode->root);
+ if (ret < 0)
+ return ret;
+ return qgroup_reserve_data(inode, reserved_ret, start, len);
+}
+
/* Free ranges specified by @reserved, normally in error path */
-static int qgroup_free_reserved_data(struct inode *inode,
+static int qgroup_free_reserved_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
@@ -3520,8 +3713,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
- free_start, free_start + free_len - 1,
+ ret = clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -3550,7 +3743,8 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
- return qgroup_free_reserved_data(inode, reserved, start, len);
+ return qgroup_free_reserved_data(BTRFS_I(inode), reserved,
+ start, len);
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
@@ -3649,8 +3843,8 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
return num_bytes;
}
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce)
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3676,6 +3870,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return ret;
}
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
+{
+ int ret;
+
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(root);
+ if (ret < 0)
+ return ret;
+ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+}
+
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index b0420c4f5d0e..0a2659685ad6 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -344,12 +344,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
#endif
/* New io_tree based accurate qgroup reserve API */
-int btrfs_qgroup_reserve_data(struct inode *inode,
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc type */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8f47a85944eb..7ac679ed2b6c 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1198,22 +1198,19 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
- int p_stripe = -1;
- int q_stripe = -1;
+ bool has_qstripe;
struct bio_list bio_list;
struct bio *bio;
int ret;
bio_list_init(&bio_list);
- if (rbio->real_stripes - rbio->nr_data == 1) {
- p_stripe = rbio->real_stripes - 1;
- } else if (rbio->real_stripes - rbio->nr_data == 2) {
- p_stripe = rbio->real_stripes - 2;
- q_stripe = rbio->real_stripes - 1;
- } else {
+ if (rbio->real_stripes - rbio->nr_data == 1)
+ has_qstripe = false;
+ else if (rbio->real_stripes - rbio->nr_data == 2)
+ has_qstripe = true;
+ else
BUG();
- }
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
@@ -1257,7 +1254,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
SetPageUptodate(p);
pointers[stripe++] = kmap(p);
- if (q_stripe != -1) {
+ if (has_qstripe) {
/*
* raid6, add the qstripe and call the
@@ -2355,8 +2352,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
- int p_stripe = -1;
- int q_stripe = -1;
+ bool has_qstripe;
struct page *p_page = NULL;
struct page *q_page = NULL;
struct bio_list bio_list;
@@ -2366,14 +2362,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bio_list_init(&bio_list);
- if (rbio->real_stripes - rbio->nr_data == 1) {
- p_stripe = rbio->real_stripes - 1;
- } else if (rbio->real_stripes - rbio->nr_data == 2) {
- p_stripe = rbio->real_stripes - 2;
- q_stripe = rbio->real_stripes - 1;
- } else {
+ if (rbio->real_stripes - rbio->nr_data == 1)
+ has_qstripe = false;
+ else if (rbio->real_stripes - rbio->nr_data == 2)
+ has_qstripe = true;
+ else
BUG();
- }
if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
@@ -2395,17 +2389,22 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
goto cleanup;
SetPageUptodate(p_page);
- if (q_stripe != -1) {
+ if (has_qstripe) {
+ /* RAID6, allocate and map temp space for the Q stripe */
q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!q_page) {
__free_page(p_page);
goto cleanup;
}
SetPageUptodate(q_page);
+ pointers[rbio->real_stripes - 1] = kmap(q_page);
}
atomic_set(&rbio->error, 0);
+ /* Map the parity stripe just once */
+ pointers[nr_data] = kmap(p_page);
+
for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *p;
void *parity;
@@ -2415,17 +2414,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
pointers[stripe] = kmap(p);
}
- /* then add the parity stripe */
- pointers[stripe++] = kmap(p_page);
-
- if (q_stripe != -1) {
-
- /*
- * raid6, add the qstripe and call the
- * library function to fill in our p/q
- */
- pointers[stripe++] = kmap(q_page);
-
+ if (has_qstripe) {
+ /* RAID6, call the library function to fill in our P/Q */
raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
} else {
@@ -2446,12 +2436,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
- kunmap(p_page);
}
+ kunmap(p_page);
__free_page(p_page);
- if (q_page)
+ if (q_page) {
+ kunmap(q_page);
__free_page(q_page);
+ }
writeback:
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 001f13cf9ab8..ba68b0b41dff 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1336,9 +1336,7 @@ static void __del_reloc_root(struct btrfs_root *root)
RB_CLEAR_NODE(&node->rb_node);
}
spin_unlock(&rc->reloc_root_tree.lock);
- if (!node)
- return;
- BUG_ON((struct btrfs_root *)node->data != root);
+ ASSERT(!node || (struct btrfs_root *)node->data == root);
}
spin_lock(&fs_info->trans_lock);
@@ -1838,8 +1836,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
int ret;
int slot;
- BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
@@ -1873,7 +1871,7 @@ again:
struct btrfs_key first_key;
level = btrfs_header_level(parent);
- BUG_ON(level < lowest_level);
+ ASSERT(level >= lowest_level);
ret = btrfs_bin_search(parent, &key, level, &slot);
if (ret < 0)
@@ -4430,7 +4428,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->extent_root = extent_root;
rc->block_group = bg;
- ret = btrfs_inc_block_group_ro(rc->block_group);
+ ret = btrfs_inc_block_group_ro(rc->block_group, true);
if (ret) {
err = ret;
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 612411c74550..0d07ebe511e7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -371,7 +371,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto out;
if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 93d7cb56e44b..e5db948daa12 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3560,7 +3560,26 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* -> btrfs_scrub_pause()
*/
scrub_pause_on(fs_info);
- ret = btrfs_inc_block_group_ro(cache);
+
+ /*
+ * Don't do chunk preallocation for scrub.
+ *
+ * This is especially important for SYSTEM bgs, or we can hit
+ * -EFBIG from btrfs_finish_chunk_alloc() like:
+ * 1. The only SYSTEM bg is marked RO.
+ * Since SYSTEM bg is small, that's pretty common.
+ * 2. New SYSTEM bg will be allocated
+ * Due to regular version will allocate new chunk.
+ * 3. New SYSTEM bg is empty and will get cleaned up
+ * Before cleanup really happens, it's marked RO again.
+ * 4. Empty SYSTEM bg get scrubbed
+ * We go back to 2.
+ *
+ * This can easily boost the amount of SYSTEM chunks if cleaner
+ * thread can't be triggered fast enough, and use up all space
+ * of btrfs_super_block::sys_chunk_array
+ */
+ ret = btrfs_inc_block_group_ro(cache, false);
if (!ret && sctx->is_dev_replace) {
/*
* If we are doing a device replace wait for any tasks
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 903136ceac34..e258fc484cea 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4087,6 +4087,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
} else {
+ /*
+ * If we previously orphanized a directory that
+ * collided with a new reference that we already
+ * processed, recompute the current path because
+ * that directory may be part of the path.
+ */
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
@@ -4994,6 +5005,10 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
put_page(page);
ret = -EIO;
break;
@@ -5519,6 +5534,21 @@ static int clone_range(struct send_ctx *sctx,
break;
offset += clone_len;
clone_root->offset += clone_len;
+
+ /*
+ * If we are cloning from the file we are currently processing,
+ * and using the send root as the clone root, we must stop once
+ * the current clone offset reaches the current eof of the file
+ * at the receiver, otherwise we would issue an invalid clone
+ * operation (source range going beyond eof) and cause the
+ * receiver to fail. So if we reach the current eof, bail out
+ * and fallback to a regular write.
+ */
+ if (clone_root->root == sctx->send_root &&
+ clone_root->ino == sctx->cur_ino &&
+ clone_root->offset >= sctx->cur_inode_next_write_offset)
+ break;
+
data_offset += clone_len;
next:
path->slots[0]++;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 90500b6c41fc..1cd39f6a9c3a 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -262,9 +262,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
{
lockdep_assert_held(&info->lock);
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
+ /* The free space could be negative in case of overcommit */
+ btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
info->flags,
- info->total_bytes - btrfs_space_info_used(info, true),
+ (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c346ee7ec18d..e6cb95b81787 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -27,7 +27,6 @@
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
- [TRANS_STATE_BLOCKED] = __TRANS_START,
[TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
[TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
@@ -388,7 +387,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
- return (trans->state >= TRANS_STATE_BLOCKED &&
+ return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
!TRANS_ABORTED(trans));
}
@@ -580,7 +579,7 @@ again:
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
may_wait_transaction(fs_info, type)) {
current->journal_info = h;
btrfs_commit_transaction(h);
@@ -797,7 +796,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
smp_mb();
- if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
cur_trans->delayed_refs.flushing)
return 1;
@@ -830,7 +829,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
if (refcount_read(&trans->use_count) > 1) {
@@ -846,13 +844,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_chunk_metadata(trans);
- if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
- if (throttle)
- return btrfs_commit_transaction(trans);
- else
- wake_up_process(info->transaction_kthread);
- }
-
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(info->sb);
@@ -1212,7 +1203,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
struct btrfs_root *gang[8];
int i;
int ret;
- int err = 0;
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
@@ -1224,6 +1214,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
break;
for (i = 0; i < ret; i++) {
struct btrfs_root *root = gang[i];
+ int ret2;
+
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
@@ -1245,17 +1237,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
root->node);
}
- err = btrfs_update_root(trans, fs_info->tree_root,
+ ret2 = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
+ if (ret2)
+ return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
- if (err)
- break;
btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
- return err;
+ return 0;
}
/*
@@ -1273,8 +1265,10 @@ int btrfs_defrag_root(struct btrfs_root *root)
while (1) {
trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
ret = btrfs_defrag_leaves(trans, root);
@@ -2303,7 +2297,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
- clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7291a2a93075..cbede328bda5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -13,7 +13,6 @@
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
- TRANS_STATE_BLOCKED,
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
@@ -161,7 +160,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans - 1;
spin_unlock(&BTRFS_I(inode)->lock);
}
@@ -208,20 +207,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
-
-/*
- * Try to commit transaction asynchronously, so this is safe to call
- * even holding a spinlock.
- *
- * It's done by informing transaction_kthread to commit transaction without
- * waiting for commit interval.
- */
-static inline void btrfs_commit_transaction_locksafe(
- struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
- wake_up_process(fs_info->transaction_kthread);
-}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7d06842a3d74..368c43c6cbd0 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1285,22 +1285,14 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
return -EUCLEAN;
}
for (; ptr < end; ptr += sizeof(*dref)) {
- u64 root_objectid;
- u64 owner;
u64 offset;
- u64 hash;
+ /*
+ * We cannot check the extent_data_ref hash due to possible
+ * overflow from the leaf due to hash collisions.
+ */
dref = (struct btrfs_extent_data_ref *)ptr;
- root_objectid = btrfs_extent_data_ref_root(leaf, dref);
- owner = btrfs_extent_data_ref_objectid(leaf, dref);
offset = btrfs_extent_data_ref_offset(leaf, dref);
- hash = hash_extent_data_ref(root_objectid, owner, offset);
- if (hash != key->offset) {
- extent_err(leaf, slot,
- "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx",
- hash, key->offset);
- return -EUCLEAN;
- }
if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) {
extent_err(leaf, slot,
"invalid extent data backref offset, have %llu expect aligned to %u",
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index de53e5166997..b7bfecfc2ea3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -174,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -719,7 +719,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
- if (ret == 0) {
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
ins.objectid, ins.offset, 0);
@@ -898,9 +900,11 @@ out:
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
@@ -909,29 +913,35 @@ static noinline int inode_in_dir(struct btrfs_root *root,
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ if (PTR_ERR(di) != -ENOENT)
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -1158,7 +1168,10 @@ next:
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ if (PTR_ERR(di) != -ENOENT)
+ return PTR_ERR(di);
+ } else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
@@ -1168,7 +1181,9 @@ next:
/* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
@@ -1293,6 +1308,15 @@ again:
inode, name, namelen);
kfree(name);
iput(dir);
+ /*
+ * Whenever we need to check if a name exists or not, we
+ * check the subvolume tree. So after an unlink we must
+ * run delayed items, so that future checks for a name
+ * during log replay see that the name does not exists
+ * anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
goto again;
@@ -1493,10 +1517,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen)) {
+ ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
+ btrfs_ino(BTRFS_I(inode)), ref_index,
+ name, namelen);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1542,6 +1568,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
if (!ret && inode->i_nlink == 0)
inc_nlink(inode);
+ /*
+ * Whenever we need to check if a name exists or
+ * not, we check the subvolume tree. So after an
+ * unlink we must run delayed items, so that future
+ * checks for a name during log replay see that the
+ * name does not exists anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
}
if (ret < 0)
goto out;
@@ -1554,6 +1589,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
btrfs_update_inode(trans, root, inode);
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
@@ -1775,6 +1811,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
if (ret == 1) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -1787,17 +1824,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
@@ -1806,8 +1845,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
btrfs_release_path(path);
return ret;
}
@@ -1846,8 +1883,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, inode);
} else if (ret == -EEXIST) {
ret = 0;
- } else {
- BUG(); /* Logic Error */
}
iput(inode);
@@ -1943,8 +1978,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_key log_key;
struct inode *dir;
u8 log_type;
- int exists;
- int ret = 0;
+ bool exists;
+ int ret;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
bool name_added = false;
@@ -1964,12 +1999,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
name_len);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
+ ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ exists = (ret == 0);
+ ret = 0;
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1984,7 +2019,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = -EINVAL;
goto out;
}
- if (IS_ERR_OR_NULL(dst_di)) {
+
+ if (dst_di == ERR_PTR(-ENOENT))
+ dst_di = NULL;
+
+ if (IS_ERR(dst_di)) {
+ ret = PTR_ERR(dst_di);
+ goto out;
+ } else if (!dst_di) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
@@ -2466,7 +2508,9 @@ again:
else {
ret = find_dir_range(log, path, dirid, key_type,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
@@ -4223,7 +4267,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -4924,7 +4968,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * it has the full sync bit set (see btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5016,6 +5060,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
@@ -5036,13 +5081,21 @@ again:
if (min_key->type > max_key->type)
break;
- if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
-
- if ((min_key->type == BTRFS_INODE_REF_KEY ||
- min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5073,10 +5126,8 @@ again:
btrfs_release_path(path);
goto next_key;
}
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
@@ -5129,9 +5180,21 @@ next_key:
break;
}
}
- if (ins_nr)
+ if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
return ret;
}
@@ -5232,6 +5295,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
@@ -5380,19 +5455,34 @@ log_extents:
}
/*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
+ * If we are logging that an ancestor inode exists as part of logging a
+ * new name from a link or rename operation, don't mark the inode as
+ * logged - otherwise if an explicit fsync is made against an ancestor,
+ * the fsync considers the inode in the log and doesn't sync the log,
+ * resulting in the ancestor missing after a power failure unless the
+ * log was synced as part of an fsync against any other unrelated inode.
+ * So keep it simple for this case and just don't flag the ancestors as
+ * logged.
*/
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
+ if (!ctx ||
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+ &inode->vfs_inode != ctx->inode)) {
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists
+ * after it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode,
+ * then the inode gets evicted after all delalloc was flushed,
+ * then we log it exists (due to a rename for example) and then
+ * fsync it. This last fsync would do nothing (not logging the
+ * extents previously written).
+ */
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5629,7 +5719,7 @@ process_leaf:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, 0, LLONG_MAX, ctx);
@@ -6328,6 +6418,7 @@ next:
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
@@ -6417,26 +6508,12 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
- *
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
- *
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+ struct dentry *parent)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- int ret;
+ struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
@@ -6449,36 +6526,20 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans <= fs_info->last_trans_committed &&
- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
-
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ if (!inode_logged(trans, inode) &&
+ (!old_dir || !inode_logged(trans, old_dir)))
+ return;
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, &ctx);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 132e43d29034..ddfc6789d9bf 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -16,6 +16,7 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
+ bool logging_new_name;
struct inode *inode;
struct list_head list;
};
@@ -26,6 +27,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
}
@@ -67,16 +69,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
-/* Return values for btrfs_log_new_name() */
-enum {
- BTRFS_DONT_NEED_TRANS_COMMIT,
- BTRFS_NEED_TRANS_COMMIT,
- BTRFS_DONT_NEED_LOG_SYNC,
- BTRFS_NEED_LOG_SYNC,
-};
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx);
+ struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 67ffbe92944c..8898682c9103 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -742,6 +742,8 @@ static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *device, *tmp_device;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
if (path)
ret = -ENOENT;
@@ -1181,11 +1183,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_device *orig_dev;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
- mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
@@ -1217,10 +1220,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
- mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
- mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(ret);
}
@@ -1266,6 +1267,7 @@ again:
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
@@ -1310,8 +1312,13 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->rw_devices--;
}
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
+ }
btrfs_close_bdev(device);
@@ -2155,8 +2162,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
-
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
@@ -2167,7 +2177,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (IS_ERR(device)) {
if (PTR_ERR(device) == -ENOENT &&
- strcmp(device_path, "missing") == 0)
+ device_path && strcmp(device_path, "missing") == 0)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = PTR_ERR(device);
@@ -2200,11 +2210,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2286,7 +2294,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
}
out:
- mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -4313,10 +4320,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}
@@ -4409,6 +4418,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
+ btrfs_release_path(path);
+
mutex_lock(&fs_info->balance_mutex);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 95d9aebff2c4..48858510739b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -227,11 +227,33 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
+ const bool start_trans = (current->journal_info == NULL);
int ret;
- trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (start_trans) {
+ /*
+ * 1 unit for inserting/updating/deleting the xattr
+ * 1 unit for the inode item update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ } else {
+ /*
+ * This can happen when smack is enabled and a directory is being
+ * created. It happens through d_instantiate_new(), which calls
+ * smack_d_instantiate(), which in turn calls __vfs_setxattr() to
+ * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the
+ * inode. We have already reserved space for the xattr and inode
+ * update at btrfs_mkdir(), so just use the transaction handle.
+ * We don't join or start a transaction, as that will reset the
+ * block_rsv of the handle and trigger a warning for the start
+ * case.
+ */
+ ASSERT(strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) == 0);
+ trans = current->journal_info;
+ }
ret = btrfs_setxattr(trans, inode, name, value, size, flags);
if (ret)
@@ -242,7 +264,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
out:
- btrfs_end_transaction(trans);
+ if (start_trans)
+ btrfs_end_transaction(trans);
return ret;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a02e845eb0fb..34ab7b892b70 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -76,10 +76,6 @@ static int ceph_set_page_dirty(struct page *page)
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- int ret;
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
if (PageDirty(page)) {
dout("%p set_page_dirty %p idx %lu -- already dirty\n",
@@ -125,11 +121,7 @@ static int ceph_set_page_dirty(struct page *page)
page->private = (unsigned long)snapc;
SetPagePrivate(page);
- ret = __set_page_dirty_nobuffers(page);
- WARN_ON(!PageLocked(page));
- WARN_ON(!page->mapping);
-
- return ret;
+ return __set_page_dirty_nobuffers(page);
}
/*
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 22833fa5bb58..06e9b26bf277 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1775,11 +1775,14 @@ static u64 __mark_caps_flushing(struct inode *inode,
* try to invalidate mapping pages without blocking.
*/
static int try_nonblocking_invalidate(struct inode *inode)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_invalidate(inode);
invalidate_mapping_pages(&inode->i_data, 0, -1);
spin_lock(&ci->i_ceph_lock);
@@ -2246,7 +2249,6 @@ static int unsafe_request_wait(struct inode *inode)
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 flush_tid;
@@ -2277,14 +2279,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (err < 0)
ret = err;
- if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
- spin_lock(&file->f_lock);
- err = errseq_check_and_advance(&ci->i_meta_err,
- &fi->meta_err);
- spin_unlock(&file->f_lock);
- if (err < 0)
- ret = err;
- }
+ err = file_check_and_advance_wb_err(file);
+ if (err < 0)
+ ret = err;
out:
dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
@@ -4052,12 +4049,20 @@ bad:
/*
* Delayed work handler to process end of delayed cap release LRU list.
+ *
+ * If new caps are added to the list while processing it, these won't get
+ * processed in this run. In this case, the ci->i_hold_caps_max will be
+ * returned so that the work can be scheduled accordingly.
*/
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
int flags = CHECK_CAPS_NODELAY;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+ unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
+ unsigned long loop_start = jiffies;
+ unsigned long delay = 0;
dout("check_delayed_caps\n");
while (1) {
@@ -4067,6 +4072,11 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
+ if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
+ dout("%s caps added recently. Exiting loop", __func__);
+ delay = ci->i_hold_caps_max;
+ break;
+ }
if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
time_before(jiffies, ci->i_hold_caps_max))
break;
@@ -4083,6 +4093,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
}
}
spin_unlock(&mdsc->cap_delay_lock);
+
+ return delay;
}
/*
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e088843a7734..baa6368bece5 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -178,8 +178,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
return ERR_CAST(inode);
/* We need LINK caps to reliably check i_nlink */
err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false);
- if (err)
+ if (err) {
+ iput(inode);
return ERR_PTR(err);
+ }
/* -ESTALE if inode as been unlinked and no file is open */
if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) {
iput(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a10711a6337a..aa1eac6d89f2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -234,7 +234,6 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
fi->fmode = fmode;
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
- fi->meta_err = errseq_sample(&ci->i_meta_err);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
return 0;
@@ -1469,32 +1468,26 @@ retry_snap:
goto out;
}
- err = file_remove_privs(file);
- if (err)
+ down_read(&osdc->lock);
+ map_flags = osdc->osdmap->flags;
+ pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
+ up_read(&osdc->lock);
+ if ((map_flags & CEPH_OSDMAP_FULL) ||
+ (pool_flags & CEPH_POOL_FLAG_FULL)) {
+ err = -ENOSPC;
goto out;
+ }
- err = file_update_time(file);
+ err = file_remove_privs(file);
if (err)
goto out;
- inode_inc_iversion_raw(inode);
-
if (ci->i_inline_version != CEPH_INLINE_NONE) {
err = ceph_uninline_data(file, NULL);
if (err < 0)
goto out;
}
- down_read(&osdc->lock);
- map_flags = osdc->osdmap->flags;
- pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
- up_read(&osdc->lock);
- if ((map_flags & CEPH_OSDMAP_FULL) ||
- (pool_flags & CEPH_POOL_FLAG_FULL)) {
- err = -ENOSPC;
- goto out;
- }
-
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
inode, ceph_vinop(inode), pos, count, i_size_read(inode));
if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -1507,6 +1500,12 @@ retry_snap:
if (err < 0)
goto out;
+ err = file_update_time(file);
+ if (err)
+ goto out_caps;
+
+ inode_inc_iversion_raw(inode);
+
dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
@@ -1590,6 +1589,8 @@ retry_snap:
}
goto out_unlocked;
+out_caps:
+ ceph_put_cap_refs(ci, got);
out:
if (direct_lock)
ceph_end_io_direct(inode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 660a878e20ef..af85a7237604 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -515,8 +515,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ceph_fscache_inode_init(ci);
- ci->i_meta_err = 0;
-
return &ci->vfs_inode;
}
@@ -1875,6 +1873,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_invalidate(inode);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
pr_err("invalidate_pages %p fails\n", inode);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0f21073a51a1..37fb71797b34 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1272,7 +1272,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
{
struct ceph_mds_request *req;
struct rb_node *p;
- struct ceph_inode_info *ci;
dout("cleanup_session_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
@@ -1281,16 +1280,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request, r_unsafe_item);
pr_warn_ratelimited(" dropping unsafe request %llu\n",
req->r_tid);
- if (req->r_target_inode) {
- /* dropping unsafe change of inode's attributes */
- ci = ceph_inode(req->r_target_inode);
- errseq_set(&ci->i_meta_err, -EIO);
- }
- if (req->r_unsafe_dir) {
- /* dropping unsafe directory operation */
- ci = ceph_inode(req->r_unsafe_dir);
- errseq_set(&ci->i_meta_err, -EIO);
- }
+ if (req->r_target_inode)
+ mapping_set_error(req->r_target_inode->i_mapping, -EIO);
+ if (req->r_unsafe_dir)
+ mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
__unregister_request(mdsc, req);
}
/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1436,7 +1429,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_unlock(&mdsc->cap_dirty_lock);
if (dirty_dropped) {
- errseq_set(&ci->i_meta_err, -EIO);
+ mapping_set_error(inode->i_mapping, -EIO);
if (ci->i_wrbuffer_ref_head == 0 &&
ci->i_wr_ref == 0 &&
@@ -4049,22 +4042,29 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
}
/*
- * delayed work -- periodically trim expired leases, renew caps with mds
+ * delayed work -- periodically trim expired leases, renew caps with mds. If
+ * the @delay parameter is set to 0 or if it's more than 5 secs, the default
+ * workqueue delay value of 5 secs will be used.
*/
-static void schedule_delayed(struct ceph_mds_client *mdsc)
+static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
{
- int delay = 5;
- unsigned hz = round_jiffies_relative(HZ * delay);
- schedule_delayed_work(&mdsc->delayed_work, hz);
+ unsigned long max_delay = HZ * 5;
+
+ /* 5 secs default delay */
+ if (!delay || (delay > max_delay))
+ delay = max_delay;
+ schedule_delayed_work(&mdsc->delayed_work,
+ round_jiffies_relative(delay));
}
static void delayed_work(struct work_struct *work)
{
- int i;
struct ceph_mds_client *mdsc =
container_of(work, struct ceph_mds_client, delayed_work.work);
+ unsigned long delay;
int renew_interval;
int renew_caps;
+ int i;
dout("mdsc delayed_work\n");
@@ -4119,7 +4119,7 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
- ceph_check_delayed_caps(mdsc);
+ delay = ceph_check_delayed_caps(mdsc);
ceph_queue_cap_reclaim_work(mdsc);
@@ -4127,7 +4127,7 @@ static void delayed_work(struct work_struct *work)
maybe_recover_session(mdsc);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, delay);
}
int ceph_mdsc_init(struct ceph_fs_client *fsc)
@@ -4600,7 +4600,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, 0);
return;
bad_unlock:
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 923be9399b21..e1b9b224fcb2 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -60,24 +60,26 @@
/*
* increase ref count for the realm
*
- * caller must hold snap_rwsem for write.
+ * caller must hold snap_rwsem.
*/
void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("get_realm %p %d -> %d\n", realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
/*
- * since we _only_ increment realm refs or empty the empty
- * list with snap_rwsem held, adjusting the empty list here is
- * safe. we do need to protect against concurrent empty list
- * additions, however.
+ * The 0->1 and 1->0 transitions must take the snap_empty_lock
+ * atomically with the refcount change. Go ahead and bump the
+ * nref here, unless it's 0, in which case we take the spinlock
+ * and then do the increment and remove it from the list.
*/
- if (atomic_inc_return(&realm->nref) == 1) {
- spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_not_zero(&realm->nref))
+ return;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_return(&realm->nref) == 1)
list_del_init(&realm->empty_item);
- spin_unlock(&mdsc->snap_empty_lock);
- }
+ spin_unlock(&mdsc->snap_empty_lock);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -113,6 +115,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
{
struct ceph_snap_realm *realm;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
realm = kzalloc(sizeof(*realm), GFP_NOFS);
if (!realm)
return ERR_PTR(-ENOMEM);
@@ -135,7 +139,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
/*
* lookup the realm rooted at @ino.
*
- * caller must hold snap_rwsem for write.
+ * caller must hold snap_rwsem.
*/
static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino)
@@ -143,6 +147,8 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
while (n) {
r = rb_entry(n, struct ceph_snap_realm, node);
if (ino < r->ino)
@@ -176,6 +182,8 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
@@ -198,28 +206,30 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
+ /*
+ * We do not require the snap_empty_lock here, as any caller that
+ * increments the value must hold the snap_rwsem.
+ */
if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm);
}
/*
- * caller needn't hold any locks
+ * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
*/
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
- if (!atomic_dec_and_test(&realm->nref))
+ if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
return;
if (down_write_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&mdsc->snap_empty_lock);
__destroy_snap_realm(mdsc, realm);
up_write(&mdsc->snap_rwsem);
} else {
- spin_lock(&mdsc->snap_empty_lock);
list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock);
}
@@ -236,6 +246,8 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
struct ceph_snap_realm *realm;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
spin_lock(&mdsc->snap_empty_lock);
while (!list_empty(&mdsc->snap_empty)) {
realm = list_first_entry(&mdsc->snap_empty,
@@ -269,6 +281,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
{
struct ceph_snap_realm *parent;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
if (realm->parent_ino == parentino)
return 0;
@@ -686,6 +700,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
dout("update_snap_trace deletion=%d\n", deletion);
more:
ceph_decode_need(&p, e, sizeof(*ri), bad);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index bb12c9f3a218..6db7b3387e1c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -402,8 +402,6 @@ struct ceph_inode_info {
struct fscache_cookie *fscache;
u32 i_fscache_gen;
#endif
- errseq_t i_meta_err;
-
struct inode vfs_inode; /* at end */
};
@@ -712,7 +710,6 @@ struct ceph_file_info {
spinlock_t rw_contexts_lock;
struct list_head rw_contexts;
- errseq_t meta_err;
u32 filp_gen;
atomic_t num_locks;
};
@@ -1064,7 +1061,7 @@ extern void ceph_flush_snaps(struct ceph_inode_info *ci,
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index cc3ada12848d..42125601ebb1 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -151,6 +151,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
return ERR_PTR(-EINVAL);
if (ref) {
+ if (WARN_ON_ONCE(!ref->node_name || ref->path_consumed < 0))
+ return ERR_PTR(-EINVAL);
+
if (strlen(fullpath) - ref->path_consumed) {
prepath = fullpath + ref->path_consumed;
/* skip initial delimiter */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 9bd03a231032..171ad8b42107 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -358,14 +358,9 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
if (!dst)
return NULL;
cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
- NO_MAP_UNI_RSVD);
+ NO_MAP_UNI_RSVD);
} else {
- len = strnlen(src, maxlen);
- len++;
- dst = kmalloc(len, GFP_KERNEL);
- if (!dst)
- return NULL;
- strlcpy(dst, src, len);
+ dst = kstrndup(src, maxlen, GFP_KERNEL);
}
return dst;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 115f063497ff..79a18692b84c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -278,7 +278,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
rc = server->ops->queryfs(xid, tcon, buf);
free_xid(xid);
- return 0;
+ return rc;
}
static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
@@ -855,6 +855,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
out_super:
deactivate_locked_super(sb);
+ return root;
out:
cifs_cleanup_volume_info(volume_info);
return root;
@@ -888,7 +889,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t rc;
struct inode *inode = file_inode(iocb->ki_filp);
- if (iocb->ki_filp->f_flags & O_DIRECT)
+ if (iocb->ki_flags & IOCB_DIRECT)
return cifs_user_readv(iocb, iter);
rc = cifs_revalidate_mapping(inode);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b16c994414ab..9c0e348cb00f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -964,7 +964,7 @@ struct cifs_ses {
struct mutex session_mutex;
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
- enum statusEnum status;
+ enum statusEnum status; /* updates protected by GlobalMid_Lock */
unsigned overrideSecFlg; /* if non-zero override global sec flags */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
@@ -1814,6 +1814,7 @@ require use of the stronger protocol */
* list operations on pending_mid_q and oplockQ
* updates to XID counters, multiplex id and SMB sequence numbers
* list operations on global DnotifyReqList
+ * updates to ses->status
* tcp_ses_lock protects:
* list operations on tcp and SMB session lists
* tcon->open_file_lock protects the list of open files hanging off the tcon
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ab9eeb5ff8e5..86bdebd2ece6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3049,9 +3049,12 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
spin_unlock(&cifs_tcp_ses_lock);
return;
}
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ spin_lock(&GlobalMid_Lock);
if (ses->status == CifsGood)
ses->status = CifsExiting;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&GlobalMid_Lock);
cifs_free_ipc(ses);
@@ -3688,9 +3691,10 @@ cifs_match_super(struct super_block *sb, void *data)
spin_lock(&cifs_tcp_ses_lock);
cifs_sb = CIFS_SB(sb);
tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
- if (IS_ERR(tlink)) {
+ if (tlink == NULL) {
+ /* can not match superblock if tlink were ever null */
spin_unlock(&cifs_tcp_ses_lock);
- return rc;
+ return 0;
}
tcon = tlink_tcon(tlink);
ses = tcon->ses;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5a35850ccb1a..9ae9a514676c 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -738,6 +738,7 @@ static int
cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
struct inode *inode;
+ int rc;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -747,8 +748,25 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
CIFS_I(inode)->time = 0; /* force reval */
- if (cifs_revalidate_dentry(direntry))
- return 0;
+ rc = cifs_revalidate_dentry(direntry);
+ if (rc) {
+ cifs_dbg(FYI, "cifs_revalidate_dentry failed with rc=%d", rc);
+ switch (rc) {
+ case -ENOENT:
+ case -ESTALE:
+ /*
+ * Those errors mean the dentry is invalid
+ * (file was deleted or recreated)
+ */
+ return 0;
+ default:
+ /*
+ * Otherwise some unexpected error happened
+ * report it as-is to VFS layer
+ */
+ return rc;
+ }
+ }
else {
/*
* If the inode wasn't known to be a dfs entry when
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 31d578739341..03c85beecec1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -164,6 +164,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
goto posix_open_ret;
}
} else {
+ cifs_revalidate_mapping(*pinode);
cifs_fattr_to_inode(*pinode, &fattr);
}
@@ -2576,12 +2577,23 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
tcon = tlink_tcon(smbfile->tlink);
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
server = tcon->ses->server;
- if (server->ops->flush)
- rc = server->ops->flush(xid, tcon, &smbfile->fid);
- else
+ if (server->ops->flush == NULL) {
rc = -ENOSYS;
+ goto strict_fsync_exit;
+ }
+
+ if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) {
+ smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY);
+ if (smbfile) {
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
+ cifsFileInfo_put(smbfile);
+ } else
+ cifs_dbg(FYI, "ignore fsync for file not open for write\n");
+ } else
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
}
+strict_fsync_exit:
free_xid(xid);
return rc;
}
@@ -2593,6 +2605,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct cifsFileInfo *smbfile = file->private_data;
+ struct inode *inode = file_inode(file);
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
rc = file_write_and_wait_range(file, start, end);
@@ -2607,12 +2620,23 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
tcon = tlink_tcon(smbfile->tlink);
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
server = tcon->ses->server;
- if (server->ops->flush)
- rc = server->ops->flush(xid, tcon, &smbfile->fid);
- else
+ if (server->ops->flush == NULL) {
rc = -ENOSYS;
+ goto fsync_exit;
+ }
+
+ if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) {
+ smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY);
+ if (smbfile) {
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
+ cifsFileInfo_put(smbfile);
+ } else
+ cifs_dbg(FYI, "ignore fsync for file not open for write\n");
+ } else
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
}
+fsync_exit:
free_xid(xid);
return rc;
}
@@ -2988,7 +3012,7 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
struct dentry *dentry = ctx->cfile->dentry;
- int rc;
+ ssize_t rc;
tcon = tlink_tcon(ctx->cfile->tlink);
cifs_sb = CIFS_SB(dentry->d_sb);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index b736acd3917b..a24bcbbb5033 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -97,6 +97,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
if (rc != 1)
return -EINVAL;
+ if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+ return -EINVAL;
+
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 85bd644f9773..30f841a880ac 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -610,7 +610,7 @@ sess_alloc_buffer(struct sess_data *sess_data, int wct)
return 0;
out_free_smb_buf:
- kfree(smb_buf);
+ cifs_small_buf_release(smb_buf);
sess_data->iov[0].iov_base = NULL;
sess_data->iov[0].iov_len = 0;
sess_data->buf0_type = CIFS_NO_BUFFER;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7d875a47d022..7177720e822e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -738,8 +738,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
}
}
spin_unlock(&cifs_tcp_ses_lock);
- cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
- return false;
+ cifs_dbg(FYI, "No file id matched, oplock break ignored\n");
+ return true;
}
void
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 30025cc5d4ae..7985fe25850b 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -498,8 +498,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
p = buf;
while (bytes_left >= sizeof(*p)) {
info->speed = le64_to_cpu(p->LinkSpeed);
- info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE);
- info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE);
+ info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
+ info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count);
cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
@@ -1643,9 +1643,17 @@ smb2_copychunk_range(const unsigned int xid,
int chunks_copied = 0;
bool chunk_sizes_updated = false;
ssize_t bytes_written, total_bytes_written = 0;
+ struct inode *inode;
pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+ /*
+ * We need to flush all unwritten data before we can send the
+ * copychunk ioctl to the server.
+ */
+ inode = d_inode(trgtfile->dentry);
+ filemap_write_and_wait(inode->i_mapping);
+
if (pcchunk == NULL)
return -ENOMEM;
@@ -1673,6 +1681,8 @@ smb2_copychunk_range(const unsigned int xid,
cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
/* Request server copy to target from src identified by key */
+ kfree(retbuf);
+ retbuf = NULL;
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
true /* is_fsctl */, (char *)pcchunk,
@@ -3693,7 +3703,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
}
spin_unlock(&cifs_tcp_ses_lock);
- return 1;
+ return -EAGAIN;
}
/*
* Encrypt or decrypt @rqst message. @rqst[0] has the following format:
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index be06b26d6ca0..e068f82ffedd 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -490,8 +490,8 @@ build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES;
pneg_ctxt->DataLength = cpu_to_le16(38);
pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1);
- pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE);
- get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE);
+ pneg_ctxt->SaltLength = cpu_to_le16(SMB311_LINUX_CLIENT_SALT_SIZE);
+ get_random_bytes(pneg_ctxt->Salt, SMB311_LINUX_CLIENT_SALT_SIZE);
pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512;
}
@@ -617,6 +617,9 @@ static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt)
if (len < MIN_PREAUTH_CTXT_DATA_LEN) {
printk_once(KERN_WARNING "server sent bad preauth context\n");
return;
+ } else if (len < MIN_PREAUTH_CTXT_DATA_LEN + le16_to_cpu(ctxt->SaltLength)) {
+ pr_warn_once("server sent invalid SaltLength\n");
+ return;
}
if (le16_to_cpu(ctxt->HashAlgorithmCount) != 1)
printk_once(KERN_WARNING "illegal SMB3 hash algorithm count\n");
@@ -976,6 +979,13 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
/* Internal types */
server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
+ /*
+ * SMB3.0 supports only 1 cipher and doesn't have a encryption neg context
+ * Set the cipher type manually.
+ */
+ if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
+ server->cipher_type = SMB2_ENCRYPTION_AES128_CCM;
+
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
(struct smb2_sync_hdr *)rsp);
/*
@@ -3601,10 +3611,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* Related requests use info from previous read request
* in chain.
*/
- shdr->SessionId = 0xFFFFFFFF;
+ shdr->SessionId = 0xFFFFFFFFFFFFFFFF;
shdr->TreeId = 0xFFFFFFFF;
- req->PersistentFileId = 0xFFFFFFFF;
- req->VolatileFileId = 0xFFFFFFFF;
+ req->PersistentFileId = 0xFFFFFFFFFFFFFFFF;
+ req->VolatileFileId = 0xFFFFFFFFFFFFFFFF;
}
}
if (remaining_bytes > io_parms->length)
@@ -3740,8 +3750,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest =
- cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
+ shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8);
rc = adjust_credits(server, &rdata->credits, rdata->bytes);
if (rc)
@@ -4035,8 +4044,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (wdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest =
- cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
+ shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8);
rc = adjust_credits(server, &wdata->credits, wdata->bytes);
if (rc)
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f264e1d36fe1..739556e385be 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -227,7 +227,7 @@ struct smb2_negotiate_req {
__le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
__le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
__le16 Reserved2;
- __le16 Dialects[1]; /* One dialect (vers=) at a time for now */
+ __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
} __packed;
/* Dialects */
@@ -271,12 +271,20 @@ struct smb2_neg_context {
/* Followed by array of data */
} __packed;
-#define SMB311_SALT_SIZE 32
+#define SMB311_LINUX_CLIENT_SALT_SIZE 32
/* Hash Algorithm Types */
#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
#define SMB2_PREAUTH_HASH_SIZE 64
-#define MIN_PREAUTH_CTXT_DATA_LEN (SMB311_SALT_SIZE + 6)
+/*
+ * SaltLength that the server send can be zero, so the only three required
+ * fields (all __le16) end up six bytes total, so the minimum context data len
+ * in the response is six bytes which accounts for
+ *
+ * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
+ */
+#define MIN_PREAUTH_CTXT_DATA_LEN 6
+
struct smb2_preauth_neg_context {
__le16 ContextType; /* 1 */
__le16 DataLength;
@@ -284,7 +292,7 @@ struct smb2_preauth_neg_context {
__le16 HashAlgorithmCount; /* 1 */
__le16 SaltLength;
__le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
- __u8 Salt[SMB311_SALT_SIZE];
+ __u8 Salt[SMB311_LINUX_CLIENT_SALT_SIZE];
} __packed;
/* Encryption Algorithms Ciphers */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 6d6de183915b..61e7df4d9cb1 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -339,7 +339,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
if (ssocket == NULL)
return -EAGAIN;
- if (signal_pending(current)) {
+ if (fatal_signal_pending(current)) {
cifs_dbg(FYI, "signal pending before send request\n");
return -ERESTARTSYS;
}
@@ -431,7 +431,7 @@ unmask:
if (signal_pending(current) && (total_len != send_length)) {
cifs_dbg(FYI, "signal is pending after attempt to send\n");
- rc = -EINTR;
+ rc = -ERESTARTSYS;
}
/* uncork it */
@@ -659,10 +659,22 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num,
spin_lock(&server->req_lock);
if (*credits < num) {
/*
- * Return immediately if not too many requests in flight since
- * we will likely be stuck on waiting for credits.
+ * If the server is tight on resources or just gives us less
+ * credits for other reasons (e.g. requests are coming out of
+ * order and the server delays granting more credits until it
+ * processes a missing mid) and we exhausted most available
+ * credits there may be situations when we try to send
+ * a compound request but we don't have enough credits. At this
+ * point the client needs to decide if it should wait for
+ * additional credits or fail the request. If at least one
+ * request is in flight there is a high probability that the
+ * server will return enough credits to satisfy this compound
+ * request.
+ *
+ * Return immediately if no requests in flight since we will be
+ * stuck on waiting for credits.
*/
- if (server->in_flight < num - *credits) {
+ if (server->in_flight == 0) {
spin_unlock(&server->req_lock);
return -ENOTSUPP;
}
@@ -1122,9 +1134,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP))
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
+ mutex_lock(&server->srv_mutex);
smb311_update_preauth_hash(ses, rqst[0].rq_iov,
rqst[0].rq_nvec);
+ mutex_unlock(&server->srv_mutex);
+ }
for (i = 0; i < num_rqst; i++) {
rc = wait_for_response(server, midQ[i]);
@@ -1133,7 +1148,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
if (rc != 0) {
for (; i < num_rqst; i++) {
- cifs_server_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n",
+ cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
@@ -1192,7 +1207,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
.iov_base = resp_iov[0].iov_base,
.iov_len = resp_iov[0].iov_len
};
+ mutex_lock(&server->srv_mutex);
smb311_update_preauth_hash(ses, &iov, 1);
+ mutex_unlock(&server->srv_mutex);
}
out:
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index cb733652ecca..d73d88d9c259 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -36,6 +36,14 @@
*/
DEFINE_SPINLOCK(configfs_dirent_lock);
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_mutex is held.
+ * But parent configfs_subsystem is NULL when config_item is root.
+ * Use this mutex when config_item is root.
+ */
+static DEFINE_MUTEX(configfs_subsystem_mutex);
+
static void configfs_d_iput(struct dentry * dentry,
struct inode * inode)
{
@@ -1805,8 +1813,8 @@ void configfs_unregister_group(struct config_group *group)
configfs_detach_group(&group->cg_item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
+ d_drop(dentry);
fsnotify_rmdir(d_inode(parent), dentry);
- d_delete(dentry);
inode_unlock(d_inode(parent));
dput(dentry);
@@ -1884,7 +1892,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
sd = root->d_fsdata;
+ mutex_lock(&configfs_subsystem_mutex);
link_group(to_config_group(sd->s_element), group);
+ mutex_unlock(&configfs_subsystem_mutex);
inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
@@ -1909,7 +1919,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
inode_unlock(d_inode(root));
if (err) {
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
put_fragment(frag);
@@ -1947,16 +1959,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
configfs_detach_group(&group->cg_item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- fsnotify_rmdir(d_inode(root), dentry);
inode_unlock(d_inode(dentry));
- d_delete(dentry);
+ d_drop(dentry);
+ fsnotify_rmdir(d_inode(root), dentry);
inode_unlock(d_inode(root));
dput(dentry);
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index fb65b706cc0d..66fae1853d99 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -378,7 +378,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type
attr = to_attr(dentry);
if (!attr)
- goto out_put_item;
+ goto out_free_buffer;
if (type & CONFIGFS_ITEM_BIN_ATTR) {
buffer->bin_attr = to_bin_attr(dentry);
@@ -391,7 +391,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type
/* Grab the module reference for this attribute if we have one */
error = -ENODEV;
if (!try_module_get(buffer->owner))
- goto out_put_item;
+ goto out_free_buffer;
error = -EACCES;
if (!buffer->item->ci_type)
@@ -435,8 +435,6 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type
out_put_module:
module_put(buffer->owner);
-out_put_item:
- config_item_put(buffer->item);
out_free_buffer:
up_read(&frag->frag_sem);
kfree(buffer);
@@ -484,13 +482,13 @@ static int configfs_release_bin_file(struct inode *inode, struct file *file)
buffer->bin_buffer_size);
}
up_read(&frag->frag_sem);
- /* vfree on NULL is safe */
- vfree(buffer->bin_buffer);
- buffer->bin_buffer = NULL;
- buffer->bin_buffer_size = 0;
- buffer->needs_read_fill = 1;
}
+ vfree(buffer->bin_buffer);
+ buffer->bin_buffer = NULL;
+ buffer->bin_buffer_size = 0;
+ buffer->needs_read_fill = 1;
+
configfs_release(inode, file);
return 0;
}
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 3da3707c10e3..891328f09b3c 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -273,13 +273,8 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
oname->name);
return 0;
}
- if (hash) {
- digested_name.hash = hash;
- digested_name.minor_hash = minor_hash;
- } else {
- digested_name.hash = 0;
- digested_name.minor_hash = 0;
- }
+ digested_name.hash = hash;
+ digested_name.minor_hash = minor_hash;
memcpy(digested_name.digest,
FSCRYPT_FNAME_DIGEST(iname->name, iname->len),
FSCRYPT_FNAME_DIGEST_SIZE);
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index a5a40a76b8ed..82575cfbb04d 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -305,3 +305,47 @@ err_kfree:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(fscrypt_get_symlink);
+
+/**
+ * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks
+ * @path: the path for the encrypted symlink being queried
+ * @stat: the struct being filled with the symlink's attributes
+ *
+ * Override st_size of encrypted symlinks to be the length of the decrypted
+ * symlink target (or the no-key encoded symlink target, if the key is
+ * unavailable) rather than the length of the encrypted symlink target. This is
+ * necessary for st_size to match the symlink target that userspace actually
+ * sees. POSIX requires this, and some userspace programs depend on it.
+ *
+ * This requires reading the symlink target from disk if needed, setting up the
+ * inode's encryption key if possible, and then decrypting or encoding the
+ * symlink target. This makes lstat() more heavyweight than is normally the
+ * case. However, decrypted symlink targets will be cached in ->i_link, so
+ * usually the symlink won't have to be read and decrypted again later if/when
+ * it is actually followed, readlink() is called, or lstat() is called again.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *inode = d_inode(dentry);
+ const char *link;
+ DEFINE_DELAYED_CALL(done);
+
+ /*
+ * To get the symlink target that userspace will see (whether it's the
+ * decrypted target or the no-key encoded target), we can just get it in
+ * the same way the VFS does during path resolution and readlink().
+ */
+ link = READ_ONCE(inode->i_link);
+ if (!link) {
+ link = inode->i_op->get_link(dentry, inode, &done);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ }
+ stat->size = strlen(link);
+ do_delayed_call(&done);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr);
diff --git a/fs/dax.c b/fs/dax.c
index cc56313c6b3b..12953e892bb2 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -477,10 +477,11 @@ static void *grab_mapping_entry(struct xa_state *xas,
struct address_space *mapping, unsigned int order)
{
unsigned long index = xas->xa_index;
- bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+ bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
void *entry;
retry:
+ pmd_downgrade = false;
xas_lock_irq(xas);
entry = get_unlocked_entry(xas, order);
@@ -794,12 +795,12 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
- * Note because we provide range to follow_pte_pmd it will
- * call mmu_notifier_invalidate_range_start() on our behalf
- * before taking any lock.
+ * follow_invalidate_pte() will use the range to call
+ * mmu_notifier_invalidate_range_start() on our behalf before
+ * taking any lock.
*/
- if (follow_pte_pmd(vma->vm_mm, address, &range,
- &ptep, &pmdp, &ptl))
+ if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
+ &pmdp, &ptl))
continue;
/*
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 943637298f65..da87615ad69a 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -146,7 +146,7 @@ static int debugfs_locked_down(struct inode *inode,
struct file *filp,
const struct file_operations *real_fops)
{
- if ((inode->i_mode & 07777) == 0444 &&
+ if ((inode->i_mode & 07777 & ~0444) == 0 &&
!(filp->f_mode & FMODE_WRITE) &&
!real_fops->unlocked_ioctl &&
!real_fops->compat_ioctl &&
@@ -178,8 +178,10 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
if (!fops_get(real_fops)) {
#ifdef CONFIG_MODULES
if (real_fops->owner &&
- real_fops->owner->state == MODULE_STATE_GOING)
+ real_fops->owner->state == MODULE_STATE_GOING) {
+ r = -ENXIO;
goto out;
+ }
#endif
/* Huh? Module did not clean up after itself at exit? */
@@ -313,8 +315,10 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
if (!fops_get(real_fops)) {
#ifdef CONFIG_MODULES
if (real_fops->owner &&
- real_fops->owner->state == MODULE_STATE_GOING)
+ real_fops->owner->state == MODULE_STATE_GOING) {
+ r = -ENXIO;
goto out;
+ }
#endif
/* Huh? Module did not cleanup after itself at exit? */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7b975dbb2bb4..e595a29bf46e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -293,7 +293,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
{
struct dentry *dentry;
- if (IS_ERR(parent))
+ if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent))
return NULL;
if (!parent)
@@ -315,6 +315,9 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
struct dentry *dentry;
int error;
+ if (!debugfs_initialized())
+ return ERR_PTR(-ENOENT);
+
pr_debug("creating file '%s'\n", name);
if (IS_ERR(parent))
@@ -519,7 +522,7 @@ struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
{
struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
- if (de)
+ if (!IS_ERR(de))
d_inode(de)->i_size = file_size;
return de;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 42e5a766d33c..4f25015aa534 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -621,8 +621,8 @@ void devpts_pty_kill(struct dentry *dentry)
dentry->d_fsdata = NULL;
drop_nlink(dentry->d_inode);
- fsnotify_unlink(d_inode(dentry->d_parent), dentry);
d_drop(dentry);
+ fsnotify_unlink(d_inode(dentry->d_parent), dentry);
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9329ced91f1d..434cffcc0391 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -848,6 +848,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
struct buffer_head *map_bh)
{
int ret = 0;
+ int boundary = sdio->boundary; /* dio_send_cur_page may clear it */
if (dio->op == REQ_OP_WRITE) {
/*
@@ -886,10 +887,10 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
out:
/*
- * If sdio->boundary then we want to schedule the IO now to
+ * If boundary then we want to schedule the IO now to
* avoid metadata seeks.
*/
- if (sdio->boundary) {
+ if (boundary) {
ret = dio_send_cur_page(dio, sdio, map_bh);
if (sdio->bio)
dio_bio_submit(dio, sdio);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 3b1012a3c439..b7e288cf3f5f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -78,6 +78,9 @@ struct dlm_cluster {
unsigned int cl_new_rsb_count;
unsigned int cl_recover_callbacks;
char cl_cluster_name[DLM_LOCKSPACE_LEN];
+
+ struct dlm_spaces *sps;
+ struct dlm_comms *cms;
};
static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
@@ -354,6 +357,9 @@ static struct config_group *make_cluster(struct config_group *g,
if (!cl || !sps || !cms)
goto fail;
+ cl->sps = sps;
+ cl->cms = cms;
+
config_group_init_type_name(&cl->group, name, &cluster_type);
config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
@@ -403,6 +409,9 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
static void release_cluster(struct config_item *i)
{
struct dlm_cluster *cl = config_item_to_cluster(i);
+
+ kfree(cl->sps);
+ kfree(cl->cms);
kfree(cl);
}
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index d6bbccb0ed15..d5bd990bcab8 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -542,6 +542,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
if (bucket >= ls->ls_rsbtbl_size) {
kfree(ri);
+ ++*pos;
return NULL;
}
tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 18d81599522f..53500b555bfa 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3975,6 +3975,14 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
int from = ms->m_header.h_nodeid;
int error = 0;
+ /* currently mixing of user/kernel locks are not supported */
+ if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) {
+ log_error(lkb->lkb_resource->res_ls,
+ "got user dlm message for a kernel lock");
+ error = -EINVAL;
+ goto out;
+ }
+
switch (ms->m_type) {
case DLM_MSG_CONVERT:
case DLM_MSG_UNLOCK:
@@ -4003,6 +4011,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
error = -EINVAL;
}
+out:
if (error)
log_error(lkb->lkb_resource->res_ls,
"ignore invalid message %d from %d %x %x %x %d",
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3951d39b9b75..d9202ba665ce 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -607,7 +607,7 @@ static void close_connection(struct connection *con, bool and_other,
}
if (con->othercon && and_other) {
/* Will only re-enter once. */
- close_connection(con->othercon, false, true, true);
+ close_connection(con->othercon, false, tx, rx);
}
if (con->rx_page) {
__free_page(con->rx_page);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a064b408d841..f91db24bbf3b 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -311,10 +311,8 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
struct extent_crypt_result ecr;
int rc = 0;
- if (!crypt_stat || !crypt_stat->tfm
- || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
- return -EINVAL;
-
+ BUG_ON(!crypt_stat || !crypt_stat->tfm
+ || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
crypt_stat->key_size);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index b8a7ce379ffe..75fbdfb8aaef 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -492,6 +492,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out;
}
+ if (!dev_name) {
+ rc = -EINVAL;
+ err = "Device name cannot be null";
+ goto out;
+ }
+
rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid);
if (rc) {
err = "Error parsing options";
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index b1ee5654750d..52ab96154467 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -74,6 +74,9 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_VERSION_BIT 0
#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_ALL \
+ ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)
+
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 70fd3af7b8cb..0dbeaf68e1d6 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -44,6 +44,13 @@ static struct page *erofs_read_inode(struct inode *inode,
dic = page_address(page) + *ofs;
ifmt = le16_to_cpu(dic->i_format);
+ if (ifmt & ~EROFS_I_ALL) {
+ erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ ifmt, vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+
vi->datalayout = erofs_inode_datalayout(ifmt);
if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0e369494f2f2..22e059b4f745 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -124,8 +124,8 @@ static int erofs_read_superblock(struct super_block *sb)
blkszbits = dsb->blkszbits;
/* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
if (blkszbits != LOG_BLOCK_SIZE) {
- erofs_err(sb, "blksize %u isn't supported on this platform",
- 1 << blkszbits);
+ erofs_err(sb, "blkszbits %u isn't supported on this platform",
+ blkszbits);
goto out;
}
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 503bea20cde2..f78496776e76 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -48,8 +48,14 @@ static int init_inode_xattrs(struct inode *inode)
int ret = 0;
/* the most case is that xattrs of this inode are initialized. */
- if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
return 0;
+ }
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
@@ -137,6 +143,8 @@ static int init_inode_xattrs(struct inode *inode)
}
xattr_iter_end(&it, atomic_map);
+ /* paired with smp_mb() at the beginning of the function. */
+ smp_mb();
set_bit(EROFS_I_EA_INITED_BIT, &vi->flags);
out_unlock:
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index fad80c97d247..fdd18c250811 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -288,11 +288,10 @@ static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
/* callers must be with collection lock held */
static int z_erofs_attach_page(struct z_erofs_collector *clt,
- struct page *page,
- enum z_erofs_page_type type)
+ struct page *page, enum z_erofs_page_type type,
+ bool pvec_safereuse)
{
int ret;
- bool occupied;
/* give priority for inplaceio */
if (clt->mode >= COLLECT_PRIMARY &&
@@ -300,10 +299,9 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt,
z_erofs_try_inplace_io(clt, page))
return 0;
- ret = z_erofs_pagevec_enqueue(&clt->vector,
- page, type, &occupied);
+ ret = z_erofs_pagevec_enqueue(&clt->vector, page, type,
+ pvec_safereuse);
clt->cl->vcnt += (unsigned int)ret;
-
return ret ? 0 : -EAGAIN;
}
@@ -654,14 +652,15 @@ hitted:
tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
retry:
- err = z_erofs_attach_page(clt, page, page_type);
+ err = z_erofs_attach_page(clt, page, page_type,
+ clt->mode >= COLLECT_PRIMARY_FOLLOWED);
/* should allocate an additional staging page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
__stagingpage_alloc(pagepool, GFP_NOFS);
err = z_erofs_attach_page(clt, newpage,
- Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+ Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
if (!err)
goto retry;
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 6a26c293ae2d..fff574100721 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -36,8 +36,14 @@ static int fill_inode_lazy(struct inode *inode)
void *kaddr;
struct z_erofs_map_header *h;
- if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
return 0;
+ }
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
@@ -83,6 +89,8 @@ static int fill_inode_lazy(struct inode *inode)
vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
((h->h_clusterbits >> 5) & 7);
+ /* paired with smp_mb() at the beginning of the function */
+ smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
unmap_done:
kunmap_atomic(kaddr);
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
index 58556903aa94..6a20b2c3a24c 100644
--- a/fs/erofs/zpvec.h
+++ b/fs/erofs/zpvec.h
@@ -108,12 +108,17 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
struct page *page,
enum z_erofs_page_type type,
- bool *occupied)
+ bool pvec_safereuse)
{
- *occupied = false;
- if (!ctor->next && type)
- if (ctor->index + 1 == ctor->nr)
+ if (!ctor->next) {
+ /* some pages cannot be reused as pvec safely without I/O */
+ if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse)
+ type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED;
+
+ if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+ ctor->index + 1 == ctor->nr)
return false;
+ }
if (ctor->index >= ctor->nr)
z_erofs_pagevec_ctor_pagedown(ctor, false);
@@ -125,7 +130,6 @@ static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
/* should remind that collector->next never equal to 1, 2 */
if (type == (uintptr_t)ctor->next) {
ctor->next = page;
- *occupied = true;
}
ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
return true;
diff --git a/fs/exec.c b/fs/exec.c
index 1b4d2206d53a..098de820abcc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -988,7 +988,7 @@ int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
struct fd f = fdget(fd);
int ret = -EBADF;
- if (!f.file)
+ if (!f.file || !(f.file->f_mode & FMODE_READ))
goto out;
ret = kernel_read_file(f.file, buf, size, max_size, id);
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e0cc55164505..abac81a2e694 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -48,10 +48,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
struct ext2_sb_info *sbi = EXT2_SB(sb);
if (block_group >= sbi->s_groups_count) {
- ext2_error (sb, "ext2_get_group_desc",
- "block_group >= groups_count - "
- "block_group = %d, groups_count = %lu",
- block_group, sbi->s_groups_count);
+ WARN(1, "block_group >= groups_count - "
+ "block_group = %d, groups_count = %lu",
+ block_group, sbi->s_groups_count);
return NULL;
}
@@ -59,10 +58,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1);
if (!sbi->s_group_desc[group_desc]) {
- ext2_error (sb, "ext2_get_group_desc",
- "Group descriptor not loaded - "
- "block_group = %d, group_desc = %lu, desc = %lu",
- block_group, group_desc, offset);
+ WARN(1, "Group descriptor not loaded - "
+ "block_group = %d, group_desc = %lu, desc = %lu",
+ block_group, group_desc, offset);
return NULL;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 065cd2d1bdc6..db403c01d4d5 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -770,8 +770,12 @@ static loff_t ext2_max_size(int bits)
res += 1LL << (bits-2);
res += 1LL << (2*(bits-2));
res += 1LL << (3*(bits-2));
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
/* Does block tree limit file size? */
- if (res < upper_limit)
+ if (res + meta_blocks <= upper_limit)
goto check_lfs;
res = upper_limit;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 5aba67a504cf..031ff3f19018 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -612,27 +612,41 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
/**
* ext4_should_retry_alloc() - check if a block allocation should be retried
- * @sb: super block
- * @retries: number of attemps has been made
+ * @sb: superblock
+ * @retries: number of retry attempts made so far
*
- * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
- * it is profitable to retry the operation, this function will wait
- * for the current or committing transaction to complete, and then
- * return TRUE. We will only retry once.
+ * ext4_should_retry_alloc() is called when ENOSPC is returned while
+ * attempting to allocate blocks. If there's an indication that a pending
+ * journal transaction might free some space and allow another attempt to
+ * succeed, this function will wait for the current or committing transaction
+ * to complete and then return TRUE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
- (*retries)++ > 1 ||
- !EXT4_SB(sb)->s_journal)
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!sbi->s_journal)
return 0;
- smp_mb();
- if (EXT4_SB(sb)->s_mb_free_pending == 0)
+ if (++(*retries) > 3) {
+ percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
return 0;
+ }
+ /*
+ * if there's no indication that blocks are about to be freed it's
+ * possible we just missed a transaction commit that did so
+ */
+ smp_mb();
+ if (sbi->s_mb_free_pending == 0)
+ return ext4_has_free_clusters(sbi, 1, 0);
+
+ /*
+ * it's possible we've just missed a transaction commit here,
+ * so ignore the returned status
+ */
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
- jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+ (void) jbd2_journal_force_commit_nested(sbi->s_journal);
return 1;
}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 0589e914663f..e8275b5d2743 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -536,7 +536,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
struct dir_private_info *info = file->private_data;
struct inode *inode = file_inode(file);
struct fname *fname;
- int ret;
+ int ret = 0;
if (!info) {
info = ext4_htree_create_dir_info(file, ctx->pos);
@@ -584,7 +584,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
- return ret;
+ goto finished;
if (ret == 0) {
ctx->pos = ext4_get_htree_eof(file);
break;
@@ -615,7 +615,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
}
finished:
info->last_pos = ctx->pos;
- return 0;
+ return ret < 0 ? ret : 0;
}
static int ext4_dir_open(struct inode * inode, struct file * filp)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1c558b554788..e932e8482371 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -718,7 +718,7 @@ enum {
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
/* Max logical block we can support */
-#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF
+#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE
/*
* Structure of an inode on the disk
@@ -1420,6 +1420,7 @@ struct ext4_sb_info {
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
struct percpu_counter s_dirtyclusters_counter;
+ struct percpu_counter s_sra_exceeded_retry_limit;
struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
struct kobject s_kobj;
@@ -1965,6 +1966,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
+/*
+ * Base length of the ext4 directory entry excluding the name length
+ */
+#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)
struct ext4_dir_entry {
__le32 inode; /* Inode number */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3193f0b4a02d..f1bbce4350c4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -133,14 +133,25 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
+ int err = 0;
+
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, path->p_bh);
+ err = ext4_journal_get_write_access(handle, path->p_bh);
+
+ /*
+ * The extent buffer's verified bit will be set again in
+ * __ext4_ext_dirty(). We could leave an inconsistent
+ * buffer if the extents updating procudure break off du
+ * to some error happens, force to check it again.
+ */
+ if (!err)
+ clear_buffer_verified(path->p_bh);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
- return 0;
+ return err;
}
/*
@@ -160,6 +171,9 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
+ /* Extents updating done, re-set verified flag */
+ if (!err)
+ set_buffer_verified(path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -390,9 +404,13 @@ static int ext4_valid_extent_idx(struct inode *inode,
static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent_header *eh,
+ ext4_lblk_t lblk, ext4_fsblk_t *pblk,
int depth)
{
unsigned short entries;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+
if (eh->eh_entries == 0)
return 1;
@@ -403,32 +421,52 @@ static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
ext4_fsblk_t pblock = 0;
- ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
- int len = 0;
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext->ee_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- len = ext4_ext_get_actual_len(ext);
if ((lblock <= prev) && prev) {
pblock = ext4_ext_pblock(ext);
es->s_last_error_block = cpu_to_le64(pblock);
return 0;
}
+ prev = lblock + ext4_ext_get_actual_len(ext) - 1;
ext++;
entries--;
- prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the parent index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext_idx->ei_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
+
+ /* Check for overlapping index extents */
+ lblock = le32_to_cpu(ext_idx->ei_block);
+ if ((lblock <= prev) && prev) {
+ *pblk = ext4_idx_pblock(ext_idx);
+ return 0;
+ }
ext_idx++;
entries--;
+ prev = lblock;
}
}
return 1;
@@ -436,7 +474,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth, ext4_fsblk_t pblk)
+ int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
const char *error_msg;
int max = 0, err = -EFSCORRUPTED;
@@ -462,7 +500,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
- if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
@@ -491,7 +529,7 @@ corrupted:
}
#define ext4_ext_check(inode, eh, depth, pblk) \
- __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
int ext4_ext_check_inode(struct inode *inode)
{
@@ -524,12 +562,14 @@ static void ext4_cache_extents(struct inode *inode,
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
- struct inode *inode, ext4_fsblk_t pblk, int depth,
- int flags)
+ struct inode *inode, struct ext4_extent_idx *idx,
+ int depth, int flags)
{
struct buffer_head *bh;
int err;
+ ext4_fsblk_t pblk;
+ pblk = ext4_idx_pblock(idx);
bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
@@ -545,8 +585,8 @@ __read_extent_tree_block(const char *function, unsigned int line,
if (!ext4_has_feature_journal(inode->i_sb) ||
(inode->i_ino !=
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
+ err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
+ depth, pblk, le32_to_cpu(idx->ei_block));
if (err)
goto errout;
}
@@ -565,8 +605,8 @@ errout:
}
-#define read_extent_tree_block(inode, pblk, depth, flags) \
- __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
(depth), (flags))
/*
@@ -613,8 +653,7 @@ int ext4_ext_precache(struct inode *inode)
i--;
continue;
}
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx++),
+ bh = read_extent_tree_block(inode, path[i].p_idx++,
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
@@ -862,6 +901,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
+ eh->eh_generation = 0;
ext4_mark_inode_dirty(handle, inode);
return 0;
}
@@ -916,8 +956,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
- flags);
+ bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
@@ -1118,6 +1157,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
+ neh->eh_generation = 0;
/* move remainder of path[depth] to the new leaf */
if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -1195,6 +1235,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
+ neh->eh_generation = 0;
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
ext4_idx_store_pblock(fidx, oldblock);
@@ -1514,7 +1555,6 @@ static int ext4_ext_search_right(struct inode *inode,
struct ext4_extent_header *eh;
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
- ext4_fsblk_t block;
int depth; /* Note, NOT eh_depth; depth from top of tree */
int ee_len;
@@ -1581,20 +1621,17 @@ got_index:
* follow it and find the closest allocated
* block to the right */
ix++;
- block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, block,
- path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
- block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -3116,9 +3153,9 @@ again:
ext_debug("move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx), depth - i - 1,
- EXT4_EX_NOCACHE);
+ bh = read_extent_tree_block(inode, path[i].p_idx,
+ depth - i - 1,
+ EXT4_EX_NOCACHE);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -3378,7 +3415,10 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_mark_unwritten(ex2);
err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
- if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (err != -ENOSPC && err != -EDQUOT)
+ goto out;
+
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
if (split_flag & EXT4_EXT_DATA_VALID1) {
err = ext4_ext_zeroout(inode, ex2);
@@ -3404,30 +3444,30 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_pblock(&orig_ex));
}
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_len = cpu_to_le16(ee_len);
- ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- if (err)
- goto fix_extent_len;
-
- /* update extent status tree */
- err = ext4_zeroout_es(inode, &zero_ex);
-
- goto out;
- } else if (err)
- goto fix_extent_len;
-
-out:
- ext4_ext_show_leaf(inode, path);
- return err;
+ if (!err) {
+ /* update the extent length and mark as initialized */
+ ex->ee_len = cpu_to_le16(ee_len);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (!err)
+ /* update extent status tree */
+ err = ext4_zeroout_es(inode, &zero_ex);
+ /* If we failed at this point, we don't know in which
+ * state the extent tree exactly is so don't try to fix
+ * length of the original extent as it may do even more
+ * damage.
+ */
+ goto out;
+ }
+ }
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
ext4_ext_dirty(handle, inode, path + path->p_depth);
return err;
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
}
/*
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d996b44d2265..43fba01da6c3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1553,11 +1553,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
- if (!nr_to_scan)
- return ret;
-
nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
+ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
return nr_shrunk;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 64b6549dd901..83846cc81485 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -394,7 +394,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
*
* We always try to spread first-level directories.
*
- * If there are blockgroups with both free inodes and free blocks counts
+ * If there are blockgroups with both free inodes and free clusters counts
* not worse than average we return one with smallest directory count.
* Otherwise we simply return a random group.
*
@@ -403,7 +403,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
* It's OK to put directory into a group unless
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks) or
+ * it has too few free clusters left (min_clusters) or
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
@@ -419,7 +419,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei, grp_free;
- ext4_fsblk_t freeb, avefreec;
+ ext4_fsblk_t freec, avefreec;
unsigned int ndirs;
int max_dirs, min_inodes;
ext4_grpblk_t min_clusters;
@@ -438,9 +438,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
- freeb = EXT4_C2B(sbi,
- percpu_counter_read_positive(&sbi->s_freeclusters_counter));
- avefreec = freeb;
+ freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ avefreec = freec;
do_div(avefreec, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
@@ -1353,6 +1352,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
handle_t *handle;
ext4_fsblk_t blk;
int num, ret = 0, used_blks = 0;
+ unsigned long used_inos = 0;
/* This should not happen, but just to be sure check this */
if (sb_rdonly(sb)) {
@@ -1383,22 +1383,37 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
* used inodes so we need to skip blocks with used inodes in
* inode table.
*/
- if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
- used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp)),
- sbi->s_inodes_per_block);
-
- if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) ||
- ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp)) <
- EXT4_FIRST_INO(sb)))) {
- ext4_error(sb, "Something is wrong with group %u: "
- "used itable blocks: %d; "
- "itable unused count: %u",
- group, used_blks,
- ext4_itable_unused_count(sb, gdp));
- ret = 1;
- goto err_out;
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
+ used_inos = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block);
+
+ /* Bogus inode unused count? */
+ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "used itable blocks: %d; "
+ "itable unused count: %u",
+ group, used_blks,
+ ext4_itable_unused_count(sb, gdp));
+ ret = 1;
+ goto err_out;
+ }
+
+ used_inos += group * EXT4_INODES_PER_GROUP(sb);
+ /*
+ * Are there some uninitialized inodes in the inode table
+ * before the first normal inode?
+ */
+ if ((used_blks != sbi->s_itb_per_group) &&
+ (used_inos < EXT4_FIRST_INO(sb))) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "itable unused count: %u; "
+ "itables initialized count: %ld",
+ group, ext4_itable_unused_count(sb, gdp),
+ used_inos);
+ ret = 1;
+ goto err_out;
+ }
}
blk = ext4_inode_table(sb, gdp) + used_blks;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 519378a15bc6..8f665aa1d706 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -733,25 +733,26 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
void *kaddr;
struct ext4_iloc iloc;
- if (unlikely(copied < len)) {
- if (!PageUptodate(page)) {
- copied = 0;
- goto out;
- }
- }
+ if (unlikely(copied < len) && !PageUptodate(page))
+ return 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret) {
ext4_std_error(inode->i_sb, ret);
- copied = 0;
- goto out;
+ return ret;
}
ext4_write_lock_xattr(inode, &no_expand);
BUG_ON(!ext4_has_inline_data(inode));
+ /*
+ * ei->i_inline_off may have changed since ext4_write_begin()
+ * called ext4_try_to_write_inline_data()
+ */
+ (void) ext4_find_inline_data_nolock(inode);
+
kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
kunmap_atomic(kaddr);
SetPageUptodate(page);
/* clear page dirty so that writepages wouldn't work for us. */
@@ -760,7 +761,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
ext4_write_unlock_xattr(inode, &no_expand);
brelse(iloc.bh);
mark_inode_dirty(inode);
-out:
+
return copied;
}
@@ -1119,7 +1120,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc,
void *buf, int inline_size)
{
- ext4_create_inline_data(handle, inode, inline_size);
+ int ret;
+
+ ret = ext4_create_inline_data(handle, inode, inline_size);
+ if (ret) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)",
+ inode->i_ino, ret);
+ return;
+ }
ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3bac525f0439..00686fbe3c27 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1439,6 +1439,7 @@ static int ext4_write_end(struct file *file,
goto errout;
}
copied = ret;
+ ret = 0;
} else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
@@ -1465,13 +1466,14 @@ static int ext4_write_end(struct file *file,
if (i_size_changed || inline_data)
ext4_mark_inode_dirty(handle, inode);
+errout:
if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-errout:
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1554,6 +1556,7 @@ static int ext4_journalled_write_end(struct file *file,
goto errout;
}
copied = ret;
+ ret = 0;
} else if (unlikely(copied < len) && !PageUptodate(page)) {
copied = 0;
ext4_journalled_zero_new_buffers(handle, page, from, to);
@@ -1583,6 +1586,7 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
+errout:
if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
@@ -1590,7 +1594,6 @@ static int ext4_journalled_write_end(struct file *file,
*/
ext4_orphan_add(handle, inode);
-errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1782,6 +1785,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
bool allocated = false;
+ bool reserved = false;
/*
* If the cluster containing lblk is shared with a delayed,
@@ -1798,6 +1802,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
goto errout;
+ reserved = true;
} else { /* bigalloc */
if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
if (!ext4_es_scan_clu(inode,
@@ -1810,6 +1815,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
goto errout;
+ reserved = true;
} else {
allocated = true;
}
@@ -1820,6 +1826,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
}
ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+ if (ret && reserved)
+ ext4_da_release_space(inode, 1);
errout:
return ret;
@@ -2076,13 +2084,13 @@ static int __ext4_journalled_writepage(struct page *page,
if (!ret)
ret = err;
- if (!ext4_has_inline_data(inode))
- ext4_walk_page_buffers(NULL, page_bufs, 0, len,
- NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
unlock_page(page);
out_no_pagelock:
+ if (!inline_data && page_bufs)
+ ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+ NULL, bput_one);
brelse(inode_bh);
return ret;
}
@@ -2153,6 +2161,15 @@ static int ext4_writepage(struct page *page,
else
len = PAGE_SIZE;
+ /* Should never happen but for bugs in other kernel subsystems */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(inode,
+ "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ return 0;
+ }
+
page_bufs = page_buffers(page);
/*
* We cannot do block allocation or other extent handling in this
@@ -2702,6 +2719,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * Should never happen but for buggy code in
+ * other subsystems that call
+ * set_page_dirty() without properly warning
+ * the file system first. See [1] for more
+ * information.
+ *
+ * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+ */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ continue;
+ }
+
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -4278,7 +4311,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset;
+ loff_t first_block_offset, last_block_offset, max_length;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
unsigned int credits;
int ret = 0;
@@ -4324,6 +4358,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
offset;
}
+ /*
+ * For punch hole the length + offset needs to be within one block
+ * before last range. Adjust the length if it goes beyond that limit.
+ */
+ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+ if (offset + length > max_length)
+ length = max_length - offset;
+
if (offset & (sb->s_blocksize - 1) ||
(offset + length) & (sb->s_blocksize - 1)) {
/*
@@ -5209,7 +5251,7 @@ static int other_inode_match(struct inode * inode, unsigned long ino,
(inode->i_state & I_DIRTY_TIME)) {
struct ext4_inode_info *ei = EXT4_I(inode);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
@@ -5267,7 +5309,7 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
struct super_block *sb = inode->i_sb;
- int err = 0, rc, block;
+ int err = 0, block;
int need_datasync = 0, set_large_file = 0;
uid_t i_uid;
gid_t i_gid;
@@ -5379,9 +5421,9 @@ static int ext4_do_update_inode(handle_t *handle,
bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (!err)
- err = rc;
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out_brelse;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
if (set_large_file) {
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ba13fbb443d5..9fa20f9ba52b 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1120,8 +1120,6 @@ resizefs_out:
sizeof(range)))
return -EFAULT;
- range.minlen = max((unsigned int)range.minlen,
- q->limits.discard_granularity);
ret = ext4_trim_fs(sb, &range);
if (ret < 0)
return ret;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd69510f2947..0307702d114d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1542,10 +1542,11 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
/* Should never happen! (but apparently sometimes does?!?) */
WARN_ON(1);
- ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
- "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
- block, order, needed, ex->fe_group, ex->fe_start,
- ex->fe_len, ex->fe_logical);
+ ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
+ "corruption or bug in mb_find_extent "
+ "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
+ block, order, needed, ex->fe_group, ex->fe_start,
+ ex->fe_len, ex->fe_logical);
ex->fe_len = 0;
ex->fe_start = 0;
ex->fe_group = 0;
@@ -5269,6 +5270,7 @@ out:
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
@@ -5287,6 +5289,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start >= max_blks ||
range->len < sb->s_blocksize)
return -EINVAL;
+ /* No point to try to trim less than discard granularity */
+ if (range->minlen < q->limits.discard_granularity) {
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ q->limits.discard_granularity >> sb->s_blocksize_bits);
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
+ goto out;
+ }
if (end >= max_blks)
end = max_blks - 1;
if (end <= first_data_blk)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index be4ee3dcc5cf..c5b2ea1a9372 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -455,12 +455,12 @@ int ext4_ext_migrate(struct inode *inode)
percpu_down_write(&sbi->s_writepages_rwsem);
/*
- * Worst case we can touch the allocation bitmaps, a bgd
- * block, and a block to link in the orphan list. We do need
- * need to worry about credits for modifying the quota inode.
+ * Worst case we can touch the allocation bitmaps and a block
+ * group descriptor block. We do need need to worry about
+ * credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
- 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+ 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
@@ -477,6 +477,13 @@ int ext4_ext_migrate(struct inode *inode)
ext4_journal_stop(handle);
goto out_unlock;
}
+ /*
+ * Use the correct seed for checksum (i.e. the seed from 'inode'). This
+ * is so that the metadata blocks will have the correct checksum after
+ * the migration.
+ */
+ ei = EXT4_I(inode);
+ EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
* Set the i_nlink to zero so it will be deleted later
@@ -485,7 +492,6 @@ int ext4_ext_migrate(struct inode *inode)
clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
- ext4_orphan_add(handle, tmp_inode);
ext4_journal_stop(handle);
/*
@@ -510,17 +516,10 @@ int ext4_ext_migrate(struct inode *inode)
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
- /*
- * It is impossible to update on-disk structures without
- * a handle, so just rollback in-core changes and live other
- * work to orphan_list_cleanup()
- */
- ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
goto out_tmp_inode;
}
- ei = EXT4_I(inode);
i_data = ei->i_data;
memset(&lb, 0, sizeof(lb));
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f05ec9bfbf4f..f10307215d58 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1385,10 +1385,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
- while ((char *) de < dlimit) {
+ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit &&
+ if (de->name + de->name_len <= dlimit &&
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
@@ -2405,11 +2405,10 @@ again:
(frame - 1)->bh);
if (err)
goto journal_error;
- if (restart) {
- err = ext4_handle_dirty_dx_node(handle, dir,
- frame->bh);
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ frame->bh);
+ if (restart || err)
goto journal_error;
- }
} else {
struct dx_root *dxroot;
memcpy((char *) entries2, (char *) entries,
@@ -3548,6 +3547,31 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
return 0;
}
+static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
+ unsigned ino, unsigned file_type)
+{
+ struct ext4_renament old = *ent;
+ int retval = 0;
+
+ /*
+ * old->de could have moved from under us during make indexed dir,
+ * so the old->de may no longer valid and need to find it again
+ * before reset old inode info.
+ */
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+ if (IS_ERR(old.bh))
+ retval = PTR_ERR(old.bh);
+ if (!old.bh)
+ retval = -ENOENT;
+ if (retval) {
+ ext4_std_error(old.dir->i_sb, retval);
+ return;
+ }
+
+ ext4_setent(handle, &old, ino, file_type);
+ brelse(old.bh);
+}
+
static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
const struct qstr *d_name)
{
@@ -3707,14 +3731,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
retval = -ENOENT;
if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
- goto end_rename;
+ goto release_bh;
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
new.bh = NULL;
- goto end_rename;
+ goto release_bh;
}
if (new.bh) {
if (!new.inode) {
@@ -3731,15 +3755,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- handle = NULL;
- goto end_rename;
+ goto release_bh;
}
} else {
whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
if (IS_ERR(whiteout)) {
retval = PTR_ERR(whiteout);
- whiteout = NULL;
- goto end_rename;
+ goto release_bh;
}
}
@@ -3844,19 +3866,21 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
end_rename:
if (whiteout) {
if (retval) {
- ext4_setent(handle, &old,
- old.inode->i_ino, old_file_type);
+ ext4_resetent(handle, &old,
+ old.inode->i_ino, old_file_type);
drop_nlink(whiteout);
+ ext4_orphan_add(handle, whiteout);
}
unlock_new_inode(whiteout);
+ ext4_journal_stop(handle);
iput(whiteout);
-
+ } else {
+ ext4_journal_stop(handle);
}
+release_bh:
brelse(old.dir_bh);
brelse(old.bh);
brelse(new.bh);
- if (handle)
- ext4_journal_stop(handle);
return retval;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 2cc9f2168b9e..b66b335a0ca6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -100,8 +100,10 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_status)
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
buffer_io_error(bh);
+ }
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ad1d4c8faf44..b7f20196439a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -74,6 +74,11 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
}
+ if (ext4_has_feature_sparse_super2(sb)) {
+ ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2");
+ return -EOPNOTSUPP;
+ }
+
if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
&EXT4_SB(sb)->s_ext4_flags))
ret = -EBUSY;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 06568467b0c2..c13879bd2168 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1017,6 +1017,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -1140,6 +1141,12 @@ static void ext4_destroy_inode(struct inode *inode)
true);
dump_stack();
}
+
+ if (EXT4_I(inode)->i_reserved_data_blocks)
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
+ inode->i_ino, EXT4_I(inode),
+ EXT4_I(inode)->i_reserved_data_blocks);
}
static void init_once(void *foo)
@@ -2674,9 +2681,6 @@ static void ext4_orphan_cleanup(struct super_block *sb,
sb->s_flags &= ~SB_RDONLY;
}
#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sb->s_flags |= SB_ACTIVE;
-
/*
* Turn on quotas which were not enabled for read-only mounts if
* filesystem has quota feature, so that they are updated correctly.
@@ -2737,8 +2741,15 @@ static void ext4_orphan_cleanup(struct super_block *sb,
inode_lock(inode);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ret = ext4_truncate(inode);
- if (ret)
+ if (ret) {
+ /*
+ * We need to clean up the in-core orphan list
+ * manually if ext4_truncate() failed to get a
+ * transaction handle.
+ */
+ ext4_orphan_del(NULL, inode);
ext4_std_error(inode->i_sb, ret);
+ }
inode_unlock(inode);
nr_truncates++;
} else {
@@ -2825,17 +2836,17 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
*/
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
- loff_t res = EXT4_NDIR_BLOCKS;
+ unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
- /* This is calculated to be the largest file size for a dense, block
+
+ /*
+ * This is calculated to be the largest file size for a dense, block
* mapped file such that the file's total number of 512-byte sectors,
* including data and all indirect blocks, does not exceed (2^48 - 1).
*
* __u32 i_blocks_lo and _u16 i_blocks_high represent the total
* number of 512-byte sectors of the file.
*/
-
if (!has_huge_files) {
/*
* !has_huge_files or implies that the inode i_block field
@@ -2878,7 +2889,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
- return res;
+ return (loff_t)res;
}
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
@@ -3060,8 +3071,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
struct ext4_group_desc *gdp = NULL;
ext4_group_t group, ngroups;
struct super_block *sb;
- unsigned long timeout = 0;
int ret = 0;
+ u64 start_time;
sb = elr->lr_super;
ngroups = EXT4_SB(sb)->s_groups_count;
@@ -3081,13 +3092,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- elr->lr_sbi->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ elr->lr_sbi->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
@@ -3475,9 +3485,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_fsblk_t first_block, last_block, b;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
+ int has_super = ext4_bg_has_super(sb, grp);
if (!ext4_has_feature_bigalloc(sb))
- return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ return (has_super + ext4_bg_num_gdb(sb, grp) +
+ (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
sbi->s_itb_per_group + 2);
first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
@@ -4502,9 +4514,18 @@ no_journal:
* Get the # of file system overhead blocks from the
* superblock if present.
*/
- if (es->s_overhead_clusters)
- sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
- else {
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ /* ignore the precalculated value if it is ridiculous */
+ if (sbi->s_overhead > ext4_blocks_count(es))
+ sbi->s_overhead = 0;
+ /*
+ * If the bigalloc feature is not enabled recalculating the
+ * overhead doesn't take long, so we might as well just redo
+ * it to make sure we are using the correct value.
+ */
+ if (!ext4_has_feature_bigalloc(sb))
+ sbi->s_overhead = 0;
+ if (sbi->s_overhead == 0) {
err = ext4_calculate_overhead(sb);
if (err)
goto failed_mount_wq;
@@ -4598,6 +4619,9 @@ no_journal:
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
GFP_KERNEL);
if (!err)
+ err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
+ GFP_KERNEL);
+ if (!err)
err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
if (err) {
@@ -4610,6 +4634,7 @@ no_journal:
ext4_msg(sb, KERN_ERR,
"unable to initialize "
"flex_bg meta info!");
+ ret = -ENOMEM;
goto failed_mount6;
}
@@ -4699,6 +4724,7 @@ failed_mount6:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
percpu_free_rwsem(&sbi->s_writepages_rwsem);
failed_mount5:
ext4_ext_release(sb);
@@ -5055,8 +5081,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
int error = 0;
- if (!sbh || block_device_ejected(sb))
- return error;
+ if (!sbh)
+ return -EINVAL;
+ if (block_device_ejected(sb))
+ return -ENODEV;
/*
* If the file system is mounted read-only, don't update the
@@ -5895,10 +5923,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
err = dquot_quota_on(sb, type, format_id, path);
- if (err) {
- lockdep_set_quota_inode(path->dentry->d_inode,
- I_DATA_SEM_NORMAL);
- } else {
+ if (!err) {
struct inode *inode = d_inode(path->dentry);
handle_t *handle;
@@ -5918,7 +5943,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
ext4_journal_stop(handle);
unlock_inode:
inode_unlock(inode);
+ if (err)
+ dquot_quota_off(sb, type);
}
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
return err;
}
@@ -5981,8 +6011,19 @@ static int ext4_enable_quotas(struct super_block *sb)
"Failed to enable quota tracking "
"(type=%d, err=%d). Please run "
"e2fsck to fix.", type, err);
- for (type--; type >= 0; type--)
+ for (type--; type >= 0; type--) {
+ struct inode *inode;
+
+ inode = sb_dqopt(sb)->files[type];
+ if (inode)
+ inode = igrab(inode);
dquot_quota_off(sb, type);
+ if (inode) {
+ lockdep_set_quota_inode(inode,
+ I_DATA_SEM_NORMAL);
+ iput(inode);
+ }
+ }
return err;
}
@@ -6084,7 +6125,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
- if (EXT4_SB(sb)->s_journal && !handle) {
+ if (!handle) {
ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
" cancelled because transaction is not started",
(unsigned long long)off, (unsigned long long)len);
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index dd05af983092..a9457fed351e 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -52,10 +52,19 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
return paddr;
}
+static int ext4_encrypted_symlink_getattr(const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ ext4_getattr(path, stat, request_mask, query_flags);
+
+ return fscrypt_symlink_getattr(path, stat);
+}
+
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
- .getattr = ext4_getattr,
+ .getattr = ext4_encrypted_symlink_getattr,
.listxattr = ext4_listxattr,
};
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index eb1efad0e20a..9394360ff137 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -23,6 +23,7 @@ typedef enum {
attr_session_write_kbytes,
attr_lifetime_write_kbytes,
attr_reserved_clusters,
+ attr_sra_exceeded_retry_limit,
attr_inode_readahead,
attr_trigger_test_error,
attr_first_error_time,
@@ -176,6 +177,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
EXT4_ATTR_FUNC(session_write_kbytes, 0444);
EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
ext4_sb_info, s_inode_readahead_blks);
@@ -207,6 +209,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(reserved_clusters),
+ ATTR_LIST(sra_exceeded_retry_limit),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@@ -308,6 +311,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)
atomic64_read(&sbi->s_resv_clusters));
+ case attr_sra_exceeded_retry_limit:
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)
+ percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
case attr_inode_readahead:
case attr_pointer_ui:
if (!ptr)
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index d0d8a9795dd6..6a30e54c1128 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -198,55 +198,76 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
struct inode *inode = file_inode(filp);
const int credits = 2; /* superblock and inode for ext4_orphan_del() */
handle_t *handle;
+ struct ext4_iloc iloc;
int err = 0;
- int err2;
- if (desc != NULL) {
- /* Succeeded; write the verity descriptor. */
- err = ext4_write_verity_descriptor(inode, desc, desc_size,
- merkle_tree_size);
-
- /* Write all pages before clearing VERITY_IN_PROGRESS. */
- if (!err)
- err = filemap_write_and_wait(inode->i_mapping);
- }
+ /*
+ * If an error already occurred (which fs/verity/ signals by passing
+ * desc == NULL), then only clean-up is needed.
+ */
+ if (desc == NULL)
+ goto cleanup;
- /* If we failed, truncate anything we wrote past i_size. */
- if (desc == NULL || err)
- ext4_truncate(inode);
+ /* Append the verity descriptor. */
+ err = ext4_write_verity_descriptor(inode, desc, desc_size,
+ merkle_tree_size);
+ if (err)
+ goto cleanup;
/*
- * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
- * deleting the inode from the orphan list, even if something failed.
- * If everything succeeded, we'll also set the verity bit in the same
- * transaction.
+ * Write all pages (both data and verity metadata). Note that this must
+ * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages
+ * beyond i_size won't be written properly. For crash consistency, this
+ * also must happen before the verity inode flag gets persisted.
*/
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto cleanup;
- ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ /*
+ * Finally, set the verity inode flag and remove the inode from the
+ * orphan list (in a single transaction).
+ */
handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto cleanup;
}
- err2 = ext4_orphan_del(handle, inode);
- if (err2)
- goto out_stop;
+ err = ext4_orphan_del(handle, inode);
+ if (err)
+ goto stop_and_cleanup;
- if (desc != NULL && !err) {
- struct ext4_iloc iloc;
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto stop_and_cleanup;
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto out_stop;
- ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
- ext4_set_inode_flags(inode);
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
- }
-out_stop:
+ ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+ ext4_set_inode_flags(inode);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (err)
+ goto stop_and_cleanup;
+
+ ext4_journal_stop(handle);
+
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ return 0;
+
+stop_and_cleanup:
ext4_journal_stop(handle);
- return err ?: err2;
+cleanup:
+ /*
+ * Verity failed to be enabled, so clean up by truncating any verity
+ * metadata that was written beyond i_size (both from cache and from
+ * disk), removing the inode from the orphan list (if it wasn't done
+ * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS.
+ */
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+ ext4_orphan_del(NULL, inode);
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+ return err;
}
static int ext4_get_verity_descriptor_location(struct inode *inode,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 894a61010ae9..20e40cac819e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1476,6 +1476,9 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
if (!ce)
return NULL;
+ WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
+ !(current->flags & PF_MEMALLOC_NOFS));
+
ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
@@ -2342,6 +2345,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
error = -ENOSPC;
goto cleanup;
}
+ WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
}
error = ext4_reserve_inode_write(handle, inode, &is.iloc);
@@ -2415,7 +2419,7 @@ retry_inode:
* external inode if possible.
*/
if (ext4_has_feature_ea_inode(inode->i_sb) &&
- !i.in_inode) {
+ i.value_len && !i.in_inode) {
i.in_inode = 1;
goto retry_inode;
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index a57219c51c01..54f0d2c4c7d8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -583,7 +583,7 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi)
if (time_to_inject(sbi, FAULT_ORPHAN)) {
spin_unlock(&im->ino_lock);
- f2fs_show_injection_info(FAULT_ORPHAN);
+ f2fs_show_injection_info(sbi, FAULT_ORPHAN);
return -ENOSPC;
}
@@ -848,6 +848,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
struct f2fs_checkpoint *cp_block = NULL;
unsigned long long cur_version = 0, pre_version = 0;
+ unsigned int cp_blocks;
int err;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
@@ -855,15 +856,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (err)
return NULL;
- if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
- sbi->blocks_per_seg) {
+ cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
+
+ if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
}
pre_version = *version;
- cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ cp_addr += cp_blocks - 1;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
&cp_page_2, version);
if (err)
@@ -1144,7 +1146,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
if (!is_journalled_quota(sbi))
return false;
- down_write(&sbi->quota_sem);
+ if (!down_write_trylock(&sbi->quota_sem))
+ return true;
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
ret = false;
} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 68be334afc28..773028921c48 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -167,9 +167,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio)
static void f2fs_read_end_io(struct bio *bio)
{
- if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)),
- FAULT_READ_IO)) {
- f2fs_show_injection_info(FAULT_READ_IO);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
+
+ if (time_to_inject(sbi, FAULT_READ_IO)) {
+ f2fs_show_injection_info(sbi, FAULT_READ_IO);
bio->bi_status = BLK_STS_IOERR;
}
@@ -191,7 +192,7 @@ static void f2fs_write_end_io(struct bio *bio)
struct bvec_iter_all iter_all;
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
- f2fs_show_injection_info(FAULT_WRITE_IO);
+ f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
}
@@ -318,7 +319,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
if (test_opt(sbi, LFS) && current->plug)
blk_finish_plug(current->plug);
- if (F2FS_IO_ALIGNED(sbi))
+ if (!F2FS_IO_ALIGNED(sbi))
goto submit_io;
start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
@@ -1190,7 +1191,21 @@ next_dnode:
if (err) {
if (flag == F2FS_GET_BLOCK_BMAP)
map->m_pblk = 0;
+
if (err == -ENOENT) {
+ /*
+ * There is one exceptional case that read_node_page()
+ * may return -ENOENT due to filesystem has been
+ * shutdown or cp_error, so force to convert error
+ * number to EIO for such case.
+ */
+ if (map->m_may_create &&
+ (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
+ f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto unlock_out;
+ }
+
err = 0;
if (map->m_next_pgofs)
*map->m_next_pgofs =
@@ -2452,8 +2467,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
/* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[DATA]);
- else if (atomic_read(&sbi->wb_sync_req[DATA]))
+ else if (atomic_read(&sbi->wb_sync_req[DATA])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
if (__should_serialize_io(inode, wbc)) {
mutex_lock(&sbi->writepages);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 78d041f9775a..99c4a868d73b 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -618,7 +618,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
start:
if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) {
- f2fs_show_injection_info(FAULT_DIR_DEPTH);
+ f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH);
return -ENOSPC;
}
@@ -892,6 +892,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
struct blk_plug plug;
bool readdir_ra = sbi->readdir_ra == 1;
+ bool found_valid_dirent = false;
int err = 0;
bit_pos = ((unsigned long)ctx->pos % d->max);
@@ -906,12 +907,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
de = &d->dentry[bit_pos];
if (de->name_len == 0) {
+ if (found_valid_dirent || !bit_pos) {
+ printk_ratelimited(
+ "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
+ KERN_WARNING, sbi->sb->s_id,
+ le32_to_cpu(de->ino));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
bit_pos++;
ctx->pos = start_pos + bit_pos;
- printk_ratelimited(
- "%s, invalid namelen(0), ino:%u, run fsck to fix.",
- KERN_WARNING, le32_to_cpu(de->ino));
- set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
}
@@ -954,6 +958,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
ctx->pos = start_pos + bit_pos;
+ found_valid_dirent = true;
}
out:
if (readdir_ra)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4ca3c2a0a0f5..5645502c156d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -931,6 +931,7 @@ struct f2fs_sm_info {
unsigned int segment_count; /* total # of segments */
unsigned int main_segments; /* # of segments in main area */
unsigned int reserved_segments; /* # of reserved segments */
+ unsigned int additional_reserved_segments;/* reserved segs for IO align feature */
unsigned int ovp_segments; /* # of overprovision segments */
/* a threshold to reclaim prefree segments */
@@ -1374,9 +1375,10 @@ struct f2fs_private_dio {
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
-#define f2fs_show_injection_info(type) \
- printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \
- KERN_INFO, f2fs_fault_name[type], \
+#define f2fs_show_injection_info(sbi, type) \
+ printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \
+ KERN_INFO, sbi->sb->s_id, \
+ f2fs_fault_name[type], \
__func__, __builtin_return_address(0))
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
@@ -1396,7 +1398,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
return false;
}
#else
-#define f2fs_show_injection_info(type) do { } while (0)
+#define f2fs_show_injection_info(sbi, type) do { } while (0)
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
return false;
@@ -1781,7 +1783,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
return ret;
if (time_to_inject(sbi, FAULT_BLOCK)) {
- f2fs_show_injection_info(FAULT_BLOCK);
+ f2fs_show_injection_info(sbi, FAULT_BLOCK);
release = *count;
goto release_quota;
}
@@ -1799,6 +1801,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
if (!__allow_reserved_blocks(sbi, inode, true))
avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
+
+ if (F2FS_IO_ALIGNED(sbi))
+ avail_user_block_count -= sbi->blocks_per_seg *
+ SM_I(sbi)->additional_reserved_segments;
+
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
if (avail_user_block_count > sbi->unusable_block_count)
avail_user_block_count -= sbi->unusable_block_count;
@@ -2033,7 +2040,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
}
if (time_to_inject(sbi, FAULT_BLOCK)) {
- f2fs_show_injection_info(FAULT_BLOCK);
+ f2fs_show_injection_info(sbi, FAULT_BLOCK);
goto enospc;
}
@@ -2044,6 +2051,11 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
if (!__allow_reserved_blocks(sbi, inode, false))
valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+
+ if (F2FS_IO_ALIGNED(sbi))
+ valid_block_count += sbi->blocks_per_seg *
+ SM_I(sbi)->additional_reserved_segments;
+
user_block_count = sbi->user_block_count;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
user_block_count -= sbi->unusable_block_count;
@@ -2148,7 +2160,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
return page;
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) {
- f2fs_show_injection_info(FAULT_PAGE_ALLOC);
+ f2fs_show_injection_info(F2FS_M_SB(mapping),
+ FAULT_PAGE_ALLOC);
return NULL;
}
}
@@ -2163,7 +2176,7 @@ static inline struct page *f2fs_pagecache_get_page(
int fgp_flags, gfp_t gfp_mask)
{
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) {
- f2fs_show_injection_info(FAULT_PAGE_GET);
+ f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET);
return NULL;
}
@@ -2232,7 +2245,7 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi,
return bio;
}
if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
- f2fs_show_injection_info(FAULT_ALLOC_BIO);
+ f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
return NULL;
}
@@ -2797,7 +2810,7 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
if (time_to_inject(sbi, FAULT_KMALLOC)) {
- f2fs_show_injection_info(FAULT_KMALLOC);
+ f2fs_show_injection_info(sbi, FAULT_KMALLOC);
return NULL;
}
@@ -2814,7 +2827,7 @@ static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
if (time_to_inject(sbi, FAULT_KVMALLOC)) {
- f2fs_show_injection_info(FAULT_KVMALLOC);
+ f2fs_show_injection_info(sbi, FAULT_KVMALLOC);
return NULL;
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5d94abe467a4..516007bb1ced 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -682,10 +682,14 @@ int f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
- f2fs_show_injection_info(FAULT_TRUNCATE);
+ f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE);
return -EIO;
}
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
/* we should check inline_data size */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
@@ -761,7 +765,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ if (!in_group_p(inode->i_gid) &&
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID))
mode &= ~S_ISGID;
set_acl_inode(inode, mode);
}
@@ -976,7 +981,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
if (pg_start < pg_end) {
- struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -988,8 +992,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
down_write(&F2FS_I(inode)->i_mmap_sem);
- truncate_inode_pages_range(mapping, blk_start,
- blk_end - 1);
+ truncate_pagecache_range(inode, blk_start, blk_end - 1);
f2fs_lock_op(sbi);
ret = f2fs_truncate_hole(inode, pg_start, pg_end);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index a78aa5480454..68d5c73c5ed1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -54,7 +54,7 @@ static int gc_thread_func(void *data)
}
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
- f2fs_show_injection_info(FAULT_CHECKPOINT);
+ f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
}
@@ -633,6 +633,11 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
+ if (f2fs_check_nid_range(sbi, dni->ino)) {
+ f2fs_put_page(node_page, 1);
+ return false;
+ }
+
*nofs = ofs_of_node(node_page);
source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
@@ -1095,8 +1100,10 @@ next_step:
int err;
if (S_ISREG(inode->i_mode)) {
- if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
+ if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
+ sbi->skipped_gc_rwsem++;
continue;
+ }
if (!down_write_trylock(
&fi->i_gc_rwsem[WRITE])) {
sbi->skipped_gc_rwsem++;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 183388393c6a..c6bd669f4b4e 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -189,6 +189,10 @@ int f2fs_convert_inline_inode(struct inode *inode)
if (!f2fs_has_inline_data(inode))
return 0;
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
if (!page)
return -ENOMEM;
@@ -212,7 +216,8 @@ out:
f2fs_put_page(page, 1);
- f2fs_balance_fs(sbi, dn.node_changed);
+ if (!err)
+ f2fs_balance_fs(sbi, dn.node_changed);
return err;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 386ad54c13c3..264c19e17779 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -455,7 +455,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
} else if (S_ISLNK(inode->i_mode)) {
if (file_is_encrypt(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
@@ -681,7 +681,7 @@ retry:
err = f2fs_truncate(inode);
if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
- f2fs_show_injection_info(FAULT_EVICT_INODE);
+ f2fs_show_injection_info(sbi, FAULT_EVICT_INODE);
err = -EIO;
}
@@ -777,6 +777,7 @@ void f2fs_handle_failed_inode(struct inode *inode)
err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_inode_flag(inode, FI_FREE_NID);
f2fs_warn(sbi, "May loss orphan inode, run fsck to fix.");
goto out;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 5d9584281935..ed95c27e9302 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -679,7 +679,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
@@ -797,7 +797,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
if (whiteout) {
f2fs_i_links_write(inode, false);
+
+ spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
+ spin_unlock(&inode->i_lock);
+
*whiteout = inode;
} else {
d_tmpfile(dentry, inode);
@@ -996,7 +1000,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
err = f2fs_add_link(old_dentry, whiteout);
if (err)
goto put_out_dir;
+
+ spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
+ spin_unlock(&whiteout->i_lock);
+
iput(whiteout);
}
@@ -1248,9 +1256,18 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
return target;
}
+static int f2fs_encrypted_symlink_getattr(const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ f2fs_getattr(path, stat, request_mask, query_flags);
+
+ return fscrypt_symlink_getattr(path, stat);
+}
+
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.get_link = f2fs_encrypted_get_link,
- .getattr = f2fs_getattr,
+ .getattr = f2fs_encrypted_symlink_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
.listxattr = f2fs_listxattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 7ce33698ae38..3dc7cc3d6ac6 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1385,6 +1385,7 @@ page_hit:
nid, nid_of_node(page), ino_of_node(page),
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
err = -EINVAL;
out_err:
ClearPageUptodate(page);
@@ -1994,8 +1995,12 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[NODE]);
- else if (atomic_read(&sbi->wb_sync_req[NODE]))
+ else if (atomic_read(&sbi->wb_sync_req[NODE])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -2406,7 +2411,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct free_nid *i = NULL;
retry:
if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
- f2fs_show_injection_info(FAULT_ALLOC_NID);
+ f2fs_show_injection_info(sbi, FAULT_ALLOC_NID);
return false;
}
@@ -2718,6 +2723,9 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
struct f2fs_nat_entry raw_ne;
nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
+ if (f2fs_check_nid_range(sbi, nid))
+ continue;
+
raw_ne = nat_in_journal(journal, i);
ne = __lookup_nat_cache(nm_i, nid);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 5ba677f85533..78c54bb7898d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -489,7 +489,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
- f2fs_show_injection_info(FAULT_CHECKPOINT);
+ f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
}
@@ -1017,8 +1017,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
if (dc->error)
printk_ratelimited(
- "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d",
- KERN_INFO, dc->lstart, dc->start, dc->len, dc->error);
+ "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d",
+ KERN_INFO, sbi->sb->s_id,
+ dc->lstart, dc->start, dc->len, dc->error);
__detach_discard_cmd(dcc, dc);
}
@@ -1158,7 +1159,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
dc->len += len;
if (time_to_inject(sbi, FAULT_DISCARD)) {
- f2fs_show_injection_info(FAULT_DISCARD);
+ f2fs_show_injection_info(sbi, FAULT_DISCARD);
err = -EIO;
goto submit;
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 325781a1ae4d..15b343f65609 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -88,11 +88,11 @@
#define BLKS_PER_SEC(sbi) \
((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
#define GET_SEC_FROM_SEG(sbi, segno) \
- ((segno) / (sbi)->segs_per_sec)
+ (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec)
#define GET_SEG_FROM_SEC(sbi, secno) \
((secno) * (sbi)->segs_per_sec)
#define GET_ZONE_FROM_SEC(sbi, secno) \
- ((secno) / (sbi)->secs_per_zone)
+ (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone)
#define GET_ZONE_FROM_SEG(sbi, segno) \
GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
@@ -508,7 +508,8 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
static inline int reserved_segments(struct f2fs_sb_info *sbi)
{
- return SM_I(sbi)->reserved_segments;
+ return SM_I(sbi)->reserved_segments +
+ SM_I(sbi)->additional_reserved_segments;
}
static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a9a083232bcf..6bd8a944902e 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -277,6 +277,46 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).s_resgid));
}
+static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi)
+{
+ unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec;
+ unsigned int avg_vblocks;
+ unsigned int wanted_reserved_segments;
+ block_t avail_user_block_count;
+
+ if (!F2FS_IO_ALIGNED(sbi))
+ return 0;
+
+ /* average valid block count in section in worst case */
+ avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi);
+
+ /*
+ * we need enough free space when migrating one section in worst case
+ */
+ wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) *
+ reserved_segments(sbi);
+ wanted_reserved_segments -= reserved_segments(sbi);
+
+ avail_user_block_count = sbi->user_block_count -
+ sbi->current_reserved_blocks -
+ F2FS_OPTION(sbi).root_reserved_blocks;
+
+ if (wanted_reserved_segments * sbi->blocks_per_seg >
+ avail_user_block_count) {
+ f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u",
+ wanted_reserved_segments,
+ avail_user_block_count >> sbi->log_blocks_per_seg);
+ return -ENOSPC;
+ }
+
+ SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments;
+
+ f2fs_info(sbi, "IO align feature needs additional reserved segment: %u",
+ wanted_reserved_segments);
+
+ return 0;
+}
+
static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi)
{
if (!F2FS_OPTION(sbi).unusable_cap_perc)
@@ -1994,64 +2034,76 @@ static int f2fs_enable_quotas(struct super_block *sb)
return 0;
}
-int f2fs_quota_sync(struct super_block *sb, int type)
+static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
- struct quota_info *dqopt = sb_dqopt(sb);
- int cnt;
- int ret;
+ struct quota_info *dqopt = sb_dqopt(sbi->sb);
+ struct address_space *mapping = dqopt->files[type]->i_mapping;
+ int ret = 0;
- /*
- * do_quotactl
- * f2fs_quota_sync
- * down_read(quota_sem)
- * dquot_writeback_dquots()
- * f2fs_dquot_commit
- * block_operation
- * down_read(quota_sem)
- */
- f2fs_lock_op(sbi);
+ ret = dquot_writeback_dquots(sbi->sb, type);
+ if (ret)
+ goto out;
- down_read(&sbi->quota_sem);
- ret = dquot_writeback_dquots(sb, type);
+ ret = filemap_fdatawrite(mapping);
if (ret)
goto out;
+ /* if we are using journalled quota */
+ if (is_journalled_quota(sbi))
+ goto out;
+
+ ret = filemap_fdatawait(mapping);
+
+ truncate_inode_pages(&dqopt->files[type]->i_data, 0);
+out:
+ if (ret)
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ return ret;
+}
+
+int f2fs_quota_sync(struct super_block *sb, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct quota_info *dqopt = sb_dqopt(sb);
+ int cnt;
+ int ret = 0;
+
/*
* Now when everything is written we can discard the pagecache so
* that userspace sees the changes.
*/
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- struct address_space *mapping;
if (type != -1 && cnt != type)
continue;
+
if (!sb_has_quota_active(sb, cnt))
continue;
- mapping = dqopt->files[cnt]->i_mapping;
+ inode_lock(dqopt->files[cnt]);
- ret = filemap_fdatawrite(mapping);
- if (ret)
- goto out;
+ /*
+ * do_quotactl
+ * f2fs_quota_sync
+ * down_read(quota_sem)
+ * dquot_writeback_dquots()
+ * f2fs_dquot_commit
+ * block_operation
+ * down_read(quota_sem)
+ */
+ f2fs_lock_op(sbi);
+ down_read(&sbi->quota_sem);
- /* if we are using journalled quota */
- if (is_journalled_quota(sbi))
- continue;
+ ret = f2fs_quota_sync_file(sbi, cnt);
- ret = filemap_fdatawait(mapping);
- if (ret)
- set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+ up_read(&sbi->quota_sem);
+ f2fs_unlock_op(sbi);
- inode_lock(dqopt->files[cnt]);
- truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
inode_unlock(dqopt->files[cnt]);
+
+ if (ret)
+ break;
}
-out:
- if (ret)
- set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
- f2fs_unlock_op(sbi);
return ret;
}
@@ -3438,6 +3490,10 @@ try_onemore:
goto free_nm;
}
+ err = adjust_reserved_segment(sbi);
+ if (err)
+ goto free_nm;
+
/* For write statistics */
if (sb->s_bdev->bd_part)
sbi->sectors_written_start =
@@ -3804,4 +3860,5 @@ module_exit(exit_f2fs_fs)
MODULE_AUTHOR("Samsung Electronics's Praesto Team");
MODULE_DESCRIPTION("Flash Friendly File System");
MODULE_LICENSE("GPL");
+MODULE_SOFTDEP("pre: crc32");
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 029e693e201c..84bb37665093 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -262,7 +262,9 @@ out:
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
if (t > (unsigned long)(sbi->user_block_count -
- F2FS_OPTION(sbi).root_reserved_blocks)) {
+ F2FS_OPTION(sbi).root_reserved_blocks -
+ sbi->blocks_per_seg *
+ SM_I(sbi)->additional_reserved_segments)) {
spin_unlock(&sbi->stat_lock);
return -EINVAL;
}
@@ -499,7 +501,9 @@ F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
+#ifdef CONFIG_UNICODE
F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
+#endif
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -568,7 +572,9 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTR_LIST(verity),
#endif
ATTR_LIST(sb_checksum),
+#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index a401ef72bc82..7944a08a3977 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -150,40 +150,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc,
size_t desc_size, u64 merkle_tree_size)
{
struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size;
struct fsverity_descriptor_location dloc = {
.version = cpu_to_le32(1),
.size = cpu_to_le32(desc_size),
.pos = cpu_to_le64(desc_pos),
};
- int err = 0;
+ int err = 0, err2 = 0;
- if (desc != NULL) {
- /* Succeeded; write the verity descriptor. */
- err = pagecache_write(inode, desc, desc_size, desc_pos);
+ /*
+ * If an error already occurred (which fs/verity/ signals by passing
+ * desc == NULL), then only clean-up is needed.
+ */
+ if (desc == NULL)
+ goto cleanup;
- /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */
- if (!err)
- err = filemap_write_and_wait(inode->i_mapping);
- }
+ /* Append the verity descriptor. */
+ err = pagecache_write(inode, desc, desc_size, desc_pos);
+ if (err)
+ goto cleanup;
+
+ /*
+ * Write all pages (both data and verity metadata). Note that this must
+ * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond
+ * i_size won't be written properly. For crash consistency, this also
+ * must happen before the verity inode flag gets persisted.
+ */
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto cleanup;
+
+ /* Set the verity xattr. */
+ err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
+ F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
+ NULL, XATTR_CREATE);
+ if (err)
+ goto cleanup;
- /* If we failed, truncate anything we wrote past i_size. */
- if (desc == NULL || err)
- f2fs_truncate(inode);
+ /* Finally, set the verity inode flag. */
+ file_set_verity(inode);
+ f2fs_set_inode_flags(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+ return 0;
- if (desc != NULL && !err) {
- err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
- F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
- NULL, XATTR_CREATE);
- if (!err) {
- file_set_verity(inode);
- f2fs_set_inode_flags(inode);
- f2fs_mark_inode_dirty_sync(inode, true);
- }
+cleanup:
+ /*
+ * Verity failed to be enabled, so clean up by truncating any verity
+ * metadata that was written beyond i_size (both from cache and from
+ * disk) and clearing FI_VERITY_IN_PROGRESS.
+ *
+ * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection
+ * from re-instantiating cached pages we are truncating (since unlike
+ * normal file accesses, garbage collection isn't limited by i_size).
+ */
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ err2 = f2fs_truncate(inode);
+ if (err2) {
+ f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)",
+ err2);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- return err;
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+ return err ?: err2;
}
static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 296b3189448a..cf1bfed1e662 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -661,8 +661,17 @@ static int __f2fs_setxattr(struct inode *inode, int index,
}
last = here;
- while (!IS_XATTR_LAST_ENTRY(last))
+ while (!IS_XATTR_LAST_ENTRY(last)) {
+ if ((void *)(last) + sizeof(__u32) > last_base_addr ||
+ (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) {
+ f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu",
+ inode->i_ino, ENTRY_SIZE(last));
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ error = -EFSCORRUPTED;
+ goto exit;
+ }
last = XATTR_NEXT_ENTRY(last);
+ }
newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3dc90e5293e6..fa0fdd829613 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -993,13 +993,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
struct fown_struct *fown;
+ unsigned long flags;
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
- read_lock(&fa->fa_lock);
+ read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
fown = &fa->fa_file->f_owner;
/* Don't send SIGURG to processes which have not set a
@@ -1008,7 +1009,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
- read_unlock(&fa->fa_lock);
+ read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}
}
diff --git a/fs/file.c b/fs/file.c
index e5d328335f88..51f53a7dc221 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -706,24 +706,69 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
+static inline struct file *__fget_files_rcu(struct files_struct *files,
+ unsigned int fd, fmode_t mask, unsigned int refs)
+{
+ for (;;) {
+ struct file *file;
+ struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+ struct file __rcu **fdentry;
+
+ if (unlikely(fd >= fdt->max_fds))
+ return NULL;
+
+ fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
+ file = rcu_dereference_raw(*fdentry);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(file->f_mode & mask))
+ return NULL;
+
+ /*
+ * Ok, we have a file pointer. However, because we do
+ * this all locklessly under RCU, we may be racing with
+ * that file being closed.
+ *
+ * Such a race can take two forms:
+ *
+ * (a) the file ref already went down to zero,
+ * and get_file_rcu_many() fails. Just try
+ * again:
+ */
+ if (unlikely(!get_file_rcu_many(file, refs)))
+ continue;
+
+ /*
+ * (b) the file table entry has changed under us.
+ * Note that we don't need to re-check the 'fdt->fd'
+ * pointer having changed, because it always goes
+ * hand-in-hand with 'fdt'.
+ *
+ * If so, we need to put our refs and try again.
+ */
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
+ unlikely(rcu_dereference_raw(*fdentry) != file)) {
+ fput_many(file, refs);
+ continue;
+ }
+
+ /*
+ * Ok, we have a ref to the file, and checked that it
+ * still exists.
+ */
+ return file;
+ }
+}
+
+
static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
{
struct files_struct *files = current->files;
struct file *file;
rcu_read_lock();
-loop:
- file = fcheck_files(files, fd);
- if (file) {
- /* File object ref couldn't be taken.
- * dup2() atomicity guarantee is the reason
- * we loop to catch the new file (or NULL pointer)
- */
- if (file->f_mode & mask)
- file = NULL;
- else if (!get_file_rcu_many(file, refs))
- goto loop;
- }
+ file = __fget_files_rcu(files, fd, mask, refs);
rcu_read_unlock();
return file;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5f6400ba82c0..22e9c88f3960 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -505,12 +505,19 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (!isw)
return;
+ atomic_inc(&isw_nr_in_flight);
+
/* find and pin the new wb */
rcu_read_lock();
memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
- if (memcg_css)
- isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ if (memcg_css && !css_tryget(memcg_css))
+ memcg_css = NULL;
rcu_read_unlock();
+ if (!memcg_css)
+ goto out_free;
+
+ isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ css_put(memcg_css);
if (!isw->new_wb)
goto out_free;
@@ -535,11 +542,10 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-
- atomic_inc(&isw_nr_in_flight);
return;
out_free:
+ atomic_dec(&isw_nr_in_flight);
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
@@ -1238,7 +1244,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
- int flags, unsigned long dirtied_before)
+ unsigned long dirtied_before)
{
LIST_HEAD(tmp);
struct list_head *pos, *node;
@@ -1254,8 +1260,6 @@ static int move_expired_inodes(struct list_head *delaying_queue,
list_move(&inode->i_io_list, &tmp);
moved++;
spin_lock(&inode->i_lock);
- if (flags & EXPIRE_DIRTY_ATIME)
- inode->i_state |= I_DIRTY_TIME_EXPIRED;
inode->i_state |= I_SYNC_QUEUED;
spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
@@ -1303,11 +1307,11 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, dirtied_before);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
if (!work->for_sync)
time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
- EXPIRE_DIRTY_ATIME, time_expire_jif);
+ time_expire_jif);
if (moved)
wb_io_lists_populated(wb);
trace_writeback_queue_io(wb, work, dirtied_before, moved);
@@ -1476,25 +1480,25 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
+ * If the inode has dirty timestamps and we need to write them, call
+ * mark_inode_dirty_sync() to notify the filesystem about it and to
+ * change I_DIRTY_TIME into I_DIRTY_SYNC.
+ */
+ if ((inode->i_state & I_DIRTY_TIME) &&
+ (wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
+ time_after(jiffies, inode->dirtied_time_when +
+ dirtytime_expire_interval * HZ))) {
+ trace_writeback_lazytime(inode);
+ mark_inode_dirty_sync(inode);
+ }
+
+ /*
* Some filesystems may redirty the inode during the writeback
* due to delalloc, clear dirty metadata flags right before
* write_inode()
*/
spin_lock(&inode->i_lock);
-
dirty = inode->i_state & I_DIRTY;
- if (inode->i_state & I_DIRTY_TIME) {
- if ((dirty & I_DIRTY_INODE) ||
- wbc->sync_mode == WB_SYNC_ALL ||
- unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
- unlikely(time_after(jiffies,
- (inode->dirtied_time_when +
- dirtytime_expire_interval * HZ)))) {
- dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
- trace_writeback_lazytime(inode);
- }
- } else
- inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
inode->i_state &= ~dirty;
/*
@@ -1515,8 +1519,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_unlock(&inode->i_lock);
- if (dirty & I_DIRTY_TIME)
- mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & ~I_DIRTY_PAGES) {
int err = write_inode(inode, wbc);
@@ -2200,28 +2202,6 @@ int dirtytime_interval_handler(struct ctl_table *table, int write,
return ret;
}
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
-{
- if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
- struct dentry *dentry;
- const char *name = "?";
-
- dentry = d_find_alias(inode);
- if (dentry) {
- spin_lock(&dentry->d_lock);
- name = (const char *) dentry->d_name.name;
- }
- printk(KERN_DEBUG
- "%s(%d): dirtied inode %lu (%s) on %s\n",
- current->comm, task_pid_nr(current), inode->i_ino,
- name, inode->i_sb->s_id);
- if (dentry) {
- spin_unlock(&dentry->d_lock);
- dput(dentry);
- }
- }
-}
-
/**
* __mark_inode_dirty - internal function
*
@@ -2281,9 +2261,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
(dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;
- if (unlikely(block_dump))
- block_dump___mark_inode_dirty(inode);
-
spin_lock(&inode->i_lock);
if (dirtytime && (inode->i_state & I_DIRTY_INODE))
goto out_unlock_inode;
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 138b5b4d621d..e492a83fa100 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -258,7 +258,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
struct fs_context *fc;
int ret = -ENOMEM;
- fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT);
if (!fc)
return ERR_PTR(-ENOMEM);
@@ -585,7 +585,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
param->key);
}
- if (len > PAGE_SIZE - 2 - size)
+ if (size + len + 2 > PAGE_SIZE)
return invalf(fc, "VFS: Legacy: Cumulative options too large");
if (strchr(param->key, ',') ||
(param->type == fs_value_is_string &&
@@ -686,7 +686,7 @@ const struct fs_context_operations legacy_fs_context_ops = {
*/
static int legacy_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+ fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
if (!fc->fs_private)
return -ENOMEM;
fc->ops = &legacy_fs_context_ops;
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 0ce39658a620..44a426c8ea01 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -74,10 +74,8 @@ void fscache_free_cookie(struct fscache_cookie *cookie)
static int fscache_set_key(struct fscache_cookie *cookie,
const void *index_key, size_t index_key_len)
{
- unsigned long long h;
u32 *buf;
int bufs;
- int i;
bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf));
@@ -91,17 +89,7 @@ static int fscache_set_key(struct fscache_cookie *cookie,
}
memcpy(buf, index_key, index_key_len);
-
- /* Calculate a hash and combine this with the length in the first word
- * or first half word
- */
- h = (unsigned long)cookie->parent;
- h += index_key_len + cookie->type;
-
- for (i = 0; i < bufs; i++)
- h += buf[i];
-
- cookie->key_hash = h ^ (h >> 32);
+ cookie->key_hash = fscache_hash(0, buf, bufs);
return 0;
}
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 9616af3768e1..d09d4e69c818 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -97,6 +97,8 @@ extern struct workqueue_struct *fscache_object_wq;
extern struct workqueue_struct *fscache_op_wq;
DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n);
+
static inline bool fscache_object_congested(void)
{
return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 59c2494efda3..3aa3756c7176 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -95,6 +95,45 @@ static struct ctl_table fscache_sysctls_root[] = {
#endif
/*
+ * Mixing scores (in bits) for (7,20):
+ * Input delta: 1-bit 2-bit
+ * 1 round: 330.3 9201.6
+ * 2 rounds: 1246.4 25475.4
+ * 3 rounds: 1907.1 31295.1
+ * 4 rounds: 2042.3 31718.6
+ * Perfect: 2048 31744
+ * (32*64) (32*31/2 * 64)
+ */
+#define HASH_MIX(x, y, a) \
+ ( x ^= (a), \
+ y ^= x, x = rol32(x, 7),\
+ x += y, y = rol32(y,20),\
+ y *= 9 )
+
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
+{
+ /* Use arch-optimized multiply if one exists */
+ return __hash_32(y ^ __hash_32(x));
+}
+
+/*
+ * Generate a hash. This is derived from full_name_hash(), but we want to be
+ * sure it is arch independent and that it doesn't change as bits of the
+ * computed hash value might appear on disk. The caller also guarantees that
+ * the hashed data will be a series of aligned 32-bit words.
+ */
+unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n)
+{
+ unsigned int a, x = 0, y = salt;
+
+ for (; n; n--) {
+ a = *data++;
+ HASH_MIX(x, y, a);
+ }
+ return fold_hash(x, y);
+}
+
+/*
* initialise the fs caching module
*/
static int __init fscache_init(void)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 00015d851382..e51b7019e887 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -624,6 +624,8 @@ static int __init cuse_init(void)
cuse_channel_fops.owner = THIS_MODULE;
cuse_channel_fops.open = cuse_channel_open;
cuse_channel_fops.release = cuse_channel_release;
+ /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */
+ cuse_channel_fops.unlocked_ioctl = NULL;
cuse_class = class_create(THIS_MODULE, "cuse");
if (IS_ERR(cuse_class))
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 40c262b3f9ff..ac6a8da34013 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -282,10 +282,10 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
/*
* test_and_set_bit() implies smp_mb() between bit
- * changing and below intr_entry check. Pairs with
+ * changing and below FR_INTERRUPTED check. Pairs with
* smp_mb() from queue_interrupt().
*/
- if (!list_empty(&req->intr_entry)) {
+ if (test_bit(FR_INTERRUPTED, &req->flags)) {
spin_lock(&fiq->lock);
list_del_init(&req->intr_entry);
spin_unlock(&fiq->lock);
@@ -770,6 +770,7 @@ static int fuse_check_page(struct page *page)
1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
+ 1 << PG_workingset |
1 << PG_reclaim |
1 << PG_waiters))) {
pr_warn("trying to steal weird page\n");
@@ -843,6 +844,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
lru_cache_add_file(newpage);
+ /*
+ * Release while we have extra ref on stolen page. Otherwise
+ * anon_pipe_buf_release() might think the page can be reused.
+ */
+ pipe_buf_release(cs->pipe, buf);
+
err = 0;
spin_lock(&cs->req->waitq.lock);
if (test_bit(FR_ABORTED, &cs->req->flags))
@@ -926,7 +933,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
while (count) {
if (cs->write && cs->pipebufs && page) {
- return fuse_ref_page(cs, page, offset, count);
+ /*
+ * Can't control lifetime of pipe buffers, so always
+ * copy user pages.
+ */
+ if (cs->req->args->user_pages) {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ } else {
+ return fuse_ref_page(cs, page, offset, count);
+ }
} else if (!cs->len) {
if (cs->move_pages && page &&
offset == 0 && count == PAGE_SIZE) {
@@ -1263,6 +1280,15 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
goto restart;
}
spin_lock(&fpq->lock);
+ /*
+ * Must not put request on fpq->io queue after having been shut down by
+ * fuse_abort_conn()
+ */
+ if (!fpq->connected) {
+ req->out.h.error = err = -ECONNABORTED;
+ goto out_end;
+
+ }
list_add(&req->list, &fpq->io);
spin_unlock(&fpq->lock);
cs->req = req;
@@ -1859,7 +1885,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
}
err = -EINVAL;
- if (oh.error <= -1000 || oh.error > 0)
+ if (oh.error <= -512 || oh.error > 0)
goto copy_finish;
spin_lock(&fpq->lock);
@@ -2017,8 +2043,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe);
out_free:
- for (idx = 0; idx < nbuf; idx++)
- pipe_buf_release(pipe, &bufs[idx]);
+ for (idx = 0; idx < nbuf; idx++) {
+ struct pipe_buffer *buf = &bufs[idx];
+
+ if (buf->ops)
+ pipe_buf_release(pipe, buf);
+ }
pipe_unlock(pipe);
kvfree(bufs);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 60378f3baaae..34487bf1d791 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1032,7 +1032,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
if (!parent)
return -ENOENT;
- inode_lock(parent);
+ inode_lock_nested(parent, I_MUTEX_PARENT);
if (!S_ISDIR(parent->i_mode))
goto unlock;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1e1aef1bc20b..efb2a4871291 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -193,12 +193,11 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!(ff->open_flags & FOPEN_KEEP_CACHE))
- invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
+
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -206,10 +205,14 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
+ truncate_pagecache(inode, 0);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
+ } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
+ invalidate_inode_pages2(inode->i_mapping);
}
+
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
@@ -1108,6 +1111,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
unsigned int offset, i;
+ bool short_write;
int err;
for (i = 0; i < ap->num_pages; i++)
@@ -1120,32 +1124,38 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
if (!err && ia->write.out.size > count)
err = -EIO;
+ short_write = ia->write.out.size < count;
offset = ap->descs[0].offset;
count = ia->write.out.size;
for (i = 0; i < ap->num_pages; i++) {
struct page *page = ap->pages[i];
- if (!err && !offset && count >= PAGE_SIZE)
- SetPageUptodate(page);
-
- if (count > PAGE_SIZE - offset)
- count -= PAGE_SIZE - offset;
- else
- count = 0;
- offset = 0;
-
- unlock_page(page);
+ if (err) {
+ ClearPageUptodate(page);
+ } else {
+ if (count >= PAGE_SIZE - offset)
+ count -= PAGE_SIZE - offset;
+ else {
+ if (short_write)
+ ClearPageUptodate(page);
+ count = 0;
+ }
+ offset = 0;
+ }
+ if (ia->write.page_locked && (i == ap->num_pages - 1))
+ unlock_page(page);
put_page(page);
}
return err;
}
-static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
+static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos,
unsigned int max_pages)
{
+ struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host);
unsigned offset = pos & (PAGE_SIZE - 1);
size_t count = 0;
@@ -1198,6 +1208,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
if (offset == PAGE_SIZE)
offset = 0;
+ /* If we copied full page, mark it uptodate */
+ if (tmp == PAGE_SIZE)
+ SetPageUptodate(page);
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ } else {
+ ia->write.page_locked = true;
+ break;
+ }
if (!fc->big_writes)
break;
} while (iov_iter_count(ii) && count < fc->max_write &&
@@ -1241,7 +1261,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
break;
}
- count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
+ count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
if (count <= 0) {
err = count;
} else {
@@ -1413,6 +1433,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
+ ap->args.user_pages = true;
if (write)
ap->args.in_pages = 1;
else
@@ -3168,7 +3189,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
- int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
if (!err)
fuse_sync_writes(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e3688312e9f1..83c2855bc740 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -248,6 +248,7 @@ struct fuse_args {
bool nocreds:1;
bool in_pages:1;
bool out_pages:1;
+ bool user_pages:1;
bool out_argvar:1;
bool page_zeroing:1;
bool page_replace:1;
@@ -791,6 +792,7 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
static inline void fuse_make_bad(struct inode *inode)
{
+ remove_inode_hash(inode);
set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
}
@@ -844,6 +846,7 @@ struct fuse_io_args {
struct {
struct fuse_write_in in;
struct fuse_write_out out;
+ bool page_locked;
} write;
};
struct fuse_args_pages ap;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 7505f8102762..fadf6fb90fe2 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -667,6 +667,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
out_vqs:
vdev->config->reset(vdev);
virtio_fs_cleanup_vqs(vdev, fs);
+ kfree(fs->vqs);
out:
vdev->priv = NULL;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 50fa3e08c02f..dec5285a02e9 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -940,7 +940,7 @@ do_alloc:
else if (height == ip->i_height)
ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
else
- iomap->length = size - pos;
+ iomap->length = size - iomap->offset;
} else if (flags & IOMAP_WRITE) {
u64 alloc_size;
@@ -1228,6 +1228,9 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
gfs2_inplace_release(ip);
+ if (ip->i_qadata && ip->i_qadata->qa_qd_num)
+ gfs2_quota_unlock(ip);
+
if (length != written && (iomap->flags & IOMAP_F_NEW)) {
/* Deallocate blocks that were just allocated. */
loff_t blockmask = i_blocksize(inode) - 1;
@@ -1240,9 +1243,6 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
}
}
- if (ip->i_qadata && ip->i_qadata->qa_qd_num)
- gfs2_quota_unlock(ip);
-
if (unlikely(!written))
goto out_unlock;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4a10b4e7092a..69c52edf5713 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -875,8 +875,11 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
current->backing_dev_info = inode_to_bdi(inode);
buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
current->backing_dev_info = NULL;
- if (unlikely(buffered <= 0))
+ if (unlikely(buffered <= 0)) {
+ if (!ret)
+ ret = buffered;
goto out_unlock;
+ }
/*
* We need to ensure that the page cache pages are written to
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9e1685a30bf8..01c41087f067 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1552,6 +1552,7 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_entry(list->next, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
+ clear_bit(GLF_LRU, &gl->gl_flags);
if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_add(&gl->gl_lru, &lru_list);
@@ -1597,7 +1598,6 @@ static long gfs2_scan_glock_lru(int nr)
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
list_move(&gl->gl_lru, &dispose);
atomic_dec(&lru_count);
- clear_bit(GLF_LRU, &gl->gl_flags);
freed++;
continue;
}
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 7c7197343ee2..94c290a333a0 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -280,7 +280,6 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- int lvb_needs_unlock = 0;
int error;
if (gl->gl_lksb.sb_lkid == 0) {
@@ -293,13 +292,15 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_update_request_times(gl);
- /* don't want to skip dlm_unlock writing the lvb when lock is ex */
-
- if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE))
- lvb_needs_unlock = 1;
+ /* don't want to call dlm if we've unmounted the lock protocol */
+ if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
+ gfs2_glock_free(gl);
+ return;
+ }
+ /* don't want to skip dlm_unlock writing the lvb when lock has one */
if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
- !lvb_needs_unlock) {
+ !gl->gl_lksb.sb_lvbptr) {
gfs2_glock_free(gl);
return;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c056ed5c6df3..8153a3eac540 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -925,15 +925,15 @@ static int read_rindex_entry(struct gfs2_inode *ip)
rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
spin_lock_init(&rgd->rd_rsspin);
- error = compute_bitstructs(rgd);
- if (error)
- goto fail;
-
error = gfs2_glock_get(sdp, rgd->rd_addr,
&gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
if (error)
goto fail;
+ error = compute_bitstructs(rgd);
+ if (error)
+ goto fail_glock;
+
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -950,6 +950,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
}
error = 0; /* someone else read in the rgrp; free it and ignore it */
+fail_glock:
gfs2_glock_put(rgd->rd_gl);
fail:
@@ -1429,7 +1430,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
start = r.start >> bs_shift;
end = start + (r.len >> bs_shift);
- minlen = max_t(u64, r.minlen,
+ minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
+ minlen = max_t(u64, minlen,
q->limits.discard_granularity) >> bs_shift;
if (end <= start || minlen > sdp->sd_max_rg_data)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50c925d9c610..9c593fd50c6a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -757,11 +757,13 @@ void gfs2_freeze_func(struct work_struct *work)
static int gfs2_freeze(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- int error = 0;
+ int error;
mutex_lock(&sdp->sd_freeze_mutex);
- if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
+ if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) {
+ error = -EBUSY;
goto out;
+ }
if (test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) {
error = -EINVAL;
@@ -798,10 +800,10 @@ static int gfs2_unfreeze(struct super_block *sb)
struct gfs2_sbd *sdp = sb->s_fs_info;
mutex_lock(&sdp->sd_freeze_mutex);
- if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
+ if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
!gfs2_holder_initialized(&sdp->sd_freeze_gh)) {
mutex_unlock(&sdp->sd_freeze_mutex);
- return 0;
+ return -EINVAL;
}
gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4af318fbda77..ef9498a6e88a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -25,7 +25,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->key = ptr + tree->max_key_len + 2;
hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
tree->cnid, __builtin_return_address(0));
- mutex_lock(&tree->tree_lock);
+ switch (tree->cnid) {
+ case HFS_CAT_CNID:
+ mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
+ break;
+ case HFS_EXT_CNID:
+ mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
+ break;
+ case HFS_ATTR_CNID:
+ mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
+ break;
+ default:
+ return -EINVAL;
+ }
return 0;
}
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index b63a4df7327b..c0a73a6ffb28 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -15,16 +15,31 @@
#include "btree.h"
-void hfs_bnode_read(struct hfs_bnode *node, void *buf,
- int off, int len)
+void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
{
struct page *page;
+ int pagenum;
+ int bytes_read;
+ int bytes_to_read;
+ void *vaddr;
off += node->page_offset;
- page = node->page[0];
+ pagenum = off >> PAGE_SHIFT;
+ off &= ~PAGE_MASK; /* compute page offset for the first page */
- memcpy(buf, kmap(page) + off, len);
- kunmap(page);
+ for (bytes_read = 0; bytes_read < len; bytes_read += bytes_to_read) {
+ if (pagenum >= node->tree->pages_per_bnode)
+ break;
+ page = node->page[pagenum];
+ bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off);
+
+ vaddr = kmap_atomic(page);
+ memcpy(buf + bytes_read, vaddr + off, bytes_to_read);
+ kunmap_atomic(vaddr);
+
+ pagenum++;
+ off = 0; /* page offset only applies to the first page */
+ }
}
u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index dcc2aab1b2c4..25ac9a8bb57a 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -13,6 +13,13 @@ typedef int (*btree_keycmp)(const btree_key *, const btree_key *);
#define NODE_HASH_SIZE 256
+/* B-tree mutex nested subclasses */
+enum hfs_btree_mutex_classes {
+ CATALOG_BTREE_MUTEX,
+ EXTENTS_BTREE_MUTEX,
+ ATTR_BTREE_MUTEX,
+};
+
/* A HFS BTree held in memory */
struct hfs_btree {
struct super_block *sb;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c33324686d89..bcf820ce0e02 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -421,14 +421,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
if (!res) {
if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
res = -EIO;
- goto bail;
+ goto bail_hfs_find;
}
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
}
- if (res) {
- hfs_find_exit(&fd);
- goto bail_no_root;
- }
+ if (res)
+ goto bail_hfs_find;
res = -EINVAL;
root_inode = hfs_iget(sb, &fd.search_key->cat, &rec);
hfs_find_exit(&fd);
@@ -444,6 +442,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
/* everything's okay */
return 0;
+bail_hfs_find:
+ hfs_find_exit(&fd);
bail_no_root:
pr_err("get root inode failed\n");
bail:
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index a930ddd15681..7054a542689f 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -598,13 +598,15 @@ void hfsplus_file_truncate(struct inode *inode)
res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
if (res)
break;
- hfs_brec_remove(&fd);
- mutex_unlock(&fd.tree->tree_lock);
start = hip->cached_start;
+ if (blk_cnt <= start)
+ hfs_brec_remove(&fd);
+ mutex_unlock(&fd.tree->tree_lock);
hfsplus_free_extents(sb, hip->cached_extents,
alloc_cnt - start, alloc_cnt - blk_cnt);
hfsplus_dump_extent(hip->cached_extents);
+ mutex_lock(&fd.tree->tree_lock);
if (blk_cnt > start) {
hip->extent_state |= HFSPLUS_EXT_DIRTY;
break;
@@ -612,7 +614,6 @@ void hfsplus_file_truncate(struct inode *inode)
alloc_cnt = start;
hip->cached_start = hip->cached_blocks = 0;
hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
- mutex_lock(&fd.tree->tree_lock);
}
hfs_find_exit(&fd);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5a7eb0c79839..58a972667bf8 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -139,10 +139,10 @@ static char *inode_name(struct inode *ino)
static char *follow_link(char *link)
{
- int len, n;
char *name, *resolved, *end;
+ int n;
- name = __getname();
+ name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
n = -ENOMEM;
goto out_free;
@@ -164,21 +164,18 @@ static char *follow_link(char *link)
return name;
*(end + 1) = '\0';
- len = strlen(link) + strlen(name) + 1;
- resolved = kmalloc(len, GFP_KERNEL);
+ resolved = kasprintf(GFP_KERNEL, "%s%s", link, name);
if (resolved == NULL) {
n = -ENOMEM;
goto out_free;
}
- sprintf(resolved, "%s%s", link, name);
- __putname(name);
- kfree(link);
+ kfree(name);
return resolved;
out_free:
- __putname(name);
+ kfree(name);
return ERR_PTR(n);
}
@@ -918,18 +915,16 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
sb->s_d_op = &simple_dentry_operations;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- /* NULL is printed as <NULL> by sprintf: avoid that. */
+ /* NULL is printed as '(null)' by printf(): avoid that. */
if (req_root == NULL)
req_root = "";
err = -ENOMEM;
sb->s_fs_info = host_root_path =
- kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
+ kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root);
if (host_root_path == NULL)
goto out;
- sprintf(host_root_path, "%s/%s", root_ino, req_root);
-
root_inode = new_inode(sb);
if (!root_inode)
goto out;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5fff7cb3582f..7d039ba5ae28 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
@@ -76,7 +77,7 @@ enum hugetlb_param {
static const struct fs_parameter_spec hugetlb_param_specs[] = {
fsparam_u32 ("gid", Opt_gid),
fsparam_string("min_size", Opt_min_size),
- fsparam_u32 ("mode", Opt_mode),
+ fsparam_u32oct("mode", Opt_mode),
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("pagesize", Opt_pagesize),
fsparam_string("size", Opt_size),
@@ -135,6 +136,7 @@ static void huge_pagevec_release(struct pagevec *pvec)
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
@@ -150,6 +152,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
vma->vm_ops = &hugetlb_vm_ops;
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
+
/*
* page based offset in vm_pgoff could be sufficiently large to
* overflow a loff_t when converted to byte offset. This can
@@ -196,13 +202,61 @@ out:
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
+hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_end(addr);
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
+}
+
+static unsigned long
+hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (unlikely(offset_in_page(addr))) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_end(addr);
+ addr = vm_unmapped_area(&info);
+ }
+
+ return addr;
+}
+
+static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -218,18 +272,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mmap_end - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
- info.flags = 0;
- info.length = len;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- return vm_unmapped_area(&info);
+ /*
+ * Use mm->get_unmapped_area value as a hint to use topdown routine.
+ * If architectures have special needs, they should define their own
+ * version of hugetlb_get_unmapped_area.
+ */
+ if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
+ return hugetlb_get_unmapped_area_topdown(file, addr, len,
+ pgoff, flags);
+ return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+ pgoff, flags);
}
#endif
@@ -440,7 +497,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
+ hash = hugetlb_fault_mutex_hash(h, mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -644,7 +701,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
- hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
@@ -675,9 +732,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ set_page_huge_active(page);
/*
* unlock_page because locked by add_to_page_cache()
- * page_put due to reference from alloc_huge_page()
+ * put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
put_page(page);
diff --git a/fs/internal.h b/fs/internal.h
index 7651e8b8ef13..61aed95f83d1 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -52,7 +52,6 @@ extern void __init chrdev_init(void);
*/
extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
-extern void fc_drop_locked(struct fs_context *);
extern void vfs_clean_context(struct fs_context *fc);
extern int finish_clean_context(struct fs_context *fc);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4127ea027a14..e73969fa96bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -438,6 +438,22 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
return ctx;
}
+static void io_req_put_fs(struct io_kiocb *req)
+{
+ struct fs_struct *fs = req->fs;
+
+ if (!fs)
+ return;
+
+ spin_lock(&req->fs->lock);
+ if (--fs->users)
+ fs = NULL;
+ spin_unlock(&req->fs->lock);
+ if (fs)
+ free_fs_struct(fs);
+ req->fs = NULL;
+}
+
static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
@@ -695,6 +711,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
static void __io_free_req(struct io_kiocb *req)
{
+ io_req_put_fs(req);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
percpu_ref_put(&req->ctx->refs);
@@ -1701,16 +1718,7 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ret = -EINTR;
}
- if (req->fs) {
- struct fs_struct *fs = req->fs;
-
- spin_lock(&req->fs->lock);
- if (--fs->users)
- fs = NULL;
- spin_unlock(&req->fs->lock);
- if (fs)
- free_fs_struct(fs);
- }
+ io_req_put_fs(req);
io_cqring_add_event(req->ctx, sqe->user_data, ret);
io_put_req(req);
return 0;
@@ -2226,7 +2234,8 @@ restart:
/* Ensure we clear previously set non-block flag */
req->rw.ki_flags &= ~IOCB_NOWAIT;
- if (req->fs != current->fs && current->fs != old_fs_struct) {
+ if ((req->fs && req->fs != current->fs) ||
+ (!req->fs && current->fs != old_fs_struct)) {
task_lock(current);
if (req->fs)
current->fs = req->fs;
@@ -2351,7 +2360,7 @@ out:
mmput(cur_mm);
}
revert_creds(old_cred);
- if (old_fs_struct) {
+ if (old_fs_struct != current->fs) {
task_lock(current);
current->fs = old_fs_struct;
task_unlock(current);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 80867a1a94f2..5c73751adb2d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -30,6 +30,7 @@ iomap_page_create(struct inode *inode, struct page *page)
iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
atomic_set(&iop->read_count, 0);
atomic_set(&iop->write_count, 0);
+ spin_lock_init(&iop->uptodate_lock);
bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
/*
@@ -118,25 +119,38 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
}
static void
-iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
{
struct iomap_page *iop = to_iomap_page(page);
struct inode *inode = page->mapping->host;
unsigned first = off >> inode->i_blkbits;
unsigned last = (off + len - 1) >> inode->i_blkbits;
- unsigned int i;
bool uptodate = true;
+ unsigned long flags;
+ unsigned int i;
- if (iop) {
- for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
- if (i >= first && i <= last)
- set_bit(i, iop->uptodate);
- else if (!test_bit(i, iop->uptodate))
- uptodate = false;
- }
+ spin_lock_irqsave(&iop->uptodate_lock, flags);
+ for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
+ if (i >= first && i <= last)
+ set_bit(i, iop->uptodate);
+ else if (!test_bit(i, iop->uptodate))
+ uptodate = false;
}
- if (uptodate && !PageError(page))
+ if (uptodate)
+ SetPageUptodate(page);
+ spin_unlock_irqrestore(&iop->uptodate_lock, flags);
+}
+
+static void
+iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+{
+ if (PageError(page))
+ return;
+
+ if (page_has_private(page))
+ iomap_iop_set_range_uptodate(page, off, len);
+ else
SetPageUptodate(page);
}
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index c04bad4b2b43..10c4c1e80124 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -140,23 +140,20 @@ loff_t
iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
{
loff_t size = i_size_read(inode);
- loff_t length = size - offset;
loff_t ret;
/* Nothing to be found before or beyond the end of the file. */
if (offset < 0 || offset >= size)
return -ENXIO;
- while (length > 0) {
- ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
- &offset, iomap_seek_hole_actor);
+ while (offset < size) {
+ ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT,
+ ops, &offset, iomap_seek_hole_actor);
if (ret < 0)
return ret;
if (ret == 0)
break;
-
offset += ret;
- length -= ret;
}
return offset;
@@ -186,27 +183,23 @@ loff_t
iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
{
loff_t size = i_size_read(inode);
- loff_t length = size - offset;
loff_t ret;
/* Nothing to be found before or beyond the end of the file. */
if (offset < 0 || offset >= size)
return -ENXIO;
- while (length > 0) {
- ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
- &offset, iomap_seek_data_actor);
+ while (offset < size) {
+ ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT,
+ ops, &offset, iomap_seek_data_actor);
if (ret < 0)
return ret;
if (ret == 0)
- break;
-
+ return offset;
offset += ret;
- length -= ret;
}
- if (length <= 0)
- return -ENXIO;
- return offset;
+ /* We've reached the end of the file without finding data */
+ return -ENXIO;
}
EXPORT_SYMBOL_GPL(iomap_seek_data);
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index 152a230f668d..2d18246f6726 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -30,11 +30,16 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
{
struct iomap *iomap = &isi->iomap;
unsigned long nr_pages;
+ unsigned long max_pages;
uint64_t first_ppage;
uint64_t first_ppage_reported;
uint64_t next_ppage;
int error;
+ if (unlikely(isi->nr_pages >= isi->sis->max))
+ return 0;
+ max_pages = isi->sis->max - isi->nr_pages;
+
/*
* Round the start up and the end down so that the physical
* extent aligns to a page boundary.
@@ -47,6 +52,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
/*
* Calculate how much swap space we're adding; the first page contains
@@ -169,6 +175,16 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
return ret;
}
+ /*
+ * If this swapfile doesn't contain even a single page-aligned
+ * contiguous range of blocks, reject this useless swapfile to
+ * prevent confusion later on.
+ */
+ if (isi.nr_pages == 0) {
+ pr_warn("swapon: Cannot find a single usable page in file.\n");
+ return -EINVAL;
+ }
+
*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
sis->max = isi.nr_pages;
sis->pages = isi.nr_pages - 1;
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index f0fe641893a5..b9e6a7ec78be 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -152,6 +152,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *file,
printk(KERN_NOTICE "iso9660: Corrupted directory entry"
" in block %lu of inode %lu\n", block,
inode->i_ino);
+ brelse(bh);
return -EIO;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 62c0462dc89f..74e487d63c62 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -155,7 +155,6 @@ struct iso9660_options{
unsigned int overriderockperm:1;
unsigned int uid_set:1;
unsigned int gid_set:1;
- unsigned int utf8:1;
unsigned char map;
unsigned char check;
unsigned int blocksize;
@@ -355,7 +354,6 @@ static int parse_options(char *options, struct iso9660_options *popt)
popt->gid = GLOBAL_ROOT_GID;
popt->uid = GLOBAL_ROOT_UID;
popt->iocharset = NULL;
- popt->utf8 = 0;
popt->overriderockperm = 0;
popt->session=-1;
popt->sbsector=-1;
@@ -388,10 +386,13 @@ static int parse_options(char *options, struct iso9660_options *popt)
case Opt_cruft:
popt->cruft = 1;
break;
+#ifdef CONFIG_JOLIET
case Opt_utf8:
- popt->utf8 = 1;
+ kfree(popt->iocharset);
+ popt->iocharset = kstrdup("utf8", GFP_KERNEL);
+ if (!popt->iocharset)
+ return 0;
break;
-#ifdef CONFIG_JOLIET
case Opt_iocharset:
kfree(popt->iocharset);
popt->iocharset = match_strdup(&args[0]);
@@ -494,7 +495,6 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root)
if (sbi->s_nocompress) seq_puts(m, ",nocompress");
if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm");
if (sbi->s_showassoc) seq_puts(m, ",showassoc");
- if (sbi->s_utf8) seq_puts(m, ",utf8");
if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check);
if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping);
@@ -517,9 +517,10 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",fmode=%o", sbi->s_fmode);
#ifdef CONFIG_JOLIET
- if (sbi->s_nls_iocharset &&
- strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0)
+ if (sbi->s_nls_iocharset)
seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset);
+ else
+ seq_puts(m, ",iocharset=utf8");
#endif
return 0;
}
@@ -867,14 +868,13 @@ root_found:
sbi->s_nls_iocharset = NULL;
#ifdef CONFIG_JOLIET
- if (joliet_level && opt.utf8 == 0) {
+ if (joliet_level) {
char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
- sbi->s_nls_iocharset = load_nls(p);
- if (! sbi->s_nls_iocharset) {
- /* Fail only if explicit charset specified */
- if (opt.iocharset)
+ if (strcmp(p, "utf8") != 0) {
+ sbi->s_nls_iocharset = opt.iocharset ?
+ load_nls(opt.iocharset) : load_nls_default();
+ if (!sbi->s_nls_iocharset)
goto out_freesbi;
- sbi->s_nls_iocharset = load_nls_default();
}
}
#endif
@@ -890,7 +890,6 @@ root_found:
sbi->s_gid = opt.gid;
sbi->s_uid_set = opt.uid_set;
sbi->s_gid_set = opt.gid_set;
- sbi->s_utf8 = opt.utf8;
sbi->s_nocompress = opt.nocompress;
sbi->s_overriderockperm = opt.overriderockperm;
/*
@@ -1329,6 +1328,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
de = (struct iso_directory_record *) (bh->b_data + offset);
de_len = *(unsigned char *) de;
+ if (de_len < sizeof(struct iso_directory_record))
+ goto fail;
if (offset + de_len > bufsize) {
int frag1 = bufsize - offset;
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 055ec6c586f7..dcdc191ed183 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -44,7 +44,6 @@ struct isofs_sb_info {
unsigned char s_session;
unsigned int s_high_sierra:1;
unsigned int s_rock:2;
- unsigned int s_utf8:1;
unsigned int s_cruft:1; /* Broken disks with high byte of length
* containing junk */
unsigned int s_nocompress:1;
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index be8b6a9d0b92..c0f04a1e7f69 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -41,14 +41,12 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
int
get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
{
- unsigned char utf8;
struct nls_table *nls;
unsigned char len = 0;
- utf8 = ISOFS_SB(inode->i_sb)->s_utf8;
nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
- if (utf8) {
+ if (!nls) {
len = utf16s_to_utf8s((const wchar_t *) de->name,
de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
outname, PAGE_SIZE);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index cac468f04820..558e7c51ce0d 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -102,6 +102,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
printk(KERN_NOTICE "iso9660: Corrupted directory entry"
" in block %lu of inode %lu\n", block,
dir->i_ino);
+ brelse(bh);
return 0;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 88146008b3e3..d45ceb2e2149 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -451,7 +451,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
}
spin_unlock(&commit_transaction->t_handle_lock);
commit_transaction->t_state = T_SWITCH;
- write_unlock(&journal->j_state_lock);
J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
journal->j_max_transaction_buffers);
@@ -471,6 +470,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* has reserved. This is consistent with the existing behaviour
* that multiple jbd2_journal_get_write_access() calls to the same
* buffer are perfectly permissible.
+ * We use journal->j_state_lock here to serialize processing of
+ * t_reserved_list with eviction of buffers from journal_unmap_buffer().
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
@@ -490,6 +491,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_refile_buffer(journal, jh);
}
+ write_unlock(&journal->j_state_lock);
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index b288c8ae1236..837cd55fd4c5 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -415,13 +415,15 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
ret = -EIO;
- goto out_free;
+ goto out_sum_exit;
}
jffs2_calc_trigger_levels(c);
return 0;
+ out_sum_exit:
+ jffs2_sum_exit(c);
out_free:
kvfree(c->blocks);
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 406d9cc84ba8..79e771ab624f 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -37,6 +37,9 @@ static int jffs2_rtime_compress(unsigned char *data_in,
int outpos = 0;
int pos=0;
+ if (*dstlen <= 3)
+ return -1;
+
memset(positions,0,sizeof(positions));
while (pos < (*sourcelen) && outpos <= (*dstlen)-2) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index f8fb89b10227..34880a4c2173 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -135,20 +135,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
pgoff_t index = pos >> PAGE_SHIFT;
uint32_t pageofs = index << PAGE_SHIFT;
int ret = 0;
- pg = grab_cache_page_write_begin(mapping, index, flags);
- if (!pg)
- return -ENOMEM;
- *pagep = pg;
-
jffs2_dbg(1, "%s()\n", __func__);
if (pageofs > inode->i_size) {
/* Make new hole frag from old EOF to new page */
- struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
uint32_t alloc_len;
@@ -159,7 +154,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
if (ret)
- goto out_page;
+ goto out_err;
mutex_lock(&f->sem);
memset(&ri, 0, sizeof(ri));
@@ -189,7 +184,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
ret = PTR_ERR(fn);
jffs2_complete_reservation(c);
mutex_unlock(&f->sem);
- goto out_page;
+ goto out_err;
}
ret = jffs2_add_full_dnode_to_inode(c, f, fn);
if (f->metadata) {
@@ -204,7 +199,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
mutex_unlock(&f->sem);
- goto out_page;
+ goto out_err;
}
jffs2_complete_reservation(c);
inode->i_size = pageofs;
@@ -212,6 +207,19 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
}
/*
+ * While getting a page and reading data in, lock c->alloc_sem until
+ * the page is Uptodate. Otherwise GC task may attempt to read the same
+ * page in read_cache_page(), which causes a deadlock.
+ */
+ mutex_lock(&c->alloc_sem);
+ pg = grab_cache_page_write_begin(mapping, index, flags);
+ if (!pg) {
+ ret = -ENOMEM;
+ goto release_sem;
+ }
+ *pagep = pg;
+
+ /*
* Read in the page if it wasn't already present. Cannot optimize away
* the whole page write case until jffs2_write_end can handle the
* case of a short-copy.
@@ -220,15 +228,17 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
mutex_lock(&f->sem);
ret = jffs2_do_readpage_nolock(inode, pg);
mutex_unlock(&f->sem);
- if (ret)
- goto out_page;
+ if (ret) {
+ unlock_page(pg);
+ put_page(pg);
+ goto release_sem;
+ }
}
jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
- return ret;
-out_page:
- unlock_page(pg);
- put_page(pg);
+release_sem:
+ mutex_unlock(&c->alloc_sem);
+out_err:
return ret;
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ab8cdd9e9325..ad1eba809e7e 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -602,8 +602,8 @@ out_root:
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
kvfree(c->blocks);
- out_inohash:
jffs2_clear_xattr_subsystem(c);
+ out_inohash:
kfree(c->inocache_list);
out_wbuf:
jffs2_flash_cleanup(c);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 5f7e284e0df3..f73904c08b39 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
if (!s) {
JFFS2_WARNING("Can't allocate memory for summary\n");
ret = -ENOMEM;
- goto out;
+ goto out_buf;
}
}
@@ -274,13 +274,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
}
ret = 0;
out:
+ jffs2_sum_reset_collected(s);
+ kfree(s);
+ out_buf:
if (buf_size)
kfree(flashbuf);
#ifndef __ECOS
else
mtd_unpoint(c->mtd, 0, c->mtd->size);
#endif
- kfree(s);
return ret;
}
@@ -1078,7 +1080,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
memcpy(&fd->name, rd->name, checkedlen);
fd->name[checkedlen] = 0;
- crc = crc32(0, fd->name, rd->nsize);
+ crc = crc32(0, fd->name, checkedlen);
if (crc != je32_to_cpu(rd->name_crc)) {
pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
__func__, ofs, je32_to_cpu(rd->name_crc), crc);
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index be7c8a6a5748..4fe64519870f 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -783,6 +783,8 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n",
je16_to_cpu(temp->u.nodetype));
jffs2_sum_disable_collecting(c->summary);
+ /* The above call removes the list, nothing more to do */
+ goto bail_rwcompat;
} else {
BUG(); /* unknown node in summary information */
}
@@ -794,6 +796,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
c->summary->sum_num--;
}
+ bail_rwcompat:
jffs2_sum_reset_collected(c->summary);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9486afcdac76..62c4a5450cda 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -146,12 +146,14 @@ void jfs_evict_inode(struct inode *inode)
dquot_initialize(inode);
if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
+ struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap;
truncate_inode_pages_final(&inode->i_data);
if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
- diFree(inode);
+ if (ipimap && JFS_IP(ipimap)->i_imap)
+ diFree(inode);
/*
* Free the inode from the quota allocation.
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index caade185e568..79f3440e204b 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -148,6 +148,7 @@ static const s8 budtab[256] = {
* 0 - success
* -ENOMEM - insufficient memory
* -EIO - i/o error
+ * -EINVAL - wrong bmap data
*/
int dbMount(struct inode *ipbmap)
{
@@ -179,6 +180,12 @@ int dbMount(struct inode *ipbmap)
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
+ if (!bmp->db_numag) {
+ release_metapage(mp);
+ kfree(bmp);
+ return -EINVAL;
+ }
+
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
@@ -1656,7 +1663,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
} else if (rc == -ENOSPC) {
/* search for next smaller log2 block */
l2nb = BLKSTOL2(nblocks) - 1;
- nblocks = 1 << l2nb;
+ nblocks = 1LL << l2nb;
} else {
/* Trim any already allocated blocks */
jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 1e899298f7f0..b5d702df7111 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -268,5 +268,6 @@
* fsck() must be run to repair
*/
#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */
+#define FM_STATE_MAX 0x0000000f /* max value of s_state */
#endif /* _H_JFS_FILSYS */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 9330eff210e0..78fd136ac13b 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1324,6 +1324,7 @@ int lmLogInit(struct jfs_log * log)
} else {
if (!uuid_equal(&logsuper->uuid, &log->uuid)) {
jfs_warn("wrong uuid on JFS log device");
+ rc = -EINVAL;
goto errout20;
}
log->size = le32_to_cpu(logsuper->size);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index eb8b9e233d73..d41733540df9 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -36,6 +36,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
+#include <linux/log2.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
@@ -79,14 +80,14 @@ int jfs_mount(struct super_block *sb)
* (initialize mount inode from the superblock)
*/
if ((rc = chkSuper(sb))) {
- goto errout20;
+ goto out;
}
ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
if (ipaimap == NULL) {
jfs_err("jfs_mount: Failed to read AGGREGATE_I");
rc = -EIO;
- goto errout20;
+ goto out;
}
sbi->ipaimap = ipaimap;
@@ -97,7 +98,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((rc = diMount(ipaimap))) {
jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc);
- goto errout21;
+ goto err_ipaimap;
}
/*
@@ -106,7 +107,7 @@ int jfs_mount(struct super_block *sb)
ipbmap = diReadSpecial(sb, BMAP_I, 0);
if (ipbmap == NULL) {
rc = -EIO;
- goto errout22;
+ goto err_umount_ipaimap;
}
jfs_info("jfs_mount: ipbmap:0x%p", ipbmap);
@@ -118,7 +119,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((rc = dbMount(ipbmap))) {
jfs_err("jfs_mount: dbMount failed w/rc = %d", rc);
- goto errout22;
+ goto err_ipbmap;
}
/*
@@ -137,7 +138,7 @@ int jfs_mount(struct super_block *sb)
if (!ipaimap2) {
jfs_err("jfs_mount: Failed to read AGGREGATE_I");
rc = -EIO;
- goto errout35;
+ goto err_umount_ipbmap;
}
sbi->ipaimap2 = ipaimap2;
@@ -149,7 +150,7 @@ int jfs_mount(struct super_block *sb)
if ((rc = diMount(ipaimap2))) {
jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d",
rc);
- goto errout35;
+ goto err_ipaimap2;
}
} else
/* Secondary aggregate inode table is not valid */
@@ -166,7 +167,7 @@ int jfs_mount(struct super_block *sb)
jfs_err("jfs_mount: Failed to read FILESYSTEM_I");
/* open fileset secondary inode allocation map */
rc = -EIO;
- goto errout40;
+ goto err_umount_ipaimap2;
}
jfs_info("jfs_mount: ipimap:0x%p", ipimap);
@@ -176,41 +177,34 @@ int jfs_mount(struct super_block *sb)
/* initialize fileset inode allocation map */
if ((rc = diMount(ipimap))) {
jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
- goto errout41;
+ goto err_ipimap;
}
- goto out;
+ return rc;
/*
* unwind on error
*/
- errout41: /* close fileset inode allocation map inode */
+err_ipimap:
+ /* close fileset inode allocation map inode */
diFreeSpecial(ipimap);
-
- errout40: /* fileset closed */
-
+err_umount_ipaimap2:
/* close secondary aggregate inode allocation map */
- if (ipaimap2) {
+ if (ipaimap2)
diUnmount(ipaimap2, 1);
+err_ipaimap2:
+ /* close aggregate inodes */
+ if (ipaimap2)
diFreeSpecial(ipaimap2);
- }
-
- errout35:
-
- /* close aggregate block allocation map */
+err_umount_ipbmap: /* close aggregate block allocation map */
dbUnmount(ipbmap, 1);
+err_ipbmap: /* close aggregate inodes */
diFreeSpecial(ipbmap);
-
- errout22: /* close aggregate inode allocation map */
-
+err_umount_ipaimap: /* close aggregate inode allocation map */
diUnmount(ipaimap, 1);
-
- errout21: /* close aggregate inodes */
+err_ipaimap: /* close aggregate inodes */
diFreeSpecial(ipaimap);
- errout20: /* aggregate closed */
-
- out:
-
+out:
if (rc)
jfs_err("Mount JFS Failure: %d", rc);
@@ -365,6 +359,15 @@ static int chkSuper(struct super_block *sb)
sbi->bsize = bsize;
sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize);
+ /* check some fields for possible corruption */
+ if (sbi->l2bsize != ilog2((u32)bsize) ||
+ j_sb->pad != 0 ||
+ le32_to_cpu(j_sb->s_state) > FM_STATE_MAX) {
+ rc = -EINVAL;
+ jfs_err("jfs_mount: Mount Failure: superblock is corrupt!");
+ goto out;
+ }
+
/*
* For now, ignore s_pbsize, l2bfactor. All I/O going through buffer
* cache.
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 61d3cc2283dc..273a81971ed5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -395,28 +395,10 @@ nlmsvc_release_lockowner(struct nlm_lock *lock)
nlmsvc_put_lockowner(lock->fl.fl_owner);
}
-static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
-{
- struct nlm_lockowner *nlm_lo = (struct nlm_lockowner *)fl->fl_owner;
- new->fl_owner = nlmsvc_get_lockowner(nlm_lo);
-}
-
-static void nlmsvc_locks_release_private(struct file_lock *fl)
-{
- nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner);
-}
-
-static const struct file_lock_operations nlmsvc_lock_ops = {
- .fl_copy_lock = nlmsvc_locks_copy_lock,
- .fl_release_private = nlmsvc_locks_release_private,
-};
-
void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host,
pid_t pid)
{
fl->fl_owner = nlmsvc_find_lockowner(host, pid);
- if (fl->fl_owner != NULL)
- fl->fl_ops = &nlmsvc_lock_ops;
}
/*
@@ -634,7 +616,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
conflock->caller = "somehost"; /* FIXME */
conflock->len = strlen(conflock->caller);
conflock->oh.len = 0; /* don't return OH info */
- conflock->svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid;
+ conflock->svid = lock->fl.fl_pid;
conflock->fl.fl_type = lock->fl.fl_type;
conflock->fl.fl_start = lock->fl.fl_start;
conflock->fl.fl_end = lock->fl.fl_end;
@@ -788,9 +770,21 @@ nlmsvc_notify_blocked(struct file_lock *fl)
printk(KERN_WARNING "lockd: notification for unknown block!\n");
}
+static fl_owner_t nlmsvc_get_owner(fl_owner_t owner)
+{
+ return nlmsvc_get_lockowner(owner);
+}
+
+static void nlmsvc_put_owner(fl_owner_t owner)
+{
+ nlmsvc_put_lockowner(owner);
+}
+
const struct lock_manager_operations nlmsvc_lock_operations = {
.lm_notify = nlmsvc_notify_blocked,
.lm_grant = nlmsvc_grant_deferred,
+ .lm_get_owner = nlmsvc_get_owner,
+ .lm_put_owner = nlmsvc_put_owner,
};
/*
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7b09a9158e40..3fffc709afd4 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -447,7 +447,8 @@ static const struct address_space_operations minix_aops = {
.writepage = minix_writepage,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
- .bmap = minix_bmap
+ .bmap = minix_bmap,
+ .direct_IO = noop_direct_IO
};
static const struct inode_operations minix_symlink_inode_operations = {
diff --git a/fs/namei.c b/fs/namei.c
index 5b5759d70822..b952ecbd49c2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3878,13 +3878,12 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
dentry->d_inode->i_flags |= S_DEAD;
dont_mount(dentry);
detach_mounts(dentry);
- fsnotify_rmdir(dir, dentry);
out:
inode_unlock(dentry->d_inode);
dput(dentry);
if (!error)
- d_delete(dentry);
+ d_delete_notify(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_rmdir);
@@ -3995,7 +3994,6 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
if (!error) {
dont_mount(dentry);
detach_mounts(dentry);
- fsnotify_unlink(dir, dentry);
}
}
}
@@ -4003,9 +4001,11 @@ out:
inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
- if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ fsnotify_unlink(dir, dentry);
+ } else if (!error) {
fsnotify_link_count(target);
- d_delete(dentry);
+ d_delete_notify(dir, dentry);
}
return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 76ea92994d26..5782cd55dfdb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1647,8 +1647,12 @@ static inline bool may_mount(void)
}
#ifdef CONFIG_MANDATORY_FILE_LOCKING
-static inline bool may_mandlock(void)
+static bool may_mandlock(void)
{
+ pr_warn_once("======================================================\n"
+ "WARNING: the mand mount option is being deprecated and\n"
+ " will be removed in v5.15!\n"
+ "======================================================\n");
return capable(CAP_SYS_ADMIN);
}
#else
@@ -1861,6 +1865,20 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
+static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+ struct mount *child;
+
+ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+ if (!is_subdir(child->mnt_mountpoint, dentry))
+ continue;
+
+ if (child->mnt.mnt_flags & MNT_LOCKED)
+ return true;
+ }
+ return false;
+}
+
/**
* clone_private_mount - create a private clone of a path
*
@@ -1875,14 +1893,27 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
+ down_read(&namespace_sem);
if (IS_MNT_UNBINDABLE(old_mnt))
- return ERR_PTR(-EINVAL);
+ goto invalid;
+
+ if (!check_mnt(old_mnt))
+ goto invalid;
+
+ if (has_locked_children(old_mnt, path->dentry))
+ goto invalid;
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+ up_read(&namespace_sem);
+
if (IS_ERR(new_mnt))
return ERR_CAST(new_mnt);
return &new_mnt->mnt;
+
+invalid:
+ up_read(&namespace_sem);
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -2234,19 +2265,6 @@ static int do_change_type(struct path *path, int ms_flags)
return err;
}
-static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
-{
- struct mount *child;
- list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
- if (!is_subdir(child->mnt_mountpoint, dentry))
- continue;
-
- if (child->mnt.mnt_flags & MNT_LOCKED)
- return true;
- }
- return false;
-}
-
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index e7dd07f47825..e84c187d942e 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -127,7 +127,7 @@ config PNFS_BLOCK
config PNFS_FLEXFILE_LAYOUT
tristate
depends on NFS_V4_1 && NFS_V3
- default m
+ default NFS_V4
config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
string "NFSv4.1 Implementation ID Domain"
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 8f34daf85f70..5d5227ce4d91 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -168,7 +168,7 @@ struct cb_devicenotifyitem {
};
struct cb_devicenotifyargs {
- int ndevs;
+ uint32_t ndevs;
struct cb_devicenotifyitem *devs;
};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index fc775b0b5194..31922657e836 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -364,12 +364,11 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp,
struct cb_process_state *cps)
{
struct cb_devicenotifyargs *args = argp;
- int i;
+ const struct pnfs_layoutdriver_type *ld = NULL;
+ uint32_t i;
__be32 res = 0;
- struct nfs_client *clp = cps->clp;
- struct nfs_server *server = NULL;
- if (!clp) {
+ if (!cps->clp) {
res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
goto out;
}
@@ -377,23 +376,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp,
for (i = 0; i < args->ndevs; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
- if (!server ||
- server->pnfs_curr_ld->id != dev->cbd_layout_type) {
- rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
- if (server->pnfs_curr_ld &&
- server->pnfs_curr_ld->id == dev->cbd_layout_type) {
- rcu_read_unlock();
- goto found;
- }
- rcu_read_unlock();
- continue;
+ if (!ld || ld->id != dev->cbd_layout_type) {
+ pnfs_put_layoutdriver(ld);
+ ld = pnfs_find_layoutdriver(dev->cbd_layout_type);
+ if (!ld)
+ continue;
}
-
- found:
- nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+ nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id);
}
-
+ pnfs_put_layoutdriver(ld);
out:
kfree(args->devs);
return res;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 73a5a5ea2976..04d27f0ed39a 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -258,11 +258,9 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
void *argp)
{
struct cb_devicenotifyargs *args = argp;
+ uint32_t tmp, n, i;
__be32 *p;
__be32 status = 0;
- u32 tmp;
- int n, i;
- args->ndevs = 0;
/* Num of device notifications */
p = xdr_inline_decode(xdr, sizeof(uint32_t));
@@ -271,12 +269,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
goto out;
}
n = ntohl(*p++);
- if (n <= 0)
- goto out;
- if (n > ULONG_MAX / sizeof(*args->devs)) {
- status = htonl(NFS4ERR_BADXDR);
+ if (n == 0)
goto out;
- }
args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
@@ -330,19 +324,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
dev->cbd_immediate = 0;
}
- args->ndevs++;
-
dprintk("%s: type %d layout 0x%x immediate %d\n",
__func__, dev->cbd_notify_type, dev->cbd_layout_type,
dev->cbd_immediate);
}
+ args->ndevs = n;
+ dprintk("%s: ndevs %d\n", __func__, args->ndevs);
+ return 0;
+err:
+ kfree(args->devs);
out:
+ args->devs = NULL;
+ args->ndevs = 0;
dprintk("%s: status %d ndevs %d\n",
__func__, ntohl(status), args->ndevs);
return status;
-err:
- kfree(args->devs);
- goto out;
}
static __be32 decode_sessionid(struct xdr_stream *xdr,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a05f77f9c21e..35abe63655a9 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -176,6 +176,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
INIT_LIST_HEAD(&clp->cl_superblocks);
clp->cl_rpcclient = ERR_PTR(-EINVAL);
+ clp->cl_flags = cl_init->init_flags;
clp->cl_proto = cl_init->proto;
clp->cl_nconnect = cl_init->nconnect;
clp->cl_net = get_net(cl_init->net);
@@ -399,7 +400,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
if (cl_init->hostname == NULL) {
WARN_ON(1);
- return NULL;
+ return ERR_PTR(-EINVAL);
}
/* see if the client already exists */
@@ -419,7 +420,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
list_add_tail(&new->cl_share_link,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
- new->cl_flags = cl_init->init_flags;
return rpc_ops->init_client(new, cl_init);
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 188b17a3b19e..28ceee102d0b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1073,6 +1073,15 @@ out_force:
goto out;
}
+static void nfs_mark_dir_for_revalidate(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ spin_lock(&inode->i_lock);
+ nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
+ spin_unlock(&inode->i_lock);
+}
+
/*
* We judge how long we want to trust negative
* dentries by looking at the parent inode mtime.
@@ -1107,19 +1116,14 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
__func__, dentry);
return 1;
case 0:
- nfs_mark_for_revalidate(dir);
- if (inode && S_ISDIR(inode->i_mode)) {
- /* Purge readdir caches. */
- nfs_zap_caches(inode);
- /*
- * We can't d_drop the root of a disconnected tree:
- * its d_hash is on the s_anon list and d_drop() would hide
- * it from shrink_dcache_for_unmount(), leading to busy
- * inodes on unmount and further oopses.
- */
- if (IS_ROOT(dentry))
- return 1;
- }
+ /*
+ * We can't d_drop the root of a disconnected tree:
+ * its d_hash is on the s_anon list and d_drop() would hide
+ * it from shrink_dcache_for_unmount(), leading to busy
+ * inodes on unmount and further oopses.
+ */
+ if (inode && IS_ROOT(dentry))
+ return 1;
dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
__func__, dentry);
return 0;
@@ -1188,6 +1192,13 @@ out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
+
+ /*
+ * If the lookup failed despite the dentry change attribute being
+ * a match, then we should revalidate the directory cache.
+ */
+ if (!ret && nfs_verify_change_attribute(dir, dentry->d_time))
+ nfs_mark_dir_for_revalidate(dir);
return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
}
@@ -1230,7 +1241,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
error = nfs_lookup_verify_inode(inode, flags);
if (error) {
if (error == -ESTALE)
- nfs_zap_caches(dir);
+ nfs_mark_dir_for_revalidate(dir);
goto out_bad;
}
nfs_advise_use_readdirplus(dir);
@@ -1627,6 +1638,24 @@ out:
no_open:
res = nfs_lookup(dir, dentry, lookup_flags);
+ if (!res) {
+ inode = d_inode(dentry);
+ if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
+ res = ERR_PTR(-ENOTDIR);
+ else if (inode && S_ISREG(inode->i_mode))
+ res = ERR_PTR(-EOPENSTALE);
+ } else if (!IS_ERR(res)) {
+ inode = d_inode(res);
+ if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) {
+ dput(res);
+ res = ERR_PTR(-ENOTDIR);
+ } else if (inode && S_ISREG(inode->i_mode)) {
+ dput(res);
+ res = ERR_PTR(-EOPENSTALE);
+ }
+ }
if (switched) {
d_lookup_done(dentry);
if (!res)
@@ -1725,7 +1754,6 @@ out:
dput(parent);
return d;
out_error:
- nfs_mark_for_revalidate(dir);
d = ERR_PTR(error);
goto out;
}
@@ -2025,6 +2053,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
trace_nfs_link_enter(inode, dir, dentry);
d_drop(dentry);
+ if (S_ISREG(inode->i_mode))
+ nfs_sync_inode(inode);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
ihold(inode);
@@ -2113,6 +2143,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
+ if (S_ISREG(old_inode->i_mode))
+ nfs_sync_inode(old_inode);
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
if (IS_ERR(task)) {
error = PTR_ERR(task);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6b0bf4ebd812..0682037f972b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -272,8 +272,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (iov_iter_rw(iter) == READ)
- return nfs_file_direct_read(iocb, iter);
- return nfs_file_direct_write(iocb, iter);
+ return nfs_file_direct_read(iocb, iter, true);
+ return nfs_file_direct_write(iocb, iter, true);
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -524,6 +524,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers into which to read data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
*
* We use this function for direct reads instead of calling
* generic_file_aio_read() in order to avoid gfar's check to see if
@@ -539,7 +540,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* client must read the updated atime from the server back into its
* cache.
*/
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -581,12 +583,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (iter_is_iovec(iter))
dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
- nfs_start_io_direct(inode);
+ if (!swap)
+ nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count;
requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
- nfs_end_io_direct(inode);
+ if (!swap)
+ nfs_end_io_direct(inode);
if (requested > 0) {
result = nfs_direct_wait(dreq);
@@ -851,7 +855,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
*/
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
struct iov_iter *iter,
- loff_t pos)
+ loff_t pos, int ioflags)
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
@@ -859,7 +863,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
- nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
+ nfs_pageio_init_write(&desc, inode, ioflags, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
get_dreq(dreq);
@@ -937,6 +941,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers from which to write data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
*
* We use this function for direct writes instead of calling
* generic_file_aio_write() in order to avoid taking the inode
@@ -953,7 +958,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
-ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
{
ssize_t result = -EINVAL, requested;
size_t count;
@@ -967,7 +973,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
- result = generic_write_checks(iocb, iter);
+ if (swap)
+ /* bypass generic checks */
+ result = iov_iter_count(iter);
+ else
+ result = generic_write_checks(iocb, iter);
if (result <= 0)
return result;
count = result;
@@ -997,16 +1007,22 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
- nfs_start_io_direct(inode);
+ if (swap) {
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_STABLE);
+ } else {
+ nfs_start_io_direct(inode);
- requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_COND_STABLE);
- if (mapping->nrpages) {
- invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- }
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
+ }
- nfs_end_io_direct(inode);
+ nfs_end_io_direct(inode);
+ }
if (requested > 0) {
result = nfs_direct_wait(dreq);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 387a2cfa7e17..73415970af38 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -161,7 +161,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
ssize_t result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_read(iocb, to);
+ return nfs_file_direct_read(iocb, to, false);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
@@ -609,7 +609,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
return result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_write(iocb, from);
+ return nfs_file_direct_write(iocb, from, false);
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, iov_iter_count(from), (long long) iocb->ki_pos);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index c9b605f6c9cb..98b74cdabb99 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -717,7 +717,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
if (unlikely(!p))
goto out_err;
fl->fh_array[i]->size = be32_to_cpup(p++);
- if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
+ if (fl->fh_array[i]->size > NFS_MAXFHSIZE) {
printk(KERN_ERR "NFS: Too big fh %d received %d\n",
i, fl->fh_array[i]->size);
goto out_err;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 1741d902b0d8..fa1c920afb49 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -103,7 +103,7 @@ static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
if (unlikely(!p))
return -ENOBUFS;
fh->size = be32_to_cpup(p++);
- if (fh->size > sizeof(struct nfs_fh)) {
+ if (fh->size > NFS_MAXFHSIZE) {
printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
fh->size);
return -EOVERFLOW;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 3eda40a320a5..1f12297109b4 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -378,10 +378,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
goto noconnect;
ds = mirror->mirror_ds->ds;
+ if (READ_ONCE(ds->ds_clp))
+ goto out;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
smp_rmb();
- if (ds->ds_clp)
- goto out;
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 53604cc090ca..3bddf5332b6d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -791,12 +791,9 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
goto out_no_update;
/* Flush out writes to the server in order to update c/mtime. */
- if ((request_mask & (STATX_CTIME|STATX_MTIME)) &&
- S_ISREG(inode->i_mode)) {
- err = filemap_write_and_wait(inode->i_mapping);
- if (err)
- goto out;
- }
+ if ((request_mask & (STATX_CTIME | STATX_MTIME)) &&
+ S_ISREG(inode->i_mode))
+ filemap_write_and_wait(inode->i_mapping);
/*
* We may force a getattr if the user cares about atime.
@@ -1048,6 +1045,7 @@ EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
{
filp->private_data = get_nfs_open_context(ctx);
+ set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
if (list_empty(&ctx->list))
nfs_inode_attach_open_context(ctx);
}
@@ -1067,6 +1065,8 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct
continue;
if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
continue;
+ if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags))
+ continue;
ctx = get_nfs_open_context(pos);
if (ctx)
break;
@@ -1082,6 +1082,7 @@ void nfs_file_clear_open_context(struct file *filp)
if (ctx) {
struct inode *inode = d_inode(ctx->dentry);
+ clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
/*
* We fatal error on write before. Try to writeback
* every page again.
@@ -1618,10 +1619,10 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
*/
static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
{
- const struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long attr_gencount = NFS_I(inode)->attr_gencount;
- return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
- ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
+ return (long)(fattr->gencount - attr_gencount) > 0 ||
+ (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0;
}
static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
@@ -2049,7 +2050,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfsi->attrtimeo_timestamp = now;
}
/* Set the barrier to be more recent than this fattr */
- if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+ if ((long)(fattr->gencount - nfsi->attr_gencount) > 0)
nfsi->attr_gencount = fattr->gencount;
}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 887f9136a9db..af557dc2cfe1 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -953,7 +953,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_filename_inline(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return error;
+ return -EAGAIN;
/*
* The type (size and byte order) of nfscookie isn't defined in
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 9eb2f1a503ab..01a03ca36659 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -350,7 +350,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
break;
case NFS3_CREATE_UNCHECKED:
- goto out;
+ goto out_release_acls;
}
nfs_fattr_init(data->res.dir_attr);
nfs_fattr_init(data->res.fattr);
@@ -717,7 +717,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
break;
default:
status = -EINVAL;
- goto out;
+ goto out_release_acls;
}
d_alias = nfs3_do_create(dir, dentry, data);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 1f60ab2535ee..84369d51353a 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -35,6 +35,7 @@
*/
#define NFS3_fhandle_sz (1+16)
#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */
+#define NFS3_post_op_fh_sz (1+NFS3_fh_sz)
#define NFS3_sattr_sz (15)
#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2))
#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2))
@@ -72,7 +73,7 @@
#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1)
#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1)
#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4)
-#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
+#define NFS3_createres_sz (1+NFS3_post_op_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz))
#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1)
@@ -1967,7 +1968,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
bool plus)
{
struct user_namespace *userns = rpc_userns(entry->server->client);
- struct nfs_entry old = *entry;
__be32 *p;
int error;
u64 new_cookie;
@@ -1987,15 +1987,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_fileid3(xdr, &entry->ino);
if (unlikely(error))
- return error;
+ return -EAGAIN;
error = decode_inline_filename3(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return error;
+ return -EAGAIN;
error = decode_cookie3(xdr, &new_cookie);
if (unlikely(error))
- return error;
+ return -EAGAIN;
entry->d_type = DT_UNKNOWN;
@@ -2003,7 +2003,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->fattr->valid = 0;
error = decode_post_op_attr(xdr, entry->fattr, userns);
if (unlikely(error))
- return error;
+ return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
@@ -2018,11 +2018,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return -EAGAIN;
if (*p != xdr_zero) {
error = decode_nfs_fh3(xdr, entry->fh);
- if (unlikely(error)) {
- if (error == -E2BIG)
- goto out_truncated;
- return error;
- }
+ if (unlikely(error))
+ return -EAGAIN;
} else
zero_nfs_fh3(entry->fh);
}
@@ -2031,11 +2028,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->cookie = new_cookie;
return 0;
-
-out_truncated:
- dprintk("NFS: directory entry contains invalid file handle\n");
- *entry = old;
- return -EAGAIN;
}
/*
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 9b61c80a93e9..504812ea4bc2 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -59,7 +59,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
loff_t offset, loff_t len)
{
- struct nfs_server *server = NFS_SERVER(file_inode(filep));
+ struct inode *inode = file_inode(filep);
+ struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = { };
struct nfs_lock_context *lock;
int err;
@@ -68,9 +69,13 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
if (IS_ERR(lock))
return PTR_ERR(lock);
- exception.inode = file_inode(filep);
+ exception.inode = inode;
exception.state = lock->open_context->state;
+ err = nfs_sync_inode(inode);
+ if (err)
+ goto out;
+
do {
err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
if (err == -ENOTSUPP) {
@@ -79,7 +84,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
}
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
-
+out:
nfs_put_lock_context(lock);
return err;
}
@@ -117,16 +122,13 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
return -EOPNOTSUPP;
inode_lock(inode);
- err = nfs_sync_inode(inode);
- if (err)
- goto out_unlock;
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == 0)
truncate_pagecache_range(inode, offset, (offset + len) -1);
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
-out_unlock:
+
inode_unlock(inode);
return err;
}
@@ -293,8 +295,9 @@ static ssize_t _nfs42_proc_copy(struct file *src,
goto out;
}
- truncate_pagecache_range(dst_inode, pos_dst,
- pos_dst + res->write_res.count);
+ WARN_ON_ONCE(invalidate_inode_pages2_range(dst_inode->i_mapping,
+ pos_dst >> PAGE_SHIFT,
+ (pos_dst + res->write_res.count - 1) >> PAGE_SHIFT));
status = res->write_res.count;
out:
@@ -498,7 +501,10 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
if (status)
return status;
- return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
+ if (whence == SEEK_DATA && res.sr_eof)
+ return -NFS4ERR_NXIO;
+ else
+ return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
}
loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index aed865a84629..2b78f7b8d546 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -769,8 +769,7 @@ static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
status = decode_clone(xdr);
if (status)
goto out;
- status = decode_getfattr(xdr, res->dst_fattr, res->server);
-
+ decode_getfattr(xdr, res->dst_fattr, res->server);
out:
res->rpc_status = status;
return status;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c4a98cbda6dd..ebd77a301057 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -203,6 +203,7 @@ struct nfs4_exception {
struct inode *inode;
nfs4_stateid *stateid;
long timeout;
+ unsigned char task_is_privileged : 1;
unsigned char delay : 1,
recovering : 1,
retry : 1;
@@ -278,7 +279,8 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
struct nfs_fh *, struct nfs_fattr *);
int nfs4_replace_transport(struct nfs_server *server,
const struct nfs4_fs_locations *locations);
-
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+ size_t salen, struct net *net);
/* nfs4proc.c */
extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
extern int nfs4_async_handle_error(struct rpc_task *task,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 914feab64702..3671a51fe5eb 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -197,8 +197,11 @@ void nfs40_shutdown_client(struct nfs_client *clp)
struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
{
- int err;
+ char buf[INET6_ADDRSTRLEN + 1];
+ const char *ip_addr = cl_init->ip_addr;
struct nfs_client *clp = nfs_alloc_client(cl_init);
+ int err;
+
if (IS_ERR(clp))
return clp;
@@ -222,6 +225,44 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
init_waitqueue_head(&clp->cl_lock_waitq);
#endif
INIT_LIST_HEAD(&clp->pending_cb_stateids);
+
+ if (cl_init->minorversion != 0)
+ __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
+ __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+ __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
+
+ /*
+ * Set up the connection to the server before we add add to the
+ * global list.
+ */
+ err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
+ if (err == -EINVAL)
+ err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
+ if (err < 0)
+ goto error;
+
+ /* If no clientaddr= option was specified, find a usable cb address */
+ if (ip_addr == NULL) {
+ struct sockaddr_storage cb_addr;
+ struct sockaddr *sap = (struct sockaddr *)&cb_addr;
+
+ err = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
+ if (err < 0)
+ goto error;
+ err = rpc_ntop(sap, buf, sizeof(buf));
+ if (err < 0)
+ goto error;
+ ip_addr = (const char *)buf;
+ }
+ strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+
+ err = nfs_idmap_new(clp);
+ if (err < 0) {
+ dprintk("%s: failed to create idmapper. Error = %d\n",
+ __func__, err);
+ goto error;
+ }
+ __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
return clp;
error:
@@ -372,8 +413,6 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *cl_init)
{
- char buf[INET6_ADDRSTRLEN + 1];
- const char *ip_addr = cl_init->ip_addr;
struct nfs_client *old;
int error;
@@ -381,43 +420,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
/* the client is initialised already */
return clp;
- /* Check NFS protocol revision and initialize RPC op vector */
- clp->rpc_ops = &nfs_v4_clientops;
-
- if (clp->cl_minorversion != 0)
- __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
- __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
- __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
-
- error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
- if (error == -EINVAL)
- error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
- if (error < 0)
- goto error;
-
- /* If no clientaddr= option was specified, find a usable cb address */
- if (ip_addr == NULL) {
- struct sockaddr_storage cb_addr;
- struct sockaddr *sap = (struct sockaddr *)&cb_addr;
-
- error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
- if (error < 0)
- goto error;
- error = rpc_ntop(sap, buf, sizeof(buf));
- if (error < 0)
- goto error;
- ip_addr = (const char *)buf;
- }
- strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
-
- error = nfs_idmap_new(clp);
- if (error < 0) {
- dprintk("%s: failed to create idmapper. Error = %d\n",
- __func__, error);
- goto error;
- }
- __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
-
error = nfs4_init_client_minor_version(clp);
if (error < 0)
goto error;
@@ -435,8 +437,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
*/
nfs_mark_client_ready(clp, -EPERM);
}
- nfs_put_client(clp);
clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags);
+ nfs_put_client(clp);
return old;
error:
@@ -1291,8 +1293,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
}
nfs_put_client(clp);
- if (server->nfs_client->cl_hostname == NULL)
+ if (server->nfs_client->cl_hostname == NULL) {
server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+ if (server->nfs_client->cl_hostname == NULL)
+ return -ENOMEM;
+ }
nfs_server_insert_lists(server);
return nfs_probe_destination(server);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 6b31cb5f9c9d..7c73097b2f4e 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -168,7 +168,7 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
case SEEK_HOLE:
case SEEK_DATA:
ret = nfs42_proc_llseek(filep, offset, whence);
- if (ret != -ENOTSUPP)
+ if (ret != -EOPNOTSUPP)
return ret;
/* Fall through */
default:
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2e460c33ae48..768258848a68 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,8 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry,
return 0;
}
-static size_t nfs_parse_server_name(char *string, size_t len,
- struct sockaddr *sa, size_t salen, struct net *net)
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+ size_t salen, struct net *net)
{
ssize_t ret;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 30e44b33040a..cf3b00751ff6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -359,6 +359,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version)
+{
+ if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) {
+ fattr->pre_change_attr = version;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+}
+
static void nfs4_test_and_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
const struct cred *cred)
@@ -581,6 +589,8 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
goto out_retry;
}
if (exception->recovering) {
+ if (exception->task_is_privileged)
+ return -EDEADLOCK;
ret = nfs4_wait_clnt_recover(clp);
if (test_bit(NFS_MIG_FAILED, &server->mig_status))
return -EIO;
@@ -606,6 +616,8 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
goto out_retry;
}
if (exception->recovering) {
+ if (exception->task_is_privileged)
+ return -EDEADLOCK;
rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
@@ -1545,15 +1557,16 @@ static bool nfs_stateid_is_sequential(struct nfs4_state *state,
{
if (test_bit(NFS_OPEN_STATE, &state->flags)) {
/* The common case - we're updating to a new sequence number */
- if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
- nfs4_stateid_is_next(&state->open_stateid, stateid)) {
- return true;
+ if (nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ if (nfs4_stateid_is_next(&state->open_stateid, stateid))
+ return true;
+ return false;
}
- } else {
- /* This is the first OPEN in this generation */
- if (stateid->seqid == cpu_to_be32(1))
- return true;
+ /* The server returned a new stateid */
}
+ /* This is the first OPEN in this generation */
+ if (stateid->seqid == cpu_to_be32(1))
+ return true;
return false;
}
@@ -1647,7 +1660,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
rcu_read_unlock();
trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
- if (!signal_pending(current)) {
+ if (!fatal_signal_pending(current)) {
if (schedule_timeout(5*HZ) == 0)
status = -EAGAIN;
else
@@ -3416,7 +3429,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
write_sequnlock(&state->seqlock);
trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
- if (signal_pending(current))
+ if (fatal_signal_pending(current))
status = -EINTR;
else
if (schedule_timeout(5*HZ) != 0)
@@ -5754,6 +5767,9 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
int ret, i;
+ /* You can't remove system.nfs4_acl: */
+ if (buflen == 0)
+ return -EINVAL;
if (!nfs4_server_supports_acls(server))
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
@@ -5792,6 +5808,14 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
do {
err = __nfs4_proc_set_acl(inode, buf, buflen);
trace_nfs4_set_acl(inode, err);
+ if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) {
+ /*
+ * no need to retry since the kernel
+ * isn't involved in encoding the ACEs.
+ */
+ err = -EINVAL;
+ break;
+ }
err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
} while (exception.retry);
@@ -5830,7 +5854,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
return ret;
if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
return -ENOENT;
- return 0;
+ return label.len;
}
static int nfs4_get_security_label(struct inode *inode, void *buf,
@@ -6228,6 +6252,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
struct nfs4_exception exception = {
.inode = data->inode,
.stateid = &data->stateid,
+ .task_is_privileged = data->args.seq_args.sa_privileged,
};
if (!nfs4_sequence_done(task, &data->res.seq_res))
@@ -6290,7 +6315,9 @@ static void nfs4_delegreturn_release(void *calldata)
pnfs_roc_release(&data->lr.arg, &data->lr.res,
data->res.lr_ret);
if (inode) {
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+ nfs4_fattr_set_prechange(&data->fattr,
+ inode_peek_iversion_raw(inode));
+ nfs_refresh_inode(inode, &data->fattr);
nfs_iput_and_deactive(inode);
}
kfree(calldata);
@@ -6346,7 +6373,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
data = kzalloc(sizeof(*data), GFP_NOFS);
if (data == NULL)
return -ENOMEM;
- nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
nfs4_state_protect(server->nfs_client,
NFS_SP4_MACH_CRED_CLEANUP,
@@ -6374,6 +6400,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
}
}
+ if (!data->inode)
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
+ 1);
+ else
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
+ 0);
task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
msg.rpc_resp = &data->res;
@@ -7896,6 +7928,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_DEADSESSION:
nfs4_schedule_session_recovery(clp->cl_session,
task->tk_status);
+ return;
}
if (args->dir == NFS4_CDFC4_FORE_OR_BOTH &&
res->dir != NFS4_CDFS4_BOTH) {
@@ -9329,15 +9362,20 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
&task_setup_data.rpc_client, &msg);
dprintk("--> %s\n", __func__);
+ lrp->inode = nfs_igrab_and_active(lrp->args.inode);
if (!sync) {
- lrp->inode = nfs_igrab_and_active(lrp->args.inode);
if (!lrp->inode) {
nfs4_layoutreturn_release(lrp);
return -EAGAIN;
}
task_setup_data.flags |= RPC_TASK_ASYNC;
}
- nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, 0);
+ if (!lrp->inode)
+ nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
+ 1);
+ else
+ nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
+ 0);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ea680f619438..1d2b81a233bb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <linux/jiffies.h>
+#include <linux/sched/mm.h>
#include <linux/sunrpc/clnt.h>
@@ -2070,6 +2071,9 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
}
result = -NFS4ERR_NXIO;
+ if (!locations->nlocations)
+ goto out;
+
if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
dprintk("<-- %s: No fs_locations data, migration skipped\n",
__func__);
@@ -2501,9 +2505,17 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
static void nfs4_state_manager(struct nfs_client *clp)
{
+ unsigned int memflags;
int status = 0;
const char *section = "", *section_sep = "";
+ /*
+ * State recovery can deadlock if the direct reclaim code tries
+ * start NFS writeback. So ensure memory allocations are all
+ * GFP_NOFS.
+ */
+ memflags = memalloc_nofs_save();
+
/* Ensure exclusive access to NFSv4 state */
do {
clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
@@ -2597,6 +2609,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
}
+ memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
@@ -2613,6 +2626,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
return;
if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
return;
+ memflags = memalloc_nofs_save();
} while (refcount_read(&clp->cl_count) > 1 && !signalled());
goto out_drain;
@@ -2624,6 +2638,7 @@ out_error:
clp->cl_hostname, -status);
ssleep(1);
out_drain:
+ memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9a022a4fb964..2b7741fe42ea 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3683,8 +3683,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
if (unlikely(!p))
goto out_eio;
n = be32_to_cpup(p);
- if (n <= 0)
- goto out_eio;
for (res->nlocations = 0; res->nlocations < n; res->nlocations++) {
u32 m;
struct nfs4_fs_location *loc;
@@ -4187,10 +4185,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
} else
printk(KERN_WARNING "%s: label too long (%u)!\n",
__func__, len);
+ if (label && label->label)
+ dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n",
+ __func__, label->len, (char *)label->label,
+ label->len, label->pi, label->lfs);
}
- if (label && label->label)
- dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
- (char *)label->label, label->len, label->pi, label->lfs);
return status;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index f4407dd426bf..e3b85bfcfc7d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -986,15 +986,16 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *prev = NULL;
- if (mirror->pg_count != 0) {
- prev = nfs_list_entry(mirror->pg_list.prev);
- } else {
+ if (list_empty(&mirror->pg_list)) {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
if (desc->pg_error < 0)
return 0;
mirror->pg_base = req->wb_pgbase;
- }
+ mirror->pg_count = 0;
+ mirror->pg_recoalesce = 0;
+ } else
+ prev = nfs_list_entry(mirror->pg_list.prev);
if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) {
if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR)
@@ -1018,17 +1019,16 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
{
struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
if (!list_empty(&mirror->pg_list)) {
int error = desc->pg_ops->pg_doio(desc);
if (error < 0)
desc->pg_error = error;
- else
+ if (list_empty(&mirror->pg_list)) {
mirror->pg_bytes_written += mirror->pg_count;
- }
- if (list_empty(&mirror->pg_list)) {
- mirror->pg_count = 0;
- mirror->pg_base = 0;
+ mirror->pg_count = 0;
+ mirror->pg_base = 0;
+ mirror->pg_recoalesce = 0;
+ }
}
}
@@ -1122,7 +1122,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
do {
list_splice_init(&mirror->pg_list, &head);
- mirror->pg_bytes_written -= mirror->pg_count;
mirror->pg_count = 0;
mirror->pg_base = 0;
mirror->pg_recoalesce = 0;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4232f956bdac..0471b6e0da16 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -92,6 +92,17 @@ find_pnfs_driver(u32 id)
return local;
}
+const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
+{
+ return find_pnfs_driver(id);
+}
+
+void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
+{
+ if (ld)
+ module_put(ld->owner);
+}
+
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
@@ -1285,6 +1296,11 @@ _pnfs_return_layout(struct inode *ino)
{
struct pnfs_layout_hdr *lo = NULL;
struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
LIST_HEAD(tmp_list);
nfs4_stateid stateid;
int status = 0;
@@ -1311,16 +1327,10 @@ _pnfs_return_layout(struct inode *ino)
}
valid_layout = pnfs_layout_is_valid(lo);
pnfs_clear_layoutcommit(ino, &tmp_list);
- pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
+ pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);
- if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
- struct pnfs_layout_range range = {
- .iomode = IOMODE_ANY,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
- }
/* Don't send a LAYOUTRETURN if list was initially empty */
if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
@@ -2369,7 +2379,13 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
* We got an entirely new state ID. Mark all segments for the
* inode invalid, and retry the layoutget
*/
- pnfs_mark_layout_stateid_invalid(lo, &free_me);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .length = NFS4_MAX_UINT64,
+ };
+ pnfs_set_plh_return_info(lo, IOMODE_ANY, 0);
+ pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ &range, 0);
goto out_forget;
}
@@ -2388,6 +2404,7 @@ out_forget:
spin_unlock(&ino->i_lock);
lseg->pls_layout = lo;
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ pnfs_free_lseg_list(&free_me);
return ERR_PTR(-EAGAIN);
}
@@ -2420,6 +2437,9 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
assert_spin_locked(&lo->plh_inode->i_lock);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ tmp_list = &lo->plh_return_segs;
+
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
dprintk("%s: marking lseg %p iomode %d "
@@ -2427,6 +2447,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg, lseg->pls_range.iomode,
lseg->pls_range.offset,
lseg->pls_range.length);
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ tmp_list = &lo->plh_return_segs;
if (mark_lseg_invalid(lseg, tmp_list))
continue;
remaining++;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3d55edd6b25a..68339680bb7d 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -226,6 +226,8 @@ struct pnfs_devicelist {
extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id);
+extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld);
/* nfs4proc.c */
extern size_t max_response_pages(struct nfs_server *server);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 8b37e7f8e789..aff44a7b98f8 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -556,19 +556,16 @@ out:
}
EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
-static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+static int nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
{
might_sleep();
- wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
- TASK_KILLABLE);
+ return wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, TASK_KILLABLE);
}
static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
{
smp_mb__before_atomic();
- clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
- smp_mb__after_atomic();
- wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
+ clear_and_wake_up_bit(NFS4DS_CONNECTING, &ds->ds_state);
}
static struct nfs_client *(*get_v3_ds_connect)(
@@ -644,7 +641,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
}
smp_wmb();
- ds->ds_clp = clp;
+ WRITE_ONCE(ds->ds_clp, clp);
dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
out:
return status;
@@ -717,7 +714,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
}
smp_wmb();
- ds->ds_clp = clp;
+ WRITE_ONCE(ds->ds_clp, clp);
dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
out:
return status;
@@ -734,30 +731,33 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
{
int err;
-again:
- err = 0;
- if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
- if (version == 3) {
- err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
- retrans);
- } else if (version == 4) {
- err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
- retrans, minor_version);
- } else {
- dprintk("%s: unsupported DS version %d\n", __func__,
- version);
- err = -EPROTONOSUPPORT;
- }
+ do {
+ err = nfs4_wait_ds_connect(ds);
+ if (err || ds->ds_clp)
+ goto out;
+ if (nfs4_test_deviceid_unavailable(devid))
+ return -ENODEV;
+ } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0);
- nfs4_clear_ds_conn_bit(ds);
- } else {
- nfs4_wait_ds_connect(ds);
+ if (ds->ds_clp)
+ goto connect_done;
- /* what was waited on didn't connect AND didn't mark unavail */
- if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid))
- goto again;
+ switch (version) {
+ case 3:
+ err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans);
+ break;
+ case 4:
+ err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, retrans,
+ minor_version);
+ break;
+ default:
+ dprintk("%s: unsupported DS version %d\n", __func__, version);
+ err = -EPROTONOSUPPORT;
}
+connect_done:
+ nfs4_clear_ds_conn_bit(ds);
+out:
/*
* At this point the ds->ds_clp should be ready, but it might have
* hit an error.
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 613c3ef23e07..30d8e7bc1cef 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1050,25 +1050,11 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_page *req, *tmp;
int ret = 0;
-restart:
list_for_each_entry_safe(req, tmp, src, wb_list) {
kref_get(&req->wb_kref);
if (!nfs_lock_request(req)) {
- int status;
-
- /* Prevent deadlock with nfs_lock_and_join_requests */
- if (!list_empty(dst)) {
- nfs_release_request(req);
- continue;
- }
- /* Ensure we make progress to prevent livelock */
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- status = nfs_wait_on_request(req);
nfs_release_request(req);
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- if (status < 0)
- break;
- goto restart;
+ continue;
}
nfs_request_remove_commit_list(req, cinfo);
clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -1935,6 +1921,7 @@ static int __nfs_commit_inode(struct inode *inode, int how,
int may_wait = how & FLUSH_SYNC;
int ret, nscan;
+ how &= ~FLUSH_SYNC;
nfs_init_cinfo_from_inode(&cinfo, inode);
nfs_commit_begin(cinfo.mds);
for (;;) {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f2f81561ebb6..4d6e71335bce 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -73,6 +73,7 @@ config NFSD_V4
select NFSD_V3
select FS_POSIX_ACL
select SUNRPC_GSS
+ select CRYPTO
select CRYPTO_MD5
select CRYPTO_SHA256
select GRACE_PERIOD
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 51c08ae79063..79ad6b2c9608 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -44,6 +44,17 @@ struct nfsd_fcache_bucket {
static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
+struct nfsd_fcache_disposal {
+ struct list_head list;
+ struct work_struct work;
+ struct net *net;
+ spinlock_t lock;
+ struct list_head freeme;
+ struct rcu_head rcu;
+};
+
+struct workqueue_struct *nfsd_filecache_wq __read_mostly;
+
static struct kmem_cache *nfsd_file_slab;
static struct kmem_cache *nfsd_file_mark_slab;
static struct nfsd_fcache_bucket *nfsd_file_hashtbl;
@@ -52,32 +63,21 @@ static long nfsd_file_lru_flags;
static struct fsnotify_group *nfsd_file_fsnotify_group;
static atomic_long_t nfsd_filecache_count;
static struct delayed_work nfsd_filecache_laundrette;
+static DEFINE_SPINLOCK(laundrette_lock);
+static LIST_HEAD(laundrettes);
-enum nfsd_file_laundrette_ctl {
- NFSD_FILE_LAUNDRETTE_NOFLUSH = 0,
- NFSD_FILE_LAUNDRETTE_MAY_FLUSH
-};
+static void nfsd_file_gc(void);
static void
-nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl)
+nfsd_file_schedule_laundrette(void)
{
long count = atomic_long_read(&nfsd_filecache_count);
if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
return;
- /* Be more aggressive about scanning if over the threshold */
- if (count > NFSD_FILE_LRU_THRESHOLD)
- mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0);
- else
- schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY);
-
- if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH)
- return;
-
- /* ...and don't delay flushing if we're out of control */
- if (count >= NFSD_FILE_LRU_LIMIT)
- flush_delayed_work(&nfsd_filecache_laundrette);
+ queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
+ NFSD_LAUNDRETTE_DELAY);
}
static void
@@ -260,8 +260,6 @@ nfsd_file_do_unhash(struct nfsd_file *nf)
nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
--nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
hlist_del_rcu(&nf->nf_node);
- if (!list_empty(&nf->nf_lru))
- list_lru_del(&nfsd_file_lru, &nf->nf_lru);
atomic_long_dec(&nfsd_filecache_count);
}
@@ -270,6 +268,8 @@ nfsd_file_unhash(struct nfsd_file *nf)
{
if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
nfsd_file_do_unhash(nf);
+ if (!list_empty(&nf->nf_lru))
+ list_lru_del(&nfsd_file_lru, &nf->nf_lru);
return true;
}
return false;
@@ -316,7 +316,9 @@ nfsd_file_put(struct nfsd_file *nf)
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused)
- nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH);
+ nfsd_file_schedule_laundrette();
+ if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
+ nfsd_file_gc();
}
struct nfsd_file *
@@ -357,6 +359,58 @@ nfsd_file_dispose_list_sync(struct list_head *dispose)
flush_delayed_fput();
}
+static void
+nfsd_file_list_remove_disposal(struct list_head *dst,
+ struct nfsd_fcache_disposal *l)
+{
+ spin_lock(&l->lock);
+ list_splice_init(&l->freeme, dst);
+ spin_unlock(&l->lock);
+}
+
+static void
+nfsd_file_list_add_disposal(struct list_head *files, struct net *net)
+{
+ struct nfsd_fcache_disposal *l;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(l, &laundrettes, list) {
+ if (l->net == net) {
+ spin_lock(&l->lock);
+ list_splice_tail_init(files, &l->freeme);
+ spin_unlock(&l->lock);
+ queue_work(nfsd_filecache_wq, &l->work);
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void
+nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src,
+ struct net *net)
+{
+ struct nfsd_file *nf, *tmp;
+
+ list_for_each_entry_safe(nf, tmp, src, nf_lru) {
+ if (nf->nf_net == net)
+ list_move_tail(&nf->nf_lru, dst);
+ }
+}
+
+static void
+nfsd_file_dispose_list_delayed(struct list_head *dispose)
+{
+ LIST_HEAD(list);
+ struct nfsd_file *nf;
+
+ while(!list_empty(dispose)) {
+ nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+ nfsd_file_list_add_pernet(&list, dispose, nf->nf_net);
+ nfsd_file_list_add_disposal(&list, nf->nf_net);
+ }
+}
+
/*
* Note this can deadlock with nfsd_file_cache_purge.
*/
@@ -403,18 +457,40 @@ out_skip:
return LRU_SKIP;
}
-static void
-nfsd_file_lru_dispose(struct list_head *head)
+static unsigned long
+nfsd_file_lru_walk_list(struct shrink_control *sc)
{
- while(!list_empty(head)) {
- struct nfsd_file *nf = list_first_entry(head,
- struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
+ LIST_HEAD(head);
+ struct nfsd_file *nf;
+ unsigned long ret;
+
+ if (sc)
+ ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
+ nfsd_file_lru_cb, &head);
+ else
+ ret = list_lru_walk(&nfsd_file_lru,
+ nfsd_file_lru_cb,
+ &head, LONG_MAX);
+ list_for_each_entry(nf, &head, nf_lru) {
spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
nfsd_file_do_unhash(nf);
spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
- nfsd_file_put_noref(nf);
}
+ nfsd_file_dispose_list_delayed(&head);
+ return ret;
+}
+
+static void
+nfsd_file_gc(void)
+{
+ nfsd_file_lru_walk_list(NULL);
+}
+
+static void
+nfsd_file_gc_worker(struct work_struct *work)
+{
+ nfsd_file_gc();
+ nfsd_file_schedule_laundrette();
}
static unsigned long
@@ -426,12 +502,7 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
static unsigned long
nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
{
- LIST_HEAD(head);
- unsigned long ret;
-
- ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head);
- nfsd_file_lru_dispose(&head);
- return ret;
+ return nfsd_file_lru_walk_list(sc);
}
static struct shrinker nfsd_file_shrinker = {
@@ -493,7 +564,7 @@ nfsd_file_close_inode(struct inode *inode)
__nfsd_file_close_inode(inode, hashval, &dispose);
trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
- nfsd_file_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
/**
@@ -509,16 +580,11 @@ static void
nfsd_file_delayed_close(struct work_struct *work)
{
LIST_HEAD(head);
+ struct nfsd_fcache_disposal *l = container_of(work,
+ struct nfsd_fcache_disposal, work);
- list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX);
-
- if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags))
- nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH);
-
- if (!list_empty(&head)) {
- nfsd_file_lru_dispose(&head);
- flush_delayed_fput();
- }
+ nfsd_file_list_remove_disposal(&head, l);
+ nfsd_file_dispose_list(&head);
}
static int
@@ -579,6 +645,10 @@ nfsd_file_cache_init(void)
if (nfsd_file_hashtbl)
return 0;
+ nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0);
+ if (!nfsd_filecache_wq)
+ goto out;
+
nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
if (!nfsd_file_hashtbl) {
@@ -632,7 +702,7 @@ nfsd_file_cache_init(void)
spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
}
- INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close);
+ INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker);
out:
return ret;
out_notifier:
@@ -648,6 +718,8 @@ out_err:
nfsd_file_mark_slab = NULL;
kfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
+ destroy_workqueue(nfsd_filecache_wq);
+ nfsd_filecache_wq = NULL;
goto out;
}
@@ -686,6 +758,88 @@ nfsd_file_cache_purge(struct net *net)
}
}
+static struct nfsd_fcache_disposal *
+nfsd_alloc_fcache_disposal(struct net *net)
+{
+ struct nfsd_fcache_disposal *l;
+
+ l = kmalloc(sizeof(*l), GFP_KERNEL);
+ if (!l)
+ return NULL;
+ INIT_WORK(&l->work, nfsd_file_delayed_close);
+ l->net = net;
+ spin_lock_init(&l->lock);
+ INIT_LIST_HEAD(&l->freeme);
+ return l;
+}
+
+static void
+nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
+{
+ rcu_assign_pointer(l->net, NULL);
+ cancel_work_sync(&l->work);
+ nfsd_file_dispose_list(&l->freeme);
+ kfree_rcu(l, rcu);
+}
+
+static void
+nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l)
+{
+ spin_lock(&laundrette_lock);
+ list_add_tail_rcu(&l->list, &laundrettes);
+ spin_unlock(&laundrette_lock);
+}
+
+static void
+nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l)
+{
+ spin_lock(&laundrette_lock);
+ list_del_rcu(&l->list);
+ spin_unlock(&laundrette_lock);
+}
+
+static int
+nfsd_alloc_fcache_disposal_net(struct net *net)
+{
+ struct nfsd_fcache_disposal *l;
+
+ l = nfsd_alloc_fcache_disposal(net);
+ if (!l)
+ return -ENOMEM;
+ nfsd_add_fcache_disposal(l);
+ return 0;
+}
+
+static void
+nfsd_free_fcache_disposal_net(struct net *net)
+{
+ struct nfsd_fcache_disposal *l;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(l, &laundrettes, list) {
+ if (l->net != net)
+ continue;
+ nfsd_del_fcache_disposal(l);
+ rcu_read_unlock();
+ nfsd_free_fcache_disposal(l);
+ return;
+ }
+ rcu_read_unlock();
+}
+
+int
+nfsd_file_cache_start_net(struct net *net)
+{
+ return nfsd_alloc_fcache_disposal_net(net);
+}
+
+void
+nfsd_file_cache_shutdown_net(struct net *net)
+{
+ nfsd_file_cache_purge(net);
+ nfsd_free_fcache_disposal_net(net);
+}
+
void
nfsd_file_cache_shutdown(void)
{
@@ -712,6 +866,8 @@ nfsd_file_cache_shutdown(void)
nfsd_file_mark_slab = NULL;
kfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
+ destroy_workqueue(nfsd_filecache_wq);
+ nfsd_filecache_wq = NULL;
}
static bool
@@ -751,6 +907,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
continue;
if (!nfsd_match_cred(nf->nf_cred, current_cred()))
continue;
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+ continue;
if (nfsd_file_get(nf) != NULL)
return nf;
}
@@ -879,7 +1037,8 @@ open_file:
nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
nfsd_file_hashtbl[hashval].nfb_count);
spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
- atomic_long_inc(&nfsd_filecache_count);
+ if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD)
+ nfsd_file_gc();
nf->nf_mark = nfsd_file_mark_find_or_create(nf);
if (nf->nf_mark)
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 851d9abf54c2..79a7d6808d97 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -51,6 +51,8 @@ struct nfsd_file {
int nfsd_file_cache_init(void);
void nfsd_file_cache_purge(struct net *);
void nfsd_file_cache_shutdown(void);
+int nfsd_file_cache_start_net(struct net *net);
+void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index cea68d8411ac..bb97c020c96b 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -195,6 +195,11 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
(unsigned long long) argp->offset,
argp->stable? " stable" : "");
+ resp->status = nfserr_fbig;
+ if (argp->offset > (u64)OFFSET_MAX ||
+ argp->offset + argp->len > (u64)OFFSET_MAX)
+ return rpc_success;
+
fh_copy(&resp->fh, &argp->fh);
resp->committed = argp->stable;
nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 86e5658651f1..8f077e66e613 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -857,9 +857,14 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
if (isdotent(name, namlen)) {
if (namlen == 2) {
dchild = dget_parent(dparent);
- /* filesystem root - cannot return filehandle for ".." */
+ /*
+ * Don't return filehandle for ".." if we're at
+ * the filesystem or export root:
+ */
if (dchild == dparent)
goto out;
+ if (dparent == exp->ex_path.dentry)
+ goto out;
} else
dchild = dget(dparent);
} else
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index efe55d101b0e..3c50d18fe8a9 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1121,6 +1121,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case -EIO:
case -ETIMEDOUT:
+ case -EACCES:
nfsd4_mark_cb_down(clp, task->tk_status);
}
break;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4798667af647..452ed633a2c7 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -992,8 +992,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unsigned long cnt;
int nvecs;
- if (write->wr_offset >= OFFSET_MAX)
- return nfserr_inval;
+ if (write->wr_offset > (u64)OFFSET_MAX ||
+ write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX)
+ return nfserr_fbig;
cnt = write->wr_buflen;
trace_nfsd_write_start(rqstp, &cstate->current_fh,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index c35c0ebaf722..7d408957ed62 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -2177,6 +2177,7 @@ static struct notifier_block nfsd4_cld_block = {
int
register_cld_notifier(void)
{
+ WARN_ON(!nfsd_net_id);
return rpc_pipefs_notifier_register(&nfsd4_cld_block);
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8cb2f744dde6..62eb78ac7437 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1041,6 +1041,11 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
return 0;
}
+static bool delegation_hashed(struct nfs4_delegation *dp)
+{
+ return !(list_empty(&dp->dl_perfile));
+}
+
static bool
unhash_delegation_locked(struct nfs4_delegation *dp)
{
@@ -1048,7 +1053,7 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
lockdep_assert_held(&state_lock);
- if (list_empty(&dp->dl_perfile))
+ if (!delegation_hashed(dp))
return false;
dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
@@ -2572,9 +2577,9 @@ static void force_expire_client(struct nfs4_client *clp)
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
bool already_expired;
- spin_lock(&clp->cl_lock);
+ spin_lock(&nn->client_lock);
clp->cl_time = 0;
- spin_unlock(&clp->cl_lock);
+ spin_unlock(&nn->client_lock);
wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0);
spin_lock(&nn->client_lock);
@@ -3936,8 +3941,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
status = nfserr_clid_inuse;
if (client_has_state(old)
&& !same_creds(&unconf->cl_cred,
- &old->cl_cred))
+ &old->cl_cred)) {
+ old = NULL;
goto out;
+ }
status = mark_client_expired_locked(old);
if (status) {
old = NULL;
@@ -4406,7 +4413,7 @@ static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
* queued for a lease break. Don't queue it again.
*/
spin_lock(&state_lock);
- if (dp->dl_time == 0) {
+ if (delegation_hashed(dp) && dp->dl_time == 0) {
dp->dl_time = get_seconds();
list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d6f244559e75..e61d9c435957 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3131,15 +3131,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
goto fail;
cd->rd_maxcount -= entry_bytes;
/*
- * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
- * let's always let through the first entry, at least:
+ * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and
+ * notes that it could be zero. If it is zero, then the server
+ * should enforce only the rd_maxcount value.
*/
- if (!cd->rd_dircount)
- goto fail;
- name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
- if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
- goto fail;
- cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+ if (cd->rd_dircount) {
+ name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
+ if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+ goto fail;
+ cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+ if (!cd->rd_dircount)
+ cd->rd_maxcount = 0;
+ }
cd->cookie_offset = cookie_offset;
skip_entry:
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 4a258065188e..670e97dd67f0 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -173,14 +173,10 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
if (status)
goto out_nomem;
- nn->drc_hashtbl = kcalloc(hashsize,
- sizeof(*nn->drc_hashtbl), GFP_KERNEL);
- if (!nn->drc_hashtbl) {
- nn->drc_hashtbl = vzalloc(array_size(hashsize,
- sizeof(*nn->drc_hashtbl)));
- if (!nn->drc_hashtbl)
- goto out_shrinker;
- }
+ nn->drc_hashtbl = kvzalloc(array_size(hashsize,
+ sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
+ if (!nn->drc_hashtbl)
+ goto out_shrinker;
for (i = 0; i < hashsize; i++) {
INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index be418fccc9d8..055cc0458f27 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -792,7 +792,10 @@ out_close:
svc_xprt_put(xprt);
}
out_err:
- nfsd_destroy(net);
+ if (!list_empty(&nn->nfsd_serv->sv_permsocks))
+ nn->nfsd_serv->sv_nrthreads--;
+ else
+ nfsd_destroy(net);
return err;
}
@@ -1244,7 +1247,8 @@ static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
clear_ncl(d_inode(dentry));
dget(dentry);
ret = simple_unlink(dir, dentry);
- d_delete(dentry);
+ d_drop(dentry);
+ fsnotify_unlink(dir, dentry);
dput(dentry);
WARN_ON_ONCE(ret);
}
@@ -1333,8 +1337,8 @@ void nfsd_client_rmdir(struct dentry *dentry)
dget(dentry);
ret = simple_rmdir(dir, dentry);
WARN_ON_ONCE(ret);
+ d_drop(dentry);
fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
dput(dentry);
inode_unlock(dir);
}
@@ -1523,15 +1527,9 @@ static int __init init_nfsd(void)
int retval;
printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
- retval = register_pernet_subsys(&nfsd_net_ops);
- if (retval < 0)
- return retval;
- retval = register_cld_notifier();
- if (retval)
- goto out_unregister_pernet;
retval = nfsd4_init_slabs();
if (retval)
- goto out_unregister_notifier;
+ return retval;
retval = nfsd4_init_pnfs();
if (retval)
goto out_free_slabs;
@@ -1546,9 +1544,19 @@ static int __init init_nfsd(void)
goto out_free_lockd;
retval = register_filesystem(&nfsd_fs_type);
if (retval)
+ goto out_free_exports;
+ retval = register_pernet_subsys(&nfsd_net_ops);
+ if (retval < 0)
+ goto out_free_filesystem;
+ retval = register_cld_notifier();
+ if (retval)
goto out_free_all;
return 0;
out_free_all:
+ unregister_pernet_subsys(&nfsd_net_ops);
+out_free_filesystem:
+ unregister_filesystem(&nfsd_fs_type);
+out_free_exports:
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
out_free_lockd:
@@ -1560,15 +1568,13 @@ out_free_stat:
nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
-out_unregister_notifier:
- unregister_cld_notifier();
-out_unregister_pernet:
- unregister_pernet_subsys(&nfsd_net_ops);
return retval;
}
static void __exit exit_nfsd(void)
{
+ unregister_cld_notifier();
+ unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
@@ -1578,8 +1584,6 @@ static void __exit exit_nfsd(void)
nfsd4_exit_pnfs();
nfsd_fault_inject_cleanup();
unregister_filesystem(&nfsd_fs_type);
- unregister_cld_notifier();
- unregister_pernet_subsys(&nfsd_net_ops);
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 754c763374dd..4aca93e11af7 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -230,7 +230,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
unsigned long cnt = argp->len;
unsigned int nvecs;
- dprintk("nfsd: WRITE %s %d bytes at %d\n",
+ dprintk("nfsd: WRITE %s %u bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 155a4e43b24e..d63cdda1782d 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -394,13 +394,18 @@ static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cre
nn->lockd_up = 1;
}
- ret = nfs4_state_start_net(net);
+ ret = nfsd_file_cache_start_net(net);
if (ret)
goto out_lockd;
+ ret = nfs4_state_start_net(net);
+ if (ret)
+ goto out_filecache;
nn->nfsd_net_up = true;
return 0;
+out_filecache:
+ nfsd_file_cache_shutdown_net(net);
out_lockd:
if (nn->lockd_up) {
lockd_down(net);
@@ -415,7 +420,7 @@ static void nfsd_shutdown_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- nfsd_file_cache_purge(net);
+ nfsd_file_cache_shutdown_net(net);
nfs4_state_shutdown_net(net);
if (nn->lockd_up) {
lockd_down(net);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index b073bdc2e6e8..127db5351d01 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -53,14 +53,14 @@ TRACE_EVENT(nfsd_compound_status,
DECLARE_EVENT_CLASS(nfsd_io_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
- loff_t offset,
- unsigned long len),
+ u64 offset,
+ u32 len),
TP_ARGS(rqstp, fhp, offset, len),
TP_STRUCT__entry(
__field(u32, xid)
__field(u32, fh_hash)
- __field(loff_t, offset)
- __field(unsigned long, len)
+ __field(u64, offset)
+ __field(u32, len)
),
TP_fast_assign(
__entry->xid = be32_to_cpu(rqstp->rq_xid);
@@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
__entry->offset = offset;
__entry->len = len;
),
- TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu",
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u",
__entry->xid, __entry->fh_hash,
__entry->offset, __entry->len)
)
@@ -77,8 +77,8 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
TP_PROTO(struct svc_rqst *rqstp, \
struct svc_fh *fhp, \
- loff_t offset, \
- unsigned long len), \
+ u64 offset, \
+ u32 len), \
TP_ARGS(rqstp, fhp, offset, len))
DEFINE_NFSD_IO_EVENT(read_start);
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index ea7cca3a64b7..6251d8754c82 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -33,7 +33,7 @@ struct nfsd_readargs {
struct nfsd_writeargs {
svc_fh fh;
__u32 offset;
- int len;
+ __u32 len;
struct kvec first;
};
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index e60be7bb55b0..28a2db3b1787 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -64,11 +64,9 @@ static const struct sysfs_ops nilfs_##name##_attr_ops = { \
#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
static void nilfs_##name##_attr_release(struct kobject *kobj) \
{ \
- struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
- struct the_nilfs *nilfs = container_of(kobj->parent, \
- struct the_nilfs, \
- ns_##parent_name##_kobj); \
- subgroups = nilfs->ns_##parent_name##_subgroups; \
+ struct nilfs_sysfs_##parent_name##_subgroups *subgroups = container_of(kobj, \
+ struct nilfs_sysfs_##parent_name##_subgroups, \
+ sg_##name##_kobj); \
complete(&subgroups->sg_##name##_kobj_unregister); \
} \
static struct kobj_type nilfs_##name##_ktype = { \
@@ -94,12 +92,12 @@ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \
#name); \
if (err) \
- return err; \
- return 0; \
+ kobject_put(kobj); \
+ return err; \
} \
static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \
{ \
- kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
+ kobject_put(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
}
/************************************************************************
@@ -210,14 +208,14 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root)
}
if (err)
- return err;
+ kobject_put(&root->snapshot_kobj);
- return 0;
+ return err;
}
void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root)
{
- kobject_del(&root->snapshot_kobj);
+ kobject_put(&root->snapshot_kobj);
}
/************************************************************************
@@ -1000,7 +998,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL,
"%s", sb->s_id);
if (err)
- goto free_dev_subgroups;
+ goto cleanup_dev_kobject;
err = nilfs_sysfs_create_mounted_snapshots_group(nilfs);
if (err)
@@ -1037,9 +1035,7 @@ delete_mounted_snapshots_group:
nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
cleanup_dev_kobject:
- kobject_del(&nilfs->ns_dev_kobj);
-
-free_dev_subgroups:
+ kobject_put(&nilfs->ns_dev_kobj);
kfree(nilfs->ns_dev_subgroups);
failed_create_device_group:
@@ -1054,6 +1050,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
nilfs_sysfs_delete_superblock_group(nilfs);
nilfs_sysfs_delete_segctor_group(nilfs);
kobject_del(&nilfs->ns_dev_kobj);
+ kobject_put(&nilfs->ns_dev_kobj);
kfree(nilfs->ns_dev_subgroups);
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 484785cdf96e..931870768556 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -797,14 +797,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
void nilfs_put_root(struct nilfs_root *root)
{
- if (refcount_dec_and_test(&root->count)) {
- struct the_nilfs *nilfs = root->nilfs;
+ struct the_nilfs *nilfs = root->nilfs;
- nilfs_sysfs_delete_snapshot_group(root);
-
- spin_lock(&nilfs->ns_cptree_lock);
+ if (refcount_dec_and_lock(&root->count, &nilfs->ns_cptree_lock)) {
rb_erase(&root->rb_node, &nilfs->ns_cptree);
spin_unlock(&nilfs->ns_cptree_lock);
+
+ nilfs_sysfs_delete_snapshot_group(root);
iput(root->ifile);
kfree(root);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 3c4811469ae8..e278bfc5ee7f 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1503,7 +1503,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
na.type = AT_BITMAP;
na.name = I30;
na.name_len = 4;
- bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na);
+ bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
if (bmp_vi) {
write_inode_now(bmp_vi, !datasync);
iput(bmp_vi);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 84933a0af49b..cf222c9225d6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -30,10 +30,10 @@
/**
* ntfs_test_inode - compare two (possibly fake) inodes for equality
* @vi: vfs inode which to test
- * @na: ntfs attribute which is being tested with
+ * @data: data which is being tested with
*
* Compare the ntfs attribute embedded in the ntfs specific part of the vfs
- * inode @vi for equality with the ntfs attribute @na.
+ * inode @vi for equality with the ntfs attribute @data.
*
* If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
* @na->name and @na->name_len are then ignored.
@@ -43,8 +43,9 @@
* NOTE: This function runs with the inode_hash_lock spin lock held so it is not
* allowed to sleep.
*/
-int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
+int ntfs_test_inode(struct inode *vi, void *data)
{
+ ntfs_attr *na = (ntfs_attr *)data;
ntfs_inode *ni;
if (vi->i_ino != na->mft_no)
@@ -72,9 +73,9 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
/**
* ntfs_init_locked_inode - initialize an inode
* @vi: vfs inode to initialize
- * @na: ntfs attribute which to initialize @vi to
+ * @data: data which to initialize @vi to
*
- * Initialize the vfs inode @vi with the values from the ntfs attribute @na in
+ * Initialize the vfs inode @vi with the values from the ntfs attribute @data in
* order to enable ntfs_test_inode() to do its work.
*
* If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
@@ -87,8 +88,9 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
* NOTE: This function runs with the inode->i_lock spin lock held so it is not
* allowed to sleep. (Hence the GFP_ATOMIC allocation.)
*/
-static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
+static int ntfs_init_locked_inode(struct inode *vi, void *data)
{
+ ntfs_attr *na = (ntfs_attr *)data;
ntfs_inode *ni = NTFS_I(vi);
vi->i_ino = na->mft_no;
@@ -131,7 +133,6 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
return 0;
}
-typedef int (*set_t)(struct inode *, void *);
static int ntfs_read_locked_inode(struct inode *vi);
static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
static int ntfs_read_locked_index_inode(struct inode *base_vi,
@@ -164,8 +165,8 @@ struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
na.name = NULL;
na.name_len = 0;
- vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode,
- (set_t)ntfs_init_locked_inode, &na);
+ vi = iget5_locked(sb, mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
if (unlikely(!vi))
return ERR_PTR(-ENOMEM);
@@ -225,8 +226,8 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
na.name = name;
na.name_len = name_len;
- vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
- (set_t)ntfs_init_locked_inode, &na);
+ vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
if (unlikely(!vi))
return ERR_PTR(-ENOMEM);
@@ -280,8 +281,8 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
na.name = name;
na.name_len = name_len;
- vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
- (set_t)ntfs_init_locked_inode, &na);
+ vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
if (unlikely(!vi))
return ERR_PTR(-ENOMEM);
@@ -476,7 +477,7 @@ err_corrupt_attr:
}
file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
le16_to_cpu(attr->data.resident.value_offset));
- p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length);
+ p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length);
if (p2 < (u8*)attr || p2 > p)
goto err_corrupt_attr;
/* This attribute is ok, but is it in the $Extend directory? */
@@ -628,6 +629,12 @@ static int ntfs_read_locked_inode(struct inode *vi)
}
a = ctx->attr;
/* Get the standard information attribute value. */
+ if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset)
+ + le32_to_cpu(a->data.resident.value_length) >
+ (u8 *)ctx->mrec + vol->mft_record_size) {
+ ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
+ goto unm_err_out;
+ }
si = (STANDARD_INFORMATION*)((u8*)a +
le16_to_cpu(a->data.resident.value_offset));
@@ -1874,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi)
}
/* Now allocate memory for the attribute list. */
ni->attr_list_size = (u32)ntfs_attr_size(a);
+ if (!ni->attr_list_size) {
+ ntfs_error(sb, "Attr_list_size is zero");
+ goto put_err_out;
+ }
ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
if (!ni->attr_list) {
ntfs_error(sb, "Not enough memory to allocate buffer "
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 98e670fbdd31..363e4e820673 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -253,9 +253,7 @@ typedef struct {
ATTR_TYPE type;
} ntfs_attr;
-typedef int (*test_t)(struct inode *, void *);
-
-extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na);
+extern int ntfs_test_inode(struct inode *vi, void *data);
extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3aac5c917afe..58234b42d68f 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -958,7 +958,7 @@ bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
* dirty code path of the inode dirty code path when writing
* $MFT occurs.
*/
- vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
+ vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
}
if (vi) {
ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
@@ -1019,7 +1019,7 @@ bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
vi = igrab(mft_vi);
BUG_ON(vi != mft_vi);
} else
- vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
+ vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
&na);
if (!vi) {
/*
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0a6fe7d5aba7..4db87b26cf7b 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7048,7 +7048,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
- int ret, i, has_data, num_pages = 0;
+ int ret, has_data, num_pages = 0;
int need_free = 0;
u32 bit_off, num;
handle_t *handle;
@@ -7057,26 +7057,17 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_alloc_context *data_ac = NULL;
- struct page **pages = NULL;
- loff_t end = osb->s_clustersize;
+ struct page *page = NULL;
struct ocfs2_extent_tree et;
int did_quota = 0;
has_data = i_size_read(inode) ? 1 : 0;
if (has_data) {
- pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
- sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- ret = -ENOMEM;
- mlog_errno(ret);
- return ret;
- }
-
ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
if (ret) {
mlog_errno(ret);
- goto free_pages;
+ goto out;
}
}
@@ -7096,7 +7087,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- unsigned int page_end;
+ unsigned int page_end = min_t(unsigned, PAGE_SIZE,
+ osb->s_clustersize);
u64 phys;
ret = dquot_alloc_space_nodirty(inode,
@@ -7120,15 +7112,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
*/
block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
- /*
- * Non sparse file systems zero on extend, so no need
- * to do that now.
- */
- if (!ocfs2_sparse_alloc(osb) &&
- PAGE_SIZE < osb->s_clustersize)
- end = PAGE_SIZE;
-
- ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
+ ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page,
+ &num_pages);
if (ret) {
mlog_errno(ret);
need_free = 1;
@@ -7139,20 +7124,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* This should populate the 1st page for us and mark
* it up to date.
*/
- ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
+ ret = ocfs2_read_inline_data(inode, page, di_bh);
if (ret) {
mlog_errno(ret);
need_free = 1;
goto out_unlock;
}
- page_end = PAGE_SIZE;
- if (PAGE_SIZE > osb->s_clustersize)
- page_end = osb->s_clustersize;
-
- for (i = 0; i < num_pages; i++)
- ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
- pages[i], i > 0, &phys);
+ ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0,
+ &phys);
}
spin_lock(&oi->ip_lock);
@@ -7183,8 +7163,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
out_unlock:
- if (pages)
- ocfs2_unlock_and_free_pages(pages, num_pages);
+ if (page)
+ ocfs2_unlock_and_free_pages(&page, num_pages);
out_commit:
if (ret < 0 && did_quota)
@@ -7208,8 +7188,6 @@ out_commit:
out:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
-free_pages:
- kfree(pages);
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9cd0a6815933..7f66e3342475 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2304,7 +2304,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle = NULL;
loff_t end = offset + bytes;
- int ret = 0, credits = 0, locked = 0;
+ int ret = 0, credits = 0;
ocfs2_init_dealloc_ctxt(&dealloc);
@@ -2315,13 +2315,6 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
!dwc->dw_orphaned)
goto out;
- /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
- * are in that context. */
- if (dwc->dw_writer_pid != task_pid_nr(current)) {
- inode_lock(inode);
- locked = 1;
- }
-
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
@@ -2402,8 +2395,6 @@ out:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
ocfs2_run_deallocs(osb, &dealloc);
- if (locked)
- inode_unlock(inode);
ocfs2_dio_free_write_ctx(inode, dwc);
return ret;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a368350d4c27..c843bc318382 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2052,7 +2052,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
o2hb_nego_timeout_handler,
reg, NULL, &reg->hr_handler_list);
if (ret)
- goto free;
+ goto remove_item;
ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
sizeof(struct o2hb_nego_msg),
@@ -2067,6 +2067,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
unregister_handler:
o2net_unregister_handler_list(&reg->hr_handler_list);
+remove_item:
+ spin_lock(&o2hb_live_lock);
+ list_del(&reg->hr_all_item);
+ if (o2hb_global_heartbeat_active())
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
free:
kfree(reg);
return ERR_PTR(ret);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50a863fc1779..207ec61569ea 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3933,7 +3933,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
oi = OCFS2_I(inode);
oi->ip_dir_lock_gen++;
mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
- goto out;
+ goto out_forget;
}
if (!S_ISREG(inode->i_mode))
@@ -3964,6 +3964,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
filemap_fdatawait(mapping);
}
+out_forget:
forget_all_cached_acls(inode);
out:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6cd5e4924e4d..c30747bdfb12 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -478,10 +478,11 @@ int ocfs2_truncate_file(struct inode *inode,
* greater than page size, so we have to truncate them
* anyway.
*/
- unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(inode->i_mapping, new_i_size);
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ unmap_mapping_range(inode->i_mapping,
+ new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
i_size_read(inode), 1);
if (status)
@@ -500,6 +501,9 @@ int ocfs2_truncate_file(struct inode *inode,
goto bail_unlock_sem;
}
+ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
+
status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
@@ -1244,22 +1248,24 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock;
}
}
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
2 * ocfs2_quota_trans_credits(sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
- goto bail_unlock;
+ goto bail_unlock_alloc;
}
status = __dquot_transfer(inode, transfer_to);
if (status < 0)
goto bail_commit;
} else {
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
- goto bail_unlock;
+ goto bail_unlock_alloc;
}
}
@@ -1272,6 +1278,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
+bail_unlock_alloc:
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail_unlock:
if (status && inode_locked) {
ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
@@ -1525,6 +1533,45 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
}
}
+/*
+ * zero out partial blocks of one cluster.
+ *
+ * start: file offset where zero starts, will be made upper block aligned.
+ * len: it will be trimmed to the end of current cluster if "start + len"
+ * is bigger than it.
+ */
+static int ocfs2_zeroout_partial_cluster(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret;
+ u64 start_block, end_block, nr_blocks;
+ u64 p_block, offset;
+ u32 cluster, p_cluster, nr_clusters;
+ struct super_block *sb = inode->i_sb;
+ u64 end = ocfs2_align_bytes_to_clusters(sb, start);
+
+ if (start + len < end)
+ end = start + len;
+
+ start_block = ocfs2_blocks_for_bytes(sb, start);
+ end_block = ocfs2_blocks_for_bytes(sb, end);
+ nr_blocks = end_block - start_block;
+ if (!nr_blocks)
+ return 0;
+
+ cluster = ocfs2_bytes_to_clusters(sb, start);
+ ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
+ &nr_clusters, NULL);
+ if (ret)
+ return ret;
+ if (!p_cluster)
+ return 0;
+
+ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
+ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
+ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
+}
+
static int ocfs2_zero_partial_clusters(struct inode *inode,
u64 start, u64 len)
{
@@ -1534,6 +1581,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
unsigned int csize = osb->s_clustersize;
handle_t *handle;
+ loff_t isize = i_size_read(inode);
/*
* The "start" and "end" values are NOT necessarily part of
@@ -1554,6 +1602,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
goto out;
+ /* No page cache for EOF blocks, issue zero out to disk. */
+ if (end > isize) {
+ /*
+ * zeroout eof blocks in last cluster starting from
+ * "isize" even "start" > "isize" because it is
+ * complicated to zeroout just at "start" as "start"
+ * may be not aligned with block size, buffer write
+ * would be required to do that, but out of eof buffer
+ * write is not supported.
+ */
+ ret = ocfs2_zeroout_partial_cluster(inode, isize,
+ end - isize);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ if (start >= isize)
+ goto out;
+ end = isize;
+ }
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -1861,7 +1929,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
{
int ret;
s64 llen;
- loff_t size;
+ loff_t size, orig_isize;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *di_bh = NULL;
handle_t *handle;
@@ -1953,6 +2021,15 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
default:
ret = -EINVAL;
}
+
+ orig_isize = i_size_read(inode);
+ /* zeroout eof blocks in the cluster. */
+ if (!ret && change_size && orig_isize < size) {
+ ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
+ size - orig_isize);
+ if (!ret)
+ i_size_write(inode, size);
+ }
up_write(&OCFS2_I(inode)->ip_alloc_sem);
if (ret) {
mlog_errno(ret);
@@ -1969,9 +2046,6 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- if (change_size && i_size_read(inode) < size)
- i_size_write(inode, size);
-
inode->i_ctime = inode->i_mtime = current_time(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 50f11bfdc8c2..82a3edc4aea4 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -328,11 +328,7 @@ static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
p->fe_ino, p->fe_done,
ocfs2_filecheck_error(p->fe_status));
- if (ret < 0) {
- total = ret;
- break;
- }
- if (ret == remain) {
+ if (ret >= remain) {
/* snprintf() didn't fit */
total = -E2BIG;
break;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 8aa6a667860c..188038760136 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -502,11 +502,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
ret = snprintf(buf, remain, "%s\n",
p->sp_name);
- if (ret < 0) {
- total = ret;
- break;
- }
- if (ret == remain) {
+ if (ret >= remain) {
/* snprintf() didn't fit */
total = -E2BIG;
break;
@@ -533,7 +529,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
if (active_stack) {
ret = snprintf(buf, PAGE_SIZE, "%s\n",
active_stack->sp_name);
- if (ret == PAGE_SIZE)
+ if (ret >= PAGE_SIZE)
ret = -E2BIG;
}
spin_unlock(&ocfs2_stack_lock);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 60b4b6df1ed3..c1cf67b24c19 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1100,17 +1100,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
- root = d_make_root(inode);
- if (!root) {
- status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
- }
-
- sb->s_root = root;
-
- ocfs2_complete_mount_recovery(osb);
-
osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
&ocfs2_kset->kobj);
if (!osb->osb_dev_kset) {
@@ -1128,6 +1117,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
+ root = d_make_root(inode);
+ if (!root) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto read_super_error;
+ }
+
+ sb->s_root = root;
+
+ ocfs2_complete_mount_recovery(osb);
+
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
@@ -2150,11 +2150,17 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
if (ocfs2_clusterinfo_valid(osb)) {
+ /*
+ * ci_stack and ci_cluster in ocfs2_cluster_info may not be null
+ * terminated, so make sure no overflow happens here by using
+ * memcpy. Destination strings will always be null terminated
+ * because osb is allocated using kzalloc.
+ */
osb->osb_stackflags =
OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
- strlcpy(osb->osb_cluster_stack,
+ memcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
- OCFS2_STACK_LABEL_LEN + 1);
+ OCFS2_STACK_LABEL_LEN);
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
@@ -2163,9 +2169,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = -EINVAL;
goto bail;
}
- strlcpy(osb->osb_cluster_name,
+ memcpy(osb->osb_cluster_name,
OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
- OCFS2_CLUSTER_NAME_LEN + 1);
+ OCFS2_CLUSTER_NAME_LEN);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index fe484cf93e5c..8bbe9486e3a6 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -26,8 +26,10 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
- if (!new_op)
+ if (!new_op) {
+ ret = -ENOMEM;
goto out_put_parent;
+ }
new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
new_op->upcall.req.lookup.parent_refn = parent->refn;
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 2bb916d68576..023b9bc54b7c 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -179,7 +179,7 @@ orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
{
kfree(bufmap->page_array);
kfree(bufmap->desc_array);
- kfree(bufmap->buffer_index_array);
+ bitmap_free(bufmap->buffer_index_array);
kfree(bufmap);
}
@@ -229,8 +229,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
bufmap->desc_size = user_desc->size;
bufmap->desc_shift = ilog2(bufmap->desc_size);
- bufmap->buffer_index_array =
- kzalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG), GFP_KERNEL);
+ bufmap->buffer_index_array = bitmap_zalloc(bufmap->desc_count, GFP_KERNEL);
if (!bufmap->buffer_index_array)
goto out_free_bufmap;
@@ -253,7 +252,7 @@ orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
out_free_desc_array:
kfree(bufmap->desc_array);
out_free_index_array:
- kfree(bufmap->buffer_index_array);
+ bitmap_free(bufmap->buffer_index_array);
out_free_bufmap:
kfree(bufmap);
out:
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index ee5efdc35cc1..2f2e430461b2 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -209,7 +209,7 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total;
buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail;
- buf->f_frsize = sb->s_blocksize;
+ buf->f_frsize = 0;
out_op_release:
op_release(new_op);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index ec5eca5a96f4..143c52de97ec 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -76,6 +76,14 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
if (ovl_is_private_xattr(name))
continue;
+
+ error = security_inode_copy_up_xattr(name);
+ if (error < 0 && error != -EOPNOTSUPP)
+ break;
+ if (error == 1) {
+ error = 0;
+ continue; /* Discard */
+ }
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
@@ -99,13 +107,6 @@ retry:
goto retry;
}
- error = security_inode_copy_up_xattr(name);
- if (error < 0 && error != -EOPNOTSUPP)
- break;
- if (error == 1) {
- error = 0;
- continue; /* Discard */
- }
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
@@ -851,7 +852,7 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
int ovl_copy_up_flags(struct dentry *dentry, int flags)
{
int err = 0;
- const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
+ const struct cred *old_cred;
bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED);
/*
@@ -862,6 +863,7 @@ int ovl_copy_up_flags(struct dentry *dentry, int flags)
if (WARN_ON(disconnected && d_is_dir(dentry)))
return -EIO;
+ old_cred = ovl_override_creds(dentry->d_sb);
while (!err) {
struct dentry *next;
struct dentry *parent = NULL;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 29abdb1d3b5c..8c89eaea1583 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -113,8 +113,7 @@ kill_whiteout:
goto out;
}
-static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry,
- umode_t mode)
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
{
int err;
struct dentry *d, *dentry = *newdentry;
@@ -513,8 +512,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
goto out_cleanup;
}
err = ovl_instantiate(dentry, inode, newdentry, hardlink);
- if (err)
- goto out_cleanup;
+ if (err) {
+ ovl_cleanup(udir, newdentry);
+ dput(newdentry);
+ }
out_dput:
dput(upper);
out_unlock:
@@ -940,8 +941,8 @@ static char *ovl_get_redirect(struct dentry *dentry, bool abs_redirect)
buflen -= thislen;
memcpy(&buf[buflen], name, thislen);
- tmp = dget_dlock(d->d_parent);
spin_unlock(&d->d_lock);
+ tmp = dget_parent(d);
dput(d);
d = tmp;
@@ -1160,9 +1161,13 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
goto out_dput;
}
} else {
- if (!d_is_negative(newdentry) &&
- (!new_opaque || !ovl_is_whiteout(newdentry)))
- goto out_dput;
+ if (!d_is_negative(newdentry)) {
+ if (!new_opaque || !ovl_is_whiteout(newdentry))
+ goto out_dput;
+ } else {
+ if (flags & RENAME_EXCHANGE)
+ goto out_dput;
+ }
}
if (olddentry == trap)
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 11dd8177770d..19574ef17470 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -395,6 +395,7 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
*/
take_dentry_name_snapshot(&name, real);
this = lookup_one_len(name.name.name, connected, name.name.len);
+ release_dentry_name_snapshot(&name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
goto fail;
@@ -409,7 +410,6 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
}
out:
- release_dentry_name_snapshot(&name);
dput(parent);
inode_unlock(dir);
return this;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 7a08a576f7b2..6bf5fb12395a 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -9,6 +9,9 @@
#include <linux/xattr.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/splice.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
#include "overlayfs.h"
static char ovl_whatisit(struct inode *inode, struct inode *realinode)
@@ -293,6 +296,51 @@ out_unlock:
return ret;
}
+/*
+ * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
+ * due to lock order inversion between pipe->mutex in iter_file_splice_write()
+ * and file_start_write(real.file) in ovl_write_iter().
+ *
+ * So do everything ovl_write_iter() does and call iter_file_splice_write() on
+ * the real file.
+ */
+static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ struct inode *inode = file_inode(out);
+ struct inode *realinode = ovl_inode_real(inode);
+ ssize_t ret;
+
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(realinode, inode);
+ ret = file_remove_privs(out);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(out, &real);
+ if (ret)
+ goto out_unlock;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ file_start_write(real.file);
+
+ ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
+
+ file_end_write(real.file);
+ /* Update size */
+ ovl_copyattr(realinode, inode);
+ revert_creds(old_cred);
+ fdput(real);
+
+out_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct fd real;
@@ -649,6 +697,8 @@ const struct file_operations ovl_file_operations = {
.fadvise = ovl_fadvise,
.unlocked_ioctl = ovl_ioctl,
.compat_ioctl = ovl_compat_ioctl,
+ .splice_read = generic_file_splice_read,
+ .splice_write = ovl_splice_write,
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index bb980721502d..56b55397a7a0 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -337,7 +337,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
goto out;
if (!value && !upperdentry) {
+ old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getxattr(realdentry, name, NULL, 0);
+ revert_creds(old_cred);
if (err < 0)
goto out_drop_write;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6934bcf030f0..a8e9da5f01eb 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -409,6 +409,7 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode);
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index d6b724beb304..f5cf0938f298 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -79,7 +79,7 @@ static void ovl_dentry_release(struct dentry *dentry)
static struct dentry *ovl_d_real(struct dentry *dentry,
const struct inode *inode)
{
- struct dentry *real;
+ struct dentry *real = NULL, *lower;
/* It's an overlay file */
if (inode && d_inode(dentry) == inode)
@@ -98,9 +98,10 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
return real;
- real = ovl_dentry_lowerdata(dentry);
- if (!real)
+ lower = ovl_dentry_lowerdata(dentry);
+ if (!lower)
goto bug;
+ real = lower;
/* Handle recursion */
real = d_real(real, inode);
@@ -108,8 +109,10 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
if (!inode || inode == d_inode(real))
return real;
bug:
- WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
- inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
+ WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n",
+ __func__, dentry, inode ? inode->i_sb->s_id : "NULL",
+ inode ? inode->i_ino : 0, real,
+ real && d_inode(real) ? d_inode(real)->i_ino : 0);
return dentry;
}
@@ -647,10 +650,14 @@ retry:
goto retry;
}
- work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode));
- err = PTR_ERR(work);
- if (IS_ERR(work))
- goto out_err;
+ err = ovl_mkdir_real(dir, &work, attr.ia_mode);
+ if (err)
+ goto out_dput;
+
+ /* Weird filesystem returning with hashed negative (kernfs)? */
+ err = -EINVAL;
+ if (d_really_is_negative(work))
+ goto out_dput;
/*
* Try to remove POSIX ACL xattrs from workdir. We are good if:
@@ -1522,7 +1529,8 @@ out_err:
* - upper/work dir of any overlayfs instance
*/
static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
- struct dentry *dentry, const char *name)
+ struct dentry *dentry, const char *name,
+ bool is_lower)
{
struct dentry *next = dentry, *parent;
int err = 0;
@@ -1534,7 +1542,7 @@ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
/* Walk back ancestors to root (inclusive) looking for traps */
while (!err && parent != next) {
- if (ovl_lookup_trap_inode(sb, parent)) {
+ if (is_lower && ovl_lookup_trap_inode(sb, parent)) {
err = -ELOOP;
pr_err("overlayfs: overlapping %s path\n", name);
} else if (ovl_is_inuse(parent)) {
@@ -1560,7 +1568,7 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
if (ofs->upper_mnt) {
err = ovl_check_layer(sb, ofs, ofs->upper_mnt->mnt_root,
- "upperdir");
+ "upperdir", false);
if (err)
return err;
@@ -1571,7 +1579,8 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
* workbasedir. In that case, we already have their traps in
* inode cache and we will catch that case on lookup.
*/
- err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir");
+ err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir",
+ false);
if (err)
return err;
}
@@ -1579,7 +1588,7 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
for (i = 0; i < ofs->numlower; i++) {
err = ovl_check_layer(sb, ofs,
ofs->lower_layers[i].mnt->mnt_root,
- "lowerdir");
+ "lowerdir", true);
if (err)
return err;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 8a2ab2f974bd..30a43b195674 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -31,6 +31,21 @@
#include "internal.h"
/*
+ * New pipe buffers will be restricted to this size while the user is exceeding
+ * their pipe buffer quota. The general pipe use case needs at least two
+ * buffers: one for data yet to be read, and one for new data. If this is less
+ * than two, then a write to a non-empty pipe may block even if the pipe is not
+ * full. This can occur with GNU make jobserver or similar uses of pipes as
+ * semaphores: multiple processes may be waiting to write tokens back to the
+ * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
+ *
+ * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
+ * own risk, namely: pipe writes to non-full pipes may block until the pipe is
+ * emptied.
+ */
+#define PIPE_MIN_DEF_BUFFERS 2
+
+/*
* The max size that a non-root user is allowed to grow the pipe. Can
* be set by root in /proc/sys/fs/pipe-max-size
*/
@@ -666,8 +681,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
- user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
- pipe_bufs = 1;
+ user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
+ pipe_bufs = PIPE_MIN_DEF_BUFFERS;
}
if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
diff --git a/fs/pnode.h b/fs/pnode.h
index 26f74e092bd9..988f1aa9b02a 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,7 @@
#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
-#define IS_MNT_NEW(m) (!(m)->mnt_ns)
+#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 653c2d8aa1cd..5a187e9b7221 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -549,8 +549,17 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
{
unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
+ long badness;
+
+ badness = oom_badness(task, totalpages);
+ /*
+ * Special case OOM_SCORE_ADJ_MIN for all others scale the
+ * badness value into [0, 2000] range which we have been
+ * exporting for a long time so userspace might depend on it.
+ */
+ if (badness != LONG_MIN)
+ points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
- points = oom_badness(task, totalpages) * 1000 / totalpages;
seq_printf(m, "%lu\n", points);
return 0;
@@ -836,7 +845,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
while (count > 0) {
- int this_len = min_t(int, count, PAGE_SIZE);
+ size_t this_len = min_t(size_t, count, PAGE_SIZE);
if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
@@ -2527,6 +2536,13 @@ out:
}
#ifdef CONFIG_SECURITY
+static int proc_pid_attr_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+ return 0;
+}
+
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
@@ -2556,6 +2572,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
void *page;
int rv;
+ /* A task may only write when it was the opener. */
+ if (file->private_data != current->mm)
+ return -EPERM;
+
rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (!task) {
@@ -2603,9 +2623,11 @@ out:
}
static const struct file_operations proc_pid_attr_operations = {
+ .open = proc_pid_attr_open,
.read = proc_pid_attr_read,
.write = proc_pid_attr_write,
.llseek = generic_file_llseek,
+ .release = mem_release,
};
#define LSM_DIR_OPS(LSM) \
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 080ca9d5eccb..b1102a31a108 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -125,9 +125,13 @@ ssize_t read_from_oldmem(char *buf, size_t count,
nr_bytes = count;
/* If pfn is not ram, return zeros for sparse dump files */
- if (pfn_is_ram(pfn) == 0)
- memset(buf, 0, nr_bytes);
- else {
+ if (pfn_is_ram(pfn) == 0) {
+ tmp = 0;
+ if (!userbuf)
+ memset(buf, 0, nr_bytes);
+ else if (clear_user(buf, nr_bytes))
+ tmp = -EFAULT;
+ } else {
if (encrypted)
tmp = copy_oldmem_page_encrypted(pfn, buf,
nr_bytes,
@@ -136,10 +140,10 @@ ssize_t read_from_oldmem(char *buf, size_t count,
else
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
-
- if (tmp < 0)
- return tmp;
}
+ if (tmp < 0)
+ return tmp;
+
*ppos += nr_bytes;
count -= nr_bytes;
buf += nr_bytes;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 74a60bae2b23..705b79bb9b24 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -275,7 +275,7 @@ static int pstore_compress(const void *in, void *out,
{
int ret;
- if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION))
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS))
return -EINVAL;
ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
@@ -664,7 +664,7 @@ static void decompress_record(struct pstore_record *record)
int unzipped_len;
char *unzipped, *workspace;
- if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION) || !record->compressed)
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed)
return;
/* Only PSTORE_TYPE_DMESG support compression. */
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index a6ee23aadd28..66645a5a35f3 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -15,13 +15,48 @@
#include <linux/buffer_head.h>
#include "qnx4.h"
+/*
+ * A qnx4 directory entry is an inode entry or link info
+ * depending on the status field in the last byte. The
+ * first byte is where the name start either way, and a
+ * zero means it's empty.
+ *
+ * Also, due to a bug in gcc, we don't want to use the
+ * real (differently sized) name arrays in the inode and
+ * link entries, but always the 'de_name[]' one in the
+ * fake struct entry.
+ *
+ * See
+ *
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6
+ *
+ * for details, but basically gcc will take the size of the
+ * 'name' array from one of the used union entries randomly.
+ *
+ * This use of 'de_name[]' (48 bytes) avoids the false positive
+ * warnings that would happen if gcc decides to use 'inode.di_name'
+ * (16 bytes) even when the pointer and size were to come from
+ * 'link.dl_name' (48 bytes).
+ *
+ * In all cases the actual name pointer itself is the same, it's
+ * only the gcc internal 'what is the size of this field' logic
+ * that can get confused.
+ */
+union qnx4_directory_entry {
+ struct {
+ const char de_name[48];
+ u8 de_pad[15];
+ u8 de_status;
+ };
+ struct qnx4_inode_entry inode;
+ struct qnx4_link_info link;
+};
+
static int qnx4_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
unsigned int offset;
struct buffer_head *bh;
- struct qnx4_inode_entry *de;
- struct qnx4_link_info *le;
unsigned long blknum;
int ix, ino;
int size;
@@ -38,27 +73,27 @@ static int qnx4_readdir(struct file *file, struct dir_context *ctx)
}
ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
+ union qnx4_directory_entry *de;
+
offset = ix * QNX4_DIR_ENTRY_SIZE;
- de = (struct qnx4_inode_entry *) (bh->b_data + offset);
- if (!de->di_fname[0])
+ de = (union qnx4_directory_entry *) (bh->b_data + offset);
+
+ if (!de->de_name[0])
continue;
- if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+ if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
continue;
- if (!(de->di_status & QNX4_FILE_LINK))
- size = QNX4_SHORT_NAME_MAX;
- else
- size = QNX4_NAME_MAX;
- size = strnlen(de->di_fname, size);
- QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
- if (!(de->di_status & QNX4_FILE_LINK))
+ if (!(de->de_status & QNX4_FILE_LINK)) {
+ size = sizeof(de->inode.di_fname);
ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
- else {
- le = (struct qnx4_link_info*)de;
- ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
+ } else {
+ size = sizeof(de->link.dl_fname);
+ ino = ( le32_to_cpu(de->link.dl_inode_blk) - 1 ) *
QNX4_INODES_PER_BLOCK +
- le->dl_inode_ndx;
+ de->link.dl_inode_ndx;
}
- if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
+ size = strnlen(de->de_name, size);
+ QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, name));
+ if (!dir_emit(ctx, de->de_name, size, ino, DT_UNKNOWN)) {
brelse(bh);
return 0;
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 7abc3230c21a..dc5f8654b277 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -693,9 +693,14 @@ int dquot_quota_sync(struct super_block *sb, int type)
/* This is not very clever (and fast) but currently I don't know about
* any other simple way of getting quota data to disk and we must get
* them there for userspace to be visible... */
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, 1);
- sync_blockdev(sb->s_bdev);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 1);
+ if (ret)
+ return ret;
+ }
+ ret = sync_blockdev(sb->s_bdev);
+ if (ret)
+ return ret;
/*
* Now when everything is written we can discard the pagecache so
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index c5562c871c8b..1a188fbdf34e 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -423,6 +423,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
quota_error(dquot->dq_sb, "Quota structure has offset to "
"other block (%u) than it should (%u)", blk,
(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+ ret = -EIO;
goto out_buf;
}
ret = read_blk(info, blk, buf);
@@ -488,6 +489,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
+ quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+ newblk, info->dqi_blocks);
+ ret = -EUCLEAN;
+ goto out_buf;
+ }
+
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
newblk = 0;
@@ -587,6 +595,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
if (!blk) /* No reference? */
goto out_buf;
+ if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
+ quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+ blk, info->dqi_blocks);
+ ret = -EUCLEAN;
+ goto out_buf;
+ }
+
if (depth < info->dqi_qtree_depth - 1)
ret = find_tree_dqentry(info, dquot, blk, depth+1);
else
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 36dce17b0101..56aedf4ba886 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -166,19 +166,24 @@ static int v2_read_file_info(struct super_block *sb, int type)
quota_error(sb, "Number of blocks too big for quota file size (%llu > %llu).",
(loff_t)qinfo->dqi_blocks << qinfo->dqi_blocksize_bits,
i_size_read(sb_dqopt(sb)->files[type]));
- goto out;
+ goto out_free;
}
if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) {
quota_error(sb, "Free block number too big (%u >= %u).",
qinfo->dqi_free_blk, qinfo->dqi_blocks);
- goto out;
+ goto out_free;
}
if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) {
quota_error(sb, "Block with free entry too big (%u >= %u).",
qinfo->dqi_free_entry, qinfo->dqi_blocks);
- goto out;
+ goto out_free;
}
ret = 0;
+out_free:
+ if (ret) {
+ kfree(info->dqi_priv);
+ info->dqi_priv = NULL;
+ }
out:
up_read(&dqopt->dqio_sem);
return ret;
diff --git a/fs/readdir.c b/fs/readdir.c
index de2eceffdee8..07a3b5baa404 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -150,6 +150,9 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
if (buf->result)
return -EINVAL;
+ buf->result = verify_dirent_name(name, namlen);
+ if (buf->result < 0)
+ return buf->result;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
@@ -417,6 +420,9 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
if (buf->result)
return -EINVAL;
+ buf->result = verify_dirent_name(name, namlen);
+ if (buf->result < 0)
+ return buf->result;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 4b3e3e73b512..09ad022a78a5 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2763,6 +2763,20 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
goto free_and_return;
}
+ /*
+ * Sanity check to see if journal first block is correct.
+ * If journal first block is invalid it can cause
+ * zeroing important superblock members.
+ */
+ if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
+ SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
+ reiserfs_warning(sb, "journal-1393",
+ "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d",
+ SB_JOURNAL_1st_RESERVED_BLOCK(sb),
+ SB_ONDISK_JOURNAL_1st_BLOCK(sb));
+ goto free_and_return;
+ }
+
if (journal_init_dev(sb, journal, j_dev_name) != 0) {
reiserfs_warning(sb, "sh-462",
"unable to initialize journal device");
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 9e64e23014e8..07d787145cc3 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -387,6 +387,24 @@ void pathrelse(struct treepath *search_path)
search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
}
+static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih)
+{
+ struct reiserfs_de_head *deh;
+ int i;
+
+ deh = B_I_DEH(bh, ih);
+ for (i = 0; i < ih_entry_count(ih); i++) {
+ if (deh_location(&deh[i]) > ih_item_len(ih)) {
+ reiserfs_warning(NULL, "reiserfs-5094",
+ "directory entry location seems wrong %h",
+ &deh[i]);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
{
struct block_head *blkh;
@@ -454,11 +472,14 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
"(second one): %h", ih);
return 0;
}
- if (is_direntry_le_ih(ih) && (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE))) {
- reiserfs_warning(NULL, "reiserfs-5093",
- "item entry count seems wrong %h",
- ih);
- return 0;
+ if (is_direntry_le_ih(ih)) {
+ if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) {
+ reiserfs_warning(NULL, "reiserfs-5093",
+ "item entry count seems wrong %h",
+ ih);
+ return 0;
+ }
+ return has_valid_deh_location(bh, ih);
}
prev_location = ih_location(ih);
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1b9c7a387dc7..913f5af9bf24 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2082,6 +2082,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
unlock_new_inode(root_inode);
}
+ if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) ||
+ !root_inode->i_size) {
+ SWARN(silent, s, "", "corrupt root inode, run fsck");
+ iput(root_inode);
+ errval = -EUCLEAN;
+ goto error;
+ }
+
s->s_root = d_make_root(root_inode);
if (!s->s_root)
goto error;
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index c764352447ba..81bec2c80b25 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -43,7 +43,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec);
static inline int reiserfs_xattrs_initialized(struct super_block *sb)
{
- return REISERFS_SB(sb)->priv_root != NULL;
+ return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root;
}
#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
diff --git a/fs/select.c b/fs/select.c
index 53a0c149f528..7716d9d5be1e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -458,9 +458,11 @@ get_max:
return max;
}
-#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR)
-#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR)
-#define POLLEX_SET (EPOLLPRI)
+#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
+ EPOLLNVAL)
+#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
+ EPOLLNVAL)
+#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)
static inline void wait_key_set(poll_table *wait, unsigned long in,
unsigned long out, unsigned long bit,
@@ -527,6 +529,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
break;
if (!(bit & all_bits))
continue;
+ mask = EPOLLNVAL;
f = fdget(i);
if (f.file) {
wait_key_set(wait, in, out, bit,
@@ -534,34 +537,34 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
mask = vfs_poll(f.file, wait);
fdput(f);
- if ((mask & POLLIN_SET) && (in & bit)) {
- res_in |= bit;
- retval++;
- wait->_qproc = NULL;
- }
- if ((mask & POLLOUT_SET) && (out & bit)) {
- res_out |= bit;
- retval++;
- wait->_qproc = NULL;
- }
- if ((mask & POLLEX_SET) && (ex & bit)) {
- res_ex |= bit;
- retval++;
- wait->_qproc = NULL;
- }
- /* got something, stop busy polling */
- if (retval) {
- can_busy_loop = false;
- busy_flag = 0;
-
- /*
- * only remember a returned
- * POLL_BUSY_LOOP if we asked for it
- */
- } else if (busy_flag & mask)
- can_busy_loop = true;
-
}
+ if ((mask & POLLIN_SET) && (in & bit)) {
+ res_in |= bit;
+ retval++;
+ wait->_qproc = NULL;
+ }
+ if ((mask & POLLOUT_SET) && (out & bit)) {
+ res_out |= bit;
+ retval++;
+ wait->_qproc = NULL;
+ }
+ if ((mask & POLLEX_SET) && (ex & bit)) {
+ res_ex |= bit;
+ retval++;
+ wait->_qproc = NULL;
+ }
+ /* got something, stop busy polling */
+ if (retval) {
+ can_busy_loop = false;
+ busy_flag = 0;
+
+ /*
+ * only remember a returned
+ * POLL_BUSY_LOOP if we asked for it
+ */
+ } else if (busy_flag & mask)
+ can_busy_loop = true;
+
}
if (res_in)
*rinp = res_in;
@@ -1037,10 +1040,9 @@ static long do_restart_poll(struct restart_block *restart_block)
ret = do_sys_poll(ufds, nfds, to);
- if (ret == -ERESTARTNOHAND) {
- restart_block->fn = do_restart_poll;
- ret = -ERESTART_RESTARTBLOCK;
- }
+ if (ret == -ERESTARTNOHAND)
+ ret = set_restart_fn(restart_block, do_restart_poll);
+
return ret;
}
@@ -1062,7 +1064,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
struct restart_block *restart_block;
restart_block = &current->restart_block;
- restart_block->fn = do_restart_poll;
restart_block->poll.ufds = ufds;
restart_block->poll.nfds = nfds;
@@ -1073,7 +1074,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
} else
restart_block->poll.has_timeout = 0;
- ret = -ERESTART_RESTARTBLOCK;
+ ret = set_restart_fn(restart_block, do_restart_poll);
}
return ret;
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 1600034a929b..c19ecc1f2d50 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -29,6 +29,9 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
+ if (unlikely(size > MAX_RW_COUNT))
+ return NULL;
+
return kvmalloc(size, GFP_KERNEL_ACCOUNT);
}
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 5b78719be445..3e94d181930f 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -35,17 +35,7 @@
void signalfd_cleanup(struct sighand_struct *sighand)
{
- wait_queue_head_t *wqh = &sighand->signalfd_wqh;
- /*
- * The lockless check can race with remove_wait_queue() in progress,
- * but in this case its caller should run under rcu_read_lock() and
- * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
- */
- if (likely(!waitqueue_active(wqh)))
- return;
-
- /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
- wake_up_poll(wqh, EPOLLHUP | POLLFREE);
+ wake_up_pollfree(&sighand->signalfd_wqh);
}
struct signalfd_ctx {
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index ae2c87bb0fbe..723763746238 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -41,12 +41,17 @@ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
struct squashfs_sb_info *msblk = sb->s_fs_info;
int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
- u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+ u64 start;
__le64 ino;
int err;
TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
+ if (ino_num == 0 || (ino_num - 1) >= msblk->inodes)
+ return -EINVAL;
+
+ start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+
err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
if (err < 0)
return err;
@@ -111,7 +116,10 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
u64 lookup_table_start, u64 next_table, unsigned int inodes)
{
unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
+ unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes);
+ int n;
__le64 *table;
+ u64 start, end;
TRACE("In read_inode_lookup_table, length %d\n", length);
@@ -121,20 +129,41 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
if (inodes == 0)
return ERR_PTR(-EINVAL);
- /* length bytes should not extend into the next table - this check
- * also traps instances where lookup_table_start is incorrectly larger
- * than the next table start
+ /*
+ * The computed size of the lookup table (length bytes) should exactly
+ * match the table start and end points
*/
- if (lookup_table_start + length > next_table)
+ if (length != (next_table - lookup_table_start))
return ERR_PTR(-EINVAL);
table = squashfs_read_table(sb, lookup_table_start, length);
+ if (IS_ERR(table))
+ return table;
/*
- * table[0] points to the first inode lookup table metadata block,
- * this should be less than lookup_table_start
+ * table0], table[1], ... table[indexes - 1] store the locations
+ * of the compressed inode lookup blocks. Each entry should be
+ * less than the next (i.e. table[0] < table[1]), and the difference
+ * between them should be SQUASHFS_METADATA_SIZE or less.
+ * table[indexes - 1] should be less than lookup_table_start, and
+ * again the difference should be SQUASHFS_METADATA_SIZE or less
*/
- if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
+ for (n = 0; n < (indexes - 1); n++) {
+ start = le64_to_cpu(table[n]);
+ end = le64_to_cpu(table[n + 1]);
+
+ if (start >= end
+ || (end - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ start = le64_to_cpu(table[indexes - 1]);
+ if (start >= lookup_table_start ||
+ (lookup_table_start - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 7b1128398976..89d492916dea 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -211,11 +211,11 @@ failure:
* If the skip factor is limited in this way then the file will use multiple
* slots.
*/
-static inline int calculate_skip(int blocks)
+static inline int calculate_skip(u64 blocks)
{
- int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
+ u64 skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
* SQUASHFS_META_INDEXES);
- return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
+ return min((u64) SQUASHFS_CACHED_BLKS - 1, skip + 1);
}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 6be5afe7287d..ea5387679723 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -35,10 +35,15 @@ int squashfs_get_id(struct super_block *sb, unsigned int index,
struct squashfs_sb_info *msblk = sb->s_fs_info;
int block = SQUASHFS_ID_BLOCK(index);
int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
- u64 start_block = le64_to_cpu(msblk->id_table[block]);
+ u64 start_block;
__le32 disk_id;
int err;
+ if (index >= msblk->ids)
+ return -EINVAL;
+
+ start_block = le64_to_cpu(msblk->id_table[block]);
+
err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
sizeof(disk_id));
if (err < 0)
@@ -56,7 +61,10 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
u64 id_table_start, u64 next_table, unsigned short no_ids)
{
unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
+ unsigned int indexes = SQUASHFS_ID_BLOCKS(no_ids);
+ int n;
__le64 *table;
+ u64 start, end;
TRACE("In read_id_index_table, length %d\n", length);
@@ -67,20 +75,38 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
return ERR_PTR(-EINVAL);
/*
- * length bytes should not extend into the next table - this check
- * also traps instances where id_table_start is incorrectly larger
- * than the next table start
+ * The computed size of the index table (length bytes) should exactly
+ * match the table start and end points
*/
- if (id_table_start + length > next_table)
+ if (length != (next_table - id_table_start))
return ERR_PTR(-EINVAL);
table = squashfs_read_table(sb, id_table_start, length);
+ if (IS_ERR(table))
+ return table;
/*
- * table[0] points to the first id lookup table metadata block, this
- * should be less than id_table_start
+ * table[0], table[1], ... table[indexes - 1] store the locations
+ * of the compressed id blocks. Each entry should be less than
+ * the next (i.e. table[0] < table[1]), and the difference between them
+ * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1]
+ * should be less than id_table_start, and again the difference
+ * should be SQUASHFS_METADATA_SIZE or less
*/
- if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
+ for (n = 0; n < (indexes - 1); n++) {
+ start = le64_to_cpu(table[n]);
+ end = le64_to_cpu(table[n + 1]);
+
+ if (start >= end || (end - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ start = le64_to_cpu(table[indexes - 1]);
+ if (start >= id_table_start || (id_table_start - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 7187bd1a30ea..236664d69141 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -17,6 +17,7 @@
/* size of metadata (inode and directory) blocks */
#define SQUASHFS_METADATA_SIZE 8192
+#define SQUASHFS_BLOCK_OFFSET 2
/* default size of block device I/O */
#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 34c21ffb6df3..166e98806265 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -64,5 +64,6 @@ struct squashfs_sb_info {
unsigned int inodes;
unsigned int fragments;
int xattr_ids;
+ unsigned int ids;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 0cc4ceec0562..2110323b610b 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -166,6 +166,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
msblk->inodes = le32_to_cpu(sblk->inodes);
msblk->fragments = le32_to_cpu(sblk->fragments);
+ msblk->ids = le16_to_cpu(sblk->no_ids);
flags = le16_to_cpu(sblk->flags);
TRACE("Found valid superblock on %pg\n", sb->s_bdev);
@@ -177,7 +178,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
TRACE("Block size %d\n", msblk->block_size);
TRACE("Number of inodes %d\n", msblk->inodes);
TRACE("Number of fragments %d\n", msblk->fragments);
- TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
+ TRACE("Number of ids %d\n", msblk->ids);
TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
TRACE("sblk->fragment_table_start %llx\n",
@@ -236,8 +237,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
allocate_id_index_table:
/* Allocate and read id index table */
msblk->id_table = squashfs_read_id_index_table(sb,
- le64_to_cpu(sblk->id_table_start), next_table,
- le16_to_cpu(sblk->no_ids));
+ le64_to_cpu(sblk->id_table_start), next_table, msblk->ids);
if (IS_ERR(msblk->id_table)) {
errorf(fc, "unable to read id index table");
err = PTR_ERR(msblk->id_table);
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 184129afd456..d8a270d3ac4c 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -17,8 +17,16 @@ extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
u64 start, u64 *xattr_table_start, int *xattr_ids)
{
+ struct squashfs_xattr_id_table *id_table;
+
+ id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+ if (IS_ERR(id_table))
+ return (__le64 *) id_table;
+
+ *xattr_table_start = le64_to_cpu(id_table->xattr_table_start);
+ kfree(id_table);
+
ERROR("Xattrs in filesystem, these will be ignored\n");
- *xattr_table_start = start;
return ERR_PTR(-ENOTSUPP);
}
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d99e08464554..087cab8c78f4 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -31,10 +31,15 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
struct squashfs_sb_info *msblk = sb->s_fs_info;
int block = SQUASHFS_XATTR_BLOCK(index);
int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
- u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+ u64 start_block;
struct squashfs_xattr_id id;
int err;
+ if (index >= msblk->xattr_ids)
+ return -EINVAL;
+
+ start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+
err = squashfs_read_metadata(sb, &id, &start_block, &offset,
sizeof(id));
if (err < 0)
@@ -50,13 +55,17 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
/*
* Read uncompressed xattr id lookup table indexes from disk into memory
*/
-__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
u64 *xattr_table_start, int *xattr_ids)
{
- unsigned int len;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ unsigned int len, indexes;
struct squashfs_xattr_id_table *id_table;
+ __le64 *table;
+ u64 start, end;
+ int n;
- id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+ id_table = squashfs_read_table(sb, table_start, sizeof(*id_table));
if (IS_ERR(id_table))
return (__le64 *) id_table;
@@ -70,13 +79,54 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
if (*xattr_ids == 0)
return ERR_PTR(-EINVAL);
- /* xattr_table should be less than start */
- if (*xattr_table_start >= start)
+ len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+ indexes = SQUASHFS_XATTR_BLOCKS(*xattr_ids);
+
+ /*
+ * The computed size of the index table (len bytes) should exactly
+ * match the table start and end points
+ */
+ start = table_start + sizeof(*id_table);
+ end = msblk->bytes_used;
+
+ if (len != (end - start))
return ERR_PTR(-EINVAL);
- len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+ table = squashfs_read_table(sb, start, len);
+ if (IS_ERR(table))
+ return table;
+
+ /* table[0], table[1], ... table[indexes - 1] store the locations
+ * of the compressed xattr id blocks. Each entry should be less than
+ * the next (i.e. table[0] < table[1]), and the difference between them
+ * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1]
+ * should be less than table_start, and again the difference
+ * shouls be SQUASHFS_METADATA_SIZE or less.
+ *
+ * Finally xattr_table_start should be less than table[0].
+ */
+ for (n = 0; n < (indexes - 1); n++) {
+ start = le64_to_cpu(table[n]);
+ end = le64_to_cpu(table[n + 1]);
+
+ if (start >= end || (end - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ start = le64_to_cpu(table[indexes - 1]);
+ if (start >= table_start || (table_start - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
- TRACE("In read_xattr_index_table, length %d\n", len);
+ if (*xattr_table_start >= le64_to_cpu(table[0])) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
- return squashfs_read_table(sb, start + sizeof(*id_table), len);
+ return table;
}
diff --git a/fs/stat.c b/fs/stat.c
index c38e4c2e1221..268c9eb89656 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -290,9 +290,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
-#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
@@ -301,7 +298,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
struct stat tmp;
- if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
@@ -309,7 +308,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
#endif
INIT_STRUCT_STAT_PADDING(tmp);
- tmp.st_dev = encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -319,7 +318,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
@@ -593,11 +592,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
struct compat_stat tmp;
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
memset(&tmp, 0, sizeof(tmp));
- tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -607,7 +608,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = old_encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
if ((u64) stat->size > MAX_NON_LFS)
return -EOVERFLOW;
tmp.st_size = stat->size;
diff --git a/fs/super.c b/fs/super.c
index 877532baf513..e255c18fa2c8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1470,8 +1470,8 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_nodev);
-static int reconfigure_single(struct super_block *s,
- int flags, void *data)
+int reconfigure_single(struct super_block *s,
+ int flags, void *data)
{
struct fs_context *fc;
int ret;
@@ -1691,11 +1691,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb)
percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}
-static void sb_freeze_unlock(struct super_block *sb)
+static void sb_freeze_unlock(struct super_block *sb, int level)
{
- int level;
-
- for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+ for (level--; level >= 0; level--)
percpu_up_write(sb->s_writers.rw_sem + level);
}
@@ -1766,7 +1764,14 @@ int freeze_super(struct super_block *sb)
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
/* All writers are done so after syncing there won't be dirty data */
- sync_filesystem(sb);
+ ret = sync_filesystem(sb);
+ if (ret) {
+ sb->s_writers.frozen = SB_UNFROZEN;
+ sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
+ wake_up(&sb->s_writers.wait_unfrozen);
+ deactivate_locked_super(sb);
+ return ret;
+ }
/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
@@ -1778,7 +1783,7 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
- sb_freeze_unlock(sb);
+ sb_freeze_unlock(sb, SB_FREEZE_FS);
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
@@ -1829,7 +1834,7 @@ static int thaw_super_locked(struct super_block *sb)
}
sb->s_writers.frozen = SB_UNFROZEN;
- sb_freeze_unlock(sb);
+ sb_freeze_unlock(sb, SB_FREEZE_FS);
out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 130fc6fbcc03..4f7cf975b27c 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
+#include <linux/mm.h>
#include "sysfs.h"
@@ -558,3 +559,57 @@ void sysfs_remove_bin_file(struct kobject *kobj,
kernfs_remove_by_name(kobj->sd, attr->attr.name);
}
EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
+
+/**
+ * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer.
+ * @buf: start of PAGE_SIZE buffer.
+ * @fmt: format
+ * @...: optional arguments to @format
+ *
+ *
+ * Returns number of characters written to @buf.
+ */
+int sysfs_emit(char *buf, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ if (WARN(!buf || offset_in_page(buf),
+ "invalid sysfs_emit: buf:%p\n", buf))
+ return 0;
+
+ va_start(args, fmt);
+ len = vscnprintf(buf, PAGE_SIZE, fmt, args);
+ va_end(args);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(sysfs_emit);
+
+/**
+ * sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer.
+ * @buf: start of PAGE_SIZE buffer.
+ * @at: offset in @buf to start write in bytes
+ * @at must be >= 0 && < PAGE_SIZE
+ * @fmt: format
+ * @...: optional arguments to @fmt
+ *
+ *
+ * Returns number of characters written starting at &@buf[@at].
+ */
+int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE,
+ "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at))
+ return 0;
+
+ va_start(args, fmt);
+ len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args);
+ va_end(args);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(sysfs_emit_at);
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 0caa151cae4e..7878f145bf1b 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -159,6 +159,77 @@ struct tracefs_fs_info {
struct tracefs_mount_opts mount_opts;
};
+static void change_gid(struct dentry *dentry, kgid_t gid)
+{
+ if (!dentry->d_inode)
+ return;
+ dentry->d_inode->i_gid = gid;
+}
+
+/*
+ * Taken from d_walk, but without he need for handling renames.
+ * Nothing can be renamed while walking the list, as tracefs
+ * does not support renames. This is only called when mounting
+ * or remounting the file system, to set all the files to
+ * the given gid.
+ */
+static void set_gid(struct dentry *parent, kgid_t gid)
+{
+ struct dentry *this_parent;
+ struct list_head *next;
+
+ this_parent = parent;
+ spin_lock(&this_parent->d_lock);
+
+ change_gid(this_parent, gid);
+repeat:
+ next = this_parent->d_subdirs.next;
+resume:
+ while (next != &this_parent->d_subdirs) {
+ struct list_head *tmp = next;
+ struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+ next = tmp->next;
+
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+
+ change_gid(dentry, gid);
+
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&this_parent->d_lock);
+ spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
+ this_parent = dentry;
+ spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
+ goto repeat;
+ }
+ spin_unlock(&dentry->d_lock);
+ }
+ /*
+ * All done at this level ... ascend and resume the search.
+ */
+ rcu_read_lock();
+ascend:
+ if (this_parent != parent) {
+ struct dentry *child = this_parent;
+ this_parent = child->d_parent;
+
+ spin_unlock(&child->d_lock);
+ spin_lock(&this_parent->d_lock);
+
+ /* go into the first sibling still alive */
+ do {
+ next = child->d_child.next;
+ if (next == &this_parent->d_subdirs)
+ goto ascend;
+ child = list_entry(next, struct dentry, d_child);
+ } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
+ rcu_read_unlock();
+ goto resume;
+ }
+ rcu_read_unlock();
+ spin_unlock(&this_parent->d_lock);
+ return;
+}
+
static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
{
substring_t args[MAX_OPT_ARGS];
@@ -217,7 +288,9 @@ static int tracefs_apply_options(struct super_block *sb)
inode->i_mode |= opts->mode;
inode->i_uid = opts->uid;
- inode->i_gid = opts->gid;
+
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
return 0;
}
@@ -409,6 +482,8 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
inode->i_mode = mode;
inode->i_fop = fops ? fops : &tracefs_file_operations;
inode->i_private = data;
+ inode->i_uid = d_inode(dentry->d_parent)->i_uid;
+ inode->i_gid = d_inode(dentry->d_parent)->i_gid;
d_instantiate(dentry, inode);
fsnotify_create(dentry->d_parent->d_inode, dentry);
return end_creating(dentry);
@@ -427,9 +502,12 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ /* Do not set bits for OTH */
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP;
inode->i_op = ops;
inode->i_fop = &simple_dir_operations;
+ inode->i_uid = d_inode(dentry->d_parent)->i_uid;
+ inode->i_gid = d_inode(dentry->d_parent)->i_gid;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index b10418b5fb71..8be17a773196 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -342,7 +342,7 @@ int ubifs_init_authentication(struct ubifs_info *c)
ubifs_err(c, "hmac %s is bigger than maximum allowed hmac size (%d > %d)",
hmac_name, c->hmac_desc_len, UBIFS_HMAC_ARR_SZ);
err = -EINVAL;
- goto out_free_hash;
+ goto out_free_hmac;
}
err = crypto_shash_setkey(c->hmac_tfm, ukp->data, ukp->datalen);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index a5e5e9b9d4e3..83a173feb698 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -361,15 +361,18 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
{
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1};
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1};
struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
int err, instantiated = 0;
struct fscrypt_name nm;
/*
- * Budget request settings: new dirty inode, new direntry,
- * budget for dirtied inode will be released via writeback.
+ * Budget request settings: new inode, new direntry, changing the
+ * parent directory inode.
+ * Allocate budget separately for new dirtied inode, the budget will
+ * be released via writeback.
*/
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
@@ -439,6 +442,8 @@ out_inode:
make_bad_inode(inode);
if (!instantiated)
iput(inode);
+ else if (whiteout)
+ iput(*whiteout);
out_budg:
ubifs_release_budget(c, &req);
if (!instantiated)
@@ -955,7 +960,8 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
int err, sz_change;
- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1};
struct fscrypt_name nm;
/*
@@ -1330,6 +1336,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (flags & RENAME_WHITEOUT) {
union ubifs_dev_desc *dev = NULL;
+ struct ubifs_budget_req wht_req;
dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
if (!dev) {
@@ -1343,11 +1350,31 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_release;
}
+ spin_lock(&whiteout->i_lock);
whiteout->i_state |= I_LINKABLE;
+ spin_unlock(&whiteout->i_lock);
+
whiteout_ui = ubifs_inode(whiteout);
whiteout_ui->data = dev;
whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
ubifs_assert(c, !whiteout_ui->dirty);
+
+ memset(&wht_req, 0, sizeof(struct ubifs_budget_req));
+ wht_req.dirtied_ino = 1;
+ wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8);
+ /*
+ * To avoid deadlock between space budget (holds ui_mutex and
+ * waits wb work) and writeback work(waits ui_mutex), do space
+ * budget before ubifs inodes locked.
+ */
+ err = ubifs_budget_space(c, &wht_req);
+ if (err) {
+ iput(whiteout);
+ goto out_release;
+ }
+
+ /* Add the old_dentry size to the old_dir size. */
+ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm));
}
lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
@@ -1422,21 +1449,13 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (whiteout) {
- struct ubifs_budget_req wht_req = { .dirtied_ino = 1,
- .dirtied_ino_d = \
- ALIGN(ubifs_inode(whiteout)->data_len, 8) };
-
- err = ubifs_budget_space(c, &wht_req);
- if (err) {
- kfree(whiteout_ui->data);
- whiteout_ui->data_len = 0;
- iput(whiteout);
- goto out_release;
- }
-
inc_nlink(whiteout);
mark_inode_dirty(whiteout);
+
+ spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
+ spin_unlock(&whiteout->i_lock);
+
iput(whiteout);
}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 8dada89bbe4d..6069c63d833a 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1629,6 +1629,16 @@ static const char *ubifs_get_link(struct dentry *dentry,
return fscrypt_get_symlink(inode, ui->data, ui->data_len, done);
}
+static int ubifs_symlink_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ ubifs_getattr(path, stat, request_mask, query_flags);
+
+ if (IS_ENCRYPTED(d_inode(path->dentry)))
+ return fscrypt_symlink_getattr(path, stat);
+ return 0;
+}
+
const struct address_space_operations ubifs_file_address_operations = {
.readpage = ubifs_readpage,
.writepage = ubifs_writepage,
@@ -1654,7 +1664,7 @@ const struct inode_operations ubifs_file_inode_operations = {
const struct inode_operations ubifs_symlink_inode_operations = {
.get_link = ubifs_get_link,
.setattr = ubifs_setattr,
- .getattr = ubifs_getattr,
+ .getattr = ubifs_symlink_getattr,
#ifdef CONFIG_UBIFS_FS_XATTR
.listxattr = ubifs_listxattr,
#endif
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index eae9cf5a57b0..89b671ad0f9a 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -846,16 +846,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
*/
n = aligned_len >> c->max_write_shift;
if (n) {
- n <<= c->max_write_shift;
+ int m = n - 1;
+
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
wbuf->offs);
- err = ubifs_leb_write(c, wbuf->lnum, buf + written,
- wbuf->offs, n);
+
+ if (m) {
+ /* '(n-1)<<c->max_write_shift < len' is always true. */
+ m <<= c->max_write_shift;
+ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+ wbuf->offs, m);
+ if (err)
+ goto out;
+ wbuf->offs += m;
+ aligned_len -= m;
+ len -= m;
+ written += m;
+ }
+
+ /*
+ * The non-written len of buf may be less than 'n' because
+ * parameter 'len' is not 8 bytes aligned, so here we read
+ * min(len, n) bytes from buf.
+ */
+ n = 1 << c->max_write_shift;
+ memcpy(wbuf->buf, buf + written, min(len, n));
+ if (n > len) {
+ ubifs_assert(c, n - len < 8);
+ ubifs_pad(c, wbuf->buf + len, n - len);
+ }
+
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n);
if (err)
goto out;
wbuf->offs += n;
aligned_len -= n;
- len -= n;
+ len -= min(len, n);
written += n;
}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index eeb1be259888..2923d5a6a7c0 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -101,7 +101,7 @@ static int setflags(struct inode *inode, int flags)
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_budget_req req = { .dirtied_ino = 1,
- .dirtied_ino_d = ui->data_len };
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
err = ubifs_budget_space(c, &req);
if (err)
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 01fcf7975047..723a4b66a9b9 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -223,7 +223,8 @@ static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino)
*/
list_for_each_entry_reverse(r, &c->replay_list, list) {
ubifs_assert(c, r->sqnum >= rino->sqnum);
- if (key_inum(c, &r->key) == key_inum(c, &rino->key))
+ if (key_inum(c, &r->key) == key_inum(c, &rino->key) &&
+ key_type(c, &r->key) == UBIFS_INO_KEY)
return r->deletion == 0;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e49bd69dfc1c..b37c6b1e3325 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -257,6 +257,7 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
memset((void *)ui + sizeof(struct inode), 0,
sizeof(struct ubifs_inode) - sizeof(struct inode));
mutex_init(&ui->ui_mutex);
+ init_rwsem(&ui->xattr_sem);
spin_lock_init(&ui->ui_lock);
return &ui->vfs_inode;
};
@@ -820,8 +821,10 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[i].wbuf.jhead = i;
c->jheads[i].grouped = 1;
c->jheads[i].log_hash = ubifs_hash_get_desc(c);
- if (IS_ERR(c->jheads[i].log_hash))
+ if (IS_ERR(c->jheads[i].log_hash)) {
+ err = PTR_ERR(c->jheads[i].log_hash);
goto out;
+ }
}
/*
@@ -1832,7 +1835,6 @@ out:
kthread_stop(c->bgt);
c->bgt = NULL;
}
- free_wbufs(c);
kfree(c->write_reserve_buf);
c->write_reserve_buf = NULL;
vfree(c->ileb_buf);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c55f212dcb75..b3b7e3576e98 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -356,6 +356,7 @@ struct ubifs_gced_idx_leb {
* @ui_mutex: serializes inode write-back with the rest of VFS operations,
* serializes "clean <-> dirty" state changes, serializes bulk-read,
* protects @dirty, @bulk_read, @ui_size, and @xattr_size
+ * @xattr_sem: serilizes write operations (remove|set|create) on xattr
* @ui_lock: protects @synced_i_size
* @synced_i_size: synchronized size of inode, i.e. the value of inode size
* currently stored on the flash; used only for regular file
@@ -409,6 +410,7 @@ struct ubifs_inode {
unsigned int bulk_read:1;
unsigned int compr_type:2;
struct mutex ui_mutex;
+ struct rw_semaphore xattr_sem;
spinlock_t ui_lock;
loff_t synced_i_size;
loff_t ui_size;
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index a0b9b349efe6..09280796fc61 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -285,6 +285,7 @@ int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
if (!xent)
return -ENOMEM;
+ down_write(&ubifs_inode(host)->xattr_sem);
/*
* The extended attribute entries are stored in LNC, so multiple
* look-ups do not involve reading the flash.
@@ -319,6 +320,7 @@ int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
iput(inode);
out_free:
+ up_write(&ubifs_inode(host)->xattr_sem);
kfree(xent);
return err;
}
@@ -341,18 +343,19 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
if (!xent)
return -ENOMEM;
+ down_read(&ubifs_inode(host)->xattr_sem);
xent_key_init(c, &key, host->i_ino, &nm);
err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
if (err) {
if (err == -ENOENT)
err = -ENODATA;
- goto out_unlock;
+ goto out_cleanup;
}
inode = iget_xattr(c, le64_to_cpu(xent->inum));
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- goto out_unlock;
+ goto out_cleanup;
}
ui = ubifs_inode(inode);
@@ -374,7 +377,8 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
out_iput:
mutex_unlock(&ui->ui_mutex);
iput(inode);
-out_unlock:
+out_cleanup:
+ up_read(&ubifs_inode(host)->xattr_sem);
kfree(xent);
return err;
}
@@ -406,16 +410,21 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino,
dentry, size);
+ down_read(&host_ui->xattr_sem);
len = host_ui->xattr_names + host_ui->xattr_cnt;
- if (!buffer)
+ if (!buffer) {
/*
* We should return the minimum buffer size which will fit a
* null-terminated list of all the extended attribute names.
*/
- return len;
+ err = len;
+ goto out_err;
+ }
- if (len > size)
- return -ERANGE;
+ if (len > size) {
+ err = -ERANGE;
+ goto out_err;
+ }
lowest_xent_key(c, &key, host->i_ino);
while (1) {
@@ -437,8 +446,9 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
pxent = xent;
key_read(c, &xent->key, &key);
}
-
kfree(pxent);
+ up_read(&host_ui->xattr_sem);
+
if (err != -ENOENT) {
ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
@@ -446,6 +456,10 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
ubifs_assert(c, written <= size);
return written;
+
+out_err:
+ up_read(&host_ui->xattr_sem);
+ return err;
}
static int remove_xattr(struct ubifs_info *c, struct inode *host,
@@ -504,6 +518,7 @@ int ubifs_purge_xattrs(struct inode *host)
ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion",
host->i_ino);
+ down_write(&ubifs_inode(host)->xattr_sem);
lowest_xent_key(c, &key, host->i_ino);
while (1) {
xent = ubifs_tnc_next_ent(c, &key, &nm);
@@ -523,7 +538,7 @@ int ubifs_purge_xattrs(struct inode *host)
ubifs_ro_mode(c, err);
kfree(pxent);
kfree(xent);
- return err;
+ goto out_err;
}
ubifs_assert(c, ubifs_inode(xino)->xattr);
@@ -535,7 +550,7 @@ int ubifs_purge_xattrs(struct inode *host)
kfree(xent);
iput(xino);
ubifs_err(c, "cannot remove xattr, error %d", err);
- return err;
+ goto out_err;
}
iput(xino);
@@ -544,14 +559,19 @@ int ubifs_purge_xattrs(struct inode *host)
pxent = xent;
key_read(c, &xent->key, &key);
}
-
kfree(pxent);
+ up_write(&ubifs_inode(host)->xattr_sem);
+
if (err != -ENOENT) {
ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
}
return 0;
+
+out_err:
+ up_write(&ubifs_inode(host)->xattr_sem);
+ return err;
}
/**
@@ -594,6 +614,7 @@ static int ubifs_xattr_remove(struct inode *host, const char *name)
if (!xent)
return -ENOMEM;
+ down_write(&ubifs_inode(host)->xattr_sem);
xent_key_init(c, &key, host->i_ino, &nm);
err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
if (err) {
@@ -618,6 +639,7 @@ static int ubifs_xattr_remove(struct inode *host, const char *name)
iput(inode);
out_free:
+ up_write(&ubifs_inode(host)->xattr_sem);
kfree(xent);
return err;
}
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index c19dba45aa20..d0f92a52e3ba 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -31,6 +31,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bio.h>
+#include <linux/iversion.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -44,7 +45,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
struct fileIdentDesc *fi = NULL;
struct fileIdentDesc cfi;
udf_pblk_t block, iblock;
- loff_t nf_pos;
+ loff_t nf_pos, emit_pos = 0;
int flen;
unsigned char *fname = NULL, *copy_name = NULL;
unsigned char *nameptr;
@@ -58,6 +59,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
int i, num, ret = 0;
struct extent_position epos = { NULL, 0, {0, 0} };
struct super_block *sb = dir->i_sb;
+ bool pos_valid = false;
if (ctx->pos == 0) {
if (!dir_emit_dot(file, ctx))
@@ -68,6 +70,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
if (nf_pos >= size)
goto out;
+ /*
+ * Something changed since last readdir (either lseek was called or dir
+ * changed)? We need to verify the position correctly points at the
+ * beginning of some dir entry so that the directory parsing code does
+ * not get confused. Since UDF does not have any reliable way of
+ * identifying beginning of dir entry (names are under user control),
+ * we need to scan the directory from the beginning.
+ */
+ if (!inode_eq_iversion(dir, file->f_version)) {
+ emit_pos = nf_pos;
+ nf_pos = 0;
+ } else {
+ pos_valid = true;
+ }
+
fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
if (!fname) {
ret = -ENOMEM;
@@ -123,13 +140,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
while (nf_pos < size) {
struct kernel_lb_addr tloc;
+ loff_t cur_pos = nf_pos;
- ctx->pos = (nf_pos >> 2) + 1;
+ /* Update file position only if we got past the current one */
+ if (nf_pos >= emit_pos) {
+ ctx->pos = (nf_pos >> 2) + 1;
+ pos_valid = true;
+ }
fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
&elen, &offset);
if (!fi)
goto out;
+ /* Still not at offset where user asked us to read from? */
+ if (cur_pos < emit_pos)
+ continue;
liu = le16_to_cpu(cfi.lengthOfImpUse);
lfi = cfi.lengthFileIdent;
@@ -187,8 +212,11 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
} /* end while */
ctx->pos = (nf_pos >> 2) + 1;
+ pos_valid = true;
out:
+ if (pos_valid)
+ file->f_version = inode_query_iversion(dir);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 97a192eb9949..639aabf30eaf 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -258,10 +258,6 @@ int udf_expand_file_adinicb(struct inode *inode)
char *kaddr;
struct udf_inode_info *iinfo = UDF_I(inode);
int err;
- struct writeback_control udf_wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- };
WARN_ON_ONCE(!inode_is_locked(inode));
if (!iinfo->i_lenAlloc) {
@@ -305,8 +301,10 @@ int udf_expand_file_adinicb(struct inode *inode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
/* from now on we have normal address_space methods */
inode->i_data.a_ops = &udf_aops;
+ set_page_dirty(page);
+ unlock_page(page);
up_write(&iinfo->i_data_sem);
- err = inode->i_data.a_ops->writepage(page, &udf_wbc);
+ err = filemap_fdatawrite(inode->i_mapping);
if (err) {
/* Restore everything back so that we don't lose data... */
lock_page(page);
@@ -318,6 +316,7 @@ int udf_expand_file_adinicb(struct inode *inode)
unlock_page(page);
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
inode->i_data.a_ops = &udf_adinicb_aops;
+ iinfo->i_lenAlloc = inode->i_size;
up_write(&iinfo->i_data_sem);
}
put_page(page);
@@ -547,11 +546,14 @@ static int udf_do_extend_file(struct inode *inode,
udf_write_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
+
/*
- * We've rewritten the last extent but there may be empty
- * indirect extent after it - enter it.
+ * We've rewritten the last extent. If we are going to add
+ * more extents, we may need to enter possible following
+ * empty indirect extent.
*/
- udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
+ if (new_block_bytes || prealloc_len)
+ udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
}
/* Managed to do everything necessary? */
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 401e64cde1be..853bcff51043 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -173,13 +173,22 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
else
offset = le32_to_cpu(eahd->appAttrLocation);
- while (offset < iinfo->i_lenEAttr) {
+ while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) {
+ uint32_t attrLength;
+
gaf = (struct genericFormat *)&ea[offset];
+ attrLength = le32_to_cpu(gaf->attrLength);
+
+ /* Detect undersized elements and buffer overflows */
+ if ((attrLength < sizeof(*gaf)) ||
+ (attrLength > (iinfo->i_lenEAttr - offset)))
+ break;
+
if (le32_to_cpu(gaf->attrType) == type &&
gaf->attrSubtype == subtype)
return gaf;
else
- offset += le32_to_cpu(gaf->attrLength);
+ offset += attrLength;
}
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 77b6d89b9bcd..3c009562375d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/crc-itu-t.h>
#include <linux/exportfs.h>
+#include <linux/iversion.h>
static inline int udf_match(int len1, const unsigned char *name1, int len2,
const unsigned char *name2)
@@ -135,6 +136,8 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
mark_buffer_dirty_inode(fibh->ebh, inode);
mark_buffer_dirty_inode(fibh->sbh, inode);
}
+ inode_inc_iversion(inode);
+
return 0;
}
@@ -933,6 +936,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
iinfo->i_location.partitionReferenceNum,
0);
epos.bh = udf_tgetblk(sb, block);
+ if (unlikely(!epos.bh)) {
+ err = -ENOMEM;
+ goto out_no_entry;
+ }
lock_buffer(epos.bh);
memset(epos.bh->b_data, 0x00, bsize);
set_buffer_uptodate(epos.bh);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4aba4878ed96..5193b94c0683 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -57,6 +57,7 @@
#include <linux/crc-itu-t.h>
#include <linux/log2.h>
#include <asm/byteorder.h>
+#include <linux/iversion.h>
#include "udf_sb.h"
#include "udf_i.h"
@@ -108,16 +109,10 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
return NULL;
lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data;
partnum = le32_to_cpu(lvid->numOfPartitions);
- if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) -
- offsetof(struct logicalVolIntegrityDesc, impUse)) /
- (2 * sizeof(uint32_t)) < partnum) {
- udf_err(sb, "Logical volume integrity descriptor corrupted "
- "(numOfPartitions = %u)!\n", partnum);
- return NULL;
- }
/* The offset is to skip freeSpaceTable and sizeTable arrays */
offset = partnum * 2 * sizeof(uint32_t);
- return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]);
+ return (struct logicalVolIntegrityDescImpUse *)
+ (((uint8_t *)(lvid + 1)) + offset);
}
/* UDF filesystem type */
@@ -155,6 +150,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
init_rwsem(&ei->i_data_sem);
ei->cached_extent.lstart = -1;
spin_lock_init(&ei->i_extent_cache_lock);
+ inode_set_iversion(&ei->vfs_inode, 1);
return &ei->vfs_inode;
}
@@ -349,10 +345,10 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
if (sbi->s_anchor != 0)
seq_printf(seq, ",anchor=%u", sbi->s_anchor);
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
- seq_puts(seq, ",utf8");
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map)
+ if (sbi->s_nls_map)
seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset);
+ else
+ seq_puts(seq, ",iocharset=utf8");
return 0;
}
@@ -557,19 +553,24 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
/* Ignored (never implemented properly) */
break;
case Opt_utf8:
- uopt->flags |= (1 << UDF_FLAG_UTF8);
+ if (!remount) {
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
+ }
break;
case Opt_iocharset:
if (!remount) {
- if (uopt->nls_map)
- unload_nls(uopt->nls_map);
- /*
- * load_nls() failure is handled later in
- * udf_fill_super() after all options are
- * parsed.
- */
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
+ }
+ /* When nls_map is not loaded then UTF-8 is used */
+ if (!remount && strcmp(args[0].from, "utf8") != 0) {
uopt->nls_map = load_nls(args[0].from);
- uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
+ if (!uopt->nls_map) {
+ pr_err("iocharset %s not found\n",
+ args[0].from);
+ return 0;
+ }
}
break;
case Opt_uforget:
@@ -705,6 +706,7 @@ static int udf_check_vsd(struct super_block *sb)
struct buffer_head *bh = NULL;
int nsr = 0;
struct udf_sb_info *sbi;
+ loff_t session_offset;
sbi = UDF_SB(sb);
if (sb->s_blocksize < sizeof(struct volStructDesc))
@@ -712,7 +714,8 @@ static int udf_check_vsd(struct super_block *sb)
else
sectorsize = sb->s_blocksize;
- sector += (((loff_t)sbi->s_session) << sb->s_blocksize_bits);
+ session_offset = (loff_t)sbi->s_session << sb->s_blocksize_bits;
+ sector += session_offset;
udf_debug("Starting at sector %u (%lu byte sectors)\n",
(unsigned int)(sector >> sb->s_blocksize_bits),
@@ -757,8 +760,7 @@ static int udf_check_vsd(struct super_block *sb)
if (nsr > 0)
return 1;
- else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) ==
- VSD_FIRST_SECTOR_OFFSET)
+ else if (!bh && sector - session_offset == VSD_FIRST_SECTOR_OFFSET)
return -1;
else
return 0;
@@ -1547,6 +1549,7 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
struct udf_sb_info *sbi = UDF_SB(sb);
struct logicalVolIntegrityDesc *lvid;
int indirections = 0;
+ u32 parts, impuselen;
while (++indirections <= UDF_MAX_LVID_NESTING) {
final_bh = NULL;
@@ -1573,15 +1576,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data;
if (lvid->nextIntegrityExt.extLength == 0)
- return;
+ goto check;
loc = leea_to_cpu(lvid->nextIntegrityExt);
}
udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n",
UDF_MAX_LVID_NESTING);
+out_err:
brelse(sbi->s_lvid_bh);
sbi->s_lvid_bh = NULL;
+ return;
+check:
+ parts = le32_to_cpu(lvid->numOfPartitions);
+ impuselen = le32_to_cpu(lvid->lengthOfImpUse);
+ if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
+ sizeof(struct logicalVolIntegrityDesc) + impuselen +
+ 2 * parts * sizeof(u32) > sb->s_blocksize) {
+ udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
+ "ignoring.\n", parts, impuselen);
+ goto out_err;
+ }
}
/*
@@ -2144,21 +2159,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (!udf_parse_options((char *)options, &uopt, false))
goto parse_options_failure;
- if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
- uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
- udf_err(sb, "utf8 cannot be combined with iocharset\n");
- goto parse_options_failure;
- }
- if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) {
- uopt.nls_map = load_nls_default();
- if (!uopt.nls_map)
- uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP);
- else
- udf_debug("Using default NLS map\n");
- }
- if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP)))
- uopt.flags |= (1 << UDF_FLAG_UTF8);
-
fileset.logicalBlockNum = 0xFFFFFFFF;
fileset.partitionReferenceNum = 0xFFFF;
@@ -2313,8 +2313,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
error_out:
iput(sbi->s_vat_inode);
parse_options_failure:
- if (uopt.nls_map)
- unload_nls(uopt.nls_map);
+ unload_nls(uopt.nls_map);
if (lvid_open)
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
@@ -2364,8 +2363,7 @@ static void udf_put_super(struct super_block *sb)
sbi = UDF_SB(sb);
iput(sbi->s_vat_inode);
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
- unload_nls(sbi->s_nls_map);
+ unload_nls(sbi->s_nls_map);
if (!sb_rdonly(sb))
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 3d83be54c474..8eace7a633d3 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -20,8 +20,6 @@
#define UDF_FLAG_UNDELETE 6
#define UDF_FLAG_UNHIDE 7
#define UDF_FLAG_VARCONV 8
-#define UDF_FLAG_NLS_MAP 9
-#define UDF_FLAG_UTF8 10
#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */
#define UDF_FLAG_GID_FORGET 12
#define UDF_FLAG_UID_SET 13
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 5fcfa96463eb..622569007b53 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -177,7 +177,7 @@ static int udf_name_from_CS0(struct super_block *sb,
return 0;
}
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+ if (UDF_SB(sb)->s_nls_map)
conv_f = UDF_SB(sb)->s_nls_map->uni2char;
else
conv_f = NULL;
@@ -285,7 +285,7 @@ static int udf_name_to_CS0(struct super_block *sb,
if (ocu_max_len <= 0)
return 0;
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+ if (UDF_SB(sb)->s_nls_map)
conv_f = UDF_SB(sb)->s_nls_map->char2uni;
else
conv_f = NULL;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d99d166fd892..ec57bbb6bb05 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -32,11 +32,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly = 1;
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
-enum userfaultfd_state {
- UFFD_STATE_WAIT_API,
- UFFD_STATE_RUNNING,
-};
-
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
* to be in the same cacheline.
@@ -68,8 +63,6 @@ struct userfaultfd_ctx {
unsigned int flags;
/* features requested from the userspace */
unsigned int features;
- /* state machine */
- enum userfaultfd_state state;
/* released */
bool released;
/* memory mappings are changing because of non-cooperative event */
@@ -103,6 +96,14 @@ struct userfaultfd_wake_range {
unsigned long len;
};
+/* internal indication that UFFD_API ioctl was successfully executed */
+#define UFFD_FEATURE_INITIALIZED (1u << 31)
+
+static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
+{
+ return ctx->features & UFFD_FEATURE_INITIALIZED;
+}
+
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -699,7 +700,6 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
refcount_set(&ctx->refcount, 1);
ctx->flags = octx->flags;
- ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
ctx->released = false;
ctx->mmap_changing = false;
@@ -980,38 +980,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
poll_wait(file, &ctx->fd_wqh, wait);
- switch (ctx->state) {
- case UFFD_STATE_WAIT_API:
+ if (!userfaultfd_is_initialized(ctx))
return EPOLLERR;
- case UFFD_STATE_RUNNING:
- /*
- * poll() never guarantees that read won't block.
- * userfaults can be waken before they're read().
- */
- if (unlikely(!(file->f_flags & O_NONBLOCK)))
- return EPOLLERR;
- /*
- * lockless access to see if there are pending faults
- * __pollwait last action is the add_wait_queue but
- * the spin_unlock would allow the waitqueue_active to
- * pass above the actual list_add inside
- * add_wait_queue critical section. So use a full
- * memory barrier to serialize the list_add write of
- * add_wait_queue() with the waitqueue_active read
- * below.
- */
- ret = 0;
- smp_mb();
- if (waitqueue_active(&ctx->fault_pending_wqh))
- ret = EPOLLIN;
- else if (waitqueue_active(&ctx->event_wqh))
- ret = EPOLLIN;
-
- return ret;
- default:
- WARN_ON_ONCE(1);
+
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be waken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
return EPOLLERR;
- }
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = EPOLLIN;
+ else if (waitqueue_active(&ctx->event_wqh))
+ ret = EPOLLIN;
+
+ return ret;
}
static const struct file_operations userfaultfd_fops;
@@ -1205,7 +1200,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
struct uffd_msg msg;
int no_wait = file->f_flags & O_NONBLOCK;
- if (ctx->state == UFFD_STATE_WAIT_API)
+ if (!userfaultfd_is_initialized(ctx))
return -EINVAL;
for (;;) {
@@ -1272,23 +1267,21 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
}
static __always_inline int validate_range(struct mm_struct *mm,
- __u64 *start, __u64 len)
+ __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- *start = untagged_addr(*start);
-
- if (*start & ~PAGE_MASK)
+ if (start & ~PAGE_MASK)
return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
return -EINVAL;
- if (*start < mmap_min_addr)
+ if (start < mmap_min_addr)
return -EINVAL;
- if (*start >= task_size)
+ if (start >= task_size)
return -EINVAL;
- if (len > task_size - *start)
+ if (len > task_size - start)
return -EINVAL;
return 0;
}
@@ -1338,7 +1331,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
}
- ret = validate_range(mm, &uffdio_register.range.start,
+ ret = validate_range(mm, uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
goto out;
@@ -1527,7 +1520,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;
- ret = validate_range(mm, &uffdio_unregister.start,
+ ret = validate_range(mm, uffdio_unregister.start,
uffdio_unregister.len);
if (ret)
goto out;
@@ -1678,7 +1671,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
if (ret)
goto out;
@@ -1718,7 +1711,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
/*
@@ -1774,7 +1767,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
if (ret)
goto out;
@@ -1809,9 +1802,10 @@ out:
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
- * For the current set of features the bits just coincide
+ * For the current set of features the bits just coincide. Set
+ * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
*/
- return (unsigned int)user_features;
+ return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}
/*
@@ -1824,12 +1818,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
{
struct uffdio_api uffdio_api;
void __user *buf = (void __user *)arg;
+ unsigned int ctx_features;
int ret;
__u64 features;
- ret = -EINVAL;
- if (ctx->state != UFFD_STATE_WAIT_API)
- goto out;
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
@@ -1846,9 +1838,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
goto out;
- ctx->state = UFFD_STATE_RUNNING;
+
/* only enable the requested features for this uffd context */
- ctx->features = uffd_ctx_features(features);
+ ctx_features = uffd_ctx_features(features);
+ ret = -EINVAL;
+ if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
+ goto err_out;
+
ret = 0;
out:
return ret;
@@ -1865,7 +1861,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
int ret = -EINVAL;
struct userfaultfd_ctx *ctx = file->private_data;
- if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+ if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
return -EINVAL;
switch(cmd) {
@@ -1966,7 +1962,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
refcount_set(&ctx->refcount, 1);
ctx->flags = flags;
ctx->features = 0;
- ctx->state = UFFD_STATE_WAIT_API;
ctx->released = false;
ctx->mmap_changing = false;
ctx->mm = current->mm;
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index eabc6ac19906..1370bfd17e87 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -136,7 +136,7 @@ static int build_merkle_tree(struct inode *inode,
* (level 0) and ascending to the root node (level 'num_levels - 1').
* Then at the end (level 'num_levels'), calculate the root hash.
*/
- blocks = (inode->i_size + params->block_size - 1) >>
+ blocks = ((u64)inode->i_size + params->block_size - 1) >>
params->log_blocksize;
for (level = 0; level <= params->num_levels; level++) {
err = build_merkle_tree_level(inode, level, blocks, params,
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 63d1004b688c..6200826107c2 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -89,7 +89,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
*/
/* Compute number of levels and the number of blocks in each level */
- blocks = (inode->i_size + params->block_size - 1) >> log_blocksize;
+ blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize;
pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks);
while (blocks > 1) {
if (params->num_levels >= FS_VERITY_MAX_LEVELS) {
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 6c7354abd0ae..0ba7368b9a5f 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -100,9 +100,9 @@ xfs_trans_log_inode(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) {
+ if (inode->i_state & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index b3021d9b34a5..7b7a009425e2 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -714,7 +714,8 @@ xfs_ioc_space(
flags |= XFS_PREALLOC_CLEAR;
if (bf->l_start > XFS_ISIZE(ip)) {
error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
- bf->l_start - XFS_ISIZE(ip), 0);
+ bf->l_start - XFS_ISIZE(ip),
+ XFS_BMAPI_PREALLOC);
if (error)
goto out_unlock;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index dec511823fcb..ca8c763902b9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -839,7 +839,7 @@ xfs_setattr_size(
ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+ ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
oldsize = inode->i_size;
newsize = iattr->ia_size;