summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_file.c2
-rw-r--r--fs/9p/vfs_super.c3
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/namei.c7
-rw-r--r--fs/affs/super.c31
-rw-r--r--fs/affs/symlink.c7
-rw-r--r--fs/aio.c3
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/bfs/inode.c43
-rw-r--r--fs/binfmt_aout.c1
-rw-r--r--fs/binfmt_elf.c27
-rw-r--r--fs/binfmt_elf_fdpic.c16
-rw-r--r--fs/binfmt_flat.c1
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/binfmt_som.c1
-rw-r--r--fs/bio-integrity.c3
-rw-r--r--fs/bio.c7
-rw-r--r--fs/block_dev.c20
-rw-r--r--fs/btrfs/acl.c39
-rw-r--r--fs/btrfs/btrfs_inode.h5
-rw-r--r--fs/btrfs/ctree.c229
-rw-r--r--fs/btrfs/ctree.h40
-rw-r--r--fs/btrfs/dir-item.c19
-rw-r--r--fs/btrfs/disk-io.c40
-rw-r--r--fs/btrfs/extent-tree.c112
-rw-r--r--fs/btrfs/extent_io.c3
-rw-r--r--fs/btrfs/file.c729
-rw-r--r--fs/btrfs/inode.c624
-rw-r--r--fs/btrfs/ioctl.c43
-rw-r--r--fs/btrfs/ordered-data.c117
-rw-r--r--fs/btrfs/ordered-data.h5
-rw-r--r--fs/btrfs/relocation.c43
-rw-r--r--fs/btrfs/super.c20
-rw-r--r--fs/btrfs/transaction.c44
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-log.c86
-rw-r--r--fs/btrfs/volumes.c19
-rw-r--r--fs/btrfs/xattr.c80
-rw-r--r--fs/btrfs/xattr.h9
-rw-r--r--fs/cachefiles/security.c4
-rw-r--r--fs/char_dev.c4
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsglob.h1
-rw-r--r--fs/cifs/cifsproto.h6
-rw-r--r--fs/cifs/cifssmb.c19
-rw-r--r--fs/cifs/connect.c13
-rw-r--r--fs/cifs/dir.c20
-rw-r--r--fs/cifs/dns_resolve.c69
-rw-r--r--fs/cifs/dns_resolve.h6
-rw-r--r--fs/cifs/file.c11
-rw-r--r--fs/cifs/inode.c25
-rw-r--r--fs/cifs/readdir.c1
-rw-r--r--fs/cifs/sess.c10
-rw-r--r--fs/compat.c2
-rw-r--r--fs/debugfs/inode.c55
-rw-r--r--fs/devpts/inode.c16
-rw-r--r--fs/dlm/ast.c74
-rw-r--r--fs/dlm/ast.h4
-rw-r--r--fs/dlm/config.c24
-rw-r--r--fs/dlm/debug_fs.c2
-rw-r--r--fs/dlm/dir.c7
-rw-r--r--fs/dlm/dlm_internal.h11
-rw-r--r--fs/dlm/lock.c120
-rw-r--r--fs/dlm/lockspace.c15
-rw-r--r--fs/dlm/lowcomms.c6
-rw-r--r--fs/dlm/member.c8
-rw-r--r--fs/dlm/memory.c6
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/dlm/plock.c6
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/dlm/requestqueue.c2
-rw-r--r--fs/dlm/user.c22
-rw-r--r--fs/dlm/user.h4
-rw-r--r--fs/ecryptfs/crypto.c4
-rw-r--r--fs/ecryptfs/file.c72
-rw-r--r--fs/ecryptfs/inode.c129
-rw-r--r--fs/ecryptfs/messaging.c17
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/exec.c75
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exofs/inode.c17
-rw-r--r--fs/ext3/inode.c18
-rw-r--r--fs/ext3/super.c2
-rw-r--r--fs/ext3/xattr.c7
-rw-r--r--fs/ext4/balloc.c11
-rw-r--r--fs/ext4/block_validity.c2
-rw-r--r--fs/ext4/dir.c12
-rw-r--r--fs/ext4/ext4.h179
-rw-r--r--fs/ext4/ext4_extents.h3
-rw-r--r--fs/ext4/ext4_jbd2.c2
-rw-r--r--fs/ext4/ext4_jbd2.h27
-rw-r--r--fs/ext4/extents.c288
-rw-r--r--fs/ext4/file.c6
-rw-r--r--fs/ext4/fsync.c93
-rw-r--r--fs/ext4/ialloc.c98
-rw-r--r--fs/ext4/inode.c507
-rw-r--r--fs/ext4/ioctl.c66
-rw-r--r--fs/ext4/mballoc.c152
-rw-r--r--fs/ext4/mballoc.h2
-rw-r--r--fs/ext4/migrate.c41
-rw-r--r--fs/ext4/move_extent.c303
-rw-r--r--fs/ext4/namei.c62
-rw-r--r--fs/ext4/resize.c5
-rw-r--r--fs/ext4/super.c135
-rw-r--r--fs/ext4/xattr.c35
-rw-r--r--fs/fat/namei_vfat.c6
-rw-r--r--fs/fcntl.c102
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/fs-writeback.c31
-rw-r--r--fs/fuse/dev.c16
-rw-r--r--fs/fuse/file.c3
-rw-r--r--fs/gfs2/acl.c170
-rw-r--r--fs/gfs2/acl.h24
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/gfs2/file.c9
-rw-r--r--fs/gfs2/xattr.c18
-rw-r--r--fs/hfs/catalog.c4
-rw-r--r--fs/hfs/dir.c11
-rw-r--r--fs/hfs/super.c7
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd2/checkpoint.c15
-rw-r--r--fs/jbd2/commit.c23
-rw-r--r--fs/jbd2/journal.c15
-rw-r--r--fs/jffs2/gc.c3
-rw-r--r--fs/jfs/resize.c6
-rw-r--r--fs/jfs/xattr.c87
-rw-r--r--fs/libfs.c3
-rw-r--r--fs/namei.c15
-rw-r--r--fs/namespace.c9
-rw-r--r--fs/nfs/client.c7
-rw-r--r--fs/nfs/delegation.h6
-rw-r--r--fs/nfs/dir.c6
-rw-r--r--fs/nfs/direct.c3
-rw-r--r--fs/nfs/dns_resolve.c18
-rw-r--r--fs/nfs/file.c13
-rw-r--r--fs/nfs/fscache.c9
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/mount_clnt.c2
-rw-r--r--fs/nfs/nfs2xdr.c2
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4proc.c28
-rw-r--r--fs/nfs/nfs4xdr.c12
-rw-r--r--fs/nfs/pagelist.c40
-rw-r--r--fs/nfs/super.c46
-rw-r--r--fs/nfs/write.c6
-rw-r--r--fs/nfsd/nfs4acl.c2
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/nfsd/nfs4xdr.c20
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/vfs.c5
-rw-r--r--fs/nilfs2/super.c1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c7
-rw-r--r--fs/notify/inotify/inotify_user.c20
-rw-r--r--fs/ocfs2/acl.c108
-rw-r--r--fs/ocfs2/alloc.c10
-rw-r--r--fs/ocfs2/alloc.h5
-rw-r--r--fs/ocfs2/aops.c5
-rw-r--r--fs/ocfs2/buffer_head_io.c2
-rw-r--r--fs/ocfs2/dlm/dlmfs.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c9
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c114
-rw-r--r--fs/ocfs2/inode.c7
-rw-r--r--fs/ocfs2/locks.c2
-rw-r--r--fs/ocfs2/refcounttree.c140
-rw-r--r--fs/ocfs2/suballoc.c13
-rw-r--r--fs/ocfs2/super.c4
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/partitions/efi.c30
-rw-r--r--fs/partitions/efi.h8
-rw-r--r--fs/partitions/ibm.c13
-rw-r--r--fs/partitions/msdos.c85
-rw-r--r--fs/proc/array.c97
-rw-r--r--fs/proc/base.c31
-rw-r--r--fs/proc/task_mmu.c28
-rw-r--r--fs/quota/dquot.c265
-rw-r--r--fs/reiserfs/dir.c2
-rw-r--r--fs/reiserfs/inode.c17
-rw-r--r--fs/reiserfs/journal.c15
-rw-r--r--fs/reiserfs/xattr.c19
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/romfs/super.c1
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/splice.c12
-rw-r--r--fs/stat.c10
-rw-r--r--fs/super.c3
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/sysfs/inode.c47
-rw-r--r--fs/udf/super.c32
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c118
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c65
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c285
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h8
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c3
-rw-r--r--fs/xfs/xfs_ag.h1
-rw-r--r--fs/xfs/xfs_alloc.c52
-rw-r--r--fs/xfs/xfs_dfrag.c111
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_iget.c31
-rw-r--r--fs/xfs/xfs_inode.c17
-rw-r--r--fs/xfs/xfs_iomap.c9
-rw-r--r--fs/xfs/xfs_log_recover.c24
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h2
-rw-r--r--fs/xfs/xfs_rw.h7
-rw-r--r--fs/xfs/xfs_trans.c7
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_vnodeops.c179
-rw-r--r--fs/xfs/xfs_vnodeops.h1
213 files changed, 5201 insertions, 3192 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3902bf43a088..5fb43bd19688 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
/* No mandatory locks */
- if (__mandatory_lock(inode))
+ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 14a86448572c..69357c0d9899 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -188,7 +188,8 @@ static void v9fs_kill_super(struct super_block *s)
P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
- v9fs_dentry_release(s->s_root); /* clunk root */
+ if (s->s_root)
+ v9fs_dentry_release(s->s_root); /* clunk root */
kill_anon_super(s);
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e511dc621a2e..0e40caaba456 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -106,8 +106,8 @@ struct affs_sb_info {
u32 s_last_bmap;
struct buffer_head *s_bmap_bh;
char *s_prefix; /* Prefix for volumes and assigns. */
- int s_prefix_len; /* Length of prefix. */
char s_volume[32]; /* Volume prefix for absolute symlinks. */
+ spinlock_t symlink_lock; /* protects the previous two */
};
#define SF_INTL 0x0001 /* International filesystem. */
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 960d336ec694..d70bbbac6b7b 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -341,10 +341,13 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
p = (char *)AFFS_HEAD(bh)->table;
lc = '/';
if (*symname == '/') {
+ struct affs_sb_info *sbi = AFFS_SB(sb);
while (*symname == '/')
symname++;
- while (AFFS_SB(sb)->s_volume[i]) /* Cannot overflow */
- *p++ = AFFS_SB(sb)->s_volume[i++];
+ spin_lock(&sbi->symlink_lock);
+ while (sbi->s_volume[i]) /* Cannot overflow */
+ *p++ = sbi->s_volume[i++];
+ spin_unlock(&sbi->symlink_lock);
}
while (i < maxlen && (c = *symname++)) {
if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 104fdcb3a7fc..d41e9673cd97 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -203,7 +203,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
switch (token) {
case Opt_bs:
if (match_int(&args[0], &n))
- return -EINVAL;
+ return 0;
if (n != 512 && n != 1024 && n != 2048
&& n != 4096) {
printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
@@ -213,7 +213,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
break;
case Opt_mode:
if (match_octal(&args[0], &option))
- return 1;
+ return 0;
*mode = option & 0777;
*mount_opts |= SF_SETMODE;
break;
@@ -221,8 +221,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
*mount_opts |= SF_MUFS;
break;
case Opt_prefix:
- /* Free any previous prefix */
- kfree(*prefix);
*prefix = match_strdup(&args[0]);
if (!*prefix)
return 0;
@@ -233,21 +231,21 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
break;
case Opt_reserved:
if (match_int(&args[0], reserved))
- return 1;
+ return 0;
break;
case Opt_root:
if (match_int(&args[0], root))
- return 1;
+ return 0;
break;
case Opt_setgid:
if (match_int(&args[0], &option))
- return 1;
+ return 0;
*gid = option;
*mount_opts |= SF_SETGID;
break;
case Opt_setuid:
if (match_int(&args[0], &option))
- return -EINVAL;
+ return 0;
*uid = option;
*mount_opts |= SF_SETUID;
break;
@@ -311,11 +309,14 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
sb->s_fs_info = sbi;
mutex_init(&sbi->s_bmlock);
+ spin_lock_init(&sbi->symlink_lock);
if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
&blocksize,&sbi->s_prefix,
sbi->s_volume, &mount_flags)) {
printk(KERN_ERR "AFFS: Error parsing options\n");
+ kfree(sbi->s_prefix);
+ kfree(sbi);
return -EINVAL;
}
/* N.B. after this point s_prefix must be released */
@@ -516,14 +517,18 @@ affs_remount(struct super_block *sb, int *flags, char *data)
unsigned long mount_flags;
int res = 0;
char *new_opts = kstrdup(data, GFP_KERNEL);
+ char volume[32];
+ char *prefix = NULL;
pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
*flags |= MS_NODIRATIME;
+ memcpy(volume, sbi->s_volume, 32);
if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
- &blocksize, &sbi->s_prefix, sbi->s_volume,
+ &blocksize, &prefix, volume,
&mount_flags)) {
+ kfree(prefix);
kfree(new_opts);
return -EINVAL;
}
@@ -534,6 +539,14 @@ affs_remount(struct super_block *sb, int *flags, char *data)
sbi->s_mode = mode;
sbi->s_uid = uid;
sbi->s_gid = gid;
+ /* protect against readers */
+ spin_lock(&sbi->symlink_lock);
+ if (prefix) {
+ kfree(sbi->s_prefix);
+ sbi->s_prefix = prefix;
+ }
+ memcpy(sbi->s_volume, volume, 32);
+ spin_unlock(&sbi->symlink_lock);
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
unlock_kernel();
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 41782539c907..ee00f08c4f53 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -20,7 +20,6 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
int i, j;
char c;
char lc;
- char *pf;
pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino);
@@ -32,11 +31,15 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
j = 0;
lf = (struct slink_front *)bh->b_data;
lc = 0;
- pf = AFFS_SB(inode->i_sb)->s_prefix ? AFFS_SB(inode->i_sb)->s_prefix : "/";
if (strchr(lf->symname,':')) { /* Handle assign or volume name */
+ struct affs_sb_info *sbi = AFFS_SB(inode->i_sb);
+ char *pf;
+ spin_lock(&sbi->symlink_lock);
+ pf = sbi->s_prefix ? sbi->s_prefix : "/";
while (i < 1023 && (c = pf[i]))
link[i++] = c;
+ spin_unlock(&sbi->symlink_lock);
while (i < 1023 && lf->symname[j] != ':')
link[i++] = lf->symname[j++];
if (i < 1023)
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c9340573..b84a7695358d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1639,6 +1639,9 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
if (unlikely(nr < 0))
return -EINVAL;
+ if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
+ nr = LONG_MAX/sizeof(*iocbpp);
+
if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
return -EFAULT;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 33baf27fac78..34ddda888e63 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -873,6 +873,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
unacquire_priv_sbp:
+ kfree(befs_sb->mount_opts.iocharset);
kfree(sb->s_fs_info);
unacquire_none:
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 6f60336c6628..8f3d9fd89604 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -353,35 +353,35 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
struct inode *inode;
unsigned i, imap_len;
struct bfs_sb_info *info;
- long ret = -EINVAL;
+ int ret = -EINVAL;
unsigned long i_sblock, i_eblock, i_eoff, s_size;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
return -ENOMEM;
+ mutex_init(&info->bfs_lock);
s->s_fs_info = info;
sb_set_blocksize(s, BFS_BSIZE);
- bh = sb_bread(s, 0);
- if(!bh)
+ info->si_sbh = sb_bread(s, 0);
+ if (!info->si_sbh)
goto out;
- bfs_sb = (struct bfs_super_block *)bh->b_data;
+ bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data;
if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
if (!silent)
printf("No BFS filesystem on %s (magic=%08x)\n",
s->s_id, le32_to_cpu(bfs_sb->s_magic));
- goto out;
+ goto out1;
}
if (BFS_UNCLEAN(bfs_sb, s) && !silent)
printf("%s is unclean, continuing\n", s->s_id);
s->s_magic = BFS_MAGIC;
- info->si_sbh = bh;
if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
printf("Superblock is corrupted\n");
- goto out;
+ goto out1;
}
info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
@@ -390,7 +390,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
imap_len = (info->si_lasti / 8) + 1;
info->si_imap = kzalloc(imap_len, GFP_KERNEL);
if (!info->si_imap)
- goto out;
+ goto out1;
for (i = 0; i < BFS_ROOT_INO; i++)
set_bit(i, info->si_imap);
@@ -398,15 +398,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
inode = bfs_iget(s, BFS_ROOT_INO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- kfree(info->si_imap);
- goto out;
+ goto out2;
}
s->s_root = d_alloc_root(inode);
if (!s->s_root) {
iput(inode);
ret = -ENOMEM;
- kfree(info->si_imap);
- goto out;
+ goto out2;
}
info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
@@ -419,10 +417,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
bh = sb_bread(s, info->si_blocks - 1);
if (!bh) {
printf("Last block not available: %lu\n", info->si_blocks - 1);
- iput(inode);
ret = -EIO;
- kfree(info->si_imap);
- goto out;
+ goto out3;
}
brelse(bh);
@@ -459,11 +455,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
printf("Inode 0x%08x corrupted\n", i);
brelse(bh);
- s->s_root = NULL;
- kfree(info->si_imap);
- kfree(info);
- s->s_fs_info = NULL;
- return -EIO;
+ ret = -EIO;
+ goto out3;
}
if (!di->i_ino) {
@@ -483,11 +476,17 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
s->s_dirt = 1;
}
dump_imap("read_super", s);
- mutex_init(&info->bfs_lock);
return 0;
+out3:
+ dput(s->s_root);
+ s->s_root = NULL;
+out2:
+ kfree(info->si_imap);
+out1:
+ brelse(info->si_sbh);
out:
- brelse(bh);
+ mutex_destroy(&info->bfs_lock);
kfree(info);
s->s_fs_info = NULL;
return ret;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index b639dcf7c778..0133b5a30413 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -263,6 +263,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
#else
set_personality(PER_LINUX);
#endif
+ setup_new_exec(bprm);
current->mm->end_code = ex.a_text +
(current->mm->start_code = N_TXTADDR(ex));
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb51b1e4..1ed37ba72343 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -662,27 +662,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
goto out_free_interp;
- /*
- * The early SET_PERSONALITY here is so that the lookup
- * for the interpreter happens in the namespace of the
- * to-be-execed image. SET_PERSONALITY can select an
- * alternate root.
- *
- * However, SET_PERSONALITY is NOT allowed to switch
- * this task into the new images's memory mapping
- * policy - that is, TASK_SIZE must still evaluate to
- * that which is appropriate to the execing application.
- * This is because exit_mmap() needs to have TASK_SIZE
- * evaluate to the size of the old image.
- *
- * So if (say) a 64-bit application is execing a 32-bit
- * application it is the architecture's responsibility
- * to defer changing the value of TASK_SIZE until the
- * switch really is going to happen - do this in
- * flush_thread(). - akpm
- */
- SET_PERSONALITY(loc->elf_ex);
-
interpreter = open_exec(elf_interpreter);
retval = PTR_ERR(interpreter);
if (IS_ERR(interpreter))
@@ -730,9 +709,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
/* Verify the interpreter has a valid arch */
if (!elf_check_arch(&loc->interp_elf_ex))
goto out_free_dentry;
- } else {
- /* Executables without an interpreter also need a personality */
- SET_PERSONALITY(loc->elf_ex);
}
/* Flush all traces of the currently running executable */
@@ -752,7 +728,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
current->flags |= PF_RANDOMIZE;
- arch_pick_mmap_layout(current->mm);
+
+ setup_new_exec(bprm);
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 38502c67987c..e7a0bb47252c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -171,6 +171,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
#ifdef ELF_FDPIC_PLAT_INIT
unsigned long dynaddr;
#endif
+#ifndef CONFIG_MMU
+ unsigned long stack_prot;
+#endif
struct file *interpreter = NULL; /* to shut gcc up */
char *interpreter_name = NULL;
int executable_stack;
@@ -316,6 +319,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
* defunct, deceased, etc. after this point we have to exit via
* error_kill */
set_personality(PER_LINUX_FDPIC);
+ if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
+ current->personality |= READ_IMPLIES_EXEC;
+
+ setup_new_exec(bprm);
+
set_binfmt(&elf_fdpic_format);
current->mm->start_code = 0;
@@ -377,9 +385,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
if (stack_size < PAGE_SIZE * 2)
stack_size = PAGE_SIZE * 2;
+ stack_prot = PROT_READ | PROT_WRITE;
+ if (executable_stack == EXSTACK_ENABLE_X ||
+ (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
+ stack_prot |= PROT_EXEC;
+
down_write(&current->mm->mmap_sem);
- current->mm->start_brk = do_mmap(NULL, 0, stack_size,
- PROT_READ | PROT_WRITE | PROT_EXEC,
+ current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN,
0);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a2796651e756..ca88c4687b84 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm,
/* OK, This is the point of no return */
set_personality(PER_LINUX_32BIT);
+ setup_new_exec(bprm);
}
/*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index c4e83537ead7..42b60b04ea06 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -723,7 +723,7 @@ static int __init init_misc_binfmt(void)
{
int err = register_filesystem(&bm_fs_type);
if (!err) {
- err = register_binfmt(&misc_format);
+ err = insert_binfmt(&misc_format);
if (err)
unregister_filesystem(&bm_fs_type);
}
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index eff74b9c9e77..35cf0023ddb8 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
/* OK, This is the point of no return */
current->flags &= ~PF_FORKNOEXEC;
current->personality = PER_HPUX;
+ setup_new_exec(bprm);
/* Set the task size for HP-UX processes such that
* the gateway page is outside the address space.
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 49a34e7f7306..a16f29e888cd 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -61,7 +61,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr)
static inline int use_bip_pool(unsigned int idx)
{
- if (idx == BIOVEC_NR_POOLS)
+ if (idx == BIOVEC_MAX_IDX)
return 1;
return 0;
@@ -95,6 +95,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
/* Use mempool if lower order alloc failed or max vecs were requested */
if (bip == NULL) {
+ idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */
bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
if (unlikely(bip == NULL)) {
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..e0c9e71cc404 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -542,13 +542,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
+ unsigned int prev_bv_len = prev->bv_len;
prev->bv_len += len;
if (q->merge_bvec_fn) {
struct bvec_merge_data bvm = {
+ /* prev_bvec is already charged in
+ bi_size, discharge it in order to
+ simulate merging updated prev_bvec
+ as new bvec. */
.bi_bdev = bio->bi_bdev,
.bi_sector = bio->bi_sector,
- .bi_size = bio->bi_size,
+ .bi_size = bio->bi_size - prev_bv_len,
.bi_rw = bio->bi_rw,
};
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8bed0557d88c..e65efa2f267a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -246,7 +246,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
if (!sb)
goto out;
if (sb->s_flags & MS_RDONLY) {
- deactivate_locked_super(sb);
+ sb->s_frozen = SB_FREEZE_TRANS;
+ up_write(&sb->s_umount);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return sb;
}
@@ -307,7 +308,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
BUG_ON(sb->s_bdev != bdev);
down_write(&sb->s_umount);
if (sb->s_flags & MS_RDONLY)
- goto out_deactivate;
+ goto out_unfrozen;
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
@@ -321,11 +322,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
}
}
+out_unfrozen:
sb->s_frozen = SB_UNFROZEN;
smp_wmb();
wake_up(&sb->s_wait_unfrozen);
-out_deactivate:
if (sb)
deactivate_locked_super(sb);
out_unlock:
@@ -403,7 +404,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
* NULL first argument is nfsd_sync_dir() and that's not a directory.
*/
-static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
return sync_blockdev(I_BDEV(filp->f_mapping->host));
}
@@ -422,6 +423,7 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
return NULL;
return &ei->vfs_inode;
}
+EXPORT_SYMBOL(block_fsync);
static void bdev_destroy_inode(struct inode *inode)
{
@@ -1173,10 +1175,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
/*
* hooks: /n/, see "layering violations".
*/
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0) {
- bdput(bdev);
- return ret;
+ if (!for_part) {
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret != 0) {
+ bdput(bdev);
+ return ret;
+ }
}
lock_kernel();
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 361604244271..12d7be8df561 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -94,7 +94,8 @@ static int btrfs_xattr_get_acl(struct inode *inode, int type,
/*
* Needs to be called with fs_mutex held
*/
-static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static int btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@@ -111,12 +112,14 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
switch (type) {
case ACL_TYPE_ACCESS:
mode = inode->i_mode;
- ret = posix_acl_equiv_mode(acl, &mode);
- if (ret < 0)
- return ret;
- ret = 0;
- inode->i_mode = mode;
name = POSIX_ACL_XATTR_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &mode);
+ if (ret < 0)
+ return ret;
+ inode->i_mode = mode;
+ }
+ ret = 0;
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
@@ -140,8 +143,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
goto out;
}
- ret = __btrfs_setxattr(inode, name, value, size, 0);
-
+ ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
out:
kfree(value);
@@ -154,9 +156,12 @@ out:
static int btrfs_xattr_set_acl(struct inode *inode, int type,
const void *value, size_t size)
{
- int ret = 0;
+ int ret;
struct posix_acl *acl = NULL;
+ if (!is_owner_or_cap(inode))
+ return -EPERM;
+
if (value) {
acl = posix_acl_from_xattr(value, size);
if (acl == NULL) {
@@ -167,7 +172,7 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type,
}
}
- ret = btrfs_set_acl(inode, acl, type);
+ ret = btrfs_set_acl(NULL, inode, acl, type);
posix_acl_release(acl);
@@ -221,7 +226,8 @@ int btrfs_check_acl(struct inode *inode, int mask)
* stuff has been fixed to work with that. If the locking stuff changes, we
* need to re-evaluate the acl locking stuff.
*/
-int btrfs_init_acl(struct inode *inode, struct inode *dir)
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
{
struct posix_acl *acl = NULL;
int ret = 0;
@@ -246,7 +252,8 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
mode_t mode;
if (S_ISDIR(inode->i_mode)) {
- ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
+ ret = btrfs_set_acl(trans, inode, acl,
+ ACL_TYPE_DEFAULT);
if (ret)
goto failed;
}
@@ -261,10 +268,11 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
inode->i_mode = mode;
if (ret > 0) {
/* we need an acl */
- ret = btrfs_set_acl(inode, clone,
+ ret = btrfs_set_acl(trans, inode, clone,
ACL_TYPE_ACCESS);
}
}
+ posix_acl_release(clone);
}
failed:
posix_acl_release(acl);
@@ -294,7 +302,7 @@ int btrfs_acl_chmod(struct inode *inode)
ret = posix_acl_chmod_masq(clone, inode->i_mode);
if (!ret)
- ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
+ ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
posix_acl_release(clone);
@@ -320,7 +328,8 @@ int btrfs_acl_chmod(struct inode *inode)
return 0;
}
-int btrfs_init_acl(struct inode *inode, struct inode *dir)
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
{
return 0;
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f6783a42f010..3f1f50d9d916 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,9 +44,6 @@ struct btrfs_inode {
*/
struct extent_io_tree io_failure_tree;
- /* held while inesrting or deleting extents from files */
- struct mutex extent_mutex;
-
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
@@ -166,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
static inline void btrfs_i_size_write(struct inode *inode, u64 size)
{
- inode->i_size = size;
+ i_size_write(inode, size);
BTRFS_I(inode)->disk_i_size = size;
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ec96f3a6d536..c4bc570a396e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -37,6 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *src_buf);
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
+static int setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr);
+
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -451,9 +456,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
extent_buffer_get(cow);
spin_unlock(&root->node_lock);
- btrfs_free_extent(trans, root, buf->start, buf->len,
- parent_start, root->root_key.objectid,
- level, 0);
+ btrfs_free_tree_block(trans, root, buf->start, buf->len,
+ parent_start, root->root_key.objectid, level);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -468,9 +472,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
- btrfs_free_extent(trans, root, buf->start, buf->len,
- parent_start, root->root_key.objectid,
- level, 0);
+ btrfs_free_tree_block(trans, root, buf->start, buf->len,
+ parent_start, root->root_key.objectid, level);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -1030,8 +1033,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
- ret = btrfs_free_extent(trans, root, mid->start, mid->len,
- 0, root->root_key.objectid, level, 1);
+ ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
+ 0, root->root_key.objectid, level);
/* once for the root ptr */
free_extent_buffer(mid);
return ret;
@@ -1095,10 +1098,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1);
if (wret)
ret = wret;
- wret = btrfs_free_extent(trans, root, bytenr,
- blocksize, 0,
- root->root_key.objectid,
- level, 0);
+ wret = btrfs_free_tree_block(trans, root,
+ bytenr, blocksize, 0,
+ root->root_key.objectid,
+ level);
if (wret)
ret = wret;
} else {
@@ -1143,9 +1146,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
wret = del_ptr(trans, root, path, level + 1, pslot);
if (wret)
ret = wret;
- wret = btrfs_free_extent(trans, root, bytenr, blocksize,
- 0, root->root_key.objectid,
- level, 0);
+ wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
+ 0, root->root_key.objectid, level);
if (wret)
ret = wret;
} else {
@@ -2997,75 +2999,85 @@ again:
return ret;
}
-/*
- * This function splits a single item into two items,
- * giving 'new_key' to the new item and splitting the
- * old one at split_offset (from the start of the item).
- *
- * The path may be released by this operation. After
- * the split, the path is pointing to the old item. The
- * new item is going to be in the same node as the old one.
- *
- * Note, the item being split must be smaller enough to live alone on
- * a tree block with room for one extra struct btrfs_item
- *
- * This allows us to split the item in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_split_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *new_key,
- unsigned long split_offset)
+static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int ins_len)
{
- u32 item_size;
+ struct btrfs_key key;
struct extent_buffer *leaf;
- struct btrfs_key orig_key;
- struct btrfs_item *item;
- struct btrfs_item *new_item;
- int ret = 0;
- int slot;
- u32 nritems;
- u32 orig_offset;
- struct btrfs_disk_key disk_key;
- char *buf;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_len = 0;
+ u32 item_size;
+ int ret;
leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
- if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
- goto split;
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+ key.type != BTRFS_EXTENT_CSUM_KEY);
+
+ if (btrfs_leaf_free_space(root, leaf) >= ins_len)
+ return 0;
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+ }
btrfs_release_path(root, path);
- path->search_for_split = 1;
path->keep_locks = 1;
-
- ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
+ path->search_for_split = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
path->search_for_split = 0;
+ if (ret < 0)
+ goto err;
+ ret = -EAGAIN;
+ leaf = path->nodes[0];
/* if our item isn't there or got smaller, return now */
- if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
- path->slots[0])) {
- path->keep_locks = 0;
- return -EAGAIN;
+ if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ goto err;
+
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
+ goto err;
}
btrfs_set_path_blocking(path);
- ret = split_leaf(trans, root, &orig_key, path,
- sizeof(struct btrfs_item), 1);
- path->keep_locks = 0;
+ ret = split_leaf(trans, root, &key, path, ins_len, 1);
BUG_ON(ret);
+ path->keep_locks = 0;
btrfs_unlock_up_safe(path, 1);
+ return 0;
+err:
+ path->keep_locks = 0;
+ return ret;
+}
+
+static noinline int split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ struct btrfs_item *new_item;
+ int slot;
+ char *buf;
+ u32 nritems;
+ u32 item_size;
+ u32 orig_offset;
+ struct btrfs_disk_key disk_key;
+
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-split:
- /*
- * make sure any changes to the path from split_leaf leave it
- * in a blocking state
- */
btrfs_set_path_blocking(path);
item = btrfs_item_nr(leaf, path->slots[0]);
@@ -3073,19 +3085,19 @@ split:
item_size = btrfs_item_size(leaf, item);
buf = kmalloc(item_size, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
path->slots[0]), item_size);
- slot = path->slots[0] + 1;
- leaf = path->nodes[0];
+ slot = path->slots[0] + 1;
nritems = btrfs_header_nritems(leaf);
-
if (slot != nritems) {
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
-
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -3113,16 +3125,81 @@ split:
item_size - split_offset);
btrfs_mark_buffer_dirty(leaf);
- ret = 0;
- if (btrfs_leaf_free_space(root, leaf) < 0) {
- btrfs_print_leaf(root, leaf);
- BUG();
- }
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
kfree(buf);
+ return 0;
+}
+
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation. After
+ * the split, the path is pointing to the old item. The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be smaller enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ int ret;
+ ret = setup_leaf_for_split(trans, root, path,
+ sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ ret = split_item(trans, root, path, new_key, split_offset);
return ret;
}
/*
+ * This function duplicate a item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item
+ * is contiguous with the original item.
+ *
+ * This allows us to split file extent in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
+ item_size, item_size +
+ sizeof(struct btrfs_item), 1);
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
@@ -3714,8 +3791,8 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
*/
btrfs_unlock_up_safe(path, 0);
- ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
- 0, root->root_key.objectid, 0, 0);
+ ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
+ 0, root->root_key.objectid, 0);
return ret;
}
/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 444b3e9b92a4..9f806dd04c27 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -310,6 +310,9 @@ struct btrfs_header {
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) - \
sizeof(struct btrfs_file_extent_item))
+#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) -\
+ sizeof(struct btrfs_dir_item))
/*
@@ -859,8 +862,9 @@ struct btrfs_fs_info {
struct mutex ordered_operations_mutex;
struct rw_semaphore extent_commit_sem;
- struct rw_semaphore subvol_sem;
+ struct rw_semaphore cleanup_work_sem;
+ struct rw_semaphore subvol_sem;
struct srcu_struct subvol_srcu;
struct list_head trans_list;
@@ -868,6 +872,9 @@ struct btrfs_fs_info {
struct list_head dead_roots;
struct list_head caching_block_groups;
+ spinlock_t delayed_iput_lock;
+ struct list_head delayed_iputs;
+
atomic_t nr_async_submits;
atomic_t async_submit_draining;
atomic_t nr_async_bios;
@@ -1034,12 +1041,12 @@ struct btrfs_root {
int ref_cows;
int track_dirty;
int in_radix;
+ int clean_orphans;
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
int defrag_running;
- int defrag_level;
char *name;
int in_sysfs;
@@ -1975,6 +1982,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size);
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ u64 parent, u64 root_objectid, int level);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -2089,6 +2100,10 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_key *new_key,
unsigned long split_offset);
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key);
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_path *p, int
ins_len, int cow);
@@ -2196,9 +2211,10 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_dir_item *di);
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *name,
- u16 name_len, const void *data, u16 data_len,
- u64 dir);
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len);
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
@@ -2292,7 +2308,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct inode *inode, u64 new_size,
u32 min_type);
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
@@ -2332,6 +2348,8 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
int btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_add_delayed_iput(struct inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_root *root);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -2345,12 +2363,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- u64 start, u64 end, u64 locked_end,
- u64 inline_limit, u64 *hint_block, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
+ u64 start, u64 end, u64 *hint_byte, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
@@ -2380,7 +2395,8 @@ int btrfs_check_acl(struct inode *inode, int mask);
#else
#define btrfs_check_acl NULL
#endif
-int btrfs_init_acl(struct inode *inode, struct inode *dir);
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir);
int btrfs_acl_chmod(struct inode *inode);
/* relocation.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f3a6075519cc..e9103b3baa49 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
* into the tree
*/
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *name,
- u16 name_len, const void *data, u16 data_len,
- u64 dir)
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len)
{
int ret = 0;
- struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
unsigned long name_ptr, data_ptr;
struct btrfs_key key, location;
@@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
u32 data_size;
- key.objectid = dir;
+ BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
+
+ key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
key.offset = btrfs_name_hash(name, name_len);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- if (name_len + data_len + sizeof(struct btrfs_dir_item) >
- BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
- return -ENOSPC;
data_size = sizeof(*dir_item) + name_len + data_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
@@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, data, data_ptr, data_len);
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 02b6afbd7450..2b59201b955c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->stripesize = stripesize;
root->ref_cows = 0;
root->track_dirty = 0;
+ root->in_radix = 0;
+ root->clean_orphans = 0;
root->fs_info = fs_info;
root->objectid = objectid;
@@ -928,7 +930,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->defrag_trans_start = fs_info->generation;
init_completion(&root->kobj_unregister);
root->defrag_running = 0;
- root->defrag_level = 0;
root->root_key.objectid = objectid;
root->anon_super.s_root = NULL;
root->anon_super.s_dev = 0;
@@ -980,12 +981,12 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
while (1) {
ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY);
+ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
if (ret)
break;
- clear_extent_dirty(&log_root_tree->dirty_log_pages,
- start, end, GFP_NOFS);
+ clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
+ EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
}
eb = fs_info->log_root_tree->node;
@@ -1210,8 +1211,10 @@ again:
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
- if (ret == 0)
+ if (ret == 0) {
root->in_radix = 1;
+ root->clean_orphans = 1;
+ }
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
if (ret) {
@@ -1225,10 +1228,6 @@ again:
ret = btrfs_find_dead_roots(fs_info->tree_root,
root->root_key.objectid);
WARN_ON(ret);
-
- if (!(fs_info->sb->s_flags & MS_RDONLY))
- btrfs_orphan_cleanup(root);
-
return root;
fail:
free_fs_root(root);
@@ -1477,6 +1476,7 @@ static int cleaner_kthread(void *arg)
if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
mutex_trylock(&root->fs_info->cleaner_mutex)) {
+ btrfs_run_delayed_iputs(root);
btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
}
@@ -1606,6 +1606,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->delayed_iputs);
INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
INIT_LIST_HEAD(&fs_info->ordered_operations);
@@ -1614,6 +1615,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->new_trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->delayed_iput_lock);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
@@ -1689,6 +1691,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -1979,7 +1982,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_recover_relocation(tree_root);
- BUG_ON(ret);
+ if (ret < 0) {
+ printk(KERN_WARNING
+ "btrfs: failed to recover relocation\n");
+ err = -EINVAL;
+ goto fail_trans_kthread;
+ }
}
location.objectid = BTRFS_FS_TREE_OBJECTID;
@@ -1990,6 +1998,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!fs_info->fs_root)
goto fail_trans_kthread;
+ if (!(sb->s_flags & MS_RDONLY)) {
+ down_read(&fs_info->cleanup_work_sem);
+ btrfs_orphan_cleanup(fs_info->fs_root);
+ up_read(&fs_info->cleanup_work_sem);
+ }
+
return tree_root;
fail_trans_kthread:
@@ -2386,8 +2400,14 @@ int btrfs_commit_super(struct btrfs_root *root)
int ret;
mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(root);
btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ /* wait until ongoing cleanup work done */
+ down_write(&root->fs_info->cleanup_work_sem);
+ up_write(&root->fs_info->cleanup_work_sem);
+
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 94627c4cc193..559f72489b3b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+ atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+ if (atomic_dec_and_test(&cache->count))
+ kfree(cache);
+}
+
/*
* this adds the block group to the fs_info rb tree for the block group
* cache
@@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
}
}
if (ret)
- atomic_inc(&ret->count);
+ btrfs_get_block_group(ret);
spin_unlock(&info->block_group_cache_lock);
return ret;
@@ -195,6 +206,14 @@ static int exclude_super_stripes(struct btrfs_root *root,
int stripe_len;
int i, nr, ret;
+ if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
+ stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+ cache->bytes_super += stripe_len;
+ ret = add_excluded_extent(root, cache->key.objectid,
+ stripe_len);
+ BUG_ON(ret);
+ }
+
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
@@ -255,7 +274,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
if (ret)
break;
- if (extent_start == start) {
+ if (extent_start <= start) {
start = extent_end + 1;
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
@@ -399,6 +418,8 @@ err:
put_caching_control(caching_ctl);
atomic_dec(&block_group->space_info->caching_threads);
+ btrfs_put_block_group(block_group);
+
return 0;
}
@@ -439,6 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
up_write(&fs_info->extent_commit_sem);
atomic_inc(&cache->space_info->caching_threads);
+ btrfs_get_block_group(cache);
tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
cache->key.objectid);
@@ -478,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
return cache;
}
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
-{
- if (atomic_dec_and_test(&cache->count))
- kfree(cache);
-}
-
static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
u64 flags)
{
@@ -2574,7 +2590,7 @@ next_block_group(struct btrfs_root *root,
if (node) {
cache = rb_entry(node, struct btrfs_block_group_cache,
cache_node);
- atomic_inc(&cache->count);
+ btrfs_get_block_group(cache);
} else
cache = NULL;
spin_unlock(&root->fs_info->block_group_cache_lock);
@@ -2880,9 +2896,9 @@ static noinline void flush_delalloc_async(struct btrfs_work *work)
root = async->root;
info = async->info;
- btrfs_start_delalloc_inodes(root);
+ btrfs_start_delalloc_inodes(root, 0);
wake_up(&info->flush_wait);
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_ordered_extents(root, 0, 0);
spin_lock(&info->lock);
info->flushing = 0;
@@ -2956,8 +2972,8 @@ static void flush_delalloc(struct btrfs_root *root,
return;
flush:
- btrfs_start_delalloc_inodes(root);
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0, 0);
spin_lock(&info->lock);
info->flushing = 0;
@@ -3454,14 +3470,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
else
old_val -= num_bytes;
btrfs_set_super_bytes_used(&info->super_copy, old_val);
-
- /* block accounting for root item */
- old_val = btrfs_root_used(&root->root_item);
- if (alloc)
- old_val += num_bytes;
- else
- old_val -= num_bytes;
- btrfs_set_root_used(&root->root_item, old_val);
spin_unlock(&info->delalloc_lock);
while (total) {
@@ -4049,6 +4057,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
return ret;
}
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ u64 parent, u64 root_objectid, int level)
+{
+ u64 used;
+ spin_lock(&root->node_lock);
+ used = btrfs_root_used(&root->root_item) - blocksize;
+ btrfs_set_root_used(&root->root_item, used);
+ spin_unlock(&root->node_lock);
+
+ return btrfs_free_extent(trans, root, bytenr, blocksize,
+ parent, root_objectid, level, 0);
+}
+
static u64 stripe_align(struct btrfs_root *root, u64 val)
{
u64 mask = ((u64)root->stripesize - 1);
@@ -4212,7 +4235,7 @@ search:
u64 offset;
int cached;
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
search_start = block_group->key.objectid;
have_block_group:
@@ -4300,7 +4323,7 @@ have_block_group:
btrfs_put_block_group(block_group);
block_group = last_ptr->block_group;
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
spin_unlock(&last_ptr->lock);
spin_unlock(&last_ptr->refill_lock);
@@ -4578,7 +4601,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
{
int ret;
u64 search_start = 0;
- struct btrfs_fs_info *info = root->fs_info;
data = btrfs_get_alloc_profile(root, data);
again:
@@ -4586,17 +4608,9 @@ again:
* the only place that sets empty_size is btrfs_realloc_node, which
* is not called recursively on allocations
*/
- if (empty_size || root->ref_cows) {
- if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024,
- BTRFS_BLOCK_GROUP_METADATA |
- (info->metadata_alloc_profile &
- info->avail_metadata_alloc_bits), 0);
- }
+ if (empty_size || root->ref_cows)
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
num_bytes + 2 * 1024 * 1024, data, 0);
- }
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -4897,6 +4911,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op);
BUG_ON(ret);
}
+
+ if (root_objectid == root->root_key.objectid) {
+ u64 used;
+ spin_lock(&root->node_lock);
+ used = btrfs_root_used(&root->root_item) + num_bytes;
+ btrfs_set_root_used(&root->root_item, used);
+ spin_unlock(&root->node_lock);
+ }
return ret;
}
@@ -4919,8 +4941,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
btrfs_set_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
- set_extent_dirty(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1, GFP_NOFS);
+ /*
+ * we allow two log transactions at a time, use different
+ * EXENT bit to differentiate dirty pages.
+ */
+ if (root->log_transid % 2 == 0)
+ set_extent_dirty(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ else
+ set_extent_new(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
} else {
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
@@ -5372,10 +5402,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
int ret;
while (level >= 0) {
- if (path->slots[level] >=
- btrfs_header_nritems(path->nodes[level]))
- break;
-
ret = walk_down_proc(trans, root, path, wc, lookup_info);
if (ret > 0)
break;
@@ -5383,6 +5409,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
if (level == 0)
break;
+ if (path->slots[level] >=
+ btrfs_header_nritems(path->nodes[level]))
+ break;
+
ret = do_walk_down(trans, root, path, wc, &lookup_info);
if (ret > 0) {
path->slots[level]++;
@@ -7373,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
wait_block_group_cache_done(block_group);
btrfs_remove_free_space_cache(block_group);
-
- WARN_ON(atomic_read(&block_group->count) != 1);
- kfree(block_group);
+ btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96577e8bf9fd..b177ed319612 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3165,10 +3165,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
spin_unlock(&tree->buffer_lock);
goto free_eb;
}
- spin_unlock(&tree->buffer_lock);
-
/* add one reference for the tree */
atomic_inc(&eb->refs);
+ spin_unlock(&tree->buffer_lock);
return eb;
free_eb:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 06550affbd27..a7fd9f3a750a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
}
flags = em->flags;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
- if (em->start <= start &&
- (!testend || em->start + em->len >= start + len)) {
+ if (testend && em->start + em->len >= start + len) {
free_extent_map(em);
write_unlock(&em_tree->lock);
break;
}
- if (start < em->start) {
- len = em->start - start;
- } else {
+ start = em->start + em->len;
+ if (testend)
len = start + len - (em->start + em->len);
- start = em->start + em->len;
- }
free_extent_map(em);
write_unlock(&em_tree->lock);
continue;
@@ -265,324 +261,253 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
* If an extent intersects the range but is not entirely inside the range
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
- *
- * inline_limit is used to tell this code which offsets in the file to keep
- * if they contain inline extents.
*/
-noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- u64 start, u64 end, u64 locked_end,
- u64 inline_limit, u64 *hint_byte, int drop_cache)
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
+ u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
- u64 extent_end = 0;
- u64 search_start = start;
- u64 ram_bytes = 0;
- u64 disk_bytenr = 0;
- u64 orig_locked_end = locked_end;
- u8 compression;
- u8 encryption;
- u16 other_encoding = 0;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
+ struct btrfs_file_extent_item *fi;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_file_extent_item old;
- int keep;
- int slot;
- int bookend;
- int found_type = 0;
- int found_extent;
- int found_inline;
+ struct btrfs_key new_key;
+ u64 search_start = start;
+ u64 disk_bytenr = 0;
+ u64 num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 extent_end = 0;
+ int del_nr = 0;
+ int del_slot = 0;
+ int extent_type;
int recow;
int ret;
- inline_limit = 0;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+
while (1) {
recow = 0;
- btrfs_release_path(root, path);
ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
search_start, -1);
if (ret < 0)
- goto out;
- if (ret > 0) {
- if (path->slots[0] == 0) {
- ret = 0;
- goto out;
- }
- path->slots[0]--;
+ break;
+ if (ret > 0 && path->slots[0] > 0 && search_start == start) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == inode->i_ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
}
+ ret = 0;
next_slot:
- keep = 0;
- bookend = 0;
- found_extent = 0;
- found_inline = 0;
- compression = 0;
- encryption = 0;
- extent = NULL;
leaf = path->nodes[0];
- slot = path->slots[0];
- ret = 0;
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
- key.offset >= end) {
- goto out;
- }
- if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
- key.objectid != inode->i_ino) {
- goto out;
- }
- if (recow) {
- search_start = max(key.offset, start);
- continue;
- }
- if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(leaf, extent);
- compression = btrfs_file_extent_compression(leaf,
- extent);
- encryption = btrfs_file_extent_encryption(leaf,
- extent);
- other_encoding = btrfs_file_extent_other_encoding(leaf,
- extent);
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- extent_end =
- btrfs_file_extent_disk_bytenr(leaf,
- extent);
- if (extent_end)
- *hint_byte = extent_end;
-
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(leaf, extent);
- ram_bytes = btrfs_file_extent_ram_bytes(leaf,
- extent);
- found_extent = 1;
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- found_inline = 1;
- extent_end = key.offset +
- btrfs_file_extent_inline_len(leaf, extent);
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ BUG_ON(del_nr > 0);
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
}
+ leaf = path->nodes[0];
+ recow = 1;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid > inode->i_ino ||
+ key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = btrfs_file_extent_offset(leaf, fi);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = key.offset +
+ btrfs_file_extent_inline_len(leaf, fi);
} else {
+ WARN_ON(1);
extent_end = search_start;
}
- /* we found nothing we can drop */
- if ((!found_extent && !found_inline) ||
- search_start >= extent_end) {
- int nextret;
- u32 nritems;
- nritems = btrfs_header_nritems(leaf);
- if (slot >= nritems - 1) {
- nextret = btrfs_next_leaf(root, path);
- if (nextret)
- goto out;
- recow = 1;
- } else {
- path->slots[0]++;
- }
+ if (extent_end <= search_start) {
+ path->slots[0]++;
goto next_slot;
}
- if (end <= extent_end && start >= key.offset && found_inline)
- *hint_byte = EXTENT_MAP_INLINE;
-
- if (found_extent) {
- read_extent_buffer(leaf, &old, (unsigned long)extent,
- sizeof(old));
- }
-
- if (end < extent_end && end >= key.offset) {
- bookend = 1;
- if (found_inline && start <= key.offset)
- keep = 1;
+ search_start = max(key.offset, start);
+ if (recow) {
+ btrfs_release_path(root, path);
+ continue;
}
- if (bookend && found_extent) {
- if (locked_end < extent_end) {
- ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- locked_end, extent_end - 1,
- GFP_NOFS);
- if (!ret) {
- btrfs_release_path(root, path);
- lock_extent(&BTRFS_I(inode)->io_tree,
- locked_end, extent_end - 1,
- GFP_NOFS);
- locked_end = extent_end;
- continue;
- }
- locked_end = extent_end;
+ /*
+ * | - range to drop - |
+ * | -------- extent -------- |
+ */
+ if (start > key.offset && end < extent_end) {
+ BUG_ON(del_nr > 0);
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = start;
+ ret = btrfs_duplicate_item(trans, root, path,
+ &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(root, path);
+ continue;
}
- disk_bytenr = le64_to_cpu(old.disk_bytenr);
- if (disk_bytenr != 0) {
+ if (ret < 0)
+ break;
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ extent_offset += start - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - start);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (disk_bytenr > 0) {
ret = btrfs_inc_extent_ref(trans, root,
- disk_bytenr,
- le64_to_cpu(old.disk_num_bytes), 0,
- root->root_key.objectid,
- key.objectid, key.offset -
- le64_to_cpu(old.offset));
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ new_key.objectid,
+ start - extent_offset);
BUG_ON(ret);
+ *hint_byte = disk_bytenr;
}
+ key.offset = start;
}
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (start <= key.offset && end < extent_end) {
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
- if (found_inline) {
- u64 mask = root->sectorsize - 1;
- search_start = (extent_end + mask) & ~mask;
- } else
- search_start = extent_end;
-
- /* truncate existing extent */
- if (start > key.offset) {
- u64 new_num;
- u64 old_num;
- keep = 1;
- WARN_ON(start & (root->sectorsize - 1));
- if (found_extent) {
- new_num = start - key.offset;
- old_num = btrfs_file_extent_num_bytes(leaf,
- extent);
- *hint_byte =
- btrfs_file_extent_disk_bytenr(leaf,
- extent);
- if (btrfs_file_extent_disk_bytenr(leaf,
- extent)) {
- inode_sub_bytes(inode, old_num -
- new_num);
- }
- btrfs_set_file_extent_num_bytes(leaf,
- extent, new_num);
- btrfs_mark_buffer_dirty(leaf);
- } else if (key.offset < inline_limit &&
- (end > extent_end) &&
- (inline_limit < extent_end)) {
- u32 new_size;
- new_size = btrfs_file_extent_calc_inline_size(
- inline_limit - key.offset);
- inode_sub_bytes(inode, extent_end -
- inline_limit);
- btrfs_set_file_extent_ram_bytes(leaf, extent,
- new_size);
- if (!compression && !encryption) {
- btrfs_truncate_item(trans, root, path,
- new_size, 1);
- }
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = end;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+
+ extent_offset += end - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_mark_buffer_dirty(leaf);
+ if (disk_bytenr > 0) {
+ inode_sub_bytes(inode, end - key.offset);
+ *hint_byte = disk_bytenr;
}
+ break;
}
- /* delete the entire extent */
- if (!keep) {
- if (found_inline)
- inode_sub_bytes(inode, extent_end -
- key.offset);
- ret = btrfs_del_item(trans, root, path);
- /* TODO update progress marker and return */
- BUG_ON(ret);
- extent = NULL;
- btrfs_release_path(root, path);
- /* the extent will be freed later */
- }
- if (bookend && found_inline && start <= key.offset) {
- u32 new_size;
- new_size = btrfs_file_extent_calc_inline_size(
- extent_end - end);
- inode_sub_bytes(inode, end - key.offset);
- btrfs_set_file_extent_ram_bytes(leaf, extent,
- new_size);
- if (!compression && !encryption)
- ret = btrfs_truncate_item(trans, root, path,
- new_size, 0);
- BUG_ON(ret);
- }
- /* create bookend, splitting the extent in two */
- if (bookend && found_extent) {
- struct btrfs_key ins;
- ins.objectid = inode->i_ino;
- ins.offset = end;
- btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
- btrfs_release_path(root, path);
- path->leave_spinning = 1;
- ret = btrfs_insert_empty_item(trans, root, path, &ins,
- sizeof(*extent));
- BUG_ON(ret);
+ search_start = extent_end;
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (start > key.offset && end >= extent_end) {
+ BUG_ON(del_nr > 0);
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- write_extent_buffer(leaf, &old,
- (unsigned long)extent, sizeof(old));
-
- btrfs_set_file_extent_compression(leaf, extent,
- compression);
- btrfs_set_file_extent_encryption(leaf, extent,
- encryption);
- btrfs_set_file_extent_other_encoding(leaf, extent,
- other_encoding);
- btrfs_set_file_extent_offset(leaf, extent,
- le64_to_cpu(old.offset) + end - key.offset);
- WARN_ON(le64_to_cpu(old.num_bytes) <
- (extent_end - end));
- btrfs_set_file_extent_num_bytes(leaf, extent,
- extent_end - end);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+ if (disk_bytenr > 0) {
+ inode_sub_bytes(inode, extent_end - start);
+ *hint_byte = disk_bytenr;
+ }
+ if (end == extent_end)
+ break;
- /*
- * set the ram bytes to the size of the full extent
- * before splitting. This is a worst case flag,
- * but its the best we can do because we don't know
- * how splitting affects compression
- */
- btrfs_set_file_extent_ram_bytes(leaf, extent,
- ram_bytes);
- btrfs_set_file_extent_type(leaf, extent, found_type);
-
- btrfs_unlock_up_safe(path, 1);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_set_lock_blocking(path->nodes[0]);
-
- path->leave_spinning = 0;
- btrfs_release_path(root, path);
- if (disk_bytenr != 0)
- inode_add_bytes(inode, extent_end - end);
+ path->slots[0]++;
+ goto next_slot;
}
- if (found_extent && !keep) {
- u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
+ /*
+ * | ---- range to drop ----- |
+ * | ------ extent ------ |
+ */
+ if (start <= key.offset && end >= extent_end) {
+ if (del_nr == 0) {
+ del_slot = path->slots[0];
+ del_nr = 1;
+ } else {
+ BUG_ON(del_slot + del_nr != path->slots[0]);
+ del_nr++;
+ }
- if (old_disk_bytenr != 0) {
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
inode_sub_bytes(inode,
- le64_to_cpu(old.num_bytes));
+ extent_end - key.offset);
+ extent_end = ALIGN(extent_end,
+ root->sectorsize);
+ } else if (disk_bytenr > 0) {
ret = btrfs_free_extent(trans, root,
- old_disk_bytenr,
- le64_to_cpu(old.disk_num_bytes),
- 0, root->root_key.objectid,
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid,
key.objectid, key.offset -
- le64_to_cpu(old.offset));
+ extent_offset);
BUG_ON(ret);
- *hint_byte = old_disk_bytenr;
+ inode_sub_bytes(inode,
+ extent_end - key.offset);
+ *hint_byte = disk_bytenr;
}
- }
- if (search_start >= end) {
- ret = 0;
- goto out;
+ if (end == extent_end)
+ break;
+
+ if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ ret = btrfs_del_items(trans, root, path, del_slot,
+ del_nr);
+ BUG_ON(ret);
+
+ del_nr = 0;
+ del_slot = 0;
+
+ btrfs_release_path(root, path);
+ continue;
}
+
+ BUG_ON(1);
}
-out:
- btrfs_free_path(path);
- if (locked_end > orig_locked_end) {
- unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
- locked_end - 1, GFP_NOFS);
+
+ if (del_nr > 0) {
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ BUG_ON(ret);
}
+
+ btrfs_free_path(path);
return ret;
}
static int extent_mergeable(struct extent_buffer *leaf, int slot,
- u64 objectid, u64 bytenr, u64 *start, u64 *end)
+ u64 objectid, u64 bytenr, u64 orig_offset,
+ u64 *start, u64 *end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
@@ -598,6 +523,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+ btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
@@ -620,23 +546,24 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
* two or three.
*/
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct inode *inode, u64 start, u64 end)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
+ struct btrfs_key new_key;
u64 bytenr;
u64 num_bytes;
u64 extent_end;
u64 orig_offset;
u64 other_start;
u64 other_end;
- u64 split = start;
- u64 locked_end = end;
- int extent_type;
- int split_end = 1;
+ u64 split;
+ int del_nr = 0;
+ int del_slot = 0;
+ int recow;
int ret;
btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -644,12 +571,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
again:
+ recow = 0;
+ split = start;
key.objectid = inode->i_ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- if (split == start)
- key.offset = split;
- else
- key.offset = split - 1;
+ key.offset = split;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0 && path->slots[0] > 0)
@@ -661,159 +587,158 @@ again:
key.type != BTRFS_EXTENT_DATA_KEY);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(leaf, fi);
- BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
+ BUG_ON(btrfs_file_extent_type(leaf, fi) !=
+ BTRFS_FILE_EXTENT_PREALLOC);
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
BUG_ON(key.offset > start || extent_end < end);
bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+ memcpy(&new_key, &key, sizeof(new_key));
- if (key.offset == start)
- split = end;
-
- if (key.offset == start && extent_end == end) {
- int del_nr = 0;
- int del_slot = 0;
- other_start = end;
- other_end = 0;
- if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
- bytenr, &other_start, &other_end)) {
- extent_end = other_end;
- del_slot = path->slots[0] + 1;
- del_nr++;
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
- 0, root->root_key.objectid,
- inode->i_ino, orig_offset);
- BUG_ON(ret);
- }
+ if (start == key.offset && end < extent_end) {
other_start = 0;
other_end = start;
- if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
- bytenr, &other_start, &other_end)) {
- key.offset = other_start;
- del_slot = path->slots[0];
- del_nr++;
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
- 0, root->root_key.objectid,
- inode->i_ino, orig_offset);
- BUG_ON(ret);
- }
- split_end = 0;
- if (del_nr == 0) {
- btrfs_set_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG);
- goto done;
- }
-
- fi = btrfs_item_ptr(leaf, del_slot - 1,
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_end - key.offset);
- btrfs_mark_buffer_dirty(leaf);
-
- ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
- goto release;
- } else if (split == start) {
- if (locked_end < extent_end) {
- ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- locked_end, extent_end - 1, GFP_NOFS);
- if (!ret) {
- btrfs_release_path(root, path);
- lock_extent(&BTRFS_I(inode)->io_tree,
- locked_end, extent_end - 1, GFP_NOFS);
- locked_end = extent_end;
- goto again;
- }
- locked_end = extent_end;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ new_key.offset = end;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_set_file_extent_offset(leaf, fi,
+ end - orig_offset);
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ end - other_start);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
}
- btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
- } else {
- BUG_ON(key.offset != start);
- key.offset = split;
- btrfs_set_file_extent_offset(leaf, fi, key.offset -
- orig_offset);
- btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
- btrfs_set_item_key_safe(trans, root, path, &key);
- extent_end = split;
}
- if (extent_end == end) {
- split_end = 0;
- extent_type = BTRFS_FILE_EXTENT_REG;
- }
- if (extent_end == end && split == start) {
+ if (start > key.offset && end == extent_end) {
other_start = end;
other_end = 0;
- if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
- bytenr, &other_start, &other_end)) {
- path->slots[0]++;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- key.offset = split;
- btrfs_set_item_key_safe(trans, root, path, &key);
- btrfs_set_file_extent_offset(leaf, fi, key.offset -
- orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
- other_end - split);
- goto done;
- }
- }
- if (extent_end == end && split == end) {
- other_start = 0;
- other_end = start;
- if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
- bytenr, &other_start, &other_end)) {
- path->slots[0]--;
+ start - key.offset);
+ path->slots[0]++;
+ new_key.offset = start;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
- other_start);
- goto done;
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ other_end - start);
+ btrfs_set_file_extent_offset(leaf, fi,
+ start - orig_offset);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
}
}
- btrfs_mark_buffer_dirty(leaf);
+ while (start > key.offset || end < extent_end) {
+ if (key.offset == start)
+ split = end;
- ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
- root->root_key.objectid,
- inode->i_ino, orig_offset);
- BUG_ON(ret);
- btrfs_release_path(root, path);
+ new_key.offset = split;
+ ret = btrfs_duplicate_item(trans, root, path, &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ BUG_ON(ret < 0);
- key.offset = start;
- ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
- BUG_ON(ret);
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ split - key.offset);
- leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_set_file_extent_type(leaf, fi, extent_type);
- btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
- btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
- btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
- btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
- btrfs_set_file_extent_compression(leaf, fi, 0);
- btrfs_set_file_extent_encryption(leaf, fi, 0);
- btrfs_set_file_extent_other_encoding(leaf, fi, 0);
-done:
- btrfs_mark_buffer_dirty(leaf);
-
-release:
- btrfs_release_path(root, path);
- if (split_end && split == start) {
- split = end;
- goto again;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - split);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
+
+ if (split == start) {
+ key.offset = start;
+ } else {
+ BUG_ON(start != key.offset);
+ path->slots[0]--;
+ extent_end = end;
+ }
+ recow = 1;
+ }
+
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ extent_end = other_end;
+ del_slot = path->slots[0] + 1;
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ 0, root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
+ }
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ key.offset = other_start;
+ del_slot = path->slots[0];
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ 0, root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
}
- if (locked_end > end) {
- unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
- GFP_NOFS);
+ if (del_nr == 0) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ fi = btrfs_item_ptr(leaf, del_slot - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ BUG_ON(ret);
}
+out:
btrfs_free_path(path);
return 0;
}
@@ -1210,7 +1135,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
}
mutex_lock(&dentry->d_inode->i_mutex);
out:
- return ret > 0 ? EIO : ret;
+ return ret > 0 ? -EIO : ret;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b3ad168a0bfc..e03a836d50d0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,13 +88,14 @@ static noinline int cow_file_range(struct inode *inode,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
-static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
+static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
{
int err;
- err = btrfs_init_acl(inode, dir);
+ err = btrfs_init_acl(trans, inode, dir);
if (!err)
- err = btrfs_xattr_security_init(inode, dir);
+ err = btrfs_xattr_security_init(trans, inode, dir);
return err;
}
@@ -188,8 +189,18 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ /*
+ * we're an inline extent, so nobody can
+ * extend the file past i_size without locking
+ * a page we already have locked.
+ *
+ * We must do any isize and inode updates
+ * before we unlock the pages. Otherwise we
+ * could end up racing with unlink.
+ */
BTRFS_I(inode)->disk_i_size = inode->i_size;
btrfs_update_inode(trans, root, inode);
+
return 0;
fail:
btrfs_free_path(path);
@@ -230,8 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
return 1;
}
- ret = btrfs_drop_extents(trans, root, inode, start,
- aligned_end, aligned_end, start,
+ ret = btrfs_drop_extents(trans, inode, start, aligned_end,
&hint_byte, 1);
BUG_ON(ret);
@@ -416,7 +426,6 @@ again:
start, end,
total_compressed, pages);
}
- btrfs_end_transaction(trans, root);
if (ret == 0) {
/*
* inline extent creation worked, we don't need
@@ -430,9 +439,11 @@ again:
EXTENT_CLEAR_DELALLOC |
EXTENT_CLEAR_ACCOUNTING |
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
- ret = 0;
+
+ btrfs_end_transaction(trans, root);
goto free_pages_out;
}
+ btrfs_end_transaction(trans, root);
}
if (will_compress) {
@@ -543,7 +554,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
if (list_empty(&async_cow->extents))
return 0;
- trans = btrfs_join_transaction(root, 1);
while (!list_empty(&async_cow->extents)) {
async_extent = list_entry(async_cow->extents.next,
@@ -590,19 +600,15 @@ retry:
lock_extent(io_tree, async_extent->start,
async_extent->start + async_extent->ram_size - 1,
GFP_NOFS);
- /*
- * here we're doing allocation and writeback of the
- * compressed pages
- */
- btrfs_drop_extent_cache(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, 0);
+ trans = btrfs_join_transaction(root, 1);
ret = btrfs_reserve_extent(trans, root,
async_extent->compressed_size,
async_extent->compressed_size,
0, alloc_hint,
(u64)-1, &ins, 1);
+ btrfs_end_transaction(trans, root);
+
if (ret) {
int i;
for (i = 0; i < async_extent->nr_pages; i++) {
@@ -618,6 +624,14 @@ retry:
goto retry;
}
+ /*
+ * here we're doing allocation and writeback of the
+ * compressed pages
+ */
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
+
em = alloc_extent_map(GFP_NOFS);
em->start = async_extent->start;
em->len = async_extent->ram_size;
@@ -649,8 +663,6 @@ retry:
BTRFS_ORDERED_COMPRESSED);
BUG_ON(ret);
- btrfs_end_transaction(trans, root);
-
/*
* clear dirty, set writeback and unlock the pages.
*/
@@ -672,13 +684,11 @@ retry:
async_extent->nr_pages);
BUG_ON(ret);
- trans = btrfs_join_transaction(root, 1);
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
cond_resched();
}
- btrfs_end_transaction(trans, root);
return 0;
}
@@ -742,6 +752,7 @@ static noinline int cow_file_range(struct inode *inode,
EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK |
EXTENT_END_WRITEBACK);
+
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
@@ -1596,7 +1607,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct inode *inode, u64 file_pos,
u64 disk_bytenr, u64 disk_num_bytes,
u64 num_bytes, u64 ram_bytes,
- u64 locked_end,
u8 compression, u8 encryption,
u16 other_encoding, int extent_type)
{
@@ -1622,9 +1632,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
- ret = btrfs_drop_extents(trans, root, inode, file_pos,
- file_pos + num_bytes, locked_end,
- file_pos, &hint, 0);
+ ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
+ &hint, 0);
BUG_ON(ret);
ins.objectid = inode->i_ino;
@@ -1671,24 +1680,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* before we start the transaction. It limits the amount of btree
* reads required while inside the transaction.
*/
-static noinline void reada_csum(struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_ordered_extent *ordered_extent)
-{
- struct btrfs_ordered_sum *sum;
- u64 bytenr;
-
- sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
- list);
- bytenr = sum->sums[0].bytenr;
-
- /*
- * we don't care about the results, the point of this search is
- * just to get the btree leaves into ram
- */
- btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
-}
-
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -1699,7 +1690,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
struct btrfs_trans_handle *trans;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_path *path;
int compressed = 0;
int ret;
@@ -1707,46 +1697,32 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
if (!ret)
return 0;
- /*
- * before we join the transaction, try to do some of our IO.
- * This will limit the amount of IO that we have to do with
- * the transaction running. We're unlikely to need to do any
- * IO if the file extents are new, the disk_i_size checks
- * covers the most common case.
- */
- if (start < BTRFS_I(inode)->disk_i_size) {
- path = btrfs_alloc_path();
- if (path) {
- ret = btrfs_lookup_file_extent(NULL, root, path,
- inode->i_ino,
- start, 0);
- ordered_extent = btrfs_lookup_ordered_extent(inode,
- start);
- if (!list_empty(&ordered_extent->list)) {
- btrfs_release_path(root, path);
- reada_csum(root, path, ordered_extent);
- }
- btrfs_free_path(path);
+ ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+ BUG_ON(!ordered_extent);
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ BUG_ON(!list_empty(&ordered_extent->list));
+ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ if (!ret) {
+ trans = btrfs_join_transaction(root, 1);
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ btrfs_end_transaction(trans, root);
}
+ goto out;
}
- trans = btrfs_join_transaction(root, 1);
-
- if (!ordered_extent)
- ordered_extent = btrfs_lookup_ordered_extent(inode, start);
- BUG_ON(!ordered_extent);
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
- goto nocow;
-
lock_extent(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
GFP_NOFS);
+ trans = btrfs_join_transaction(root, 1);
+
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compressed = 1;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compressed);
- ret = btrfs_mark_extent_written(trans, root, inode,
+ ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->len);
@@ -1758,8 +1734,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->disk_len,
ordered_extent->len,
ordered_extent->len,
- ordered_extent->file_offset +
- ordered_extent->len,
compressed, 0, 0,
BTRFS_FILE_EXTENT_REG);
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
@@ -1770,22 +1744,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
unlock_extent(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
GFP_NOFS);
-nocow:
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
- mutex_lock(&BTRFS_I(inode)->extent_mutex);
- btrfs_ordered_update_i_size(inode, ordered_extent);
- btrfs_update_inode(trans, root, inode);
- btrfs_remove_ordered_extent(inode, ordered_extent);
- mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
+ /* this also removes the ordered extent from the tree */
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ btrfs_end_transaction(trans, root);
+out:
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
- btrfs_end_transaction(trans, root);
return 0;
}
@@ -2008,6 +1980,54 @@ zeroit:
return -EIO;
}
+struct delayed_iput {
+ struct list_head list;
+ struct inode *inode;
+};
+
+void btrfs_add_delayed_iput(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct delayed_iput *delayed;
+
+ if (atomic_add_unless(&inode->i_count, -1, 1))
+ return;
+
+ delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+ delayed->inode = inode;
+
+ spin_lock(&fs_info->delayed_iput_lock);
+ list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+ spin_unlock(&fs_info->delayed_iput_lock);
+}
+
+void btrfs_run_delayed_iputs(struct btrfs_root *root)
+{
+ LIST_HEAD(list);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct delayed_iput *delayed;
+ int empty;
+
+ spin_lock(&fs_info->delayed_iput_lock);
+ empty = list_empty(&fs_info->delayed_iputs);
+ spin_unlock(&fs_info->delayed_iput_lock);
+ if (empty)
+ return;
+
+ down_read(&root->fs_info->cleanup_work_sem);
+ spin_lock(&fs_info->delayed_iput_lock);
+ list_splice_init(&fs_info->delayed_iputs, &list);
+ spin_unlock(&fs_info->delayed_iput_lock);
+
+ while (!list_empty(&list)) {
+ delayed = list_entry(list.next, struct delayed_iput, list);
+ list_del(&delayed->list);
+ iput(delayed->inode);
+ kfree(delayed);
+ }
+ up_read(&root->fs_info->cleanup_work_sem);
+}
+
/*
* This creates an orphan entry for the given inode in case something goes
* wrong in the middle of an unlink/truncate.
@@ -2080,16 +2100,17 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
struct inode *inode;
int ret = 0, nr_unlink = 0, nr_truncate = 0;
- path = btrfs_alloc_path();
- if (!path)
+ if (!xchg(&root->clean_orphans, 0))
return;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
path->reada = -1;
key.objectid = BTRFS_ORPHAN_OBJECTID;
btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
key.offset = (u64)-1;
-
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
@@ -2834,37 +2855,40 @@ out:
* min_type is the minimum key type to truncate down to. If set to 0, this
* will kill all the items on this inode, including the INODE_ITEM_KEY.
*/
-noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode,
- u64 new_size, u32 min_type)
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ u64 new_size, u32 min_type)
{
- int ret;
struct btrfs_path *path;
- struct btrfs_key key;
- struct btrfs_key found_key;
- u32 found_type = (u8)-1;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
u64 extent_start = 0;
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
+ u64 mask = root->sectorsize - 1;
+ u32 found_type = (u8)-1;
int found_extent;
int del_item;
int pending_del_nr = 0;
int pending_del_slot = 0;
int extent_type = -1;
int encoding;
- u64 mask = root->sectorsize - 1;
+ int ret;
+ int err = 0;
+
+ BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
if (root->ref_cows)
btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+
path = btrfs_alloc_path();
BUG_ON(!path);
path->reada = -1;
- /* FIXME, add redo link to tree so we don't leak on crash */
key.objectid = inode->i_ino;
key.offset = (u64)-1;
key.type = (u8)-1;
@@ -2872,17 +2896,17 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
search_again:
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto error;
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
if (ret > 0) {
/* there are no items in the tree for us to truncate, we're
* done
*/
- if (path->slots[0] == 0) {
- ret = 0;
- goto error;
- }
+ if (path->slots[0] == 0)
+ goto out;
path->slots[0]--;
}
@@ -2917,28 +2941,17 @@ search_again:
}
item_end--;
}
- if (item_end < new_size) {
- if (found_type == BTRFS_DIR_ITEM_KEY)
- found_type = BTRFS_INODE_ITEM_KEY;
- else if (found_type == BTRFS_EXTENT_ITEM_KEY)
- found_type = BTRFS_EXTENT_DATA_KEY;
- else if (found_type == BTRFS_EXTENT_DATA_KEY)
- found_type = BTRFS_XATTR_ITEM_KEY;
- else if (found_type == BTRFS_XATTR_ITEM_KEY)
- found_type = BTRFS_INODE_REF_KEY;
- else if (found_type)
- found_type--;
- else
+ if (found_type > min_type) {
+ del_item = 1;
+ } else {
+ if (item_end < new_size)
break;
- btrfs_set_key_type(&key, found_type);
- goto next;
+ if (found_key.offset >= new_size)
+ del_item = 1;
+ else
+ del_item = 0;
}
- if (found_key.offset >= new_size)
- del_item = 1;
- else
- del_item = 0;
found_extent = 0;
-
/* FIXME, shrink the extent if the ref count is only 1 */
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
@@ -3025,42 +3038,36 @@ delete:
inode->i_ino, extent_offset);
BUG_ON(ret);
}
-next:
- if (path->slots[0] == 0) {
- if (pending_del_nr)
- goto del_pending;
- btrfs_release_path(root, path);
- if (found_type == BTRFS_INODE_ITEM_KEY)
- break;
- goto search_again;
- }
- path->slots[0]--;
- if (pending_del_nr &&
- path->slots[0] + 1 != pending_del_slot) {
- struct btrfs_key debug;
-del_pending:
- btrfs_item_key_to_cpu(path->nodes[0], &debug,
- pending_del_slot);
- ret = btrfs_del_items(trans, root, path,
- pending_del_slot,
- pending_del_nr);
- BUG_ON(ret);
- pending_del_nr = 0;
+ if (found_type == BTRFS_INODE_ITEM_KEY)
+ break;
+
+ if (path->slots[0] == 0 ||
+ path->slots[0] != pending_del_slot) {
+ if (root->ref_cows) {
+ err = -EAGAIN;
+ goto out;
+ }
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ BUG_ON(ret);
+ pending_del_nr = 0;
+ }
btrfs_release_path(root, path);
- if (found_type == BTRFS_INODE_ITEM_KEY)
- break;
goto search_again;
+ } else {
+ path->slots[0]--;
}
}
- ret = 0;
-error:
+out:
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
}
btrfs_free_path(path);
- return ret;
+ return err;
}
/*
@@ -3180,10 +3187,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
if (size <= hole_start)
return 0;
- err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
- if (err)
- return err;
-
while (1) {
struct btrfs_ordered_extent *ordered;
btrfs_wait_ordered_range(inode, hole_start,
@@ -3196,9 +3199,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
btrfs_put_ordered_extent(ordered);
}
- trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
-
cur_offset = hole_start;
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
@@ -3206,40 +3206,120 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
BUG_ON(IS_ERR(em) || !em);
last_byte = min(extent_map_end(em), block_end);
last_byte = (last_byte + mask) & ~mask;
- if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
u64 hint_byte = 0;
hole_size = last_byte - cur_offset;
- err = btrfs_drop_extents(trans, root, inode,
- cur_offset,
- cur_offset + hole_size,
- block_end,
- cur_offset, &hint_byte, 1);
- if (err)
- break;
- err = btrfs_reserve_metadata_space(root, 1);
+ err = btrfs_reserve_metadata_space(root, 2);
if (err)
break;
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ err = btrfs_drop_extents(trans, inode, cur_offset,
+ cur_offset + hole_size,
+ &hint_byte, 1);
+ BUG_ON(err);
+
err = btrfs_insert_file_extent(trans, root,
inode->i_ino, cur_offset, 0,
0, hole_size, 0, hole_size,
0, 0, 0);
+ BUG_ON(err);
+
btrfs_drop_extent_cache(inode, hole_start,
last_byte - 1, 0);
- btrfs_unreserve_metadata_space(root, 1);
+
+ btrfs_end_transaction(trans, root);
+ btrfs_unreserve_metadata_space(root, 2);
}
free_extent_map(em);
cur_offset = last_byte;
- if (err || cur_offset >= block_end)
+ if (cur_offset >= block_end)
break;
}
- btrfs_end_transaction(trans, root);
unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
return err;
}
+static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr;
+ int ret;
+
+ if (attr->ia_size == inode->i_size)
+ return 0;
+
+ if (attr->ia_size > inode->i_size) {
+ unsigned long limit;
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ if (attr->ia_size > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+ if (limit != RLIM_INFINITY && attr->ia_size > limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ }
+
+ ret = btrfs_reserve_metadata_space(root, 1);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = btrfs_orphan_add(trans, inode);
+ BUG_ON(ret);
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_unreserve_metadata_space(root, 1);
+ btrfs_btree_balance_dirty(root, nr);
+
+ if (attr->ia_size > inode->i_size) {
+ ret = btrfs_cont_expand(inode, attr->ia_size);
+ if (ret) {
+ btrfs_truncate(inode);
+ return ret;
+ }
+
+ i_size_write(inode, attr->ia_size);
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ if (inode->i_nlink > 0) {
+ ret = btrfs_orphan_del(trans, inode);
+ BUG_ON(ret);
+ }
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+ return 0;
+ }
+
+ /*
+ * We're truncating a file that used to have good data down to
+ * zero. Make sure it gets into the ordered flush list so that
+ * any new writes get down to disk quickly.
+ */
+ if (attr->ia_size == 0)
+ BTRFS_I(inode)->ordered_data_close = 1;
+
+ /* we don't support swapfiles, so vmtruncate shouldn't fail */
+ ret = vmtruncate(inode, attr->ia_size);
+ BUG_ON(ret);
+
+ return 0;
+}
+
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
@@ -3250,23 +3330,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- if (attr->ia_size > inode->i_size) {
- err = btrfs_cont_expand(inode, attr->ia_size);
- if (err)
- return err;
- } else if (inode->i_size > 0 &&
- attr->ia_size == 0) {
-
- /* we're truncating a file that used to have good
- * data down to zero. Make sure it gets into
- * the ordered flush list so that any new writes
- * get down to disk quickly.
- */
- BTRFS_I(inode)->ordered_data_close = 1;
- }
+ err = btrfs_setattr_size(inode, attr);
+ if (err)
+ return err;
}
+ attr->ia_valid &= ~ATTR_SIZE;
- err = inode_setattr(inode, attr);
+ if (attr->ia_valid)
+ err = inode_setattr(inode, attr);
if (!err && ((attr->ia_valid & ATTR_MODE)))
err = btrfs_acl_chmod(inode);
@@ -3287,36 +3358,43 @@ void btrfs_delete_inode(struct inode *inode)
}
btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (root->fs_info->log_root_recovering) {
+ BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+ goto no_delete;
+ }
+
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0);
goto no_delete;
}
btrfs_i_size_write(inode, 0);
- trans = btrfs_join_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
- ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
- if (ret) {
- btrfs_orphan_del(NULL, inode);
- goto no_delete_lock;
- }
+ while (1) {
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
- btrfs_orphan_del(trans, inode);
+ if (ret != -EAGAIN)
+ break;
- nr = trans->blocks_used;
- clear_inode(inode);
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ trans = NULL;
+ btrfs_btree_balance_dirty(root, nr);
+ }
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
- return;
+ if (ret == 0) {
+ ret = btrfs_orphan_del(trans, inode);
+ BUG_ON(ret);
+ }
-no_delete_lock:
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root, nr);
no_delete:
clear_inode(inode);
+ return;
}
/*
@@ -3569,7 +3647,6 @@ static noinline void init_btrfs_i(struct inode *inode)
INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
- mutex_init(&BTRFS_I(inode)->extent_mutex);
mutex_init(&BTRFS_I(inode)->log_mutex);
}
@@ -3695,6 +3772,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
}
srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+ if (root != sub_root) {
+ down_read(&root->fs_info->cleanup_work_sem);
+ if (!(inode->i_sb->s_flags & MS_RDONLY))
+ btrfs_orphan_cleanup(sub_root);
+ up_read(&root->fs_info->cleanup_work_sem);
+ }
+
return inode;
}
@@ -3869,7 +3953,11 @@ skip:
/* Reached end of directory/root. Bump pos past the last item. */
if (key_type == BTRFS_DIR_INDEX_KEY)
- filp->f_pos = INT_LIMIT(off_t);
+ /*
+ * 32-bit glibc will use getdents64, but then strtol -
+ * so the last number we can serve is this.
+ */
+ filp->f_pos = 0x7fffffff;
else
filp->f_pos++;
nopos:
@@ -4219,7 +4307,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
goto out_unlock;
- err = btrfs_init_inode_security(inode, dir);
+ err = btrfs_init_inode_security(trans, inode, dir);
if (err) {
drop_inode = 1;
goto out_unlock;
@@ -4290,7 +4378,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
goto out_unlock;
- err = btrfs_init_inode_security(inode, dir);
+ err = btrfs_init_inode_security(trans, inode, dir);
if (err) {
drop_inode = 1;
goto out_unlock;
@@ -4336,6 +4424,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink == 0)
return -ENOENT;
+ /* do not allow sys_link's with other subvols of the same device */
+ if (root->objectid != BTRFS_I(inode)->root->objectid)
+ return -EPERM;
+
/*
* 1 item for inode ref
* 2 items for dir items
@@ -4423,7 +4515,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
drop_on_err = 1;
- err = btrfs_init_inode_security(inode, dir);
+ err = btrfs_init_inode_security(trans, inode, dir);
if (err)
goto out_fail;
@@ -5074,17 +5166,20 @@ static void btrfs_truncate(struct inode *inode)
unsigned long nr;
u64 mask = root->sectorsize - 1;
- if (!S_ISREG(inode->i_mode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ if (!S_ISREG(inode->i_mode)) {
+ WARN_ON(1);
return;
+ }
ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
if (ret)
return;
+
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
/*
* setattr is responsible for setting the ordered_data_close flag,
@@ -5106,21 +5201,32 @@ static void btrfs_truncate(struct inode *inode)
if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
btrfs_add_ordered_operation(trans, root, inode);
- btrfs_set_trans_block_group(trans, inode);
- btrfs_i_size_write(inode, inode->i_size);
+ while (1) {
+ ret = btrfs_truncate_inode_items(trans, root, inode,
+ inode->i_size,
+ BTRFS_EXTENT_DATA_KEY);
+ if (ret != -EAGAIN)
+ break;
- ret = btrfs_orphan_add(trans, inode);
- if (ret)
- goto out;
- /* FIXME, add redo link to tree so we don't leak on crash */
- ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
- BTRFS_EXTENT_DATA_KEY);
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
- ret = btrfs_orphan_del(trans, inode);
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ }
+
+ if (ret == 0 && inode->i_nlink > 0) {
+ ret = btrfs_orphan_del(trans, inode);
+ BUG_ON(ret);
+ }
+
+ ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
-out:
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
BUG_ON(ret);
@@ -5217,9 +5323,9 @@ void btrfs_destroy_inode(struct inode *inode)
spin_lock(&root->list_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
- " list\n", inode->i_ino);
- dump_stack();
+ printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
+ inode->i_ino);
+ list_del_init(&BTRFS_I(inode)->i_orphan);
}
spin_unlock(&root->list_lock);
@@ -5476,7 +5582,7 @@ out_fail:
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
struct list_head *head = &root->fs_info->delalloc_inodes;
struct btrfs_inode *binode;
@@ -5495,7 +5601,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
spin_unlock(&root->fs_info->delalloc_lock);
if (inode) {
filemap_flush(inode->i_mapping);
- iput(inode);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
}
cond_resched();
spin_lock(&root->fs_info->delalloc_lock);
@@ -5569,7 +5678,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
goto out_unlock;
- err = btrfs_init_inode_security(inode, dir);
+ err = btrfs_init_inode_security(trans, inode, dir);
if (err) {
drop_inode = 1;
goto out_unlock;
@@ -5641,57 +5750,77 @@ out_fail:
return err;
}
-static int prealloc_file_range(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 start, u64 end,
- u64 locked_end, u64 alloc_hint, int mode)
+static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+ u64 alloc_hint, int mode, loff_t actual_len)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 alloc_size;
u64 cur_offset = start;
u64 num_bytes = end - start;
int ret = 0;
+ u64 i_size;
while (num_bytes > 0) {
alloc_size = min(num_bytes, root->fs_info->max_extent);
- ret = btrfs_reserve_metadata_space(root, 1);
- if (ret)
- goto out;
+ trans = btrfs_start_transaction(root, 1);
ret = btrfs_reserve_extent(trans, root, alloc_size,
root->sectorsize, 0, alloc_hint,
(u64)-1, &ins, 1);
if (ret) {
WARN_ON(1);
- goto out;
+ goto stop_trans;
}
+
+ ret = btrfs_reserve_metadata_space(root, 3);
+ if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid,
+ ins.offset);
+ goto stop_trans;
+ }
+
ret = insert_reserved_file_extent(trans, inode,
cur_offset, ins.objectid,
ins.offset, ins.offset,
- ins.offset, locked_end,
- 0, 0, 0,
+ ins.offset, 0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
BUG_ON(ret);
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
+
num_bytes -= ins.offset;
cur_offset += ins.offset;
alloc_hint = ins.objectid + ins.offset;
- btrfs_unreserve_metadata_space(root, 1);
- }
-out:
- if (cur_offset > start) {
+
inode->i_ctime = CURRENT_TIME;
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- cur_offset > i_size_read(inode))
- btrfs_i_size_write(inode, cur_offset);
+ (actual_len > inode->i_size) &&
+ (cur_offset > inode->i_size)) {
+
+ if (cur_offset > actual_len)
+ i_size = actual_len;
+ else
+ i_size = cur_offset;
+ i_size_write(inode, i_size);
+ btrfs_ordered_update_i_size(inode, i_size, NULL);
+ }
+
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
+
+ btrfs_end_transaction(trans, root);
+ btrfs_unreserve_metadata_space(root, 3);
}
+ return ret;
+stop_trans:
+ btrfs_end_transaction(trans, root);
return ret;
+
}
static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5705,8 +5834,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
u64 locked_end;
u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root;
int ret;
alloc_start = offset & ~mask;
@@ -5725,9 +5852,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
goto out;
}
- root = BTRFS_I(inode)->root;
-
- ret = btrfs_check_data_free_space(root, inode,
+ ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
alloc_end - alloc_start);
if (ret)
goto out;
@@ -5736,12 +5861,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
while (1) {
struct btrfs_ordered_extent *ordered;
- trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
- if (!trans) {
- ret = -EIO;
- goto out_free;
- }
-
/* the extent lock is ordered inside the running
* transaction
*/
@@ -5755,8 +5874,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
btrfs_put_ordered_extent(ordered);
unlock_extent(&BTRFS_I(inode)->io_tree,
alloc_start, locked_end, GFP_NOFS);
- btrfs_end_transaction(trans, BTRFS_I(inode)->root);
-
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
@@ -5777,10 +5894,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
BUG_ON(IS_ERR(em) || !em);
last_byte = min(extent_map_end(em), alloc_end);
last_byte = (last_byte + mask) & ~mask;
- if (em->block_start == EXTENT_MAP_HOLE) {
- ret = prealloc_file_range(trans, inode, cur_offset,
- last_byte, locked_end + 1,
- alloc_hint, mode);
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ (cur_offset >= inode->i_size &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ ret = prealloc_file_range(inode,
+ cur_offset, last_byte,
+ alloc_hint, mode, offset+len);
if (ret < 0) {
free_extent_map(em);
break;
@@ -5799,9 +5918,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
GFP_NOFS);
- btrfs_end_transaction(trans, BTRFS_I(inode)->root);
-out_free:
- btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
+ alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cdbb054102b9..0bc577695061 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -237,7 +237,6 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
- unsigned long nr = 1;
/*
* 1 - inode item
@@ -290,7 +289,7 @@ static noinline int create_subvol(struct btrfs_root *root,
btrfs_set_root_generation(&root_item, trans->transid);
btrfs_set_root_level(&root_item, 0);
btrfs_set_root_refs(&root_item, 1);
- btrfs_set_root_used(&root_item, 0);
+ btrfs_set_root_used(&root_item, leaf->len);
btrfs_set_root_last_snapshot(&root_item, 0);
memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
@@ -342,24 +341,21 @@ static noinline int create_subvol(struct btrfs_root *root,
d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
- nr = trans->blocks_used;
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
btrfs_unreserve_metadata_space(root, 6);
- btrfs_btree_balance_dirty(root, nr);
return ret;
}
static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
char *name, int namelen)
{
+ struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
- int ret = 0;
- int err;
- unsigned long nr = 0;
+ int ret;
if (!root->ref_cows)
return -EINVAL;
@@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
*/
ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
- goto fail_unlock;
+ goto fail;
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot) {
ret = -ENOMEM;
btrfs_unreserve_metadata_space(root, 6);
- goto fail_unlock;
+ goto fail;
}
pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
if (!pending_snapshot->name) {
ret = -ENOMEM;
kfree(pending_snapshot);
btrfs_unreserve_metadata_space(root, 6);
- goto fail_unlock;
+ goto fail;
}
memcpy(pending_snapshot->name, name, namelen);
pending_snapshot->name[namelen] = '\0';
@@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
pending_snapshot->root = root;
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
- err = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ btrfs_unreserve_metadata_space(root, 6);
-fail_unlock:
- btrfs_btree_balance_dirty(root, nr);
+ inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto fail;
+ }
+ BUG_ON(!inode);
+ d_instantiate(dentry, inode);
+ ret = 0;
+fail:
return ret;
}
@@ -947,7 +952,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
*/
/* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE))
+ if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
return -EINVAL;
ret = mnt_want_write(file->f_path.mnt);
@@ -959,12 +964,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
ret = -EBADF;
goto out_drop_write;
}
+
src = src_file->f_dentry->d_inode;
ret = -EINVAL;
if (src == inode)
goto out_fput;
+ /* the src must be open for reading */
+ if (!(src_file->f_mode & FMODE_READ))
+ goto out_fput;
+
ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
goto out_fput;
@@ -995,7 +1005,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
/* determine range to clone */
ret = -EINVAL;
- if (off >= src->i_size || off + len > src->i_size)
+ if (off + len > src->i_size || off + len < off)
goto out_unlock;
if (len == 0)
olen = len = src->i_size - off;
@@ -1027,8 +1037,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
BUG_ON(!trans);
/* punch hole in destination first */
- btrfs_drop_extents(trans, root, inode, off, off + len,
- off + len, 0, &hint_byte, 1);
+ btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
/* clone data */
key.objectid = src->i_ino;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5799bc46a309..5c2a9e78a949 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -291,16 +291,16 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
/*
* remove an ordered extent from the tree. No references are dropped
- * but, anyone waiting on this extent is woken up.
+ * and you must wake_up entry->wait. You must hold the tree mutex
+ * while you call this function.
*/
-int btrfs_remove_ordered_extent(struct inode *inode,
+static int __btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
tree->last = NULL;
@@ -326,16 +326,34 @@ int btrfs_remove_ordered_extent(struct inode *inode,
}
spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ return 0;
+}
+
+/*
+ * remove an ordered extent from the tree. No references are dropped
+ * but any waiters are woken.
+ */
+int btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ int ret;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ ret = __btrfs_remove_ordered_extent(inode, entry);
mutex_unlock(&tree->mutex);
wake_up(&entry->wait);
- return 0;
+
+ return ret;
}
/*
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
+int btrfs_wait_ordered_extents(struct btrfs_root *root,
+ int nocow_only, int delay_iput)
{
struct list_head splice;
struct list_head *cur;
@@ -372,7 +390,10 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
if (inode) {
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
- iput(inode);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
} else {
btrfs_put_ordered_extent(ordered);
}
@@ -430,7 +451,7 @@ again:
btrfs_wait_ordered_range(inode, 0, (u64)-1);
else
filemap_flush(inode->i_mapping);
- iput(inode);
+ btrfs_add_delayed_iput(inode);
}
cond_resched();
@@ -589,7 +610,7 @@ out:
* After an extent is done, call this to conditionally update the on disk
* i_size. i_size is updated to cover any fully written part of the file.
*/
-int btrfs_ordered_update_i_size(struct inode *inode,
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered)
{
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
@@ -597,18 +618,32 @@ int btrfs_ordered_update_i_size(struct inode *inode,
u64 disk_i_size;
u64 new_i_size;
u64 i_size_test;
+ u64 i_size = i_size_read(inode);
struct rb_node *node;
+ struct rb_node *prev = NULL;
struct btrfs_ordered_extent *test;
+ int ret = 1;
+
+ if (ordered)
+ offset = entry_end(ordered);
+ else
+ offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
mutex_lock(&tree->mutex);
disk_i_size = BTRFS_I(inode)->disk_i_size;
+ /* truncate file */
+ if (disk_i_size > i_size) {
+ BTRFS_I(inode)->disk_i_size = i_size;
+ ret = 0;
+ goto out;
+ }
+
/*
* if the disk i_size is already at the inode->i_size, or
* this ordered extent is inside the disk i_size, we're done
*/
- if (disk_i_size >= inode->i_size ||
- ordered->file_offset + ordered->len <= disk_i_size) {
+ if (disk_i_size == i_size || offset <= disk_i_size) {
goto out;
}
@@ -616,8 +651,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
* we can't update the disk_isize if there are delalloc bytes
* between disk_i_size and this ordered extent
*/
- if (test_range_bit(io_tree, disk_i_size,
- ordered->file_offset + ordered->len - 1,
+ if (test_range_bit(io_tree, disk_i_size, offset - 1,
EXTENT_DELALLOC, 0, NULL)) {
goto out;
}
@@ -626,20 +660,32 @@ int btrfs_ordered_update_i_size(struct inode *inode,
* if we find an ordered extent then we can't update disk i_size
* yet
*/
- node = &ordered->rb_node;
- while (1) {
- node = rb_prev(node);
- if (!node)
- break;
+ if (ordered) {
+ node = rb_prev(&ordered->rb_node);
+ } else {
+ prev = tree_search(tree, offset);
+ /*
+ * we insert file extents without involving ordered struct,
+ * so there should be no ordered struct cover this offset
+ */
+ if (prev) {
+ test = rb_entry(prev, struct btrfs_ordered_extent,
+ rb_node);
+ BUG_ON(offset_in_entry(test, offset));
+ }
+ node = prev;
+ }
+ while (node) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (test->file_offset + test->len <= disk_i_size)
break;
- if (test->file_offset >= inode->i_size)
+ if (test->file_offset >= i_size)
break;
if (test->file_offset >= disk_i_size)
goto out;
+ node = rb_prev(node);
}
- new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
+ new_i_size = min_t(u64, offset, i_size);
/*
* at this point, we know we can safely update i_size to at least
@@ -647,7 +693,14 @@ int btrfs_ordered_update_i_size(struct inode *inode,
* walk forward and see if ios from higher up in the file have
* finished.
*/
- node = rb_next(&ordered->rb_node);
+ if (ordered) {
+ node = rb_next(&ordered->rb_node);
+ } else {
+ if (prev)
+ node = rb_next(prev);
+ else
+ node = rb_first(&tree->tree);
+ }
i_size_test = 0;
if (node) {
/*
@@ -655,10 +708,10 @@ int btrfs_ordered_update_i_size(struct inode *inode,
* between our ordered extent and the next one.
*/
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (test->file_offset > entry_end(ordered))
+ if (test->file_offset > offset)
i_size_test = test->file_offset;
} else {
- i_size_test = i_size_read(inode);
+ i_size_test = i_size;
}
/*
@@ -667,15 +720,25 @@ int btrfs_ordered_update_i_size(struct inode *inode,
* are no delalloc bytes in this area, it is safe to update
* disk_i_size to the end of the region.
*/
- if (i_size_test > entry_end(ordered) &&
- !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
- EXTENT_DELALLOC, 0, NULL)) {
- new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+ if (i_size_test > offset &&
+ !test_range_bit(io_tree, offset, i_size_test - 1,
+ EXTENT_DELALLOC, 0, NULL)) {
+ new_i_size = min_t(u64, i_size_test, i_size);
}
BTRFS_I(inode)->disk_i_size = new_i_size;
+ ret = 0;
out:
+ /*
+ * we need to remove the ordered extent with the tree lock held
+ * so that other people calling this function don't find our fully
+ * processed ordered entry and skip updating the i_size
+ */
+ if (ordered)
+ __btrfs_remove_ordered_extent(inode, ordered);
mutex_unlock(&tree->mutex);
- return 0;
+ if (ordered)
+ wake_up(&ordered->wait);
+ return ret;
}
/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f82e87488ca8..1fe1282ef47c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -150,12 +150,13 @@ void btrfs_start_ordered_extent(struct inode *inode,
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
-int btrfs_ordered_update_i_size(struct inode *inode,
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root,
+ int nocow_only, int delay_iput);
#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfcc93c93a7b..ab7ab5318745 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struct btrfs_root *root,
return 0;
}
+static void put_inodes(struct list_head *list)
+{
+ struct inodevec *ivec;
+ while (!list_empty(list)) {
+ ivec = list_entry(list->next, struct inodevec, list);
+ list_del(&ivec->list);
+ while (ivec->nr > 0) {
+ ivec->nr--;
+ iput(ivec->inode[ivec->nr]);
+ }
+ kfree(ivec);
+ }
+}
+
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key)
@@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_btree_balance_dirty(root, nr);
+ /*
+ * put inodes outside transaction, otherwise we may deadlock.
+ */
+ put_inodes(&inode_list);
+
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
}
@@ -1752,19 +1771,7 @@ out:
btrfs_btree_balance_dirty(root, nr);
- /*
- * put inodes while we aren't holding the tree locks
- */
- while (!list_empty(&inode_list)) {
- struct inodevec *ivec;
- ivec = list_entry(inode_list.next, struct inodevec, list);
- list_del(&ivec->list);
- while (ivec->nr > 0) {
- ivec->nr--;
- iput(ivec->inode[ivec->nr]);
- }
- kfree(ivec);
- }
+ put_inodes(&inode_list);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -3274,8 +3281,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
return -ENOMEM;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ kfree(cluster);
return -ENOMEM;
+ }
rc->extents_found = 0;
rc->extents_skipped = 0;
@@ -3534,8 +3543,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
(unsigned long long)rc->block_group->key.objectid,
(unsigned long long)rc->block_group->flags);
- btrfs_start_delalloc_inodes(fs_info->tree_root);
- btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+ btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+ btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
while (1) {
rc->extents_found = 0;
@@ -3755,6 +3764,8 @@ out:
BTRFS_DATA_RELOC_TREE_OBJECTID);
if (IS_ERR(fs_root))
err = PTR_ERR(fs_root);
+ else
+ btrfs_orphan_cleanup(fs_root);
}
return err;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 752a5463bf53..a649305b2059 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -126,8 +126,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
{
struct btrfs_fs_info *info = root->fs_info;
substring_t args[MAX_OPT_ARGS];
- char *p, *num;
+ char *p, *num, *orig;
int intarg;
+ int ret = 0;
if (!options)
return 0;
@@ -140,6 +141,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
if (!options)
return -ENOMEM;
+ orig = options;
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -262,12 +264,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_discard:
btrfs_set_opt(info->mount_opt, DISCARD);
break;
+ case Opt_err:
+ printk(KERN_INFO "btrfs: unrecognized mount option "
+ "'%s'\n", p);
+ ret = -EINVAL;
+ goto out;
default:
break;
}
}
- kfree(options);
- return 0;
+out:
+ kfree(orig);
+ return ret;
}
/*
@@ -405,8 +413,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_start_delalloc_inodes(root);
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0, 0);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
@@ -450,6 +458,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(root, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
+ if (btrfs_test_opt(root, DISCARD))
+ seq_puts(seq, ",discard");
if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
seq_puts(seq, ",noacl");
return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c207e8c32c9b..b2acc79f1b34 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -333,6 +333,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
memset(trans, 0, sizeof(*trans));
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ if (throttle)
+ btrfs_run_delayed_iputs(root);
+
return 0;
}
@@ -354,7 +357,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
* those extents are sent to disk but does not wait on them
*/
int btrfs_write_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages)
+ struct extent_io_tree *dirty_pages, int mark)
{
int ret;
int err = 0;
@@ -367,7 +370,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- EXTENT_DIRTY);
+ mark);
if (ret)
break;
while (start <= end) {
@@ -413,7 +416,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
* on all the pages and clear them from the dirty pages state tree
*/
int btrfs_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages)
+ struct extent_io_tree *dirty_pages, int mark)
{
int ret;
int err = 0;
@@ -425,12 +428,12 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
unsigned long index;
while (1) {
- ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
- EXTENT_DIRTY);
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark);
if (ret)
break;
- clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
+ clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
index = start >> PAGE_CACHE_SHIFT;
start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
@@ -460,13 +463,13 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
* those extents are on disk for transaction or log commit
*/
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages)
+ struct extent_io_tree *dirty_pages, int mark)
{
int ret;
int ret2;
- ret = btrfs_write_marked_extents(root, dirty_pages);
- ret2 = btrfs_wait_marked_extents(root, dirty_pages);
+ ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+ ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
return ret || ret2;
}
@@ -479,7 +482,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
return filemap_write_and_wait(btree_inode->i_mapping);
}
return btrfs_write_and_wait_marked_extents(root,
- &trans->transaction->dirty_pages);
+ &trans->transaction->dirty_pages,
+ EXTENT_DIRTY);
}
/*
@@ -497,13 +501,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
{
int ret;
u64 old_root_bytenr;
+ u64 old_root_used;
struct btrfs_root *tree_root = root->fs_info->tree_root;
+ old_root_used = btrfs_root_used(&root->root_item);
btrfs_write_dirty_block_groups(trans, root);
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
- if (old_root_bytenr == root->node->start)
+ if (old_root_bytenr == root->node->start &&
+ old_root_used == btrfs_root_used(&root->root_item))
break;
btrfs_set_root_node(&root->root_item, root->node);
@@ -512,6 +519,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
&root->root_item);
BUG_ON(ret);
+ old_root_used = btrfs_root_used(&root->root_item);
ret = btrfs_write_dirty_block_groups(trans, root);
BUG_ON(ret);
}
@@ -795,7 +803,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(&pending->root_key, &key, sizeof(key));
fail:
kfree(new_root_item);
- btrfs_unreserve_metadata_space(root, 6);
return ret;
}
@@ -807,7 +814,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
u64 index = 0;
struct btrfs_trans_handle *trans;
struct inode *parent_inode;
- struct inode *inode;
struct btrfs_root *parent_root;
parent_inode = pending->dentry->d_parent->d_inode;
@@ -839,8 +845,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
BUG_ON(ret);
- inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
- d_instantiate(pending->dentry, inode);
fail:
btrfs_end_transaction(trans, fs_info->fs_root);
return ret;
@@ -994,11 +998,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
if (flush_on_commit) {
- btrfs_start_delalloc_inodes(root);
- ret = btrfs_wait_ordered_extents(root, 0);
+ btrfs_start_delalloc_inodes(root, 1);
+ ret = btrfs_wait_ordered_extents(root, 0, 1);
BUG_ON(ret);
} else if (snap_pending) {
- ret = btrfs_wait_ordered_extents(root, 1);
+ ret = btrfs_wait_ordered_extents(root, 0, 1);
BUG_ON(ret);
}
@@ -1116,6 +1120,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
current->journal_info = NULL;
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+ if (current != root->fs_info->transaction_kthread)
+ btrfs_run_delayed_iputs(root);
+
return ret;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d4e3e7a6938c..93c7ccb33118 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,10 +107,10 @@ void btrfs_throttle(struct btrfs_root *root);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages);
+ struct extent_io_tree *dirty_pages, int mark);
int btrfs_write_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages);
+ struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages);
+ struct extent_io_tree *dirty_pages, int mark);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 741666a7676a..4a9434b622ec 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -542,8 +542,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
- ret = btrfs_drop_extents(trans, root, inode,
- start, extent_end, extent_end, start, &alloc_hint, 1);
+ ret = btrfs_drop_extents(trans, inode, start, extent_end,
+ &alloc_hint, 1);
BUG_ON(ret);
if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -930,6 +930,17 @@ out_nowrite:
return 0;
}
+static int insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ int ret;
+ ret = btrfs_find_orphan_item(root, offset);
+ if (ret > 0)
+ ret = btrfs_insert_orphan_item(trans, root, offset);
+ return ret;
+}
+
+
/*
* There are a few corners where the link count of the file can't
* be properly maintained during replay. So, instead of adding
@@ -997,9 +1008,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
BTRFS_I(inode)->index_cnt = (u64)-1;
- if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path,
- inode->i_ino, 1);
+ if (inode->i_nlink == 0) {
+ if (S_ISDIR(inode->i_mode)) {
+ ret = replay_dir_deletes(trans, root, NULL, path,
+ inode->i_ino, 1);
+ BUG_ON(ret);
+ }
+ ret = insert_orphan_item(trans, root, inode->i_ino);
BUG_ON(ret);
}
btrfs_free_path(path);
@@ -1587,7 +1602,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
/* inode keys are done during the first stage */
if (key.type == BTRFS_INODE_ITEM_KEY &&
wc->stage == LOG_WALK_REPLAY_INODES) {
- struct inode *inode;
struct btrfs_inode_item *inode_item;
u32 mode;
@@ -1603,31 +1617,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
eb, i, &key);
BUG_ON(ret);
- /* for regular files, truncate away
- * extents past the new EOF
+ /* for regular files, make sure corresponding
+ * orhpan item exist. extents past the new EOF
+ * will be truncated later by orphan cleanup.
*/
if (S_ISREG(mode)) {
- inode = read_one_inode(root,
- key.objectid);
- BUG_ON(!inode);
-
- ret = btrfs_truncate_inode_items(wc->trans,
- root, inode, inode->i_size,
- BTRFS_EXTENT_DATA_KEY);
+ ret = insert_orphan_item(wc->trans, root,
+ key.objectid);
BUG_ON(ret);
-
- /* if the nlink count is zero here, the iput
- * will free the inode. We bump it to make
- * sure it doesn't get freed until the link
- * count fixup is done
- */
- if (inode->i_nlink == 0) {
- btrfs_inc_nlink(inode);
- btrfs_update_inode(wc->trans,
- root, inode);
- }
- iput(inode);
}
+
ret = link_to_fixup_dir(wc->trans, root,
path, key.objectid);
BUG_ON(ret);
@@ -1977,10 +1976,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
{
int index1;
int index2;
+ int mark;
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
- u64 log_transid = 0;
+ unsigned long log_transid = 0;
mutex_lock(&root->log_mutex);
index1 = root->log_transid % 2;
@@ -2014,24 +2014,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
+ log_transid = root->log_transid;
+ if (log_transid % 2 == 0)
+ mark = EXTENT_DIRTY;
+ else
+ mark = EXTENT_NEW;
+
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
- ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
+ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
BUG_ON(ret);
btrfs_set_root_node(&log->root_item, log->node);
root->log_batch = 0;
- log_transid = root->log_transid;
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
smp_mb();
/*
- * log tree has been flushed to disk, new modifications of
- * the log will be written to new positions. so it's safe to
- * allow log writers to go in.
+ * IO has been started, blocks of the log tree have WRITTEN flag set
+ * in their headers. new modifications of the log will be written to
+ * new positions. so it's safe to allow log writers to go in.
*/
mutex_unlock(&root->log_mutex);
@@ -2052,7 +2057,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
- btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2072,16 +2077,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* check the full commit flag again
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
- btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
}
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages);
+ &log_root_tree->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
BUG_ON(ret);
- btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_set_super_log_root(&root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -2147,12 +2153,12 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY);
+ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
if (ret)
break;
- clear_extent_dirty(&log->dirty_log_pages,
- start, end, GFP_NOFS);
+ clear_extent_bits(&log->dirty_log_pages, start, end,
+ EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
}
if (log->log_transid > 0) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7eda483d7b5a..41ecbb2347f2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1135,7 +1135,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
root->fs_info->avail_metadata_alloc_bits;
if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
- root->fs_info->fs_devices->rw_devices <= 4) {
+ root->fs_info->fs_devices->num_devices <= 4) {
printk(KERN_ERR "btrfs: unable to go below four devices "
"on raid10\n");
ret = -EINVAL;
@@ -1143,7 +1143,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
- root->fs_info->fs_devices->rw_devices <= 2) {
+ root->fs_info->fs_devices->num_devices <= 2) {
printk(KERN_ERR "btrfs: unable to go below two "
"devices on raid1\n");
ret = -EINVAL;
@@ -1434,8 +1434,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
return -EINVAL;
bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
- if (!bdev)
- return -EIO;
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
if (root->fs_info->fs_devices->seeding) {
seeding_dev = 1;
@@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_chunk_size = 10 * calc_size;
min_stripe_size = 64 * 1024 * 1024;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- max_chunk_size = 4 * calc_size;
+ max_chunk_size = 256 * 1024 * 1024;
min_stripe_size = 32 * 1024 * 1024;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
calc_size = 8 * 1024 * 1024;
@@ -2538,6 +2538,11 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
if (!em)
return 1;
+ if (btrfs_test_opt(root, DEGRADED)) {
+ free_extent_map(em);
+ return 0;
+ }
+
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
if (!map->stripes[i].dev->writeable) {
@@ -2649,8 +2654,10 @@ again:
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
- if (!em && unplug_page)
+ if (!em && unplug_page) {
+ kfree(multi);
return 0;
+ }
if (!em) {
printk(KERN_CRIT "unable to find logical %llu len %llu\n",
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b6dd5967c48a..193b58f7d3f3 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -85,22 +85,23 @@ out:
return ret;
}
-int __btrfs_setxattr(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+static int do_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
{
struct btrfs_dir_item *di;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
struct btrfs_path *path;
- int ret = 0, mod = 0;
+ size_t name_len = strlen(name);
+ int ret = 0;
+
+ if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
+ return -ENOSPC;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- trans = btrfs_join_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
-
/* first lets see if we already have this xattr */
di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
strlen(name), -1);
@@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
}
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret)
- goto out;
+ BUG_ON(ret);
btrfs_release_path(root, path);
/* if we don't have a value then we are removing the xattr */
- if (!value) {
- mod = 1;
+ if (!value)
goto out;
- }
} else {
btrfs_release_path(root, path);
@@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
}
/* ok we have to create a completely new xattr */
- ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
- value, size, inode->i_ino);
+ ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
+ name, name_len, value, size);
+ BUG_ON(ret);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (trans)
+ return do_setxattr(trans, inode, name, value, size, flags);
+
+ ret = btrfs_reserve_metadata_space(root, 2);
if (ret)
- goto out;
- mod = 1;
+ return ret;
-out:
- if (mod) {
- inode->i_ctime = CURRENT_TIME;
- ret = btrfs_update_inode(trans, root, inode);
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
}
+ btrfs_set_trans_block_group(trans, inode);
- btrfs_end_transaction(trans, root);
- btrfs_free_path(path);
+ ret = do_setxattr(trans, inode, name, value, size, flags);
+ if (ret)
+ goto out;
+
+ inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+out:
+ btrfs_end_transaction_throttle(trans, root);
+ btrfs_unreserve_metadata_space(root, 2);
return ret;
}
@@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (size == 0)
value = ""; /* empty EA, do not remove */
- return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
+
+ return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size,
+ flags);
}
int btrfs_removexattr(struct dentry *dentry, const char *name)
@@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
if (!btrfs_is_valid_xattr(name))
return -EOPNOTSUPP;
- return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
+
+ return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
+ XATTR_REPLACE);
}
-int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
{
int err;
size_t len;
@@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
} else {
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
- err = __btrfs_setxattr(inode, name, value, len, 0);
+ err = __btrfs_setxattr(trans, inode, name, value, len, 0);
kfree(name);
}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index c71e9c3cf3f7..721efa0346e0 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr_handlers[];
extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size);
-extern int __btrfs_setxattr(struct inode *inode, const char *name,
- const void *value, size_t size, int flags);
-
+extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size);
extern int btrfs_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
extern int btrfs_removexattr(struct dentry *dentry, const char *name);
-extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
+extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir);
#endif /* __XATTR__ */
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index b5808cdb2232..039b5011d83b 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
/*
* check the security details of the on-disk cache
* - must be called with security override in force
+ * - must return with a security override in force - even in the case of an
+ * error
*/
int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
struct dentry *root,
@@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
* which create files */
ret = set_create_files_as(new, root->d_inode);
if (ret < 0) {
+ abort_creds(new);
+ cachefiles_begin_secure(cache, _saved_cred);
_leave(" = %d [cfa]", ret);
return ret;
}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d6db933df2b2..be7613ebb661 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -39,7 +39,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
#endif
/* permit direct mmap, for read, write or exec */
BDI_CAP_MAP_DIRECT |
- BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP),
+ BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
+ /* no writeback happens */
+ BDI_CAP_NO_ACCT_AND_WRITEBACK),
};
static struct kobj_map *cdev_map;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 29f1da761bbf..144540764dec 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1033,7 +1033,7 @@ init_cifs(void)
goto out_unregister_filesystem;
#endif
#ifdef CONFIG_CIFS_DFS_UPCALL
- rc = register_key_type(&key_type_dns_resolver);
+ rc = cifs_init_dns_resolver();
if (rc)
goto out_unregister_key_type;
#endif
@@ -1045,7 +1045,7 @@ init_cifs(void)
out_unregister_resolver_key:
#ifdef CONFIG_CIFS_DFS_UPCALL
- unregister_key_type(&key_type_dns_resolver);
+ cifs_exit_dns_resolver();
out_unregister_key_type:
#endif
#ifdef CONFIG_CIFS_UPCALL
@@ -1071,7 +1071,7 @@ exit_cifs(void)
cifs_proc_clean();
#ifdef CONFIG_CIFS_DFS_UPCALL
cifs_dfs_release_automount_timer();
- unregister_key_type(&key_type_dns_resolver);
+ cifs_exit_dns_resolver();
#endif
#ifdef CONFIG_CIFS_UPCALL
unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d0fde18039c..c4dbc633361f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -499,6 +499,7 @@ struct dfs_info3_param {
#define CIFS_FATTR_DFS_REFERRAL 0x1
#define CIFS_FATTR_DELETE_PENDING 0x2
#define CIFS_FATTR_NEED_REVAL 0x4
+#define CIFS_FATTR_INO_COLLISION 0x8
struct cifs_fattr {
u32 cf_flags;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..05a9b776e1a8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -95,8 +95,10 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
__u16 fileHandle, struct file *file,
struct vfsmount *mnt, unsigned int oflags);
extern int cifs_posix_open(char *full_path, struct inode **pinode,
- struct vfsmount *mnt, int mode, int oflags,
- __u32 *poplock, __u16 *pnetfid, int xid);
+ struct vfsmount *mnt,
+ struct super_block *sb,
+ int mode, int oflags,
+ __u32 *poplock, __u16 *pnetfid, int xid);
extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
FILE_UNIX_BASIC_INFO *info,
struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..4e6dbab84435 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1430,6 +1430,8 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
__u32 bytes_sent;
__u16 byte_count;
+ *nbytes = 0;
+
/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
if (tcon->ses == NULL)
return -ECONNABORTED;
@@ -1512,11 +1514,18 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
cifs_stats_inc(&tcon->num_writes);
if (rc) {
cFYI(1, ("Send error in write = %d", rc));
- *nbytes = 0;
} else {
*nbytes = le16_to_cpu(pSMBr->CountHigh);
*nbytes = (*nbytes) << 16;
*nbytes += le16_to_cpu(pSMBr->Count);
+
+ /*
+ * Mask off high 16 bits when bytes written as returned by the
+ * server is greater than bytes requested by the client. Some
+ * OS/2 servers are known to set incorrect CountHigh values.
+ */
+ if (*nbytes > count)
+ *nbytes &= 0xFFFF;
}
cifs_buf_release(pSMB);
@@ -1605,6 +1614,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
*nbytes = le16_to_cpu(pSMBr->CountHigh);
*nbytes = (*nbytes) << 16;
*nbytes += le16_to_cpu(pSMBr->Count);
+
+ /*
+ * Mask off high 16 bits when bytes written as returned by the
+ * server is greater than bytes requested by the client. OS/2
+ * servers are known to set incorrect CountHigh values.
+ */
+ if (*nbytes > count)
+ *nbytes &= 0xFFFF;
}
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 63ea83ff687f..3bbcaa716b3c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2287,12 +2287,12 @@ int
cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
char *mount_data_global, const char *devname)
{
- int rc = 0;
+ int rc;
int xid;
struct smb_vol *volume_info;
- struct cifsSesInfo *pSesInfo = NULL;
- struct cifsTconInfo *tcon = NULL;
- struct TCP_Server_Info *srvTcp = NULL;
+ struct cifsSesInfo *pSesInfo;
+ struct cifsTconInfo *tcon;
+ struct TCP_Server_Info *srvTcp;
char *full_path;
char *mount_data = mount_data_global;
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -2301,6 +2301,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
int referral_walks_count = 0;
try_mount_again:
#endif
+ rc = 0;
+ tcon = NULL;
+ pSesInfo = NULL;
+ srvTcp = NULL;
full_path = NULL;
xid = GetXid();
@@ -2597,6 +2601,7 @@ remote_path_check:
cleanup_volume_info(&volume_info);
referral_walks_count++;
+ FreeXid(xid);
goto try_mount_again;
}
#else /* No DFS support, return error on mount */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1f42f772865a..c3d6182d0ebe 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -183,13 +183,14 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
}
int cifs_posix_open(char *full_path, struct inode **pinode,
- struct vfsmount *mnt, int mode, int oflags,
- __u32 *poplock, __u16 *pnetfid, int xid)
+ struct vfsmount *mnt, struct super_block *sb,
+ int mode, int oflags,
+ __u32 *poplock, __u16 *pnetfid, int xid)
{
int rc;
FILE_UNIX_BASIC_INFO *presp_data;
__u32 posix_flags = 0;
- struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_fattr fattr;
cFYI(1, ("posix open %s", full_path));
@@ -241,7 +242,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
/* get new inode and set it up */
if (*pinode == NULL) {
- *pinode = cifs_iget(mnt->mnt_sb, &fattr);
+ *pinode = cifs_iget(sb, &fattr);
if (!*pinode) {
rc = -ENOMEM;
goto posix_open_ret;
@@ -250,7 +251,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
cifs_fattr_to_inode(*pinode, &fattr);
}
- cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
+ if (mnt)
+ cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
posix_open_ret:
kfree(presp_data);
@@ -314,13 +316,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
if (nd && (nd->flags & LOOKUP_OPEN))
oflags = nd->intent.open.flags;
else
- oflags = FMODE_READ;
+ oflags = FMODE_READ | SMB_O_CREAT;
if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
- rc = cifs_posix_open(full_path, &newinode, nd->path.mnt,
- mode, oflags, &oplock, &fileHandle, xid);
+ rc = cifs_posix_open(full_path, &newinode,
+ nd ? nd->path.mnt : NULL,
+ inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
/* EIO could indicate that (posix open) operation is not
supported, despite what server claimed in capability
negotation. EREMOTE indicates DFS junction, which is not
@@ -677,6 +680,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
(nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
(nd->intent.open.flags & O_CREAT)) {
rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
+ parent_dir_inode->i_sb,
nd->intent.open.create_mode,
nd->intent.open.flags, &oplock,
&fileHandle, xid);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..31da21f0654c 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,12 +23,16 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
#include <keys/user-type.h>
#include "dns_resolve.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
+static const struct cred *dns_resolver_cache;
+
/* Checks if supplied name is IP address
* returns:
* 1 - name is IP
@@ -93,6 +97,7 @@ struct key_type key_type_dns_resolver = {
int
dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
{
+ const struct cred *saved_cred;
int rc = -EAGAIN;
struct key *rkey = ERR_PTR(-EAGAIN);
char *name;
@@ -132,8 +137,15 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
goto skip_upcall;
}
+ saved_cred = override_creds(dns_resolver_cache);
rkey = request_key(&key_type_dns_resolver, name, "");
+ revert_creds(saved_cred);
if (!IS_ERR(rkey)) {
+ if (!(rkey->perm & KEY_USR_VIEW)) {
+ down_read(&rkey->sem);
+ rkey->perm |= KEY_USR_VIEW;
+ up_read(&rkey->sem);
+ }
len = rkey->type_data.x[0];
data = rkey->payload.data;
} else {
@@ -164,4 +176,61 @@ out:
return rc;
}
+int __init cifs_init_dns_resolver(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret;
+
+ printk(KERN_NOTICE "Registering the %s key type\n",
+ key_type_dns_resolver.name);
+
+ /* create an override credential set with a special thread keyring in
+ * which DNS requests are cached
+ *
+ * this is used to prevent malicious redirections from being installed
+ * with add_key().
+ */
+ cred = prepare_kernel_cred(NULL);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+ if (ret < 0)
+ goto failed_put_key;
+
+ ret = register_key_type(&key_type_dns_resolver);
+ if (ret < 0)
+ goto failed_put_key;
+
+ /* instruct request_key() to use this special keyring as a cache for
+ * the results it looks up */
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ dns_resolver_cache = cred;
+ return 0;
+
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+void cifs_exit_dns_resolver(void)
+{
+ key_revoke(dns_resolver_cache->thread_keyring);
+ unregister_key_type(&key_type_dns_resolver);
+ put_cred(dns_resolver_cache);
+ printk(KERN_NOTICE "Unregistered %s key type\n",
+ key_type_dns_resolver.name);
+}
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 966e9288930b..763237aa2a23 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,10 @@
#define _DNS_RESOLVE_H
#ifdef __KERNEL__
-#include <linux/key-type.h>
-extern struct key_type key_type_dns_resolver;
+#include <linux/module.h>
+
+extern int __init cifs_init_dns_resolver(void);
+extern void cifs_exit_dns_resolver(void);
extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
#endif /* KERNEL */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 429337eb7afe..5d1099a20cce 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -295,10 +295,12 @@ int cifs_open(struct inode *inode, struct file *file)
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+ oflags |= SMB_O_CREAT;
/* can not refresh inode info since size could be stale */
rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
- cifs_sb->mnt_file_mode /* ignored */,
- oflags, &oplock, &netfid, xid);
+ inode->i_sb,
+ cifs_sb->mnt_file_mode /* ignored */,
+ oflags, &oplock, &netfid, xid);
if (rc == 0) {
cFYI(1, ("posix open succeeded"));
/* no need for special case handling of setting mode
@@ -510,8 +512,9 @@ reopen_error_exit:
int oflags = (int) cifs_posix_convert_flags(file->f_flags);
/* can not refresh inode info since size could be stale */
rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
- cifs_sb->mnt_file_mode /* ignored */,
- oflags, &oplock, &netfid, xid);
+ inode->i_sb,
+ cifs_sb->mnt_file_mode /* ignored */,
+ oflags, &oplock, &netfid, xid);
if (rc == 0) {
cFYI(1, ("posix reopen succeeded"));
goto reopen_success;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cababd8a52df..303fd7f4dfe1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -610,6 +610,16 @@ cifs_find_inode(struct inode *inode, void *opaque)
if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
return 0;
+ /*
+ * uh oh -- it's a directory. We can't use it since hardlinked dirs are
+ * verboten. Disable serverino and return it as if it were found, the
+ * caller can discard it, generate a uniqueid and retry the find
+ */
+ if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+ fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
+ cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
+ }
+
return 1;
}
@@ -629,15 +639,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
unsigned long hash;
struct inode *inode;
+retry_iget5_locked:
cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
/* hash down to 32-bits on 32-bit arch */
hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
-
- /* we have fattrs in hand, update the inode */
if (inode) {
+ /* was there a problematic inode number collision? */
+ if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
+ iput(inode);
+ fattr->cf_uniqueid = iunique(sb, ROOT_I);
+ fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
+ goto retry_iget5_locked;
+ }
+
cifs_fattr_to_inode(inode, fattr);
if (sb->s_flags & MS_NOATIME)
inode->i_flags |= S_NOATIME | S_NOCMTIME;
@@ -1267,6 +1284,10 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
if (rc == 0 || rc != -ETXTBSY)
return rc;
+ /* open-file renames don't work across directories */
+ if (to_dentry->d_parent != from_dentry->d_parent)
+ return rc;
+
/* open the file to be renamed -- we need DELETE perms */
rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
CREATE_NOT_DIR, &srcfid, &oplock, NULL,
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f84062f9a985..f5618f8cc462 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -666,6 +666,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
min(len, max_len), nlt,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
+ pqst->len -= nls_nullsize(nlt);
} else {
pqst->name = filename;
pqst->len = len;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7085a6275c4c..6d6ff4fe60ea 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -723,15 +723,7 @@ ssetup_ntlmssp_authenticate:
/* calculate session key */
setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
- if (first_time) /* should this be moved into common code
- with similar ntlmv2 path? */
- /* cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
- response BB FIXME, v2_sess_key); */
-
- /* copy session key */
-
- /* memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE);
- bcc_ptr += LM2_SESS_KEY_SIZE; */
+ /* FIXME: calculate MAC key */
memcpy(bcc_ptr, (char *)v2_sess_key,
sizeof(struct ntlmv2_resp));
bcc_ptr += sizeof(struct ntlmv2_resp);
diff --git a/fs/compat.c b/fs/compat.c
index 6c19040ffeef..d576b552e8e2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1532,8 +1532,6 @@ int compat_do_execve(char * filename,
if (retval < 0)
goto out;
- current->stack_start = current->mm->start_stack;
-
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d22438ef7674..39c6ee868d7b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -32,7 +32,9 @@ static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
-static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
+ void *data, const struct file_operations *fops)
+
{
struct inode *inode = new_inode(sb);
@@ -44,14 +46,18 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
init_special_inode(inode, mode, dev);
break;
case S_IFREG:
- inode->i_fop = &debugfs_file_operations;
+ inode->i_fop = fops ? fops : &debugfs_file_operations;
+ inode->i_private = data;
break;
case S_IFLNK:
inode->i_op = &debugfs_link_operations;
+ inode->i_fop = fops;
+ inode->i_private = data;
break;
case S_IFDIR:
inode->i_op = &simple_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
+ inode->i_fop = fops ? fops : &simple_dir_operations;
+ inode->i_private = data;
/* directory inodes start off with i_nlink == 2
* (for "." entry) */
@@ -64,7 +70,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
/* SMP-safe */
static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t dev)
+ int mode, dev_t dev, void *data,
+ const struct file_operations *fops)
{
struct inode *inode;
int error = -EPERM;
@@ -72,7 +79,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
if (dentry->d_inode)
return -EEXIST;
- inode = debugfs_get_inode(dir->i_sb, mode, dev);
+ inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
if (inode) {
d_instantiate(dentry, inode);
dget(dentry);
@@ -81,12 +88,13 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
return error;
}
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
+ void *data, const struct file_operations *fops)
{
int res;
mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
- res = debugfs_mknod(dir, dentry, mode, 0);
+ res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
if (!res) {
inc_nlink(dir);
fsnotify_mkdir(dir, dentry);
@@ -94,18 +102,20 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return res;
}
-static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
+ void *data, const struct file_operations *fops)
{
mode = (mode & S_IALLUGO) | S_IFLNK;
- return debugfs_mknod(dir, dentry, mode, 0);
+ return debugfs_mknod(dir, dentry, mode, 0, data, fops);
}
-static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
+ void *data, const struct file_operations *fops)
{
int res;
mode = (mode & S_IALLUGO) | S_IFREG;
- res = debugfs_mknod(dir, dentry, mode, 0);
+ res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
if (!res)
fsnotify_create(dir, dentry);
return res;
@@ -139,7 +149,9 @@ static struct file_system_type debug_fs_type = {
static int debugfs_create_by_name(const char *name, mode_t mode,
struct dentry *parent,
- struct dentry **dentry)
+ struct dentry **dentry,
+ void *data,
+ const struct file_operations *fops)
{
int error = 0;
@@ -164,13 +176,16 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
if (!IS_ERR(*dentry)) {
switch (mode & S_IFMT) {
case S_IFDIR:
- error = debugfs_mkdir(parent->d_inode, *dentry, mode);
+ error = debugfs_mkdir(parent->d_inode, *dentry, mode,
+ data, fops);
break;
case S_IFLNK:
- error = debugfs_link(parent->d_inode, *dentry, mode);
+ error = debugfs_link(parent->d_inode, *dentry, mode,
+ data, fops);
break;
default:
- error = debugfs_create(parent->d_inode, *dentry, mode);
+ error = debugfs_create(parent->d_inode, *dentry, mode,
+ data, fops);
break;
}
dput(*dentry);
@@ -221,19 +236,13 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
if (error)
goto exit;
- error = debugfs_create_by_name(name, mode, parent, &dentry);
+ error = debugfs_create_by_name(name, mode, parent, &dentry,
+ data, fops);
if (error) {
dentry = NULL;
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
goto exit;
}
-
- if (dentry->d_inode) {
- if (data)
- dentry->d_inode->i_private = data;
- if (fops)
- dentry->d_inode->i_fop = fops;
- }
exit:
return dentry;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d5f8c96964be..8882ecc0f1bf 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -517,11 +517,23 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
{
+ struct dentry *dentry;
+ struct tty_struct *tty;
+
BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
+ /* Ensure dentry has not been deleted by devpts_pty_kill() */
+ dentry = d_find_alias(pts_inode);
+ if (!dentry)
+ return NULL;
+
+ tty = NULL;
if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
- return (struct tty_struct *)pts_inode->i_private;
- return NULL;
+ tty = (struct tty_struct *)pts_inode->i_private;
+
+ dput(dentry);
+
+ return tty;
}
void devpts_pty_kill(struct tty_struct *tty)
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad6008b2d..4314f0d48d85 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
spin_unlock(&ast_queue_lock);
}
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
{
if (lkb->lkb_flags & DLM_IFL_USER) {
- dlm_user_add_ast(lkb, type, bastmode);
+ dlm_user_add_ast(lkb, type, mode);
return;
}
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
kref_get(&lkb->lkb_ref);
list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+ lkb->lkb_ast_first = type;
}
+
+ /* sanity check, this should not happen */
+
+ if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
+ log_print("repeat cast %d castmode %d lock %x %s",
+ mode, lkb->lkb_castmode,
+ lkb->lkb_id, lkb->lkb_resource->res_name);
+
lkb->lkb_ast_type |= type;
- if (bastmode)
- lkb->lkb_bastmode = bastmode;
+ if (type == AST_BAST)
+ lkb->lkb_bastmode = mode;
+ else
+ lkb->lkb_castmode = mode;
spin_unlock(&ast_queue_lock);
set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
struct dlm_ls *ls = NULL;
struct dlm_rsb *r = NULL;
struct dlm_lkb *lkb;
- void (*cast) (void *astparam);
- void (*bast) (void *astparam, int mode);
- int type = 0, bastmode;
+ void (*castfn) (void *astparam);
+ void (*bastfn) (void *astparam, int mode);
+ int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
repeat:
spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
list_del(&lkb->lkb_astqueue);
type = lkb->lkb_ast_type;
lkb->lkb_ast_type = 0;
+ first = lkb->lkb_ast_first;
+ lkb->lkb_ast_first = 0;
bastmode = lkb->lkb_bastmode;
-
+ castmode = lkb->lkb_castmode;
+ castfn = lkb->lkb_astfn;
+ bastfn = lkb->lkb_bastfn;
spin_unlock(&ast_queue_lock);
- cast = lkb->lkb_astfn;
- bast = lkb->lkb_bastfn;
-
- if ((type & AST_COMP) && cast)
- cast(lkb->lkb_astparam);
- if ((type & AST_BAST) && bast)
- bast(lkb->lkb_astparam, bastmode);
+ do_cast = (type & AST_COMP) && castfn;
+ do_bast = (type & AST_BAST) && bastfn;
+
+ /* Skip a bast if its blocking mode is compatible with the
+ granted mode of the preceding cast. */
+
+ if (do_bast) {
+ if (first == AST_COMP)
+ last_castmode = castmode;
+ else
+ last_castmode = lkb->lkb_castmode_done;
+ if (dlm_modes_compat(bastmode, last_castmode))
+ do_bast = 0;
+ }
+
+ if (first == AST_COMP) {
+ if (do_cast)
+ castfn(lkb->lkb_astparam);
+ if (do_bast)
+ bastfn(lkb->lkb_astparam, bastmode);
+ } else if (first == AST_BAST) {
+ if (do_bast)
+ bastfn(lkb->lkb_astparam, bastmode);
+ if (do_cast)
+ castfn(lkb->lkb_astparam);
+ } else {
+ log_error(ls, "bad ast_first %d ast_type %d",
+ first, type);
+ }
+
+ if (do_cast)
+ lkb->lkb_castmode_done = castmode;
+ if (do_bast)
+ lkb->lkb_bastmode_done = bastmode;
/* this removes the reference added by dlm_add_ast
and may result in the lkb being freed */
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f428fd..bcb1aaba519d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
void dlm_del_ast(struct dlm_lkb *lkb);
void dlm_astd_wake(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index fd9859f92fad..0df243850818 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -410,10 +410,10 @@ static struct config_group *make_cluster(struct config_group *g,
struct dlm_comms *cms = NULL;
void *gps = NULL;
- cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
- gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
- sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
- cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
+ cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
+ gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
+ sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
+ cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
if (!cl || !gps || !sps || !cms)
goto fail;
@@ -482,9 +482,9 @@ static struct config_group *make_space(struct config_group *g, const char *name)
struct dlm_nodes *nds = NULL;
void *gps = NULL;
- sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
- gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
- nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
+ sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
+ gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
+ nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
if (!sp || !gps || !nds)
goto fail;
@@ -536,7 +536,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
{
struct dlm_comm *cm;
- cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
+ cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
if (!cm)
return ERR_PTR(-ENOMEM);
@@ -569,7 +569,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
struct dlm_node *nd;
- nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
+ nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
if (!nd)
return ERR_PTR(-ENOMEM);
@@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
return -ENOSPC;
- addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+ addr = kzalloc(sizeof(*addr), GFP_NOFS);
if (!addr)
return -ENOMEM;
@@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
ids_count = sp->members_count;
- ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL);
+ ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
if (!ids) {
rv = -ENOMEM;
goto out;
@@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
if (!new_count)
goto out_ids;
- new = kcalloc(new_count, sizeof(int), GFP_KERNEL);
+ new = kcalloc(new_count, sizeof(int), GFP_NOFS);
if (!new) {
kfree(ids);
rv = -ENOMEM;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1c8bb8c3a82e..375a2359b3bf 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
if (bucket >= ls->ls_rsbtbl_size)
return NULL;
- ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
+ ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
if (!ri)
return NULL;
if (n == 0)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index c4dfa1dcc86f..7b84c1dbc82e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
spin_unlock(&ls->ls_recover_list_lock);
if (!found)
- de = kzalloc(sizeof(struct dlm_direntry) + len,
- ls->ls_allocation);
+ de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
return de;
}
@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
dlm_dir_clear(ls);
- last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
+ last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
if (!last_name)
goto out;
@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
if (namelen > DLM_RESNAME_MAXLEN)
return -EINVAL;
- de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
+ de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
if (!de)
return -ENOMEM;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d01ca0a711db..f632b58cd222 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
int8_t lkb_status; /* granted, waiting, convert */
int8_t lkb_rqmode; /* requested lock mode */
int8_t lkb_grmode; /* granted lock mode */
- int8_t lkb_bastmode; /* requested mode */
int8_t lkb_highbast; /* highest mode bast sent for */
+
int8_t lkb_wait_type; /* type of reply waiting for */
int8_t lkb_wait_count;
int8_t lkb_ast_type; /* type of ast queued for */
+ int8_t lkb_ast_first; /* type of first ast queued */
+
+ int8_t lkb_bastmode; /* req mode of queued bast */
+ int8_t lkb_castmode; /* gr mode of queued cast */
+ int8_t lkb_bastmode_done; /* last delivered bastmode */
+ int8_t lkb_castmode_done; /* last delivered castmode */
struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -473,7 +479,6 @@ struct dlm_ls {
int ls_low_nodeid;
int ls_total_weight;
int *ls_node_array;
- gfp_t ls_allocation;
struct dlm_rsb ls_stub_rsb; /* for returning errors */
struct dlm_lkb ls_stub_lkb; /* for returning errors */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index eb507c453c5f..d0e43a3da887 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
lkb->lkb_lksb->sb_status = rv;
lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
- dlm_add_ast(lkb, AST_COMP, 0);
+ dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
}
static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2280,20 +2280,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
if (can_be_queued(lkb)) {
error = -EINPROGRESS;
add_lkb(r, lkb, DLM_LKSTS_WAITING);
- send_blocking_asts(r, lkb);
add_timeout(lkb);
goto out;
}
error = -EAGAIN;
- if (force_blocking_asts(lkb))
- send_blocking_asts_all(r, lkb);
queue_cast(r, lkb, -EAGAIN);
-
out:
return error;
}
+static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ int error)
+{
+ switch (error) {
+ case -EAGAIN:
+ if (force_blocking_asts(lkb))
+ send_blocking_asts_all(r, lkb);
+ break;
+ case -EINPROGRESS:
+ send_blocking_asts(r, lkb);
+ break;
+ }
+}
+
static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error = 0;
@@ -2304,7 +2314,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
if (can_be_granted(r, lkb, 1, &deadlk)) {
grant_lock(r, lkb);
queue_cast(r, lkb, 0);
- grant_pending_locks(r);
goto out;
}
@@ -2334,7 +2343,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
if (_can_be_granted(r, lkb, 1)) {
grant_lock(r, lkb);
queue_cast(r, lkb, 0);
- grant_pending_locks(r);
goto out;
}
/* else fall through and move to convert queue */
@@ -2344,28 +2352,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
error = -EINPROGRESS;
del_lkb(r, lkb);
add_lkb(r, lkb, DLM_LKSTS_CONVERT);
- send_blocking_asts(r, lkb);
add_timeout(lkb);
goto out;
}
error = -EAGAIN;
- if (force_blocking_asts(lkb))
- send_blocking_asts_all(r, lkb);
queue_cast(r, lkb, -EAGAIN);
-
out:
return error;
}
+static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ int error)
+{
+ switch (error) {
+ case 0:
+ grant_pending_locks(r);
+ /* grant_pending_locks also sends basts */
+ break;
+ case -EAGAIN:
+ if (force_blocking_asts(lkb))
+ send_blocking_asts_all(r, lkb);
+ break;
+ case -EINPROGRESS:
+ send_blocking_asts(r, lkb);
+ break;
+ }
+}
+
static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
remove_lock(r, lkb);
queue_cast(r, lkb, -DLM_EUNLOCK);
- grant_pending_locks(r);
return -DLM_EUNLOCK;
}
+static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ int error)
+{
+ grant_pending_locks(r);
+}
+
/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2402,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
error = revert_lock(r, lkb);
if (error) {
queue_cast(r, lkb, -DLM_ECANCEL);
- grant_pending_locks(r);
return -DLM_ECANCEL;
}
return 0;
}
+static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ int error)
+{
+ if (error)
+ grant_pending_locks(r);
+}
+
/*
* Four stage 3 varieties:
* _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2435,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
goto out;
}
- if (is_remote(r))
+ if (is_remote(r)) {
/* receive_request() calls do_request() on remote node */
error = send_request(r, lkb);
- else
+ } else {
error = do_request(r, lkb);
+ /* for remote locks the request_reply is sent
+ between do_request and do_request_effects */
+ do_request_effects(r, lkb, error);
+ }
out:
return error;
}
@@ -2417,11 +2454,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error;
- if (is_remote(r))
+ if (is_remote(r)) {
/* receive_convert() calls do_convert() on remote node */
error = send_convert(r, lkb);
- else
+ } else {
error = do_convert(r, lkb);
+ /* for remote locks the convert_reply is sent
+ between do_convert and do_convert_effects */
+ do_convert_effects(r, lkb, error);
+ }
return error;
}
@@ -2432,11 +2473,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error;
- if (is_remote(r))
+ if (is_remote(r)) {
/* receive_unlock() calls do_unlock() on remote node */
error = send_unlock(r, lkb);
- else
+ } else {
error = do_unlock(r, lkb);
+ /* for remote locks the unlock_reply is sent
+ between do_unlock and do_unlock_effects */
+ do_unlock_effects(r, lkb, error);
+ }
return error;
}
@@ -2447,11 +2492,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error;
- if (is_remote(r))
+ if (is_remote(r)) {
/* receive_cancel() calls do_cancel() on remote node */
error = send_cancel(r, lkb);
- else
+ } else {
error = do_cancel(r, lkb);
+ /* for remote locks the cancel_reply is sent
+ between do_cancel and do_cancel_effects */
+ do_cancel_effects(r, lkb, error);
+ }
return error;
}
@@ -2689,7 +2738,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
pass into lowcomms_commit and a message buffer (mb) that we
write our data into */
- mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
+ mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
if (!mh)
return -ENOBUFS;
@@ -3191,6 +3240,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
attach_lkb(r, lkb);
error = do_request(r, lkb);
send_request_reply(r, lkb, error);
+ do_request_effects(r, lkb, error);
unlock_rsb(r);
put_rsb(r);
@@ -3226,15 +3276,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
goto out;
receive_flags(lkb, ms);
+
error = receive_convert_args(ls, lkb, ms);
- if (error)
- goto out_reply;
+ if (error) {
+ send_convert_reply(r, lkb, error);
+ goto out;
+ }
+
reply = !down_conversion(lkb);
error = do_convert(r, lkb);
- out_reply:
if (reply)
send_convert_reply(r, lkb, error);
+ do_convert_effects(r, lkb, error);
out:
unlock_rsb(r);
put_rsb(r);
@@ -3266,13 +3320,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
goto out;
receive_flags(lkb, ms);
+
error = receive_unlock_args(ls, lkb, ms);
- if (error)
- goto out_reply;
+ if (error) {
+ send_unlock_reply(r, lkb, error);
+ goto out;
+ }
error = do_unlock(r, lkb);
- out_reply:
send_unlock_reply(r, lkb, error);
+ do_unlock_effects(r, lkb, error);
out:
unlock_rsb(r);
put_rsb(r);
@@ -3307,6 +3364,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
error = do_cancel(r, lkb);
send_cancel_reply(r, lkb, error);
+ do_cancel_effects(r, lkb, error);
out:
unlock_rsb(r);
put_rsb(r);
@@ -4512,7 +4570,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
}
if (flags & DLM_LKF_VALBLK) {
- ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+ ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
if (!ua->lksb.sb_lvbptr) {
kfree(ua);
__put_lkb(ls, lkb);
@@ -4582,7 +4640,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
ua = lkb->lkb_ua;
if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
- ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+ ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
if (!ua->lksb.sb_lvbptr) {
error = -ENOMEM;
goto out_put;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d489fcc86713..c010ecfc0d29 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -430,7 +430,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
error = -ENOMEM;
- ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+ ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
if (!ls)
goto out;
memcpy(ls->ls_name, name, namelen);
@@ -443,11 +443,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
if (flags & DLM_LSFL_TIMEWARN)
set_bit(LSFL_TIMEWARN, &ls->ls_flags);
- if (flags & DLM_LSFL_FS)
- ls->ls_allocation = GFP_NOFS;
- else
- ls->ls_allocation = GFP_KERNEL;
-
/* ls_exflags are forced to match among nodes, and we don't
need to require all nodes to have some flags set */
ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
@@ -456,7 +451,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
size = dlm_config.ci_rsbtbl_size;
ls->ls_rsbtbl_size = size;
- ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+ ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS);
if (!ls->ls_rsbtbl)
goto out_lsfree;
for (i = 0; i < size; i++) {
@@ -468,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
size = dlm_config.ci_lkbtbl_size;
ls->ls_lkbtbl_size = size;
- ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+ ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS);
if (!ls->ls_lkbtbl)
goto out_rsbfree;
for (i = 0; i < size; i++) {
@@ -480,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
size = dlm_config.ci_dirtbl_size;
ls->ls_dirtbl_size = size;
- ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+ ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS);
if (!ls->ls_dirtbl)
goto out_lkbfree;
for (i = 0; i < size; i++) {
@@ -527,7 +522,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
mutex_init(&ls->ls_requestqueue_mutex);
mutex_init(&ls->ls_clear_proc_locks);
- ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
+ ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
if (!ls->ls_recover_buf)
goto out_dirfree;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70736eb4b516..52cab160893c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1060,7 +1060,7 @@ static void init_local(void)
if (dlm_our_addr(&sas, i))
break;
- addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+ addr = kmalloc(sizeof(*addr), GFP_NOFS);
if (!addr)
break;
memcpy(addr, &sas, sizeof(*addr));
@@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void)
struct sockaddr_storage localaddr;
struct sctp_event_subscribe subscribe;
int result = -EINVAL, num = 1, i, addr_len;
- struct connection *con = nodeid2con(0, GFP_KERNEL);
+ struct connection *con = nodeid2con(0, GFP_NOFS);
int bufsize = NEEDED_RMEM;
if (!con)
@@ -1171,7 +1171,7 @@ out:
static int tcp_listen_for_all(void)
{
struct socket *sock = NULL;
- struct connection *con = nodeid2con(0, GFP_KERNEL);
+ struct connection *con = nodeid2con(0, GFP_NOFS);
int result = -EINVAL;
if (!con)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b128775913b2..84f70bfb0baf 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
struct dlm_member *memb;
int w, error;
- memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
+ memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
if (!memb)
return -ENOMEM;
@@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
ls->ls_total_weight = total;
- array = kmalloc(sizeof(int) * total, ls->ls_allocation);
+ array = kmalloc(sizeof(int) * total, GFP_NOFS);
if (!array)
return;
@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
continue;
log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
- memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
+ memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
if (!memb)
return -ENOMEM;
memb->nodeid = rv->new[i];
@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
int *ids = NULL, *new = NULL;
int error, ids_count = 0, new_count = 0;
- rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
+ rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
if (!rv)
return -ENOMEM;
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index c1775b84ebab..8e0d00db004f 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
{
char *p;
- p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
+ p = kzalloc(ls->ls_lvblen, GFP_NOFS);
return p;
}
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
- r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
+ r = kzalloc(sizeof(*r) + namelen, GFP_NOFS);
return r;
}
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
{
struct dlm_lkb *lkb;
- lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
+ lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
return lkb;
}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 55ea369f43a9..052095cd592f 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -26,7 +26,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
struct sk_buff *skb;
void *data;
- skb = genlmsg_new(size, GFP_KERNEL);
+ skb = genlmsg_new(size, GFP_NOFS);
if (!skb)
return -ENOMEM;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 16f682e26c07..2863deb178e2 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
if (!ls)
return -EINVAL;
- xop = kzalloc(sizeof(*xop), GFP_KERNEL);
+ xop = kzalloc(sizeof(*xop), GFP_NOFS);
if (!xop) {
rv = -ENOMEM;
goto out;
@@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
if (!ls)
return -EINVAL;
- op = kzalloc(sizeof(*op), GFP_KERNEL);
+ op = kzalloc(sizeof(*op), GFP_NOFS);
if (!op) {
rv = -ENOMEM;
goto out;
@@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
if (!ls)
return -EINVAL;
- op = kzalloc(sizeof(*op), GFP_KERNEL);
+ op = kzalloc(sizeof(*op), GFP_NOFS);
if (!op) {
rv = -ENOMEM;
goto out;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 67522c268c14..3c83a49a48a3 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
char *mb;
int mb_len = sizeof(struct dlm_rcom) + len;
- mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
+ mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
if (!mh) {
log_print("create_rcom to %d type %d len %d ENOBUFS",
to_nodeid, type, len);
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 7a2307c08911..a44fa22890e1 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
struct rq_entry *e;
int length = ms->m_header.h_length - sizeof(struct dlm_message);
- e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
+ e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
if (!e) {
log_print("dlm_add_requestqueue: out of memory len %d", length);
return;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebce994ab0b7..a4bfd31ac45b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -173,7 +173,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
/* we could possibly check if the cancel of an orphan has resulted in the lkb
being removed and then remove that lkb from the orphans list and free it */
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
{
struct dlm_ls *ls;
struct dlm_user_args *ua;
@@ -206,8 +206,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
ast_type = lkb->lkb_ast_type;
lkb->lkb_ast_type |= type;
- if (bastmode)
- lkb->lkb_bastmode = bastmode;
+ if (type == AST_BAST)
+ lkb->lkb_bastmode = mode;
+ else
+ lkb->lkb_castmode = mode;
if (!ast_type) {
kref_get(&lkb->lkb_ref);
@@ -267,7 +269,7 @@ static int device_user_lock(struct dlm_user_proc *proc,
goto out;
}
- ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+ ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
if (!ua)
goto out;
ua->proc = proc;
@@ -307,7 +309,7 @@ static int device_user_unlock(struct dlm_user_proc *proc,
if (!ls)
return -ENOENT;
- ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+ ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
if (!ua)
goto out;
ua->proc = proc;
@@ -352,7 +354,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name)
error = -ENOMEM;
len = strlen(name) + strlen(name_prefix) + 2;
- ls->ls_device.name = kzalloc(len, GFP_KERNEL);
+ ls->ls_device.name = kzalloc(len, GFP_NOFS);
if (!ls->ls_device.name)
goto fail;
@@ -520,7 +522,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
#endif
return -EINVAL;
- kbuf = kzalloc(count + 1, GFP_KERNEL);
+ kbuf = kzalloc(count + 1, GFP_NOFS);
if (!kbuf)
return -ENOMEM;
@@ -546,7 +548,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
/* add 1 after namelen so that the name string is terminated */
kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
- GFP_KERNEL);
+ GFP_NOFS);
if (!kbuf) {
kfree(k32buf);
return -ENOMEM;
@@ -648,7 +650,7 @@ static int device_open(struct inode *inode, struct file *file)
if (!ls)
return -ENOENT;
- proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
+ proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
if (!proc) {
dlm_put_lockspace(ls);
return -ENOMEM;
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c9686492286..f196091dd7ff 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
#ifndef __USER_DOT_H__
#define __USER_DOT_H__
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
int dlm_user_init(void);
void dlm_user_exit(void);
int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index fbb6e5eed697..7cb0a59f4b9d 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1748,7 +1748,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
char *cipher_name, size_t *key_size)
{
char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
- char *full_alg_name;
+ char *full_alg_name = NULL;
int rc;
*key_tfm = NULL;
@@ -1763,7 +1763,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
if (rc)
goto out;
*key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
- kfree(full_alg_name);
if (IS_ERR(*key_tfm)) {
rc = PTR_ERR(*key_tfm);
printk(KERN_ERR "Unable to allocate crypto cipher with name "
@@ -1786,6 +1785,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
goto out;
}
out:
+ kfree(full_alg_name);
return rc;
}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9e944057001b..4e25328986ac 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,13 +191,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
| ECRYPTFS_ENCRYPTED);
}
mutex_unlock(&crypt_stat->cs_mutex);
- if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
- && !(file->f_flags & O_RDONLY)) {
- rc = -EPERM;
- printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
- "file must hence be opened RO\n", __func__);
- goto out;
- }
if (!ecryptfs_inode_to_private(inode)->lower_file) {
rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
if (rc) {
@@ -205,9 +198,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
"the persistent file for the dentry with name "
"[%s]; rc = [%d]\n", __func__,
ecryptfs_dentry->d_name.name, rc);
- goto out;
+ goto out_free;
}
}
+ if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
+ && !(file->f_flags & O_RDONLY)) {
+ rc = -EPERM;
+ printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
+ "file must hence be opened RO\n", __func__);
+ goto out_free;
+ }
ecryptfs_set_file_lower(
file, ecryptfs_inode_to_private(inode)->lower_file);
if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
@@ -293,12 +293,40 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
return rc;
}
-static int ecryptfs_ioctl(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg);
+static long
+ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct file *lower_file = NULL;
+ long rc = -ENOTTY;
+
+ if (ecryptfs_file_to_private(file))
+ lower_file = ecryptfs_file_to_lower(file);
+ if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl)
+ rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
+ return rc;
+}
+
+#ifdef CONFIG_COMPAT
+static long
+ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct file *lower_file = NULL;
+ long rc = -ENOIOCTLCMD;
+
+ if (ecryptfs_file_to_private(file))
+ lower_file = ecryptfs_file_to_lower(file);
+ if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl)
+ rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
+ return rc;
+}
+#endif
const struct file_operations ecryptfs_dir_fops = {
.readdir = ecryptfs_readdir,
- .ioctl = ecryptfs_ioctl,
+ .unlocked_ioctl = ecryptfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ecryptfs_compat_ioctl,
+#endif
.mmap = generic_file_mmap,
.open = ecryptfs_open,
.flush = ecryptfs_flush,
@@ -315,7 +343,10 @@ const struct file_operations ecryptfs_main_fops = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.readdir = ecryptfs_readdir,
- .ioctl = ecryptfs_ioctl,
+ .unlocked_ioctl = ecryptfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ecryptfs_compat_ioctl,
+#endif
.mmap = generic_file_mmap,
.open = ecryptfs_open,
.flush = ecryptfs_flush,
@@ -324,20 +355,3 @@ const struct file_operations ecryptfs_main_fops = {
.fasync = ecryptfs_fasync,
.splice_read = generic_file_splice_read,
};
-
-static int
-ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- int rc = 0;
- struct file *lower_file = NULL;
-
- if (ecryptfs_file_to_private(file))
- lower_file = ecryptfs_file_to_lower(file);
- if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
- rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
- lower_file, cmd, arg);
- else
- rc = -ENOTTY;
- return rc;
-}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 056fed62d0de..585987ce4e82 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -272,7 +272,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
printk(KERN_ERR "%s: Out of memory whilst attempting "
"to allocate ecryptfs_dentry_info struct\n",
__func__);
- goto out_dput;
+ goto out_put;
}
ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
@@ -345,8 +345,9 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
out_free_kmem:
kmem_cache_free(ecryptfs_header_cache_2, page_virt);
goto out;
-out_dput:
+out_put:
dput(lower_dentry);
+ mntput(lower_mnt);
d_drop(ecryptfs_dentry);
out:
return rc;
@@ -638,38 +639,17 @@ out_lock:
return rc;
}
-static int
-ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
+ size_t *bufsiz)
{
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
char *lower_buf;
- size_t lower_bufsiz;
- struct dentry *lower_dentry;
- struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
- char *plaintext_name;
- size_t plaintext_name_size;
+ size_t lower_bufsiz = PATH_MAX;
mm_segment_t old_fs;
int rc;
- lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!lower_dentry->d_inode->i_op->readlink) {
- rc = -EINVAL;
- goto out;
- }
- mount_crypt_stat = &ecryptfs_superblock_to_private(
- dentry->d_sb)->mount_crypt_stat;
- /*
- * If the lower filename is encrypted, it will result in a significantly
- * longer name. If needed, truncate the name after decode and decrypt.
- */
- if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
- lower_bufsiz = PATH_MAX;
- else
- lower_bufsiz = bufsiz;
- /* Released in this function */
lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
- if (lower_buf == NULL) {
- printk(KERN_ERR "%s: Out of memory whilst attempting to "
- "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
+ if (!lower_buf) {
rc = -ENOMEM;
goto out;
}
@@ -679,29 +659,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
(char __user *)lower_buf,
lower_bufsiz);
set_fs(old_fs);
- if (rc >= 0) {
- rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
- &plaintext_name_size,
- dentry, lower_buf,
- rc);
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to decode and "
- "decrypt filename; rc = [%d]\n", __func__,
- rc);
- goto out_free_lower_buf;
- }
- /* Check for bufsiz <= 0 done in sys_readlinkat() */
- rc = copy_to_user(buf, plaintext_name,
- min((size_t) bufsiz, plaintext_name_size));
- if (rc)
- rc = -EFAULT;
- else
- rc = plaintext_name_size;
- kfree(plaintext_name);
- fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
- }
-out_free_lower_buf:
+ if (rc < 0)
+ goto out;
+ lower_bufsiz = rc;
+ rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
+ lower_buf, lower_bufsiz);
+out:
kfree(lower_buf);
+ return rc;
+}
+
+static int
+ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+ char *kbuf;
+ size_t kbufsiz, copied;
+ int rc;
+
+ rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
+ if (rc)
+ goto out;
+ copied = min_t(size_t, bufsiz, kbufsiz);
+ rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
+ kfree(kbuf);
+ fsstack_copy_attr_atime(dentry->d_inode,
+ ecryptfs_dentry_to_lower(dentry)->d_inode);
out:
return rc;
}
@@ -971,6 +953,43 @@ out:
return rc;
}
+int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+ int rc = 0;
+
+ mount_crypt_stat = &ecryptfs_superblock_to_private(
+ dentry->d_sb)->mount_crypt_stat;
+ generic_fillattr(dentry->d_inode, stat);
+ if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+ char *target;
+ size_t targetsiz;
+
+ rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
+ if (!rc) {
+ kfree(target);
+ stat->size = targetsiz;
+ }
+ }
+ return rc;
+}
+
+int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct kstat lower_stat;
+ int rc;
+
+ rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
+ ecryptfs_dentry_to_lower(dentry), &lower_stat);
+ if (!rc) {
+ generic_fillattr(dentry->d_inode, stat);
+ stat->blocks = lower_stat.blocks;
+ }
+ return rc;
+}
+
int
ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
@@ -980,7 +999,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
lower_dentry = ecryptfs_dentry_to_lower(dentry);
if (!lower_dentry->d_inode->i_op->setxattr) {
- rc = -ENOSYS;
+ rc = -EOPNOTSUPP;
goto out;
}
mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -998,7 +1017,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
int rc = 0;
if (!lower_dentry->d_inode->i_op->getxattr) {
- rc = -ENOSYS;
+ rc = -EOPNOTSUPP;
goto out;
}
mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1025,7 +1044,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
lower_dentry = ecryptfs_dentry_to_lower(dentry);
if (!lower_dentry->d_inode->i_op->listxattr) {
- rc = -ENOSYS;
+ rc = -EOPNOTSUPP;
goto out;
}
mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1042,7 +1061,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
lower_dentry = ecryptfs_dentry_to_lower(dentry);
if (!lower_dentry->d_inode->i_op->removexattr) {
- rc = -ENOSYS;
+ rc = -EOPNOTSUPP;
goto out;
}
mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1073,6 +1092,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
.put_link = ecryptfs_put_link,
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
+ .getattr = ecryptfs_getattr_link,
.setxattr = ecryptfs_setxattr,
.getxattr = ecryptfs_getxattr,
.listxattr = ecryptfs_listxattr,
@@ -1100,6 +1120,7 @@ const struct inode_operations ecryptfs_dir_iops = {
const struct inode_operations ecryptfs_main_iops = {
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
+ .getattr = ecryptfs_getattr,
.setxattr = ecryptfs_setxattr,
.getxattr = ecryptfs_getxattr,
.listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..3dfe7ce86b1b 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -30,9 +30,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
static struct hlist_head *ecryptfs_daemon_hash;
struct mutex ecryptfs_daemon_hash_mux;
-static int ecryptfs_hash_buckets;
+static int ecryptfs_hash_bits;
#define ecryptfs_uid_hash(uid) \
- hash_long((unsigned long)uid, ecryptfs_hash_buckets)
+ hash_long((unsigned long)uid, ecryptfs_hash_bits)
static u32 ecryptfs_msg_counter;
static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -485,18 +485,19 @@ int ecryptfs_init_messaging(void)
}
mutex_init(&ecryptfs_daemon_hash_mux);
mutex_lock(&ecryptfs_daemon_hash_mux);
- ecryptfs_hash_buckets = 1;
- while (ecryptfs_number_of_users >> ecryptfs_hash_buckets)
- ecryptfs_hash_buckets++;
+ ecryptfs_hash_bits = 1;
+ while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
+ ecryptfs_hash_bits++;
ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
- * ecryptfs_hash_buckets), GFP_KERNEL);
+ * (1 << ecryptfs_hash_bits)),
+ GFP_KERNEL);
if (!ecryptfs_daemon_hash) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
mutex_unlock(&ecryptfs_daemon_hash_mux);
goto out;
}
- for (i = 0; i < ecryptfs_hash_buckets; i++)
+ for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
mutex_unlock(&ecryptfs_daemon_hash_mux);
ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
@@ -553,7 +554,7 @@ void ecryptfs_release_messaging(void)
int i;
mutex_lock(&ecryptfs_daemon_hash_mux);
- for (i = 0; i < ecryptfs_hash_buckets; i++) {
+ for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
int rc;
hlist_for_each_entry(daemon, elem,
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..1a037f77aa52 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -85,7 +85,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
if (lower_dentry->d_inode) {
fput(inode_info->lower_file);
inode_info->lower_file = NULL;
- d_drop(lower_dentry);
}
}
ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
diff --git a/fs/exec.c b/fs/exec.c
index ba112bd4a339..a0410eb44dfa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -376,6 +376,9 @@ static int count(char __user * __user * argv, int max)
argv++;
if (i++ >= max)
return -E2BIG;
+
+ if (fatal_signal_pending(current))
+ return -ERESTARTNOHAND;
cond_resched();
}
}
@@ -419,6 +422,12 @@ static int copy_strings(int argc, char __user * __user * argv,
while (len > 0) {
int offset, bytes_to_copy;
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTNOHAND;
+ goto out;
+ }
+ cond_resched();
+
offset = pos % PAGE_SIZE;
if (offset == 0)
offset = PAGE_SIZE;
@@ -572,6 +581,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
struct vm_area_struct *prev = NULL;
unsigned long vm_flags;
unsigned long stack_base;
+ unsigned long stack_size;
+ unsigned long stack_expand;
+ unsigned long rlim_stack;
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size to 1GB */
@@ -591,6 +603,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
#else
stack_top = arch_align_stack(stack_top);
stack_top = PAGE_ALIGN(stack_top);
+
+ if (unlikely(stack_top < mmap_min_addr) ||
+ unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
+ return -ENOMEM;
+
stack_shift = vma->vm_end - stack_top;
bprm->p -= stack_shift;
@@ -628,10 +645,23 @@ int setup_arg_pages(struct linux_binprm *bprm,
goto out_unlock;
}
+ stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+ stack_size = vma->vm_end - vma->vm_start;
+ /*
+ * Align this down to a page boundary as expand_stack
+ * will align it up.
+ */
+ rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
#ifdef CONFIG_STACK_GROWSUP
- stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+ if (stack_size + stack_expand > rlim_stack)
+ stack_base = vma->vm_start + rlim_stack;
+ else
+ stack_base = vma->vm_end + stack_expand;
#else
- stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+ if (stack_size + stack_expand > rlim_stack)
+ stack_base = vma->vm_end - rlim_stack;
+ else
+ stack_base = vma->vm_start - stack_expand;
#endif
ret = expand_stack(vma, stack_base);
if (ret)
@@ -931,9 +961,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
int flush_old_exec(struct linux_binprm * bprm)
{
- char * name;
- int i, ch, retval;
- char tcomm[sizeof(current->comm)];
+ int retval;
/*
* Make sure we have a private signal table and that
@@ -954,6 +982,25 @@ int flush_old_exec(struct linux_binprm * bprm)
bprm->mm = NULL; /* We're using it now */
+ current->flags &= ~PF_RANDOMIZE;
+ flush_thread();
+ current->personality &= ~bprm->per_clear;
+
+ return 0;
+
+out:
+ return retval;
+}
+EXPORT_SYMBOL(flush_old_exec);
+
+void setup_new_exec(struct linux_binprm * bprm)
+{
+ int i, ch;
+ char * name;
+ char tcomm[sizeof(current->comm)];
+
+ arch_pick_mmap_layout(current->mm);
+
/* This is the point of no return */
current->sas_ss_sp = current->sas_ss_size = 0;
@@ -975,9 +1022,6 @@ int flush_old_exec(struct linux_binprm * bprm)
tcomm[i] = '\0';
set_task_comm(current, tcomm);
- current->flags &= ~PF_RANDOMIZE;
- flush_thread();
-
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
* some architectures like powerpc
@@ -993,8 +1037,6 @@ int flush_old_exec(struct linux_binprm * bprm)
set_dumpable(current->mm, suid_dumpable);
}
- current->personality &= ~bprm->per_clear;
-
/*
* Flush performance counters when crossing a
* security domain:
@@ -1009,14 +1051,8 @@ int flush_old_exec(struct linux_binprm * bprm)
flush_signal_handlers(current, 0);
flush_old_files(current->files);
-
- return 0;
-
-out:
- return retval;
}
-
-EXPORT_SYMBOL(flush_old_exec);
+EXPORT_SYMBOL(setup_new_exec);
/*
* Prepare credentials and lock ->cred_guard_mutex.
@@ -1357,8 +1393,6 @@ int do_execve(char * filename,
if (retval < 0)
goto out;
- current->stack_start = current->mm->start_stack;
-
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
@@ -1891,8 +1925,9 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
/*
* Dont allow local users get cute and trick others to coredump
* into their pre-created files:
+ * Note, this is not relevant for pipes
*/
- if (inode->i_uid != current_fsuid())
+ if (!ispipe && (inode->i_uid != current_fsuid()))
goto close_fail;
if (!file->f_op)
goto close_fail;
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
de->inode_no = cpu_to_le64(parent->i_ino);
memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
exofs_set_de_type(de, inode);
- kunmap_atomic(page, KM_USER0);
+ kunmap_atomic(kaddr, KM_USER0);
err = exofs_commit_chunk(page, 0, chunk_size);
fail:
page_cache_release(page);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6c10f7476699..6f7df0f7d282 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -731,13 +731,28 @@ static int exofs_write_begin_export(struct file *file,
fsdata);
}
+static int exofs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ /* According to comment in simple_write_end i_mutex is held */
+ loff_t i_size = inode->i_size;
+ int ret;
+
+ ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata);
+ if (i_size != inode->i_size)
+ mark_inode_dirty(inode);
+ return ret;
+}
+
const struct address_space_operations exofs_aops = {
.readpage = exofs_readpage,
.readpages = exofs_readpages,
.writepage = exofs_writepage,
.writepages = exofs_writepages,
.write_begin = exofs_write_begin_export,
- .write_end = simple_write_end,
+ .write_end = exofs_write_end,
};
/******************************************************************************
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b47b30..f9d69378caaa 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle,
return ext3_journal_get_write_access(handle, bh);
}
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext3_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext3_truncate(inode);
+}
+
static int ext3_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1209,7 +1219,7 @@ write_begin_failed:
unlock_page(page);
page_cache_release(page);
if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
}
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file,
page_cache_release(page);
if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}
@@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file,
page_cache_release(page);
if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}
@@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file,
page_cache_release(page);
if (pos + len > inode->i_size)
- ext3_truncate(inode);
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 427496c4767c..ca3068fd2346 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2686,13 +2686,11 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
- es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
buf->f_namelen = EXT3_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 545e37c4b91e..387d92d00b97 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -960,6 +960,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
+ error = ext3_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto cleanup;
+
if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
@@ -985,9 +989,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
- error = ext3_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
if (!value) {
if (!is.s.not_found)
error = ext3_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d0418980f8d..e85b63c9fbcb 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -189,9 +189,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* when a file system is mounted (see ext4_fill_super).
*/
-
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
/**
* ext4_get_group_desc() -- load group descriptor from disk
* @sb: super block
@@ -761,7 +758,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
ext4_group_t group)
{
- return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+ if (!ext4_bg_has_super(sb, group))
+ return 0;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+ else
+ return EXT4_SB(sb)->s_gdb_count;
}
/**
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef07563..dc79b75d8f70 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
add_system_zone(sbi, ext4_group_first_block_no(sb, i),
- sbi->s_gdb_count + 1);
+ ext4_bg_num_gdb(sb, i) + 1);
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
if (ret)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..aa6fb6b9d3ad 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -84,9 +84,11 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
if (error_msg != NULL)
ext4_error(dir->i_sb, function,
- "bad entry in directory #%lu: %s - "
- "offset=%u, inode=%u, rec_len=%d, name_len=%d",
- dir->i_ino, error_msg, offset,
+ "bad entry in directory #%lu: %s - block=%llu"
+ "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+ dir->i_ino, error_msg,
+ (unsigned long long) bh->b_blocknr,
+ (unsigned) (offset%bh->b_size), offset,
le32_to_cpu(de->inode),
rlen, de->name_len);
return error_msg == NULL ? 1 : 0;
@@ -109,7 +111,7 @@ static int ext4_readdir(struct file *filp,
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1))) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
@@ -120,7 +122,7 @@ static int ext4_readdir(struct file *filp,
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
}
stored = 0;
offset = filp->f_pos & (sb->s_blocksize - 1);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8825515eeddd..0773352fd512 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
/*
* The fourth extended filesystem constants/structures
@@ -139,8 +142,8 @@ typedef struct ext4_io_end {
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
int error; /* I/O error code */
- ext4_lblk_t offset; /* offset in the file */
- size_t size; /* size of the extent */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
struct work_struct work; /* data work queue */
} ext4_io_end_t;
@@ -284,10 +287,12 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -314,15 +319,81 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
}
/*
- * Inode dynamic state flags
+ * Inode flags used for atomic set/get
*/
-#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
+enum {
+ EXT4_INODE_SECRM = 0, /* Secure deletion */
+ EXT4_INODE_UNRM = 1, /* Undelete */
+ EXT4_INODE_COMPR = 2, /* Compress file */
+ EXT4_INODE_SYNC = 3, /* Synchronous updates */
+ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
+ EXT4_INODE_APPEND = 5, /* writes to file may only append */
+ EXT4_INODE_NODUMP = 6, /* do not dump file */
+ EXT4_INODE_NOATIME = 7, /* do not update atime */
+/* Reserved for compression usage... */
+ EXT4_INODE_DIRTY = 8,
+ EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
+ EXT4_INODE_NOCOMPR = 10, /* Don't compress */
+ EXT4_INODE_ECOMPR = 11, /* Compression error */
+/* End compression flags --- maybe not all used */
+ EXT4_INODE_INDEX = 12, /* hash-indexed directory */
+ EXT4_INODE_IMAGIC = 13, /* AFS directory */
+ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
+ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
+ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
+ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
+ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
+ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
+ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
+};
+
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
+ printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+ EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, and we
+ * can't do a compile-time test for ENUM values, we use a run-time
+ * test to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
+ * out so it won't cost any extra space in the compiled kernel image.
+ * But it's important that these values are the same, since we are
+ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
+ * must be consistent with the values of FS_XXX_FL defined in
+ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
+ * ext4 filesystems, and of course the values defined in e2fsprogs.
+ *
+ * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
+ */
+static inline void ext4_check_flag_values(void)
+{
+ CHECK_FLAG_VALUE(SECRM);
+ CHECK_FLAG_VALUE(UNRM);
+ CHECK_FLAG_VALUE(COMPR);
+ CHECK_FLAG_VALUE(SYNC);
+ CHECK_FLAG_VALUE(IMMUTABLE);
+ CHECK_FLAG_VALUE(APPEND);
+ CHECK_FLAG_VALUE(NODUMP);
+ CHECK_FLAG_VALUE(NOATIME);
+ CHECK_FLAG_VALUE(DIRTY);
+ CHECK_FLAG_VALUE(COMPRBLK);
+ CHECK_FLAG_VALUE(NOCOMPR);
+ CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(INDEX);
+ CHECK_FLAG_VALUE(IMAGIC);
+ CHECK_FLAG_VALUE(JOURNAL_DATA);
+ CHECK_FLAG_VALUE(NOTAIL);
+ CHECK_FLAG_VALUE(DIRSYNC);
+ CHECK_FLAG_VALUE(TOPDIR);
+ CHECK_FLAG_VALUE(HUGE_FILE);
+ CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(EA_INODE);
+ CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(RESERVED);
+}
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -335,6 +406,18 @@ struct ext4_new_group_input {
__u16 unused;
};
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+ u32 group;
+ compat_u64 block_bitmap;
+ compat_u64 inode_bitmap;
+ compat_u64 inode_table;
+ u32 blocks_count;
+ u16 reserved_blocks;
+ u16 unused;
+};
+#endif
+
/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
struct ext4_new_group_data {
__u32 group;
@@ -361,14 +444,11 @@ struct ext4_new_group_data {
so set the magic i_delalloc_reserve_flag after taking the
inode allocation semaphore for */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
- /* Call ext4_da_update_reserve_space() after successfully
- allocating the blocks */
-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
/* caller is from the direct IO path, request to creation of an
unitialized extents if not allocated, split the uninitialized
extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO 0x0010
-#define EXT4_GET_BLOCKS_CONVERT 0x0020
+#define EXT4_GET_BLOCKS_DIO 0x0008
+#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
/* Convert extent to initialized after direct IO complete */
@@ -397,6 +477,7 @@ struct ext4_new_group_data {
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
@@ -407,11 +488,13 @@ struct ext4_new_group_data {
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
#ifdef CONFIG_JBD2_DEBUG
#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
#endif
#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+#endif
/*
@@ -615,9 +698,8 @@ struct ext4_ext_cache {
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
- __u32 i_flags;
- ext4_fsblk_t i_file_acl;
__u32 i_dtime;
+ ext4_fsblk_t i_file_acl;
/*
* i_block_group is the number of the block group which contains
@@ -627,7 +709,8 @@ struct ext4_inode_info {
* near to their parent directory's inode.
*/
ext4_group_t i_block_group;
- __u32 i_state; /* Dynamic state flags for ext4 */
+ unsigned long i_state_flags; /* Dynamic state flags */
+ unsigned long i_flags;
ext4_lblk_t i_dir_start_lookup;
#ifdef CONFIG_EXT4_FS_XATTR
@@ -693,16 +776,29 @@ struct ext4_inode_info {
unsigned int i_reserved_meta_blocks;
unsigned int i_allocated_meta_blocks;
unsigned short i_delalloc_reserved_flag;
+ sector_t i_da_metadata_calc_last_lblock;
+ int i_da_metadata_calc_len;
/* on-disk additional length */
__u16 i_extra_isize;
spinlock_t i_block_reservation_lock;
+#ifdef CONFIG_QUOTA
+ /* quota space reservation, managed internally by quota code */
+ qsize_t i_reserved_quota;
+#endif
/* completed async DIOs that might need unwritten extents handling */
struct list_head i_aio_dio_complete_list;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
};
/*
@@ -750,6 +846,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -1033,6 +1130,37 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
(ino >= EXT4_FIRST_INO(sb) &&
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+ EXT4_STATE_JDATA, /* journaled data exists */
+ EXT4_STATE_NEW, /* inode is newly created */
+ EXT4_STATE_XATTR, /* has in-inode xattrs */
+ EXT4_STATE_NO_EXPAND, /* No space for expansion */
+ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
+ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
+ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+ EXT4_STATE_NEWENTRY, /* File just added to dir */
+};
+
+#define EXT4_INODE_BIT_FNS(name, field) \
+static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
+{ \
+ return test_bit(bit, &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
+{ \
+ set_bit(bit, &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{ \
+ clear_bit(bit, &EXT4_I(inode)->i_##field); \
+}
+
+EXT4_INODE_BIT_FNS(flag, flags)
+EXT4_INODE_BIT_FNS(state, state_flags)
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
@@ -1217,7 +1345,7 @@ struct ext4_dir_entry_2 {
#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
EXT4_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
+ ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
@@ -1424,8 +1552,10 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
-extern qsize_t ext4_get_reserved_space(struct inode *inode);
+extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1625,6 +1755,7 @@ struct ext4_group_info {
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
@@ -1728,7 +1859,7 @@ extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- loff_t len);
+ ssize_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
@@ -1757,6 +1888,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 2ca686454e87..bdb6ce7e2eb4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
}
-extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+ sector_t lblocks);
extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6a9409920dee..496249aeec96 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -89,7 +89,7 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
ext4_journal_abort_handle(where, __func__, bh,
handle, err);
} else {
- if (inode && bh)
+ if (inode)
mark_buffer_dirty_inode(bh, inode);
else
mark_buffer_dirty(bh);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a2865980342f..386095d5b760 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
EXT4_XATTR_TRANS_BLOCKS - 2 + \
- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/*
* Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
* This include super block, inode block, quota blocks and xattr blocks
*/
#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/* Delete operations potentially hit one directory's namespace plus an
* entire inode, plus arbitrary amounts of bitmap/indirection data. Be
@@ -92,6 +92,7 @@
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
+
#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
#else
@@ -99,6 +100,9 @@
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
int
ext4_mark_iloc_dirty(handle_t *handle,
@@ -254,6 +258,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
return 0;
}
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (ext4_handle_valid(handle)) {
+ ei->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ ei->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
@@ -265,7 +282,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
return 1;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
return 1;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 1;
return 0;
}
@@ -276,7 +293,7 @@ static inline int ext4_should_order_data(struct inode *inode)
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return 1;
@@ -289,7 +306,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return 0;
if (EXT4_JOURNAL(inode) == NULL)
return 1;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return 1;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 715264b4bae4..f37555909872 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
if (err <= 0)
return err;
err = ext4_truncate_restart_trans(handle, inode, needed);
- /*
- * We have dropped i_data_sem so someone might have cached again
- * an extent we are going to truncate.
- */
- ext4_ext_invalidate_cache(inode);
+ if (err == 0)
+ err = -EAGAIN;
return err;
}
@@ -296,29 +293,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
* to allocate @blocks
* Worse case is one block per extent
*/
-int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
- int lcap, icap, rcap, leafs, idxs, num;
- int newextents = blocks;
-
- rcap = ext4_ext_space_root_idx(inode, 0);
- lcap = ext4_ext_space_block(inode, 0);
- icap = ext4_ext_space_block_idx(inode, 0);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int idxs, num = 0;
- /* number of new leaf blocks needed */
- num = leafs = (newextents + lcap - 1) / lcap;
+ idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent_idx));
/*
- * Worse case, we need separate index block(s)
- * to link all new leaf blocks
+ * If the new delayed allocation block is contiguous with the
+ * previous da block, it can share index blocks with the
+ * previous block, so we only need to allocate a new index
+ * block every idxs leaf blocks. At ldxs**2 blocks, we need
+ * an additional index block, and at ldxs**3 blocks, yet
+ * another index blocks.
*/
- idxs = (leafs + icap - 1) / icap;
- do {
- num += idxs;
- idxs = (idxs + icap - 1) / icap;
- } while (idxs > rcap);
+ if (ei->i_da_metadata_calc_len &&
+ ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+ if ((ei->i_da_metadata_calc_len % idxs) == 0)
+ num++;
+ if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
+ num++;
+ if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
+ num++;
+ ei->i_da_metadata_calc_len = 0;
+ } else
+ ei->i_da_metadata_calc_len++;
+ ei->i_da_metadata_calc_last_lblock++;
+ return num;
+ }
- return num;
+ /*
+ * In the worst case we need a new set of index blocks at
+ * every level of the inode's extent tree.
+ */
+ ei->i_da_metadata_calc_len = 1;
+ ei->i_da_metadata_calc_last_lblock = lblock;
+ return ext_depth(inode) + 1;
}
static int
@@ -1761,7 +1773,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
while (block < last && block != EXT_MAX_BLOCK) {
num = last - block;
/* find extent for this block */
+ down_read(&EXT4_I(inode)->i_data_sem);
path = ext4_ext_find_extent(inode, block, path);
+ up_read(&EXT4_I(inode)->i_data_sem);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -1934,7 +1948,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
cex->ec_type != EXT4_EXT_CACHE_EXTENT);
- if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+ if (in_range(block, cex->ec_block, cex->ec_len)) {
ex->ee_block = cpu_to_le32(cex->ec_block);
ext4_ext_store_pblock(ex, cex->ec_start);
ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -2074,7 +2088,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
ext_debug("free last %u blocks starting %llu\n", num, start);
for (i = 0; i < num; i++) {
bh = sb_find_get_block(inode->i_sb, start + i);
- ext4_forget(handle, 0, inode, bh, start + i);
+ ext4_forget(handle, metadata, inode, bh, start + i);
}
ext4_free_blocks(handle, inode, start, num, metadata);
} else if (from == le32_to_cpu(ex->ee_block)
@@ -2167,7 +2181,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
correct_index = 1;
credits += (ext_depth(inode)) + 1;
}
- credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
err = ext4_ext_truncate_extend_restart(handle, inode, credits);
if (err)
@@ -2246,7 +2260,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
int depth = ext_depth(inode);
struct ext4_ext_path *path;
handle_t *handle;
- int i = 0, err = 0;
+ int i, err;
ext_debug("truncate since %u\n", start);
@@ -2255,23 +2269,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (IS_ERR(handle))
return PTR_ERR(handle);
+again:
ext4_ext_invalidate_cache(inode);
/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
*/
+ depth = ext_depth(inode);
path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
if (path == NULL) {
ext4_journal_stop(handle);
return -ENOMEM;
}
+ path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
err = -EIO;
goto out;
}
- path[0].p_depth = depth;
+ i = err = 0;
while (i >= 0 && err == 0) {
if (i == depth) {
@@ -2365,6 +2382,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
out:
ext4_ext_drop_refs(path);
kfree(path);
+ if (err == -EAGAIN)
+ goto again;
ext4_journal_stop(handle);
return err;
@@ -2429,7 +2448,7 @@ static void bi_complete(struct bio *bio, int error)
/* FIXME!! we need to try to merge to left or right after zero-out */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
- int ret = -EIO;
+ int ret;
struct bio *bio;
int blkbits, blocksize;
sector_t ee_pblock;
@@ -2453,6 +2472,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
len = ee_len;
bio = bio_alloc(GFP_NOIO, len);
+ if (!bio)
+ return -ENOMEM;
+
bio->bi_sector = ee_pblock;
bio->bi_bdev = inode->i_sb->s_bdev;
@@ -2480,17 +2502,15 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
submit_bio(WRITE, bio);
wait_for_completion(&event);
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
- ret = 0;
- else {
- ret = -EIO;
- break;
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ bio_put(bio);
+ return -EIO;
}
bio_put(bio);
ee_len -= done;
ee_pblock += done << (blkbits - 9);
}
- return ret;
+ return 0;
}
#define EXT4_EXT_ZERO_LEN 7
@@ -2515,11 +2535,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
int ret = 0;
+ int may_zeroout;
+
+ ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < iblock + max_blocks)
+ eof_block = iblock + max_blocks;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
@@ -2528,16 +2558,23 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (iblock - ee_block);
newblock = iblock - ee_block + ext_pblock(ex);
+
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully insde i_size or new_size.
+ */
+ may_zeroout = ee_block + ee_len <= eof_block;
+
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
+ if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2568,7 +2605,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
if (allocated > max_blocks) {
unsigned int newdepth;
/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
- if (allocated <= EXT4_EXT_ZERO_LEN) {
+ if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
/*
* iblock == ee_block is handled by the zerouout
* at the beginning.
@@ -2644,7 +2681,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2668,8 +2705,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* update the extent length after successful insert of the
* split extent
*/
- orig_ex.ee_len = cpu_to_le16(ee_len -
- ext4_ext_get_actual_len(ex3));
+ ee_len -= ext4_ext_get_actual_len(ex3);
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ may_zeroout = ee_block + ee_len <= eof_block;
+
depth = newdepth;
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, iblock, path);
@@ -2693,7 +2732,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* otherwise give the extent a chance to merge to left
*/
if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
- iblock != ee_block) {
+ iblock != ee_block && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2762,7 +2801,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2822,14 +2861,21 @@ static int ext4_split_unwritten_extents(handle_t *handle,
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
+ int may_zeroout;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < iblock + max_blocks)
+ eof_block = iblock + max_blocks;
- ext_debug("ext4_split_unwritten_extents: inode %lu,"
- "iblock %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)iblock, max_blocks);
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
@@ -2837,12 +2883,19 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (iblock - ee_block);
newblock = iblock - ee_block + ext_pblock(ex);
+
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
/*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully insde i_size or new_size.
+ */
+ may_zeroout = ee_block + ee_len <= eof_block;
+
+ /*
* If the uninitialized extent begins at the same logical
* block where the write begins, and the write completely
* covers the extent, then we don't need to split it.
@@ -2876,7 +2929,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2900,8 +2953,10 @@ static int ext4_split_unwritten_extents(handle_t *handle,
* update the extent length after successful insert of the
* split extent
*/
- orig_ex.ee_len = cpu_to_le16(ee_len -
- ext4_ext_get_actual_len(ex3));
+ ee_len -= ext4_ext_get_actual_len(ex3);
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ may_zeroout = ee_block + ee_len <= eof_block;
+
depth = newdepth;
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, iblock, path);
@@ -2947,7 +3002,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -3027,6 +3082,14 @@ out:
return err;
}
+static void unmap_underlying_metadata_blocks(struct block_device *bdev,
+ sector_t block, int count)
+{
+ int i;
+ for (i = 0; i < count; i++)
+ unmap_underlying_metadata(bdev, block + i);
+}
+
static int
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, unsigned int max_blocks,
@@ -3057,13 +3120,15 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
if (io)
io->flag = DIO_AIO_UNWRITTEN;
else
- EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
goto out;
}
/* async DIO end_io complete, convert the filled extent to written */
if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
ret = ext4_convert_unwritten_extents_dio(handle, inode,
path);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
goto out2;
}
/* buffered IO case */
@@ -3091,6 +3156,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ret = ext4_ext_convert_to_initialized(handle, inode,
path, iblock,
max_blocks);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
out:
if (ret <= 0) {
err = ret;
@@ -3098,6 +3165,30 @@ out:
} else
allocated = ret;
set_buffer_new(bh_result);
+ /*
+ * if we allocated more blocks than requested
+ * we need to make sure we unmap the extra block
+ * allocated. The actual needed block will get
+ * unmapped later when we find the buffer_head marked
+ * new.
+ */
+ if (allocated > max_blocks) {
+ unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+ newblock + max_blocks,
+ allocated - max_blocks);
+ allocated = max_blocks;
+ }
+
+ /*
+ * If we have done fallocate with the offset that is already
+ * delayed allocated, we would have block reservation
+ * and quota reservation done in the delayed write path.
+ * But fallocate would have already updated quota and block
+ * count for this offset. So cancel these reservation
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, allocated, 0);
+
map_out:
set_buffer_mapped(bh_result);
out1:
@@ -3138,9 +3229,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
{
struct ext4_ext_path *path = NULL;
struct ext4_extent_header *eh;
- struct ext4_extent newex, *ex;
+ struct ext4_extent newex, *ex, *last_ex;
ext4_fsblk_t newblock;
- int err = 0, depth, ret, cache_type;
+ int i, err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3190,7 +3281,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* this situation is possible, though, _during_ tree modification;
* this is why assert can't be put in ext4_ext_find_extent()
*/
- BUG_ON(path[depth].p_ext == NULL && depth != 0);
+ if (path[depth].p_ext == NULL && depth != 0) {
+ ext4_error(inode->i_sb, __func__, "bad extent address "
+ "inode: %lu, iblock: %lu, depth: %d",
+ inode->i_ino, (unsigned long) iblock, depth);
+ err = -EIO;
+ goto out2;
+ }
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
@@ -3205,7 +3302,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
- if (iblock >= ee_block && iblock < ee_block + ee_len) {
+ if (in_range(iblock, ee_block, ee_len)) {
newblock = iblock - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (iblock - ee_block);
@@ -3309,10 +3406,36 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
if (io)
io->flag = DIO_AIO_UNWRITTEN;
else
- EXT4_I(inode)->i_state |=
- EXT4_STATE_DIO_UNWRITTEN;;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
}
}
+
+ if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
+ if (unlikely(!eh->eh_entries)) {
+ ext4_error(inode->i_sb, __func__,
+ "inode#%lu, eh->eh_entries = 0 and "
+ "EOFBLOCKS_FL set", inode->i_ino);
+ err = -EIO;
+ goto out2;
+ }
+ last_ex = EXT_LAST_EXTENT(eh);
+ /*
+ * If the current leaf block was reached by looking at
+ * the last index block all the way down the tree, and
+ * we are extending the inode beyond the last extent
+ * in the current leaf block, then clear the
+ * EOFBLOCKS_FL flag.
+ */
+ for (i = depth-1; i >= 0; i--) {
+ if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+ break;
+ }
+ if ((i < 0) &&
+ (iblock + ar.len > le32_to_cpu(last_ex->ee_block) +
+ ext4_ext_get_actual_len(last_ex)))
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ }
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */
@@ -3327,12 +3450,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
+ if (allocated > max_blocks)
+ allocated = max_blocks;
set_buffer_new(bh_result);
- /* Cache only when it is _not_ an uninitialized extent */
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now.
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, allocated, 1);
+
+ /*
+ * Cache the extent and update transaction to commit on fdatasync only
+ * when it is _not_ an uninitialized extent.
+ */
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
EXT4_EXT_CACHE_EXTENT);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ } else
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out:
if (allocated > max_blocks)
allocated = max_blocks;
@@ -3431,6 +3569,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
i_size_write(inode, new_size);
if (new_size > EXT4_I(inode)->i_disksize)
ext4_update_i_disksize(inode, new_size);
+ } else {
+ /*
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if (new_size > i_size_read(inode))
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
}
@@ -3458,7 +3603,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
* currently supporting (pre)allocate mode for extent-based
* files _only_
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
/* preallocation to directories is currently not supported */
@@ -3477,6 +3622,11 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
*/
credits = ext4_chunk_trans_blocks(inode, max_blocks);
mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, (len + offset));
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
retry:
while (ret >= 0 && ret < max_blocks) {
block = block + ret;
@@ -3535,7 +3685,7 @@ retry:
* Returns 0 on success.
*/
int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- loff_t len)
+ ssize_t len)
{
handle_t *handle;
ext4_lblk_t block;
@@ -3671,7 +3821,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
int error = 0;
/* in-inode? */
- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
struct ext4_iloc iloc;
int offset; /* offset of xattr in inode */
@@ -3684,6 +3834,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
physical += offset;
length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
flags |= FIEMAP_EXTENT_DATA_INLINE;
+ brelse(iloc.bh);
} else { /* external block */
physical = EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
@@ -3699,11 +3850,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
ext4_lblk_t start_blk;
- ext4_lblk_t len_blks;
int error = 0;
/* fallback to generic here if not in extents fmt */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
ext4_get_block);
@@ -3713,17 +3863,21 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
error = ext4_xattr_fiemap(inode, fieinfo);
} else {
+ ext4_lblk_t len_blks;
+ __u64 last_blk;
+
start_blk = start >> inode->i_sb->s_blocksize_bits;
- len_blks = len >> inode->i_sb->s_blocksize_bits;
+ last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+ if (last_blk >= EXT_MAX_BLOCK)
+ last_blk = EXT_MAX_BLOCK-1;
+ len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
/*
* Walk the extent tree gathering extent information.
* ext4_ext_fiemap_cb will push extents back to user.
*/
- down_read(&EXT4_I(inode)->i_data_sem);
error = ext4_ext_walk_space(inode, start_blk, len_blks,
ext4_ext_fiemap_cb, fieinfo);
- up_read(&EXT4_I(inode)->i_data_sem);
}
return error;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..2a6054129bbc 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -35,9 +35,9 @@
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
- if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
ext4_alloc_da_blocks(inode);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+ ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
}
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
@@ -65,7 +65,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
* is smaller than s_maxbytes, which is for extent-mapped files.
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
size_t length = iov_length(iov, nr_segs);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2b1531266ee2..c3660a60c300 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
#include <trace/events/ext4.h>
/*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file. This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static void ext4_sync_parent(struct inode *inode)
+{
+ struct dentry *dentry = NULL;
+
+ while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+ dentry = list_entry(inode->i_dentry.next,
+ struct dentry, d_alias);
+ if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+ break;
+ inode = dentry->d_parent->d_inode;
+ sync_mapping_buffers(inode->i_mapping);
+ }
+}
+
+/*
* akpm: A new design for ext4_sync_file().
*
* This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -51,25 +74,34 @@
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int err, ret = 0;
+ int ret;
+ tid_t commit_tid;
J_ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file(file, dentry, datasync);
+ if (inode->i_sb->s_flags & MS_RDONLY)
+ return 0;
+
ret = flush_aio_dio_completed_IO(inode);
if (ret < 0)
- goto out;
+ return ret;
+
+ if (!journal) {
+ ret = simple_fsync(file, dentry, datasync);
+ if (!ret && !list_empty(&inode->i_dentry))
+ ext4_sync_parent(inode);
+ return ret;
+ }
+
/*
- * data=writeback:
+ * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
- * sync_inode() will sync the metadata
- *
- * data=ordered:
- * The caller's filemap_fdatawrite() will write the data and
- * sync_inode() will write the inode if it is dirty. Then the caller's
- * filemap_fdatawait() will wait on the pages.
+ * Metadata is in the journal, we wait for proper transaction to
+ * commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -79,32 +111,25 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode)) {
- ret = ext4_force_commit(inode->i_sb);
- goto out;
- }
-
- if (!journal)
- ret = sync_mapping_buffers(inode->i_mapping);
+ if (ext4_should_journal_data(inode))
+ return ext4_force_commit(inode->i_sb);
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
-
- /*
- * The VFS has written the file data. If the inode is unaltered
- * then we need not start a commit.
- */
- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- err = sync_inode(inode, &wbc);
- if (ret == 0)
- ret = err;
- }
-out:
- if (journal && (journal->j_flags & JBD2_BARRIER))
+ commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+ if (jbd2_log_start_commit(journal, commit_tid)) {
+ /*
+ * When the journal is on a different device than the
+ * fs data disk, we need to issue the barrier in
+ * writeback mode. (In ordered mode, the jbd2 layer
+ * will take care of issuing the barrier. In
+ * data=journal, all of the data blocks are written to
+ * the journal device.)
+ */
+ if (ext4_should_writeback_data(inode) &&
+ (journal->j_fs_dev != journal->j_dev) &&
+ (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ ret = jbd2_log_wait_commit(journal, commit_tid);
+ } else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..55a93f5bb003 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -244,57 +244,50 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (fatal)
goto error_return;
- /* Ok, now we can actually update the inode bitmaps.. */
- cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
- bit, bitmap_bh->b_data);
- if (!cleared)
- ext4_error(sb, "ext4_free_inode",
- "bit already cleared for inode %lu", ino);
- else {
- gdp = ext4_get_group_desc(sb, block_group, &bh2);
-
+ fatal = -ESRCH;
+ gdp = ext4_get_group_desc(sb, block_group, &bh2);
+ if (gdp) {
BUFFER_TRACE(bh2, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bh2);
- if (fatal) goto error_return;
-
- if (gdp) {
- ext4_lock_group(sb, block_group);
- count = ext4_free_inodes_count(sb, gdp) + 1;
- ext4_free_inodes_set(sb, gdp, count);
- if (is_directory) {
- count = ext4_used_dirs_count(sb, gdp) - 1;
- ext4_used_dirs_set(sb, gdp, count);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f;
-
- f = ext4_flex_group(sbi, block_group);
- atomic_dec(&sbi->s_flex_groups[f].free_inodes);
- }
+ }
+ ext4_lock_group(sb, block_group);
+ cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+ if (fatal || !cleared) {
+ ext4_unlock_group(sb, block_group);
+ goto out;
+ }
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi,
- block_group, gdp);
- ext4_unlock_group(sb, block_group);
- percpu_counter_inc(&sbi->s_freeinodes_counter);
- if (is_directory)
- percpu_counter_dec(&sbi->s_dirs_counter);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f;
-
- f = ext4_flex_group(sbi, block_group);
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
- }
- }
- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, bh2);
- if (!fatal) fatal = err;
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ percpu_counter_dec(&sbi->s_dirs_counter);
}
- BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
- if (!fatal)
- fatal = err;
- sb->s_dirt = 1;
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_unlock_group(sb, block_group);
+
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ if (is_directory)
+ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ }
+ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+ fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+ if (cleared) {
+ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (!fatal)
+ fatal = err;
+ sb->s_dirt = 1;
+ } else
+ ext4_error(sb, "ext4_free_inode",
+ "bit already cleared for inode %lu", ino);
+
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, fatal);
@@ -504,7 +497,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
if (S_ISDIR(mode) &&
((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
+ (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
int best_ndir = inodes_per_group;
int ret = -1;
@@ -779,7 +772,7 @@ static int ext4_claim_inode(struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t f = ext4_flex_group(sbi, group);
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ atomic_inc(&sbi->s_flex_groups[f].used_dirs);
}
}
gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
@@ -904,7 +897,7 @@ repeat_in_this_group:
BUFFER_TRACE(inode_bitmap_bh,
"call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle,
- inode,
+ NULL,
inode_bitmap_bh);
if (err)
goto fail;
@@ -1029,7 +1022,8 @@ got:
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
- ei->i_state = EXT4_STATE_NEW;
+ ei->i_state_flags = 0;
+ ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
@@ -1050,7 +1044,7 @@ got:
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
- EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode);
}
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa51addb..1b23f9d2ef56 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -957,7 +957,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
int count = 0;
ext4_fsblk_t first_block = 0;
- J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+ J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
depth = ext4_block_to_path(inode, iblock, offsets,
&blocks_to_boundary);
@@ -1021,10 +1021,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
if (!err)
err = ext4_splice_branch(handle, inode, iblock,
partial, indirect_blks, count);
- else
+ if (err)
goto cleanup;
set_buffer_new(bh_result);
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
@@ -1043,92 +1045,121 @@ out:
return err;
}
-qsize_t ext4_get_reserved_space(struct inode *inode)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
{
- unsigned long long total;
-
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- total = EXT4_I(inode)->i_reserved_data_blocks +
- EXT4_I(inode)->i_reserved_meta_blocks;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
- return total;
+ return &EXT4_I(inode)->i_reserved_quota;
}
+#endif
+
/*
* Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblocks for non extent file based file
*/
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+ sector_t lblock)
{
- int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- int ind_blks, dind_blks, tind_blks;
-
- /* number of new indirect blocks needed */
- ind_blks = (blocks + icap - 1) / icap;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+ int blk_bits;
- dind_blks = (ind_blks + icap - 1) / icap;
+ if (lblock < EXT4_NDIR_BLOCKS)
+ return 0;
- tind_blks = 1;
+ lblock -= EXT4_NDIR_BLOCKS;
- return ind_blks + dind_blks + tind_blks;
+ if (ei->i_da_metadata_calc_len &&
+ (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+ ei->i_da_metadata_calc_len++;
+ return 0;
+ }
+ ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+ ei->i_da_metadata_calc_len = 1;
+ blk_bits = order_base_2(lblock);
+ return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
/*
* Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
*/
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
- if (!blocks)
- return 0;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return ext4_ext_calc_metadata_amount(inode, lblock);
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
- return ext4_ext_calc_metadata_amount(inode, blocks);
-
- return ext4_indirect_calc_metadata_amount(inode, blocks);
+ return ext4_indirect_calc_metadata_amount(inode, lblock);
}
-static void ext4_da_update_reserve_space(struct inode *inode, int used)
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
+void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free;
-
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- /* recalculate the number of metablocks still need to be reserved */
- total = EXT4_I(inode)->i_reserved_data_blocks - used;
- mdb = ext4_calc_metadata_amount(inode, total);
-
- /* figure out how many metablocks to release */
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
- if (mdb_free) {
- /* Account for allocated meta_blocks */
- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
- /* update fs dirty blocks counter */
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int mdb_free = 0, allocated_meta_blocks = 0;
+
+ spin_lock(&ei->i_block_reservation_lock);
+ if (unlikely(used > ei->i_reserved_data_blocks)) {
+ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+ "with only %d reserved data blocks\n",
+ __func__, inode->i_ino, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ used = ei->i_reserved_data_blocks;
+ }
+
+ /* Update per-inode reservations */
+ ei->i_reserved_data_blocks -= used;
+ used += ei->i_allocated_meta_blocks;
+ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+ allocated_meta_blocks = ei->i_allocated_meta_blocks;
+ ei->i_allocated_meta_blocks = 0;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ mdb_free = ei->i_reserved_meta_blocks;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
- EXT4_I(inode)->i_allocated_meta_blocks = 0;
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
}
-
- /* update per-inode reservations */
- BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= used;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- /*
- * free those over-booking quota for metadata blocks
- */
- if (mdb_free)
- vfs_dq_release_reservation_block(inode, mdb_free);
+ /* Update quota subsystem */
+ if (quota_claim) {
+ vfs_dq_claim_block(inode, used);
+ if (mdb_free)
+ vfs_dq_release_reservation_block(inode, mdb_free);
+ } else {
+ /*
+ * We did fallocate with an offset that is already delayed
+ * allocated. So on delayed allocated writeback we should
+ * not update the quota for allocated blocks. But then
+ * converting an fallocate region to initialized region would
+ * have caused a metadata allocation. So claim quota for
+ * that
+ */
+ if (allocated_meta_blocks)
+ vfs_dq_claim_block(inode, allocated_meta_blocks);
+ vfs_dq_release_reservation_block(inode, mdb_free + used -
+ allocated_meta_blocks);
+ }
/*
* If we have done all the pending block allocations and if
* there aren't any writers on the inode, we can discard the
* inode's preallocations.
*/
- if (!total && (atomic_read(&inode->i_writecount) == 0))
+ if ((ei->i_reserved_data_blocks == 0) &&
+ (atomic_read(&inode->i_writecount) == 0))
ext4_discard_preallocations(inode);
}
@@ -1243,7 +1274,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* file system block.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
bh, 0);
} else {
@@ -1305,7 +1336,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
*/
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
bh, flags);
} else {
@@ -1318,20 +1349,22 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}
- }
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now. We don't
+ * support fallocate for non extent files. So we can update
+ * reserve space here.
+ */
+ if ((retval > 0) &&
+ (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+ ext4_da_update_reserve_space(inode, retval, 1);
+ }
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
EXT4_I(inode)->i_delalloc_reserved_flag = 0;
- /*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now.
- */
- if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
- ext4_da_update_reserve_space(inode, retval);
-
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
int ret = check_block_validity(inode, "file system "
@@ -1534,6 +1567,16 @@ static int do_journal_get_write_access(handle_t *handle,
return ext4_journal_get_write_access(handle, bh);
}
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1599,7 +1642,7 @@ retry:
ext4_journal_stop(handle);
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
* still be on the orphan list; we need to
@@ -1709,7 +1752,7 @@ static int ext4_ordered_write_end(struct file *file,
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1751,7 +1794,7 @@ static int ext4_writeback_write_end(struct file *file,
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1793,7 +1836,7 @@ static int ext4_journalled_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
if (new_i_size > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1814,7 +1857,7 @@ static int ext4_journalled_write_end(struct file *file,
if (!ret)
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1827,11 +1870,15 @@ static int ext4_journalled_write_end(struct file *file,
return ret ? ret : copied;
}
-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
{
int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned long md_needed, mdblocks, total = 0;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long md_needed, md_reserved;
/*
* recalculate the amount of metadata blocks to reserve
@@ -1839,86 +1886,78 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
* worse case is one extent per block
*/
repeat:
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
- mdblocks = ext4_calc_metadata_amount(inode, total);
- BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
-
- md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
- total = md_needed + nrblocks;
+ spin_lock(&ei->i_block_reservation_lock);
+ md_reserved = ei->i_reserved_meta_blocks;
+ md_needed = ext4_calc_metadata_amount(inode, lblock);
+ spin_unlock(&ei->i_block_reservation_lock);
/*
* Make quota reservation here to prevent quota overflow
* later. Real quota accounting is done at pages writeout
* time.
*/
- if (vfs_dq_reserve_block(inode, total)) {
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ if (vfs_dq_reserve_block(inode, md_needed + 1))
return -EDQUOT;
- }
- if (ext4_claim_free_blocks(sbi, total)) {
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- vfs_dq_release_reservation_block(inode, total);
+ if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+ vfs_dq_release_reservation_block(inode, md_needed + 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
return -ENOSPC;
}
- EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
- EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks++;
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return 0; /* success */
}
static void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free, release;
+ struct ext4_inode_info *ei = EXT4_I(inode);
if (!to_free)
return; /* Nothing to release, exit */
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- if (!EXT4_I(inode)->i_reserved_data_blocks) {
+ if (unlikely(to_free > ei->i_reserved_data_blocks)) {
/*
- * if there is no reserved blocks, but we try to free some
- * then the counter is messed up somewhere.
- * but since this function is called from invalidate
- * page, it's harmless to return without any action
+ * if there aren't enough reserved blocks, then the
+ * counter is messed up somewhere. Since this
+ * function is called from invalidate page, it's
+ * harmless to return without any action.
*/
- printk(KERN_INFO "ext4 delalloc try to release %d reserved "
- "blocks for inode %lu, but there is no reserved "
- "data blocks\n", to_free, inode->i_ino);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return;
+ ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+ "ino %lu, to_free %d with only %d reserved "
+ "data blocks\n", inode->i_ino, to_free,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ to_free = ei->i_reserved_data_blocks;
}
+ ei->i_reserved_data_blocks -= to_free;
- /* recalculate the number of metablocks still need to be reserved */
- total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
- mdb = ext4_calc_metadata_amount(inode, total);
-
- /* figure out how many metablocks to release */
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
- release = to_free + mdb_free;
-
- /* update fs dirty blocks counter for truncate case */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ to_free += ei->i_reserved_meta_blocks;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
+ }
- /* update per-inode reservations */
- BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+ /* update fs dirty blocks counter */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- vfs_dq_release_reservation_block(inode, release);
+ vfs_dq_release_reservation_block(inode, to_free);
}
static void ext4_da_page_release_reservation(struct page *page,
@@ -2223,10 +2262,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* variables are updated after the blocks have been allocated.
*/
new.b_state = 0;
- get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
- EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
&new, get_blocks_flags);
if (blks < 0) {
@@ -2255,7 +2294,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
ext4_msg(mpd->inode->i_sb, KERN_CRIT,
"delayed block allocation failed for inode %lu at "
"logical offset %llu with max blocks %zd with "
- "error %d\n", mpd->inode->i_ino,
+ "error %d", mpd->inode->i_ino,
(unsigned long long) next,
mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_CRIT "This should not happen!! "
@@ -2322,8 +2361,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
sector_t next;
int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+ /*
+ * XXX Don't go larger than mballoc is willing to allocate
+ * This is a stopgap solution. We eventually need to fold
+ * mpage_da_submit_io() into this function and then call
+ * ext4_get_blocks() multiple times in a loop
+ */
+ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+ goto flush_it;
+
/* check if thereserved journal credits might overflow */
- if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
if (nrblocks >= EXT4_MAX_TRANS_DATA) {
/*
* With non-extent format we are limited by the journal
@@ -2524,7 +2572,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
*/
- ret = ext4_da_reserve_space(inode, 1);
+ ret = ext4_da_reserve_space(inode, iblock);
if (ret)
/* not enough space to reserve */
return ret;
@@ -2635,7 +2683,7 @@ static int __ext4_journalled_writepage(struct page *page,
ret = err;
walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
return ret;
}
@@ -2788,7 +2836,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* number of contiguous block. So we will limit
* number of contiguous block to a sane value
*/
- if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
(max_blocks > EXT4_MAX_TRANS_DATA))
max_blocks = EXT4_MAX_TRANS_DATA;
@@ -2908,7 +2956,7 @@ retry:
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
- "%ld pages, ino %lu; err %d\n", __func__,
+ "%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
goto out_writepages;
}
@@ -2983,7 +3031,7 @@ retry:
if (pages_skipped != wbc->pages_skipped)
ext4_msg(inode->i_sb, KERN_CRIT,
"This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
+ "with nr_to_write = %ld ret = %d",
__func__, wbc->nr_to_write, ret);
/* Update index */
@@ -2999,8 +3047,7 @@ retry:
out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
- if (wbc->nr_to_write > nr_to_writebump)
- wbc->nr_to_write -= nr_to_writebump;
+ wbc->nr_to_write -= nr_to_writebump;
wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
@@ -3025,11 +3072,18 @@ static int ext4_nonda_switch(struct super_block *sb)
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
/*
- * free block count is less that 150% of dirty blocks
- * or free blocks is less that watermark
+ * free block count is less than 150% of dirty blocks
+ * or free blocks is less than watermark
*/
return 1;
}
+ /*
+ * Even if we don't switch but are nearing capacity,
+ * start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (free_blocks < 2 * dirty_blocks)
+ writeback_inodes_sb_if_idle(sb);
+
return 0;
}
@@ -3037,7 +3091,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- int ret, retries = 0;
+ int ret, retries = 0, quota_retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
@@ -3091,11 +3145,27 @@ retry:
* i_size_read because we hold i_mutex.
*/
if (pos + len > inode->i_size)
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+
+ if ((ret == -EDQUOT) &&
+ EXT4_I(inode)->i_reserved_meta_blocks &&
+ (quota_retries++ < 3)) {
+ /*
+ * Since we often over-estimate the number of meta
+ * data blocks required, we may sometimes get a
+ * spurios out of quota error even though there would
+ * be enough space once we write the data blocks and
+ * find out how many meta data blocks were _really_
+ * required. So try forcing the inode write to see if
+ * that helps.
+ */
+ write_inode_now(inode, (quota_retries == 3));
+ goto retry;
+ }
out:
return ret;
}
@@ -3284,7 +3354,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
filemap_write_and_wait(mapping);
}
- if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+ if (EXT4_JOURNAL(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -3303,7 +3374,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
* everything they get.
*/
- EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+ ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
journal = EXT4_JOURNAL(inode);
jbd2_journal_lock_updates(journal);
err = jbd2_journal_flush(journal);
@@ -3419,6 +3490,9 @@ retry:
* but cannot extend i_size. Bail out and pretend
* the write failed... */
ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
goto out;
}
if (inode->i_nlink)
@@ -3534,7 +3608,7 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
- size_t size = io->size;
+ ssize_t size = io->size;
int ret = 0;
ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -3771,8 +3845,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
ext4_free_io_end(iocb->private);
iocb->private = NULL;
- } else if (ret > 0 && (EXT4_I(inode)->i_state &
- EXT4_STATE_DIO_UNWRITTEN)) {
+ } else if (ret > 0 && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
* for non AIO case, since the IO is already
@@ -3782,7 +3856,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
offset, ret);
if (err < 0)
ret = err;
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
return ret;
}
@@ -3798,7 +3872,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4120,6 +4194,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *last)
{
__le32 *p;
+ int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode);
+
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4150,11 +4226,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
*p = 0;
tbh = sb_find_get_block(inode->i_sb, nr);
- ext4_forget(handle, 0, inode, tbh, nr);
+ ext4_forget(handle, is_metadata, inode, tbh, nr);
}
}
- ext4_free_blocks(handle, inode, block_to_free, count, 0);
+ ext4_free_blocks(handle, inode, block_to_free, count, is_metadata);
}
/**
@@ -4427,10 +4503,12 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ext4_ext_truncate(inode);
return;
}
@@ -4714,7 +4792,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext4_get_inode_loc(inode, iloc,
- !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+ !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
void ext4_set_inode_flags(struct inode *inode)
@@ -4737,20 +4815,26 @@ void ext4_set_inode_flags(struct inode *inode)
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
void ext4_get_inode_flags(struct ext4_inode_info *ei)
{
- unsigned int flags = ei->vfs_inode.i_flags;
-
- ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
- EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
- if (flags & S_SYNC)
- ei->i_flags |= EXT4_SYNC_FL;
- if (flags & S_APPEND)
- ei->i_flags |= EXT4_APPEND_FL;
- if (flags & S_IMMUTABLE)
- ei->i_flags |= EXT4_IMMUTABLE_FL;
- if (flags & S_NOATIME)
- ei->i_flags |= EXT4_NOATIME_FL;
- if (flags & S_DIRSYNC)
- ei->i_flags |= EXT4_DIRSYNC_FL;
+ unsigned int vfs_fl;
+ unsigned long old_fl, new_fl;
+
+ do {
+ vfs_fl = ei->vfs_inode.i_flags;
+ old_fl = ei->i_flags;
+ new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+ EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+ EXT4_DIRSYNC_FL);
+ if (vfs_fl & S_SYNC)
+ new_fl |= EXT4_SYNC_FL;
+ if (vfs_fl & S_APPEND)
+ new_fl |= EXT4_APPEND_FL;
+ if (vfs_fl & S_IMMUTABLE)
+ new_fl |= EXT4_IMMUTABLE_FL;
+ if (vfs_fl & S_NOATIME)
+ new_fl |= EXT4_NOATIME_FL;
+ if (vfs_fl & S_DIRSYNC)
+ new_fl |= EXT4_DIRSYNC_FL;
+ } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
}
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -4781,8 +4865,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
- struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
@@ -4793,11 +4877,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
+ iloc.bh = 0;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
- bh = iloc.bh;
raw_inode = ext4_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4808,7 +4892,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
- ei->i_state = 0;
+ ei->i_state_flags = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -4820,7 +4904,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (inode->i_mode == 0 ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
/* this inode is deleted */
- brelse(bh);
ret = -ESTALE;
goto bad_inode;
}
@@ -4837,6 +4920,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
ei->i_last_alloc_group = ~0;
@@ -4848,11 +4934,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ ei->i_sync_tid = tid;
+ ei->i_datasync_tid = tid;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT4_INODE_SIZE(inode->i_sb)) {
- brelse(bh);
ret = -EIO;
goto bad_inode;
}
@@ -4865,7 +4975,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ei->i_state |= EXT4_STATE_XATTR;
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
}
} else
ei->i_extra_isize = 0;
@@ -4884,10 +4994,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
- ((ei->i_file_acl <
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- EXT4_SB(sb)->s_gdb_count)) ||
- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
ext4_error(sb, __func__,
"bad extended attribute block %llu in inode #%lu",
ei->i_file_acl, inode->i_ino);
@@ -4905,10 +5012,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* Validate block references which are part of inode */
ret = ext4_check_inode_blockref(inode);
}
- if (ret) {
- brelse(bh);
+ if (ret)
goto bad_inode;
- }
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -4936,7 +5041,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
} else {
- brelse(bh);
ret = -EIO;
ext4_error(inode->i_sb, __func__,
"bogus i_mode (%o) for inode=%lu",
@@ -4949,6 +5053,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
bad_inode:
+ brelse(iloc.bh);
iget_failed(inode);
return ERR_PTR(ret);
}
@@ -4968,7 +5073,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = 0;
- ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
return 0;
}
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -4981,9 +5086,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
} else {
- ei->i_flags |= EXT4_HUGE_FILE_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
/* i_block is stored in file system block size */
i_blocks = i_blocks >> (inode->i_blkbits - 9);
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5010,7 +5115,7 @@ static int ext4_do_update_inode(handle_t *handle,
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT4_STATE_NEW)
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
ext4_get_inode_flags(ei);
@@ -5074,7 +5179,7 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
sb->s_dirt = 1;
ext4_handle_sync(handle);
- err = ext4_handle_dirty_metadata(handle, inode,
+ err = ext4_handle_dirty_metadata(handle, NULL,
EXT4_SB(sb)->s_sbh);
}
}
@@ -5103,11 +5208,12 @@ static int ext4_do_update_inode(handle_t *handle,
}
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT4_STATE_NEW;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -5170,7 +5276,7 @@ int ext4_write_inode(struct inode *inode, int wait)
} else {
struct ext4_iloc iloc;
- err = ext4_get_inode_loc(inode, &iloc);
+ err = __ext4_get_inode_loc(inode, &iloc, 0);
if (err)
return err;
if (wait)
@@ -5183,6 +5289,7 @@ int ext4_write_inode(struct inode *inode, int wait)
(unsigned long long)iloc.bh->b_blocknr);
err = -EIO;
}
+ brelse(iloc.bh);
}
return err;
}
@@ -5227,8 +5334,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -5249,7 +5356,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5260,7 +5367,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+ attr->ia_valid & ATTR_SIZE &&
+ (attr->ia_size < inode->i_size ||
+ (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
handle_t *handle;
handle = ext4_journal_start(inode, 3);
@@ -5291,6 +5400,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
goto err_out;
}
}
+ /* ext4_truncate will clear the flag */
+ if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
+ ext4_truncate(inode);
}
rc = inode_setattr(inode, attr);
@@ -5365,7 +5477,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
}
@@ -5529,8 +5641,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
entry = IFIRST(header);
/* No extended attributes present */
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
- header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
new_extra_isize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5574,7 +5686,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
- !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+ !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
/*
* We need extra buffer credits since we may write into EA block
* with this same handle. If journal_extend fails, then it will
@@ -5588,7 +5700,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
sbi->s_want_extra_isize,
iloc, handle);
if (ret) {
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_NO_EXPAND);
if (mnt_count !=
le16_to_cpu(sbi->s_es->s_mnt_count)) {
ext4_warning(inode->i_sb, __func__,
@@ -5655,7 +5768,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
err = jbd2_journal_get_write_access(handle, iloc.bh);
if (!err)
err = ext4_handle_dirty_metadata(handle,
- inode,
+ NULL,
iloc.bh);
brelse(iloc.bh);
}
@@ -5699,9 +5812,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/
if (val)
- EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else
- EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1cdf613e725..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags &= ~EXT4_EXTENTS_FL;
}
+ if (flags & EXT4_EOFBLOCKS_FL) {
+ /* we don't support adding EOFBLOCKS flag */
+ if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+ } else if (oldflags & EXT4_EOFBLOCKS_FL)
+ ext4_truncate(inode);
+
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -221,31 +230,39 @@ setversion_out:
struct file *donor_filp;
int err;
+ if (!(filp->f_mode & FMODE_READ) ||
+ !(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
if (copy_from_user(&me,
(struct move_extent __user *)arg, sizeof(me)))
return -EFAULT;
+ me.moved_len = 0;
donor_filp = fget(me.donor_fd);
if (!donor_filp)
return -EBADF;
- if (!capable(CAP_DAC_OVERRIDE)) {
- if ((current->real_cred->fsuid != inode->i_uid) ||
- !(inode->i_mode & S_IRUSR) ||
- !(donor_filp->f_dentry->d_inode->i_mode &
- S_IRUSR)) {
- fput(donor_filp);
- return -EACCES;
- }
+ if (!(donor_filp->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ goto mext_out;
}
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ goto mext_out;
+
err = ext4_move_extents(filp, donor_filp, me.orig_start,
me.donor_start, me.len, &me.moved_len);
- fput(donor_filp);
-
- if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
- return -EFAULT;
+ mnt_drop_write(filp->f_path.mnt);
+ if (me.moved_len > 0)
+ file_remove_suid(donor_filp);
+ if (copy_to_user((struct move_extent __user *)arg,
+ &me, sizeof(me)))
+ err = -EFAULT;
+mext_out:
+ fput(donor_filp);
return err;
}
@@ -356,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC32_SETRSVSZ:
cmd = EXT4_IOC_SETRSVSZ;
break;
- case EXT4_IOC_GROUP_ADD:
+ case EXT4_IOC32_GROUP_ADD: {
+ struct compat_ext4_new_group_input __user *uinput;
+ struct ext4_new_group_input input;
+ mm_segment_t old_fs;
+ int err;
+
+ uinput = compat_ptr(arg);
+ err = get_user(input.group, &uinput->group);
+ err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+ err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+ err |= get_user(input.inode_table, &uinput->inode_table);
+ err |= get_user(input.blocks_count, &uinput->blocks_count);
+ err |= get_user(input.reserved_blocks,
+ &uinput->reserved_blocks);
+ if (err)
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+ (unsigned long) &input);
+ set_fs(old_fs);
+ return err;
+ }
+ case EXT4_IOC_MOVE_EXT:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba12824defa..04e07e227278 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+ int i;
+ int bits;
+
+ grp->bb_largest_free_order = -1; /* uninit */
+
+ bits = sb->s_blocksize_bits + 1;
+ for (i = bits; i >= 0; i--) {
+ if (grp->bb_counters[i] > 0) {
+ grp->bb_largest_free_order = i;
+ break;
+ }
+ }
+}
+
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
*/
grp->bb_free = free;
}
+ mb_set_largest_free_order(sb, grp);
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
* contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
* So it can have information regarding groups_per_page which
* is blocks_per_page/2
+ *
+ * Locking note: This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
*/
static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -910,6 +935,11 @@ out:
return err;
}
+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{
@@ -1004,6 +1034,11 @@ err:
return ret;
}
+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -1150,7 +1185,7 @@ err:
return ret;
}
-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
@@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
buddy = buddy2;
} while (1);
}
+ mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
@@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
e4b->bd_info->bb_counters[ord]++;
e4b->bd_info->bb_counters[ord]++;
}
+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
mb_check_buddy(e4b);
@@ -1618,7 +1655,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);
return 0;
}
@@ -1674,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);
return 0;
}
@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}
+/* This is now called BEFORE we load the buddy bitmap. */
static int ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
unsigned free, fragments;
- unsigned i, bits;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < 0 || cr >= 4);
- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ int ret = ext4_mb_init_group(ac->ac_sb, group);
+ if (ret)
+ return 0;
+ }
free = grp->bb_free;
fragments = grp->bb_fragments;
@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;
- bits = ac->ac_sb->s_blocksize_bits + 1;
- for (i = ac->ac_2order; i <= bits; i++)
- if (grp->bb_counters[i] > 0)
- return 1;
- break;
+ return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
return 1;
@@ -1966,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
/* non-extent files are limited to low blocks/groups */
- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
ngroups = sbi->s_blockfile_groups;
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2026,15 +2068,11 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
- struct ext4_group_info *grp;
- struct ext4_group_desc *desc;
-
if (group == ngroups)
group = 0;
- /* quick check to skip empty groups */
- grp = ext4_get_group_info(sb, group);
- if (grp->bb_free == 0)
+ /* This now checks without needing the buddy page */
+ if (!ext4_mb_good_group(ac, group, cr))
continue;
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2042,15 +2080,18 @@ repeat:
goto out;
ext4_lock_group(sb, group);
+
+ /*
+ * We need to check again after locking the
+ * block group
+ */
if (!ext4_mb_good_group(ac, group, cr)) {
- /* someone did allocation from this group */
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}
ac->ac_groups_scanned++;
- desc = ext4_get_group_desc(sb, group, NULL);
if (cr == 0)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 &&
@@ -2060,7 +2101,7 @@ repeat:
ext4_mb_complex_scan_group(ac, &e4b);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
@@ -2150,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
ext4_lock_group(sb, group);
memcpy(&sg, ext4_get_group_info(sb, group), i);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2257,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root.rb_node = NULL;
+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
#ifdef DOUBLE_CHECK
{
@@ -2529,7 +2571,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
struct ext4_free_data *entry;
- ext4_fsblk_t discard_block;
struct list_head *l, *ltmp;
list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2538,6 +2579,23 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);
+ if (test_opt(sb, DISCARD)) {
+ int ret;
+ ext4_fsblk_t discard_block;
+
+ discard_block = entry->start_blk +
+ ext4_group_first_block_no(sb, entry->group);
+ trace_ext4_discard_blocks(sb,
+ (unsigned long long)discard_block,
+ entry->count);
+ ret = sb_issue_discard(sb, discard_block, entry->count);
+ if (ret == EOPNOTSUPP) {
+ ext4_warning(sb, __func__,
+ "discard not supported, disabling");
+ clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+ }
+ }
+
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
@@ -2559,15 +2617,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
page_cache_release(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->group);
- discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
- + entry->start_blk
- + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
- entry->count);
- sb_issue_discard(sb, discard_block, entry->count);
-
kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
}
mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2750,12 +2801,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
- else {
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
- ac->ac_b_ex.fe_len);
- /* convert reserved quota blocks to real quota blocks */
- vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
- }
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3006,6 +3051,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
}
/*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context. We don't need this for MB_GROUP_PA because we only change
+ * pa_free in ext4_mb_release_context(), but on failure, we've already
+ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+ int len;
+
+ if (pa && pa->pa_type == MB_INODE_PA) {
+ len = ac->ac_b_ex.fe_len;
+ pa->pa_free += len;
+ }
+
+}
+
+/*
* use blocks preallocated to inode
*/
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
@@ -3113,7 +3176,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
continue;
/* non-extent files can't have physical blocks past 2^32 */
- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
continue;
@@ -3692,7 +3755,7 @@ out:
ext4_unlock_group(sb, group);
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
return free;
}
@@ -3796,7 +3859,7 @@ repeat:
if (bitmap_bh == NULL) {
ext4_error(sb, __func__, "Error in reading block "
"bitmap for %u", group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -3805,7 +3868,7 @@ repeat:
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
list_del(&pa->u.pa_tmp_list);
@@ -3921,7 +3984,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
/* don't use group allocation for large files */
size = max(size, isize);
- if (size >= sbi->s_mb_stream_request) {
+ if (size > sbi->s_mb_stream_request) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
}
@@ -4069,7 +4132,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
ext4_mb_release_group_pa(&e4b, pa, ac);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
@@ -4290,6 +4353,7 @@ repeat:
ac->ac_status = AC_STATUS_CONTINUE;
goto repeat;
} else if (*errp) {
+ ext4_discard_allocated_blocks(ac);
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
ext4_mb_show_ac(ac);
@@ -4570,7 +4634,7 @@ do_more:
atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
}
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
*freed += count;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 0ca811061bc7..ceb9d41cd7cf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -221,8 +221,6 @@ struct ext4_buddy {
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a93d5b80f3e2..7901f133f967 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
* So allocate a credit of 3. We may update
* quota (user and group).
*/
- needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
if (ext4_journal_extend(handle, needed) != 0)
retval = ext4_journal_restart(handle, needed);
@@ -357,12 +357,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* happened after we started the migrate. We need to
* fail the migrate
*/
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
@@ -465,7 +465,7 @@ int ext4_ext_migrate(struct inode *inode)
*/
if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_INCOMPAT_EXTENTS) ||
- (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
@@ -477,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode)
handle = ext4_journal_start(inode,
EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+ EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
+ 1);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
@@ -494,14 +494,10 @@ int ext4_ext_migrate(struct inode *inode)
}
i_size_write(tmp_inode, i_size_read(inode));
/*
- * We don't want the inode to be reclaimed
- * if we got interrupted in between. We have
- * this tmp inode carrying reference to the
- * data blocks of the original file. We set
- * the i_nlink to zero at the last stage after
- * switching the original file to extent format
+ * Set the i_nlink to zero so it will be deleted later
+ * when we drop inode reference.
*/
- tmp_inode->i_nlink = 1;
+ tmp_inode->i_nlink = 0;
ext4_ext_tree_init(handle, tmp_inode);
ext4_orphan_add(handle, tmp_inode);
@@ -524,10 +520,20 @@ int ext4_ext_migrate(struct inode *inode)
* allocation.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ /*
+ * It is impossible to update on-disk structures without
+ * a handle, so just rollback in-core changes and live other
+ * work to orphan_list_cleanup()
+ */
+ ext4_orphan_del(NULL, tmp_inode);
+ retval = PTR_ERR(handle);
+ goto out;
+ }
ei = EXT4_I(inode);
i_data = ei->i_data;
@@ -609,15 +615,8 @@ err_out:
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
-
- /*
- * Set the i_nlink to zero so that
- * generic_drop_inode really deletes the
- * inode
- */
- tmp_inode->i_nlink = 0;
-
ext4_journal_stop(handle);
+out:
unlock_new_inode(tmp_inode);
iput(tmp_inode);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 25b6b1457360..a73ed781752d 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
+ struct ext4_extent_header *eh;
int ppos, leaf_ppos = path->p_depth;
ppos = leaf_ppos;
if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
/* leaf block */
*extent = ++path[ppos].p_ext;
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
return 0;
}
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
ext_block_hdr(path[cur_ppos+1].p_bh);
}
+ path[leaf_ppos].p_ext = *extent = NULL;
+
+ eh = path[leaf_ppos].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) == 0)
+ /* empty leaf is found */
+ return -ENODATA;
+
/* leaf block */
path[leaf_ppos].p_ext = *extent =
EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ path[leaf_ppos].p_block =
+ ext_pblock(path[leaf_ppos].p_ext);
return 0;
}
}
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
}
/**
- * mext_double_down_read - Acquire two inodes' read semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure
* @donor_inode: donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
*/
static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
-{
- struct inode *first = orig_inode, *second = donor_inode;
-
- /*
- * Use the inode number to provide the stable locking order instead
- * of its address, because the C language doesn't guarantee you can
- * compare pointers that don't come from the same array.
- */
- if (donor_inode->i_ino < orig_inode->i_ino) {
- first = donor_inode;
- second = orig_inode;
- }
-
- down_read(&EXT4_I(first)->i_data_sem);
- down_read(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_down_write - Acquire two inodes' write semaphore
- *
- * @orig_inode: original inode structure
- * @donor_inode: donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
- */
-static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
@@ -203,32 +189,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
}
down_write(&EXT4_I(first)->i_data_sem);
- down_write(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_up_read - Release two inodes' read semaphore
- *
- * @orig_inode: original inode structure to be released its lock first
- * @donor_inode: donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
-{
- up_read(&EXT4_I(orig_inode)->i_data_sem);
- up_read(&EXT4_I(donor_inode)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
}
/**
- * mext_double_up_write - Release two inodes' write semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
*/
static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
{
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -280,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
o_start->ee_len = start_ext->ee_len;
+ eblock = le32_to_cpu(start_ext->ee_block);
new_flag = 1;
} else if (start_ext->ee_len && new_ext->ee_len &&
@@ -290,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
* orig |------------------------------|
*/
o_start->ee_len = start_ext->ee_len;
+ eblock = le32_to_cpu(start_ext->ee_block);
new_flag = 1;
} else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -503,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
struct ext4_extent new_ext, start_ext, end_ext;
ext4_lblk_t new_ext_end;
- ext4_fsblk_t new_phys_end;
int oext_alen, new_ext_alen, end_ext_alen;
int depth = ext_depth(orig_inode);
int ret;
@@ -517,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
new_ext.ee_len = dext->ee_len;
new_ext_alen = ext4_ext_get_actual_len(&new_ext);
new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
- new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
/*
* Case: original extent is first
@@ -530,6 +502,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
le32_to_cpu(oext->ee_block) + oext_alen) {
start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
le32_to_cpu(oext->ee_block));
+ start_ext.ee_block = oext->ee_block;
copy_extent_status(oext, &start_ext);
} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
prev_ext = oext - 1;
@@ -543,6 +516,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
start_ext.ee_len = cpu_to_le16(
ext4_ext_get_actual_len(prev_ext) +
new_ext_alen);
+ start_ext.ee_block = oext->ee_block;
copy_extent_status(prev_ext, &start_ext);
new_ext.ee_len = 0;
}
@@ -661,6 +635,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* @donor_inode: donor inode
* @from: block offset of orig_inode
* @count: block count to be replaced
+ * @err: pointer to save return value
*
* Replace original inode extents and donor inode extents page by page.
* We implement this replacement in the following three steps:
@@ -671,33 +646,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* 3. Change the block information of donor inode to point at the saved
* original inode blocks in the dummy extents.
*
- * Return 0 on success, or a negative error value on failure.
+ * Return replaced block count.
*/
static int
mext_replace_branches(handle_t *handle, struct inode *orig_inode,
struct inode *donor_inode, ext4_lblk_t from,
- ext4_lblk_t count)
+ ext4_lblk_t count, int *err)
{
struct ext4_ext_path *orig_path = NULL;
struct ext4_ext_path *donor_path = NULL;
struct ext4_extent *oext, *dext;
struct ext4_extent tmp_dext, tmp_oext;
ext4_lblk_t orig_off = from, donor_off = from;
- int err = 0;
int depth;
int replaced_count = 0;
int dext_alen;
- mext_double_down_write(orig_inode, donor_inode);
+ /* Protect extent trees against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
/* Get the original extent for the block "orig_off" */
- err = get_ext_path(orig_inode, orig_off, &orig_path);
- if (err)
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
goto out;
/* Get the donor extent for the head */
- err = get_ext_path(donor_inode, donor_off, &donor_path);
- if (err)
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -707,9 +682,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
dext = donor_path[depth].p_ext;
tmp_dext = *dext;
- err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count);
- if (err)
+ if (*err)
goto out;
/* Loop for the donor extents */
@@ -718,7 +693,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (!dext) {
ext4_error(donor_inode->i_sb, __func__,
"The extent for donor must be found");
- err = -EIO;
+ *err = -EIO;
goto out;
} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
ext4_error(donor_inode->i_sb, __func__,
@@ -726,20 +701,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
"extent(%u) should be equal",
donor_off,
le32_to_cpu(tmp_dext.ee_block));
- err = -EIO;
+ *err = -EIO;
goto out;
}
/* Set donor extent to orig extent */
- err = mext_leaf_block(handle, orig_inode,
+ *err = mext_leaf_block(handle, orig_inode,
orig_path, &tmp_dext, &orig_off);
- if (err < 0)
+ if (*err)
goto out;
/* Set orig extent to donor extent */
- err = mext_leaf_block(handle, donor_inode,
+ *err = mext_leaf_block(handle, donor_inode,
donor_path, &tmp_oext, &donor_off);
- if (err < 0)
+ if (*err)
goto out;
dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -753,35 +728,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (orig_path)
ext4_ext_drop_refs(orig_path);
- err = get_ext_path(orig_inode, orig_off, &orig_path);
- if (err)
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
- if (le32_to_cpu(oext->ee_block) +
- ext4_ext_get_actual_len(oext) <= orig_off) {
- err = 0;
- goto out;
- }
tmp_oext = *oext;
if (donor_path)
ext4_ext_drop_refs(donor_path);
- err = get_ext_path(donor_inode, donor_off, &donor_path);
- if (err)
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
goto out;
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
- if (le32_to_cpu(dext->ee_block) +
- ext4_ext_get_actual_len(dext) <= donor_off) {
- err = 0;
- goto out;
- }
tmp_dext = *dext;
- err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count - replaced_count);
- if (err)
+ if (*err)
goto out;
}
@@ -795,8 +760,12 @@ out:
kfree(donor_path);
}
- mext_double_up_write(orig_inode, donor_inode);
- return err;
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+
+ double_up_write_data_sem(orig_inode, donor_inode);
+
+ return replaced_count;
}
/**
@@ -808,16 +777,17 @@ out:
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
* @uninit: orig extent is uninitialized or not
+ * @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
* with donor inode extents by calling mext_replace_branches().
- * Finally, write out the saved data in new original inode blocks. Return 0
- * on success, or a negative error value on failure.
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
*/
static int
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
- int block_len_in_page, int uninit)
+ int block_len_in_page, int uninit, int *err)
{
struct inode *orig_inode = o_filp->f_dentry->d_inode;
struct address_space *mapping = orig_inode->i_mapping;
@@ -829,9 +799,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
- unsigned int tmp_data_len, data_len;
+ unsigned int tmp_data_size, data_size, replaced_size;
void *fsdata;
- int ret, i, jblocks;
+ int i, jblocks;
+ int err2 = 0;
+ int replaced_count = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
/*
@@ -841,8 +813,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
handle = ext4_journal_start(orig_inode, jblocks);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
+ *err = PTR_ERR(handle);
+ return 0;
}
if (segment_eq(get_fs(), KERNEL_DS))
@@ -858,39 +830,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* Just swap data blocks between orig and donor.
*/
if (uninit) {
- ret = mext_replace_branches(handle, orig_inode,
- donor_inode, orig_blk_offset,
- block_len_in_page);
-
- /* Clear the inode cache not to refer to the old data */
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
goto out2;
}
offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
- /* Calculate data_len */
+ /* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
/* Replace the last block */
- tmp_data_len = orig_inode->i_size & (blocksize - 1);
+ tmp_data_size = orig_inode->i_size & (blocksize - 1);
/*
- * If data_len equal zero, it shows data_len is multiples of
+ * If data_size equal zero, it shows data_size is multiples of
* blocksize. So we set appropriate value.
*/
- if (tmp_data_len == 0)
- tmp_data_len = blocksize;
+ if (tmp_data_size == 0)
+ tmp_data_size = blocksize;
- data_len = tmp_data_len +
+ data_size = tmp_data_size +
((block_len_in_page - 1) << orig_inode->i_blkbits);
- } else {
- data_len = block_len_in_page << orig_inode->i_blkbits;
- }
+ } else
+ data_size = block_len_in_page << orig_inode->i_blkbits;
+
+ replaced_size = data_size;
- ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+ *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
&page, &fsdata);
- if (unlikely(ret < 0))
+ if (unlikely(*err < 0))
goto out;
if (!PageUptodate(page)) {
@@ -911,14 +880,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
/* Release old bh and drop refs */
try_to_release_page(page, 0);
- ret = mext_replace_branches(handle, orig_inode, donor_inode,
- orig_blk_offset, block_len_in_page);
- if (ret < 0)
- goto out;
-
- /* Clear the inode cache not to refer to the old data */
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset, block_len_in_page,
+ &err2);
+ if (err2) {
+ if (replaced_count) {
+ block_len_in_page = replaced_count;
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+ } else
+ goto out;
+ }
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -928,16 +900,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
- ret = ext4_get_block(orig_inode,
+ *err = ext4_get_block(orig_inode,
(sector_t)(orig_blk_offset + i), bh, 0);
- if (ret < 0)
+ if (*err < 0)
goto out;
if (bh->b_this_page != NULL)
bh = bh->b_this_page;
}
- ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+ *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
page, fsdata);
page = NULL;
@@ -951,18 +923,20 @@ out:
out2:
ext4_journal_stop(handle);
- return ret < 0 ? ret : 0;
+ if (err2)
+ *err = err2;
+
+ return replaced_count;
}
/**
- * mext_check_argumants - Check whether move extent can be done
+ * mext_check_arguments - Check whether move extent can be done
*
* @orig_inode: original inode
* @donor_inode: donor inode
* @orig_start: logical start offset in block for orig
* @donor_start: logical start offset in block for donor
* @len: the number of blocks to be moved
- * @moved_len: moved block length
*
* Check the arguments of ext4_move_extents() whether the files can be
* exchanged with each other.
@@ -970,21 +944,23 @@ out2:
*/
static int
mext_check_arguments(struct inode *orig_inode,
- struct inode *donor_inode, __u64 orig_start,
- __u64 donor_start, __u64 *len, __u64 moved_len)
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
{
ext4_lblk_t orig_blocks, donor_blocks;
unsigned int blkbits = orig_inode->i_blkbits;
unsigned int blocksize = 1 << blkbits;
- /* Regular file check */
- if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
- ext4_debug("ext4 move extent: The argument files should be "
- "regular file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
+ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+ ext4_debug("ext4 move extent: suid or sgid is set"
+ " to donor file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
+ if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+ return -EPERM;
+
/* Ext4 move extent does not support swapfile */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
ext4_debug("ext4 move extent: The argument files should "
@@ -1002,11 +978,11 @@ mext_check_arguments(struct inode *orig_inode,
}
/* Ext4 move extent supports only extent based file */
- if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
"based file [ino:orig %lu]\n", orig_inode->i_ino);
return -EOPNOTSUPP;
- } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: donor file is not extents "
"based file [ino:donor %lu]\n", donor_inode->i_ino);
return -EOPNOTSUPP;
@@ -1025,13 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- if (moved_len) {
- ext4_debug("ext4 move extent: moved_len should be 0 "
- "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
- donor_inode->i_ino);
- return -EINVAL;
- }
-
if ((orig_start > EXT_MAX_BLOCK) ||
(donor_start > EXT_MAX_BLOCK) ||
(*len > EXT_MAX_BLOCK) ||
@@ -1232,16 +1201,24 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
return -EINVAL;
}
- /* protect orig and donor against a truncate */
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be "
+ "regular file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Protect orig and donor inodes against a truncate */
ret1 = mext_inode_double_lock(orig_inode, donor_inode);
if (ret1 < 0)
return ret1;
- mext_double_down_read(orig_inode, donor_inode);
+ /* Protect extent tree against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
- donor_start, &len, *moved_len);
- mext_double_up_read(orig_inode, donor_inode);
+ donor_start, &len);
if (ret1)
goto out;
@@ -1355,36 +1332,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
seq_start = le32_to_cpu(ext_cur->ee_block);
rest_blocks = seq_blocks;
- /* Discard preallocations of two inodes */
- down_write(&EXT4_I(orig_inode)->i_data_sem);
- ext4_discard_preallocations(orig_inode);
- up_write(&EXT4_I(orig_inode)->i_data_sem);
-
- down_write(&EXT4_I(donor_inode)->i_data_sem);
- ext4_discard_preallocations(donor_inode);
- up_write(&EXT4_I(donor_inode)->i_data_sem);
+ /*
+ * Up semaphore to avoid following problems:
+ * a. transaction deadlock among ext4_journal_start,
+ * ->write_begin via pagefault, and jbd2_journal_commit
+ * b. racing with ->readpage, ->write_begin, and ext4_get_block
+ * in move_extent_per_page
+ */
+ double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
/* Swap original branches with new branches */
- ret1 = move_extent_per_page(o_filp, donor_inode,
+ block_len_in_page = move_extent_per_page(
+ o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
- block_len_in_page, uninit);
- if (ret1 < 0)
- goto out;
- orig_page_offset++;
+ block_len_in_page, uninit,
+ &ret1);
+
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
+ if (ret1 < 0)
+ break;
if (*moved_len > len) {
ext4_error(orig_inode->i_sb, __func__,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
ret1 = -EIO;
- goto out;
+ break;
}
+ orig_page_offset++;
data_offset_in_page = 0;
rest_blocks -= block_len_in_page;
if (rest_blocks > blocks_per_page)
@@ -1393,6 +1373,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
+ double_down_write_data_sem(orig_inode, donor_inode);
+ if (ret1 < 0)
+ break;
+
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
@@ -1414,6 +1398,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
}
out:
+ if (*moved_len) {
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
+ }
+
if (orig_path) {
ext4_ext_drop_refs(orig_path);
kfree(orig_path);
@@ -1422,7 +1411,7 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
-
+ double_up_write_data_sem(orig_inode, donor_inode);
ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
if (ret1)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d2c1b897fc7..c3b6ad0fcc46 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -660,7 +660,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
dir = dir_file->f_path.dentry->d_inode;
- if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+ if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version +=
@@ -805,7 +805,7 @@ static void ext4_update_dx_flag(struct inode *inode)
{
if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX))
- EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
/*
@@ -1292,9 +1292,6 @@ errout:
* add_dirent_to_buf will attempt search the directory block for
* space. It will return -ENOSPC if no space is available, and -EIO
* and -EEXIST if directory entry already exists.
- *
- * NOTE! bh is NOT released in the case where ENOSPC is returned. In
- * all other cases bh is released.
*/
static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
top = bh->b_data + blocksize - reclen;
while ((char *) de <= top) {
if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
- bh, offset)) {
- brelse(bh);
+ bh, offset))
return -EIO;
- }
- if (ext4_match(namelen, name, de)) {
- brelse(bh);
+ if (ext4_match(namelen, name, de))
return -EEXIST;
- }
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
err = ext4_journal_get_write_access(handle, bh);
if (err) {
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return err;
}
@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
err = ext4_handle_dirty_metadata(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return 0;
}
@@ -1433,7 +1424,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
brelse(bh);
return retval;
}
- EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+ ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data1 = bh2->b_data;
memcpy (data1, de, len);
@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
if (!(de))
return retval;
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ return retval;
}
/*
@@ -1504,7 +1497,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
retval = ext4_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
return retval;
- EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
}
@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if(!bh)
return retval;
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (retval != -ENOSPC)
+ if (retval != -ENOSPC) {
+ brelse(bh);
return retval;
+ }
if (blocks == 1 && !dx_fallback &&
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1523,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ if (retval == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+ return retval;
}
/*
@@ -1561,10 +1560,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto journal_error;
err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC) {
- bh = NULL;
+ if (err != -ENOSPC)
goto cleanup;
- }
/* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1657,7 +1654,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- bh = NULL;
goto cleanup;
journal_error:
@@ -1775,7 +1771,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1809,7 +1805,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1846,7 +1842,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2026,11 +2022,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto out_unlock;
+ /*
+ * Due to previous errors inode may be already a part of on-disk
+ * orphan list. If so skip on-disk list modification.
+ */
+ if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+ goto mem_insert;
/* Insert this inode at the head of the on-disk orphan list... */
NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
err = rc;
@@ -2043,6 +2046,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
*
* This is safe: on error we're going to ignore the orphan list
* anyway on the next recovery. */
+mem_insert:
if (!err)
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
@@ -2102,7 +2106,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (err)
goto out_brelse;
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
} else {
struct ext4_iloc iloc2;
struct inode *i_prev =
@@ -2259,7 +2263,7 @@ static int ext4_symlink(struct inode *dir,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2290,7 +2294,7 @@ retry:
}
} else {
/* clear the extent format for fast symlink */
- EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
inode->i_op = &ext4_fast_symlink_inode_operations;
memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3cfc343c41b5..433ea27e2652 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_bh;
if (IS_ERR(gdb = bclean(handle, sb, block))) {
- err = PTR_ERR(bh);
+ err = PTR_ERR(gdb);
goto exit_bh;
}
ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -930,7 +930,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, input->group);
atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4ca92aab514..f27e045df7a6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -227,6 +227,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
+ vfs_check_frozen(sb, SB_FREEZE_TRANS);
/* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly. */
@@ -603,10 +604,6 @@ static void ext4_put_super(struct super_block *sb)
if (sb->s_dirt)
ext4_commit_super(sb, 1);
- ext4_release_system_zone(sb);
- ext4_mb_release(sb);
- ext4_ext_release(sb);
- ext4_xattr_put_super(sb);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -614,6 +611,12 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, __func__,
"Couldn't clean up the journal");
}
+
+ ext4_release_system_zone(sb);
+ ext4_mb_release(sb);
+ ext4_ext_release(sb);
+ ext4_xattr_put_super(sb);
+
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -700,10 +703,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
ei->cur_aio_dio = NULL;
+ ei->i_sync_tid = 0;
+ ei->i_datasync_tid = 0;
return &ei->vfs_inode;
}
@@ -868,6 +877,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
seq_puts(seq, ",journal_async_commit");
+ else if (test_opt(sb, JOURNAL_CHECKSUM))
+ seq_puts(seq, ",journal_checksum");
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
if (test_opt(sb, I_VERSION))
@@ -899,6 +910,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NO_AUTO_DA_ALLOC))
seq_puts(seq, ",noauto_da_alloc");
+ if (test_opt(sb, DISCARD))
+ seq_puts(seq, ",discard");
+
+ if (test_opt(sb, NOLOAD))
+ seq_puts(seq, ",norecovery");
+
ext4_show_quota_options(seq, sb);
return 0;
@@ -991,7 +1008,9 @@ static const struct dquot_operations ext4_quota_operations = {
.reserve_space = dquot_reserve_space,
.claim_space = dquot_claim_space,
.release_rsv = dquot_release_reserved_space,
+#ifdef CONFIG_QUOTA
.get_reserved_space = ext4_get_reserved_space,
+#endif
.alloc_inode = dquot_alloc_inode,
.free_space = dquot_free_space,
.free_inode = dquot_free_inode,
@@ -1079,7 +1098,8 @@ enum {
Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_block_validity, Opt_noblock_validity,
- Opt_inode_readahead_blks, Opt_journal_ioprio
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
};
static const match_table_t tokens = {
@@ -1104,6 +1124,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_noload, "noload"},
+ {Opt_noload, "norecovery"},
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
@@ -1144,6 +1165,8 @@ static const match_table_t tokens = {
{Opt_auto_da_alloc, "auto_da_alloc=%u"},
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_err, NULL},
};
@@ -1195,6 +1218,11 @@ static int parse_options(char *options, struct super_block *sb,
if (!*p)
continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = 0;
token = match_token(p, tokens, args);
switch (token) {
case Opt_bsd_df:
@@ -1480,10 +1508,11 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, BARRIER);
break;
case Opt_barrier:
- if (match_int(&args[0], &option)) {
- set_opt(sbi->s_mount_opt, BARRIER);
- break;
- }
+ if (args[0].from) {
+ if (match_int(&args[0], &option))
+ return 0;
+ } else
+ option = 1; /* No argument, default to 1 */
if (option)
set_opt(sbi->s_mount_opt, BARRIER);
else
@@ -1556,15 +1585,22 @@ set_qf_format:
set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
break;
case Opt_auto_da_alloc:
- if (match_int(&args[0], &option)) {
- clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
- break;
- }
+ if (args[0].from) {
+ if (match_int(&args[0], &option))
+ return 0;
+ } else
+ option = 1; /* No argument, default to 1 */
if (option)
clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
else
set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
break;
+ case Opt_discard:
+ set_opt(sbi->s_mount_opt, DISCARD);
+ break;
+ case Opt_nodiscard:
+ clear_opt(sbi->s_mount_opt, DISCARD);
+ break;
default:
ext4_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" "
@@ -1673,14 +1709,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
size_t size;
int i;
- if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ if (groups_per_flex < 2) {
sbi->s_log_groups_per_flex = 0;
return 1;
}
- sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
/* We allocate both existing and potentially added groups */
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
@@ -2668,24 +2704,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
- if (!err) {
- err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext4_count_free_inodes(sb));
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb));
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
- }
- if (err) {
- ext4_msg(sb, KERN_ERR, "insufficient memory");
- goto failed_mount3;
- }
-
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
@@ -2805,7 +2823,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
no_journal:
-
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
+ goto failed_mount_wq;
+ }
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -2880,7 +2911,7 @@ no_journal:
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
- "zone (%d)\n", err);
+ "zone (%d)", err);
goto failed_mount4;
}
@@ -2938,6 +2969,10 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount3:
if (sbi->s_flex_groups) {
if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -2945,10 +2980,6 @@ failed_mount3:
else
kfree(sbi->s_flex_groups);
}
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -3365,8 +3396,10 @@ int ext4_force_commit(struct super_block *sb)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal)
+ if (journal) {
+ vfs_check_frozen(sb, SB_FREEZE_TRANS);
ret = ext4_journal_force_commit(journal);
+ }
return ret;
}
@@ -3415,18 +3448,16 @@ static int ext4_freeze(struct super_block *sb)
* the journal.
*/
error = jbd2_journal_flush(journal);
- if (error < 0) {
- out:
- jbd2_journal_unlock_updates(journal);
- return error;
- }
+ if (error < 0)
+ goto out;
/* Journal blocked and flushed, clear needs_recovery flag. */
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
- if (error)
- goto out;
- return 0;
+out:
+ /* we rely on s_frozen to stop further updates */
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ return error;
}
/*
@@ -3443,7 +3474,6 @@ static int ext4_unfreeze(struct super_block *sb)
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
unlock_super(sb);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
return 0;
}
@@ -3668,13 +3698,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
- ext4_free_blocks_count_set(es, buf->f_bfree);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
buf->f_namelen = EXT4_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3978,6 +4006,7 @@ static int __init init_ext4_fs(void)
{
int err;
+ ext4_check_flag_values();
err = init_ext4_system_zone();
if (err)
return err;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fed5b01d7a8d..4de7d0a75c42 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -267,7 +267,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *end;
int error;
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
return -ENODATA;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -393,7 +393,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
void *end;
int error;
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
return 0;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -816,7 +816,7 @@ inserted:
EXT4_I(inode)->i_block_group);
/* non-extent files can't have physical blocks past 2^32 */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
block = ext4_new_meta_blocks(handle, inode,
@@ -824,7 +824,7 @@ inserted:
if (error)
goto cleanup;
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
ea_idebug(inode, "creating block %d", block);
@@ -903,7 +903,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
error = ext4_xattr_check_names(IFIRST(header), is->s.end);
if (error)
return error;
@@ -935,10 +935,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
header = IHDR(inode, ext4_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
- EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
} else {
header->h_magic = cpu_to_le32(0);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
}
return 0;
}
@@ -981,17 +981,21 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (strlen(name) > 255)
return -ERANGE;
down_write(&EXT4_I(inode)->xattr_sem);
- no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_get_inode_loc(inode, &is.iloc);
if (error)
goto cleanup;
- if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto cleanup;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
}
error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1013,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
if (!value) {
if (!is.s.not_found)
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1046,7 +1047,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = ext4_current_time(inode);
if (!value)
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
/*
* The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1061,7 +1062,7 @@ cleanup:
brelse(is.iloc.bh);
brelse(bs.bh);
if (no_expand == 0)
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
up_write(&EXT4_I(inode)->xattr_sem);
return error;
}
@@ -1326,6 +1327,8 @@ retry:
goto cleanup;
kfree(b_entry_name);
kfree(buffer);
+ b_entry_name = NULL;
+ buffer = NULL;
brelse(is->iloc.bh);
kfree(is);
kfree(bs);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b5..72646e2c0f48 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
{
struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
wchar_t *ip, *ext_start, *end, *name_start;
- unsigned char base[9], ext[4], buf[8], *p;
+ unsigned char base[9], ext[4], buf[5], *p;
unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
int chl, chi;
int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
return 0;
}
- i = jiffies & 0xffff;
+ i = jiffies;
sz = (jiffies >> 16) & 0x7;
if (baselen > 2) {
baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
name_res[baselen + 4] = '~';
name_res[baselen + 5] = '1' + sz;
while (1) {
- sprintf(buf, "%04X", i);
+ snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
memcpy(&name_res[baselen], buf, 4);
if (vfat_find_form(dir, name_res) < 0)
break;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2cf93ec40a67..97e01dc0d95f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -618,60 +618,90 @@ static DEFINE_RWLOCK(fasync_lock);
static struct kmem_cache *fasync_cache __read_mostly;
/*
- * fasync_helper() is used by almost all character device drivers
- * to set up the fasync queue. It returns negative on error, 0 if it did
- * no changes and positive if it added/deleted the entry.
+ * Remove a fasync entry. If successfully removed, return
+ * positive and clear the FASYNC flag. If no entry exists,
+ * do nothing and return 0.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ *
+ * We always take the 'filp->f_lock', in since fasync_lock
+ * needs to be irq-safe.
*/
-int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
struct fasync_struct *fa, **fp;
- struct fasync_struct *new = NULL;
int result = 0;
- if (on) {
- new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
- if (!new)
- return -ENOMEM;
+ spin_lock(&filp->f_lock);
+ write_lock_irq(&fasync_lock);
+ for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
+ if (fa->fa_file != filp)
+ continue;
+ *fp = fa->fa_next;
+ kmem_cache_free(fasync_cache, fa);
+ filp->f_flags &= ~FASYNC;
+ result = 1;
+ break;
}
+ write_unlock_irq(&fasync_lock);
+ spin_unlock(&filp->f_lock);
+ return result;
+}
+
+/*
+ * Add a fasync entry. Return negative on error, positive if
+ * added, and zero if did nothing but change an existing one.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ */
+static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+{
+ struct fasync_struct *new, *fa, **fp;
+ int result = 0;
+
+ new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
- /*
- * We need to take f_lock first since it's not an IRQ-safe
- * lock.
- */
spin_lock(&filp->f_lock);
write_lock_irq(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
- if (fa->fa_file == filp) {
- if(on) {
- fa->fa_fd = fd;
- kmem_cache_free(fasync_cache, new);
- } else {
- *fp = fa->fa_next;
- kmem_cache_free(fasync_cache, fa);
- result = 1;
- }
- goto out;
- }
+ if (fa->fa_file != filp)
+ continue;
+ fa->fa_fd = fd;
+ kmem_cache_free(fasync_cache, new);
+ goto out;
}
- if (on) {
- new->magic = FASYNC_MAGIC;
- new->fa_file = filp;
- new->fa_fd = fd;
- new->fa_next = *fapp;
- *fapp = new;
- result = 1;
- }
+ new->magic = FASYNC_MAGIC;
+ new->fa_file = filp;
+ new->fa_fd = fd;
+ new->fa_next = *fapp;
+ *fapp = new;
+ result = 1;
+ filp->f_flags |= FASYNC;
+
out:
- if (on)
- filp->f_flags |= FASYNC;
- else
- filp->f_flags &= ~FASYNC;
write_unlock_irq(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
+/*
+ * fasync_helper() is used by almost all character device drivers
+ * to set up the fasync queue, and for regular files by the file
+ * lease code. It returns negative on error, 0 if it did no changes
+ * and positive if it added/deleted the entry.
+ */
+int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+{
+ if (!on)
+ return fasync_remove_entry(filp, fapp);
+ return fasync_add_entry(fd, filp, fapp);
+}
+
EXPORT_SYMBOL(fasync_helper);
void __kill_fasync(struct fasync_struct *fa, int sig, int band)
diff --git a/fs/file_table.c b/fs/file_table.c
index 8eb44042e009..666c7ce1fc41 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -420,7 +420,9 @@ retry:
continue;
if (!(f->f_mode & FMODE_WRITE))
continue;
+ spin_lock(&f->f_lock);
f->f_mode &= ~FMODE_WRITE;
+ spin_unlock(&f->f_lock);
if (file_check_writeable(f) != 0)
continue;
file_release_write(f);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c4c2af..ef2acd24cc02 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -859,6 +859,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
unsigned long expired;
long nr_pages;
+ /*
+ * When set to zero, disable periodic writeback
+ */
+ if (!dirty_writeback_interval)
+ return 0;
+
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
if (time_before(jiffies, expired))
@@ -954,8 +960,12 @@ int bdi_writeback_task(struct bdi_writeback *wb)
break;
}
- wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
- schedule_timeout_interruptible(wait_jiffies);
+ if (dirty_writeback_interval) {
+ wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout_interruptible(wait_jiffies);
+ } else
+ schedule();
+
try_to_freeze();
}
@@ -1213,6 +1223,23 @@ void writeback_inodes_sb(struct super_block *sb)
EXPORT_SYMBOL(writeback_inodes_sb);
/**
+ * writeback_inodes_sb_if_idle - start writeback if none underway
+ * @sb: the superblock
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_if_idle(struct super_block *sb)
+{
+ if (!writeback_in_progress(sb->s_bdi)) {
+ writeback_inodes_sb(sb);
+ return 1;
+ } else
+ return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+
+/**
* sync_inodes_sb - sync sb inode pages
* @sb: the superblock
*
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..650546f8612d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1158,6 +1158,14 @@ __acquires(&fc->lock)
}
}
+static void end_queued_requests(struct fuse_conn *fc)
+{
+ fc->max_background = UINT_MAX;
+ flush_bg_queue(fc);
+ end_requests(fc, &fc->pending);
+ end_requests(fc, &fc->processing);
+}
+
/*
* Abort all requests.
*
@@ -1184,8 +1192,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
fc->connected = 0;
fc->blocked = 0;
end_io_requests(fc);
- end_requests(fc, &fc->pending);
- end_requests(fc, &fc->processing);
+ end_queued_requests(fc);
wake_up_all(&fc->waitq);
wake_up_all(&fc->blocked_waitq);
kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1200,8 +1207,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)
if (fc) {
spin_lock(&fc->lock);
fc->connected = 0;
- end_requests(fc, &fc->pending);
- end_requests(fc, &fc->processing);
+ fc->blocked = 0;
+ end_queued_requests(fc);
+ wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
fuse_conn_put(fc);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c18913a777ae..a9f5e137f1d3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -828,6 +828,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
if (!page)
break;
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
pagefault_disable();
tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
pagefault_enable();
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3fc4e3ac7d84..2168da121647 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -12,6 +12,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
+#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/gfs2_ondisk.h>
@@ -26,61 +27,6 @@
#include "trans.h"
#include "util.h"
-#define ACL_ACCESS 1
-#define ACL_DEFAULT 0
-
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
- struct gfs2_ea_request *er, int *remove, mode_t *mode)
-{
- struct posix_acl *acl;
- int error;
-
- error = gfs2_acl_validate_remove(ip, access);
- if (error)
- return error;
-
- if (!er->er_data)
- return -EINVAL;
-
- acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (!acl) {
- *remove = 1;
- return 0;
- }
-
- error = posix_acl_valid(acl);
- if (error)
- goto out;
-
- if (access) {
- error = posix_acl_equiv_mode(acl, mode);
- if (!error)
- *remove = 1;
- else if (error > 0)
- error = 0;
- }
-
-out:
- posix_acl_release(acl);
- return error;
-}
-
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
-{
- if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
- return -EOPNOTSUPP;
- if (!is_owner_or_cap(&ip->i_inode))
- return -EPERM;
- if (S_ISLNK(ip->i_inode.i_mode))
- return -EOPNOTSUPP;
- if (!access && !S_ISDIR(ip->i_inode.i_mode))
- return -EACCES;
-
- return 0;
-}
-
static int acl_get(struct gfs2_inode *ip, const char *name,
struct posix_acl **acl, struct gfs2_ea_location *el,
char **datap, unsigned int *lenp)
@@ -277,3 +223,117 @@ out_brelse:
return error;
}
+static int gfs2_acl_type(const char *name)
+{
+ if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
+ return ACL_TYPE_ACCESS;
+ if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
+ return ACL_TYPE_DEFAULT;
+ return -EINVAL;
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ int type;
+
+ type = gfs2_acl_type(name);
+ if (type < 0)
+ return type;
+
+ return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
+}
+
+static int gfs2_set_mode(struct inode *inode, mode_t mode)
+{
+ int error = 0;
+
+ if (mode != inode->i_mode) {
+ struct iattr iattr;
+
+ iattr.ia_valid = ATTR_MODE;
+ iattr.ia_mode = mode;
+
+ error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
+ }
+
+ return error;
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct posix_acl *acl = NULL;
+ int error = 0, type;
+
+ if (!sdp->sd_args.ar_posix_acl)
+ return -EOPNOTSUPP;
+
+ type = gfs2_acl_type(name);
+ if (type < 0)
+ return type;
+ if (flags & XATTR_CREATE)
+ return -EINVAL;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return value ? -EACCES : 0;
+ if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+ return -EPERM;
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!value)
+ goto set_acl;
+
+ acl = posix_acl_from_xattr(value, size);
+ if (!acl) {
+ /*
+ * acl_set_file(3) may request that we set default ACLs with
+ * zero length -- defend (gracefully) against that here.
+ */
+ goto out;
+ }
+ if (IS_ERR(acl)) {
+ error = PTR_ERR(acl);
+ goto out;
+ }
+
+ error = posix_acl_valid(acl);
+ if (error)
+ goto out_release;
+
+ error = -EINVAL;
+ if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
+ goto out_release;
+
+ if (type == ACL_TYPE_ACCESS) {
+ mode_t mode = inode->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+
+ if (error <= 0) {
+ posix_acl_release(acl);
+ acl = NULL;
+
+ if (error < 0)
+ return error;
+ }
+
+ error = gfs2_set_mode(inode, mode);
+ if (error)
+ goto out_release;
+ }
+
+set_acl:
+ error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0);
+out_release:
+ posix_acl_release(acl);
+out:
+ return error;
+}
+
+struct xattr_handler gfs2_xattr_system_handler = {
+ .prefix = XATTR_SYSTEM_PREFIX,
+ .get = gfs2_xattr_system_get,
+ .set = gfs2_xattr_system_set,
+};
+
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 6751930bfb64..cc954390b6da 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,26 +13,12 @@
#include "incore.h"
#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
-#define GFS2_POSIX_ACL_ACCESS_LEN 16
#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
-#define GFS2_POSIX_ACL_DEFAULT_LEN 17
+#define GFS2_ACL_MAX_ENTRIES 25
-#define GFS2_ACL_IS_ACCESS(name, len) \
- ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
- !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
-
-#define GFS2_ACL_IS_DEFAULT(name, len) \
- ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
- !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
-
-struct gfs2_ea_request;
-
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
- struct gfs2_ea_request *er,
- int *remove, mode_t *mode);
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
-int gfs2_check_acl(struct inode *inode, int mask);
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
-int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern int gfs2_check_acl(struct inode *inode, int mask);
+extern int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
+extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern struct xattr_handler gfs2_xattr_system_handler;
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 297d7e5cebad..0bb312961c92 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
unsigned totlen = be16_to_cpu(dent->de_rec_len);
if (gfs2_dirent_sentinel(dent))
- actual = GFS2_DIRENT_SIZE(0);
+ actual = 0;
if (totlen - actual >= required)
return 1;
return 0;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4eb308aa3234..b3fd1d886946 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
if (error)
goto out_drop_write;
+ error = -EACCES;
+ if (!is_owner_or_cap(inode))
+ goto out;
+
+ error = 0;
flags = ip->i_diskflags;
new_flags = (flags & ~mask) | (reqflags & mask);
if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
{
struct inode *inode = filp->f_path.dentry->d_inode;
u32 fsflags, gfsflags;
+
if (get_user(fsflags, ptr))
return -EFAULT;
+
gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
if (!S_ISDIR(inode->i_mode)) {
if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -606,7 +613,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(&ip->i_inode))
+ if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
if (cmd == F_CANCELLK) {
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8a0f8ef6ee27..6b803540951e 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1507,18 +1507,6 @@ static int gfs2_xattr_user_set(struct inode *inode, const char *name,
return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
}
-static int gfs2_xattr_system_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
-{
- return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
-}
-
-static int gfs2_xattr_system_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
-}
-
static int gfs2_xattr_security_get(struct inode *inode, const char *name,
void *buffer, size_t size)
{
@@ -1543,12 +1531,6 @@ static struct xattr_handler gfs2_xattr_security_handler = {
.set = gfs2_xattr_security_set,
};
-static struct xattr_handler gfs2_xattr_system_handler = {
- .prefix = XATTR_SYSTEM_PREFIX,
- .get = gfs2_xattr_system_get,
- .set = gfs2_xattr_system_set,
-};
-
struct xattr_handler *gfs2_xattr_handlers[] = {
&gfs2_xattr_user_handler,
&gfs2_xattr_security_handler,
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 6d98f116ca03..424b0337f524 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -289,6 +289,10 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
err = hfs_brec_find(&src_fd);
if (err)
goto out;
+ if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
+ err = -EIO;
+ goto out;
+ }
hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
src_fd.entrylength);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 7c69b98a2e45..2b3b8611b41b 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -79,6 +79,11 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
filp->f_pos++;
/* fall through */
case 1:
+ if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
+ err = -EIO;
+ goto out;
+ }
+
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
if (entry.type != HFS_CDR_THD) {
printk(KERN_ERR "hfs: bad catalog folder thread\n");
@@ -109,6 +114,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = -EIO;
goto out;
}
+
+ if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
+ err = -EIO;
+ goto out;
+ }
+
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
type = entry.type;
len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index f7fcbe49da72..5ed7252b7b23 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -409,8 +409,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
/* try to get the root inode */
hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
- if (!res)
+ if (!res) {
+ if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
+ res = -EIO;
+ goto bail;
+ }
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
+ }
if (res) {
hfs_find_exit(&fd);
goto bail_no_root;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 4160afad6d00..bd224eec9b07 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1913,7 +1913,7 @@ static void __init jbd_create_debugfs_entry(void)
{
jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
if (jbd_debugfs_dir)
- jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO,
+ jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
jbd_debugfs_dir,
&journal_enable_debug);
}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index ca0f5eb62b20..886849370950 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -22,6 +22,7 @@
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include <trace/events/jbd2.h>
/*
@@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
journal->j_tail_sequence = first_tid;
journal->j_tail = blocknr;
spin_unlock(&journal->j_state_lock);
+
+ /*
+ * If there is an external journal, we need to make sure that
+ * any data blocks that were recently written out --- perhaps
+ * by jbd2_log_do_checkpoint() --- are flushed out before we
+ * drop the transactions from the external journal. It's
+ * unlikely this will be necessary, especially with a
+ * appropriately sized journal, but we need this to guarantee
+ * correctness. Fortunately jbd2_cleanup_journal_tail()
+ * doesn't get called all that often.
+ */
+ if ((journal->j_fs_dev != journal->j_dev) &&
+ (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(journal->j_fs_dev, NULL);
if (!(journal->j_flags & JBD2_ABORT))
jbd2_journal_update_superblock(journal, 1);
return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d2779e..09ab6ac6a075 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal,
ret = err;
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
+ commit_transaction->t_flushed_data_blocks = 1;
jinode->i_flags &= ~JI_COMMIT_RUNNING;
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
@@ -636,6 +637,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &new_jh, blocknr);
+ if (flags < 0) {
+ jbd2_journal_abort(journal, flags);
+ continue;
+ }
set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
wbuf[bufs++] = jh2bh(new_jh);
@@ -704,8 +709,17 @@ start_journal_io:
}
}
- /* Done it all: now write the commit record asynchronously. */
+ /*
+ * If the journal is not located on the file system device,
+ * then we must flush the file system device before we issue
+ * the commit record
+ */
+ if (commit_transaction->t_flushed_data_blocks &&
+ (journal->j_fs_dev != journal->j_dev) &&
+ (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(journal->j_fs_dev, NULL);
+ /* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
@@ -716,13 +730,6 @@ start_journal_io:
blkdev_issue_flush(journal->j_dev, NULL);
}
- /*
- * This is the right place to wait for data buffers both for ASYNC
- * and !ASYNC commit. If commit is ASYNC, we need to wait only after
- * the commit block went to disk (which happens above). If commit is
- * SYNC, we need to wait for data buffers before we start writing
- * commit block, which happens below in such setting.
- */
err = journal_finish_inode_data_buffers(journal, commit_transaction);
if (err) {
printk(KERN_WARNING
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fed85388ee86..17af879e6e9e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -358,6 +359,10 @@ repeat:
jbd_unlock_bh_state(bh_in);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
+ if (!tmp) {
+ jbd2_journal_put_journal_head(new_jh);
+ return -ENOMEM;
+ }
jbd_lock_bh_state(bh_in);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
@@ -1248,6 +1253,13 @@ int jbd2_journal_load(journal_t *journal)
if (jbd2_journal_recover(journal))
goto recovery_error;
+ if (journal->j_failed_commit) {
+ printk(KERN_ERR "JBD2: journal transaction %u on %s "
+ "is corrupt.\n", journal->j_failed_commit,
+ journal->j_devname);
+ return -EIO;
+ }
+
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
@@ -2103,7 +2115,8 @@ static void __init jbd2_create_debugfs_entry(void)
{
jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
if (jbd2_debugfs_dir)
- jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO,
+ jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME,
+ S_IRUGO | S_IWUSR,
jbd2_debugfs_dir,
&jbd2_journal_enable_debug);
}
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 090c556ffed2..3b6f2fa12cff 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -700,7 +700,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
struct jffs2_raw_inode ri;
struct jffs2_node_frag *last_frag;
union jffs2_device_node dev;
- char *mdata = NULL, mdatalen = 0;
+ char *mdata = NULL;
+ int mdatalen = 0;
uint32_t alloclen, ilen;
int ret;
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
struct inode *iplist[1];
struct jfs_superblock *j_sb, *j_sb2;
uint old_agsize;
+ int agsizechanged = 0;
struct buffer_head *bh, *bh2;
/* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
*/
if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
goto error_out;
+
+ agsizechanged |= (bmp->db_agsize != old_agsize);
+
/*
* the map now has extended to cover additional nblocks:
* dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
* will correctly identify the new ag);
*/
/* if new AG size the same as old AG size, done! */
- if (bmp->db_agsize != old_agsize) {
+ if (agsizechanged) {
if ((rc = diExtendFS(ipimap, ipbmap)))
goto error_out;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad364548bc9..8b0da9b10802 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -85,46 +85,25 @@ struct ea_buffer {
#define EA_MALLOC 0x0008
+static int is_known_namespace(const char *name)
+{
+ if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
+ strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+ strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+ strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+ return false;
+
+ return true;
+}
+
/*
* These three routines are used to recognize on-disk extended attributes
* that are in a recognized namespace. If the attribute is not recognized,
* "os2." is prepended to the name
*/
-static inline int is_os2_xattr(struct jfs_ea *ea)
+static int is_os2_xattr(struct jfs_ea *ea)
{
- /*
- * Check for "system."
- */
- if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) &&
- !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return false;
- /*
- * Check for "user."
- */
- if ((ea->namelen >= XATTR_USER_PREFIX_LEN) &&
- !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return false;
- /*
- * Check for "security."
- */
- if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) &&
- !strncmp(ea->name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN))
- return false;
- /*
- * Check for "trusted."
- */
- if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) &&
- !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
- return false;
- /*
- * Add any other valid namespace prefixes here
- */
-
- /*
- * We assume it's OS/2's flat namespace
- */
- return true;
+ return !is_known_namespace(ea->name);
}
static inline int name_size(struct jfs_ea *ea)
@@ -762,13 +741,23 @@ static int can_set_xattr(struct inode *inode, const char *name,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return can_set_system_xattr(inode, name, value, value_len);
+ if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
+ /*
+ * This makes sure that we aren't trying to set an
+ * attribute in a different namespace by prefixing it
+ * with "os2."
+ */
+ if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
+ return -EOPNOTSUPP;
+ return 0;
+ }
+
/*
* Don't allow setting an attribute in an unknown namespace.
*/
if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
- strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
- strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))
+ strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
return -EOPNOTSUPP;
return 0;
@@ -950,19 +939,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
int xattr_size;
ssize_t size;
int namelen = strlen(name);
- char *os2name = NULL;
char *value;
- if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
- os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
- GFP_KERNEL);
- if (!os2name)
- return -ENOMEM;
- strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
- name = os2name;
- namelen -= XATTR_OS2_PREFIX_LEN;
- }
-
down_read(&JFS_IP(inode)->xattr_sem);
xattr_size = ea_get(inode, &ea_buf, 0);
@@ -1000,8 +978,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
out:
up_read(&JFS_IP(inode)->xattr_sem);
- kfree(os2name);
-
return size;
}
@@ -1010,6 +986,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
{
int err;
+ if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
+ /*
+ * skip past "os2." prefix
+ */
+ name += XATTR_OS2_PREFIX_LEN;
+ /*
+ * Don't allow retrieving properly prefixed attributes
+ * by prepending them with "os2."
+ */
+ if (is_known_namespace(name))
+ return -EOPNOTSUPP;
+ }
+
err = __jfs_getxattr(dentry->d_inode, name, data, buf_size);
return err;
diff --git a/fs/libfs.c b/fs/libfs.c
index 219576c52d80..ba36e93764c2 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -415,7 +415,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
* unique inode values later for this filesystem, then you must take care
* to pass it an appropriate max_reserved value to avoid collisions.
*/
-int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+int simple_fill_super(struct super_block *s, unsigned long magic,
+ struct tree_descr *files)
{
struct inode *inode;
struct dentry *root;
diff --git a/fs/namei.c b/fs/namei.c
index d11f404667e9..b0afbd427be7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -234,6 +234,7 @@ int generic_permission(struct inode *inode, int mask,
/*
* Searching includes executable on directories, else just read.
*/
+ mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
if (capable(CAP_DAC_READ_SEARCH))
return 0;
@@ -828,6 +829,17 @@ fail:
}
/*
+ * This is a temporary kludge to deal with "automount" symlinks; proper
+ * solution is to trigger them on follow_mount(), so that do_lookup()
+ * would DTRT. To be killed before 2.6.34-final.
+ */
+static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
+{
+ return inode && unlikely(inode->i_op->follow_link) &&
+ ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
+}
+
+/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
* the final dentry. We expect 'base' to be positive and a directory.
@@ -963,8 +975,7 @@ last_component:
if (err)
break;
inode = next.dentry->d_inode;
- if ((lookup_flags & LOOKUP_FOLLOW)
- && inode && inode->i_op->follow_link) {
+ if (follow_on_final(inode, lookup_flags)) {
err = do_follow_link(&next, nd);
if (err)
goto return_err;
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4fd222..2beb0fbe70ec 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1119,8 +1119,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
struct path path;
int retval;
+ int lookup_flags = 0;
- retval = user_path(name, &path);
+ if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
+ return -EINVAL;
+
+ if (!(flags & UMOUNT_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+
+ retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
retval = -EINVAL;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 99ea196f071f..19cbbf763110 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -273,7 +273,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
sin1->sin6_scope_id != sin2->sin6_scope_id)
return 0;
- return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
+ return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
}
#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -965,6 +965,8 @@ out_error:
static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
{
target->flags = source->flags;
+ target->rsize = source->rsize;
+ target->wsize = source->wsize;
target->acregmin = source->acregmin;
target->acregmax = source->acregmax;
target->acdirmin = source->acdirmin;
@@ -1283,7 +1285,8 @@ static int nfs4_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
- server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
+ NFS_CAP_POSIX_LOCK;
server->options = data->options;
/* Get a client record */
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 09f383795174..7f237d243be5 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -68,4 +68,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
}
#endif
+static inline int nfs_have_delegated_attributes(struct inode *inode)
+{
+ return nfs_have_delegation(inode, FMODE_READ) &&
+ !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+}
+
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7cb298525eef..a87cbd8bd0b9 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -837,6 +837,8 @@ out_zap_parent:
/* If we have submounts, don't unhash ! */
if (have_submounts(dentry))
goto out_valid;
+ if (dentry->d_flags & DCACHE_DISCONNECTED)
+ goto out_valid;
shrink_dcache_parent(dentry);
}
d_drop(dentry);
@@ -1025,12 +1027,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
res = NULL;
goto out;
/* This turned out not to be a regular file */
+ case -EISDIR:
case -ENOTDIR:
goto no_open;
case -ELOOP:
if (!(nd->intent.open.flags & O_NOFOLLOW))
goto no_open;
- /* case -EISDIR: */
/* case -EINVAL: */
default:
goto out;
@@ -1797,7 +1799,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
cache = nfs_access_search_rbtree(inode, cred);
if (cache == NULL)
goto out;
- if (!nfs_have_delegation(inode, FMODE_READ) &&
+ if (!nfs_have_delegated_attributes(inode) &&
!time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
goto out_stale;
res->jiffies = cache->jiffies;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e1d415e97849..0d289823e856 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -342,6 +342,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
data->res.fattr = &data->fattr;
data->res.eof = 0;
data->res.count = bytes;
+ nfs_fattr_init(&data->fattr);
msg.rpc_argp = &data->args;
msg.rpc_resp = &data->res;
@@ -575,6 +576,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
data->res.count = 0;
data->res.fattr = &data->fattr;
data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
NFS_PROTO(data->inode)->commit_setup(data, &msg);
@@ -766,6 +768,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
data->res.fattr = &data->fattr;
data->res.count = bytes;
data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
task_setup_data.task = &data->task;
task_setup_data.callback_data = data;
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index f4d54ba97cc6..c1fd68bf337d 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -36,6 +36,19 @@ struct nfs_dns_ent {
};
+static void nfs_dns_ent_update(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ memcpy(&new->addr, &key->addr, key->addrlen);
+ new->addrlen = key->addrlen;
+}
+
static void nfs_dns_ent_init(struct cache_head *cnew,
struct cache_head *ckey)
{
@@ -49,8 +62,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
if (new->hostname) {
new->namelen = key->namelen;
- memcpy(&new->addr, &key->addr, key->addrlen);
- new->addrlen = key->addrlen;
+ nfs_dns_ent_update(cnew, ckey);
} else {
new->namelen = 0;
new->addrlen = 0;
@@ -234,7 +246,7 @@ static struct cache_detail nfs_dns_resolve = {
.cache_show = nfs_dns_show,
.match = nfs_dns_match,
.init = nfs_dns_ent_init,
- .update = nfs_dns_ent_init,
+ .update = nfs_dns_ent_update,
.alloc = nfs_dns_ent_alloc,
};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f5fdd39e037a..9f83d9fe9a61 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -27,6 +27,8 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/aio.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -484,8 +486,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
*/
static int nfs_release_page(struct page *page, gfp_t gfp)
{
+ struct address_space *mapping = page->mapping;
+
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
+ /* Only do I/O if gfp is a superset of GFP_KERNEL */
+ if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) {
+ int how = FLUSH_SYNC;
+
+ /* Don't let kswapd deadlock waiting for OOM RPC calls */
+ if (current_is_kswapd())
+ how = 0;
+ nfs_commit_inode(mapping->host, how);
+ }
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
return 0;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index fa588006588d..237874f1af23 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -354,12 +354,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
*/
int nfs_fscache_release_page(struct page *page, gfp_t gfp)
{
- struct nfs_inode *nfsi = NFS_I(page->mapping->host);
- struct fscache_cookie *cookie = nfsi->fscache;
-
- BUG_ON(!cookie);
-
if (PageFsCache(page)) {
+ struct nfs_inode *nfsi = NFS_I(page->mapping->host);
+ struct fscache_cookie *cookie = nfsi->fscache;
+
+ BUG_ON(!cookie);
dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
cookie, page, nfsi);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa091865ad0..3c80474a2651 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -759,7 +759,7 @@ int nfs_attribute_timeout(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
- if (nfs_have_delegation(inode, FMODE_READ))
+ if (nfs_have_delegated_attributes(inode))
return 0;
return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0adefc40cc89..59047f8d7d72 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -120,7 +120,7 @@ static struct {
{ .status = MNT3ERR_INVAL, .errno = -EINVAL, },
{ .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
{ .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
- { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, },
+ { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, },
};
struct mountres {
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5e078b222b4e..7bc2da8efd4a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -699,7 +699,7 @@ static struct {
{ NFSERR_BAD_COOKIE, -EBADCOOKIE },
{ NFSERR_NOTSUPP, -ENOTSUPP },
{ NFSERR_TOOSMALL, -ETOOSMALL },
- { NFSERR_SERVERFAULT, -ESERVERFAULT },
+ { NFSERR_SERVERFAULT, -EREMOTEIO },
{ NFSERR_BADTYPE, -EBADTYPE },
{ NFSERR_JUKEBOX, -EJUKEBOX },
{ -1, -EIO }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 6ea07a3c75d4..b4a6b1a586d6 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -141,6 +141,7 @@ enum {
NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
+ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
};
struct nfs4_state {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 741a562177fc..3c7581bcdb0d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1439,6 +1439,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
nfs_post_op_update_inode(dir, o_res->dir_attr);
} else
nfs_refresh_inode(dir, o_res->dir_attr);
+ if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
+ server->caps &= ~NFS_CAP_POSIX_LOCK;
if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
status = _nfs4_proc_open_confirm(data);
if (status != 0)
@@ -1573,6 +1575,8 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
status = PTR_ERR(state);
if (IS_ERR(state))
goto err_opendata_put;
+ if (server->caps & NFS_CAP_POSIX_LOCK)
+ set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
*res = state;
@@ -3976,6 +3980,22 @@ static const struct rpc_call_ops nfs4_lock_ops = {
.rpc_release = nfs4_lock_release,
};
+static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state *state = lsp->ls_state;
+
+ switch (error) {
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_EXPIRED:
+ if (new_lock_owner != 0 ||
+ (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+ nfs4_state_mark_reclaim_nograce(clp, state);
+ lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+ };
+}
+
static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim)
{
struct nfs4_lockdata *data;
@@ -4011,6 +4031,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
ret = nfs4_wait_for_completion_rpc_task(task);
if (ret == 0) {
ret = data->rpc_status;
+ if (ret)
+ nfs4_handle_setlk_error(data->server, data->lsp,
+ data->arg.new_lock_owner, ret);
} else
data->cancelled = 1;
rpc_put_task(task);
@@ -4060,8 +4083,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
{
struct nfs_inode *nfsi = NFS_I(state->inode);
unsigned char fl_flags = request->fl_flags;
- int status;
+ int status = -ENOLCK;
+ if ((fl_flags & FL_POSIX) &&
+ !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+ goto out;
/* Is this a delegated open? */
status = nfs4_set_lock_state(state, request);
if (status != 0)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 20b4e30e6c82..e81b2bf67c42 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -840,8 +840,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
*p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
- *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
+ *p++ = cpu_to_be32(iap->ia_atime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
}
else if (iap->ia_valid & ATTR_ATIME) {
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -2096,7 +2096,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
encode_compound_hdr(&xdr, req, &hdr);
encode_sequence(&xdr, &args->seq_args, &hdr);
encode_putfh(&xdr, args->fh, &hdr);
- replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1;
+ replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
@@ -4554,7 +4554,7 @@ static int decode_sequence(struct xdr_stream *xdr,
* If the server returns different values for sessionID, slotID or
* sequence number, the server is looney tunes.
*/
- status = -ESERVERFAULT;
+ status = -EREMOTEIO;
if (memcmp(id.data, res->sr_session->sess_id.data,
NFS4_MAX_SESSIONID_LEN)) {
@@ -5678,7 +5678,7 @@ static struct {
{ NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
{ NFS4ERR_NOTSUPP, -ENOTSUPP },
{ NFS4ERR_TOOSMALL, -ETOOSMALL },
- { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
+ { NFS4ERR_SERVERFAULT, -EREMOTEIO },
{ NFS4ERR_BADTYPE, -EBADTYPE },
{ NFS4ERR_LOCKED, -EAGAIN },
{ NFS4ERR_SYMLINK, -ELOOP },
@@ -5705,7 +5705,7 @@ nfs4_stat_to_errno(int stat)
}
if (stat <= 10000 || stat > 10100) {
/* The server is looney tunes. */
- return -ESERVERFAULT;
+ return -EREMOTEIO;
}
/* If we cannot translate the error, the recovery routines should
* handle it.
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e2975939126a..29d9d36cd5f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
*/
int nfs_set_page_tag_locked(struct nfs_page *req)
{
- struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
-
if (!nfs_lock_request_dontget(req))
return 0;
if (req->wb_page != NULL)
- radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+ radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
return 1;
}
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
*/
void nfs_clear_page_tag_locked(struct nfs_page *req)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
- struct nfs_inode *nfsi = NFS_I(inode);
-
if (req->wb_page != NULL) {
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
spin_lock(&inode->i_lock);
radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
* nfs_clear_request - Free up all resources allocated to the request
* @req:
*
- * Release page resources associated with a write request after it
- * has completed.
+ * Release page and open context resources associated with a read/write
+ * request after it has completed.
*/
void nfs_clear_request(struct nfs_page *req)
{
struct page *page = req->wb_page;
+ struct nfs_open_context *ctx = req->wb_context;
+
if (page != NULL) {
page_cache_release(page);
req->wb_page = NULL;
}
+ if (ctx != NULL) {
+ put_nfs_open_context(ctx);
+ req->wb_context = NULL;
+ }
}
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
{
struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
- /* Release struct file or cached credential */
+ /* Release struct file and open context */
nfs_clear_request(req);
- put_nfs_open_context(req->wb_context);
nfs_page_free(req);
}
@@ -176,6 +179,12 @@ void nfs_release_request(struct nfs_page *req)
kref_put(&req->wb_kref, nfs_free_request);
}
+static int nfs_wait_bit_uninterruptible(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
/**
* nfs_wait_on_request - Wait for a request to complete.
* @req: request to wait upon.
@@ -186,14 +195,9 @@ void nfs_release_request(struct nfs_page *req)
int
nfs_wait_on_request(struct nfs_page *req)
{
- int ret = 0;
-
- if (!test_bit(PG_BUSY, &req->wb_flags))
- goto out;
- ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
- nfs_wait_bit_killable, TASK_KILLABLE);
-out:
- return ret;
+ return wait_on_bit(&req->wb_flags, PG_BUSY,
+ nfs_wait_bit_uninterruptible,
+ TASK_UNINTERRUPTIBLE);
}
/**
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 90be551b80c1..c0173a8531e2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -241,6 +241,7 @@ static int nfs_show_stats(struct seq_file *, struct vfsmount *);
static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
static int nfs_xdev_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static void nfs_put_super(struct super_block *);
static void nfs_kill_super(struct super_block *);
static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -264,6 +265,7 @@ static const struct super_operations nfs_sops = {
.alloc_inode = nfs_alloc_inode,
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs_write_inode,
+ .put_super = nfs_put_super,
.statfs = nfs_statfs,
.clear_inode = nfs_clear_inode,
.umount_begin = nfs_umount_begin,
@@ -333,6 +335,7 @@ static const struct super_operations nfs4_sops = {
.alloc_inode = nfs_alloc_inode,
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs_write_inode,
+ .put_super = nfs_put_super,
.statfs = nfs_statfs,
.clear_inode = nfs4_clear_inode,
.umount_begin = nfs_umount_begin,
@@ -531,6 +534,22 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
}
}
+#ifdef CONFIG_NFS_V4
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ struct nfs_client *clp = nfss->nfs_client;
+
+ seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
+ seq_printf(m, ",minorversion=%u", clp->cl_minorversion);
+}
+#else
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+}
+#endif
+
/*
* Describe the mount options in force on this server representation
*/
@@ -592,13 +611,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (version != 4)
nfs_show_mountd_options(m, nfss, showdefaults);
+ else
+ nfs_show_nfsv4_options(m, nfss, showdefaults);
-#ifdef CONFIG_NFS_V4
- if (clp->rpc_ops->version == 4)
- seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
-#endif
if (nfss->options & NFS_OPTION_FSCACHE)
seq_printf(m, ",fsc");
+
+ if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+ if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+ seq_printf(m, ",lookupcache=none");
+ else
+ seq_printf(m, ",lookupcache=pos");
+ }
}
/*
@@ -734,8 +758,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data) {
- data->rsize = NFS_MAX_FILE_IO_SIZE;
- data->wsize = NFS_MAX_FILE_IO_SIZE;
data->acregmin = NFS_DEF_ACREGMIN;
data->acregmax = NFS_DEF_ACREGMAX;
data->acdirmin = NFS_DEF_ACDIRMIN;
@@ -2198,6 +2220,17 @@ error_splat_super:
}
/*
+ * Ensure that we unregister the bdi before kill_anon_super
+ * releases the device name
+ */
+static void nfs_put_super(struct super_block *s)
+{
+ struct nfs_server *server = NFS_SB(s);
+
+ bdi_unregister(&server->backing_dev_info);
+}
+
+/*
* Destroy an NFS2/3 superblock
*/
static void nfs_kill_super(struct super_block *s)
@@ -2205,7 +2238,6 @@ static void nfs_kill_super(struct super_block *s)
struct nfs_server *server = NFS_SB(s);
kill_anon_super(s);
- bdi_unregister(&server->backing_dev_info);
nfs_fscache_release_super_cookie(s);
nfs_free_server(server);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b50..cf6c06f66edc 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1542,6 +1542,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
break;
}
ret = nfs_wait_on_request(req);
+ nfs_release_request(req);
if (ret < 0)
goto out;
}
@@ -1612,15 +1613,16 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
if (ret)
goto out_unlock;
page_cache_get(newpage);
+ spin_lock(&mapping->host->i_lock);
req->wb_page = newpage;
SetPagePrivate(newpage);
- set_page_private(newpage, page_private(page));
+ set_page_private(newpage, (unsigned long)req);
ClearPagePrivate(page);
set_page_private(page, 0);
+ spin_unlock(&mapping->host->i_lock);
page_cache_release(page);
out_unlock:
nfs_clear_page_tag_locked(req);
- nfs_release_request(req);
out:
return ret;
}
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 725d02f210e2..6d9c6aabc85e 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -389,7 +389,7 @@ sort_pacl(struct posix_acl *pacl)
sort_pacl_range(pacl, 1, i-1);
BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ);
- j = i++;
+ j = ++i;
while (pacl->a_entries[j].e_tag == ACL_GROUP)
j++;
sort_pacl_range(pacl, i, j-1);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2153f9bdbebd..6ad6282e3076 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2002,7 +2002,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
{
if (share_access & NFS4_SHARE_ACCESS_WRITE) {
drop_file_write_access(filp);
+ spin_lock(&filp->f_lock);
filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
+ spin_unlock(&filp->f_lock);
}
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0fbd50cee1f6..12f62ff353d6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -168,10 +168,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
argp->p = page_address(argp->pagelist[0]);
argp->pagelist++;
if (argp->pagelen < PAGE_SIZE) {
- argp->end = p + (argp->pagelen>>2);
+ argp->end = argp->p + (argp->pagelen>>2);
argp->pagelen = 0;
} else {
- argp->end = p + (PAGE_SIZE>>2);
+ argp->end = argp->p + (PAGE_SIZE>>2);
argp->pagelen -= PAGE_SIZE;
}
memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
@@ -1433,10 +1433,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
argp->p = page_address(argp->pagelist[0]);
argp->pagelist++;
if (argp->pagelen < PAGE_SIZE) {
- argp->end = p + (argp->pagelen>>2);
+ argp->end = argp->p + (argp->pagelen>>2);
argp->pagelen = 0;
} else {
- argp->end = p + (PAGE_SIZE>>2);
+ argp->end = argp->p + (PAGE_SIZE>>2);
argp->pagelen -= PAGE_SIZE;
}
}
@@ -2129,9 +2129,15 @@ out_acl:
* and this is the root of a cross-mounted filesystem.
*/
if (ignore_crossmnt == 0 &&
- exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) {
- err = vfs_getattr(exp->ex_path.mnt->mnt_parent,
- exp->ex_path.mnt->mnt_mountpoint, &stat);
+ dentry == exp->ex_path.mnt->mnt_root) {
+ struct path path = exp->ex_path;
+ path_get(&path);
+ while (follow_up(&path)) {
+ if (path.dentry != path.mnt->mnt_root)
+ break;
+ }
+ err = vfs_getattr(path.mnt, path.dentry, &stat);
+ path_put(&path);
if (err)
goto out_nfserr;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 67ea83eedd43..4d4e2d00d15e 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -136,7 +136,7 @@ u32 nfsd_supported_minorversion;
int nfsd_vers(int vers, enum vers_op change)
{
if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
- return -1;
+ return 0;
switch(change) {
case NFSD_SET:
nfsd_versions[vers] = nfsd_version[vers];
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a293f0273263..570dd1c103d5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -774,12 +774,9 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
int (*fsync) (struct file *, struct dentry *, int);
int err;
- err = filemap_fdatawrite(inode->i_mapping);
+ err = filemap_write_and_wait(inode->i_mapping);
if (err == 0 && fop && (fsync = fop->fsync))
err = fsync(filp, dp, 0);
- if (err == 0)
- err = filemap_fdatawait(inode->i_mapping);
-
return err;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 644e66727dd0..63e7b108ff83 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -781,6 +781,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
sb->s_export_op = &nilfs_export_ops;
sb->s_root = NULL;
sb->s_time_gran = 1;
+ sb->s_bdi = nilfs->ns_bdi;
if (!nilfs_loaded(nilfs)) {
err = load_nilfs(nilfs, sbi);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index c9ee67b442e1..5d3d2a782abc 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -28,6 +28,7 @@
#include <linux/path.h> /* struct path */
#include <linux/slab.h> /* kmem_* */
#include <linux/types.h>
+#include <linux/sched.h>
#include "inotify.h"
@@ -71,6 +72,9 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
ret = 0;
}
+ if (entry->mask & IN_ONESHOT)
+ fsnotify_destroy_mark_by_entry(entry);
+
/*
* If we hold the entry until after the event is on the queue
* IN_IGNORED won't be able to pass this event in the queue
@@ -121,7 +125,7 @@ static int idr_callback(int id, void *p, void *data)
if (warned)
return 0;
- warned = false;
+ warned = true;
entry = p;
ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
@@ -146,6 +150,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
idr_for_each(&group->inotify_data.idr, idr_callback, group);
idr_remove_all(&group->inotify_data.idr);
idr_destroy(&group->inotify_data.idr);
+ free_uid(group->inotify_data.user);
}
void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dcd2040d330c..aef8f5d7f94f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -106,8 +106,11 @@ static inline __u32 inotify_arg_to_mask(u32 arg)
{
__u32 mask;
- /* everything should accept their own ignored and cares about children */
- mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
+ /*
+ * everything should accept their own ignored, cares about children,
+ * and should receive events when the inode is unmounted
+ */
+ mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
/* mask off the flags used to open the fd */
mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
@@ -556,21 +559,24 @@ retry:
if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
goto out_err;
+ /* we are putting the mark on the idr, take a reference */
+ fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
spin_lock(&group->inotify_data.idr_lock);
ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
- group->inotify_data.last_wd,
+ group->inotify_data.last_wd+1,
&tmp_ientry->wd);
spin_unlock(&group->inotify_data.idr_lock);
if (ret) {
+ /* we didn't get on the idr, drop the idr reference */
+ fsnotify_put_mark(&tmp_ientry->fsn_entry);
+
/* idr was out of memory allocate and try again */
if (ret == -EAGAIN)
goto retry;
goto out_err;
}
- /* we put the mark on the idr, take a reference */
- fsnotify_get_mark(&tmp_ientry->fsn_entry);
-
/* we are on the idr, now get on the inode */
ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
if (ret) {
@@ -638,7 +644,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
- group->inotify_data.last_wd = 1;
+ group->inotify_data.last_wd = 0;
group->inotify_data.user = user;
group->inotify_data.fa = NULL;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index fbeaec762103..c8288df57120 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -30,6 +30,8 @@
#include "alloc.h"
#include "dlmglue.h"
#include "file.h"
+#include "inode.h"
+#include "journal.h"
#include "ocfs2_fs.h"
#include "xattr.h"
@@ -170,6 +172,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
}
/*
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create it's own.
+ */
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+ handle_t *handle, umode_t new_mode)
+{
+ int ret, commit_handle = 0;
+ struct ocfs2_dinode *di;
+
+ if (di_bh == NULL) {
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else
+ get_bh(di_bh);
+
+ if (handle == NULL) {
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_brelse;
+ }
+
+ commit_handle = 1;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ inode->i_mode = new_mode;
+ di->i_mode = cpu_to_le16(inode->i_mode);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ if (commit_handle)
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
+ brelse(di_bh);
+out:
+ return ret;
+}
+
+/*
* Set the access or default ACL of an inode.
*/
static int ocfs2_set_acl(handle_t *handle,
@@ -197,9 +253,14 @@ static int ocfs2_set_acl(handle_t *handle,
if (ret < 0)
return ret;
else {
- inode->i_mode = mode;
if (ret == 0)
acl = NULL;
+
+ ret = ocfs2_acl_set_mode(inode, di_bh,
+ handle, mode);
+ if (ret)
+ return ret;
+
}
}
break;
@@ -232,12 +293,30 @@ static int ocfs2_set_acl(handle_t *handle,
int ocfs2_check_acl(struct inode *inode, int mask)
{
- struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ struct posix_acl *acl;
+ int ret = -EAGAIN;
- if (IS_ERR(acl))
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return ret;
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
+
+ brelse(di_bh);
+
+ if (IS_ERR(acl)) {
+ mlog_errno(PTR_ERR(acl));
return PTR_ERR(acl);
+ }
if (acl) {
- int ret = posix_acl_permission(inode, acl, mask);
+ ret = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
return ret;
}
@@ -286,7 +365,8 @@ int ocfs2_init_acl(handle_t *handle,
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct posix_acl *acl = NULL;
- int ret = 0;
+ int ret = 0, ret2;
+ mode_t mode;
if (!S_ISLNK(inode->i_mode)) {
if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -295,12 +375,17 @@ int ocfs2_init_acl(handle_t *handle,
if (IS_ERR(acl))
return PTR_ERR(acl);
}
- if (!acl)
- inode->i_mode &= ~current_umask();
+ if (!acl) {
+ mode = inode->i_mode & ~current_umask();
+ ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
}
if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
struct posix_acl *clone;
- mode_t mode;
if (S_ISDIR(inode->i_mode)) {
ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -317,7 +402,12 @@ int ocfs2_init_acl(handle_t *handle,
mode = inode->i_mode;
ret = posix_acl_create_masq(clone, &mode);
if (ret >= 0) {
- inode->i_mode = mode;
+ ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret2) {
+ mlog_errno(ret2);
+ ret = ret2;
+ goto cleanup;
+ }
if (ret > 0) {
ret = ocfs2_set_acl(handle, inode,
di_bh, ACL_TYPE_ACCESS,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59ff..5661db139ca0 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1765,9 +1765,9 @@ set_and_inc:
*
* The array index of the subtree root is passed back.
*/
-static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
- struct ocfs2_path *left,
- struct ocfs2_path *right)
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right)
{
int i = 0;
@@ -2872,8 +2872,8 @@ out:
* This looks similar, but is subtly different to
* ocfs2_find_cpos_for_left_leaf().
*/
-static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
- struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
{
int i, j, ret = 0;
u64 blkno;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 9c122d574464..1db4359ccb90 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle,
int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
handle_t *handle,
struct ocfs2_path *path);
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right);
#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..5fc918ca2572 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -591,8 +591,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
goto bail;
}
- /* We should already CoW the refcounted extent. */
- BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+ /* We should already CoW the refcounted extent in case of create. */
+ BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
+
/*
* get_more_blocks() expects us to describe a hole by clearing
* the mapped bit on bh_result().
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d43d34a1dd31..5a253bab122c 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -407,6 +407,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh)
{
int ret = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
mlog_entry_void();
@@ -426,6 +427,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
submit_bh(WRITE, bh);
wait_on_buffer(bh);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 02bf17808bdc..18bc101d603f 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -205,7 +205,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
if ((count + *ppos) > i_size_read(inode))
readlen = i_size_read(inode) - *ppos;
else
- readlen = count - *ppos;
+ readlen = count;
lvb_buf = kmalloc(readlen, GFP_NOFS);
if (!lvb_buf)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b35..ef1ac9ab4ee1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
atomic_dec(&dlm->res_cur_count);
- dlm_put(dlm);
-
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
!list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->migration_pending = 0;
res->inflight_locks = 0;
- /* put in dlm_lockres_release */
- dlm_grab(dlm);
res->dlm = dlm;
kref_init(&res->refs);
@@ -3046,8 +3042,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
/* check for pre-existing lock */
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
- spin_lock(&dlm->master_lock);
-
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3065,14 +3059,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&res->spinlock);
}
+ spin_lock(&dlm->master_lock);
/* ignore status. only nonzero status would BUG. */
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
name, namelen,
migrate->new_master,
migrate->master);
-unlock:
spin_unlock(&dlm->master_lock);
+unlock:
spin_unlock(&dlm->spinlock);
if (oldmle) {
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index d9fa3d22e17c..3492550b70c8 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1941,6 +1941,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct list_head *queue;
struct dlm_lock *lock, *next;
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
res->state |= DLM_LOCK_RES_RECOVERING;
if (!list_empty(&res->recovering)) {
mlog(0,
@@ -2265,19 +2267,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
/* zero the lvb if necessary */
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
- if (res->state & DLM_LOCK_RES_DROPPING_REF)
- mlog(0, "%s:%.*s: owned by "
- "dead node %u, this node was "
- "dropping its ref when it died. "
- "continue, dropping the flag.\n",
- dlm->name, res->lockname.len,
- res->lockname.name, dead_node);
-
- /* the wake_up for this will happen when the
- * RECOVERING flag is dropped later */
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "Ignore %.*s for "
+ "recovery as it is being freed\n",
+ res->lockname.len,
+ res->lockname.name);
+ } else
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
- dlm_move_lockres_to_recovery_list(dlm, res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..86491f583ea5 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -93,19 +93,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
* truly ready to be freed. */
int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
- if (!__dlm_lockres_has_locks(res) &&
- (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
- /* try not to scan the bitmap unless the first two
- * conditions are already true */
- int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
- if (bit >= O2NM_MAX_NODES) {
- /* since the bit for dlm->node_num is not
- * set, inflight_locks better be zero */
- BUG_ON(res->inflight_locks != 0);
- return 1;
- }
- }
- return 0;
+ int bit;
+
+ if (__dlm_lockres_has_locks(res))
+ return 0;
+
+ if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+ return 0;
+
+ if (res->state & DLM_LOCK_RES_RECOVERING)
+ return 0;
+
+ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (bit < O2NM_MAX_NODES)
+ return 0;
+
+ /*
+ * since the bit for dlm->node_num is not set, inflight_locks better
+ * be zero
+ */
+ BUG_ON(res->inflight_locks != 0);
+ return 1;
}
@@ -153,45 +161,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
-static int dlm_purge_lockres(struct dlm_ctxt *dlm,
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int master;
int ret = 0;
- spin_lock(&res->spinlock);
- if (!__dlm_lockres_unused(res)) {
- mlog(0, "%s:%.*s: tried to purge but not unused\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
- spin_unlock(&res->spinlock);
- BUG();
- }
-
- if (res->state & DLM_LOCK_RES_MIGRATING) {
- mlog(0, "%s:%.*s: Delay dropref as this lockres is "
- "being remastered\n", dlm->name, res->lockname.len,
- res->lockname.name);
- /* Re-add the lockres to the end of the purge list */
- if (!list_empty(&res->purge)) {
- list_del_init(&res->purge);
- list_add_tail(&res->purge, &dlm->purge_list);
- }
- spin_unlock(&res->spinlock);
- return 0;
- }
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
master = (res->owner == dlm->node_num);
- if (!master)
- res->state |= DLM_LOCK_RES_DROPPING_REF;
- spin_unlock(&res->spinlock);
mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
res->lockname.name, master);
if (!master) {
+ res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
+ spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -209,31 +197,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
dlm->name, res->lockname.len, res->lockname.name, ret);
spin_lock(&dlm->spinlock);
+ spin_lock(&res->spinlock);
}
- spin_lock(&res->spinlock);
if (!list_empty(&res->purge)) {
mlog(0, "removing lockres %.*s:%p from purgelist, "
"master = %d\n", res->lockname.len, res->lockname.name,
res, master);
list_del_init(&res->purge);
- spin_unlock(&res->spinlock);
dlm_lockres_put(res);
dlm->purge_count--;
- } else
- spin_unlock(&res->spinlock);
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
__dlm_unhash_lockres(res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
- spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_DROPPING_REF;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
- }
- return 0;
+ } else
+ spin_unlock(&res->spinlock);
}
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -252,17 +244,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
lockres = list_entry(dlm->purge_list.next,
struct dlm_lock_resource, purge);
- /* Status of the lockres *might* change so double
- * check. If the lockres is unused, holding the dlm
- * spinlock will prevent people from getting and more
- * refs on it -- there's no need to keep the lockres
- * spinlock. */
spin_lock(&lockres->spinlock);
- unused = __dlm_lockres_unused(lockres);
- spin_unlock(&lockres->spinlock);
-
- if (!unused)
- continue;
purge_jiffies = lockres->last_used +
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -274,15 +256,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
* in tail order, we can stop at the first
* unpurgable resource -- anyone added after
* him will have a greater last_used value */
+ spin_unlock(&lockres->spinlock);
break;
}
+ /* Status of the lockres *might* change so double
+ * check. If the lockres is unused, holding the dlm
+ * spinlock will prevent people from getting and more
+ * refs on it. */
+ unused = __dlm_lockres_unused(lockres);
+ if (!unused ||
+ (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+ mlog(0, "lockres %s:%.*s: is in use or "
+ "being remastered, used %d, state %d\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name, !unused, lockres->state);
+ list_move_tail(&dlm->purge_list, &lockres->purge);
+ spin_unlock(&lockres->spinlock);
+ continue;
+ }
+
dlm_lockres_get(lockres);
- /* This may drop and reacquire the dlm spinlock if it
- * has to do migration. */
- if (dlm_purge_lockres(dlm, lockres))
- BUG();
+ dlm_purge_lockres(dlm, lockres);
dlm_lockres_put(lockres);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0297fb8982b8..3fcb47959d88 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -485,7 +485,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
- if (!status)
+ /*
+ * If buffer is in jbd, then its checksum may not have been
+ * computed as yet.
+ */
+ if (!status && !buffer_jbd(bh))
status = ocfs2_validate_inode_block(osb->sb, bh);
}
if (status < 0) {
@@ -559,6 +563,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
+ handle = NULL;
mlog_errno(status);
goto out;
}
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(inode))
+ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3a0df7a1b810..10e9527dda1f 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -969,6 +969,103 @@ out:
}
/*
+ * Find the end range for a leaf refcount block indicated by
+ * el->l_recs[index].e_blkno.
+ */
+static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_extent_block *eb,
+ struct ocfs2_extent_list *el,
+ int index, u32 *cpos_end)
+{
+ int ret, i, subtree_root;
+ u32 cpos;
+ u64 blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_path *left_path = NULL, *right_path = NULL;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_extent_list *tmp_el;
+
+ if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
+ /*
+ * We have a extent rec after index, so just use the e_cpos
+ * of the next extent rec.
+ */
+ *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
+ return 0;
+ }
+
+ if (!eb || (eb && !eb->h_next_leaf_blk)) {
+ /*
+ * We are the last extent rec, so any high cpos should
+ * be stored in this leaf refcount block.
+ */
+ *cpos_end = UINT_MAX;
+ return 0;
+ }
+
+ /*
+ * If the extent block isn't the last one, we have to find
+ * the subtree root between this extent block and the next
+ * leaf extent block and get the corresponding e_cpos from
+ * the subroot. Otherwise we may corrupt the b-tree.
+ */
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+ left_path = ocfs2_new_path_from_et(&et);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
+ ret = ocfs2_find_path(ci, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ right_path = ocfs2_new_path_from_path(left_path);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(ci, right_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_root = ocfs2_find_subtree_root(&et, left_path,
+ right_path);
+
+ tmp_el = left_path->p_node[subtree_root].el;
+ blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
+ for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
+ if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
+ *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
+ break;
+ }
+ }
+
+ BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
+
+out:
+ ocfs2_free_path(left_path);
+ ocfs2_free_path(right_path);
+ return ret;
+}
+
+/*
* Given a cpos and len, try to find the refcount record which contains cpos.
* 1. If cpos can be found in one refcount record, return the record.
* 2. If cpos can't be found, return a fake record which start from cpos
@@ -983,10 +1080,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
struct buffer_head **ret_bh)
{
int ret = 0, i, found;
- u32 low_cpos;
+ u32 low_cpos, uninitialized_var(cpos_end);
struct ocfs2_extent_list *el;
- struct ocfs2_extent_rec *tmp, *rec = NULL;
- struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_extent_block *eb = NULL;
struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct ocfs2_refcount_block *rb =
@@ -1034,12 +1131,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
}
}
- /* adjust len when we have ocfs2_extent_rec after it. */
- if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
- tmp = &el->l_recs[i+1];
+ if (found) {
+ ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
+ eb, el, i, &cpos_end);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- if (le32_to_cpu(tmp->e_cpos) < cpos + len)
- len = le32_to_cpu(tmp->e_cpos) - cpos;
+ if (cpos_end < low_cpos + len)
+ len = cpos_end - low_cpos;
}
ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
@@ -2353,16 +2454,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
le32_to_cpu(rec.r_clusters)) - cpos;
/*
- * If the refcount rec already exist, cool. We just need
- * to check whether there is a split. Otherwise we just need
- * to increase the refcount.
- * If we will insert one, increases recs_add.
- *
* We record all the records which will be inserted to the
* same refcount block, so that we can tell exactly whether
* we need a new refcount block or not.
+ *
+ * If we will insert a new one, this is easy and only happens
+ * during adding refcounted flag to the extent, so we don't
+ * have a chance of spliting. We just need one record.
+ *
+ * If the refcount rec already exists, that would be a little
+ * complicated. we may have to:
+ * 1) split at the beginning if the start pos isn't aligned.
+ * we need 1 more record in this case.
+ * 2) split int the end if the end pos isn't aligned.
+ * we need 1 more record in this case.
+ * 3) split in the middle because of file system fragmentation.
+ * we need 2 more records in this case(we can't detect this
+ * beforehand, so always think of the worst case).
*/
if (rec.r_refcount) {
+ recs_add += 2;
/* Check whether we need a split at the beginning. */
if (cpos == start_cpos &&
cpos != le64_to_cpu(rec.r_cpos))
@@ -3995,6 +4106,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
di->i_attr = s_di->i_attr;
if (preserve) {
+ t_inode->i_uid = s_inode->i_uid;
+ t_inode->i_gid = s_inode->i_gid;
+ t_inode->i_mode = s_inode->i_mode;
di->i_uid = s_di->i_uid;
di->i_gid = s_di->i_gid;
di->i_mode = s_di->i_mode;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..79b5dacf9312 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -152,7 +152,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
#define do_error(fmt, ...) \
do{ \
- if (clean_error) \
+ if (resize) \
mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
else \
ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +160,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
- int clean_error)
+ int resize)
{
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -211,7 +211,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_dinode *di,
struct buffer_head *bh,
- int clean_error)
+ int resize)
{
unsigned int max_bits;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +233,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
return -EINVAL;
}
- if (le16_to_cpu(gd->bg_chain) >=
- le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
+ /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
+ if ((le16_to_cpu(gd->bg_chain) >
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+ ((le16_to_cpu(gd->bg_chain) ==
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
do_error("Group descriptor #%llu has bad chain %u",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 14f47d2bfe02..9f55be445bd6 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -701,6 +701,10 @@ unlock_osb:
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
+ MS_POSIXACL : 0);
}
out:
unlock_kernel();
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index e3421030a69f..91e656fd528b 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
}
/* Fast symlinks can't be large */
- len = strlen(target);
+ len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
link = kzalloc(len + 1, GFP_NOFS);
if (!link) {
status = -ENOMEM;
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a6022152f..49cfd5f54238 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
/************************************************************
* EFI GUID Partition Table handling
- * Per Intel EFI Specification v1.02
- * http://developer.intel.com/technology/efi/efi.htm
+ *
+ * http://www.uefi.org/specs/
+ * http://www.intel.com/technology/efi/
+ *
* efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
* Copyright 2000,2001,2002,2004 Dell Inc.
*
@@ -92,6 +94,7 @@
*
************************************************************/
#include <linux/crc32.h>
+#include <linux/math64.h>
#include "check.h"
#include "efi.h"
@@ -141,7 +144,8 @@ last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
- return (bdev->bd_inode->i_size >> 9) - 1ULL;
+ return div_u64(bdev->bd_inode->i_size,
+ bdev_logical_block_size(bdev)) - 1ULL;
}
static inline int
@@ -188,6 +192,7 @@ static size_t
read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
{
size_t totalreadcount = 0;
+ sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
if (!bdev || !buffer || lba > last_lba(bdev))
return 0;
@@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
while (count) {
int copied = 512;
Sector sect;
- unsigned char *data = read_dev_sector(bdev, lba++, &sect);
+ unsigned char *data = read_dev_sector(bdev, n++, &sect);
if (!data)
break;
if (copied > count)
@@ -257,15 +262,16 @@ static gpt_header *
alloc_read_gpt_header(struct block_device *bdev, u64 lba)
{
gpt_header *gpt;
+ unsigned ssz = bdev_logical_block_size(bdev);
+
if (!bdev)
return NULL;
- gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
+ gpt = kzalloc(ssz, GFP_KERNEL);
if (!gpt)
return NULL;
- if (read_lba(bdev, lba, (u8 *) gpt,
- sizeof (gpt_header)) < sizeof (gpt_header)) {
+ if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
kfree(gpt);
gpt=NULL;
return NULL;
@@ -601,6 +607,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
+ unsigned ssz = bdev_logical_block_size(bdev) / 512;
if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
@@ -611,13 +618,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
pr_debug("GUID Partition Table is valid! Yea!\n");
for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+ u64 start = le64_to_cpu(ptes[i].starting_lba);
+ u64 size = le64_to_cpu(ptes[i].ending_lba) -
+ le64_to_cpu(ptes[i].starting_lba) + 1ULL;
+
if (!is_pte_valid(&ptes[i], last_lba(bdev)))
continue;
- put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba),
- (le64_to_cpu(ptes[i].ending_lba) -
- le64_to_cpu(ptes[i].starting_lba) +
- 1ULL));
+ put_partition(state, i+1, start * ssz, size * ssz);
/* If this is a RAID volume, tell md */
if (!efi_guidcmp(ptes[i].partition_type_guid,
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0475bf..6998b589abf9 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
#define EFI_PMBR_OSTYPE_EFI 0xEF
#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
-#define GPT_BLOCK_SIZE 512
#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
#define GPT_HEADER_REVISION_V1 0x00010000
#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
__le32 num_partition_entries;
__le32 sizeof_partition_entry;
__le32 partition_entry_array_crc32;
- u8 reserved2[GPT_BLOCK_SIZE - 92];
+
+ /* The rest of the logical block is reserved by UEFI and must be zero.
+ * EFI standard handles this by:
+ *
+ * uint8_t reserved2[ BlockSize - 92 ];
+ */
} __attribute__ ((packed)) gpt_header;
typedef struct _gpt_entry_attributes {
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..bae725b7febd 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -74,6 +74,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
} *label;
unsigned char *data;
Sector sect;
+ sector_t labelsect;
res = 0;
blocksize = bdev_logical_block_size(bdev);
@@ -98,9 +99,19 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
goto out_freeall;
/*
+ * Special case for FBA disks: label sector does not depend on
+ * blocksize.
+ */
+ if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+ (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+ labelsect = info->label_block;
+ else
+ labelsect = info->label_block * (blocksize >> 9);
+
+ /*
* Get volume label, extract name and type.
*/
- data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect);
+ data = read_dev_sector(bdev, labelsect, &sect);
if (data == NULL)
goto out_readerr;
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
*/
#include <asm/unaligned.h>
-#define SYS_IND(p) (get_unaligned(&p->sys_ind))
-#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
- le32_to_cpu(__a); \
- })
+#define SYS_IND(p) get_unaligned(&p->sys_ind)
-#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \
- le32_to_cpu(__a); \
- })
+static inline sector_t nr_sects(struct partition *p)
+{
+ return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+ return (sector_t)get_unaligned_le32(&p->start_sect);
+}
static inline int is_extended_partition(struct partition *p)
{
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
static void
parse_extended(struct parsed_partitions *state, struct block_device *bdev,
- u32 first_sector, u32 first_size)
+ sector_t first_sector, sector_t first_size)
{
struct partition *p;
Sector sect;
unsigned char *data;
- u32 this_sector, this_size;
- int sector_size = bdev_logical_block_size(bdev) / 512;
+ sector_t this_sector, this_size;
+ sector_t sector_size = bdev_logical_block_size(bdev) / 512;
int loopct = 0; /* number of links followed
without finding a data partition */
int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
* First process the data partition(s)
*/
for (i=0; i<4; i++, p++) {
- u32 offs, size, next;
- if (!NR_SECTS(p) || is_extended_partition(p))
+ sector_t offs, size, next;
+ if (!nr_sects(p) || is_extended_partition(p))
continue;
/* Check the 3rd and 4th entries -
these sometimes contain random garbage */
- offs = START_SECT(p)*sector_size;
- size = NR_SECTS(p)*sector_size;
+ offs = start_sect(p)*sector_size;
+ size = nr_sects(p)*sector_size;
next = this_sector + offs;
if (i >= 2) {
if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
*/
p -= 4;
for (i=0; i<4; i++, p++)
- if (NR_SECTS(p) && is_extended_partition(p))
+ if (nr_sects(p) && is_extended_partition(p))
break;
if (i == 4)
goto done; /* nothing left to do */
- this_sector = first_sector + START_SECT(p) * sector_size;
- this_size = NR_SECTS(p) * sector_size;
+ this_sector = first_sector + start_sect(p) * sector_size;
+ this_size = nr_sects(p) * sector_size;
put_dev_sector(sect);
}
done:
@@ -197,7 +200,7 @@ done:
static void
parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_SOLARIS_X86_PARTITION
Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
*/
static void
parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin, char *flavour,
+ sector_t offset, sector_t size, int origin, char *flavour,
int max_partitions)
{
Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
if (le16_to_cpu(l->d_npartitions) < max_partitions)
max_partitions = le16_to_cpu(l->d_npartitions);
for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
- u32 bsd_start, bsd_size;
+ sector_t bsd_start, bsd_size;
if (state->next == state->limit)
break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
static void
parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
static void
parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
static void
parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
*/
static void
parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_UNIXWARE_DISKLABEL
Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
if (p->s_label != UNIXWARE_FS_UNUSED)
put_partition(state, state->next++,
- START_SECT(p), NR_SECTS(p));
+ le32_to_cpu(p->start_sect),
+ le32_to_cpu(p->nr_sects));
p++;
}
put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
*/
static void
parse_minix(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_MINIX_SUBPARTITION
Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
/* add each partition in use */
if (SYS_IND(p) == MINIX_PARTITION)
put_partition(state, state->next++,
- START_SECT(p), NR_SECTS(p));
+ start_sect(p), nr_sects(p));
}
printk(" >\n");
}
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
static struct {
unsigned char id;
void (*parse)(struct parsed_partitions *, struct block_device *,
- u32, u32, int);
+ sector_t, sector_t, int);
} subtypes[] = {
{FREEBSD_PARTITION, parse_freebsd},
{NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
{
- int sector_size = bdev_logical_block_size(bdev) / 512;
+ sector_t sector_size = bdev_logical_block_size(bdev) / 512;
Sector sect;
unsigned char *data;
struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
state->next = 5;
for (slot = 1 ; slot <= 4 ; slot++, p++) {
- u32 start = START_SECT(p)*sector_size;
- u32 size = NR_SECTS(p)*sector_size;
+ sector_t start = start_sect(p)*sector_size;
+ sector_t size = nr_sects(p)*sector_size;
if (!size)
continue;
if (is_extended_partition(p)) {
- /* prevent someone doing mkfs or mkswap on an
- extended partition, but leave room for LILO */
- put_partition(state, slot, start, size == 1 ? 1 : 2);
+ /*
+ * prevent someone doing mkfs or mkswap on an
+ * extended partition, but leave room for LILO
+ * FIXME: this uses one logical sector for > 512b
+ * sector, although it may not be enough/proper.
+ */
+ sector_t n = 2;
+ n = min(size, max(sector_size, n));
+ put_partition(state, slot, start, n);
+
printk(" <");
parse_extended(state, bdev, start, size);
printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
unsigned char id = SYS_IND(p);
int n;
- if (!NR_SECTS(p))
+ if (!nr_sects(p))
continue;
for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
if (!subtypes[n].parse)
continue;
- subtypes[n].parse(state, bdev, START_SECT(p)*sector_size,
- NR_SECTS(p)*sector_size, slot);
+ subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
+ nr_sects(p)*sector_size, slot);
}
put_dev_sector(sect);
return 1;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 822c2d506518..42fdc765f817 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -82,7 +82,6 @@
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
-#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -322,94 +321,6 @@ static inline void task_context_switch_counts(struct seq_file *m,
p->nivcsw);
}
-#ifdef CONFIG_MMU
-
-struct stack_stats {
- struct vm_area_struct *vma;
- unsigned long startpage;
- unsigned long usage;
-};
-
-static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
-{
- struct stack_stats *ss = walk->private;
- struct vm_area_struct *vma = ss->vma;
- pte_t *pte, ptent;
- spinlock_t *ptl;
- int ret = 0;
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
-
-#ifdef CONFIG_STACK_GROWSUP
- if (pte_present(ptent) || is_swap_pte(ptent))
- ss->usage = addr - ss->startpage + PAGE_SIZE;
-#else
- if (pte_present(ptent) || is_swap_pte(ptent)) {
- ss->usage = ss->startpage - addr + PAGE_SIZE;
- pte++;
- ret = 1;
- break;
- }
-#endif
- }
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
- return ret;
-}
-
-static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
- struct task_struct *task)
-{
- struct stack_stats ss;
- struct mm_walk stack_walk = {
- .pmd_entry = stack_usage_pte_range,
- .mm = vma->vm_mm,
- .private = &ss,
- };
-
- if (!vma->vm_mm || is_vm_hugetlb_page(vma))
- return 0;
-
- ss.vma = vma;
- ss.startpage = task->stack_start & PAGE_MASK;
- ss.usage = 0;
-
-#ifdef CONFIG_STACK_GROWSUP
- walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
- &stack_walk);
-#else
- walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
- &stack_walk);
-#endif
- return ss.usage;
-}
-
-static inline void task_show_stack_usage(struct seq_file *m,
- struct task_struct *task)
-{
- struct vm_area_struct *vma;
- struct mm_struct *mm = get_task_mm(task);
-
- if (mm) {
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, task->stack_start);
- if (vma)
- seq_printf(m, "Stack usage:\t%lu kB\n",
- get_stack_usage_in_bytes(vma, task) >> 10);
-
- up_read(&mm->mmap_sem);
- mmput(mm);
- }
-}
-#else
-static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
-{
-}
-#endif /* CONFIG_MMU */
-
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -429,7 +340,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_show_regs(m, task);
#endif
task_context_switch_counts(m, task);
- task_show_stack_usage(m, task);
return 0;
}
@@ -495,7 +405,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* add up live thread stats at the group level */
if (whole) {
- struct task_cputime cputime;
struct task_struct *t = task;
do {
min_flt += t->min_flt;
@@ -506,9 +415,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- thread_group_cputime(task, &cputime);
- utime = cputime.utime;
- stime = cputime.stime;
+ thread_group_times(task, &utime, &stime);
gtime = cputime_add(gtime, sig->gtime);
}
@@ -571,7 +478,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
rsslim,
mm ? mm->start_code : 0,
mm ? mm->end_code : 0,
- (permitted && mm) ? task->stack_start : 0,
+ (permitted && mm) ? mm->start_stack : 0,
esp,
eip,
/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index af643b5aefe8..a1bb0f649030 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -442,12 +442,13 @@ static const struct file_operations proc_lstats_operations = {
unsigned long badness(struct task_struct *p, unsigned long uptime);
static int proc_oom_score(struct task_struct *task, char *buffer)
{
- unsigned long points;
+ unsigned long points = 0;
struct timespec uptime;
do_posix_clock_monotonic_gettime(&uptime);
read_lock(&tasklist_lock);
- points = badness(task->group_leader, uptime.tv_sec);
+ if (pid_alive(task))
+ points = badness(task, uptime.tv_sec);
read_unlock(&tasklist_lock);
return sprintf(buffer, "%lu\n", points);
}
@@ -2304,16 +2305,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct pid_namespace *ns = dentry->d_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
- char tmp[PROC_NUMBUF];
- if (!tgid)
- return ERR_PTR(-ENOENT);
- sprintf(tmp, "%d", task_tgid_nr_ns(current, ns));
- return ERR_PTR(vfs_follow_link(nd,tmp));
+ char *name = ERR_PTR(-ENOENT);
+ if (tgid) {
+ name = __getname();
+ if (!name)
+ name = ERR_PTR(-ENOMEM);
+ else
+ sprintf(name, "%d", tgid);
+ }
+ nd_set_link(nd, name);
+ return NULL;
+}
+
+static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
+ void *cookie)
+{
+ char *s = nd_get_link(nd);
+ if (!IS_ERR(s))
+ __putname(s);
}
static const struct inode_operations proc_self_inode_operations = {
.readlink = proc_self_readlink,
.follow_link = proc_self_follow_link,
+ .put_link = proc_self_put_link,
};
/*
@@ -2829,7 +2844,7 @@ out_no_task:
*/
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
+ DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2a1bef9203c6..e10ef0452847 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -206,6 +206,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
int flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
+ unsigned long start;
dev_t dev = 0;
int len;
@@ -216,8 +217,14 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
}
+ /* We don't show the stack guard page in /proc/maps */
+ start = vma->vm_start;
+ if (vma->vm_flags & VM_GROWSDOWN)
+ if (!vma_stack_continue(vma->vm_prev, vma->vm_start))
+ start += PAGE_SIZE;
+
seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
- vma->vm_start,
+ start,
vma->vm_end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
@@ -243,25 +250,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
} else if (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack) {
name = "[stack]";
- } else {
- unsigned long stack_start;
- struct proc_maps_private *pmp;
-
- pmp = m->private;
- stack_start = pmp->task->stack_start;
-
- if (vma->vm_start <= stack_start &&
- vma->vm_end >= stack_start) {
- pad_len_spaces(m, len);
- seq_printf(m,
- "[threadstack:%08lx]",
-#ifdef CONFIG_STACK_GROWSUP
- vma->vm_end - stack_start
-#else
- stack_start - vma->vm_start
-#endif
- );
- }
}
} else {
name = "[vdso]";
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c42a7ed..4fdb0eb88184 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -229,6 +229,8 @@ static struct hlist_head *dquot_hash;
struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);
+static qsize_t inode_get_rsv_space(struct inode *inode);
+
static inline unsigned int
hashfn(const struct super_block *sb, unsigned int id, int type)
{
@@ -820,11 +822,14 @@ static int dqinit_needed(struct inode *inode, int type)
static void add_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode, *old_inode = NULL;
+ int reserved = 0;
spin_lock(&inode_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
continue;
+ if (unlikely(inode_get_rsv_space(inode) > 0))
+ reserved = 1;
if (!atomic_read(&inode->i_writecount))
continue;
if (!dqinit_needed(inode, type))
@@ -845,6 +850,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
}
spin_unlock(&inode_lock);
iput(old_inode);
+
+ if (reserved) {
+ printk(KERN_WARNING "VFS (%s): Writes happened before quota"
+ " was turned on thus quota information is probably "
+ "inconsistent. Please run quotacheck(8).\n", sb->s_id);
+ }
}
/*
@@ -958,10 +969,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
/*
* Claim reserved quota space
*/
-static void dquot_claim_reserved_space(struct dquot *dquot,
- qsize_t number)
+static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
{
- WARN_ON(dquot->dq_dqb.dqb_rsvspace < number);
+ if (dquot->dq_dqb.dqb_rsvspace < number) {
+ WARN_ON_ONCE(1);
+ number = dquot->dq_dqb.dqb_rsvspace;
+ }
dquot->dq_dqb.dqb_curspace += number;
dquot->dq_dqb.dqb_rsvspace -= number;
}
@@ -969,7 +982,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot,
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
- dquot->dq_dqb.dqb_rsvspace -= number;
+ if (dquot->dq_dqb.dqb_rsvspace >= number)
+ dquot->dq_dqb.dqb_rsvspace -= number;
+ else {
+ WARN_ON_ONCE(1);
+ dquot->dq_dqb.dqb_rsvspace = 0;
+ }
}
static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
@@ -1287,6 +1305,7 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
return QUOTA_NL_BHARDBELOW;
return QUOTA_NL_NOWARN;
}
+
/*
* Initialize quota pointers in inode
* We do things in a bit complicated way but by that we avoid calling
@@ -1298,6 +1317,7 @@ int dquot_initialize(struct inode *inode, int type)
int cnt, ret = 0;
struct dquot *got[MAXQUOTAS] = { NULL, NULL };
struct super_block *sb = inode->i_sb;
+ qsize_t rsv;
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
@@ -1332,6 +1352,13 @@ int dquot_initialize(struct inode *inode, int type)
if (!inode->i_dquot[cnt]) {
inode->i_dquot[cnt] = got[cnt];
got[cnt] = NULL;
+ /*
+ * Make quota reservation system happy if someone
+ * did a write before quota was turned on
+ */
+ rsv = inode_get_rsv_space(inode);
+ if (unlikely(rsv))
+ dquot_resv_space(inode->i_dquot[cnt], rsv);
}
}
out_err:
@@ -1388,6 +1415,72 @@ void vfs_dq_drop(struct inode *inode)
EXPORT_SYMBOL(vfs_dq_drop);
/*
+ * inode_reserved_space is managed internally by quota, and protected by
+ * i_lock similar to i_blocks+i_bytes.
+ */
+static qsize_t *inode_reserved_space(struct inode * inode)
+{
+ /* Filesystem must explicitly define it's own method in order to use
+ * quota reservation interface */
+ BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
+ return inode->i_sb->dq_op->get_reserved_space(inode);
+}
+
+void inode_add_rsv_space(struct inode *inode, qsize_t number)
+{
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) += number;
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(inode_add_rsv_space);
+
+void inode_claim_rsv_space(struct inode *inode, qsize_t number)
+{
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) -= number;
+ __inode_add_bytes(inode, number);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(inode_claim_rsv_space);
+
+void inode_sub_rsv_space(struct inode *inode, qsize_t number)
+{
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) -= number;
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(inode_sub_rsv_space);
+
+static qsize_t inode_get_rsv_space(struct inode *inode)
+{
+ qsize_t ret;
+
+ if (!inode->i_sb->dq_op->get_reserved_space)
+ return 0;
+ spin_lock(&inode->i_lock);
+ ret = *inode_reserved_space(inode);
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+static void inode_incr_space(struct inode *inode, qsize_t number,
+ int reserve)
+{
+ if (reserve)
+ inode_add_rsv_space(inode, number);
+ else
+ inode_add_bytes(inode, number);
+}
+
+static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
+{
+ if (reserve)
+ inode_sub_rsv_space(inode, number);
+ else
+ inode_sub_bytes(inode, number);
+}
+
+/*
* Following four functions update i_blocks+i_bytes fields and
* quota information (together with appropriate checks)
* NOTE: We absolutely rely on the fact that caller dirties
@@ -1405,6 +1498,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
int cnt, ret = QUOTA_OK;
char warntype[MAXQUOTAS];
+ /*
+ * First test before acquiring mutex - solves deadlocks when we
+ * re-enter the quota code and are already holding the mutex
+ */
+ if (IS_NOQUOTA(inode)) {
+ inode_incr_space(inode, number, reserve);
+ goto out;
+ }
+
+ down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ if (IS_NOQUOTA(inode)) {
+ inode_incr_space(inode, number, reserve);
+ goto out_unlock;
+ }
+
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1415,7 +1523,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
== NO_QUOTA) {
ret = NO_QUOTA;
- goto out_unlock;
+ spin_unlock(&dq_data_lock);
+ goto out_flush_warn;
}
}
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1426,64 +1535,32 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
else
dquot_incr_space(inode->i_dquot[cnt], number);
}
- if (!reserve)
- inode_add_bytes(inode, number);
-out_unlock:
+ inode_incr_space(inode, number, reserve);
spin_unlock(&dq_data_lock);
- flush_warnings(inode->i_dquot, warntype);
- return ret;
-}
-
-int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
-{
- int cnt, ret = QUOTA_OK;
-
- /*
- * First test before acquiring mutex - solves deadlocks when we
- * re-enter the quota code and are already holding the mutex
- */
- if (IS_NOQUOTA(inode)) {
- inode_add_bytes(inode, number);
- goto out;
- }
-
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode)) {
- inode_add_bytes(inode, number);
- goto out_unlock;
- }
-
- ret = __dquot_alloc_space(inode, number, warn, 0);
- if (ret == NO_QUOTA)
- goto out_unlock;
+ if (reserve)
+ goto out_flush_warn;
/* Dirtify all the dquots - this can block when journalling */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (inode->i_dquot[cnt])
mark_dquot_dirty(inode->i_dquot[cnt]);
+out_flush_warn:
+ flush_warnings(inode->i_dquot, warntype);
out_unlock:
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
out:
return ret;
}
+
+int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
+{
+ return __dquot_alloc_space(inode, number, warn, 0);
+}
EXPORT_SYMBOL(dquot_alloc_space);
int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
{
- int ret = QUOTA_OK;
-
- if (IS_NOQUOTA(inode))
- goto out;
-
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode))
- goto out_unlock;
-
- ret = __dquot_alloc_space(inode, number, warn, 1);
-out_unlock:
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
- return ret;
+ return __dquot_alloc_space(inode, number, warn, 1);
}
EXPORT_SYMBOL(dquot_reserve_space);
@@ -1540,14 +1617,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
int ret = QUOTA_OK;
if (IS_NOQUOTA(inode)) {
- inode_add_bytes(inode, number);
+ inode_claim_rsv_space(inode, number);
goto out;
}
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
if (IS_NOQUOTA(inode)) {
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- inode_add_bytes(inode, number);
+ inode_claim_rsv_space(inode, number);
goto out;
}
@@ -1559,7 +1636,7 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
number);
}
/* Update inode bytes */
- inode_add_bytes(inode, number);
+ inode_claim_rsv_space(inode, number);
spin_unlock(&dq_data_lock);
/* Dirtify all the dquots - this can block when journalling */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
@@ -1572,38 +1649,9 @@ out:
EXPORT_SYMBOL(dquot_claim_space);
/*
- * Release reserved quota space
- */
-void dquot_release_reserved_space(struct inode *inode, qsize_t number)
-{
- int cnt;
-
- if (IS_NOQUOTA(inode))
- goto out;
-
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode))
- goto out_unlock;
-
- spin_lock(&dq_data_lock);
- /* Release reserved dquots */
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (inode->i_dquot[cnt])
- dquot_free_reserved_space(inode->i_dquot[cnt], number);
- }
- spin_unlock(&dq_data_lock);
-
-out_unlock:
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
- return;
-}
-EXPORT_SYMBOL(dquot_release_reserved_space);
-
-/*
* This operation can block, but only after everything is updated
*/
-int dquot_free_space(struct inode *inode, qsize_t number)
+int __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
@@ -1612,7 +1660,7 @@ int dquot_free_space(struct inode *inode, qsize_t number)
* re-enter the quota code and are already holding the mutex */
if (IS_NOQUOTA(inode)) {
out_sub:
- inode_sub_bytes(inode, number);
+ inode_decr_space(inode, number, reserve);
return QUOTA_OK;
}
@@ -1627,21 +1675,43 @@ out_sub:
if (!inode->i_dquot[cnt])
continue;
warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
- dquot_decr_space(inode->i_dquot[cnt], number);
+ if (reserve)
+ dquot_free_reserved_space(inode->i_dquot[cnt], number);
+ else
+ dquot_decr_space(inode->i_dquot[cnt], number);
}
- inode_sub_bytes(inode, number);
+ inode_decr_space(inode, number, reserve);
spin_unlock(&dq_data_lock);
+
+ if (reserve)
+ goto out_unlock;
/* Dirtify all the dquots - this can block when journalling */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (inode->i_dquot[cnt])
mark_dquot_dirty(inode->i_dquot[cnt]);
+out_unlock:
flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
return QUOTA_OK;
}
+
+int dquot_free_space(struct inode *inode, qsize_t number)
+{
+ return __dquot_free_space(inode, number, 0);
+}
EXPORT_SYMBOL(dquot_free_space);
/*
+ * Release reserved quota space
+ */
+void dquot_release_reserved_space(struct inode *inode, qsize_t number)
+{
+ __dquot_free_space(inode, number, 1);
+
+}
+EXPORT_SYMBOL(dquot_release_reserved_space);
+
+/*
* This operation can block, but only after everything is updated
*/
int dquot_free_inode(const struct inode *inode, qsize_t number)
@@ -1679,19 +1749,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
EXPORT_SYMBOL(dquot_free_inode);
/*
- * call back function, get reserved quota space from underlying fs
- */
-qsize_t dquot_get_reserved_space(struct inode *inode)
-{
- qsize_t reserved_space = 0;
-
- if (sb_any_quota_active(inode->i_sb) &&
- inode->i_sb->dq_op->get_reserved_space)
- reserved_space = inode->i_sb->dq_op->get_reserved_space(inode);
- return reserved_space;
-}
-
-/*
* Transfer the number of inode and blocks from one diskquota to an other.
*
* This operation can block, but only after everything is updated
@@ -1734,7 +1791,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
}
spin_lock(&dq_data_lock);
cur_space = inode_get_bytes(inode);
- rsv_space = dquot_get_reserved_space(inode);
+ rsv_space = inode_get_rsv_space(inode);
space = cur_space + rsv_space;
/* Build the transfer_from list and check the limits */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -2332,34 +2389,34 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
if (di->dqb_valid & QIF_SPACE) {
dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
check_blim = 1;
- __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+ set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_BLIMITS) {
dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
check_blim = 1;
- __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+ set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_INODES) {
dm->dqb_curinodes = di->dqb_curinodes;
check_ilim = 1;
- __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+ set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_ILIMITS) {
dm->dqb_isoftlimit = di->dqb_isoftlimit;
dm->dqb_ihardlimit = di->dqb_ihardlimit;
check_ilim = 1;
- __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+ set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_BTIME) {
dm->dqb_btime = di->dqb_btime;
check_blim = 1;
- __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+ set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_ITIME) {
dm->dqb_itime = di->dqb_itime;
check_ilim = 1;
- __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+ set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
}
if (check_blim) {
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6d2668fdc384..d42c30ceaee5 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -45,8 +45,6 @@ static inline bool is_privroot_deh(struct dentry *dir,
struct reiserfs_de_head *deh)
{
struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
- if (reiserfs_expose_privroot(dir->d_sb))
- return 0;
return (dir == dir->d_parent && privroot->d_inode &&
deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eeda..d240c15bf086 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2531,6 +2531,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
return reiserfs_write_full_page(page, wbc);
}
+static void reiserfs_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ reiserfs_truncate_file(inode, 0);
+}
+
static int reiserfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -2597,6 +2603,8 @@ static int reiserfs_write_begin(struct file *file,
if (ret) {
unlock_page(page);
page_cache_release(page);
+ /* Truncate allocated blocks */
+ reiserfs_truncate_failed_write(inode);
}
return ret;
}
@@ -2689,8 +2697,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
** transaction tracking stuff when the size changes. So, we have
** to do the i_size updates here.
*/
- pos += copied;
- if (pos > inode->i_size) {
+ if (pos + copied > inode->i_size) {
struct reiserfs_transaction_handle myth;
reiserfs_write_lock(inode->i_sb);
/* If the file have grown beyond the border where it
@@ -2708,7 +2715,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
goto journal_error;
}
reiserfs_update_inode_transaction(inode);
- inode->i_size = pos;
+ inode->i_size = pos + copied;
/*
* this will just nest into our transaction. It's important
* to use mark_inode_dirty so the inode gets pushed around on the
@@ -2735,6 +2742,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
out:
unlock_page(page);
page_cache_release(page);
+
+ if (pos + len > inode->i_size)
+ reiserfs_truncate_failed_write(inode);
+
return ret == 0 ? copied : ret;
journal_error:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 90622200b39c..b5fe0aa033e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2184,6 +2184,15 @@ static int journal_read_transaction(struct super_block *sb,
brelse(d_bh);
return 1;
}
+
+ if (bdev_read_only(sb->s_bdev)) {
+ reiserfs_warning(sb, "clm-2076",
+ "device is readonly, unable to replay log");
+ brelse(c_bh);
+ brelse(d_bh);
+ return -EROFS;
+ }
+
trans_id = get_desc_trans_id(desc);
/* now we know we've got a good transaction, and it was inside the valid time ranges */
log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2422,12 +2431,6 @@ static int journal_read(struct super_block *sb)
goto start_log_replay;
}
- if (continue_replay && bdev_read_only(sb->s_bdev)) {
- reiserfs_warning(sb, "clm-2076",
- "device is readonly, unable to replay log");
- return -1;
- }
-
/* ok, there are transactions that need to be replayed. start with the first log block, find
** all the valid transactions, and pick out the oldest.
*/
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6925b835a43b..cc1caa26be52 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -536,7 +536,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
if (!err && new_size < i_size_read(dentry->d_inode)) {
struct iattr newattrs = {
.ia_ctime = current_fs_time(inode->i_sb),
- .ia_size = buffer_size,
+ .ia_size = new_size,
.ia_valid = ATTR_SIZE | ATTR_CTIME,
};
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
@@ -952,21 +952,13 @@ int reiserfs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask, NULL);
}
-/* This will catch lookups from the fs root to .reiserfs_priv */
-static int
-xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
+static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
{
- struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
- if (container_of(q1, struct dentry, d_name) == priv_root)
- return -ENOENT;
- if (q1->len == name->len &&
- !memcmp(q1->name, name->name, name->len))
- return 0;
- return 1;
+ return -EPERM;
}
static const struct dentry_operations xattr_lookup_poison_ops = {
- .d_compare = xattr_lookup_poison,
+ .d_revalidate = xattr_hide_revalidate,
};
int reiserfs_lookup_privroot(struct super_block *s)
@@ -980,8 +972,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
strlen(PRIVROOT_NAME));
if (!IS_ERR(dentry)) {
REISERFS_SB(s)->priv_root = dentry;
- if (!reiserfs_expose_privroot(s))
- s->s_root->d_op = &xattr_lookup_poison_ops;
+ dentry->d_op = &xattr_lookup_poison_ops;
if (dentry->d_inode)
dentry->d_inode->i_flags |= S_PRIVATE;
} else
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index a92c8792c0f6..b37b13b93d89 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -75,7 +75,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
return error;
}
- if (sec->length) {
+ if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
blocks = reiserfs_xattr_jcreate_nblocks(inode) +
reiserfs_xattr_nblocks(inode, sec->length);
/* We don't want to count the directories twice if we have
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index c117fa80d1e9..42d213546894 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -544,6 +544,7 @@ error:
error_rsb_inval:
ret = -EINVAL;
error_rsb:
+ kfree(rsb);
return ret;
}
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b07565c94386..d98bea8865c1 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -87,6 +87,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+ err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
break;
case __SI_POLL:
err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
@@ -110,6 +111,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+ err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
break;
default:
/*
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e17534..e5efbb96d2bd 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -365,17 +365,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
* If the page isn't uptodate, we may need to start io on it
*/
if (!PageUptodate(page)) {
- /*
- * If in nonblock mode then dont block on waiting
- * for an in-flight io page
- */
- if (flags & SPLICE_F_NONBLOCK) {
- if (!trylock_page(page)) {
- error = -EAGAIN;
- break;
- }
- } else
- lock_page(page);
+ lock_page(page);
/*
* Page was truncated, or invalidated by the
diff --git a/fs/stat.c b/fs/stat.c
index 075694e31d8b..c4ecd52c5737 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
}
#endif /* __ARCH_WANT_STAT64 */
-void inode_add_bytes(struct inode *inode, loff_t bytes)
+/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
+void __inode_add_bytes(struct inode *inode, loff_t bytes)
{
- spin_lock(&inode->i_lock);
inode->i_blocks += bytes >> 9;
bytes &= 511;
inode->i_bytes += bytes;
@@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes)
inode->i_blocks++;
inode->i_bytes -= 512;
}
+}
+
+void inode_add_bytes(struct inode *inode, loff_t bytes)
+{
+ spin_lock(&inode->i_lock);
+ __inode_add_bytes(inode, bytes);
spin_unlock(&inode->i_lock);
}
diff --git a/fs/super.c b/fs/super.c
index 19eb70b374bc..aff046b0fe78 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -901,8 +901,9 @@ int get_sb_single(struct file_system_type *fs_type,
return error;
}
s->s_flags |= MS_ACTIVE;
+ } else {
+ do_remount_sb(s, flags, data, 0);
}
- do_remount_sb(s, flags, data, 0);
simple_set_mnt(mnt, s);
return 0;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f5ea4680f15f..7118a383a87a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
char *p;
p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
- if (p)
+ if (!IS_ERR(p))
memmove(last_sysfs_file, p, strlen(p) + 1);
/* need attr_sd for attr and ops, its parent for kobj */
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e28cecf179f5..02a022a3d473 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -94,30 +94,29 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
if (!sd_attrs)
return -ENOMEM;
sd->s_iattr = sd_attrs;
- } else {
- /* attributes were changed at least once in past */
- iattrs = &sd_attrs->ia_iattr;
-
- if (ia_valid & ATTR_UID)
- iattrs->ia_uid = iattr->ia_uid;
- if (ia_valid & ATTR_GID)
- iattrs->ia_gid = iattr->ia_gid;
- if (ia_valid & ATTR_ATIME)
- iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_MTIME)
- iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_CTIME)
- iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_MODE) {
- umode_t mode = iattr->ia_mode;
-
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- mode &= ~S_ISGID;
- iattrs->ia_mode = sd->s_mode = mode;
- }
+ }
+ /* attributes were changed at least once in past */
+ iattrs = &sd_attrs->ia_iattr;
+
+ if (ia_valid & ATTR_UID)
+ iattrs->ia_uid = iattr->ia_uid;
+ if (ia_valid & ATTR_GID)
+ iattrs->ia_gid = iattr->ia_gid;
+ if (ia_valid & ATTR_ATIME)
+ iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_MTIME)
+ iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_CTIME)
+ iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = iattr->ia_mode;
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+ iattrs->ia_mode = sd->s_mode = mode;
}
return error;
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9d1b8c2e6c45..1e4543cbcd27 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1078,21 +1078,39 @@ static int udf_fill_partdesc_info(struct super_block *sb,
return 0;
}
-static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
+static void udf_find_vat_block(struct super_block *sb, int p_index,
+ int type1_index, sector_t start_block)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map = &sbi->s_partmaps[p_index];
+ sector_t vat_block;
struct kernel_lb_addr ino;
+
+ /*
+ * VAT file entry is in the last recorded block. Some broken disks have
+ * it a few blocks before so try a bit harder...
+ */
+ ino.partitionReferenceNum = type1_index;
+ for (vat_block = start_block;
+ vat_block >= map->s_partition_root &&
+ vat_block >= start_block - 3 &&
+ !sbi->s_vat_inode; vat_block--) {
+ ino.logicalBlockNum = vat_block - map->s_partition_root;
+ sbi->s_vat_inode = udf_iget(sb, &ino);
+ }
+}
+
+static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
+{
+ struct udf_sb_info *sbi = UDF_SB(sb);
+ struct udf_part_map *map = &sbi->s_partmaps[p_index];
struct buffer_head *bh = NULL;
struct udf_inode_info *vati;
uint32_t pos;
struct virtualAllocationTable20 *vat20;
sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
- /* VAT file entry is in the last recorded block */
- ino.partitionReferenceNum = type1_index;
- ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
- sbi->s_vat_inode = udf_iget(sb, &ino);
+ udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
if (!sbi->s_vat_inode &&
sbi->s_last_block != blocks - 1) {
printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
@@ -1100,9 +1118,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
"block of the device (%lu).\n",
(unsigned long)sbi->s_last_block,
(unsigned long)blocks - 1);
- ino.partitionReferenceNum = type1_index;
- ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
- sbi->s_vat_inode = udf_iget(sb, &ino);
+ udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
}
if (!sbi->s_vat_inode)
return 1;
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b23a54506446..4c7b14503716 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -250,8 +250,9 @@ xfs_set_mode(struct inode *inode, mode_t mode)
if (mode != inode->i_mode) {
struct iattr iattr;
- iattr.ia_valid = ATTR_MODE;
+ iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
iattr.ia_mode = mode;
+ iattr.ia_ctime = current_fs_time(inode->i_sb);
error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
}
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c2e30eea74dc..7263002fac64 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -204,14 +204,17 @@ xfs_ioend_new_eof(
}
/*
- * Update on-disk file size now that data has been written to disk.
- * The current in-memory file size is i_size. If a write is beyond
- * eof i_new_size will be the intended file size until i_size is
- * updated. If this write does not extend all the way to the valid
- * file size then restrict this update to the end of the write.
+ * Update on-disk file size now that data has been written to disk. The
+ * current in-memory file size is i_size. If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated. If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks.. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
*/
-
-STATIC void
+STATIC int
xfs_setfilesize(
xfs_ioend_t *ioend)
{
@@ -222,9 +225,11 @@ xfs_setfilesize(
ASSERT(ioend->io_type != IOMAP_READ);
if (unlikely(ioend->io_error))
- return;
+ return 0;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return EAGAIN;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_ioend_new_eof(ioend);
if (isize) {
ip->i_d.di_size = isize;
@@ -232,6 +237,28 @@ xfs_setfilesize(
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend. If we are asked to wait,
+ * flush the workqueue.
+ */
+STATIC void
+xfs_finish_ioend(
+ xfs_ioend_t *ioend,
+ int wait)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct workqueue_struct *wq;
+
+ wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
+ xfsconvertd_workqueue : xfsdatad_workqueue;
+ queue_work(wq, &ioend->io_work);
+ if (wait)
+ flush_workqueue(wq);
+ }
}
/*
@@ -243,9 +270,23 @@ xfs_end_bio_delalloc(
{
xfs_ioend_t *ioend =
container_of(work, xfs_ioend_t, io_work);
+ int error;
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ } else {
+ ASSERT(!error);
+ xfs_destroy_ioend(ioend);
+ }
}
/*
@@ -257,9 +298,23 @@ xfs_end_bio_written(
{
xfs_ioend_t *ioend =
container_of(work, xfs_ioend_t, io_work);
+ int error;
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ } else {
+ ASSERT(!error);
+ xfs_destroy_ioend(ioend);
+ }
}
/*
@@ -279,13 +334,25 @@ xfs_end_bio_unwritten(
size_t size = ioend->io_size;
if (likely(!ioend->io_error)) {
+ int error;
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- int error;
error = xfs_iomap_write_unwritten(ip, offset, size);
if (error)
ioend->io_error = error;
}
- xfs_setfilesize(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ return;
+ }
}
xfs_destroy_ioend(ioend);
}
@@ -304,27 +371,6 @@ xfs_end_bio_read(
}
/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
- xfs_ioend_t *ioend,
- int wait)
-{
- if (atomic_dec_and_test(&ioend->io_remaining)) {
- struct workqueue_struct *wq = xfsdatad_workqueue;
- if (ioend->io_work.func == xfs_end_bio_unwritten)
- wq = xfsconvertd_workqueue;
-
- queue_work(wq, &ioend->io_work);
- if (wait)
- flush_workqueue(wq);
- }
-}
-
-/*
* Allocate and initialise an IO completion structure.
* We need to track unwritten extent write completion here initially.
* We'll need to extend this for updating the ondisk inode size later
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 5bb523d7f37e..98fe0db674bc 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -789,6 +789,8 @@ xfs_ioc_fsgetxattr(
{
struct fsxattr fa;
+ memset(&fa, 0, sizeof(struct fsxattr));
+
xfs_ilock(ip, XFS_ILOCK_SHARED);
fa.fsx_xflags = xfs_ip2xflags(ip);
fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cd42ef78f6b5..1f3b4b8f7dd4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -573,8 +573,8 @@ xfs_vn_fallocate(
bf.l_len = len;
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
- 0, XFS_ATTR_NOLOCK);
+ error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
+ 0, XFS_ATTR_NOLOCK);
if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode))
new_size = offset + len;
@@ -585,7 +585,7 @@ xfs_vn_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
+ error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 18a4b8e11df2..d95bfa27c62a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -930,13 +930,37 @@ xfs_fs_alloc_inode(
*/
STATIC void
xfs_fs_destroy_inode(
- struct inode *inode)
+ struct inode *inode)
{
- xfs_inode_t *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ xfs_itrace_entry(ip);
XFS_STATS_INC(vn_reclaim);
- if (xfs_reclaim(ip))
- panic("%s: cannot reclaim 0x%p\n", __func__, inode);
+
+ /* bad inode, get out here ASAP */
+ if (is_bad_inode(inode))
+ goto out_reclaim;
+
+ xfs_ioend_wait(ip);
+
+ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+ /*
+ * We should never get here with one of the reclaim flags already set.
+ */
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+ /*
+ * We always use background reclaim here because even if the
+ * inode is clean, it still may be under IO and hence we have
+ * to take the flush lock. The background reclaim path handles
+ * this more efficiently than we can here, so simply let background
+ * reclaim tear down all inodes.
+ */
+out_reclaim:
+ xfs_inode_set_reclaim_tag(ip);
}
/*
@@ -1140,6 +1164,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
+ xfs_inode_shrinker_unregister(mp);
xfs_icsb_destroy_counters(mp);
xfs_close_devices(mp);
xfs_dmops_put(mp);
@@ -1299,6 +1324,8 @@ xfs_fs_remount(
/* ro -> rw */
if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+ __uint64_t resblks;
+
mp->m_flags &= ~XFS_MOUNT_RDONLY;
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_mountfs_check_barriers(mp);
@@ -1316,11 +1343,37 @@ xfs_fs_remount(
}
mp->m_update_flags = 0;
}
+
+ /*
+ * Fill out the reserve pool if it is empty. Use the stashed
+ * value if it is non-zero, otherwise go with the default.
+ */
+ if (mp->m_resblks_save) {
+ resblks = mp->m_resblks_save;
+ mp->m_resblks_save = 0;
+ } else {
+ resblks = mp->m_sb.sb_dblocks;
+ do_div(resblks, 20);
+ resblks = min_t(__uint64_t, resblks, 1024);
+ }
+ xfs_reserve_blocks(mp, &resblks, NULL);
}
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+ /*
+ * After we have synced the data but before we sync the
+ * metadata, we need to free up the reserve block pool so that
+ * the used block count in the superblock on disk is correct at
+ * the end of the remount. Stash the current reserve pool size
+ * so that if we get remounted rw, we can return it to the same
+ * size.
+ */
+ __uint64_t resblks = 0;
+
xfs_quiesce_data(mp);
+ mp->m_resblks_save = mp->m_resblks;
+ xfs_reserve_blocks(mp, &resblks, NULL);
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
}
@@ -1503,6 +1556,8 @@ xfs_fs_fill_super(
if (error)
goto fail_vnrele;
+ xfs_inode_shrinker_register(mp);
+
kfree(mtpt);
xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
@@ -1842,6 +1897,7 @@ init_xfs_fs(void)
goto out_cleanup_procfs;
vfs_initquota();
+ xfs_inode_shrinker_init();
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1871,6 +1927,7 @@ exit_xfs_fs(void)
{
vfs_exitquota();
unregister_filesystem(&xfs_fs_type);
+ xfs_inode_shrinker_destroy();
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 961df0a22c78..c82683ad1484 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -64,7 +64,6 @@ xfs_inode_ag_lookup(
* as the tree is sparse and a gang lookup walks to find
* the number of objects requested.
*/
- read_lock(&pag->pag_ici_lock);
if (tag == XFS_ICI_NO_TAG) {
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)&ip, *first_index, 1);
@@ -73,7 +72,7 @@ xfs_inode_ag_lookup(
(void **)&ip, *first_index, 1, tag);
}
if (!nr_found)
- goto unlock;
+ return NULL;
/*
* Update the index for the next lookup. Catch overflows
@@ -83,13 +82,8 @@ xfs_inode_ag_lookup(
*/
*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- goto unlock;
-
+ return NULL;
return ip;
-
-unlock:
- read_unlock(&pag->pag_ici_lock);
- return NULL;
}
STATIC int
@@ -99,7 +93,9 @@ xfs_inode_ag_walk(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int exclusive,
+ int *nr_to_scan)
{
struct xfs_perag *pag = &mp->m_perag[ag];
uint32_t first_index;
@@ -113,10 +109,20 @@ restart:
int error = 0;
xfs_inode_t *ip;
+ if (exclusive)
+ write_lock(&pag->pag_ici_lock);
+ else
+ read_lock(&pag->pag_ici_lock);
ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
- if (!ip)
+ if (!ip) {
+ if (exclusive)
+ write_unlock(&pag->pag_ici_lock);
+ else
+ read_unlock(&pag->pag_ici_lock);
break;
+ }
+ /* execute releases pag->pag_ici_lock */
error = execute(ip, pag, flags);
if (error == EAGAIN) {
skipped++;
@@ -124,13 +130,12 @@ restart:
}
if (error)
last_error = error;
- /*
- * bail out if the filesystem is corrupted.
- */
+
+ /* bail out if the filesystem is corrupted. */
if (error == EFSCORRUPTED)
break;
- } while (1);
+ } while ((*nr_to_scan)--);
if (skipped) {
delay(1);
@@ -147,22 +152,31 @@ xfs_inode_ag_iterator(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int exclusive,
+ int *nr_to_scan)
{
int error = 0;
int last_error = 0;
xfs_agnumber_t ag;
+ int nr;
+ nr = nr_to_scan ? *nr_to_scan : INT_MAX;
for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
if (!mp->m_perag[ag].pag_ici_init)
continue;
- error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+ error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+ exclusive, &nr);
if (error) {
last_error = error;
if (error == EFSCORRUPTED)
break;
}
+ if (nr <= 0)
+ break;
}
+ if (nr_to_scan)
+ *nr_to_scan = nr;
return XFS_ERROR(last_error);
}
@@ -173,30 +187,31 @@ xfs_sync_inode_valid(
struct xfs_perag *pag)
{
struct inode *inode = VFS_I(ip);
+ int error = EFSCORRUPTED;
/* nothing to sync during shutdown */
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- read_unlock(&pag->pag_ici_lock);
- return EFSCORRUPTED;
- }
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ goto out_unlock;
- /*
- * If we can't get a reference on the inode, it must be in reclaim.
- * Leave it for the reclaim code to flush. Also avoid inodes that
- * haven't been fully initialised.
- */
- if (!igrab(inode)) {
- read_unlock(&pag->pag_ici_lock);
- return ENOENT;
- }
- read_unlock(&pag->pag_ici_lock);
+ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+ error = ENOENT;
+ if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ goto out_unlock;
- if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+ /* If we can't grab the inode, it must on it's way to reclaim. */
+ if (!igrab(inode))
+ goto out_unlock;
+
+ if (is_bad_inode(inode)) {
IRELE(ip);
- return ENOENT;
+ goto out_unlock;
}
- return 0;
+ /* inode is valid */
+ error = 0;
+out_unlock:
+ read_unlock(&pag->pag_ici_lock);
+ return error;
}
STATIC int
@@ -281,7 +296,7 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0, NULL);
if (error)
return XFS_ERROR(error);
@@ -303,7 +318,7 @@ xfs_sync_attr(
ASSERT((flags & ~SYNC_WAIT) == 0);
return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0, NULL);
}
STATIC int
@@ -663,35 +678,71 @@ xfs_syncd_stop(
kthread_stop(mp->m_sync_task);
}
-int
-xfs_reclaim_inode(
- xfs_inode_t *ip,
- int locked,
- int sync_mode)
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
{
- xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
- /* The hash lock here protects a thread in xfs_iget_core from
- * racing with us on linking the inode back with a vnode.
- * Once we have the XFS_IRECLAIM flag set it will not touch
- * us.
- */
write_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
- !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+ __xfs_inode_set_reclaim_tag(pag, ip);
+ __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
+ xfs_put_perag(mp, pag);
+}
+
+void
+__xfs_inode_clear_reclaim_tag(
+ xfs_mount_t *mp,
+ xfs_perag_t *pag,
+ xfs_inode_t *ip)
+{
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ pag->pag_ici_reclaimable--;
+}
+
+STATIC int
+xfs_reclaim_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int sync_mode)
+{
+ /*
+ * The radix tree lock here protects a thread in xfs_iget from racing
+ * with us starting reclaim on the inode. Once we have the
+ * XFS_IRECLAIM flag set it will not touch us.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* ignore as it is already under reclaim */
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
- if (locked) {
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- return -EAGAIN;
+ return 0;
}
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
- xfs_put_perag(ip->i_mount, pag);
/*
* If the inode is still dirty, then flush it out. If the inode
@@ -704,10 +755,8 @@ xfs_reclaim_inode(
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
- if (!locked) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_iflock(ip);
/*
* In the case of a forced shutdown we rely on xfs_iflush() to
@@ -724,68 +773,94 @@ xfs_reclaim_inode(
return 0;
}
-void
-__xfs_inode_set_reclaim_tag(
- struct xfs_perag *pag,
- struct xfs_inode *ip)
+int
+xfs_reclaim_inodes(
+ xfs_mount_t *mp,
+ int mode)
{
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
+ return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
+ XFS_ICI_RECLAIM_TAG, 1, NULL);
}
/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
+ * Shrinker infrastructure.
+ *
+ * This is all far more complex than it needs to be. It adds a global list of
+ * mounts because the shrinkers can only call a global context. We need to make
+ * the shrinkers pass a context to avoid the need for global state.
*/
-void
-xfs_inode_set_reclaim_tag(
- xfs_inode_t *ip)
+static LIST_HEAD(xfs_mount_list);
+static struct rw_semaphore xfs_mount_list_lock;
+
+static int
+xfs_reclaim_inode_shrink(
+ int nr_to_scan,
+ gfp_t gfp_mask)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
+ struct xfs_mount *mp;
+ xfs_agnumber_t ag;
+ int reclaimable = 0;
+
+ if (nr_to_scan) {
+ if (!(gfp_mask & __GFP_FS))
+ return -1;
+
+ down_read(&xfs_mount_list_lock);
+ list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+ xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+ XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
+ if (nr_to_scan <= 0)
+ break;
+ }
+ up_read(&xfs_mount_list_lock);
+ }
- read_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- __xfs_inode_set_reclaim_tag(pag, ip);
- __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
+ down_read(&xfs_mount_list_lock);
+ list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+ for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+
+ if (!mp->m_perag[ag].pag_ici_init)
+ continue;
+ reclaimable += mp->m_perag[ag].pag_ici_reclaimable;
+ }
+ }
+ up_read(&xfs_mount_list_lock);
+ return reclaimable;
}
-void
-__xfs_inode_clear_reclaim_tag(
- xfs_mount_t *mp,
- xfs_perag_t *pag,
- xfs_inode_t *ip)
+static struct shrinker xfs_inode_shrinker = {
+ .shrink = xfs_reclaim_inode_shrink,
+ .seeks = DEFAULT_SEEKS,
+};
+
+void __init
+xfs_inode_shrinker_init(void)
{
- radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ init_rwsem(&xfs_mount_list_lock);
+ register_shrinker(&xfs_inode_shrinker);
}
-STATIC int
-xfs_reclaim_inode_now(
- struct xfs_inode *ip,
- struct xfs_perag *pag,
- int flags)
+void
+xfs_inode_shrinker_destroy(void)
{
- /* ignore if already under reclaim */
- if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
- read_unlock(&pag->pag_ici_lock);
- return 0;
- }
- read_unlock(&pag->pag_ici_lock);
+ ASSERT(list_empty(&xfs_mount_list));
+ unregister_shrinker(&xfs_inode_shrinker);
+}
- return xfs_reclaim_inode(ip, 0, flags);
+void
+xfs_inode_shrinker_register(
+ struct xfs_mount *mp)
+{
+ down_write(&xfs_mount_list_lock);
+ list_add_tail(&mp->m_mplist, &xfs_mount_list);
+ up_write(&xfs_mount_list_lock);
}
-int
-xfs_reclaim_inodes(
- xfs_mount_t *mp,
- int mode)
+void
+xfs_inode_shrinker_unregister(
+ struct xfs_mount *mp)
{
- return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
- XFS_ICI_RECLAIM_TAG);
+ down_write(&xfs_mount_list_lock);
+ list_del(&mp->m_mplist);
+ up_write(&xfs_mount_list_lock);
}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 27920eb7a820..0b28c13bdf94 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
void xfs_flush_inodes(struct xfs_inode *ip);
-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
@@ -55,6 +54,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
- int flags, int tag);
+ int flags, int tag, int write_lock, int *nr_to_scan);
+
+void xfs_inode_shrinker_init(void);
+void xfs_inode_shrinker_destroy(void);
+void xfs_inode_shrinker_register(struct xfs_mount *mp);
+void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index a5346630dfae..97b410c12794 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
be64_to_cpu(dp->d_blk_hardlimit);
if (limit && statp->f_blocks > limit) {
statp->f_blocks = limit;
- statp->f_bfree =
+ statp->f_bfree = statp->f_bavail =
(statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
(statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
}
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d1a3b98a6e6..60fe35821cff 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -893,7 +893,8 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
+ XFS_ICI_NO_TAG, 0, NULL);
}
/*------------------------------------------------------------------------*/
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index a5d54bf4931b..381fba77b28d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -215,6 +215,7 @@ typedef struct xfs_perag
int pag_ici_init; /* incore inode cache initialised */
rwlock_t pag_ici_lock; /* incore inode lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
+ int pag_ici_reclaimable; /* reclaimable inodes */
#endif
} xfs_perag_t;
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 2cf944eb796d..4cd1c23b77f0 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2703,45 +2703,35 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
xfs_mount_t *mp;
xfs_perag_busy_t *bsy;
xfs_agblock_t uend, bend;
- xfs_lsn_t lsn;
+ xfs_lsn_t lsn = 0;
int cnt;
mp = tp->t_mountp;
spin_lock(&mp->m_perag[agno].pagb_lock);
- cnt = mp->m_perag[agno].pagb_count;
-
uend = bno + len - 1;
- /* search pagb_list for this slot, skipping open slots */
- for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
-
- /*
- * (start1,length1) within (start2, length2)
- */
- if (bsy->busy_tp != NULL) {
- bend = bsy->busy_start + bsy->busy_length - 1;
- if ((bno > bend) || (uend < bsy->busy_start)) {
- cnt--;
- } else {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy",
- "found1", agno, bno, len, tp);
- break;
- }
- }
- }
-
/*
- * If a block was found, force the log through the LSN of the
- * transaction that freed the block
+ * search pagb_list for this slot, skipping open slots. We have to
+ * search the entire array as there may be multiple overlaps and
+ * we have to get the most recent LSN for the log force to push out
+ * all the transactions that span the range.
*/
- if (cnt) {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
- lsn = bsy->busy_tp->t_commit_lsn;
- spin_unlock(&mp->m_perag[agno].pagb_lock);
- xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
- } else {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
- spin_unlock(&mp->m_perag[agno].pagb_lock);
+ for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
+ bsy = &mp->m_perag[agno].pagb_list[cnt];
+ if (!bsy->busy_tp)
+ continue;
+ bend = bsy->busy_start + bsy->busy_length - 1;
+ if (bno > bend || uend < bsy->busy_start)
+ continue;
+
+ /* (start1,length1) within (start2, length2) */
+ if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
+ lsn = bsy->busy_tp->t_commit_lsn;
}
+ spin_unlock(&mp->m_perag[agno].pagb_lock);
+ TRACE_BUSYSEARCH("xfs_alloc_search_busy", lsn ? "found" : "not-found",
+ agno, bno, len, tp);
+ if (lsn)
+ xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
}
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index ab89a7e94a0f..e4ccd9f24829 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -62,7 +62,9 @@ xfs_swapext(
goto out;
}
- if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
+ if (!(file->f_mode & FMODE_WRITE) ||
+ !(file->f_mode & FMODE_READ) ||
+ (file->f_flags & O_APPEND)) {
error = XFS_ERROR(EBADF);
goto out_put_file;
}
@@ -74,6 +76,7 @@ xfs_swapext(
}
if (!(target_file->f_mode & FMODE_WRITE) ||
+ !(target_file->f_mode & FMODE_READ) ||
(target_file->f_flags & O_APPEND)) {
error = XFS_ERROR(EBADF);
goto out_put_target_file;
@@ -113,10 +116,82 @@ xfs_swapext(
return error;
}
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip) /* tmp inode */
+{
+
+ /* Should never get a local format */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ return EINVAL;
+
+ /*
+ * if the target inode has less extents that then temporary inode then
+ * why did userspace call us?
+ */
+ if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+ return EINVAL;
+
+ /*
+ * if the target inode is in extent form and the temp inode is in btree
+ * form then we will end up with the target inode in the wrong format
+ * as we already know there are less extents in the temp inode.
+ */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ return EINVAL;
+
+ /* Check temp in extent form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+ return EINVAL;
+
+ /* Check target in extent form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+ return EINVAL;
+
+ /* Check root block of temp in btree form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_BOFF(ip) &&
+ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+
+ /* Check root block of target in btree form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_BOFF(tip) &&
+ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+
+ return 0;
+}
+
int
xfs_swap_extents(
- xfs_inode_t *ip,
- xfs_inode_t *tip,
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip, /* tmp inode */
xfs_swapext_t *sxp)
{
xfs_mount_t *mp;
@@ -160,13 +235,6 @@ xfs_swap_extents(
goto out_unlock;
}
- /* Should never get a local format */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- error = XFS_ERROR(EINVAL);
- goto out_unlock;
- }
-
if (VN_CACHED(VFS_I(tip)) != 0) {
xfs_inval_cached_trace(tip, 0, -1, 0, -1);
error = xfs_flushinval_pages(tip, 0, -1,
@@ -189,13 +257,12 @@ xfs_swap_extents(
goto out_unlock;
}
- /*
- * If the target has extended attributes, the tmp file
- * must also in order to ensure the correct data fork
- * format.
- */
- if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
- error = XFS_ERROR(EINVAL);
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_fs_cmn_err(CE_NOTE, mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __FILE__, ip->i_ino);
goto out_unlock;
}
@@ -276,6 +343,16 @@ xfs_swap_extents(
*tifp = *tempifp; /* struct copy */
/*
+ * Fix the in-memory data fork values that are dependent on the fork
+ * offset in the inode. We can't assume they remain the same as attr2
+ * has dynamic fork offsets.
+ */
+ ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
+ (uint)sizeof(xfs_bmbt_rec_t);
+ tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
+ (uint)sizeof(xfs_bmbt_rec_t);
+
+ /*
* Fix the on-disk inode values
*/
tmp = (__uint64_t)ip->i_d.di_nblocks;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2d0b3e1da9e6..6f83f58c099f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -611,7 +611,7 @@ xfs_fs_log_dummy(
xfs_inode_t *ip;
int error;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 80e526489be5..a04a72fbbb65 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -228,13 +228,12 @@ xfs_iget_cache_hit(
xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * We need to set XFS_INEW atomically with clearing the
- * reclaimable tag so that we do have an indicator of the
- * inode still being initialized.
+ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+ * from stomping over us while we recycle the inode. We can't
+ * clear the radix tree reclaimable tag yet as it requires
+ * pag_ici_lock to be held exclusive.
*/
- ip->i_flags |= XFS_INEW;
- ip->i_flags &= ~XFS_IRECLAIMABLE;
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ ip->i_flags |= XFS_IRECLAIM;
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
@@ -253,7 +252,15 @@ xfs_iget_cache_hit(
__xfs_inode_set_reclaim_tag(pag, ip);
goto out_error;
}
+
+ write_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
+ ip->i_flags |= XFS_INEW;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
} else {
/* If the VFS inode is being torn down, pause and try again. */
if (!igrab(inode)) {
@@ -511,17 +518,21 @@ xfs_ireclaim(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
XFS_STATS_INC(xs_ig_reclaims);
/*
- * Remove the inode from the per-AG radix tree. It doesn't matter
- * if it was never added to it because radix_tree_delete can deal
- * with that case just fine.
+ * Remove the inode from the per-AG radix tree.
+ *
+ * Because radix_tree_delete won't complain even if the item was never
+ * added to the tree assert that it's been there before to catch
+ * problems with the inode life time early on.
*/
pag = xfs_get_perag(mp, ip->i_ino);
write_lock(&pag->pag_ici_lock);
- radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
+ if (!radix_tree_delete(&pag->pag_ici_root, agino))
+ ASSERT(0);
write_unlock(&pag->pag_ici_lock);
xfs_put_perag(mp, pag);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b92a4fa2a0a1..523a1ae4964d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2877,8 +2877,8 @@ xfs_iflush(
mp = ip->i_mount;
/*
- * If the inode isn't dirty, then just release the inode
- * flush lock and do nothing.
+ * If the inode isn't dirty, then just release the inode flush lock and
+ * do nothing.
*/
if (xfs_inode_clean(ip)) {
xfs_ifunlock(ip);
@@ -2904,6 +2904,19 @@ xfs_iflush(
xfs_iunpin_wait(ip);
/*
+ * For stale inodes we cannot rely on the backing buffer remaining
+ * stale in cache for the remaining life of the stale inode and so
+ * xfs_itobp() below may give us a buffer that no longer contains
+ * inodes below. We have to check this after ensuring the inode is
+ * unpinned so that it is safe to reclaim the stale inode after the
+ * flush call.
+ */
+ if (xfs_iflags_test(ip, XFS_ISTALE)) {
+ xfs_ifunlock(ip);
+ return 0;
+ }
+
+ /*
* This may have been unpinned because the filesystem is shutting
* down forcibly. If that's the case we must not write this inode
* to disk, because the log record didn't make it to disk!
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67ae5555a30a..7294abce6ef2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -860,8 +860,15 @@ xfs_iomap_write_unwritten(
* set up a transaction to convert the range of extents
* from unwritten to real. Do allocations in a loop until
* we have covered the range passed in.
+ *
+ * Note that we open code the transaction allocation here
+ * to pass KM_NOFS--we can't risk to recursing back into
+ * the filesystem here as we might be asked to write out
+ * the same inode that we complete here and might deadlock
+ * on the iolock.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, resblks,
XFS_WRITE_LOG_RES(mp), 0,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index fb17f8226b09..b5b0d8055910 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3517,7 +3517,7 @@ xlog_do_recovery_pass(
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
- xfs_caddr_t bufaddr, offset;
+ xfs_caddr_t offset;
xfs_buf_t *hbp, *dbp;
int error = 0, h_size;
int bblks, split_bblks;
@@ -3610,7 +3610,7 @@ xlog_do_recovery_pass(
/*
* Check for header wrapping around physical end-of-log
*/
- offset = NULL;
+ offset = XFS_BUF_PTR(hbp);
split_hblks = 0;
wrapped_hblks = 0;
if (blk_no + hblks <= log->l_logBBsize) {
@@ -3646,9 +3646,8 @@ xlog_do_recovery_pass(
* - order is important.
*/
wrapped_hblks = hblks - split_hblks;
- bufaddr = XFS_BUF_PTR(hbp);
error = XFS_BUF_SET_PTR(hbp,
- bufaddr + BBTOB(split_hblks),
+ offset + BBTOB(split_hblks),
BBTOB(hblks - split_hblks));
if (error)
goto bread_err2;
@@ -3658,14 +3657,10 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = XFS_BUF_SET_PTR(hbp, bufaddr,
+ error = XFS_BUF_SET_PTR(hbp, offset,
BBTOB(hblks));
if (error)
goto bread_err2;
-
- if (!offset)
- offset = xlog_align(log, 0,
- wrapped_hblks, hbp);
}
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead,
@@ -3685,7 +3680,7 @@ xlog_do_recovery_pass(
} else {
/* This log record is split across the
* physical end of log */
- offset = NULL;
+ offset = XFS_BUF_PTR(dbp);
split_bblks = 0;
if (blk_no != log->l_logBBsize) {
/* some data is before the physical
@@ -3714,9 +3709,8 @@ xlog_do_recovery_pass(
* _first_, then the log start (LR header end)
* - order is important.
*/
- bufaddr = XFS_BUF_PTR(dbp);
error = XFS_BUF_SET_PTR(dbp,
- bufaddr + BBTOB(split_bblks),
+ offset + BBTOB(split_bblks),
BBTOB(bblks - split_bblks));
if (error)
goto bread_err2;
@@ -3727,13 +3721,9 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
+ error = XFS_BUF_SET_PTR(dbp, offset, h_size);
if (error)
goto bread_err2;
-
- if (!offset)
- offset = xlog_align(log, wrapped_hblks,
- bblks - split_bblks, dbp);
}
xlog_unpack_data(rhead, offset, log);
if ((error = xlog_recover_process_data(log, rhash,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8b6c9e807efb..4d509f742bd2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1471,7 +1471,7 @@ xfs_log_sbcount(
if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
return 0;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
XFS_DEFAULT_LOG_COUNT);
if (error) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a6c023bc0fb2..08fdb6d43efd 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -209,6 +209,7 @@ typedef struct xfs_mount {
__uint64_t m_maxioffset; /* maximum inode offset */
__uint64_t m_resblks; /* total reserved blocks */
__uint64_t m_resblks_avail;/* available reserved blocks */
+ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
int m_sinoalign; /* stripe unit inode alignment */
@@ -242,6 +243,7 @@ typedef struct xfs_mount {
wait_queue_head_t m_wait_single_sync_task;
__int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */
+ struct list_head m_mplist; /* inode shrinker mount list */
} xfs_mount_t;
/*
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f5e4874c37d8..726014d1c925 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -37,13 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
}
/*
- * Flags for xfs_free_eofblocks
- */
-#define XFS_FREE_EOF_LOCK (1<<0)
-#define XFS_FREE_EOF_NOLOCK (1<<1)
-
-
-/*
* helper function to extract extent size hint from inode
*/
STATIC_INLINE xfs_extlen_t
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 66b849358e62..237badcbac3b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -236,19 +236,20 @@ xfs_trans_alloc(
uint type)
{
xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
- return _xfs_trans_alloc(mp, type);
+ return _xfs_trans_alloc(mp, type, KM_SLEEP);
}
xfs_trans_t *
_xfs_trans_alloc(
xfs_mount_t *mp,
- uint type)
+ uint type,
+ uint memflags)
{
xfs_trans_t *tp;
atomic_inc(&mp->m_active_trans);
- tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+ tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
tp->t_magic = XFS_TRANS_MAGIC;
tp->t_type = type;
tp->t_mountp = mp;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ed47fc77759c..a0574f593f52 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -924,7 +924,7 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces.
*/
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
uint, uint);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b572f7e840e0..38a63244e75d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -69,7 +69,6 @@ xfs_setattr(
uint commit_flags=0;
uid_t uid=0, iuid=0;
gid_t gid=0, igid=0;
- int timeflags = 0;
struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
int need_iolock = 1;
@@ -134,16 +133,13 @@ xfs_setattr(
if (flags & XFS_ATTR_NOLOCK)
need_iolock = 0;
if (!(mask & ATTR_SIZE)) {
- if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) ||
- (mp->m_flags & XFS_MOUNT_WSYNC)) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- commit_flags = 0;
- if ((code = xfs_trans_reserve(tp, 0,
- XFS_ICHANGE_LOG_RES(mp), 0,
- 0, 0))) {
- lock_flags = 0;
- goto error_return;
- }
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+ commit_flags = 0;
+ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
+ 0, 0, 0);
+ if (code) {
+ lock_flags = 0;
+ goto error_return;
}
} else {
if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
@@ -294,15 +290,23 @@ xfs_setattr(
* or we are explicitly asked to change it. This handles
* the semantic difference between truncate() and ftruncate()
* as implemented in the VFS.
+ *
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
+ * is a special case where we need to update the times despite
+ * not having these flags set. For all other operations the
+ * VFS set these flags explicitly if it wants a timestamp
+ * update.
*/
- if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME))
- timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+ if (iattr->ia_size != ip->i_size &&
+ (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ iattr->ia_ctime = iattr->ia_mtime =
+ current_fs_time(inode->i_sb);
+ mask |= ATTR_CTIME | ATTR_MTIME;
+ }
if (iattr->ia_size > ip->i_size) {
ip->i_d.di_size = iattr->ia_size;
ip->i_size = iattr->ia_size;
- if (!(flags & XFS_ATTR_DMI))
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
} else if (iattr->ia_size <= ip->i_size ||
(iattr->ia_size == 0 && ip->i_d.di_nextents)) {
@@ -373,9 +377,6 @@ xfs_setattr(
ip->i_d.di_gid = gid;
inode->i_gid = gid;
}
-
- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
- timeflags |= XFS_ICHGTIME_CHG;
}
/*
@@ -392,51 +393,37 @@ xfs_setattr(
inode->i_mode &= S_IFMT;
inode->i_mode |= mode & ~S_IFMT;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- timeflags |= XFS_ICHGTIME_CHG;
}
/*
* Change file access or modified times.
*/
- if (mask & (ATTR_ATIME|ATTR_MTIME)) {
- if (mask & ATTR_ATIME) {
- inode->i_atime = iattr->ia_atime;
- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
- ip->i_update_core = 1;
- }
- if (mask & ATTR_MTIME) {
- inode->i_mtime = iattr->ia_mtime;
- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- timeflags &= ~XFS_ICHGTIME_MOD;
- timeflags |= XFS_ICHGTIME_CHG;
- }
- if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+ if (mask & ATTR_ATIME) {
+ inode->i_atime = iattr->ia_atime;
+ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+ ip->i_update_core = 1;
}
-
- /*
- * Change file inode change time only if ATTR_CTIME set
- * AND we have been called by a DMI function.
- */
-
- if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
+ if (mask & ATTR_CTIME) {
inode->i_ctime = iattr->ia_ctime;
ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
ip->i_update_core = 1;
- timeflags &= ~XFS_ICHGTIME_CHG;
+ }
+ if (mask & ATTR_MTIME) {
+ inode->i_mtime = iattr->ia_mtime;
+ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+ ip->i_update_core = 1;
}
/*
- * Send out timestamp changes that need to be set to the
- * current time. Not done when called by a DMI function.
+ * And finally, log the inode core if any attribute in it
+ * has been changed.
*/
- if (timeflags && !(flags & XFS_ATTR_DMI))
- xfs_ichgtime(ip, timeflags);
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
+ ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(xs_ig_attrchg);
@@ -451,12 +438,10 @@ xfs_setattr(
* mix so this probably isn't worth the trouble to optimize.
*/
code = 0;
- if (tp) {
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(tp);
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
- code = xfs_trans_commit(tp, commit_flags);
- }
+ code = xfs_trans_commit(tp, commit_flags);
xfs_iunlock(ip, lock_flags);
@@ -612,7 +597,7 @@ xfs_fsync(
{
xfs_trans_t *tp;
int error = 0;
- int log_flushed = 0, changed = 1;
+ int log_flushed = 0;
xfs_itrace_entry(ip);
@@ -642,19 +627,11 @@ xfs_fsync(
* disk yet, the inode will be still be pinned. If it is,
* force the log.
*/
-
xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
if (xfs_ipincount(ip)) {
error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
XFS_LOG_FORCE | XFS_LOG_SYNC,
&log_flushed);
- } else {
- /*
- * If the inode is not pinned and nothing has changed
- * we don't need to flush the cache.
- */
- changed = 0;
}
} else {
/*
@@ -689,7 +666,7 @@ xfs_fsync(
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
- if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
+ if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
/*
* If the log write didn't issue an ordered tag we need
* to flush the disk cache for the data device now.
@@ -709,6 +686,11 @@ xfs_fsync(
}
/*
+ * Flags for xfs_free_eofblocks
+ */
+#define XFS_FREE_EOF_TRYLOCK (1<<0)
+
+/*
* This is called by xfs_inactive to free any blocks beyond eof
* when the link count isn't zero and by xfs_dm_punch_hole() when
* punching a hole to EOF.
@@ -726,7 +708,6 @@ xfs_free_eofblocks(
xfs_filblks_t map_len;
int nimaps;
xfs_bmbt_irec_t imap;
- int use_iolock = (flags & XFS_FREE_EOF_LOCK);
/*
* Figure out if there are any blocks beyond the end
@@ -768,14 +749,19 @@ xfs_free_eofblocks(
* cache and we can't
* do that within a transaction.
*/
- if (use_iolock)
+ if (flags & XFS_FREE_EOF_TRYLOCK) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ xfs_trans_cancel(tp, 0);
+ return 0;
+ }
+ } else {
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ }
error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
ip->i_size);
if (error) {
xfs_trans_cancel(tp, 0);
- if (use_iolock)
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return error;
}
@@ -812,8 +798,7 @@ xfs_free_eofblocks(
error = xfs_trans_commit(tp,
XFS_TRANS_RELEASE_LOG_RES);
}
- xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
- : XFS_ILOCK_EXCL));
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
}
return error;
}
@@ -1113,7 +1098,17 @@ xfs_release(
(ip->i_df.if_flags & XFS_IFEXTENTS)) &&
(!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
+
+ /*
+ * If we can't get the iolock just skip truncating
+ * the blocks past EOF because we could deadlock
+ * with the mmap_sem otherwise. We'll get another
+ * chance to drop them once the last reference to
+ * the inode is dropped, so we'll never leak blocks
+ * permanently.
+ */
+ error = xfs_free_eofblocks(mp, ip,
+ XFS_FREE_EOF_TRYLOCK);
if (error)
return error;
}
@@ -1184,7 +1179,7 @@ xfs_inactive(
(!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
(ip->i_delayed_blks != 0)))) {
- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
+ error = xfs_free_eofblocks(mp, ip, 0);
if (error)
return VN_INACTIVE_CACHE;
}
@@ -2456,46 +2451,6 @@ xfs_set_dmattrs(
return error;
}
-int
-xfs_reclaim(
- xfs_inode_t *ip)
-{
-
- xfs_itrace_entry(ip);
-
- ASSERT(!VN_MAPPED(VFS_I(ip)));
-
- /* bad inode, get out here ASAP */
- if (is_bad_inode(VFS_I(ip))) {
- xfs_ireclaim(ip);
- return 0;
- }
-
- xfs_ioend_wait(ip);
-
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
-
- /*
- * If we have nothing to flush with this inode then complete the
- * teardown now, otherwise break the link between the xfs inode and the
- * linux inode and clean up the xfs inode later. This avoids flushing
- * the inode to disk during the delete operation itself.
- *
- * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
- * first to ensure that xfs_iunpin() will never see an xfs inode
- * that has a linux inode being reclaimed. Synchronisation is provided
- * by the i_flags_lock.
- */
- if (!ip->i_update_core && (ip->i_itemp == NULL)) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
- }
- xfs_inode_set_reclaim_tag(ip);
- return 0;
-}
-
/*
* xfs_alloc_file_space()
* This routine allocates disk space for the given file.
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index a9e102de71a1..167a467403a5 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, mode_t mode, struct xfs_inode **ipp,
cred_t *credp);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_reclaim(struct xfs_inode *ip);
int xfs_change_file_space(struct xfs_inode *ip, int cmd,
xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,