From 54566b2c1594c2326a645a3551f9d989f7ba3c5e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 4 Jan 2009 12:00:53 -0800 Subject: fs: symlink write_begin allocation context fix With the write_begin/write_end aops, page_symlink was broken because it could no longer pass a GFP_NOFS type mask into the point where the allocations happened. They are done in write_begin, which would always assume that the filesystem can be entered from reclaim. This bug could cause filesystem deadlocks. The funny thing with having a gfp_t mask there is that it doesn't really allow the caller to arbitrarily tinker with the context in which it can be called. It couldn't ever be GFP_ATOMIC, for example, because it needs to take the page lock. The only thing any callers care about is __GFP_FS anyway, so turn that into a single flag. Add a new flag for write_begin, AOP_FLAG_NOFS. Filesystems can now act on this flag in their write_begin function. Change __grab_cache_page to accept a nofs argument as well, to honour that flag (while we're there, change the name to grab_cache_page_write_begin which is more instructive and does away with random leading underscores). This is really a more flexible way to go in the end anyway -- if a filesystem happens to want any extra allocations aside from the pagecache ones in ints write_begin function, it may now use GFP_KERNEL (rather than GFP_NOFS) for common case allocations (eg. ocfs2_alloc_write_ctxt, for a random example). [kosaki.motohiro@jp.fujitsu.com: fix ubifs] [kosaki.motohiro@jp.fujitsu.com: fix fuse] Signed-off-by: Nick Piggin Reviewed-by: KOSAKI Motohiro Cc: [2.6.28.x] Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton [ Cleaned up the calling convention: just pass in the AOP flags untouched to the grab_cache_page_write_begin() function. That just simplifies everybody, and may even allow future expansion of the logic. - Linus ] Signed-off-by: Linus Torvalds --- fs/buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 776ae091d3b0..a13f09b696f7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1996,7 +1996,7 @@ int block_write_begin(struct file *file, struct address_space *mapping, page = *pagep; if (page == NULL) { ownpage = 1; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { status = -ENOMEM; goto out; @@ -2502,7 +2502,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; -- cgit v1.2.3 From 69e9930993cfd70d82c8d9dd96fc3a88854d06fc Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Tue, 6 Jan 2009 14:40:19 -0800 Subject: block_write_begin(): remove useless goto Signed-off-by: Franck Bui-Huu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index a13f09b696f7..c26da785938a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2022,7 +2022,6 @@ int block_write_begin(struct file *file, struct address_space *mapping, if (pos + len > inode->i_size) vmtruncate(inode, inode->i_size); } - goto out; } out: -- cgit v1.2.3 From c4be0c1dc4cdc37b175579be1460f15ac6495e9a Mon Sep 17 00:00:00 2001 From: Takashi Sato Date: Fri, 9 Jan 2009 16:40:58 -0800 Subject: filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato Signed-off-by: Masayuki Hamaguchi Cc: Cc: Cc: Christoph Hellwig Cc: Dave Kleikamp Cc: Dave Chinner Cc: Alasdair G Kergon Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index c26da785938a..87f9e537b8c3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -221,8 +221,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(sb->s_bdev); - if (sb->s_op->write_super_lockfs) - sb->s_op->write_super_lockfs(sb); + if (sb->s_op->freeze_fs) + sb->s_op->freeze_fs(sb); } sync_blockdev(bdev); @@ -242,8 +242,8 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb) if (sb) { BUG_ON(sb->s_bdev != bdev); - if (sb->s_op->unlockfs) - sb->s_op->unlockfs(sb); + if (sb->s_op->unfreeze_fs) + sb->s_op->unfreeze_fs(sb); sb->s_frozen = SB_UNFROZEN; smp_wmb(); wake_up(&sb->s_wait_unfrozen); -- cgit v1.2.3 From fcccf502540e3d752d33b2d8e976034dee81f9f7 Mon Sep 17 00:00:00 2001 From: Takashi Sato Date: Fri, 9 Jan 2009 16:40:59 -0800 Subject: filesystem freeze: implement generic freeze feature The ioctls for the generic freeze feature are below. o Freeze the filesystem int ioctl(int fd, int FIFREEZE, arg) fd: The file descriptor of the mountpoint FIFREEZE: request code for the freeze arg: Ignored Return value: 0 if the operation succeeds. Otherwise, -1 o Unfreeze the filesystem int ioctl(int fd, int FITHAW, arg) fd: The file descriptor of the mountpoint FITHAW: request code for unfreeze arg: Ignored Return value: 0 if the operation succeeds. Otherwise, -1 Error number: If the filesystem has already been unfrozen, errno is set to EINVAL. [akpm@linux-foundation.org: fix CONFIG_BLOCK=n] Signed-off-by: Takashi Sato Signed-off-by: Masayuki Hamaguchi Cc: Cc: Cc: Christoph Hellwig Cc: Dave Kleikamp Cc: Dave Chinner Cc: Alasdair G Kergon Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 9 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 87f9e537b8c3..b6e8b8632e2f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -203,10 +203,25 @@ int fsync_bdev(struct block_device *bdev) * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. */ struct super_block *freeze_bdev(struct block_device *bdev) { struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + bdev->bd_fsfreeze_count++; + sb = get_super(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + bdev->bd_fsfreeze_count++; down(&bdev->bd_mount_sem); sb = get_super(bdev); @@ -221,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(sb->s_bdev); - if (sb->s_op->freeze_fs) - sb->s_op->freeze_fs(sb); + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + drop_super(sb); + up(&bdev->bd_mount_sem); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + } } sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ } EXPORT_SYMBOL(freeze_bdev); @@ -237,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev); * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ -void thaw_bdev(struct block_device *bdev, struct super_block *sb) +int thaw_bdev(struct block_device *bdev, struct super_block *sb) { + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EINVAL; + } + + bdev->bd_fsfreeze_count--; + if (bdev->bd_fsfreeze_count > 0) { + if (sb) + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + if (sb) { BUG_ON(sb->s_bdev != bdev); - - if (sb->s_op->unfreeze_fs) - sb->s_op->unfreeze_fs(sb); - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + if (!(sb->s_flags & MS_RDONLY)) { + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } + } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + } drop_super(sb); } up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; } EXPORT_SYMBOL(thaw_bdev); -- cgit v1.2.3 From bdc480e3bef6eb0e7071770834cbdda7e30a5436 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:12 +0100 Subject: [CVE-2009-0029] System call wrappers part 10 Signed-off-by: Heiko Carstens --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index b6e8b8632e2f..b58208f1640a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3243,7 +3243,7 @@ void block_sync_page(struct page *page) * Use of bdflush() is deprecated and will be removed in a future kernel. * The `pdflush' kernel threads fully replace bdflush daemons and this call. */ -asmlinkage long sys_bdflush(int func, long data) +SYSCALL_DEFINE2(bdflush, int, func, long, data) { static int msg_count; -- cgit v1.2.3