summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-12-31 08:31:57 +0100
committerIngo Molnar <mingo@elte.hu>2008-12-31 08:31:57 +0100
commita9de18eb761f7c1c860964b2e5addc1a35c7e861 (patch)
tree886e75fdfd09690cd262ca69cb7f5d1d42b48602 /fs/ext4
parentb2aaf8f74cdc84a9182f6cabf198b7763bcb9d40 (diff)
parent6a94cb73064c952255336cc57731904174b2c58f (diff)
Merge branch 'linus' into stackprotector
Conflicts: arch/x86/include/asm/pda.h kernel/fork.c
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Kconfig79
-rw-r--r--fs/ext4/balloc.c93
-rw-r--r--fs/ext4/dir.c20
-rw-r--r--fs/ext4/ext4.h4
-rw-r--r--fs/ext4/ext4_sb.h3
-rw-r--r--fs/ext4/ialloc.c6
-rw-r--r--fs/ext4/inode.c150
-rw-r--r--fs/ext4/mballoc.c264
-rw-r--r--fs/ext4/mballoc.h31
-rw-r--r--fs/ext4/namei.c12
-rw-r--r--fs/ext4/super.c201
11 files changed, 436 insertions, 427 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
new file mode 100644
index 000000000000..7505482a08fa
--- /dev/null
+++ b/fs/ext4/Kconfig
@@ -0,0 +1,79 @@
+config EXT4_FS
+ tristate "The Extended 4 (ext4) filesystem"
+ select JBD2
+ select CRC16
+ help
+ This is the next generation of the ext3 filesystem.
+
+ Unlike the change from ext2 filesystem to ext3 filesystem,
+ the on-disk format of ext4 is not forwards compatible with
+ ext3; it is based on extent maps and it supports 48-bit
+ physical block numbers. The ext4 filesystem also supports delayed
+ allocation, persistent preallocation, high resolution time stamps,
+ and a number of other features to improve performance and speed
+ up fsck time. For more information, please see the web pages at
+ http://ext4.wiki.kernel.org.
+
+ The ext4 filesystem will support mounting an ext3
+ filesystem; while there will be some performance gains from
+ the delayed allocation and inode table readahead, the best
+ performance gains will require enabling ext4 features in the
+ filesystem, or formating a new filesystem as an ext4
+ filesystem initially.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called ext4.
+
+ If unsure, say N.
+
+config EXT4DEV_COMPAT
+ bool "Enable ext4dev compatibility"
+ depends on EXT4_FS
+ help
+ Starting with 2.6.28, the name of the ext4 filesystem was
+ renamed from ext4dev to ext4. Unfortunately there are some
+ legacy userspace programs (such as klibc's fstype) have
+ "ext4dev" hardcoded.
+
+ To enable backwards compatibility so that systems that are
+ still expecting to mount ext4 filesystems using ext4dev,
+ chose Y here. This feature will go away by 2.6.31, so
+ please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+ bool "Ext4 extended attributes"
+ depends on EXT4_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+ You need this for POSIX ACL support on ext4.
+
+config EXT4_FS_POSIX_ACL
+ bool "Ext4 POSIX Access Control Lists"
+ depends on EXT4_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
+
+config EXT4_FS_SECURITY
+ bool "Ext4 Security Labels"
+ depends on EXT4_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext4 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd2ece228827..38b3acf5683b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
/* this isn't the right place to decide whether block is metadata
* inode.c/extents.c knows better, but for safety ... */
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
- ext4_should_journal_data(inode))
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ metadata = 1;
+
+ /* We need to make sure we don't reuse
+ * block released untill the transaction commit.
+ * writeback mode have weak data consistency so
+ * don't force data as metadata when freeing block
+ * for writeback mode.
+ */
+ if (metadata == 0 && !ext4_should_writeback_data(inode))
metadata = 1;
sb = inode->i_sb;
@@ -581,26 +589,28 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
return;
}
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- s64 nblocks)
+/**
+ * ext4_has_free_blocks()
+ * @sbi: in-core super block structure.
+ * @nblocks: number of needed blocks
+ *
+ * Check if filesystem has nblocks free & available for allocation.
+ * On success return 1, return 0 on failure.
+ */
+int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
{
- s64 free_blocks, dirty_blocks;
- s64 root_blocks = 0;
+ s64 free_blocks, dirty_blocks, root_blocks;
struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
free_blocks = percpu_counter_read_positive(fbc);
dirty_blocks = percpu_counter_read_positive(dbc);
-
- if (!capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
- root_blocks = ext4_r_blocks_count(sbi->s_es);
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
EXT4_FREEBLOCKS_WATERMARK) {
- free_blocks = percpu_counter_sum(fbc);
- dirty_blocks = percpu_counter_sum(dbc);
+ free_blocks = percpu_counter_sum_positive(fbc);
+ dirty_blocks = percpu_counter_sum_positive(dbc);
if (dirty_blocks < 0) {
printk(KERN_CRIT "Dirty block accounting "
"went wrong %lld\n",
@@ -608,57 +618,32 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
}
}
/* Check whether we have space after
- * accounting for current dirty blocks
+ * accounting for current dirty blocks & root reserved blocks.
*/
- if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
- /* we don't have free space */
- return -ENOSPC;
+ if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
+ return 1;
+
+ /* Hm, nope. Are (enough) root reserved blocks available? */
+ if (sbi->s_resuid == current_fsuid() ||
+ ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+ capable(CAP_SYS_RESOURCE)) {
+ if (free_blocks >= (nblocks + dirty_blocks))
+ return 1;
+ }
- /* Add the blocks to nblocks */
- percpu_counter_add(dbc, nblocks);
return 0;
}
-/**
- * ext4_has_free_blocks()
- * @sbi: in-core super block structure.
- * @nblocks: number of neeed blocks
- *
- * Check if filesystem has free blocks available for allocation.
- * Return the number of blocks avaible for allocation for this request
- * On success, return nblocks
- */
-ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
s64 nblocks)
{
- s64 free_blocks, dirty_blocks;
- s64 root_blocks = 0;
- struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
- struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
-
- free_blocks = percpu_counter_read_positive(fbc);
- dirty_blocks = percpu_counter_read_positive(dbc);
-
- if (!capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
- root_blocks = ext4_r_blocks_count(sbi->s_es);
-
- if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
- EXT4_FREEBLOCKS_WATERMARK) {
- free_blocks = percpu_counter_sum(fbc);
- dirty_blocks = percpu_counter_sum(dbc);
- }
- if (free_blocks <= (root_blocks + dirty_blocks))
- /* we don't have free space */
+ if (ext4_has_free_blocks(sbi, nblocks)) {
+ percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
return 0;
-
- if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
- return free_blocks - (root_blocks + dirty_blocks);
- return nblocks;
+ } else
+ return -ENOSPC;
}
-
/**
* ext4_should_retry_alloc()
* @sb: super block
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3ca6a2b7632d..fed5b610df5a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -459,17 +459,8 @@ static int ext4_dx_readdir(struct file *filp,
if (info->extra_fname) {
if (call_filldir(filp, dirent, filldir, info->extra_fname))
goto finished;
-
info->extra_fname = NULL;
- info->curr_node = rb_next(info->curr_node);
- if (!info->curr_node) {
- if (info->next_hash == ~0) {
- filp->f_pos = EXT4_HTREE_EOF;
- goto finished;
- }
- info->curr_hash = info->next_hash;
- info->curr_minor_hash = 0;
- }
+ goto next_node;
} else if (!info->curr_node)
info->curr_node = rb_first(&info->root);
@@ -501,9 +492,14 @@ static int ext4_dx_readdir(struct file *filp,
info->curr_minor_hash = fname->minor_hash;
if (call_filldir(filp, dirent, filldir, fname))
break;
-
+ next_node:
info->curr_node = rb_next(info->curr_node);
- if (!info->curr_node) {
+ if (info->curr_node) {
+ fname = rb_entry(info->curr_node, struct fname,
+ rb_hash);
+ info->curr_hash = fname->hash;
+ info->curr_minor_hash = fname->minor_hash;
+ } else {
if (info->next_hash == ~0) {
filp->f_pos = EXT4_HTREE_EOF;
break;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6690a41cdd9f..b0537c827024 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,7 +511,6 @@ do { \
/*
* Mount flags
*/
-#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */
#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
@@ -1004,8 +1003,7 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, ext4_fsblk_t goal,
unsigned long *count, int *errp);
extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
- s64 nblocks);
+extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6a0b40d43264..445fde603df8 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -99,9 +99,6 @@ struct ext4_sb_info {
struct inode *s_buddy_cache;
long s_blocks_reserved;
spinlock_t s_reserve_lock;
- struct list_head s_active_transaction;
- struct list_head s_closed_transaction;
- struct list_head s_committed_transaction;
spinlock_t s_md_lock;
tid_t s_last_transaction;
unsigned short *s_mb_offsets, *s_mb_maxs;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fe34d74cfb19..08cac9fcace2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -718,6 +718,8 @@ got:
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
free = ext4_free_blocks_after_init(sb, group, gdp);
gdp->bg_free_blocks_count = cpu_to_le16(free);
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
+ gdp);
}
spin_unlock(sb_bgl_lock(sbi, group));
@@ -785,7 +787,7 @@ got:
spin_unlock(sb_bgl_lock(sbi, flex_group));
}
- inode->i_uid = current->fsuid;
+ inode->i_uid = current_fsuid();
if (test_opt(sb, GRPID))
inode->i_gid = dir->i_gid;
else if (dir->i_mode & S_ISGID) {
@@ -793,7 +795,7 @@ got:
if (S_ISDIR(mode))
mode |= S_ISGID;
} else
- inode->i_gid = current->fsgid;
+ inode->i_gid = current_fsgid();
inode->i_mode = mode;
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9b4ec9decfd1..be21a5ae33cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
+ long pages_skipped;
BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0);
@@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
end = mpd->next_page - 1;
while (index <= end) {
- /* XXX: optimize tail */
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ /*
+ * We can use PAGECACHE_TAG_DIRTY lookup here because
+ * even though we have cleared the dirty flag on the page
+ * We still keep the page in the radix tree with tag
+ * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+ * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+ * which is called via the below writepage callback.
+ */
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index,
+ (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- index = page->index;
- if (index > end)
- break;
- index++;
-
+ pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
- if (!err)
+ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ /*
+ * have successfully written the page
+ * without skipping the same
+ */
mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
- long to_write;
int ret;
if (!mpd->get_block)
@@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd->pages_written = 0;
mpd->retval = 0;
- to_write = wbc->nr_to_write;
-
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
/*
* Handle last extent of pages
*/
if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd);
- }
- wbc->nr_to_write = to_write - mpd->pages_written;
+ mpd->io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd->pages_written;
return ret;
}
@@ -2320,6 +2329,8 @@ static int ext4_da_writepage(struct page *page,
unlock_page(page);
return 0;
}
+ /* now mark the buffer_heads as dirty and uptodate */
+ block_commit_write(page, 0, PAGE_CACHE_SIZE);
}
if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2360,12 +2371,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
- loff_t range_start = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
+ int no_nrwrite_index_update;
+ long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
- long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
/*
@@ -2385,23 +2398,26 @@ static int ext4_da_writepages(struct address_space *mapping,
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
- if (!wbc->range_cyclic)
- /*
- * If range_cyclic is not set force range_cont
- * and save the old writeback_index
- */
- wbc->range_cont = 1;
-
- range_start = wbc->range_start;
- pages_skipped = wbc->pages_skipped;
+ if (wbc->range_cyclic)
+ index = mapping->writeback_index;
+ else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
mpd.inode = mapping->host;
-restart_loop:
- to_write = wbc->nr_to_write;
- while (!ret && to_write > 0) {
+ /*
+ * we don't want write_cache_pages to update
+ * nr_to_write and writeback_index
+ */
+ no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+ wbc->no_nrwrite_index_update = 1;
+ pages_skipped = wbc->pages_skipped;
+
+ while (!ret && wbc->nr_to_write > 0) {
/*
* we insert one extent at a time. So we need
@@ -2422,48 +2438,53 @@ restart_loop:
dump_stack();
goto out_writepages;
}
- to_write -= wbc->nr_to_write;
-
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);
ext4_journal_stop(handle);
- if (mpd.retval == -ENOSPC)
+ if (mpd.retval == -ENOSPC) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
jbd2_journal_force_commit_nested(sbi->s_journal);
-
- /* reset the retry count */
- if (ret == MPAGE_DA_EXTENT_TAIL) {
+ wbc->pages_skipped = pages_skipped;
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
- to_write += wbc->nr_to_write;
+ pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipped;
ret = 0;
- } else if (wbc->nr_to_write) {
+ } else if (wbc->nr_to_write)
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
* and we found the device congested
*/
- to_write += wbc->nr_to_write;
break;
- }
- wbc->nr_to_write = to_write;
- }
-
- if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
- /* We skipped pages in this loop */
- wbc->range_start = range_start;
- wbc->nr_to_write = to_write +
- wbc->pages_skipped - pages_skipped;
- wbc->pages_skipped = pages_skipped;
- goto restart_loop;
}
+ if (pages_skipped != wbc->pages_skipped)
+ printk(KERN_EMERG "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
+
+ /* Update index */
+ index += pages_written;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = index;
out_writepages:
- wbc->nr_to_write = to_write - nr_to_writebump;
- wbc->range_start = range_start;
+ if (!no_nrwrite_index_update)
+ wbc->no_nrwrite_index_update = 0;
+ wbc->nr_to_write -= nr_to_writebump;
return ret;
}
@@ -4175,7 +4196,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
struct inode *inode = &(ei->vfs_inode);
u64 i_blocks = inode->i_blocks;
struct super_block *sb = inode->i_sb;
- int err = 0;
if (i_blocks <= ~0U) {
/*
@@ -4185,36 +4205,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = 0;
ei->i_flags &= ~EXT4_HUGE_FILE_FL;
- } else if (i_blocks <= 0xffffffffffffULL) {
+ return 0;
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ return -EFBIG;
+
+ if (i_blocks <= 0xffffffffffffULL) {
/*
* i_blocks can be represented in a 48 bit variable
* as multiple of 512 bytes
*/
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
- /* i_block is stored in the split 48 bit fields */
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
ei->i_flags &= ~EXT4_HUGE_FILE_FL;
} else {
- /*
- * i_blocks should be represented in a 48 bit variable
- * as multiple of file system block size
- */
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
ei->i_flags |= EXT4_HUGE_FILE_FL;
/* i_block is stored in file system block size */
i_blocks = i_blocks >> (inode->i_blkbits - 9);
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
}
-err_out:
- return err;
+ return 0;
}
/*
@@ -4571,9 +4582,10 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
- return ext4_indirect_trans_blocks(inode, nrblocks, 0);
- return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
+ return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
+ return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
}
+
/*
* Account for index blocks, block groups bitmaps and block group
* descriptor blocks if modify datablocks and index blocks
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b580714f0d85..444ad998f72e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;;
#ifdef DOUBLE_CHECK
{
@@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
}
spin_lock_init(&sbi->s_md_lock);
- INIT_LIST_HEAD(&sbi->s_active_transaction);
- INIT_LIST_HEAD(&sbi->s_closed_transaction);
- INIT_LIST_HEAD(&sbi->s_committed_transaction);
spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
+ sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
@@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
list_del(&pa->pa_group_list);
count++;
- kfree(pa);
+ kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
mb_debug("mballoc: %u PAs left\n", count);
@@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- /* release freed, non-committed blocks */
- spin_lock(&sbi->s_md_lock);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_committed_transaction);
- spin_unlock(&sbi->s_md_lock);
- ext4_mb_free_committed_blocks(sb);
-
if (sbi->s_group_info) {
for (i = 0; i < sbi->s_groups_count; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb)
return 0;
}
-static noinline_for_stack void
-ext4_mb_free_committed_blocks(struct super_block *sb)
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
+ struct super_block *sb = journal->j_private;
struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;
+ ext4_fsblk_t discard_block;
+ struct list_head *l, *ltmp;
- if (list_empty(&sbi->s_committed_transaction))
- return;
-
- /* there is committed blocks to be freed yet */
- do {
- /* get next array of blocks */
- md = NULL;
- spin_lock(&sbi->s_md_lock);
- if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
- }
- spin_unlock(&sbi->s_md_lock);
-
- if (md == NULL)
- break;
+ list_for_each_safe(l, ltmp, &txn->t_private_list) {
+ entry = list_entry(l, struct ext4_free_data, list);
mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
+ entry->count, entry->group, entry);
- err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
+ db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
- count += md->num;
+ count += entry->count;
count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+ ext4_lock_group(sb, entry->group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
-
- kfree(md);
+ ext4_unlock_group(sb, entry->group);
+ discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+ + entry->start_blk
+ + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
+ (unsigned long long) discard_block, entry->count);
+ sb_issue_discard(sb, discard_block, entry->count);
+
+ kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
-
- } while (md);
+ }
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
{
+#ifdef CONFIG_PROC_FS
mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct proc_dir_entry *proc;
@@ -2735,10 +2723,14 @@ err_out:
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
return -ENOMEM;
+#else
+ return 0;
+#endif
}
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
{
+#ifdef CONFIG_PROC_FS
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_proc == NULL)
@@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-
+#endif
return 0;
}
@@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
+
+ ext4_free_ext_cachep =
+ kmem_cache_create("ext4_free_block_extents",
+ sizeof(struct ext4_free_data),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ if (ext4_free_ext_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
return 0;
}
@@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void)
/* XXX: synchronize_rcu(); */
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_ext_cachep);
}
@@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out1;
}
- ext4_mb_poll_new_transaction(sb, handle);
-
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
@@ -4384,35 +4385,20 @@ out1:
return block;
}
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
- handle_t *handle)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_last_transaction == handle->h_transaction->t_tid)
- return;
-
- /* new transaction! time to close last one and free blocks for
- * committed transaction. we know that only transaction can be
- * active, so previos transaction can be being logged and we
- * know that transaction before previous is known to be already
- * logged. this means that now we may free blocks freed in all
- * transactions before previous one. hope I'm clear enough ... */
- spin_lock(&sbi->s_md_lock);
- if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
- mb_debug("new transaction %lu, old %lu\n",
- (unsigned long) handle->h_transaction->t_tid,
- (unsigned long) sbi->s_last_transaction);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_closed_transaction);
- sbi->s_last_transaction = handle->h_transaction->t_tid;
- }
- spin_unlock(&sbi->s_md_lock);
-
- ext4_mb_free_committed_blocks(sb);
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ if ((entry1->t_tid == entry2->t_tid) &&
+ (entry1->group == entry2->group) &&
+ ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ return 1;
+ return 0;
}
static noinline_for_stack int
@@ -4422,57 +4408,81 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct ext4_free_data *entry, *new_entry;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
- ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
- }
+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = block;
+ new_entry->group = group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ new_node = &new_entry->node;
- if (md == NULL) {
+ ext4_lock_group(sb, group);
+ if (!*n) {
+ /* first free block exent. We need to
+ protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, node);
+ if (block < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (block >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
+ ext4_error(sb, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
+ return 0;
+ }
+ }
- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);
+
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
+ }
- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
+ /* Add the extent to transaction's private list */
+ spin_lock(&sbi->s_md_lock);
+ list_add(&new_entry->list, &handle->h_transaction->t_private_list);
+ spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
}
@@ -4500,8 +4510,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
*freed = 0;
- ext4_mb_poll_new_transaction(sb, handle);
-
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b3b4828f8b89..b5dff1fff1e5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/marker.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "group.h"
@@ -98,23 +100,29 @@
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS 30
+struct ext4_free_data {
+ /* this links the free block information from group_info */
+ struct rb_node node;
-struct ext4_free_metadata {
- ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+ /* this links the free block information from ext4_sb_info */
struct list_head list;
+
+ /* group which free block extent belongs */
+ ext4_group_t group;
+
+ /* free block extent */
+ ext4_grpblk_t start_blk;
+ ext4_grpblk_t count;
+
+ /* transaction which freed this extent */
+ tid_t t_tid;
};
struct ext4_group_info {
unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
+ struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;
@@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
static void ext4_mb_return_to_preallocation(struct inode *inode,
struct ext4_buddy *e4b, sector_t block,
int count);
@@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
struct super_block *, struct ext4_prealloc_space *pa);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92db9e945147..63adcb792988 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1061,7 +1061,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
struct dentry *ext4_get_parent(struct dentry *child)
{
unsigned long ino;
- struct dentry *parent;
struct inode *inode;
static const struct qstr dotdot = {
.name = "..",
@@ -1083,16 +1082,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
return ERR_PTR(-EIO);
}
- inode = ext4_iget(child->d_inode->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- parent = d_alloc_anon(inode);
- if (!parent) {
- iput(inode);
- parent = ERR_PTR(-ENOMEM);
- }
- return parent;
+ return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}
#define S_SHIFT 12
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dea8f13c2fd9..04158ad74dbb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -333,7 +333,8 @@ void ext4_abort(struct super_block *sb, const char *function,
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
sb->s_flags |= MS_RDONLY;
EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
- jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+ if (EXT4_SB(sb)->s_journal)
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
}
void ext4_warning(struct super_block *sb, const char *function,
@@ -374,66 +375,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
-int ext4_update_compat_feature(handle_t *handle,
- struct super_block *sb, __u32 compat)
-{
- int err = 0;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_COMPAT_FEATURE(sb, compat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
-int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat)
-{
- int err = 0;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
-int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat)
-{
- int err = 0;
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
/*
* Open the external journal device
*/
@@ -459,7 +400,7 @@ fail:
static int ext4_blkdev_put(struct block_device *bdev)
{
bd_release(bdev);
- return blkdev_put(bdev);
+ return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
}
static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -502,14 +443,16 @@ static void ext4_put_super(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- int i;
+ int i, err;
ext4_mb_release(sb);
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
- if (jbd2_journal_destroy(sbi->s_journal) < 0)
- ext4_abort(sb, __func__, "Couldn't clean up the journal");
+ err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
+ if (err < 0)
+ ext4_abort(sb, __func__, "Couldn't clean up the journal");
+
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -904,7 +847,7 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
@@ -915,7 +858,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_inode_readahead_blks
};
@@ -933,8 +876,6 @@ static const match_table_t tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_nouid32, "nouid32"},
- {Opt_nocheck, "nocheck"},
- {Opt_nocheck, "check=none"},
{Opt_debug, "debug"},
{Opt_oldalloc, "oldalloc"},
{Opt_orlov, "orlov"},
@@ -973,8 +914,6 @@ static const match_table_t tokens = {
{Opt_extents, "extents"},
{Opt_noextents, "noextents"},
{Opt_i_version, "i_version"},
- {Opt_mballoc, "mballoc"},
- {Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
@@ -1073,9 +1012,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_nouid32:
set_opt(sbi->s_mount_opt, NO_UID32);
break;
- case Opt_nocheck:
- clear_opt(sbi->s_mount_opt, CHECK);
- break;
case Opt_debug:
set_opt(sbi->s_mount_opt, DEBUG);
break;
@@ -1522,9 +1458,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
/* We allocate both existing and potentially added groups */
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
- ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
- EXT4_DESC_PER_BLOCK_BITS(sb))) /
- groups_per_flex;
+ ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
sbi->s_flex_groups = kzalloc(flex_group_count *
sizeof(struct flex_groups), GFP_KERNEL);
if (sbi->s_flex_groups == NULL) {
@@ -1618,14 +1553,14 @@ static int ext4_check_descriptors(struct super_block *sb)
if (block_bitmap < first_block || block_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Block bitmap for group %lu not in group "
- "(block %llu)!", i, block_bitmap);
+ "(block %llu)!\n", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
if (inode_bitmap < first_block || inode_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode bitmap for group %lu not in group "
- "(block %llu)!", i, inode_bitmap);
+ "(block %llu)!\n", i, inode_bitmap);
return 0;
}
inode_table = ext4_inode_table(sb, gdp);
@@ -1633,7 +1568,7 @@ static int ext4_check_descriptors(struct super_block *sb)
inode_table + sbi->s_itb_per_group - 1 > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode table for group %lu not in group "
- "(block %llu)!", i, inode_table);
+ "(block %llu)!\n", i, inode_table);
return 0;
}
spin_lock(sb_bgl_lock(sbi, i));
@@ -1778,15 +1713,15 @@ static void ext4_orphan_cleanup(struct super_block *sb,
*
* Note, this does *not* consider any metadata overhead for vfs i_blocks.
*/
-static loff_t ext4_max_size(int blkbits)
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
loff_t res;
loff_t upper_limit = MAX_LFS_FILESIZE;
/* small i_blocks in vfs inode? */
- if (sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * CONFIG_LSF is not enabled implies the inode
+ * CONFIG_LBD is not enabled implies the inode
* i_block represent total blocks in 512 bytes
* 32 == size of vfs inode i_blocks * 8
*/
@@ -1814,7 +1749,7 @@ static loff_t ext4_max_size(int blkbits)
* block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
* We need to be 1 filesystem block less than the 2^48 sector limit.
*/
-static loff_t ext4_max_bitmap_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
loff_t res = EXT4_NDIR_BLOCKS;
int meta_blocks;
@@ -1827,11 +1762,11 @@ static loff_t ext4_max_bitmap_size(int bits)
* total number of 512 bytes blocks of the file
*/
- if (sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * CONFIG_LSF is not enabled implies the inode
- * i_block represent total blocks in 512 bytes
- * 32 == size of vfs inode i_blocks * 8
+ * !has_huge_files or CONFIG_LBD is not enabled
+ * implies the inode i_block represent total blocks in
+ * 512 bytes 32 == size of vfs inode i_blocks * 8
*/
upper_limit = (1LL << 32) - 1;
@@ -1940,7 +1875,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
int blocksize;
int db_count;
int i;
- int needs_recovery;
+ int needs_recovery, has_huge_files;
__le32 features;
__u64 blocks_count;
int err;
@@ -2081,16 +2016,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_id, le32_to_cpu(features));
goto failed_mount;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ if (has_huge_files) {
/*
* Large file size enabled file system can only be
- * mount if kernel is build with CONFIG_LSF
+ * mount if kernel is build with CONFIG_LBD
*/
if (sizeof(root->i_blocks) < sizeof(u64) &&
!(sb->s_flags & MS_RDONLY)) {
printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
"files cannot be mounted read-write "
- "without CONFIG_LSF.\n", sb->s_id);
+ "without CONFIG_LBD.\n", sb->s_id);
goto failed_mount;
}
}
@@ -2131,8 +2068,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
- sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+ sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+ has_huge_files);
+ sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2456,6 +2394,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available.\n");
}
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
+ ext4_ext_init(sb);
+ err = ext4_mb_init(sb, needs_recovery);
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+ err);
+ goto failed_mount4;
+ }
+
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2475,21 +2428,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
- "requested data journaling mode\n");
- clear_opt(sbi->s_mount_opt, DELALLOC);
- } else if (test_opt(sb, DELALLOC))
- printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-
- ext4_ext_init(sb);
- err = ext4_mb_init(sb, needs_recovery);
- if (err) {
- printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
- err);
- goto failed_mount4;
- }
-
lock_kernel();
return 0;
@@ -2617,7 +2555,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
if (bd_claim(bdev, sb)) {
printk(KERN_ERR
"EXT4: failed to claim external journal device.\n");
- blkdev_put(bdev);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
return NULL;
}
@@ -2946,12 +2884,9 @@ int ext4_force_commit(struct super_block *sb)
/*
* Ext4 always journals updates to the superblock itself, so we don't
* have to propagate any other updates to the superblock on disk at this
- * point. Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
+ * point. (We can probably nuke this function altogether, and remove
+ * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
*/
-
static void ext4_write_super(struct super_block *sb)
{
if (mutex_trylock(&sb->s_lock) != 0)
@@ -2961,15 +2896,15 @@ static void ext4_write_super(struct super_block *sb)
static int ext4_sync_fs(struct super_block *sb, int wait)
{
- tid_t target;
+ int ret = 0;
trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
sb->s_dirt = 0;
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
- if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
- }
- return 0;
+ if (wait)
+ ret = ext4_force_commit(sb);
+ else
+ jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+ return ret;
}
/*
@@ -3392,30 +3327,30 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
* Standard function to be called on quota_on
*/
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
- char *path, int remount)
+ char *name, int remount)
{
int err;
- struct nameidata nd;
+ struct path path;
if (!test_opt(sb, QUOTA))
return -EINVAL;
- /* When remounting, no checks are needed and in fact, path is NULL */
+ /* When remounting, no checks are needed and in fact, name is NULL */
if (remount)
- return vfs_quota_on(sb, type, format_id, path, remount);
+ return vfs_quota_on(sb, type, format_id, name, remount);
- err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+ err = kern_path(name, LOOKUP_FOLLOW, &path);
if (err)
return err;
/* Quotafile not on the same filesystem? */
- if (nd.path.mnt->mnt_sb != sb) {
- path_put(&nd.path);
+ if (path.mnt->mnt_sb != sb) {
+ path_put(&path);
return -EXDEV;
}
/* Journaling quota? */
if (EXT4_SB(sb)->s_qf_names[type]) {
/* Quotafile not in fs root? */
- if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
+ if (path.dentry->d_parent != sb->s_root)
printk(KERN_WARNING
"EXT4-fs: Quota file not on filesystem root. "
"Journaled quota will not work.\n");
@@ -3425,7 +3360,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
* When we journal data on quota file, we have to flush journal to see
* all updates to the file when we bypass pagecache...
*/
- if (ext4_should_journal_data(nd.path.dentry->d_inode)) {
+ if (ext4_should_journal_data(path.dentry->d_inode)) {
/*
* We don't need to lock updates but journal_flush() could
* otherwise be livelocked...
@@ -3434,13 +3369,13 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
if (err) {
- path_put(&nd.path);
+ path_put(&path);
return err;
}
}
- err = vfs_quota_on_path(sb, type, format_id, &nd.path);
- path_put(&nd.path);
+ err = vfs_quota_on_path(sb, type, format_id, &path);
+ path_put(&path);
return err;
}