From a0c32761e73c9999cbf592b702f284221fea8040 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 3 Apr 2014 14:46:20 -0700 Subject: sh: fix format string bug in stack tracer Kees reported the following error: arch/sh/kernel/dumpstack.c: In function 'print_trace_address': arch/sh/kernel/dumpstack.c:118:2: error: format not a string literal and no format arguments [-Werror=format-security] Use the "%s" format so that it's impossible to interpret 'data' as a format string. Signed-off-by: Matt Fleming Reported-by: Kees Cook Acked-by: Kees Cook Cc: Paul Mundt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/dumpstack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index b959f5592604..8dfe645bcc4b 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -115,7 +115,7 @@ static int print_trace_stack(void *data, char *name) */ static void print_trace_address(void *data, unsigned long addr, int reliable) { - printk(data); + printk("%s", (char *)data); printk_address(addr, reliable); } -- cgit v1.2.3 From 6ca738d60c563d5c6cf6253ee4b8e76fa77b2b9e Mon Sep 17 00:00:00 2001 From: Derek Basehore Date: Thu, 3 Apr 2014 14:46:22 -0700 Subject: backing_dev: fix hung task on sync bdi_wakeup_thread_delayed() used the mod_delayed_work() function to schedule work to writeback dirty inodes. The problem with this is that it can delay work that is scheduled for immediate execution, such as the work from sync_inodes_sb(). This can happen since mod_delayed_work() can now steal work from a work_queue. This fixes the problem by using queue_delayed_work() instead. This is a regression caused by commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue"). The reason that this causes a problem is that laptop-mode will change the delay, dirty_writeback_centisecs, to 60000 (10 minutes) by default. In the case that bdi_wakeup_thread_delayed() races with sync_inodes_sb(), sync will be stopped for 10 minutes and trigger a hung task. Even if dirty_writeback_centisecs is not long enough to cause a hung task, we still don't want to delay sync for that long. We fix the problem by using queue_delayed_work() when we want to schedule writeback sometime in future. This function doesn't change the timer if it is already armed. For the same reason, we also change bdi_writeback_workfn() to immediately queue the work again in the case that the work_list is not empty. The same problem can happen if the sync work is run on the rescue worker. [jack@suse.cz: update changelog, add comment, use bdi_wakeup_thread_delayed()] Signed-off-by: Derek Basehore Reviewed-by: Jan Kara Cc: Alexander Viro Reviewed-by: Tejun Heo Cc: Greg Kroah-Hartman Cc: "Darrick J. 
Wong" Cc: Derek Basehore Cc: Kees Cook Cc: Benson Leung Cc: Sonny Rao Cc: Luigi Semenzato Cc: Jens Axboe Cc: Dave Chinner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 8 ++++---- mm/backing-dev.c | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d754e3cf99a8..6788fe0c9761 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1039,10 +1039,10 @@ void bdi_writeback_workfn(struct work_struct *work) trace_writeback_pages_written(pages_written); } - if (!list_empty(&bdi->work_list) || - (wb_has_dirty_io(wb) && dirty_writeback_interval)) - queue_delayed_work(bdi_wq, &wb->dwork, - msecs_to_jiffies(dirty_writeback_interval * 10)); + if (!list_empty(&bdi->work_list)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + else if (wb_has_dirty_io(wb) && dirty_writeback_interval) + bdi_wakeup_thread_delayed(bdi); current->flags &= ~PF_SWAPWRITE; } diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ce682f7a4f29..fab8401fc54e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -288,13 +288,16 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). */ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); } /* -- cgit v1.2.3 From 5acda9d12dcf1ad0d9a5a2a7c646de3472fa7555 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:23 -0700 Subject: bdi: avoid oops on device removal After commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue") when device is removed while we are writing to it we crash in bdi_writeback_workfn() -> set_worker_desc() because bdi->dev is NULL. This can happen because even though bdi_unregister() cancels all pending flushing work, nothing really prevents new ones from being queued from balance_dirty_pages() or other places. Fix the problem by clearing BDI_registered bit in bdi_unregister() and checking it before scheduling of any flushing work. 
Fixes: 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Reviewed-by: Tejun Heo Signed-off-by: Jan Kara Cc: Derek Basehore Cc: Jens Axboe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 23 ++++++++++++++++++----- include/linux/backing-dev.h | 2 +- mm/backing-dev.c | 13 +++++++++---- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6788fe0c9761..a16315957ef3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -89,16 +89,29 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include +static void bdi_wakeup_thread(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + spin_unlock_bh(&bdi->wb_lock); +} + static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { trace_writeback_queue(bdi, work); spin_lock_bh(&bdi->wb_lock); + if (!test_bit(BDI_registered, &bdi->state)) { + if (work->done) + complete(work->done); + goto out_unlock; + } list_add_tail(&work->list, &bdi->work_list); - spin_unlock_bh(&bdi->wb_lock); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); +out_unlock: + spin_unlock_bh(&bdi->wb_lock); } static void @@ -114,7 +127,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { trace_writeback_nowork(bdi); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + bdi_wakeup_thread(bdi); return; } @@ -161,7 +174,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + bdi_wakeup_thread(bdi); } /* @@ -1017,7 +1030,7 @@ void bdi_writeback_workfn(struct work_struct *work) current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || - list_empty(&bdi->bdi_list))) { + !test_bit(BDI_registered, &bdi->state))) { /* * The normal path. Keep writing back @bdi until its * work_list is empty. 
Note that this path is also taken diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 24819001f5c8..e488e9459a93 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -95,7 +95,7 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects work_list */ + spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */ struct list_head work_list; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index fab8401fc54e..09d9591b7708 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -297,7 +297,10 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -310,9 +313,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); - - /* bdi_list is now unused, clear it to mark @bdi dying */ - INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -363,6 +363,11 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ bdi_remove_from_list(bdi); + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + clear_bit(BDI_registered, &bdi->state); + spin_unlock_bh(&bdi->wb_lock); + /* * Drain work list and shutdown the delayed_work. At this point, * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi -- cgit v1.2.3 From 81c98869faa5f3a9457c93efef908ef476326b31 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 3 Apr 2014 14:46:25 -0700 Subject: kthread: ensure locality of task_struct allocations In the presence of memoryless nodes, numa_node_id() will return the current CPU's NUMA node, but that may not be where we expect to allocate from memory from. Instead, we should rely on the fallback code in the memory allocator itself, by using NUMA_NO_NODE. Also, when calling kthread_create_on_node(), use the nearest node with memory to the cpu in question, rather than the node it is running on. 
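As a rough usage sketch of the second change: a caller creating a per-CPU helper thread passes cpu_to_mem(cpu) rather than cpu_to_node(cpu), so the task_struct lands on the nearest node that actually has memory. my_thread_fn, start_my_helper and the thread name format are hypothetical; kthread_create_on_node(), cpu_to_mem() and NUMA_NO_NODE are the real kernel symbols involved.

#include <linux/kthread.h>
#include <linux/topology.h>

static struct task_struct *start_my_helper(int (*my_thread_fn)(void *),
					   void *data, int cpu)
{
	/* cpu_to_node(cpu) may name a memoryless node; cpu_to_mem(cpu)
	 * returns the nearest node that actually has memory. */
	return kthread_create_on_node(my_thread_fn, data, cpu_to_mem(cpu),
				      "my_helper/%d", cpu);
}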
Signed-off-by: Nishanth Aravamudan Reviewed-by: Christoph Lameter Acked-by: David Rientjes Cc: Anton Blanchard Cc: Tejun Heo Cc: Oleg Nesterov Cc: Jan Kara Cc: Thomas Gleixner Cc: Tetsuo Handa Cc: Wanpeng Li Cc: Joonsoo Kim Cc: Ben Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index b5ae3ee860a9..9a130ec06f7a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -217,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk) if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif - return numa_node_id(); + return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), { struct task_struct *p; - p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, cpu); if (IS_ERR(p)) return p; -- cgit v1.2.3 From dc9b3f424903f7d6992778b69b1e35d864914ae5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Apr 2014 14:46:26 -0700 Subject: kmemleak: free internal objects only if there're no leaks to be reported Currently if you stop kmemleak thread before disabling kmemleak, kmemleak objects will be freed and so you won't be able to check previously reported leaks. With this patch, kmemleak objects won't be freed if there're leaks that can be reported. Signed-off-by: Li Zefan Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 31f01c5011e5..474379ea3cff 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -218,7 +218,8 @@ static int kmemleak_stack_scan = 1; static DEFINE_MUTEX(scan_mutex); /* setting kmemleak=on, will set this var, skipping the disable */ static int kmemleak_skip_disable; - +/* If there are leaks that can be reported */ +static bool kmemleak_found_leaks; /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -1382,9 +1383,12 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - if (new_leaks) + if (new_leaks) { + kmemleak_found_leaks = true; + pr_info("%d new suspected memory leaks (see " "/sys/kernel/debug/kmemleak)\n", new_leaks); + } } @@ -1592,6 +1596,8 @@ static void kmemleak_clear(void) spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); + + kmemleak_found_leaks = false; } /* @@ -1685,12 +1691,11 @@ static const struct file_operations kmemleak_fops = { static void kmemleak_do_cleanup(struct work_struct *work) { struct kmemleak_object *object; - bool cleanup = scan_thread == NULL; mutex_lock(&scan_mutex); stop_scan_thread(); - if (cleanup) { + if (!kmemleak_found_leaks) { rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) delete_object_full(object->pointer); -- cgit v1.2.3 From c89da70c7360294e715df5abd4b7239db3274c86 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Apr 2014 14:46:27 -0700 Subject: kmemleak: allow freeing internal objects after kmemleak was disabled Currently if kmemleak is disabled, the kmemleak objects can never be freed, no matter if it's disabled by a user or due to fatal errors. Those objects can be a big waste of memory. 
OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME 1200264 1197433 99% 0.30K 46164 26 369312K kmemleak_object With this patch, after kmemleak was disabled you can reclaim memory with: # echo clear > /sys/kernel/debug/kmemleak Also inform users about this with a printk. Signed-off-by: Li Zefan Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kmemleak.txt | 15 ++++++++++++++- mm/kmemleak.c | 46 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index b6e39739a36d..6c18be97f3dd 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt @@ -53,7 +53,8 @@ Memory scanning parameters can be modified at run-time by writing to the (default 600, 0 to stop the automatic scanning) scan - trigger a memory scan clear - clear list of current memory leak suspects, done by - marking all current reported unreferenced objects grey + marking all current reported unreferenced objects grey, + or free all kmemleak objects if kmemleak has been disabled. dump= - dump information about the object found at Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on @@ -120,6 +121,18 @@ Then as usual to get your report with: # cat /sys/kernel/debug/kmemleak +Freeing kmemleak internal objects +--------------------------------- + +To allow access to previosuly found memory leaks after kmemleak has been +disabled by the user or due to an fatal error, internal kmemleak objects +won't be freed when kmemleak is disabled, and those objects may occupy +a large part of physical memory. + +In this situation, you may reclaim memory with: + + # echo clear > /sys/kernel/debug/kmemleak + Kmemleak API ------------ diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 474379ea3cff..06cd22035a92 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1600,6 +1600,8 @@ static void kmemleak_clear(void) kmemleak_found_leaks = false; } +static void __kmemleak_do_cleanup(void); + /* * File write operation to configure kmemleak at run-time. The following * commands can be written to the /sys/kernel/debug/kmemleak file: @@ -1612,7 +1614,8 @@ static void kmemleak_clear(void) * disable it) * scan - trigger a memory scan * clear - mark all current reported unreferenced kmemleak objects as - * grey to ignore printing them + * grey to ignore printing them, or free all kmemleak objects + * if kmemleak has been disabled. * dump=... 
- dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, @@ -1622,9 +1625,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, int buf_size; int ret; - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; - buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; @@ -1634,6 +1634,19 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, if (ret < 0) return ret; + if (strncmp(buf, "clear", 5) == 0) { + if (atomic_read(&kmemleak_enabled)) + kmemleak_clear(); + else + __kmemleak_do_cleanup(); + goto out; + } + + if (!atomic_read(&kmemleak_enabled)) { + ret = -EBUSY; + goto out; + } + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1657,8 +1670,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } } else if (strncmp(buf, "scan", 4) == 0) kmemleak_scan(); - else if (strncmp(buf, "clear", 5) == 0) - kmemleak_clear(); else if (strncmp(buf, "dump=", 5) == 0) ret = dump_str_object_info(buf + 5); else @@ -1683,6 +1694,16 @@ static const struct file_operations kmemleak_fops = { .release = kmemleak_release, }; +static void __kmemleak_do_cleanup(void) +{ + struct kmemleak_object *object; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object_full(object->pointer); + rcu_read_unlock(); +} + /* * Stop the memory scanning thread and free the kmemleak internal objects if * no previous scan thread (otherwise, kmemleak may still have some useful @@ -1690,17 +1711,14 @@ static const struct file_operations kmemleak_fops = { */ static void kmemleak_do_cleanup(struct work_struct *work) { - struct kmemleak_object *object; - mutex_lock(&scan_mutex); stop_scan_thread(); - if (!kmemleak_found_leaks) { - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) - delete_object_full(object->pointer); - rcu_read_unlock(); - } + if (!kmemleak_found_leaks) + __kmemleak_do_cleanup(); + else + pr_info("Kmemleak disabled without freeing internal data. " + "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); mutex_unlock(&scan_mutex); } -- cgit v1.2.3 From 5f3bf19aeb8ed5cef0926bc10c80b6a50ac6bdeb Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Apr 2014 14:46:28 -0700 Subject: kmemleak: remove redundant code Remove kmemleak_padding() and kmemleak_release(). 
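The removed kmemleak_release() only forwarded to seq_release(), so the file_operations can point at seq_release() directly. A generic sketch of that idiom, with hypothetical names (my_open, my_fops, my_seq_ops):

#include <linux/fs.h>
#include <linux/seq_file.h>

extern const struct seq_operations my_seq_ops;	/* assumed defined elsewhere */

static int my_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &my_seq_ops);
}

static const struct file_operations my_fops = {
	.open		= my_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,		/* no trivial wrapper needed */
};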
Signed-off-by: Li Zefan Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemleak.h | 2 -- mm/kmemleak.c | 7 +------ 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 2a5e5548a1d2..5bb424659c04 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -30,8 +30,6 @@ extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; -extern void kmemleak_padding(const void *ptr, unsigned long offset, - size_t size) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 06cd22035a92..6f90d003830a 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1549,11 +1549,6 @@ static int kmemleak_open(struct inode *inode, struct file *file) return seq_open(file, &kmemleak_seq_ops); } -static int kmemleak_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - static int dump_str_object_info(const char *str) { unsigned long flags; @@ -1691,7 +1686,7 @@ static const struct file_operations kmemleak_fops = { .read = seq_read, .write = kmemleak_write, .llseek = seq_lseek, - .release = kmemleak_release, + .release = seq_release, }; static void __kmemleak_do_cleanup(void) -- cgit v1.2.3 From 8910ae896c8c961ef9c7d309262730bd2859e747 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Apr 2014 14:46:29 -0700 Subject: kmemleak: change some global variables to int They don't have to be atomic_t, because they are simple boolean toggles. Signed-off-by: Li Zefan Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 80 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 6f90d003830a..91d67eaee050 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -192,15 +192,15 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static atomic_t kmemleak_enabled = ATOMIC_INIT(0); +static int kmemleak_enabled; /* set in the late_initcall if there were no errors */ -static atomic_t kmemleak_initialized = ATOMIC_INIT(0); +static int kmemleak_initialized; /* enables or disables early logging of the memory operations */ -static atomic_t kmemleak_early_log = ATOMIC_INIT(1); +static int kmemleak_early_log = 1; /* set if a kmemleak warning was issued */ -static atomic_t kmemleak_warning = ATOMIC_INIT(0); +static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ -static atomic_t kmemleak_error = ATOMIC_INIT(0); +static int kmemleak_error; /* minimum and maximum address that may be valid pointers */ static unsigned long min_addr = ULONG_MAX; @@ -268,7 +268,7 @@ static void kmemleak_disable(void); #define kmemleak_warn(x...) 
do { \ pr_warning(x); \ dump_stack(); \ - atomic_set(&kmemleak_warning, 1); \ + kmemleak_warning = 1; \ } while (0) /* @@ -806,7 +806,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, unsigned long flags; struct early_log *log; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* kmemleak stopped recording, just count the requests */ crt_early_log++; return; @@ -841,7 +841,7 @@ static void early_alloc(struct early_log *log) unsigned long flags; int i; - if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) return; /* @@ -894,9 +894,9 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -920,11 +920,11 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) * Percpu allocations are only scanned and not reported as leaks * (min_count is set to 0). */ - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), size, 0, GFP_KERNEL); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -940,9 +940,9 @@ void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -960,9 +960,9 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -980,11 +980,11 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -1000,9 +1000,9 @@ void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -1020,9 +1020,9 @@ void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && 
ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1042,9 +1042,9 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -1062,9 +1062,9 @@ void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -1089,7 +1089,7 @@ static bool update_checksum(struct kmemleak_object *object) */ static int scan_should_stop(void) { - if (!atomic_read(&kmemleak_enabled)) + if (!kmemleak_enabled) return 1; /* @@ -1630,14 +1630,14 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, return ret; if (strncmp(buf, "clear", 5) == 0) { - if (atomic_read(&kmemleak_enabled)) + if (kmemleak_enabled) kmemleak_clear(); else __kmemleak_do_cleanup(); goto out; } - if (!atomic_read(&kmemleak_enabled)) { + if (!kmemleak_enabled) { ret = -EBUSY; goto out; } @@ -1726,14 +1726,14 @@ static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); static void kmemleak_disable(void) { /* atomically check whether it was already invoked */ - if (atomic_cmpxchg(&kmemleak_error, 0, 1)) + if (cmpxchg(&kmemleak_error, 0, 1)) return; /* stop any memory operation tracing */ - atomic_set(&kmemleak_enabled, 0); + kmemleak_enabled = 0; /* check whether it is too early for a kernel thread */ - if (atomic_read(&kmemleak_initialized)) + if (kmemleak_initialized) schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); @@ -1775,9 +1775,10 @@ void __init kmemleak_init(void) int i; unsigned long flags; + kmemleak_early_log = 0; + #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { - atomic_set(&kmemleak_early_log, 0); kmemleak_disable(); return; } @@ -1795,12 +1796,11 @@ void __init kmemleak_init(void) /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); - atomic_set(&kmemleak_early_log, 0); - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { local_irq_restore(flags); return; } else - atomic_set(&kmemleak_enabled, 1); + kmemleak_enabled = 1; local_irq_restore(flags); /* @@ -1844,9 +1844,9 @@ void __init kmemleak_init(void) log->op_type); } - if (atomic_read(&kmemleak_warning)) { + if (kmemleak_warning) { print_log_trace(log); - atomic_set(&kmemleak_warning, 0); + kmemleak_warning = 0; } } } @@ -1858,9 +1858,9 @@ static int __init kmemleak_late_init(void) { struct dentry *dentry; - atomic_set(&kmemleak_initialized, 1); + kmemleak_initialized = 1; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* * Some error occurred and kmemleak was disabled. 
There is a * small chance that kmemleak_disable() was called immediately -- cgit v1.2.3 From 9ee108b2c626eab894f5c669cda04933b492813d Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:46:30 -0700 Subject: fs/cifs/cifsfs.c: add __init to cifs_init_inodecache() cifs_init_inodecache is only called by __init init_cifs. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 849f6132b327..e8ae8323c058 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1005,7 +1005,7 @@ cifs_init_once(void *inode) init_rwsem(&cifsi->lock_sem); } -static int +static int __init cifs_init_inodecache(void) { cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", -- cgit v1.2.3 From ddae82d8e67fbef534e3a240da7e77cc8654462c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:46:31 -0700 Subject: fs/freevxfs/vxfs_lookup.c: update function comment nameidata was replaced by flags in commit 00cd8dd3bf95 ("stop passing nameidata to ->lookup()"). Signed-off-by: Fabian Frederick Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/freevxfs/vxfs_lookup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 25d4099a4aea..99c7f0a37af4 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * vxfs_lookup - lookup pathname component * @dip: dir in which we lookup * @dp: dentry we lookup - * @nd: lookup nameidata + * @flags: lookup flags * * Description: * vxfs_lookup tries to lookup the pathname component described -- cgit v1.2.3 From 3298cf37bee59c66a51da0cea8bae0d0418e27fd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:32 -0700 Subject: fanotify: remove useless bypass_perm check The prepare_for_access_response() function checks whether group->fanotify_data.bypass_perm is set. However this test can never be true because prepare_for_access_response() is called only from fanotify_read() which means fanotify group is alive with an active fd while bypass_perm is set from fanotify_release() when all file descriptors pointing to the group are closed and the group is going away. 
Signed-off-by: Jan Kara Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 287a22c04149..70fe65437d21 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -211,14 +211,6 @@ static int prepare_for_access_response(struct fsnotify_group *group, re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); - - if (atomic_read(&group->fanotify_data.bypass_perm)) { - mutex_unlock(&group->fanotify_data.access_mutex); - kmem_cache_free(fanotify_response_event_cache, re); - FANOTIFY_E(event)->response = FAN_ALLOW; - return 0; - } - list_add_tail(&re->list, &group->fanotify_data.access_list); mutex_unlock(&group->fanotify_data.access_mutex); -- cgit v1.2.3 From f083441ba86acb9e2ef9c1d1747725e488c8b1ff Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:33 -0700 Subject: fanotify: use fanotify event structure for permission response processing Currently, fanotify creates new structure to track the fact that permission event has been reported to userspace and someone is waiting for a response to it. As event structures are now completely in the hands of each notification framework, we can use the event structure for this tracking instead of allocating a new structure. Since this makes the event structures for normal events and permission events even more different and the structures have different lifetime rules, we split them into two separate structures (where permission event structure contains the structure for a normal event). This makes normal events 8 bytes smaller and the code a tad bit cleaner. 
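A stripped-down sketch of the layout this describes: the permission event embeds the normal event, and container_of() recovers the outer structure from a pointer to the embedded one. The names here (base_event, perm_event, to_perm_event) are illustrative; the real definitions are in fanotify.h below.

#include <linux/kernel.h>	/* container_of() */
#include <linux/list.h>
#include <linux/types.h>

struct base_event {				/* fields every event carries */
	struct list_head	list;
	u32			mask;
};

struct perm_event {				/* permission events only */
	struct base_event	base;		/* embedded normal event */
	int			response;	/* userspace answer */
	int			fd;		/* fd handed to userspace */
};

static inline struct perm_event *to_perm_event(struct base_event *e)
{
	return container_of(e, struct perm_event, base);
}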
[akpm@linux-foundation.org: fix build] Signed-off-by: Jan Kara Cc: Eric Paris Cc: Al Viro Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 63 +++++++++++++------ fs/notify/fanotify/fanotify.h | 34 +++++++--- fs/notify/fanotify/fanotify_user.c | 123 +++++++++++++------------------------ 3 files changed, 116 insertions(+), 104 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index dc638f786d5c..ee9cb3795c2b 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -60,8 +60,8 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -static int fanotify_get_response_from_access(struct fsnotify_group *group, - struct fanotify_event_info *event) +static int fanotify_get_response(struct fsnotify_group *group, + struct fanotify_perm_event_info *event) { int ret; @@ -142,6 +142,40 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, return false; } +struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, + struct path *path) +{ + struct fanotify_event_info *event; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & FAN_ALL_PERM_EVENTS) { + struct fanotify_perm_event_info *pevent; + + pevent = kmem_cache_alloc(fanotify_perm_event_cachep, + GFP_KERNEL); + if (!pevent) + return NULL; + event = &pevent->fae; + pevent->response = 0; + goto init; + } +#endif + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; +init: __maybe_unused + fsnotify_init_event(&event->fse, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (path) { + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } + return event; +} + static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, @@ -171,25 +205,11 @@ static int fanotify_handle_event(struct fsnotify_group *group, pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, mask); - event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + event = fanotify_alloc_event(inode, mask, data); if (unlikely(!event)) return -ENOMEM; fsn_event = &event->fse; - fsnotify_init_event(fsn_event, inode, mask); - event->tgid = get_pid(task_tgid(current)); - if (data_type == FSNOTIFY_EVENT_PATH) { - struct path *path = data; - event->path = *path; - path_get(&event->path); - } else { - event->path.mnt = NULL; - event->path.dentry = NULL; - } -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - event->response = 0; -#endif - ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ @@ -202,7 +222,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (mask & FAN_ALL_PERM_EVENTS) { - ret = fanotify_get_response_from_access(group, event); + ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event)); fsnotify_destroy_event(group, fsn_event); } #endif @@ -225,6 +245,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) event = FANOTIFY_E(fsn_event); path_put(&event->path); put_pid(event->tgid); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (fsn_event->mask & FAN_ALL_PERM_EVENTS) { + kmem_cache_free(fanotify_perm_event_cachep, + FANOTIFY_PE(fsn_event)); + return; + } +#endif kmem_cache_free(fanotify_event_cachep, event); } diff --git 
a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 32a2f034fb94..2a5fb14115df 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -3,13 +3,12 @@ #include extern struct kmem_cache *fanotify_event_cachep; +extern struct kmem_cache *fanotify_perm_event_cachep; /* - * Lifetime of the structure differs for normal and permission events. In both - * cases the structure is allocated in fanotify_handle_event(). For normal - * events the structure is freed immediately after reporting it to userspace. - * For permission events we free it only after we receive response from - * userspace. + * Structure for normal fanotify events. It gets allocated in + * fanotify_handle_event() and freed when the information is retrieved by + * userspace */ struct fanotify_event_info { struct fsnotify_event fse; @@ -19,12 +18,33 @@ struct fanotify_event_info { */ struct path path; struct pid *tgid; +}; + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - u32 response; /* userspace answer to question */ -#endif +/* + * Structure for permission fanotify events. It gets allocated and freed in + * fanotify_handle_event() since we wait there for user response. When the + * information is retrieved by userspace the structure is moved from + * group->notification_list to group->fanotify_data.access_list to wait for + * user response. + */ +struct fanotify_perm_event_info { + struct fanotify_event_info fae; + int response; /* userspace answer to question */ + int fd; /* fd we passed to userspace for this event */ }; +static inline struct fanotify_perm_event_info * +FANOTIFY_PE(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_perm_event_info, fae.fse); +} +#endif + static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct fanotify_event_info, fse); } + +struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, + struct path *path); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 70fe65437d21..8f5e85269110 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -28,14 +28,8 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; -static struct kmem_cache *fanotify_response_event_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; - -struct fanotify_response_event { - struct list_head list; - __s32 fd; - struct fanotify_event_info *event; -}; +struct kmem_cache *fanotify_perm_event_cachep __read_mostly; /* * Get an fsnotify notification event if one exists and is small @@ -135,33 +129,34 @@ static int fill_event_metadata(struct fsnotify_group *group, } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, - __s32 fd) +static struct fanotify_perm_event_info *dequeue_event( + struct fsnotify_group *group, int fd) { - struct fanotify_response_event *re, *return_re = NULL; + struct fanotify_perm_event_info *event, *return_e = NULL; mutex_lock(&group->fanotify_data.access_mutex); - list_for_each_entry(re, &group->fanotify_data.access_list, list) { - if (re->fd != fd) + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (event->fd != fd) continue; - list_del_init(&re->list); - return_re = re; + list_del_init(&event->fae.fse.list); + return_e = event; break; } mutex_unlock(&group->fanotify_data.access_mutex); - pr_debug("%s: 
found return_re=%p\n", __func__, return_re); + pr_debug("%s: found return_re=%p\n", __func__, return_e); - return return_re; + return return_e; } static int process_access_response(struct fsnotify_group *group, struct fanotify_response *response_struct) { - struct fanotify_response_event *re; - __s32 fd = response_struct->fd; - __u32 response = response_struct->response; + struct fanotify_perm_event_info *event; + int fd = response_struct->fd; + int response = response_struct->response; pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, fd, response); @@ -181,50 +176,15 @@ static int process_access_response(struct fsnotify_group *group, if (fd < 0) return -EINVAL; - re = dequeue_re(group, fd); - if (!re) + event = dequeue_event(group, fd); + if (!event) return -ENOENT; - re->event->response = response; - + event->response = response; wake_up(&group->fanotify_data.access_waitq); - kmem_cache_free(fanotify_response_event_cache, re); - - return 0; -} - -static int prepare_for_access_response(struct fsnotify_group *group, - struct fsnotify_event *event, - __s32 fd) -{ - struct fanotify_response_event *re; - - if (!(event->mask & FAN_ALL_PERM_EVENTS)) - return 0; - - re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL); - if (!re) - return -ENOMEM; - - re->event = FANOTIFY_E(event); - re->fd = fd; - - mutex_lock(&group->fanotify_data.access_mutex); - list_add_tail(&re->list, &group->fanotify_data.access_list); - mutex_unlock(&group->fanotify_data.access_mutex); - - return 0; -} - -#else -static int prepare_for_access_response(struct fsnotify_group *group, - struct fsnotify_event *event, - __s32 fd) -{ return 0; } - #endif static ssize_t copy_event_to_user(struct fsnotify_group *group, @@ -247,9 +207,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fanotify_event_metadata.event_len)) goto out_close_fd; - ret = prepare_for_access_response(group, event, fd); - if (ret) - goto out_close_fd; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) { + struct fanotify_perm_event_info *pevent; + + pevent = FANOTIFY_PE(event); + pevent->fd = fd; + mutex_lock(&group->fanotify_data.access_mutex); + list_add_tail(&pevent->fae.fse.list, + &group->fanotify_data.access_list); + mutex_unlock(&group->fanotify_data.access_mutex); + } +#endif if (fd != FAN_NOFD) fd_install(fd, f); @@ -263,7 +232,7 @@ out_close_fd: out: #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (event->mask & FAN_ALL_PERM_EVENTS) { - FANOTIFY_E(event)->response = FAN_DENY; + FANOTIFY_PE(event)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); } #endif @@ -312,8 +281,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, break; ret = copy_event_to_user(group, kevent, buf); /* - * Permission events get destroyed after we - * receive response + * Permission events get queued to wait for response. + * Other events can be destroyed now. 
*/ if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) fsnotify_destroy_event(group, kevent); @@ -375,20 +344,19 @@ static int fanotify_release(struct inode *ignored, struct file *file) struct fsnotify_group *group = file->private_data; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - struct fanotify_response_event *re, *lre; + struct fanotify_perm_event_info *event, *next; mutex_lock(&group->fanotify_data.access_mutex); atomic_inc(&group->fanotify_data.bypass_perm); - list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { - pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, - re, re->event); - - list_del_init(&re->list); - re->event->response = FAN_ALLOW; + list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, + fae.fse.list) { + pr_debug("%s: found group=%p event=%p\n", __func__, group, + event); - kmem_cache_free(fanotify_response_event_cache, re); + list_del_init(&event->fae.fse.list); + event->response = FAN_ALLOW; } mutex_unlock(&group->fanotify_data.access_mutex); @@ -723,20 +691,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); - oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { fd = -ENOMEM; goto out_destroy_group; } group->overflow_event = &oevent->fse; - fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); - oevent->tgid = get_pid(task_tgid(current)); - oevent->path.mnt = NULL; - oevent->path.dentry = NULL; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - oevent->response = 0; mutex_init(&group->fanotify_data.access_mutex); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); @@ -912,9 +875,11 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, static int __init fanotify_user_setup(void) { fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); - fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, - SLAB_PANIC); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info, + SLAB_PANIC); +#endif return 0; } -- cgit v1.2.3 From 9573f79355ff3711c98227d14a9b7f4cb3222b97 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:34 -0700 Subject: fanotify: convert access_mutex to spinlock access_mutex is used only to guard operations on access_list. There's no need for sleeping within this lock so just make a spinlock out of it. 
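The protected section only adds and removes list entries and never sleeps, so a spinlock suffices and is cheaper than a mutex. A minimal sketch of the pattern, with illustrative names:

#include <linux/list.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(access_lock);
static LIST_HEAD(access_list);

/* Short, non-sleeping critical section: spinlock, not mutex. */
static void queue_pending_event(struct list_head *entry)
{
	spin_lock(&access_lock);
	list_add_tail(entry, &access_list);
	spin_unlock(&access_lock);
}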
Signed-off-by: Jan Kara Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 14 +++++++------- include/linux/fsnotify_backend.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8f5e85269110..2a57278afb80 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -134,7 +134,7 @@ static struct fanotify_perm_event_info *dequeue_event( { struct fanotify_perm_event_info *event, *return_e = NULL; - mutex_lock(&group->fanotify_data.access_mutex); + spin_lock(&group->fanotify_data.access_lock); list_for_each_entry(event, &group->fanotify_data.access_list, fae.fse.list) { if (event->fd != fd) @@ -144,7 +144,7 @@ static struct fanotify_perm_event_info *dequeue_event( return_e = event; break; } - mutex_unlock(&group->fanotify_data.access_mutex); + spin_unlock(&group->fanotify_data.access_lock); pr_debug("%s: found return_re=%p\n", __func__, return_e); @@ -213,10 +213,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pevent = FANOTIFY_PE(event); pevent->fd = fd; - mutex_lock(&group->fanotify_data.access_mutex); + spin_lock(&group->fanotify_data.access_lock); list_add_tail(&pevent->fae.fse.list, &group->fanotify_data.access_list); - mutex_unlock(&group->fanotify_data.access_mutex); + spin_unlock(&group->fanotify_data.access_lock); } #endif @@ -346,7 +346,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; - mutex_lock(&group->fanotify_data.access_mutex); + spin_lock(&group->fanotify_data.access_lock); atomic_inc(&group->fanotify_data.bypass_perm); @@ -358,7 +358,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) list_del_init(&event->fae.fse.list); event->response = FAN_ALLOW; } - mutex_unlock(&group->fanotify_data.access_mutex); + spin_unlock(&group->fanotify_data.access_lock); wake_up(&group->fanotify_data.access_waitq); #endif @@ -700,7 +700,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - mutex_init(&group->fanotify_data.access_mutex); + spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); atomic_set(&group->fanotify_data.bypass_perm, 0); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 64cf3ef50696..fc7718c6bd3e 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -178,7 +178,7 @@ struct fsnotify_group { struct fanotify_group_private_data { #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* allows a group to block waiting for a userspace response */ - struct mutex access_mutex; + spinlock_t access_lock; struct list_head access_list; wait_queue_head_t access_waitq; atomic_t bypass_perm; -- cgit v1.2.3 From d8aaab4f619acfbfafc91d94b15c2932457c65fa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:35 -0700 Subject: fanotify: reorganize loop in fanotify_read() Swap the error / "read ok" branches in the main loop of fanotify_read(). We will grow the "read ok" part in the next patch and this makes the indentation easier. Also it is more common to have error conditions inside an 'if' instead of the fast path. 
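Schematically, the loop goes from "the good case nested inside an if, errors falling through" to "errors and retries in small if blocks, the fast path reading straight through". The fragment below only illustrates that control-flow shape; try_get_event(), wait_for_event(), copy_one_event() and struct my_event are hypothetical stand-ins, not fanotify code.

	for (;;) {
		struct my_event *event = try_get_event();

		if (IS_ERR(event)) {		/* hard error: stop */
			ret = PTR_ERR(event);
			break;
		}
		if (!event) {			/* nothing queued yet */
			if (nonblock) {
				ret = -EAGAIN;
				break;
			}
			wait_for_event();
			continue;
		}
		/* fast path: fall straight through to the copy */
		ret = copy_one_event(event, buf);
		if (ret < 0)
			break;
		buf += ret;
	}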
Signed-off-by: Jan Kara Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 46 ++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2a57278afb80..f1097f56137e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -275,35 +275,37 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, kevent = get_one_event(group, count); mutex_unlock(&group->notification_mutex); - if (kevent) { + if (IS_ERR(kevent)) { ret = PTR_ERR(kevent); - if (IS_ERR(kevent)) + break; + } + + if (!kevent) { + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) break; - ret = copy_event_to_user(group, kevent, buf); - /* - * Permission events get queued to wait for response. - * Other events can be destroyed now. - */ - if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) - fsnotify_destroy_event(group, kevent); - if (ret < 0) + + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + if (start != buf) break; - buf += ret; - count -= ret; + schedule(); continue; } - ret = -EAGAIN; - if (file->f_flags & O_NONBLOCK) - break; - ret = -ERESTARTSYS; - if (signal_pending(current)) - break; - - if (start != buf) + ret = copy_event_to_user(group, kevent, buf); + /* + * Permission events get queued to wait for response. Other + * events can be destroyed now. + */ + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, kevent); + if (ret < 0) break; - - schedule(); + buf += ret; + count -= ret; } finish_wait(&group->notification_waitq, &wait); -- cgit v1.2.3 From d507816b58bebc8f9c1bed6a28affaf0729306e2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:36 -0700 Subject: fanotify: move unrelated handling from copy_event_to_user() Move code moving event structure to access_list from copy_event_to_user() to fanotify_read() where it is more logical (so that we can immediately see in the main loop that we either move the event to a different list or free it). Also move special error handling for permission events from copy_event_to_user() to the main loop to have it in one place with error handling for normal events. This makes copy_event_to_user() really only copy the event to user without any side effects. 
Signed-off-by: Jan Kara Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 40 ++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f1097f56137e..4e565c814309 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -199,7 +199,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); if (ret < 0) - goto out; + return ret; fd = fanotify_event_metadata.fd; ret = -EFAULT; @@ -208,16 +208,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, goto out_close_fd; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - struct fanotify_perm_event_info *pevent; - - pevent = FANOTIFY_PE(event); - pevent->fd = fd; - spin_lock(&group->fanotify_data.access_lock); - list_add_tail(&pevent->fae.fse.list, - &group->fanotify_data.access_list); - spin_unlock(&group->fanotify_data.access_lock); - } + if (event->mask & FAN_ALL_PERM_EVENTS) + FANOTIFY_PE(event)->fd = fd; #endif if (fd != FAN_NOFD) @@ -229,13 +221,6 @@ out_close_fd: put_unused_fd(fd); fput(f); } -out: -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - FANOTIFY_PE(event)->response = FAN_DENY; - wake_up(&group->fanotify_data.access_waitq); - } -#endif return ret; } @@ -300,10 +285,23 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, * Permission events get queued to wait for response. Other * events can be destroyed now. */ - if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { fsnotify_destroy_event(group, kevent); - if (ret < 0) - break; + if (ret < 0) + break; + } else { +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (ret < 0) { + FANOTIFY_PE(kevent)->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + break; + } + spin_lock(&group->fanotify_data.access_lock); + list_add_tail(&kevent->list, + &group->fanotify_data.access_list); + spin_unlock(&group->fanotify_data.access_lock); +#endif + } buf += ret; count -= ret; } -- cgit v1.2.3 From dc53324060f324e8af6867f57bf4891c13c6ef18 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 3 Apr 2014 14:46:37 -0700 Subject: genksyms: fix typeof() handling Recent increased use of typeof() throughout the tree resulted in a number of symbols (25 in a typical distro config of ours) not getting a proper CRC calculated for them anymore, due to the parser in genksyms not coping with several of these uses (interestingly in the majority of [if not all] cases the problem is due to the use of typeof() in code preceding a certain export, not in the declaration/definition of the exported function/object itself; I wasn't able to find a way to address this more general parser shortcoming). The use of parameter_declaration is a little more relaxed than would be ideal (permitting not just a bare type specification, but also one with identifier), but since the same code is being passed through an actual compiler, there's no apparent risk of allowing through any broken code. Otoh using parameter_declaration instead of the ad hoc "decl_specifier_seq '*'" / "decl_specifier_seq" pair allows all types to be handled rather than just plain ones and pointers to plain ones. 
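For context, the constructs genksyms now has to digest are along the lines of the classic statement-expression macro below; when such code precedes an EXPORT_SYMBOL(), the parser must get past the typeof() uses before it ever reaches the exported declaration. The macro is illustrative, not one of the affected symbols.

/* GNU C statement expression using typeof() */
#define my_min(x, y) ({			\
	typeof(x) _x = (x);		\
	typeof(y) _y = (y);		\
	_x < _y ? _x : _y;		\
})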
Signed-off-by: Jan Beulich Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/genksyms/keywords.gperf | 5 +- scripts/genksyms/keywords.hash.c_shipped | 133 +++---- scripts/genksyms/lex.l | 51 ++- scripts/genksyms/lex.lex.c_shipped | 51 ++- scripts/genksyms/parse.tab.c_shipped | 608 ++++++++++++++++--------------- scripts/genksyms/parse.tab.h_shipped | 29 +- scripts/genksyms/parse.y | 5 +- 7 files changed, 498 insertions(+), 384 deletions(-) diff --git a/scripts/genksyms/keywords.gperf b/scripts/genksyms/keywords.gperf index 3e77a943e7b7..a9096d993172 100644 --- a/scripts/genksyms/keywords.gperf +++ b/scripts/genksyms/keywords.gperf @@ -23,6 +23,8 @@ __inline, INLINE_KEYW __inline__, INLINE_KEYW __signed, SIGNED_KEYW __signed__, SIGNED_KEYW +__typeof, TYPEOF_KEYW +__typeof__, TYPEOF_KEYW __volatile, VOLATILE_KEYW __volatile__, VOLATILE_KEYW # According to rth, c99 defines _Bool, __restrict, __restrict__, restrict. KAO @@ -51,9 +53,8 @@ signed, SIGNED_KEYW static, STATIC_KEYW struct, STRUCT_KEYW typedef, TYPEDEF_KEYW +typeof, TYPEOF_KEYW union, UNION_KEYW unsigned, UNSIGNED_KEYW void, VOID_KEYW volatile, VOLATILE_KEYW -typeof, TYPEOF_KEYW -__typeof__, TYPEOF_KEYW diff --git a/scripts/genksyms/keywords.hash.c_shipped b/scripts/genksyms/keywords.hash.c_shipped index 82062607e8c0..e9452482e198 100644 --- a/scripts/genksyms/keywords.hash.c_shipped +++ b/scripts/genksyms/keywords.hash.c_shipped @@ -34,7 +34,7 @@ struct resword; static const struct resword *is_reserved_word(register const char *str, register unsigned int len); #line 8 "scripts/genksyms/keywords.gperf" struct resword { const char *name; int token; }; -/* maximum key range = 64, duplicates = 0 */ +/* maximum key range = 98, duplicates = 0 */ #ifdef __GNUC__ __inline @@ -48,32 +48,32 @@ is_reserved_hash (register const char *str, register unsigned int len) { static const unsigned char asso_values[] = { - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 0, - 67, 67, 67, 67, 67, 67, 15, 67, 67, 67, - 0, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 0, 67, 0, 67, 5, - 25, 20, 15, 30, 67, 15, 67, 67, 10, 0, - 10, 40, 20, 67, 10, 5, 0, 10, 15, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, - 67, 67, 67, 67, 67, 67 + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 0, + 101, 101, 101, 101, 101, 101, 15, 101, 101, 101, + 0, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 0, 101, 0, 101, 5, + 25, 20, 55, 30, 101, 15, 101, 101, 10, 0, + 10, 
40, 10, 101, 10, 5, 0, 10, 15, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101 }; return len + asso_values[(unsigned char)str[2]] + asso_values[(unsigned char)str[0]] + asso_values[(unsigned char)str[len - 1]]; } @@ -89,17 +89,17 @@ is_reserved_word (register const char *str, register unsigned int len) { enum { - TOTAL_KEYWORDS = 45, + TOTAL_KEYWORDS = 46, MIN_WORD_LENGTH = 3, MAX_WORD_LENGTH = 24, MIN_HASH_VALUE = 3, - MAX_HASH_VALUE = 66 + MAX_HASH_VALUE = 100 }; static const struct resword wordlist[] = { {""}, {""}, {""}, -#line 33 "scripts/genksyms/keywords.gperf" +#line 35 "scripts/genksyms/keywords.gperf" {"asm", ASM_KEYW}, {""}, #line 15 "scripts/genksyms/keywords.gperf" @@ -108,7 +108,7 @@ is_reserved_word (register const char *str, register unsigned int len) #line 16 "scripts/genksyms/keywords.gperf" {"__asm__", ASM_KEYW}, {""}, {""}, -#line 59 "scripts/genksyms/keywords.gperf" +#line 27 "scripts/genksyms/keywords.gperf" {"__typeof__", TYPEOF_KEYW}, {""}, #line 19 "scripts/genksyms/keywords.gperf" @@ -119,31 +119,31 @@ is_reserved_word (register const char *str, register unsigned int len) {"__const__", CONST_KEYW}, #line 25 "scripts/genksyms/keywords.gperf" {"__signed__", SIGNED_KEYW}, -#line 51 "scripts/genksyms/keywords.gperf" +#line 53 "scripts/genksyms/keywords.gperf" {"static", STATIC_KEYW}, {""}, -#line 46 "scripts/genksyms/keywords.gperf" +#line 48 "scripts/genksyms/keywords.gperf" {"int", INT_KEYW}, -#line 39 "scripts/genksyms/keywords.gperf" +#line 41 "scripts/genksyms/keywords.gperf" {"char", CHAR_KEYW}, -#line 40 "scripts/genksyms/keywords.gperf" +#line 42 "scripts/genksyms/keywords.gperf" {"const", CONST_KEYW}, -#line 52 "scripts/genksyms/keywords.gperf" +#line 54 "scripts/genksyms/keywords.gperf" {"struct", STRUCT_KEYW}, -#line 31 "scripts/genksyms/keywords.gperf" +#line 33 "scripts/genksyms/keywords.gperf" {"__restrict__", RESTRICT_KEYW}, -#line 32 "scripts/genksyms/keywords.gperf" +#line 34 "scripts/genksyms/keywords.gperf" {"restrict", RESTRICT_KEYW}, #line 12 "scripts/genksyms/keywords.gperf" {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW}, #line 23 "scripts/genksyms/keywords.gperf" {"__inline__", INLINE_KEYW}, {""}, -#line 27 "scripts/genksyms/keywords.gperf" +#line 29 "scripts/genksyms/keywords.gperf" {"__volatile__", VOLATILE_KEYW}, #line 10 "scripts/genksyms/keywords.gperf" {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW}, -#line 30 "scripts/genksyms/keywords.gperf" +#line 32 "scripts/genksyms/keywords.gperf" {"_restrict", RESTRICT_KEYW}, {""}, #line 17 "scripts/genksyms/keywords.gperf" @@ -152,56 +152,65 @@ is_reserved_word (register const char *str, register unsigned int len) {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, #line 21 "scripts/genksyms/keywords.gperf" {"__extension__", EXTENSION_KEYW}, -#line 42 "scripts/genksyms/keywords.gperf" +#line 44 "scripts/genksyms/keywords.gperf" {"enum", ENUM_KEYW}, #line 13 
"scripts/genksyms/keywords.gperf" {"EXPORT_UNUSED_SYMBOL", EXPORT_SYMBOL_KEYW}, -#line 43 "scripts/genksyms/keywords.gperf" +#line 45 "scripts/genksyms/keywords.gperf" {"extern", EXTERN_KEYW}, {""}, #line 24 "scripts/genksyms/keywords.gperf" {"__signed", SIGNED_KEYW}, #line 14 "scripts/genksyms/keywords.gperf" {"EXPORT_UNUSED_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, -#line 54 "scripts/genksyms/keywords.gperf" +#line 57 "scripts/genksyms/keywords.gperf" {"union", UNION_KEYW}, -#line 58 "scripts/genksyms/keywords.gperf" - {"typeof", TYPEOF_KEYW}, -#line 53 "scripts/genksyms/keywords.gperf" - {"typedef", TYPEDEF_KEYW}, + {""}, {""}, #line 22 "scripts/genksyms/keywords.gperf" {"__inline", INLINE_KEYW}, -#line 38 "scripts/genksyms/keywords.gperf" +#line 40 "scripts/genksyms/keywords.gperf" {"auto", AUTO_KEYW}, -#line 26 "scripts/genksyms/keywords.gperf" +#line 28 "scripts/genksyms/keywords.gperf" {"__volatile", VOLATILE_KEYW}, {""}, {""}, -#line 55 "scripts/genksyms/keywords.gperf" +#line 58 "scripts/genksyms/keywords.gperf" {"unsigned", UNSIGNED_KEYW}, {""}, -#line 49 "scripts/genksyms/keywords.gperf" +#line 51 "scripts/genksyms/keywords.gperf" {"short", SHORT_KEYW}, -#line 45 "scripts/genksyms/keywords.gperf" +#line 47 "scripts/genksyms/keywords.gperf" {"inline", INLINE_KEYW}, {""}, -#line 57 "scripts/genksyms/keywords.gperf" +#line 60 "scripts/genksyms/keywords.gperf" {"volatile", VOLATILE_KEYW}, -#line 47 "scripts/genksyms/keywords.gperf" +#line 49 "scripts/genksyms/keywords.gperf" {"long", LONG_KEYW}, -#line 29 "scripts/genksyms/keywords.gperf" +#line 31 "scripts/genksyms/keywords.gperf" {"_Bool", BOOL_KEYW}, {""}, {""}, -#line 48 "scripts/genksyms/keywords.gperf" +#line 50 "scripts/genksyms/keywords.gperf" {"register", REGISTER_KEYW}, -#line 56 "scripts/genksyms/keywords.gperf" +#line 59 "scripts/genksyms/keywords.gperf" {"void", VOID_KEYW}, -#line 44 "scripts/genksyms/keywords.gperf" - {"float", FLOAT_KEYW}, -#line 41 "scripts/genksyms/keywords.gperf" + {""}, +#line 43 "scripts/genksyms/keywords.gperf" {"double", DOUBLE_KEYW}, + {""}, +#line 26 "scripts/genksyms/keywords.gperf" + {"__typeof", TYPEOF_KEYW}, + {""}, {""}, +#line 52 "scripts/genksyms/keywords.gperf" + {"signed", SIGNED_KEYW}, {""}, {""}, {""}, {""}, -#line 50 "scripts/genksyms/keywords.gperf" - {"signed", SIGNED_KEYW} +#line 56 "scripts/genksyms/keywords.gperf" + {"typeof", TYPEOF_KEYW}, +#line 55 "scripts/genksyms/keywords.gperf" + {"typedef", TYPEDEF_KEYW}, + {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, + {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, + {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, +#line 46 "scripts/genksyms/keywords.gperf" + {"float", FLOAT_KEYW} }; if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) diff --git a/scripts/genksyms/lex.l b/scripts/genksyms/lex.l index f770071719cb..e583565f2011 100644 --- a/scripts/genksyms/lex.l +++ b/scripts/genksyms/lex.l @@ -129,8 +129,9 @@ int yylex(void) { static enum { - ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_BRACKET, ST_BRACE, - ST_EXPRESSION, ST_TABLE_1, ST_TABLE_2, ST_TABLE_3, ST_TABLE_4, + ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1, + ST_BRACKET, ST_BRACE, ST_EXPRESSION, + ST_TABLE_1, ST_TABLE_2, ST_TABLE_3, ST_TABLE_4, ST_TABLE_5, ST_TABLE_6 } lexstate = ST_NOTSTARTED; @@ -198,6 +199,10 @@ repeat: lexstate = ST_ASM; count = 0; goto repeat; + case TYPEOF_KEYW: + lexstate = ST_TYPEOF; + count = 0; + goto repeat; case STRUCT_KEYW: case UNION_KEYW: @@ -284,6 +289,48 @@ repeat: } break; + case ST_TYPEOF: + 
switch (token) + { + case '(': + if ( ++count == 1 ) + lexstate = ST_TYPEOF_1; + else + APP; + goto repeat; + case ')': + APP; + if (--count == 0) + { + lexstate = ST_NORMAL; + token = TYPEOF_PHRASE; + break; + } + goto repeat; + default: + APP; + goto repeat; + } + break; + + case ST_TYPEOF_1: + if (token == IDENT) + { + if (is_reserved_word(yytext, yyleng) + || find_symbol(yytext, SYM_TYPEDEF, 1)) + { + yyless(0); + unput('('); + lexstate = ST_NORMAL; + token = TYPEOF_KEYW; + break; + } + _APP("(", 1); + } + APP; + lexstate = ST_TYPEOF; + goto repeat; + case ST_BRACKET: APP; switch (token) diff --git a/scripts/genksyms/lex.lex.c_shipped b/scripts/genksyms/lex.lex.c_shipped index 0bf4157e6161..f82740a69b85 100644 --- a/scripts/genksyms/lex.lex.c_shipped +++ b/scripts/genksyms/lex.lex.c_shipped @@ -1938,8 +1938,9 @@ int yylex(void) { static enum { - ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_BRACKET, ST_BRACE, - ST_EXPRESSION, ST_TABLE_1, ST_TABLE_2, ST_TABLE_3, ST_TABLE_4, + ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1, + ST_BRACKET, ST_BRACE, ST_EXPRESSION, + ST_TABLE_1, ST_TABLE_2, ST_TABLE_3, ST_TABLE_4, ST_TABLE_5, ST_TABLE_6 } lexstate = ST_NOTSTARTED; @@ -2007,6 +2008,10 @@ repeat: lexstate = ST_ASM; count = 0; goto repeat; + case TYPEOF_KEYW: + lexstate = ST_TYPEOF; + count = 0; + goto repeat; case STRUCT_KEYW: case UNION_KEYW: @@ -2093,6 +2098,48 @@ repeat: } break; + case ST_TYPEOF: + switch (token) + { + case '(': + if ( ++count == 1 ) + lexstate = ST_TYPEOF_1; + else + APP; + goto repeat; + case ')': + APP; + if (--count == 0) + { + lexstate = ST_NORMAL; + token = TYPEOF_PHRASE; + break; + } + goto repeat; + default: + APP; + goto repeat; + } + break; + + case ST_TYPEOF_1: + if (token == IDENT) + { + if (is_reserved_word(yytext, yyleng) + || find_symbol(yytext, SYM_TYPEDEF, 1)) + { + yyless(0); + unput('('); + lexstate = ST_NORMAL; + token = TYPEOF_KEYW; + break; + } + _APP("(", 1); + } + APP; + lexstate = ST_TYPEOF; + goto repeat; + case ST_BRACKET: APP; switch (token) diff --git a/scripts/genksyms/parse.tab.c_shipped b/scripts/genksyms/parse.tab.c_shipped index ece53c79bb59..c9f0f0ce82ff 100644 --- a/scripts/genksyms/parse.tab.c_shipped +++ b/scripts/genksyms/parse.tab.c_shipped @@ -1,8 +1,8 @@ -/* A Bison parser, made by GNU Bison 2.5. */ +/* A Bison parser, made by GNU Bison 2.5.1. */ /* Bison implementation for Yacc-like parsers in C - Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc. + Copyright (C) 1984, 1989-1990, 2000-2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -44,7 +44,7 @@ #define YYBISON 1 /* Bison version. */ -#define YYBISON_VERSION "2.5" +#define YYBISON_VERSION "2.5.1" /* Skeleton name. */ #define YYSKELETON_NAME "yacc.c" @@ -117,6 +117,14 @@ static void record_compound(struct string_list **keyw, +# ifndef YY_NULL +# if defined __cplusplus && 201103L <= __cplusplus +# define YY_NULL nullptr +# else +# define YY_NULL 0 +# endif +# endif + /* Enabling traces. 
*/ #ifndef YYDEBUG # define YYDEBUG 1 @@ -171,18 +179,19 @@ static void record_compound(struct string_list **keyw, EXPORT_SYMBOL_KEYW = 284, ASM_PHRASE = 285, ATTRIBUTE_PHRASE = 286, - BRACE_PHRASE = 287, - BRACKET_PHRASE = 288, - EXPRESSION_PHRASE = 289, - CHAR = 290, - DOTS = 291, - IDENT = 292, - INT = 293, - REAL = 294, - STRING = 295, - TYPE = 296, - OTHER = 297, - FILENAME = 298 + TYPEOF_PHRASE = 287, + BRACE_PHRASE = 288, + BRACKET_PHRASE = 289, + EXPRESSION_PHRASE = 290, + CHAR = 291, + DOTS = 292, + IDENT = 293, + INT = 294, + REAL = 295, + STRING = 296, + TYPE = 297, + OTHER = 298, + FILENAME = 299 }; #endif @@ -304,6 +313,7 @@ YYID (yyi) # if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ || defined __cplusplus || defined _MSC_VER) # include /* INFRINGES ON USER NAME SPACE */ + /* Use EXIT_SUCCESS as a witness for stdlib.h. */ # ifndef EXIT_SUCCESS # define EXIT_SUCCESS 0 # endif @@ -395,20 +405,20 @@ union yyalloc #endif #if defined YYCOPY_NEEDED && YYCOPY_NEEDED -/* Copy COUNT objects from FROM to TO. The source and destination do +/* Copy COUNT objects from SRC to DST. The source and destination do not overlap. */ # ifndef YYCOPY # if defined __GNUC__ && 1 < __GNUC__ -# define YYCOPY(To, From, Count) \ - __builtin_memcpy (To, From, (Count) * sizeof (*(From))) +# define YYCOPY(Dst, Src, Count) \ + __builtin_memcpy (Dst, Src, (Count) * sizeof (*(Src))) # else -# define YYCOPY(To, From, Count) \ - do \ - { \ - YYSIZE_T yyi; \ - for (yyi = 0; yyi < (Count); yyi++) \ - (To)[yyi] = (From)[yyi]; \ - } \ +# define YYCOPY(Dst, Src, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (Dst)[yyi] = (Src)[yyi]; \ + } \ while (YYID (0)) # endif # endif @@ -417,20 +427,20 @@ union yyalloc /* YYFINAL -- State number of the termination state. */ #define YYFINAL 4 /* YYLAST -- Last index in YYTABLE. */ -#define YYLAST 532 +#define YYLAST 514 /* YYNTOKENS -- Number of terminals. */ -#define YYNTOKENS 53 +#define YYNTOKENS 54 /* YYNNTS -- Number of nonterminals. */ #define YYNNTS 49 /* YYNRULES -- Number of rules. */ #define YYNRULES 132 /* YYNRULES -- Number of states. */ -#define YYNSTATES 188 +#define YYNSTATES 187 /* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ #define YYUNDEFTOK 2 -#define YYMAXUTOK 298 +#define YYMAXUTOK 299 #define YYTRANSLATE(YYX) \ ((unsigned int) (YYX) <= YYMAXUTOK ? 
yytranslate[YYX] : YYUNDEFTOK) @@ -442,15 +452,15 @@ static const yytype_uint8 yytranslate[] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 47, 49, 48, 2, 46, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 52, 44, - 2, 50, 2, 2, 2, 2, 2, 2, 2, 2, + 48, 49, 50, 2, 47, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 53, 45, + 2, 51, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 51, 2, 45, 2, 2, 2, 2, + 2, 2, 2, 52, 2, 46, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -467,7 +477,7 @@ static const yytype_uint8 yytranslate[] = 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 42, 43 + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44 }; #if YYDEBUG @@ -478,78 +488,77 @@ static const yytype_uint16 yyprhs[] = 0, 0, 3, 5, 8, 9, 12, 13, 18, 19, 23, 25, 27, 29, 31, 34, 37, 41, 42, 44, 46, 50, 55, 56, 58, 60, 63, 65, 67, 69, - 71, 73, 75, 77, 79, 81, 87, 92, 95, 98, - 101, 105, 109, 113, 116, 119, 122, 124, 126, 128, - 130, 132, 134, 136, 138, 140, 142, 144, 147, 148, - 150, 152, 155, 157, 159, 161, 163, 166, 168, 170, - 175, 180, 183, 187, 191, 194, 196, 198, 200, 205, - 210, 213, 217, 221, 224, 226, 230, 231, 233, 235, - 239, 242, 245, 247, 248, 250, 252, 257, 262, 265, - 269, 273, 277, 278, 280, 283, 287, 291, 292, 294, - 296, 299, 303, 306, 307, 309, 311, 315, 318, 321, - 323, 326, 327, 330, 334, 339, 341, 345, 347, 351, - 354, 355, 357 + 71, 73, 75, 77, 79, 81, 86, 88, 91, 94, + 97, 101, 105, 109, 112, 115, 118, 120, 122, 124, + 126, 128, 130, 132, 134, 136, 138, 140, 143, 144, + 146, 148, 151, 153, 155, 157, 159, 162, 164, 166, + 171, 176, 179, 183, 187, 190, 192, 194, 196, 201, + 206, 209, 213, 217, 220, 222, 226, 227, 229, 231, + 235, 238, 241, 243, 244, 246, 248, 253, 258, 261, + 265, 269, 273, 274, 276, 279, 283, 287, 288, 290, + 292, 295, 299, 302, 303, 305, 307, 311, 314, 317, + 319, 322, 323, 326, 330, 335, 337, 341, 343, 347, + 350, 351, 353 }; /* YYRHS -- A `-1'-separated list of the rules' RHS. 
*/ static const yytype_int8 yyrhs[] = { - 54, 0, -1, 55, -1, 54, 55, -1, -1, 56, - 57, -1, -1, 12, 23, 58, 60, -1, -1, 23, - 59, 60, -1, 60, -1, 84, -1, 99, -1, 101, - -1, 1, 44, -1, 1, 45, -1, 64, 61, 44, - -1, -1, 62, -1, 63, -1, 62, 46, 63, -1, - 74, 100, 95, 85, -1, -1, 65, -1, 66, -1, - 65, 66, -1, 67, -1, 68, -1, 5, -1, 17, - -1, 21, -1, 11, -1, 14, -1, 69, -1, 73, - -1, 28, 47, 65, 48, 49, -1, 28, 47, 65, - 49, -1, 22, 37, -1, 24, 37, -1, 10, 37, - -1, 22, 37, 87, -1, 24, 37, 87, -1, 10, - 37, 96, -1, 10, 96, -1, 22, 87, -1, 24, - 87, -1, 7, -1, 19, -1, 15, -1, 16, -1, - 20, -1, 25, -1, 13, -1, 9, -1, 26, -1, - 6, -1, 41, -1, 48, 71, -1, -1, 72, -1, - 73, -1, 72, 73, -1, 8, -1, 27, -1, 31, - -1, 18, -1, 70, 74, -1, 75, -1, 37, -1, - 75, 47, 78, 49, -1, 75, 47, 1, 49, -1, - 75, 33, -1, 47, 74, 49, -1, 47, 1, 49, - -1, 70, 76, -1, 77, -1, 37, -1, 41, -1, - 77, 47, 78, 49, -1, 77, 47, 1, 49, -1, - 77, 33, -1, 47, 76, 49, -1, 47, 1, 49, - -1, 79, 36, -1, 79, -1, 80, 46, 36, -1, - -1, 80, -1, 81, -1, 80, 46, 81, -1, 65, - 82, -1, 70, 82, -1, 83, -1, -1, 37, -1, - 41, -1, 83, 47, 78, 49, -1, 83, 47, 1, - 49, -1, 83, 33, -1, 47, 82, 49, -1, 47, - 1, 49, -1, 64, 74, 32, -1, -1, 86, -1, - 50, 34, -1, 51, 88, 45, -1, 51, 1, 45, - -1, -1, 89, -1, 90, -1, 89, 90, -1, 64, - 91, 44, -1, 1, 44, -1, -1, 92, -1, 93, - -1, 92, 46, 93, -1, 76, 95, -1, 37, 94, - -1, 94, -1, 52, 34, -1, -1, 95, 31, -1, - 51, 97, 45, -1, 51, 97, 46, 45, -1, 98, - -1, 97, 46, 98, -1, 37, -1, 37, 50, 34, - -1, 30, 44, -1, -1, 30, -1, 29, 47, 37, - 49, 44, -1 + 55, 0, -1, 56, -1, 55, 56, -1, -1, 57, + 58, -1, -1, 12, 23, 59, 61, -1, -1, 23, + 60, 61, -1, 61, -1, 85, -1, 100, -1, 102, + -1, 1, 45, -1, 1, 46, -1, 65, 62, 45, + -1, -1, 63, -1, 64, -1, 63, 47, 64, -1, + 75, 101, 96, 86, -1, -1, 66, -1, 67, -1, + 66, 67, -1, 68, -1, 69, -1, 5, -1, 17, + -1, 21, -1, 11, -1, 14, -1, 70, -1, 74, + -1, 28, 48, 82, 49, -1, 32, -1, 22, 38, + -1, 24, 38, -1, 10, 38, -1, 22, 38, 88, + -1, 24, 38, 88, -1, 10, 38, 97, -1, 10, + 97, -1, 22, 88, -1, 24, 88, -1, 7, -1, + 19, -1, 15, -1, 16, -1, 20, -1, 25, -1, + 13, -1, 9, -1, 26, -1, 6, -1, 42, -1, + 50, 72, -1, -1, 73, -1, 74, -1, 73, 74, + -1, 8, -1, 27, -1, 31, -1, 18, -1, 71, + 75, -1, 76, -1, 38, -1, 76, 48, 79, 49, + -1, 76, 48, 1, 49, -1, 76, 34, -1, 48, + 75, 49, -1, 48, 1, 49, -1, 71, 77, -1, + 78, -1, 38, -1, 42, -1, 78, 48, 79, 49, + -1, 78, 48, 1, 49, -1, 78, 34, -1, 48, + 77, 49, -1, 48, 1, 49, -1, 80, 37, -1, + 80, -1, 81, 47, 37, -1, -1, 81, -1, 82, + -1, 81, 47, 82, -1, 66, 83, -1, 71, 83, + -1, 84, -1, -1, 38, -1, 42, -1, 84, 48, + 79, 49, -1, 84, 48, 1, 49, -1, 84, 34, + -1, 48, 83, 49, -1, 48, 1, 49, -1, 65, + 75, 33, -1, -1, 87, -1, 51, 35, -1, 52, + 89, 46, -1, 52, 1, 46, -1, -1, 90, -1, + 91, -1, 90, 91, -1, 65, 92, 45, -1, 1, + 45, -1, -1, 93, -1, 94, -1, 93, 47, 94, + -1, 77, 96, -1, 38, 95, -1, 95, -1, 53, + 35, -1, -1, 96, 31, -1, 52, 98, 46, -1, + 52, 98, 47, 46, -1, 99, -1, 98, 47, 99, + -1, 38, -1, 38, 51, 35, -1, 30, 45, -1, + -1, 30, -1, 29, 48, 38, 49, 45, -1 }; /* YYRLINE[YYN] -- source line where rule number YYN was defined. 
*/ static const yytype_uint16 yyrline[] = { - 0, 123, 123, 124, 128, 128, 134, 134, 136, 136, - 138, 139, 140, 141, 142, 143, 147, 161, 162, 166, - 174, 187, 193, 194, 198, 199, 203, 209, 213, 214, - 215, 216, 217, 221, 222, 223, 224, 228, 230, 232, - 236, 238, 240, 245, 248, 249, 253, 254, 255, 256, - 257, 258, 259, 260, 261, 262, 263, 267, 272, 273, - 277, 278, 282, 282, 282, 283, 291, 292, 296, 305, - 307, 309, 311, 313, 320, 321, 325, 326, 327, 329, - 331, 333, 335, 340, 341, 342, 346, 347, 351, 352, - 357, 362, 364, 368, 369, 377, 381, 383, 385, 387, - 389, 394, 403, 404, 409, 414, 415, 419, 420, 424, - 425, 429, 431, 436, 437, 441, 442, 446, 447, 448, - 452, 456, 457, 461, 462, 466, 467, 470, 475, 483, - 487, 488, 492 + 0, 124, 124, 125, 129, 129, 135, 135, 137, 137, + 139, 140, 141, 142, 143, 144, 148, 162, 163, 167, + 175, 188, 194, 195, 199, 200, 204, 210, 214, 215, + 216, 217, 218, 222, 223, 224, 225, 229, 231, 233, + 237, 239, 241, 246, 249, 250, 254, 255, 256, 257, + 258, 259, 260, 261, 262, 263, 264, 268, 273, 274, + 278, 279, 283, 283, 283, 284, 292, 293, 297, 306, + 308, 310, 312, 314, 321, 322, 326, 327, 328, 330, + 332, 334, 336, 341, 342, 343, 347, 348, 352, 353, + 358, 363, 365, 369, 370, 378, 382, 384, 386, 388, + 390, 395, 404, 405, 410, 415, 416, 420, 421, 425, + 426, 430, 432, 437, 438, 442, 443, 447, 448, 449, + 453, 457, 458, 462, 463, 467, 468, 471, 476, 484, + 488, 489, 493 }; #endif @@ -565,9 +574,9 @@ static const char *const yytname[] = "SHORT_KEYW", "SIGNED_KEYW", "STATIC_KEYW", "STRUCT_KEYW", "TYPEDEF_KEYW", "UNION_KEYW", "UNSIGNED_KEYW", "VOID_KEYW", "VOLATILE_KEYW", "TYPEOF_KEYW", "EXPORT_SYMBOL_KEYW", "ASM_PHRASE", - "ATTRIBUTE_PHRASE", "BRACE_PHRASE", "BRACKET_PHRASE", + "ATTRIBUTE_PHRASE", "TYPEOF_PHRASE", "BRACE_PHRASE", "BRACKET_PHRASE", "EXPRESSION_PHRASE", "CHAR", "DOTS", "IDENT", "INT", "REAL", "STRING", - "TYPE", "OTHER", "FILENAME", "';'", "'}'", "','", "'('", "'*'", "')'", + "TYPE", "OTHER", "FILENAME", "';'", "'}'", "','", "'('", "')'", "'*'", "'='", "'{'", "':'", "$accept", "declaration_seq", "declaration", "$@1", "declaration1", "$@2", "$@3", "simple_declaration", "init_declarator_list_opt", "init_declarator_list", "init_declarator", @@ -584,7 +593,7 @@ static const char *const yytname[] = "member_declarator_list_opt", "member_declarator_list", "member_declarator", "member_bitfield_declarator", "attribute_opt", "enum_body", "enumerator_list", "enumerator", "asm_definition", - "asm_phrase_opt", "export_definition", 0 + "asm_phrase_opt", "export_definition", YY_NULL }; #endif @@ -597,28 +606,28 @@ static const yytype_uint16 yytoknum[] = 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, - 295, 296, 297, 298, 59, 125, 44, 40, 42, 41, - 61, 123, 58 + 295, 296, 297, 298, 299, 59, 125, 44, 40, 41, + 42, 61, 123, 58 }; # endif /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. 
*/ static const yytype_uint8 yyr1[] = { - 0, 53, 54, 54, 56, 55, 58, 57, 59, 57, - 57, 57, 57, 57, 57, 57, 60, 61, 61, 62, - 62, 63, 64, 64, 65, 65, 66, 66, 67, 67, - 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, - 69, 69, 69, 69, 69, 69, 69, 70, 71, 71, - 72, 72, 73, 73, 73, 73, 74, 74, 75, 75, - 75, 75, 75, 75, 76, 76, 77, 77, 77, 77, - 77, 77, 77, 78, 78, 78, 79, 79, 80, 80, - 81, 82, 82, 83, 83, 83, 83, 83, 83, 83, - 83, 84, 85, 85, 86, 87, 87, 88, 88, 89, - 89, 90, 90, 91, 91, 92, 92, 93, 93, 93, - 94, 95, 95, 96, 96, 97, 97, 98, 98, 99, - 100, 100, 101 + 0, 54, 55, 55, 57, 56, 59, 58, 60, 58, + 58, 58, 58, 58, 58, 58, 61, 62, 62, 63, + 63, 64, 65, 65, 66, 66, 67, 67, 68, 68, + 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 71, 72, 72, + 73, 73, 74, 74, 74, 74, 75, 75, 76, 76, + 76, 76, 76, 76, 77, 77, 78, 78, 78, 78, + 78, 78, 78, 79, 79, 79, 80, 80, 81, 81, + 82, 83, 83, 84, 84, 84, 84, 84, 84, 84, + 84, 85, 86, 86, 87, 88, 88, 89, 89, 90, + 90, 91, 91, 92, 92, 93, 93, 94, 94, 94, + 95, 96, 96, 97, 97, 98, 98, 99, 99, 100, + 101, 101, 102 }; /* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ @@ -627,7 +636,7 @@ static const yytype_uint8 yyr2[] = 0, 2, 1, 2, 0, 2, 0, 4, 0, 3, 1, 1, 1, 1, 2, 2, 3, 0, 1, 1, 3, 4, 0, 1, 1, 2, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 5, 4, 2, 2, 2, + 1, 1, 1, 1, 1, 4, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 4, @@ -648,68 +657,68 @@ static const yytype_uint8 yydefact[] = 4, 4, 2, 0, 1, 3, 0, 28, 55, 46, 62, 53, 0, 31, 0, 52, 32, 48, 49, 29, 65, 47, 50, 30, 0, 8, 0, 51, 54, 63, - 0, 0, 0, 64, 56, 5, 10, 17, 23, 24, - 26, 27, 33, 34, 11, 12, 13, 14, 15, 39, - 0, 43, 6, 37, 0, 44, 22, 38, 45, 0, - 0, 129, 68, 0, 58, 0, 18, 19, 0, 130, - 67, 25, 42, 127, 0, 125, 22, 40, 0, 113, - 0, 0, 109, 9, 17, 41, 0, 0, 0, 0, - 57, 59, 60, 16, 0, 66, 131, 101, 121, 71, - 0, 0, 123, 0, 7, 112, 106, 76, 77, 0, - 0, 0, 121, 75, 0, 114, 115, 119, 105, 0, - 110, 130, 0, 36, 0, 73, 72, 61, 20, 102, - 0, 93, 0, 84, 87, 88, 128, 124, 126, 118, - 0, 76, 0, 120, 74, 117, 80, 0, 111, 0, - 35, 132, 122, 0, 21, 103, 70, 94, 56, 0, - 93, 90, 92, 69, 83, 0, 82, 81, 0, 0, - 116, 104, 0, 95, 0, 91, 98, 0, 85, 89, - 79, 78, 100, 99, 0, 0, 97, 96 + 0, 0, 0, 64, 36, 56, 5, 10, 17, 23, + 24, 26, 27, 33, 34, 11, 12, 13, 14, 15, + 39, 0, 43, 6, 37, 0, 44, 22, 38, 45, + 0, 0, 129, 68, 0, 58, 0, 18, 19, 0, + 130, 67, 25, 42, 127, 0, 125, 22, 40, 0, + 113, 0, 0, 109, 9, 17, 41, 93, 0, 0, + 0, 0, 57, 59, 60, 16, 0, 66, 131, 101, + 121, 71, 0, 0, 123, 0, 7, 112, 106, 76, + 77, 0, 0, 0, 121, 75, 0, 114, 115, 119, + 105, 0, 110, 130, 94, 56, 0, 93, 90, 92, + 35, 0, 73, 72, 61, 20, 102, 0, 0, 84, + 87, 88, 128, 124, 126, 118, 0, 76, 0, 120, + 74, 117, 80, 0, 111, 0, 0, 95, 0, 91, + 98, 0, 132, 122, 0, 21, 103, 70, 69, 83, + 0, 82, 81, 0, 0, 116, 100, 99, 0, 0, + 104, 85, 89, 79, 78, 97, 96 }; /* YYDEFGOTO[NTERM-NUM]. 
*/ static const yytype_int16 yydefgoto[] = { - -1, 1, 2, 3, 35, 76, 56, 36, 65, 66, - 67, 79, 38, 39, 40, 41, 42, 68, 90, 91, - 43, 121, 70, 112, 113, 132, 133, 134, 135, 161, - 162, 44, 154, 155, 55, 80, 81, 82, 114, 115, - 116, 117, 129, 51, 74, 75, 45, 98, 46 + -1, 1, 2, 3, 36, 77, 57, 37, 66, 67, + 68, 80, 39, 40, 41, 42, 43, 69, 92, 93, + 44, 123, 71, 114, 115, 138, 139, 140, 141, 128, + 129, 45, 165, 166, 56, 81, 82, 83, 116, 117, + 118, 119, 136, 52, 75, 76, 46, 100, 47 }; /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing STATE-NUM. */ -#define YYPACT_NINF -135 +#define YYPACT_NINF -140 static const yytype_int16 yypact[] = { - -135, 20, -135, 321, -135, -135, 30, -135, -135, -135, - -135, -135, -28, -135, 2, -135, -135, -135, -135, -135, - -135, -135, -135, -135, -6, -135, 9, -135, -135, -135, - -5, 15, -17, -135, -135, -135, -135, 18, 491, -135, - -135, -135, -135, -135, -135, -135, -135, -135, -135, -22, - 31, -135, -135, 19, 106, -135, 491, 19, -135, 491, - 50, -135, -135, 11, -3, 51, 57, -135, 18, -14, - 14, -135, -135, 48, 46, -135, 491, -135, 33, 32, - 59, 154, -135, -135, 18, -135, 365, 56, 60, 61, - -135, -3, -135, -135, 18, -135, -135, -135, -135, -135, - 202, 74, -135, -23, -135, -135, -135, 77, -135, 16, - 101, 49, -135, 34, 92, 93, -135, -135, -135, 94, - -135, 110, 95, -135, 97, -135, -135, -135, -135, -20, - 96, 410, 99, 113, 100, -135, -135, -135, -135, -135, - 103, -135, 107, -135, -135, 111, -135, 239, -135, 32, - -135, -135, -135, 123, -135, -135, -135, -135, -135, 3, - 52, -135, 38, -135, -135, 454, -135, -135, 117, 128, - -135, -135, 134, -135, 135, -135, -135, 276, -135, -135, - -135, -135, -135, -135, 137, 138, -135, -135 + -140, 29, -140, 207, -140, -140, 40, -140, -140, -140, + -140, -140, -27, -140, 44, -140, -140, -140, -140, -140, + -140, -140, -140, -140, -22, -140, -18, -140, -140, -140, + -9, 22, 28, -140, -140, -140, -140, -140, 42, 472, + -140, -140, -140, -140, -140, -140, -140, -140, -140, -140, + 46, 43, -140, -140, 47, 107, -140, 472, 47, -140, + 472, 62, -140, -140, 16, -3, 57, 56, -140, 42, + 35, -11, -140, -140, 53, 48, -140, 472, -140, 51, + 21, 59, 157, -140, -140, 42, -140, 388, 58, 60, + 70, 81, -140, -3, -140, -140, 42, -140, -140, -140, + -140, -140, 253, 71, -140, -20, -140, -140, -140, 83, + -140, 5, 102, 34, -140, 12, 95, 94, -140, -140, + -140, 97, -140, 113, -140, -140, 2, 41, -140, 27, + -140, 99, -140, -140, -140, -140, -24, 98, 101, 109, + 104, -140, -140, -140, -140, -140, 105, -140, 110, -140, + -140, 117, -140, 298, -140, 21, 112, -140, 120, -140, + -140, 343, -140, -140, 121, -140, -140, -140, -140, -140, + 434, -140, -140, 131, 137, -140, -140, -140, 138, 141, + -140, -140, -140, -140, -140, -140, -140 }; /* YYPGOTO[NTERM-NUM]. */ static const yytype_int16 yypgoto[] = { - -135, -135, 187, -135, -135, -135, -135, -50, -135, -135, - 98, 0, -59, -37, -135, -135, -135, -77, -135, -135, - -54, -30, -135, -90, -135, -134, -135, -135, 24, -58, - -135, -135, -135, -135, -18, -135, -135, 109, -135, -135, - 44, 87, 84, 148, -135, 102, -135, -135, -135 + -140, -140, 190, -140, -140, -140, -140, -45, -140, -140, + 96, 1, -60, -31, -140, -140, -140, -78, -140, -140, + -55, -7, -140, -92, -140, -139, -140, -140, -59, -39, + -140, -140, -140, -140, -13, -140, -140, 111, -140, -140, + 39, 87, 84, 147, -140, 106, -140, -140, -140 }; /* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. 
If @@ -718,149 +727,145 @@ static const yytype_int16 yypgoto[] = #define YYTABLE_NINF -109 static const yytype_int16 yytable[] = { - 86, 71, 111, 37, 172, 10, 83, 69, 58, 49, - 92, 152, 88, 169, 73, 20, 96, 140, 97, 142, - 4, 144, 137, 50, 29, 52, 104, 61, 33, 50, - 153, 53, 111, 89, 111, 77, -93, 127, 95, 85, - 157, 131, 59, 185, 173, 54, 57, 99, 62, 71, - 159, 64, -93, 141, 160, 62, 84, 108, 63, 64, - 54, 100, 60, 109, 64, 63, 64, 146, 73, 107, - 54, 176, 111, 108, 47, 48, 84, 105, 106, 109, - 64, 147, 160, 160, 110, 177, 141, 87, 131, 157, - 108, 102, 103, 173, 71, 93, 109, 64, 101, 159, - 64, 174, 175, 94, 118, 124, 131, 78, 136, 125, - 126, 7, 8, 9, 10, 11, 12, 13, 131, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 110, - 26, 27, 28, 29, 30, 143, 148, 33, 105, 149, - 96, 151, 152, -22, 150, 156, 165, 34, 163, 164, - -22, -107, 166, -22, -22, 119, 167, 171, -22, 7, - 8, 9, 10, 11, 12, 13, 180, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 181, 26, 27, - 28, 29, 30, 182, 183, 33, 186, 187, 5, 179, - 120, -22, 128, 170, 139, 34, 145, 72, -22, -108, - 0, -22, -22, 130, 0, 138, -22, 7, 8, 9, - 10, 11, 12, 13, 0, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 0, 26, 27, 28, 29, - 30, 0, 0, 33, 0, 0, 0, 0, -86, 0, - 168, 0, 0, 34, 7, 8, 9, 10, 11, 12, - 13, -86, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 0, 26, 27, 28, 29, 30, 0, 0, - 33, 0, 0, 0, 0, -86, 0, 184, 0, 0, - 34, 7, 8, 9, 10, 11, 12, 13, -86, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, - 26, 27, 28, 29, 30, 0, 0, 33, 0, 0, - 0, 0, -86, 0, 0, 0, 0, 34, 0, 0, - 0, 0, 6, 0, 0, -86, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 0, 0, 0, 0, 0, -22, 0, - 0, 0, 34, 0, 0, -22, 0, 0, -22, -22, - 7, 8, 9, 10, 11, 12, 13, 0, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 0, 26, - 27, 28, 29, 30, 0, 0, 33, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 34, 0, 0, 0, - 0, 0, 0, 122, 123, 7, 8, 9, 10, 11, - 12, 13, 0, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 0, 26, 27, 28, 29, 30, 0, - 0, 33, 0, 0, 0, 0, 0, 157, 0, 0, - 0, 158, 0, 0, 0, 0, 0, 159, 64, 7, + 87, 88, 113, 156, 38, 10, 146, 163, 72, 127, + 94, 50, 84, 59, 174, 20, 54, 90, 74, 148, + 58, 150, 179, 101, 29, 51, 143, 164, 33, 4, + 55, 70, 106, 113, 55, 113, -93, 102, 134, 60, + 124, 78, 87, 147, 157, 86, 152, 110, 127, 127, + 126, -93, 65, 111, 63, 65, 72, 91, 85, 109, + 153, 160, 97, 110, 64, 98, 65, 53, 99, 111, + 61, 65, 147, 62, 112, 161, 110, 113, 85, 124, + 63, 74, 111, 157, 65, 48, 49, 158, 159, 126, + 64, 65, 65, 87, 104, 105, 107, 108, 51, 55, + 89, 87, 95, 96, 103, 120, 142, 130, 79, 131, + 87, 182, 7, 8, 9, 10, 11, 12, 13, 132, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 133, 26, 27, 28, 29, 30, 112, 149, 33, 34, + 154, 155, 107, 98, 162, -22, 169, 167, 163, 35, + 168, 170, -22, -107, 171, -22, 180, -22, 121, 172, + -22, 176, 7, 8, 9, 10, 11, 12, 13, 177, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 183, 26, 27, 28, 29, 30, 184, 185, 33, 34, + 186, 5, 135, 122, 175, -22, 145, 73, 151, 35, + 0, 0, -22, -108, 0, -22, 0, -22, 6, 0, + -22, 144, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 0, 0, 0, 0, 0, -22, 0, 0, 0, 35, + 0, 0, -22, 0, 137, -22, 0, -22, 7, 8, + 9, 10, 11, 12, 13, 0, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 0, 26, 27, 28, + 29, 30, 0, 0, 33, 34, 0, 0, 0, 0, + -86, 0, 0, 0, 0, 35, 0, 0, 0, 173, + 0, 0, -86, 7, 8, 9, 10, 11, 12, 13, + 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 0, 26, 27, 28, 29, 30, 0, 0, 33, + 34, 0, 0, 0, 0, -86, 0, 0, 0, 0, + 35, 0, 0, 0, 178, 
0, 0, -86, 7, 8, + 9, 10, 11, 12, 13, 0, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 0, 26, 27, 28, + 29, 30, 0, 0, 33, 34, 0, 0, 0, 0, + -86, 0, 0, 0, 0, 35, 0, 0, 0, 0, + 0, 0, -86, 7, 8, 9, 10, 11, 12, 13, + 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 0, 26, 27, 28, 29, 30, 0, 0, 33, + 34, 0, 0, 0, 0, 0, 124, 0, 0, 0, + 125, 0, 0, 0, 0, 0, 126, 0, 65, 7, 8, 9, 10, 11, 12, 13, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 26, 27, - 28, 29, 30, 0, 0, 33, 0, 0, 0, 0, - 178, 0, 0, 0, 0, 34, 7, 8, 9, 10, - 11, 12, 13, 0, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 0, 26, 27, 28, 29, 30, - 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 34 + 28, 29, 30, 0, 0, 33, 34, 0, 0, 0, + 0, 181, 0, 0, 0, 0, 35, 7, 8, 9, + 10, 11, 12, 13, 0, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 0, 26, 27, 28, 29, + 30, 0, 0, 33, 34, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 35 }; #define yypact_value_is_default(yystate) \ - ((yystate) == (-135)) + ((yystate) == (-140)) #define yytable_value_is_error(yytable_value) \ YYID (0) static const yytype_int16 yycheck[] = { - 59, 38, 79, 3, 1, 8, 56, 37, 26, 37, - 64, 31, 1, 147, 37, 18, 30, 1, 32, 109, - 0, 111, 45, 51, 27, 23, 76, 44, 31, 51, - 50, 37, 109, 63, 111, 53, 33, 91, 68, 57, - 37, 100, 47, 177, 41, 51, 37, 33, 37, 86, - 47, 48, 49, 37, 131, 37, 56, 41, 47, 48, - 51, 47, 47, 47, 48, 47, 48, 33, 37, 37, - 51, 33, 149, 41, 44, 45, 76, 44, 45, 47, - 48, 47, 159, 160, 52, 47, 37, 37, 147, 37, - 41, 45, 46, 41, 131, 44, 47, 48, 50, 47, - 48, 159, 160, 46, 45, 49, 165, 1, 34, 49, - 49, 5, 6, 7, 8, 9, 10, 11, 177, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 52, - 24, 25, 26, 27, 28, 34, 44, 31, 44, 46, - 30, 44, 31, 37, 49, 49, 46, 41, 49, 36, - 44, 45, 49, 47, 48, 1, 49, 34, 52, 5, - 6, 7, 8, 9, 10, 11, 49, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 49, 24, 25, - 26, 27, 28, 49, 49, 31, 49, 49, 1, 165, - 81, 37, 94, 149, 107, 41, 112, 49, 44, 45, - -1, 47, 48, 1, -1, 103, 52, 5, 6, 7, - 8, 9, 10, 11, -1, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, -1, 24, 25, 26, 27, - 28, -1, -1, 31, -1, -1, -1, -1, 36, -1, - 1, -1, -1, 41, 5, 6, 7, 8, 9, 10, - 11, 49, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, -1, 24, 25, 26, 27, 28, -1, -1, - 31, -1, -1, -1, -1, 36, -1, 1, -1, -1, - 41, 5, 6, 7, 8, 9, 10, 11, 49, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, -1, - 24, 25, 26, 27, 28, -1, -1, 31, -1, -1, - -1, -1, 36, -1, -1, -1, -1, 41, -1, -1, - -1, -1, 1, -1, -1, 49, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, -1, -1, -1, -1, -1, 37, -1, - -1, -1, 41, -1, -1, 44, -1, -1, 47, 48, - 5, 6, 7, 8, 9, 10, 11, -1, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, -1, 24, - 25, 26, 27, 28, -1, -1, 31, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 41, -1, -1, -1, - -1, -1, -1, 48, 49, 5, 6, 7, 8, 9, - 10, 11, -1, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, -1, 24, 25, 26, 27, 28, -1, - -1, 31, -1, -1, -1, -1, -1, 37, -1, -1, - -1, 41, -1, -1, -1, -1, -1, 47, 48, 5, + 60, 60, 80, 1, 3, 8, 1, 31, 39, 87, + 65, 38, 57, 26, 153, 18, 38, 1, 38, 111, + 38, 113, 161, 34, 27, 52, 46, 51, 31, 0, + 52, 38, 77, 111, 52, 113, 34, 48, 93, 48, + 38, 54, 102, 38, 42, 58, 34, 42, 126, 127, + 48, 49, 50, 48, 38, 50, 87, 64, 57, 38, + 48, 34, 69, 42, 48, 30, 50, 23, 33, 48, + 48, 50, 38, 45, 53, 48, 42, 155, 77, 38, + 38, 38, 48, 42, 50, 45, 46, 126, 127, 48, + 48, 50, 50, 153, 46, 47, 45, 46, 52, 52, + 38, 161, 45, 47, 51, 46, 35, 49, 1, 49, + 170, 170, 5, 6, 7, 8, 9, 10, 11, 49, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 49, 24, 25, 26, 27, 28, 53, 35, 31, 32, + 45, 47, 45, 30, 45, 38, 37, 
49, 31, 42, + 49, 47, 45, 46, 49, 48, 35, 50, 1, 49, + 53, 49, 5, 6, 7, 8, 9, 10, 11, 49, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 49, 24, 25, 26, 27, 28, 49, 49, 31, 32, + 49, 1, 96, 82, 155, 38, 109, 50, 114, 42, + -1, -1, 45, 46, -1, 48, -1, 50, 1, -1, + 53, 105, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + -1, -1, -1, -1, -1, 38, -1, -1, -1, 42, + -1, -1, 45, -1, 1, 48, -1, 50, 5, 6, + 7, 8, 9, 10, 11, -1, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, -1, 24, 25, 26, + 27, 28, -1, -1, 31, 32, -1, -1, -1, -1, + 37, -1, -1, -1, -1, 42, -1, -1, -1, 1, + -1, -1, 49, 5, 6, 7, 8, 9, 10, 11, + -1, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, -1, 24, 25, 26, 27, 28, -1, -1, 31, + 32, -1, -1, -1, -1, 37, -1, -1, -1, -1, + 42, -1, -1, -1, 1, -1, -1, 49, 5, 6, + 7, 8, 9, 10, 11, -1, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, -1, 24, 25, 26, + 27, 28, -1, -1, 31, 32, -1, -1, -1, -1, + 37, -1, -1, -1, -1, 42, -1, -1, -1, -1, + -1, -1, 49, 5, 6, 7, 8, 9, 10, 11, + -1, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, -1, 24, 25, 26, 27, 28, -1, -1, 31, + 32, -1, -1, -1, -1, -1, 38, -1, -1, -1, + 42, -1, -1, -1, -1, -1, 48, -1, 50, 5, 6, 7, 8, 9, 10, 11, -1, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -1, 24, 25, - 26, 27, 28, -1, -1, 31, -1, -1, -1, -1, - 36, -1, -1, -1, -1, 41, 5, 6, 7, 8, - 9, 10, 11, -1, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, -1, 24, 25, 26, 27, 28, - -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, - -1, -1, 41 + 26, 27, 28, -1, -1, 31, 32, -1, -1, -1, + -1, 37, -1, -1, -1, -1, 42, 5, 6, 7, + 8, 9, 10, 11, -1, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, -1, 24, 25, 26, 27, + 28, -1, -1, 31, 32, -1, -1, -1, -1, -1, + -1, -1, -1, -1, 42 }; /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing symbol of state STATE-NUM. 
*/ static const yytype_uint8 yystos[] = { - 0, 54, 55, 56, 0, 55, 1, 5, 6, 7, + 0, 55, 56, 57, 0, 56, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 41, 57, 60, 64, 65, 66, - 67, 68, 69, 73, 84, 99, 101, 44, 45, 37, - 51, 96, 23, 37, 51, 87, 59, 37, 87, 47, - 47, 44, 37, 47, 48, 61, 62, 63, 70, 74, - 75, 66, 96, 37, 97, 98, 58, 87, 1, 64, - 88, 89, 90, 60, 64, 87, 65, 37, 1, 74, - 71, 72, 73, 44, 46, 74, 30, 32, 100, 33, - 47, 50, 45, 46, 60, 44, 45, 37, 41, 47, - 52, 70, 76, 77, 91, 92, 93, 94, 45, 1, - 90, 74, 48, 49, 49, 49, 49, 73, 63, 95, - 1, 65, 78, 79, 80, 81, 34, 45, 98, 94, - 1, 37, 76, 34, 76, 95, 33, 47, 44, 46, - 49, 44, 31, 50, 85, 86, 49, 37, 41, 47, - 70, 82, 83, 49, 36, 46, 49, 49, 1, 78, - 93, 34, 1, 41, 82, 82, 33, 47, 36, 81, - 49, 49, 49, 49, 1, 78, 49, 49 + 28, 29, 30, 31, 32, 42, 58, 61, 65, 66, + 67, 68, 69, 70, 74, 85, 100, 102, 45, 46, + 38, 52, 97, 23, 38, 52, 88, 60, 38, 88, + 48, 48, 45, 38, 48, 50, 62, 63, 64, 71, + 75, 76, 67, 97, 38, 98, 99, 59, 88, 1, + 65, 89, 90, 91, 61, 65, 88, 66, 82, 38, + 1, 75, 72, 73, 74, 45, 47, 75, 30, 33, + 101, 34, 48, 51, 46, 47, 61, 45, 46, 38, + 42, 48, 53, 71, 77, 78, 92, 93, 94, 95, + 46, 1, 91, 75, 38, 42, 48, 71, 83, 84, + 49, 49, 49, 49, 74, 64, 96, 1, 79, 80, + 81, 82, 35, 46, 99, 95, 1, 38, 77, 35, + 77, 96, 34, 48, 45, 47, 1, 42, 83, 83, + 34, 48, 45, 31, 51, 86, 87, 49, 49, 37, + 47, 49, 49, 1, 79, 94, 49, 49, 1, 79, + 35, 37, 82, 49, 49, 49, 49 }; #define yyerrok (yyerrstatus = 0) @@ -890,17 +895,18 @@ static const yytype_uint8 yystos[] = #define YYRECOVERING() (!!yyerrstatus) -#define YYBACKUP(Token, Value) \ -do \ - if (yychar == YYEMPTY && yylen == 1) \ - { \ - yychar = (Token); \ - yylval = (Value); \ - YYPOPSTACK (1); \ - goto yybackup; \ - } \ - else \ - { \ +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (yylen); \ + yystate = *yyssp; \ + goto yybackup; \ + } \ + else \ + { \ yyerror (YY_("syntax error: cannot back up")); \ YYERROR; \ } \ @@ -995,6 +1001,8 @@ yy_symbol_value_print (yyoutput, yytype, yyvaluep) YYSTYPE const * const yyvaluep; #endif { + FILE *yyo = yyoutput; + YYUSE (yyo); if (!yyvaluep) return; # ifdef YYPRINT @@ -1246,12 +1254,12 @@ static int yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg, yytype_int16 *yyssp, int yytoken) { - YYSIZE_T yysize0 = yytnamerr (0, yytname[yytoken]); + YYSIZE_T yysize0 = yytnamerr (YY_NULL, yytname[yytoken]); YYSIZE_T yysize = yysize0; YYSIZE_T yysize1; enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; /* Internationalized format string. */ - const char *yyformat = 0; + const char *yyformat = YY_NULL; /* Arguments of yyformat. */ char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; /* Number of reported tokens (one for the "unexpected", one per @@ -1311,7 +1319,7 @@ yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg, break; } yyarg[yycount++] = yytname[yyx]; - yysize1 = yysize + yytnamerr (0, yytname[yyx]); + yysize1 = yysize + yytnamerr (YY_NULL, yytname[yyx]); if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) return 2; @@ -1463,7 +1471,7 @@ yyparse () `yyss': related to states. `yyvs': related to semantic values. - Refer to the stacks thru separate pointers, to allow yyoverflow + Refer to the stacks through separate pointers, to allow yyoverflow to reallocate them elsewhere. */ /* The state stack. 
*/ @@ -2346,7 +2354,7 @@ yyabortlab: yyresult = 1; goto yyreturn; -#if !defined(yyoverflow) || YYERROR_VERBOSE +#if !defined yyoverflow || YYERROR_VERBOSE /*-------------------------------------------------. | yyexhaustedlab -- memory exhaustion comes here. | `-------------------------------------------------*/ diff --git a/scripts/genksyms/parse.tab.h_shipped b/scripts/genksyms/parse.tab.h_shipped index 93240a3cdecc..a4737dec4532 100644 --- a/scripts/genksyms/parse.tab.h_shipped +++ b/scripts/genksyms/parse.tab.h_shipped @@ -1,8 +1,8 @@ -/* A Bison parser, made by GNU Bison 2.5. */ +/* A Bison parser, made by GNU Bison 2.5.1. */ /* Bison interface for Yacc-like parsers in C - Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc. + Copyright (C) 1984, 1989-1990, 2000-2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -66,18 +66,19 @@ EXPORT_SYMBOL_KEYW = 284, ASM_PHRASE = 285, ATTRIBUTE_PHRASE = 286, - BRACE_PHRASE = 287, - BRACKET_PHRASE = 288, - EXPRESSION_PHRASE = 289, - CHAR = 290, - DOTS = 291, - IDENT = 292, - INT = 293, - REAL = 294, - STRING = 295, - TYPE = 296, - OTHER = 297, - FILENAME = 298 + TYPEOF_PHRASE = 287, + BRACE_PHRASE = 288, + BRACKET_PHRASE = 289, + EXPRESSION_PHRASE = 290, + CHAR = 291, + DOTS = 292, + IDENT = 293, + INT = 294, + REAL = 295, + STRING = 296, + TYPE = 297, + OTHER = 298, + FILENAME = 299 }; #endif diff --git a/scripts/genksyms/parse.y b/scripts/genksyms/parse.y index 23c39998ad86..b9f4cf202302 100644 --- a/scripts/genksyms/parse.y +++ b/scripts/genksyms/parse.y @@ -103,6 +103,7 @@ static void record_compound(struct string_list **keyw, %token ASM_PHRASE %token ATTRIBUTE_PHRASE +%token TYPEOF_PHRASE %token BRACE_PHRASE %token BRACKET_PHRASE %token EXPRESSION_PHRASE @@ -220,8 +221,8 @@ storage_class_specifier: type_specifier: simple_type_specifier | cvar_qualifier - | TYPEOF_KEYW '(' decl_specifier_seq '*' ')' - | TYPEOF_KEYW '(' decl_specifier_seq ')' + | TYPEOF_KEYW '(' parameter_declaration ')' + | TYPEOF_PHRASE /* References to s/u/e's defined elsewhere. Rearrange things so that it is easier to expand the definition fully later. */ -- cgit v1.2.3 From d0df04f7cc2b31ae1eb3b70f9333b9f91a963f9b Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Thu, 3 Apr 2014 14:46:38 -0700 Subject: score: remove unused CPU_SCORE7 Kconfig parameter This removes the CPU_SCORE7 Kconfig parameter, which is no longer used anywhere in the source code and Makefiles. 
Signed-off-by: Michael Opdenacker Cc: Chen Liqin Cc: Lennox Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/score/Kconfig | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/score/Kconfig b/arch/score/Kconfig index c75d06aa27c3..4ac8cae5727c 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -22,27 +22,21 @@ choice config ARCH_SCORE7 bool "SCORE7 processor" select SYS_SUPPORTS_32BIT_KERNEL - select CPU_SCORE7 select GENERIC_HAS_IOMAP config MACH_SPCT6600 bool "SPCT6600 series based machines" select SYS_SUPPORTS_32BIT_KERNEL - select CPU_SCORE7 select GENERIC_HAS_IOMAP config SCORE_SIM bool "Score simulator" select SYS_SUPPORTS_32BIT_KERNEL - select CPU_SCORE7 select GENERIC_HAS_IOMAP endchoice endmenu -config CPU_SCORE7 - bool - config NO_DMA bool default y -- cgit v1.2.3 From abafe5d9b04648a2f699202e9ae2d15ffe44c3a3 Mon Sep 17 00:00:00 2001 From: Bobby Bingham Date: Thu, 3 Apr 2014 14:46:39 -0700 Subject: sh: push extra copy of r0-r2 for syscall parameters When invoking syscall handlers on sh32, the saved userspace registers are at the top of the stack. This seems to have been intentional, as it is an easy way to pass r0, r1, ... to the handler as parameters 5, 6, ... It causes problems, however, because the compiler is allowed to generate code for a function which clobbers that function's own parameters. For example, gcc generates the following code for clone: mov.l 8c020714 ,r1 ! 8c020540 mov.l r7,@r15 mov r6,r7 jmp @r1 mov #0,r6 nop .word 0x0540 .word 0x8c02 The `mov.l r7,@r15` clobbers the saved value of r0 passed from userspace. For most system calls, this might not be a problem, because we'll be overwriting r0 with the return value anyway. But in the case of clone, copy_thread will need the original value of r0 if the CLONE_SETTLS flag was specified. The first patch in this series fixes this issue for system calls by pushing to the stack an extra copy of r0-r2 before invoking the handler. We discard this copy before restoring the userspace registers, so it is not a problem if they are clobbered. Exception handlers also receive the userspace register values in a similar manner, and may hit the same problem. The second patch removes the do_fpu_error handler, which looks susceptible to this problem and which, as far as I can tell, has not been used in some time. The third patch addresses other exception handlers. This patch (of 3): The userspace registers are stored at the top of the stack when the syscall handler is invoked, which allows r0-r2 to act as parameters 5-7. Parameters passed on the stack may be clobbered by the syscall handler. The solution is to push an extra copy of the registers which might be used as syscall parameters to the stack, so that the authoritative set of saved register values does not get clobbered. A few system call handlers are also updated to get the userspace registers using current_pt_regs() instead of from the stack. 
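To make the clobbering hazard concrete, here is a minimal C sketch (illustrative only, not taken from this series; the function and parameter names are invented). On sh32 the first four integer arguments travel in r4-r7, so the fifth argument below arrives in a stack slot owned by the callee, and under the old scheme that slot was the only saved copy of a userspace register:

	/* Hypothetical example, not kernel code. */
	long example_handler(unsigned long arg0, unsigned long arg1,
			     unsigned long arg2, unsigned long arg3,
			     unsigned long saved_r0)	/* passed on the stack */
	{
		/*
		 * The compiler may implement this assignment as a store
		 * into the stack slot holding saved_r0 (compare the
		 * "mov.l r7,@r15" spill in the clone disassembly above).
		 * If that slot is also the only saved copy of userspace
		 * r0, the saved register is silently corrupted.
		 */
		saved_r0 = arg0 + arg1;
		return (long)saved_r0;
	}

With the extra copy pushed by syscall_call, the handler is free to clobber its stack-passed parameters; the authoritative values at OFF_R0..OFF_R2 remain intact.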
Signed-off-by: Bobby Bingham Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/syscalls_32.h | 12 +++--------- arch/sh/kernel/entry-common.S | 15 +++++++++++---- arch/sh/kernel/signal_32.c | 12 ++++-------- arch/sh/kernel/sys_sh32.c | 7 ++----- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/arch/sh/include/asm/syscalls_32.h b/arch/sh/include/asm/syscalls_32.h index 4f97df87d7d5..4f643aa718e3 100644 --- a/arch/sh/include/asm/syscalls_32.h +++ b/arch/sh/include/asm/syscalls_32.h @@ -9,15 +9,9 @@ struct pt_regs; -asmlinkage int sys_sigreturn(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs); -asmlinkage int sys_rt_sigreturn(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs); -asmlinkage int sys_sh_pipe(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs); +asmlinkage int sys_sigreturn(void); +asmlinkage int sys_rt_sigreturn(void); +asmlinkage int sys_sh_pipe(void); asmlinkage ssize_t sys_pread_wrapper(unsigned int fd, char __user *buf, size_t count, long dummy, loff_t pos); asmlinkage ssize_t sys_pwrite_wrapper(unsigned int fd, const char __user *buf, diff --git a/arch/sh/kernel/entry-common.S b/arch/sh/kernel/entry-common.S index ca46834294b7..13047a4facd2 100644 --- a/arch/sh/kernel/entry-common.S +++ b/arch/sh/kernel/entry-common.S @@ -193,10 +193,10 @@ syscall_trace_entry: ! Reload R0-R4 from kernel stack, where the ! parent may have modified them using ! ptrace(POKEUSR). (Note that R0-R2 are - ! used by the system call handler directly - ! from the kernel stack anyway, so don't need - ! to be reloaded here.) This allows the parent - ! to rewrite system calls and args on the fly. + ! reloaded from the kernel stack by syscall_call + ! below, so don't need to be reloaded here.) + ! This allows the parent to rewrite system calls + ! and args on the fly. mov.l @(OFF_R4,r15), r4 ! arg0 mov.l @(OFF_R5,r15), r5 mov.l @(OFF_R6,r15), r6 @@ -357,8 +357,15 @@ syscall_call: mov.l 3f, r8 ! Load the address of sys_call_table add r8, r3 mov.l @r3, r8 + mov.l @(OFF_R2,r15), r2 + mov.l @(OFF_R1,r15), r1 + mov.l @(OFF_R0,r15), r0 + mov.l r2, @-r15 + mov.l r1, @-r15 + mov.l r0, @-r15 jsr @r8 ! jump to specific syscall handler nop + add #12, r15 mov.l @(OFF_R0,r15), r12 ! save r0 mov.l r0, @(OFF_R0,r15) ! save the return value ! 
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 6af6e7c5cac8..594cd371aa28 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -148,11 +148,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *r0_p return err; } -asmlinkage int sys_sigreturn(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs) +asmlinkage int sys_sigreturn(void) { - struct pt_regs *regs = RELOC_HIDE(&__regs, 0); + struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame = (struct sigframe __user *)regs->regs[15]; sigset_t set; int r0; @@ -180,11 +178,9 @@ badframe: return 0; } -asmlinkage int sys_rt_sigreturn(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs) +asmlinkage int sys_rt_sigreturn(void) { - struct pt_regs *regs = RELOC_HIDE(&__regs, 0); + struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame = (struct rt_sigframe __user *)regs->regs[15]; sigset_t set; int r0; diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index 497bab3a0401..b66d1c62eb19 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -21,17 +21,14 @@ * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ -asmlinkage int sys_sh_pipe(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs) +asmlinkage int sys_sh_pipe(void) { - struct pt_regs *regs = RELOC_HIDE(&__regs, 0); int fd[2]; int error; error = do_pipe_flags(fd, 0); if (!error) { - regs->regs[1] = fd[1]; + current_pt_regs()->regs[1] = fd[1]; return fd[0]; } return error; -- cgit v1.2.3 From 7caf62de25554da3af00c92c11afa95dcc3592c4 Mon Sep 17 00:00:00 2001 From: Bobby Bingham Date: Thu, 3 Apr 2014 14:46:40 -0700 Subject: sh: remove unused do_fpu_error This does not appear to have been used since commit 74d99a5e2622 ("sh: SH-2A FPU support") in 2007. Signed-off-by: Bobby Bingham Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/math-emu/math.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index b876780c1e1c..04aa55fa8c75 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -574,24 +574,6 @@ static int ieee_fpe_handler(struct pt_regs *regs) return 0; } -asmlinkage void do_fpu_error(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs regs) -{ - struct task_struct *tsk = current; - siginfo_t info; - - if (ieee_fpe_handler (®s)) - return; - - regs.pc += 2; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = FPE_FLTINV; - info.si_addr = (void __user *)regs.pc; - force_sig_info(SIGFPE, &info, tsk); -} - /** * fpu_init - Initialize FPU registers * @fpu: Pointer to software emulated FPU registers. -- cgit v1.2.3 From a3c195144e162097c42e4284323ed6d386de105d Mon Sep 17 00:00:00 2001 From: Bobby Bingham Date: Thu, 3 Apr 2014 14:46:41 -0700 Subject: sh: don't pass saved userspace state to exception handlers The compiler is permitted to generate code which overwrites the parameters to a function. If those parameters include the only saved copy we have of userspace's registers, we're in trouble. 
Signed-off-by: Bobby Bingham Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/traps_32.h | 16 ++++------------ arch/sh/kernel/traps_32.c | 23 +++++++---------------- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/arch/sh/include/asm/traps_32.h b/arch/sh/include/asm/traps_32.h index cfd55ff9dff2..17e129fe459c 100644 --- a/arch/sh/include/asm/traps_32.h +++ b/arch/sh/include/asm/traps_32.h @@ -42,18 +42,10 @@ static inline void trigger_address_error(void) asmlinkage void do_address_error(struct pt_regs *regs, unsigned long writeaccess, unsigned long address); -asmlinkage void do_divide_error(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs); -asmlinkage void do_reserved_inst(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs); -asmlinkage void do_illegal_slot_inst(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs); -asmlinkage void do_exception_error(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs); +asmlinkage void do_divide_error(unsigned long r4); +asmlinkage void do_reserved_inst(void); +asmlinkage void do_illegal_slot_inst(void); +asmlinkage void do_exception_error(void); #define BUILD_TRAP_HANDLER(name) \ asmlinkage void name##_trap_handler(unsigned long r4, unsigned long r5, \ diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 68e99f09171d..ff639342a8be 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -594,9 +594,7 @@ int is_dsp_inst(struct pt_regs *regs) #endif /* CONFIG_SH_DSP */ #ifdef CONFIG_CPU_SH2A -asmlinkage void do_divide_error(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs) +asmlinkage void do_divide_error(unsigned long r4) { siginfo_t info; @@ -613,11 +611,9 @@ asmlinkage void do_divide_error(unsigned long r4, unsigned long r5, } #endif -asmlinkage void do_reserved_inst(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs) +asmlinkage void do_reserved_inst(void) { - struct pt_regs *regs = RELOC_HIDE(&__regs, 0); + struct pt_regs *regs = current_pt_regs(); unsigned long error_code; struct task_struct *tsk = current; @@ -701,11 +697,9 @@ static int emulate_branch(unsigned short inst, struct pt_regs *regs) } #endif -asmlinkage void do_illegal_slot_inst(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs) +asmlinkage void do_illegal_slot_inst(void) { - struct pt_regs *regs = RELOC_HIDE(&__regs, 0); + struct pt_regs *regs = current_pt_regs(); unsigned long inst; struct task_struct *tsk = current; @@ -730,15 +724,12 @@ asmlinkage void do_illegal_slot_inst(unsigned long r4, unsigned long r5, die_if_no_fixup("illegal slot instruction", regs, inst); } -asmlinkage void do_exception_error(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs) +asmlinkage void do_exception_error(void) { - struct pt_regs *regs = RELOC_HIDE(&__regs, 0); long ex; ex = lookup_exception_vector(); - die_if_kernel("exception", regs, ex); + die_if_kernel("exception", current_pt_regs(), ex); } void per_cpu_trap_init(void) -- cgit v1.2.3 From f0767e897816c82cdb27058bd3be76442b7b5b10 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 3 Apr 2014 14:46:42 -0700 Subject: arch/sh/boards/board-sh7757lcr.c: fixup SDHI register 
size sh7757lcr SDHI register size is 0x100 Signed-off-by: Kuninori Morimoto Cc: Simon Horman Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/boards/board-sh7757lcr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/boards/board-sh7757lcr.c b/arch/sh/boards/board-sh7757lcr.c index 25c5a932f9fe..669df51a82e3 100644 --- a/arch/sh/boards/board-sh7757lcr.c +++ b/arch/sh/boards/board-sh7757lcr.c @@ -252,7 +252,7 @@ static struct sh_mobile_sdhi_info sdhi_info = { static struct resource sdhi_resources[] = { [0] = { .start = 0xffe50000, - .end = 0xffe501ff, + .end = 0xffe500ff, .flags = IORESOURCE_MEM, }, [1] = { -- cgit v1.2.3 From ba6e8b8f02e0431550e9e3c12cbc7dd2d1e14534 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 3 Apr 2014 14:46:43 -0700 Subject: sh: sh7757: switch RSPI clock to dev ID match Switch the RSPI MSTP clock on SH7757 from a con ID match to a dev ID match, so we can start looking it up using clk_get() with a NULL ID. Signed-off-by: Geert Uytterhoeven Tested-by: Yoshihiro Shimoda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/cpu/sh4a/clock-sh7757.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7757.c b/arch/sh/kernel/cpu/sh4a/clock-sh7757.c index e84a43229b9c..5c0e3c335161 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7757.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7757.c @@ -132,7 +132,7 @@ static struct clk_lookup lookups[] = { CLKDEV_CON_ID("usb_fck", &mstp_clks[MSTP103]), CLKDEV_DEV_ID("renesas_usbhs.0", &mstp_clks[MSTP102]), CLKDEV_CON_ID("mmc0", &mstp_clks[MSTP220]), - CLKDEV_CON_ID("rspi2", &mstp_clks[MSTP127]), + CLKDEV_DEV_ID("rspi.2", &mstp_clks[MSTP127]), }; int __init arch_clk_init(void) -- cgit v1.2.3 From 0c3d1d62f99b3c83b0c0a0f306b78e65d49fda30 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 3 Apr 2014 14:46:44 -0700 Subject: arch/sh/drivers/pci/pcie-sh7786.h: remove duplicate SH4A_PCIEPHYCTLR Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/drivers/pci/pcie-sh7786.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/sh/drivers/pci/pcie-sh7786.h b/arch/sh/drivers/pci/pcie-sh7786.h index 1ee054e47eae..4a6ff55f759b 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.h +++ b/arch/sh/drivers/pci/pcie-sh7786.h @@ -145,9 +145,6 @@ /* PCIERMSGIER */ #define SH4A_PCIERMSGIER (0x004040) /* R/W - 0x0000 0000 32 */ -/* PCIEPHYCTLR */ -#define SH4A_PCIEPHYCTLR (0x010000) /* R/W - 0x0000 0000 32 */ - /* PCIEPHYADRR */ #define SH4A_PCIEPHYADRR (0x010004) /* R/W - 0x0000 0000 32 */ #define BITS_ACK (24) // Rev1.171 -- cgit v1.2.3 From 181a9a043b5424f2e6451297bbc27b196fe88475 Mon Sep 17 00:00:00 2001 From: Zongxun Wang Date: Thu, 3 Apr 2014 14:46:45 -0700 Subject: ocfs2: fix null pointer dereference when accessing dlm_state before launching dlm thread When mounting an ocfs2 volume, it will first generate a file /sys/kernel/debug/o2dlm//dlm_state, and then launch the dlm thread. So the following situation will cause a null pointer dereference. dlm_debug_init -> access file dlm_state which will call dlm_state_print -> dlm_launch_thread Moving dlm_debug_init after dlm_launch_thread and dlm_launch_recovery_thread fixes this issue. 
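For context, the window being closed can be sketched as follows (illustrative only; the structure and function below are simplified stand-ins, not the actual ocfs2 definitions). A debugfs read handler like dlm_state_print is only safe to expose once the dlm thread has been launched:

	#include <linux/kernel.h>
	#include <linux/sched.h>

	/* Simplified stand-in for the real dlm context. */
	struct example_dlm_ctxt {
		struct task_struct *dlm_thread_task;	/* set by dlm_launch_thread() */
	};

	static int example_dlm_state_print(struct example_dlm_ctxt *dlm,
					   char *buf, int len)
	{
		/*
		 * If the debugfs file is registered before
		 * dlm_launch_thread(), a reader can get here while
		 * dlm_thread_task is still NULL and the dereference
		 * below oopses.  Registering the file only after both
		 * threads are running removes the window.
		 */
		return snprintf(buf, len, "Thread Pid: %d\n",
				task_pid_nr(dlm->dlm_thread_task));
	}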
Signed-off-by: Zongxun Wang Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 33660a4a52fa..1307a8cff8db 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1877,19 +1877,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - status = dlm_debug_init(dlm); + status = dlm_launch_thread(dlm); if (status < 0) { mlog_errno(status); goto bail; } - status = dlm_launch_thread(dlm); + status = dlm_launch_recovery_thread(dlm); if (status < 0) { mlog_errno(status); goto bail; } - status = dlm_launch_recovery_thread(dlm); + status = dlm_debug_init(dlm); if (status < 0) { mlog_errno(status); goto bail; -- cgit v1.2.3 From c18ceab01240fd4c354b78d877571b729908e4a3 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 3 Apr 2014 14:46:46 -0700 Subject: ocfs2: change ip_unaligned_aio to of type mutex from atomit_t There is a problem that waitqueue_active() may check stale data thus miss a wakeup of threads waiting on ip_unaligned_aio. The valid value of ip_unaligned_aio is only 0 and 1 so we can change it to be of type mutex thus the above prolem is avoid. Another benifit is that mutex which works as FIFO is fairer than wake_up_all(). Signed-off-by: Wengang Wang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 6 +----- fs/ocfs2/aops.h | 5 ----- fs/ocfs2/file.c | 15 +++------------ fs/ocfs2/inode.h | 2 +- fs/ocfs2/super.c | 9 ++------- 5 files changed, 7 insertions(+), 30 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index aeb44e879c51..ebe44f7dce0b 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -571,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -582,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); - if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && - waitqueue_active(wq)) { - wake_up_all(wq); - } + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } ocfs2_iocb_clear_rw_locked(iocb); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f671e49beb34..6cae155d54df 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits { #define ocfs2_iocb_is_unaligned_aio(iocb) \ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define OCFS2_IOEND_WQ_HASH_SZ 37 -#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ - OCFS2_IOEND_WQ_HASH_SZ]) -extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 51632c40e896..1673438789fe 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2061,13 +2061,6 @@ out: return ret; } -static void ocfs2_aiodio_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); -} - static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) { int blockmask = inode->i_sb->s_blocksize - 1; @@ -2345,10 +2338,8 @@ relock: * Wait on previous unaligned aio 
to complete before * proceeding. */ - ocfs2_aiodio_wait(inode); - - /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ - atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); + mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); + /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ ocfs2_iocb_set_unaligned_aio(iocb); } @@ -2428,7 +2419,7 @@ out_dio: if (unaligned_dio) { ocfs2_iocb_clear_unaligned_aio(iocb); - atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } out: diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 621fc73bf23d..9f1580b506a5 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -44,7 +44,7 @@ struct ocfs2_inode_info struct rw_semaphore ip_xattr_sem; /* Number of outstanding AIO's which are not page aligned */ - atomic_t ip_unaligned_aio; + struct mutex ip_unaligned_aio; /* These fields are protected by ip_lock */ spinlock_t ip_lock; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 49d84f80f36c..d17145552097 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1612,14 +1612,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) return 0; } -wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - static int __init ocfs2_init(void) { - int status, i; - - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) - init_waitqueue_head(&ocfs2__ioend_wq[i]); + int status; status = init_ocfs2_uptodate_cache(); if (status < 0) @@ -1761,7 +1756,7 @@ static void ocfs2_inode_init_once(void *data) ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); oi->ip_dir_start_lookup = 0; - atomic_set(&oi->ip_unaligned_aio, 0); + mutex_init(&oi->ip_unaligned_aio); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); -- cgit v1.2.3 From a75fe48cad2fb81e0e2671c73aea6c78ce5626d4 Mon Sep 17 00:00:00 2001 From: "joyce.xue" Date: Thu, 3 Apr 2014 14:46:47 -0700 Subject: ocfs2: remove unused variable uuid_net_key in ocfs2_initialize_super Variable uuid_net_key in ocfs2_initialize_super() is not used. Clean it up. Signed-off-by: joyce.xue Signed-off-by: Joseph Qi Acked-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d17145552097..d7190b2cfd40 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2072,7 +2072,6 @@ static int ocfs2_initialize_super(struct super_block *sb, struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; struct ocfs2_journal *journal; - __le32 uuid_net_key; struct ocfs2_super *osb; u64 total_blocks; @@ -2306,8 +2305,6 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); - strncpy(osb->vol_label, di->id2.i_super.s_label, 63); osb->vol_label[63] = '\0'; osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); -- cgit v1.2.3 From 2931cdcb49194503b19345c597b68fdcf78396f8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Apr 2014 14:46:48 -0700 Subject: ocfs2: improve fsync efficiency and fix deadlock between aio_write and sync_file Currently, ocfs2_sync_file grabs i_mutex and forces the current journal transaction to complete. This isn't terribly efficient, since sync_file really only needs to wait for the last transaction involving that inode to complete, and this doesn't require i_mutex. 
Therefore, implement the necessary bits to track the newest tid associated with an inode, and teach sync_file to wait for that instead of waiting for everything in the journal to commit. Furthermore, only issue the flush request to the drive if jbd2 hasn't already done so. This also eliminates the deadlock between ocfs2_file_aio_write() and ocfs2_sync_file(). aio_write takes i_mutex then calls ocfs2_aiodio_wait() to wait for unaligned dio writes to finish. However, if that dio completion involves calling fsync, then we can get into trouble when some ocfs2_sync_file tries to take i_mutex. Signed-off-by: Darrick J. Wong Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 1 + fs/ocfs2/aops.c | 1 + fs/ocfs2/dir.c | 4 ++++ fs/ocfs2/file.c | 36 +++++++++++++++--------------------- fs/ocfs2/inode.c | 28 ++++++++++++++++++++++++++++ fs/ocfs2/inode.h | 7 +++++++ fs/ocfs2/journal.h | 11 +++++++++++ fs/ocfs2/namei.c | 4 ++++ fs/ocfs2/super.c | 3 +++ 9 files changed, 74 insertions(+), 21 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e2edff38be52..6b97d68e34d3 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6932,6 +6932,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_dinode_new_extent_list(inode, di); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ebe44f7dce0b..d310d12a9adc 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2039,6 +2039,7 @@ out_write_size: inode->i_mtime = inode->i_ctime = CURRENT_TIME; di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 91a7e85ac8fd..8b48e9b7ad0e 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_init_dir_trailer(dir, dirdata_bh, i); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dirdata_bh); if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3338,6 +3339,7 @@ do_extend: } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, new_bh); dir_i_size += dir->i_sb->s_blocksize; @@ -3896,6 +3898,7 @@ out_commit: dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_commit_trans(osb, handle); out: @@ -4134,6 +4137,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, mlog_errno(ret); did_quota = 0; + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dx_root_bh); out_commit: diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1673438789fe..bd94d26b0b21 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { int err = 0; - journal_t *journal; struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; 
trace_ocfs2_sync_file(inode, file, file->f_path.dentry, OCFS2_I(inode)->ip_blkno, @@ -192,29 +196,19 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, if (err) return err; - /* - * Probably don't need the i_mutex at all in here, just putting it here - * to be consistent with how fsync used to be called, someone more - * familiar with the fs could possibly remove it. - */ - mutex_lock(&inode->i_mutex); - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { - /* - * We still have to flush drive's caches to get data to the - * platter - */ - if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - goto bail; + commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + err = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!err) + err = ret; } - journal = osb->journal->j_journal; - err = jbd2_journal_force_commit(journal); - -bail: if (err) mlog_errno(err); - mutex_unlock(&inode->i_mutex); return (err < 0) ? -EIO : 0; } @@ -650,7 +644,7 @@ restarted_transaction: mlog_errno(status); goto leave; } - + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(inode)->ip_lock); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f29a90fde619..28ab8a9e88a1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = OCFS2_SB(sb)->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, goto bail; } + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + oi->i_sync_tid = tid; + oi->i_datasync_tid = tid; + } + bail: if (!IS_ERR(inode)) { trace_ocfs2_iget_end(inode, @@ -1260,6 +1287,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); ocfs2_journal_dirty(handle, bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: return status; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 9f1580b506a5..837e5e42af85 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -73,6 +73,13 @@ struct ocfs2_inode_info u32 ip_dir_lock_gen; struct ocfs2_alloc_reservation ip_la_data_resv; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. 
+ */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 9ff4e8cf9d97..7f8cde94abfe 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -626,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode, new_size); } +static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 3683643f3f0e..e61e4c9a077c 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -495,6 +495,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; u16 feat; + struct ocfs2_inode_info *oi = OCFS2_I(inode); *new_fe_bh = NULL; @@ -576,6 +577,9 @@ static int __ocfs2_mknod_locked(struct inode *dir, mlog_errno(status); } + oi->i_sync_tid = handle->h_transaction->t_tid; + oi->i_datasync_tid = handle->h_transaction->t_tid; + status = 0; /* error in ocfs2_create_new_inode_locks is not * critical */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d7190b2cfd40..9fef73da1ca5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) if (!oi) return NULL; + oi->i_sync_tid = 0; + oi->i_datasync_tid = 0; + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } -- cgit v1.2.3 From 34aa8dac482f1358d59110d5e3a12f4351f6acaa Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 3 Apr 2014 14:46:49 -0700 Subject: ocfs2: dlm: fix lock migration crash This issue was introduced by commit 800deef3f6f8 ("ocfs2: use list_for_each_entry where benefical") in 2007 where it replaced list_for_each with list_for_each_entry. The variable "lock" will point to invalid data if "tmpq" list is empty and a panic will be triggered due to this. Sunil advised reverting it back, but the old version was also not right. At the end of the outer for loop, that list_for_each_entry will also set "lock" to an invalid data, then in the next loop, if the "tmpq" list is empty, "lock" will be an stale invalid data and cause the panic. So reverting the list_for_each back and reset "lock" to NULL to fix this issue. Another concern is that this seemes can not happen because the "tmpq" list should not be empty. Let me describe how. old lock resource owner(node 1): migratation target(node 2): image there's lockres with a EX lock from node 2 in granted list, a NR lock from node x with convert_type EX in converting list. dlm_empty_lockres() { dlm_pick_migration_target() { pick node 2 as target as its lock is the first one in granted list. } dlm_migrate_lockres() { dlm_mark_lockres_migrating() { res->state |= DLM_LOCK_RES_BLOCK_DIRTY; wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); //after the above code, we can not dirty lockres any more, // so dlm_thread shuffle list will not run downconvert lock from EX to NR upconvert lock from NR to EX <<< migration may schedule out here, then <<< node 2 send down convert request to convert type from EX to <<< NR, then send up convert request to convert type from NR to <<< EX, at this time, lockres granted list is empty, and two locks <<< in the converting list, node x up convert lock followed by <<< node 2 up convert lock. 
// will set lockres RES_MIGRATING flag, the following // lock/unlock can not run dlm_lockres_release_ast(dlm, res); } dlm_send_one_lockres() dlm_process_recovery_data() for (i=0; inum_locks; i++) if (ml->node == dlm->node_num) for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { list_for_each_entry(lock, tmpq, list) if (lock) break; <<< lock is invalid as grant list is empty. } if (lock->ml.node != ml->node) BUG() >>> crash here } I see the above locks status from a vmcore of our internal bug. Signed-off-by: Junxiao Bi Reviewed-by: Wengang Wang Cc: Sunil Mushran Reviewed-by: Srinivas Eeda Cc: Joel Becker Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 7035af09cc03..c2dd2589e04e 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1750,13 +1750,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue; + struct list_head *queue, *iter; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; int i, j, bad; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock; u8 from = O2NM_MAX_NODES; unsigned int added = 0; __be64 c; @@ -1791,14 +1791,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* MIGRATION ONLY! */ BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + lock = NULL; spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each_entry(lock, tmpq, list) { - if (lock->ml.cookie != ml->cookie) - lock = NULL; - else + list_for_each(iter, tmpq) { + lock = list_entry(iter, + struct dlm_lock, list); + if (lock->ml.cookie == ml->cookie) break; + lock = NULL; } if (lock) break; -- cgit v1.2.3 From ded2cf71419b9353060e633b59e446c42a6a2a09 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 3 Apr 2014 14:46:51 -0700 Subject: ocfs2: dlm: fix recovery hung There is a race window in dlm_do_recovery() between dlm_remaster_locks() and dlm_reset_recovery() when the recovery master nearly finish the recovery process for a dead node. After the master sends FINALIZE_RECO message in dlm_remaster_locks(), another node may become the recovery master for another dead node, and then send the BEGIN_RECO message to all the nodes included the old master, in the handler of this message dlm_begin_reco_handler() of old master, dlm->reco.dead_node and dlm->reco.new_master will be set to the second dead node and the new master, then in dlm_reset_recovery(), these two variables will be reset to default value. This will cause new recovery master can not finish the recovery process and hung, at last the whole cluster will hung for recovery. old recovery master: new recovery master: dlm_remaster_locks() become recovery master for another dead node. 
dlm_send_begin_reco_message() dlm_begin_reco_handler() { if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { return -EAGAIN; } dlm_set_reco_master(dlm, br->node_idx); dlm_set_reco_dead_node(dlm, br->dead_node); } dlm_reset_recovery() { dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } will hang in dlm_remaster_locks() for request dlm locks info Before send FINALIZE_RECO message, recovery master should set DLM_RECO_STATE_FINALIZE for itself and clear it after the recovery done, this can break the race windows as the BEGIN_RECO messages will not be handled before DLM_RECO_STATE_FINALIZE flag is cleared. A similar race may happen between new recovery master and normal node which is in dlm_finalize_reco_handler(), also fix it. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Wengang Wang Cc: Joel Becker Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index c2dd2589e04e..fe29f7978f81 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -537,7 +537,10 @@ master_here: /* success! see if any other nodes need recovery */ mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", dlm->name, dlm->reco.dead_node, dlm->node_num); - dlm_reset_recovery(dlm); + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); } dlm_end_recovery(dlm); @@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) if (all_nodes_done) { int ret; + /* Set this flag on recovery master to avoid + * a new recovery for another dead node start + * before the recovery is not done. That may + * cause recovery hung.*/ + spin_lock(&dlm->spinlock); + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state * just send a finalize message to everyone and * clean up */ @@ -2884,8 +2895,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, BUG(); } dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); - dlm_reset_recovery(dlm); dlm_kick_recovery_thread(dlm); break; default: -- cgit v1.2.3 From 765aabbbc72923bdb9116e49b1fc27ef22c6e65a Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 3 Apr 2014 14:46:52 -0700 Subject: ocfs2: add dlm_recover_callback_support in sysfs This is a part of the nocontrold feature which was incorporated sometime back. This is required for backward compatibility of the tools, specifically the scenario where the tools with recovery callback is used with a kernel not using the recovery callbacks (older kernel + newer tools). The tools look for this file to understand if the kernel supports DLM recovery callbacks. For kernels which support recovery callbacks but will miss this patch, ocfs2 will continue to use the older API and would still be able to mount the filesystem. 
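For the userspace side this amounts to a simple capability probe. A minimal, hypothetical example follows; the sysfs path is an assumption here (based on the stackglue attributes living under /sys/fs/ocfs2), and the kernel itself never reads this file:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path; the tools only care whether the attribute exists. */
	if (access("/sys/fs/ocfs2/dlm_recover_callback_support", F_OK) == 0)
		printf("kernel supports DLM recovery callbacks\n");
	else
		printf("no recovery callbacks: fall back to the older API\n");
	return 0;
}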
[akpm@linux-foundation.org: simplify] [sfr@canb.auug.org.au: VERIFY_OCTAL_PERMISSIONS fix up] Signed-off-by: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index ca5ce14cbddc..5c8343fe7438 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -603,11 +603,25 @@ static struct kobj_attribute ocfs2_attr_cluster_stack = ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); + + +static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "1\n"); +} + +static struct kobj_attribute ocfs2_attr_dlm_recover_support = + __ATTR(dlm_recover_callback_support, S_IRUGO, + ocfs2_dlm_recover_show, NULL); + static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, + &ocfs2_attr_dlm_recover_support.attr, NULL, }; -- cgit v1.2.3 From 7bf619c1425d5f03e33c744921f6251f4d0d745f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:53 -0700 Subject: ocfs2: remove OCFS2_INODE_SKIP_DELETE flag The flag was never set, delete it. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 6 ------ fs/ocfs2/inode.h | 8 +++----- fs/ocfs2/journal.c | 6 ------ 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 28ab8a9e88a1..2f7d75d59f04 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -849,12 +849,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail_unlock; } - /* If we have allowd wipe of this inode for another node, it - * will be marked here so we can safely skip it. Recovery will - * cleanup any inodes we might inadvertently skip here. */ - if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) - goto bail_unlock; - ret = 1; bail_unlock: spin_unlock(&oi->ip_lock); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 837e5e42af85..a6c991c0fc98 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -91,8 +91,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_BITMAP 0x00000004 /* This inode has been wiped from disk */ #define OCFS2_INODE_DELETED 0x00000008 -/* Another node is deleting, so our delete is a nop */ -#define OCFS2_INODE_SKIP_DELETE 0x00000010 /* Has the inode been orphaned on another node? * * This hints to ocfs2_drop_inode that it should clear i_nlink before @@ -107,11 +105,11 @@ struct ocfs2_inode_info * rely on ocfs2_delete_inode to sort things out under the proper * cluster locks. 
*/ -#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010 /* Does someone have the file open O_DIRECT */ -#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +#define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ -#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 44fc3e530c3d..03ea9314fecd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; spin_lock(&oi->ip_lock); - /* The remote delete code may have set these on the - * assumption that the other node would wipe them - * successfully. If they are still in the node's - * orphan dir, we need to reset that state. */ - oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); - /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; -- cgit v1.2.3 From bd62ad7aebd8e8895bb7649ace948040332f27d3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:54 -0700 Subject: ocfs2: move dquot_initialize() in ocfs2_delete_inode() somewhat later Move dquot_initalize() call in ocfs2_delete_inode() after the moment we verify inode is actually a sane one to delete. We certainly don't want to initialize quota for system inodes etc. This also avoids calling into quota code from downconvert thread. Add more details into the comment why bailing out from ocfs2_delete_inode() when we are in downconvert thread is OK. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 2f7d75d59f04..809b5d57a6b8 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -831,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail; } - /* If we're coming from downconvert_thread we can't go into our own - * voting [hello, deadlock city!], so unforuntately we just - * have to skip deleting this guy. That's OK though because - * the node who's doing the actual deleting should handle it - * anyway. */ + /* + * If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!] so we cannot delete the inode. But + * since we dropped last inode ref when downconverting dentry lock, + * we cannot have the file open and thus the node doing unlink will + * take care of deleting the inode. + */ if (current == osb->dc_task) goto bail; @@ -981,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode) if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) goto bail; - dquot_initialize(inode); - if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -991,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + /* We want to block signals in delete_inode as the lock and * messaging paths may return us -ERESTARTSYS. 
Which would * cause us to exit early, resulting in inodes being orphaned -- cgit v1.2.3 From 9f985cb6c45bc3f8b7e161c9658d409d051d576f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:55 -0700 Subject: quota: provide function to grab quota structure reference Provide dqgrab() function to get quota structure reference when we are sure it already has at least one active reference. Make use of this function inside quota code. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/quota/dquot.c | 4 ++-- include/linux/quotaops.h | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index cfc8dcc16043..9cd5f63715c0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -528,7 +528,7 @@ restart: if (atomic_read(&dquot->dq_count)) { DEFINE_WAIT(wait); - atomic_inc(&dquot->dq_count); + dqgrab(dquot); prepare_to_wait(&dquot->dq_wait_unused, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&dq_list_lock); @@ -632,7 +632,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type) /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ - atomic_inc(&dquot->dq_count); + dqgrab(dquot); spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); err = sb->dq_op->write_dquot(dquot); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 6965fe394c3b..1d3eee594cd6 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -46,6 +46,14 @@ void inode_reclaim_rsv_space(struct inode *inode, qsize_t number); void dquot_initialize(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); +static inline struct dquot *dqgrab(struct dquot *dquot) +{ + /* Make sure someone else has active reference to dquot */ + WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); + WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); + atomic_inc(&dquot->dq_count); + return dquot; +} void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), -- cgit v1.2.3 From e3a767b60fd8a9f5e133f42f4970cff77ec43173 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:56 -0700 Subject: ocfs2: implement delayed dropping of last dquot reference We cannot drop last dquot reference from downconvert thread as that creates the following deadlock: NODE 1 NODE2 holds dentry lock for 'foo' holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE dquot_initialize(bar) ocfs2_dquot_acquire() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) ... downconvert thread (triggered from another node or a different process from NODE2) ocfs2_dentry_post_unlock() ... iput(foo) ocfs2_evict_inode(foo) ocfs2_clear_inode(foo) dquot_drop(inode) ... ocfs2_dquot_release() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) - blocks finds we need more space in quota file ... ocfs2_extend_no_holes() ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) - deadlocks waiting for downconvert thread We solve the problem by postponing dropping of the last dquot reference to a workqueue if it happens from the downconvert thread. 
Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2.h | 5 +++++ fs/ocfs2/quota.h | 2 ++ fs/ocfs2/quota_global.c | 35 +++++++++++++++++++++++++++++++++++ fs/ocfs2/super.c | 8 ++++++++ 4 files changed, 50 insertions(+) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 553f53cc73ae..64c02239ba46 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -419,6 +420,10 @@ struct ocfs2_super struct ocfs2_dentry_lock *dentry_lock_list; struct work_struct dentry_lock_work; + /* List of dquot structures to drop last reference to */ + struct llist_head dquot_drop_list; + struct work_struct dquot_drop_work; + wait_queue_head_t osb_mount_event; /* Truncate log info */ diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d5ab56cbe5c5..f266d67df3c6 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -28,6 +28,7 @@ struct ocfs2_dquot { unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ s64 dq_origspace; /* Last globally synced space usage */ s64 dq_originodes; /* Last globally synced inode usage */ + struct llist_node list; /* Member of list of dquots to drop */ }; /* Description of one chunk to recover in memory */ @@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, int ocfs2_create_local_dquot(struct dquot *dquot); int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index d7b5108789e2..b990a62cff50 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) OCFS2_INODE_UPDATE_CREDITS; } +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dquot_drop_work); + struct llist_node *list; + struct ocfs2_dquot *odquot, *next_odquot; + + list = llist_del_all(&osb->dquot_drop_list); + llist_for_each_entry_safe(odquot, next_odquot, list, list) { + /* Drop the reference we acquired in ocfs2_dquot_release() */ + dqput(&odquot->dq_dquot); + } +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). + */ static int ocfs2_release_dquot(struct dquot *dquot) { handle_t *handle; @@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot) /* Check whether we are not racing with some other dqget() */ if (atomic_read(&dquot->dq_count) > 1) goto out; + /* Running from downconvert thread? Postpone quota processing to wq */ + if (current == osb->dc_task) { + /* + * Grab our own reference to dquot and queue it for delayed + * dropping. Quota code rechecks after calling + * ->release_dquot() and won't free dquot structure. 
+ */ + dqgrab(dquot); + /* First entry on list -> queue work */ + if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) + queue_work(ocfs2_wq, &osb->dquot_drop_work); + goto out; + } status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 9fef73da1ca5..b800a1f78d78 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1941,6 +1941,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_disable_quotas(osb); + /* All dquots should be freed by now */ + WARN_ON(!llist_empty(&osb->dquot_drop_list)); + /* Wait for worker to be done with the work structure in osb */ + cancel_work_sync(&osb->dquot_drop_work); + ocfs2_shutdown_local_alloc(osb); /* This will disable recovery and flush any recovery work. */ @@ -2276,6 +2281,9 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); osb->dentry_lock_list = NULL; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); + init_llist_head(&osb->dquot_drop_list); + /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); -- cgit v1.2.3 From 84d86f83f9d0e8431a3c9eae4c47e9d7ff49a411 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:57 -0700 Subject: ocfs2: avoid blocking in ocfs2_mark_lockres_freeing() in downconvert thread If we are dropping last inode reference from downconvert thread, we will end up calling ocfs2_mark_lockres_freeing() which can block if the lock we are freeing is queued thus creating an A-A deadlock. Luckily, since we are the downconvert thread, we can immediately dequeue the lock and thus avoid waiting in this case. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 44 +++++++++++++++++++++++++++++++++++++++++--- fs/ocfs2/dlmglue.h | 3 ++- fs/ocfs2/inode.c | 7 ++++--- 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 19986959d149..6bd690b5a061 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3144,22 +3144,60 @@ out: return 0; } +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + /* Mark the lockres as being dropped. It will no longer be * queued if blocking, but we still may have to wait on it * being dequeued from the downconvert thread before we can consider * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { int status; struct ocfs2_mask_waiter mw; - unsigned long flags; + unsigned long flags, flags2; ocfs2_init_mask_waiter(&mw); spin_lock_irqsave(&lockres->l_lock, flags); lockres->l_flags |= OCFS2_LOCK_FREEING; + if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) { + /* + * We know the downconvert is queued but not in progress + * because we are the downconvert thread and processing + * different lock. So we can just remove the lock from the + * queue. This is not only an optimization but also a way + * to avoid the following deadlock: + * ocfs2_dentry_post_unlock() + * ocfs2_dentry_lock_put() + * ocfs2_drop_dentry_lock() + * iput() + * ocfs2_evict_inode() + * ocfs2_clear_inode() + * ocfs2_mark_lockres_freeing() + * ... 
blocks waiting for OCFS2_LOCK_QUEUED + * since we are the downconvert thread which + * should clear the flag. + */ + spin_unlock_irqrestore(&lockres->l_lock, flags); + spin_lock_irqsave(&osb->dc_task_lock, flags2); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock_irqrestore(&osb->dc_task_lock, flags2); + /* + * Warn if we recurse into another post_unlock call. Strictly + * speaking it isn't a problem but we need to be careful if + * that happens (stack overflow, deadlocks, ...) so warn if + * ocfs2 grows a path for which this can happen. + */ + WARN_ON_ONCE(lockres->l_ops->post_unlock); + /* Since the lock is freeing we don't do much in the fn below */ + ocfs2_process_blocked_lock(osb, lockres); + return; + } while (lockres->l_flags & OCFS2_LOCK_QUEUED) { lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -3180,7 +3218,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, { int ret; - ocfs2_mark_lockres_freeing(lockres); + ocfs2_mark_lockres_freeing(osb, lockres); ret = ocfs2_drop_lock(osb, lockres); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 1d596d8c4a4a..d293a22c32c5 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 809b5d57a6b8..d437f3ba90b0 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1080,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, @@ -1096,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode) /* Do these before all the other work so that we don't bounce * the downconvert thread while waiting to destroy the locks. */ - ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, &oi->ip_la_data_resv); -- cgit v1.2.3 From 8ed6b23709b346f7bfc1edab47003a205a6a9f69 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 3 Apr 2014 14:46:59 -0700 Subject: ocfs2: revert iput deferring code in ocfs2_drop_dentry_lock The following patches are reverted in this patch because these patches caused performance regression in the remote unlink() calls. ea455f8ab683 - ocfs2: Push out dropping of dentry lock to ocfs2_wq f7b1aa69be13 - ocfs2: Fix deadlock on umount 5fd131893793 - ocfs2: Don't oops in ocfs2_kill_sb on a failed mount Previous patches in this series removed the possible deadlocks from downconvert thread so the above patches shouldn't be needed anymore. The regression is caused because these patches delay the iput() in case of dentry unlocks. This also delays the unlocking of the open lockres. 
The open lockresource is required to test if the inode can be wiped from disk or not. When the deleting node does not get the open lock, it marks it as orphan (even though it is not in use by another node/process) and causes a journal checkpoint. This delays operations following the inode eviction. This also moves the inode to the orphaned inode which further causes more I/O and a lot of unneccessary orphans. The following script can be used to generate the load causing issues: declare -a create declare -a remove declare -a iterations=(1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384) unique="`mktemp -u XXXXX`" script="/tmp/idontknow-${unique}.sh" cat < "${script}" for n in {1..8}; do mkdir -p test/dir\${n} eval touch test/dir\${n}/foo{1.."\$1"} done EOF chmod 700 "${script}" function fcreate () { exec 2>&1 /usr/bin/time --format=%E "${script}" "$1" } function fremove () { exec 2>&1 /usr/bin/time --format=%E ssh node2 "cd `pwd`; rm -Rf test*" } function fcp () { exec 2>&1 /usr/bin/time --format=%E ssh node3 "cd `pwd`; cp -R test test.new" } echo ------------------------------------------------- echo "| # files | create #s | copy #s | remove #s |" echo ------------------------------------------------- for ((x=0; x < ${#iterations[*]} ; x++)) do create[$x]="`fcreate ${iterations[$x]}`" copy[$x]="`fcp ${iterations[$x]}`" remove[$x]="`fremove`" printf "| %8d | %9s | %9s | %9s |\n" ${iterations[$x]} ${create[$x]} ${copy[$x]} ${remove[$x]} done rm "${script}" echo "------------------------" Signed-off-by: Srinivas Eeda Signed-off-by: Goldwyn Rodrigues Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dcache.c | 61 +++---------------------------------------------------- fs/ocfs2/dcache.h | 12 +---------- fs/ocfs2/ocfs2.h | 28 ++++--------------------- fs/ocfs2/super.c | 30 +-------------------------- 4 files changed, 9 insertions(+), 122 deletions(-) diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 0d3a97d2d5f6..e2e05a106beb 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -37,7 +37,6 @@ #include "dlmglue.h" #include "file.h" #include "inode.h" -#include "super.h" #include "ocfs2_trace.h" void ocfs2_dentry_attach_gen(struct dentry *dentry) @@ -346,52 +345,6 @@ out_attach: return ret; } -DEFINE_SPINLOCK(dentry_list_lock); - -/* We limit the number of dentry locks to drop in one go. We have - * this limit so that we don't starve other users of ocfs2_wq. */ -#define DL_INODE_DROP_COUNT 64 - -/* Drop inode references from dentry locks */ -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) -{ - struct ocfs2_dentry_lock *dl; - - spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { - dl = osb->dentry_lock_list; - osb->dentry_lock_list = dl->dl_next; - spin_unlock(&dentry_list_lock); - iput(dl->dl_inode); - kfree(dl); - spin_lock(&dentry_list_lock); - } - spin_unlock(&dentry_list_lock); -} - -void ocfs2_drop_dl_inodes(struct work_struct *work) -{ - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); - - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); - /* - * Don't queue dropping if umount is in progress. 
We flush the - * list in ocfs2_dismount_volume - */ - spin_lock(&dentry_list_lock); - if (osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - spin_unlock(&dentry_list_lock); -} - -/* Flush the whole work queue */ -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) -{ - __ocfs2_drop_dl_inodes(osb, -1); -} - /* * ocfs2_dentry_iput() and friends. * @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { + iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - - /* We leave dropping of inode reference to ocfs2_wq as that can - * possibly lead to inode deletion which gets tricky */ - spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - dl->dl_next = osb->dentry_lock_list; - osb->dentry_lock_list = dl; - spin_unlock(&dentry_list_lock); + kfree(dl); } void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - int unlock; + int unlock = 0; BUG_ON(dl->dl_count == 0); diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index b79eff709958..55f58892b153 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -29,13 +29,8 @@ extern const struct dentry_operations ocfs2_dentry_ops; struct ocfs2_dentry_lock { - /* Use count of dentry lock */ unsigned int dl_count; - union { - /* Linked list of dentry locks to release */ - struct ocfs2_dentry_lock *dl_next; - u64 dl_parent_blkno; - }; + u64 dl_parent_blkno; /* * The ocfs2_dentry_lock keeps an inode reference until @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); -extern spinlock_t dentry_list_lock; - void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); -void ocfs2_drop_dl_inodes(struct work_struct *work); -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); - struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 64c02239ba46..a780e20d4fba 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -275,19 +275,16 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 - -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; -struct ocfs2_dentry_lock; struct ocfs2_super { struct task_struct *commit_task; @@ -415,11 +412,6 @@ struct ocfs2_super struct list_head blocked_lock_list; unsigned long blocked_lock_count; - /* List of dentry locks to release. 
Anyone can add locks to - * the list, ocfs2_wq processes the list */ - struct ocfs2_dentry_lock *dentry_lock_list; - struct work_struct dentry_lock_work; - /* List of dquot structures to drop last reference to */ struct llist_head dquot_drop_list; struct work_struct dquot_drop_work; @@ -584,18 +576,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } - -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, - unsigned long flag) -{ - unsigned long ret; - - spin_lock(&osb->osb_lock); - ret = osb->osb_flags & flag; - spin_unlock(&osb->osb_lock); - return ret; -} - static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b800a1f78d78..888a1457af96 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1241,30 +1241,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); } -static void ocfs2_kill_sb(struct super_block *sb) -{ - struct ocfs2_super *osb = OCFS2_SB(sb); - - /* Failed mount? */ - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) - goto out; - - /* Prevent further queueing of inode drop events */ - spin_lock(&dentry_list_lock); - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); - spin_unlock(&dentry_list_lock); - /* Wait for work to finish and/or remove it */ - cancel_work_sync(&osb->dentry_lock_work); -out: - kill_block_super(sb); -} - static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .mount = ocfs2_mount, - .kill_sb = ocfs2_kill_sb, - + .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1930,12 +1911,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); - /* - * Flush inode dropping work queue so that deletes are - * performed while the filesystem is still working - */ - ocfs2_drop_all_dl_inodes(osb); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -2278,9 +2253,6 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); - osb->dentry_lock_list = NULL; - INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); -- cgit v1.2.3 From 41b63efb68115eaf48197672d59005765884b078 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 3 Apr 2014 14:47:00 -0700 Subject: ocfs2: fix type conversion risk when get cluster attributes In o2nm_cluster, cl_idle_timeout_ms, cl_keepalive_delay_ms, as well as cl_reconnect_delay_ms, are defined as type of unsigned int. So we should also use unsigned int in the helper functions. 
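A standalone user-space demo of the class of bug the return type closes off (not ocfs2 code; configured values are normally far smaller, but an int return would misreport any value above INT_MAX on the usual 32-bit-int, two's-complement ABIs):

#include <stdio.h>

static unsigned int cl_idle_timeout_ms = 3000000000u;	/* fits in unsigned int */

static int idle_timeout_old(void)		/* old helpers returned int */
{
	return cl_idle_timeout_ms;
}

static unsigned int idle_timeout_new(void)	/* fixed return type */
{
	return cl_idle_timeout_ms;
}

int main(void)
{
	printf("int helper:          %d\n", idle_timeout_old());	/* prints a negative value */
	printf("unsigned int helper: %u\n", idle_timeout_new());	/* prints 3000000000 */
	return 0;
}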
Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2cd2406b4140..b4ab371d46d9 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc) #endif /* CONFIG_OCFS2_FS_STATS */ -static inline int o2net_reconnect_delay(void) +static inline unsigned int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; } -static inline int o2net_keepalive_delay(void) +static inline unsigned int o2net_keepalive_delay(void) { return o2nm_single_cluster->cl_keepalive_delay_ms; } -static inline int o2net_idle_timeout(void) +static inline unsigned int o2net_idle_timeout(void) { return o2nm_single_cluster->cl_idle_timeout_ms; } -- cgit v1.2.3 From c8d888d9f1469806c339218fc342817eb3b628a8 Mon Sep 17 00:00:00 2001 From: Jensen Date: Thu, 3 Apr 2014 14:47:01 -0700 Subject: ocfs2: llseek requires ocfs2 inode lock for the file in SEEK_END llseek requires ocfs2 inode lock for updating the file size in SEEK_END. because the file size maybe update on another node. This bug can be reproduce the following scenario: at first, we dd a test fileA, the file size is 10k. on NodeA: --------- 1) open the test fileA, lseek the end of file. and print the position. 2) close the test fileA on NodeB: 1) open the test fileA, append the 5k data to test FileA. 2) lseek the end of file. and print the position. 3) close file. At first we run the test program1 on NodeA , the result is 10k. And then run the test program2 on NodeB, the result is 15k. At last, we run the test program1 on NodeA again, the result is 10k. After applying this patch the three step result is 15k. test result: 1000000 times lseek call; index lseek with inode lock (unit:us) lseek without inode lock (unit:us) 1 1168162 555383 2 1168011 549504 3 1170538 549396 4 1170375 551685 5 1170444 556719 6 1174364 555307 7 1163294 551552 8 1170080 549350 9 1162464 553700 10 1165441 552594 avg 1168317 552519 avg with lock - avg without lock = 615798 (avg with lock - avg without lock)/1000000=0.615798 us Signed-off-by: Jensen Cc: Jie Liu Acked-by: Joel Becker Cc: Mark Fasheh Cc: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bd94d26b0b21..0f14f906dc65 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2630,7 +2630,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_SET: break; case SEEK_END: - offset += inode->i_size; + /* SEEK_END requires the OCFS2 inode lock for the file + * because it references the file's size. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + offset += i_size_read(inode); + ocfs2_inode_unlock(inode, 0); break; case SEEK_CUR: if (offset == 0) { -- cgit v1.2.3 From a35ad97cd48cfab45f4ee5eb0e6f5787a05cab25 Mon Sep 17 00:00:00 2001 From: Zhonghua Guo Date: Thu, 3 Apr 2014 14:47:02 -0700 Subject: ocfs2: fix deadlock risk when kmalloc failed in dlm_query_region_handler In dlm_query_region_handler(), once kmalloc failed, it will unlock dlm_domain_lock without lock first, then deadlock happens. 
Signed-off-by: Zhonghua Guo Signed-off-by: Joseph Qi Reviewed-by: Srinivas Eeda Tested-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 1307a8cff8db..c973690dc0bc 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1123,7 +1123,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, struct dlm_ctxt *dlm = NULL; char *local = NULL; int status = 0; - int locked = 0; qr = (struct dlm_query_region *) msg->buf; @@ -1132,10 +1131,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, /* buffer used in dlm_mast_regions() */ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); - if (!local) { - status = -ENOMEM; - goto bail; - } + if (!local) + return -ENOMEM; status = -EINVAL; @@ -1144,16 +1141,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, if (!dlm) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "before join domain\n", qr->qr_node, qr->qr_domain); - goto bail; + goto out_domain_lock; } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qr->qr_node) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "but joining node is %d\n", qr->qr_node, qr->qr_domain, dlm->joining_node); - goto bail; + goto out_dlm_lock; } /* Support for global heartbeat was added in 1.1 */ @@ -1163,14 +1159,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qr->qr_node, qr->qr_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto out_dlm_lock; } status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); -bail: - if (locked) - spin_unlock(&dlm->spinlock); +out_dlm_lock: + spin_unlock(&dlm->spinlock); + +out_domain_lock: spin_unlock(&dlm_domain_lock); kfree(local); -- cgit v1.2.3 From 3ed2be719eb9770139da791342b190b20a7a9270 Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Thu, 3 Apr 2014 14:47:03 -0700 Subject: ocfs2: allow for more than one data extent when creating xattr Orabug: 18108070 ocfs2_xattr_extend_allocation() hits panic when creating xattr during data extent alloc phase. The problem occurs if due to local alloc fragmentation, clusters are spread over multiple extents. In this case ocfs2_add_clusters_in_btree() finds no space to store more than one extent record and therefore fails returning RESTART_META. The situation is anticipated for xattr update case but not xattr create case. This fix simply ports that code to create case. 
Signed-off-by: Tariq Saeed Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 185fa3b7f962..4217fedb2c11 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3200,8 +3200,15 @@ meta_guess: clusters_add += 1; } } else { - meta_add += 1; credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + struct ocfs2_extent_list *el = &def_xv.xv.xr_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el); + } else { + meta_add += 1; + } } out: if (clusters_need) -- cgit v1.2.3 From 466e68c4306317e7d239e3100f612d403e3e2c3c Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Thu, 3 Apr 2014 14:47:04 -0700 Subject: ocfs2: __ocfs2_mknod_locked should return error when ocfs2_create_new_inode_locks() failed When ocfs2_create_new_inode_locks() return error, inode open lock may not be obtainted for this inode. So other nodes can remove this file and free dinode when inode still remain in memory on this node, which is not correct and may trigger BUG. So __ocfs2_mknod_locked should return error when ocfs2_create_new_inode_locks() failed. Node_1 Node_2 create fileA, call ocfs2_mknod() -> ocfs2_get_init_inode(), allocate inodeA -> ocfs2_claim_new_inode(), claim dinode(dinodeA) -> call ocfs2_create_new_inode_locks(), create open lock failed, return error -> __ocfs2_mknod_locked return success unlink fileA try open lock succeed, and free dinodeA create another file, call ocfs2_mknod() -> ocfs2_get_init_inode(), allocate inodeB -> ocfs2_claim_new_inode(), as Node_2 had freed dinodeA, so claim dinodeA and update generation for dinodeA call __ocfs2_drop_dl_inodes()->ocfs2_delete_inode() to free inodeA, and finally triggers BUG on(inode->i_generation != le32_to_cpu(fe->i_generation)) in function ocfs2_inode_lock_update(). Signed-off-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e61e4c9a077c..931fd50a971f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -580,9 +580,6 @@ static int __ocfs2_mknod_locked(struct inode *dir, oi->i_sync_tid = handle->h_transaction->t_tid; oi->i_datasync_tid = handle->h_transaction->t_tid; - status = 0; /* error in ocfs2_create_new_inode_locks is not - * critical */ - leave: if (status < 0) { if (*new_fe_bh) { -- cgit v1.2.3 From f7cf4f5bfe073ad792ab49c04f247626b3e38db6 Mon Sep 17 00:00:00 2001 From: alex chen Date: Thu, 3 Apr 2014 14:47:05 -0700 Subject: ocfs2: do not put bh when buffer_uptodate failed Do not put bh when buffer_uptodate failed in ocfs2_write_block and ocfs2_write_super_or_backup, because it will put bh in b_end_io. Otherwise it will hit a warning "VFS: brelse: Trying to free free buffer". 
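The rule behind the fix is reference ownership: once the buffer has been handed to the I/O layer, the completion handler owns dropping that reference, so the submitter's error path must not drop it again. A self-contained userspace analogy (this is not the buffer_head API, just the same ownership shape):

    #include <assert.h>
    #include <stdlib.h>

    struct buf {
            int refcount;
            int uptodate;
    };

    static void buf_put(struct buf *b)
    {
            assert(b->refcount > 0);        /* "freeing a free buffer" */
            if (--b->refcount == 0)
                    free(b);
    }

    /* Completion callback: it owns the reference taken at submit time. */
    static void end_io(struct buf *b, int ok)
    {
            b->uptodate = ok;
            buf_put(b);
    }

    static int write_block(struct buf *b)
    {
            b->refcount++;          /* reference handed to the I/O path */
            end_io(b, 0);           /* pretend the write failed */

            if (!b->uptodate) {
                    /* Must NOT buf_put() here: end_io() already dropped
                     * the submit reference.  A second put is the
                     * double-release the patch removes. */
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            struct buf *b = calloc(1, sizeof(*b));

            if (!b)
                    return 1;
            b->refcount = 1;
            write_block(b);
            buf_put(b);             /* caller drops its own reference */
            return 0;
    }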
Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Reviewed-by: Srinivas Eeda Cc: Mark Fasheh Acked-by: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/buffer_head_io.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 5b704c63a103..1edcb141f639 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, * information for this bh as it's not marked locally * uptodate. */ ret = -EIO; - put_bh(bh); mlog_errno(ret); } @@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, if (!buffer_uptodate(bh)) { ret = -EIO; - put_bh(bh); mlog_errno(ret); } -- cgit v1.2.3 From f81c20158f8d5f7938d5eb86ecc42ecc09273ce6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 3 Apr 2014 14:47:07 -0700 Subject: ocfs2: fix panic on kfree(xattr->name) Commit 9548906b2bb7 ('xattr: Constify ->name member of "struct xattr"') missed that ocfs2 is calling kfree(xattr->name). As a result, kernel panic occurs upon calling kfree(xattr->name) because xattr->name refers static constant names. This patch removes kfree(xattr->name) from ocfs2_mknod() and ocfs2_symlink(). Signed-off-by: Tetsuo Handa Reported-by: Tariq Saeed Tested-by: Tariq Saeed Reviewed-by: Srinivas Eeda Cc: Joel Becker Cc: Mark Fasheh Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 931fd50a971f..4a797f22239d 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -450,7 +450,6 @@ leave: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); @@ -1856,7 +1855,6 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); if (inode_ac) -- cgit v1.2.3 From 6fdb702d6262b18b1b41a35f1f81903b0a2bc2c9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Apr 2014 14:47:08 -0700 Subject: ocfs2: call ocfs2_update_inode_fsync_trans when updating any inode Ensure that ocfs2_update_inode_fsync_trans() is called any time we touch an inode in a given transaction. This is a follow-on to the previous patch to reduce lock contention and deadlocking during an fsync operation. Signed-off-by: Darrick J. 
Wong Cc: Mark Fasheh Cc: Joel Becker Cc: Wengang Cc: Greg Marsden Cc: Srinivas Eeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 1 + fs/ocfs2/alloc.c | 2 ++ fs/ocfs2/dir.c | 2 ++ fs/ocfs2/file.c | 7 +++++++ fs/ocfs2/move_extents.c | 2 ++ fs/ocfs2/namei.c | 1 + fs/ocfs2/suballoc.c | 3 ++- fs/ocfs2/xattr.c | 3 +++ 8 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 555f4cddefe3..7e8282dcea2a 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -205,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, di->i_mode = cpu_to_le16(inode->i_mode); di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 6b97d68e34d3..b4deb5f750d9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5728,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode, } ocfs2_et_update_clusters(et, -len); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, et->et_root_bh); @@ -7209,6 +7210,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); out_commit: diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8b48e9b7ad0e..0717662b4aef 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3006,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, dir, 1); /* * This should never fail as our extent list is empty and all @@ -4405,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); spin_unlock(&OCFS2_I(dir)->ip_lock); di->i_dx_root = cpu_to_le64(0ULL); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0f14f906dc65..ff33c5ef87f2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -286,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode, inode->i_atime = CURRENT_TIME; di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); out_commit: @@ -335,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode, if (ret < 0) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_commit_trans(osb, handle); out: return ret; @@ -429,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, di->i_size = cpu_to_le64(new_i_size); di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -737,6 +740,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 1); out: if 
(ret) { @@ -834,6 +838,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); di->i_mtime_nsec = di->i_ctime_nsec; ocfs2_journal_dirty(handle, di_bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); } @@ -1338,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, di = (struct ocfs2_dinode *) bh->b_data; di->i_mode = cpu_to_le16(inode->i_mode); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); @@ -1570,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, if (ret) mlog_errno(ret); } + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(osb, handle); out: diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 64c304d668f0..3ca939552d9c 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle, old_blkno, len); } + ocfs2_update_inode_fsync_trans(handle, inode, 0); out: ocfs2_free_path(path); return ret; @@ -957,6 +958,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) inode->i_ctime = CURRENT_TIME; di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4a797f22239d..2060fc398445 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2480,6 +2480,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di->i_orphaned_slot = 0; set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 47ae2663a6f5..482d6c2a3ea1 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -771,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); status = 0; @@ -2091,7 +2092,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir, ac->ac_find_loc_priv = res; *fe_blkno = res->sr_blkno; - + ocfs2_update_inode_fsync_trans(handle, dir, 0); out: if (handle) ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4217fedb2c11..14b8c46b4fbb 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2602,6 +2602,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); out_commit: @@ -3621,6 +3622,7 @@ int ocfs2_xattr_set(struct inode *inode, } ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0); ocfs2_commit_trans(osb, ctxt.handle); @@ -5483,6 +5485,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, ret = ocfs2_truncate_log_append(osb, handle, blkno, len); if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); out_commit: ocfs2_commit_trans(osb, handle); -- cgit v1.2.3 
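The helper's purpose can be sketched in plain C (field and function names here are stand-ins, not the ocfs2/jbd2 interfaces): every transaction that dirties the inode records its tid on the inode, and fsync later forces a journal commit only if that tid has not been committed yet.

    #include <stdbool.h>

    struct sketch_inode {
            unsigned long i_sync_tid;       /* last tid touching metadata */
            unsigned long i_datasync_tid;   /* last tid touching data */
    };

    /* Called from every transaction that modifies the inode. */
    static void update_inode_fsync_trans(struct sketch_inode *inode,
                                         unsigned long handle_tid,
                                         bool data_too)
    {
            inode->i_sync_tid = handle_tid;
            if (data_too)
                    inode->i_datasync_tid = handle_tid;
    }

    /* fsync side: wait on the journal only when the inode was dirtied by
     * a transaction newer than what is already on disk. */
    static bool fsync_needs_commit(const struct sketch_inode *inode,
                                   unsigned long committed_tid, bool datasync)
    {
            unsigned long tid = datasync ? inode->i_datasync_tid
                                         : inode->i_sync_tid;

            return tid > committed_tid;
    }

The point of touching every site that dirties an inode within a transaction is to keep this tid current; a stale tid could let fsync skip a commit it should have waited for.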
From e228f6439862359f9b26f9d62634e4fce180b54a Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 3 Apr 2014 14:47:09 -0700 Subject: ocfs2: flock: drop cross-node lock when failed locally ocfs2_do_flock() calls ocfs2_file_lock() to get the cross-node clock and then call flock_lock_file_wait() to compete with local processes. In case flock_lock_file_wait() failed, say -ENOMEM, clean up work is not done. This patch adds the cleanup --drop the cross-node lock which was just granted. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Wengang Wang Cc: Joel Becker Reviewed-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/locks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index e57c804069ea..6b6d092b0998 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, } ret = flock_lock_file_wait(file, fl); + if (ret) + ocfs2_file_unlock(file); out: mutex_unlock(&fp->fp_mutex); -- cgit v1.2.3 From db66c71577d525c0cd65e66ff675747565783ba4 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Thu, 3 Apr 2014 14:47:10 -0700 Subject: ocfs2: rollback alloc_dinode counts when ocfs2_block_group_set_bits() failed After updating alloc_dinode counts in ocfs2_alloc_dinode_update_counts(), if ocfs2_alloc_dinode_update_bitmap() failed, there is a rare case that some space may be lost. So, roll back alloc_dinode counts when ocfs2_block_group_set_bits() failed. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Younger Liu Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/move_extents.c | 5 ++++- fs/ocfs2/suballoc.c | 25 ++++++++++++++++++++++++- fs/ocfs2/suballoc.h | 4 ++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 3ca939552d9c..599eb4c4c8be 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -691,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, goal_bit, len); - if (ret) + if (ret) { + ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } /* * Here we should write the new page out first if we are diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 482d6c2a3ea1..94fb1f3d9e62 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1608,6 +1608,21 @@ out: return ret; } +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl; + + cl = (struct ocfs2_chain_list *)&di->id2.i_chain; + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); + le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); +} + static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, struct ocfs2_extent_rec *rec, struct ocfs2_chain_list *cl) @@ -1708,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, res->sr_bit_offset, res->sr_bits); - if (ret < 0) + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } out_loc_only: 
*bits_left = le16_to_cpu(gd->bg_free_bits_count); @@ -1839,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, res->sr_bit_offset, res->sr_bits); if (status < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(status); goto bail; } @@ -2150,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, res->sr_bit_offset, res->sr_bits); if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(ret); goto out; } diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 218d8036b3e7..2d2501767c0c 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -91,6 +91,10 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain); +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, -- cgit v1.2.3 From da8ded405de74de4b189d1128f4c102d06328c29 Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Thu, 3 Apr 2014 14:47:11 -0700 Subject: ocfs2/o2net: o2net_listen_data_ready should do nothing if socket state is not TCP_LISTEN Orabug: 17330860 When accepting an incomming connection o2net_accept_one clones a child data socket from the parent listening socket. It then proceeds to setup the child with callback o2net_data_ready() and sk_user_data to NULL. If data arrives in this window, o2net_listen_data_ready will be called with some non-deterministic value in sk_user_data (not inherited). We panic when we page fault on sk_user_data -- in parent it is sock_def_readable(). The fix is to recognize that this is a data socket being set up by looking at the socket state and do nothing. Signed-off-by: Tariq Saseed Signed-off-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index b4ab371d46d9..eb649d23a4de 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1964,18 +1964,30 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) goto out; } - /* ->sk_data_ready is also called for a newly established child socket - * before it has been accepted and the acceptor has set up their - * data_ready.. we only want to queue listen work for our listening - * socket */ + /* This callback may called twice when a new connection + * is being established as a child socket inherits everything + * from a parent LISTEN socket, including the data_ready cb of + * the parent. This leads to a hazard. In o2net_accept_one() + * we are still initializing the child socket but have not + * changed the inherited data_ready callback yet when + * data starts arriving. + * We avoid this hazard by checking the state. + * For the listening socket, the state will be TCP_LISTEN; for the new + * socket, will be TCP_ESTABLISHED. Also, in this case, + * sk->sk_user_data is not a valid function pointer. 
+ */ + if (sk->sk_state == TCP_LISTEN) { mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); + } else { + ready = NULL; } out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + if (ready != NULL) + ready(sk, bytes); } static int o2net_open_listening_sock(__be32 addr, __be16 port) -- cgit v1.2.3 From 7dc3e83901b342ea7fe36262329c3784f2937361 Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Thu, 3 Apr 2014 14:47:12 -0700 Subject: ocfs2: iput inode alloc when failed locally In ocfs2_info_handle_freeinode() and ocfs2_test_inode_bit() func, after calls ocfs2_get_system_file_inode() to get inode ref, if calls ocfs2_info_scan_inode_alloc() or ocfs2_inode_lock() failed, we should iput inode alloc to avoid leaking the inode. Signed-off-by: jiangyiwen Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 5 +++-- fs/ocfs2/suballoc.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 8ca3c29accbf..490229f43731 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -413,11 +413,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode, } status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); - if (status < 0) - goto bail; iput(inode_alloc); inode_alloc = NULL; + + if (status < 0) + goto bail; } o2info_set_request_filled(&oifi->ifi_req); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 94fb1f3d9e62..0cb889a17ae1 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2894,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { mutex_unlock(&inode_alloc_inode->i_mutex); + iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); goto bail; -- cgit v1.2.3 From 43b10a20372d9a1c08391f33f1c8bd86179ddc5f Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Thu, 3 Apr 2014 14:47:13 -0700 Subject: ocfs2: avoid system inode ref confusion by adding mutex lock The following case may lead to the same system inode ref in confusion. A thread B thread ocfs2_get_system_file_inode ->get_local_system_inode ->_ocfs2_get_system_file_inode because of *arr == NULL, ocfs2_get_system_file_inode ->get_local_system_inode ->_ocfs2_get_system_file_inode gets first ref thru _ocfs2_get_system_file_inode, gets second ref thru igrab and set *arr = inode at the moment, B thread also gets two refs, so lead to one more inode ref. So add mutex lock to avoid multi thread set two inode ref once at the same time. Signed-off-by: jiangyiwen Reviewed-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2.h | 2 ++ fs/ocfs2/super.c | 2 ++ fs/ocfs2/sysfile.c | 3 +++ 3 files changed, 7 insertions(+) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a780e20d4fba..8d64a97a9d5e 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -446,6 +446,8 @@ struct ocfs2_super /* rb tree root for refcount lock. 
*/ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; + + struct mutex system_file_mutex; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 888a1457af96..1aecd626e645 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2100,6 +2100,8 @@ static int ocfs2_initialize_super(struct super_block *sb, spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_steal_slots(osb); + mutex_init(&osb->system_file_mutex); + atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); atomic_set(&osb->alloc_stats.bitmap_data, 0); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index f053688d22a3..af155c183123 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, } else arr = get_local_system_inode(osb, type, slot); + mutex_lock(&osb->system_file_mutex); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); + mutex_unlock(&osb->system_file_mutex); BUG_ON(!inode); return inode; @@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, *arr = igrab(inode); BUG_ON(!*arr); } + mutex_unlock(&osb->system_file_mutex); return inode; } -- cgit v1.2.3 From 9c339255cb01806efbaec3d296f3e44c9357d574 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 3 Apr 2014 14:47:15 -0700 Subject: ocfs2: pass "new" parameter to ocfs2_init_xattr_bucket This patch fixes the following crash: kernel BUG at fs/ocfs2/uptodate.c:530! Modules linked in: ocfs2(F) ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs bridge xen_pciback xen_netback xen_blkback xen_gntalloc xen_gntdev xen_evtchn xenfs xen_privcmd sunrpc 8021q garp stp llc bonding be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi iTCO_wdt iTCO_vendor_support dcdbas coretemp freq_table mperf microcode pcspkr serio_raw bnx2 lpc_ich mfd_core i5k_amb i5000_edac edac_core e1000e sg shpchp ext4(F) jbd2(F) mbcache(F) dm_round_robin(F) sr_mod(F) cdrom(F) usb_storage(F) sd_mod(F) crc_t10dif(F) pata_acpi(F) ata_generic(F) ata_piix(F) mptsas(F) mptscsih(F) mptbase(F) scsi_transport_sas(F) radeon(F) ttm(F) drm_kms_helper(F) drm(F) hwmon(F) i2c_algo_bit(F) i2c_core(F) dm_multipath(F) dm_mirror(F) dm_region_hash(F) dm_log(F) dm_mod(F) CPU 5 Pid: 21303, comm: xattr-test Tainted: GF W 3.8.13-30.el6uek.x86_64 #2 Dell Inc. 
PowerEdge 1950/0M788G RIP: ocfs2_set_new_buffer_uptodate+0x51/0x60 [ocfs2] Process xattr-test (pid: 21303, threadinfo ffff880017aca000, task ffff880016a2c480) Call Trace: ocfs2_init_xattr_bucket+0x8a/0x120 [ocfs2] ocfs2_cp_xattr_bucket+0xbb/0x1b0 [ocfs2] ocfs2_extend_xattr_bucket+0x20a/0x2f0 [ocfs2] ocfs2_add_new_xattr_bucket+0x23e/0x4b0 [ocfs2] ocfs2_xattr_set_entry_index_block+0x13c/0x3d0 [ocfs2] ocfs2_xattr_block_set+0xf9/0x220 [ocfs2] __ocfs2_xattr_set_handle+0x118/0x710 [ocfs2] ocfs2_xattr_set+0x691/0x880 [ocfs2] ocfs2_xattr_user_set+0x46/0x50 [ocfs2] generic_setxattr+0x96/0xa0 __vfs_setxattr_noperm+0x7b/0x170 vfs_setxattr+0xbc/0xc0 setxattr+0xde/0x230 sys_fsetxattr+0xc6/0xf0 system_call_fastpath+0x16/0x1b Code: 41 80 0c 24 01 48 89 df e8 7d f0 ff ff 4c 89 e6 48 89 df e8 a2 fe ff ff 48 89 df e8 3a f0 ff ff 48 8b 1c 24 4c 8b 64 24 08 c9 c3 <0f> 0b eb fe 90 90 90 90 90 90 90 90 90 90 90 55 48 89 e5 66 66 RIP ocfs2_set_new_buffer_uptodate+0x51/0x60 [ocfs2] It hit the BUG_ON() in ocfs2_set_new_buffer_uptodate(): void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh) { /* This should definitely *not* exist in our cache */ if (ocfs2_buffer_cached(ci, bh)) printk(KERN_ERR "bh->b_blocknr: %lu @ %p\n", bh->b_blocknr, bh); BUG_ON(ocfs2_buffer_cached(ci, bh)); set_buffer_uptodate(bh); ocfs2_metadata_cache_io_lock(ci); ocfs2_set_buffer_uptodate(ci, bh); ocfs2_metadata_cache_io_unlock(ci); } The problem here is: We cached a block, but the buffer_head got reused. When we are to pick up this block again, a new buffer_head created with UPTODATE flag cleared. ocfs2_buffer_uptodate() returned false since no UPTODATE is set on the buffer_head. so we set this block to cache as a NEW block, then it failed at asserting block is not in cache. The fix is to add a new parameter indicating the bucket is a new allocated or not to ocfs2_init_xattr_bucket(). ocfs2_init_xattr_bucket() assert block not cached accordingly. Signed-off-by: Wengang Wang Cc: Joel Becker Reviewed-by: Mark Fasheh Cc: Joe Jin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 14b8c46b4fbb..016f01df3825 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) * them fully. */ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, - u64 xb_blkno) + u64 xb_blkno, int new) { int i, rc = 0; @@ -383,9 +383,16 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, } if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i])) - ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i]); + bucket->bu_bhs[i])) { + if (new) + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + else { + set_buffer_uptodate(bucket->bu_bhs[i]); + ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + } + } } if (rc) @@ -4303,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); - ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1); if (ret) { mlog_errno(ret); goto out; @@ -4647,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, * Even if !new_bucket_head, we're overwriting t_bucket. 
Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head); if (ret) { mlog_errno(ret); goto out; @@ -4813,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, * Even if !t_is_new, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new); if (ret) goto out; @@ -6840,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle, break; } - ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1); if (ret) { mlog_errno(ret); break; -- cgit v1.2.3 From 2b665e276c15ba7d9fc8cdd16931883a51ed13e4 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 3 Apr 2014 14:47:16 -0700 Subject: fs/direct-io.c: remove redundant comparison The return value of bio_get_nr_vecs() cannot be bigger than BIO_MAX_PAGES, so we can remove redundant the comparison between nr_pages and BIO_MAX_PAGES. Signed-off-by: Gu Zheng Cc: Al Viro Reviewed-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 160a5489a939..6e6bff375244 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -664,7 +664,6 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, goto out; sector = start_sector << (sdio->blkbits - 9); nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); - nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; -- cgit v1.2.3 From 45d4f855046631d63dab8832ba8a8369ed8e04bd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 3 Apr 2014 14:47:17 -0700 Subject: fs/direct-io.c: remove some left over checks We know that "ret > 0" is true here. These tests were left over from commit 02afc27faec9 ('direct-io: Handle O_(D)SYNC AIO') and aren't needed any more. 
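A minimal standalone illustration of why the dropped test was dead code (sync_it() is a made-up stand-in for generic_write_sync(), and -5 is simply a stand-in error value):

    #include <stdio.h>

    static long sync_it(int fail)
    {
            return fail ? -5 : 0;
    }

    static long write_then_sync(long ret, int need_sync, int sync_fails)
    {
            /* This branch is only entered when ret > 0 ... */
            if (ret > 0 && need_sync) {
                    long err = sync_it(sync_fails);

                    /* ... so "err < 0 && ret > 0" tests ret > 0 twice. */
                    if (err < 0)
                            ret = err;
            }
            return ret;
    }

    int main(void)
    {
            printf("%ld\n", write_then_sync(4096, 1, 0));   /* 4096 */
            printf("%ld\n", write_then_sync(4096, 1, 1));   /* -5 */
            return 0;
    }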
Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 2 +- fs/btrfs/file.c | 2 +- fs/ext4/file.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 1e86823a9cbd..e4cba21a627e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1523,7 +1523,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, ssize_t err; err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + if (err < 0) ret = err; } blk_finish_plug(&plug); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0165b8672f09..7331a230e30b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1797,7 +1797,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0) { err = generic_write_sync(file, pos, num_written); - if (err < 0 && num_written > 0) + if (err < 0) num_written = err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1a5073959f32..6db7f7db7777 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -153,7 +153,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, ssize_t err; err = generic_write_sync(file, iocb->ki_pos - ret, ret); - if (err < 0 && ret > 0) + if (err < 0) ret = err; } blk_finish_plug(&plug); -- cgit v1.2.3 From 62572e29bc530b38921ef6059088b4788a9832a5 Mon Sep 17 00:00:00 2001 From: Ben Zhang Date: Thu, 3 Apr 2014 14:47:18 -0700 Subject: kernel/watchdog.c: touch_nmi_watchdog should only touch local cpu not every one I ran into a scenario where while one cpu was stuck and should have panic'd because of the NMI watchdog, it didn't. The reason was another cpu was spewing stack dumps on to the console. Upon investigation, I noticed that when writing to the console and also when dumping the stack, the watchdog is touched. This causes all the cpus to reset their NMI watchdog flags and the 'stuck' cpu just spins forever. This change causes the semantics of touch_nmi_watchdog to be changed slightly. Previously, I accidentally changed the semantics and we noticed there was a codepath in which touch_nmi_watchdog could be touched from a preemtible area. That caused a BUG() to happen when CONFIG_DEBUG_PREEMPT was enabled. I believe it was the acpi code. My attempt here re-introduces the change to have the touch_nmi_watchdog() code only touch the local cpu instead of all of the cpus. But instead of using __get_cpu_var(), I use the __raw_get_cpu_var() version. This avoids the preemption problem. However my reasoning wasn't because I was trying to be lazy. Instead I rationalized it as, well if preemption is enabled then interrupts should be enabled to and the NMI watchdog will have no reason to trigger. So it won't matter if the wrong cpu is touched because the percpu interrupt counters the NMI watchdog uses should still be incrementing. Don said: : I'm ok with this patch, though it does alter the behaviour of how : touch_nmi_watchdog works. For the most part I don't think most callers : need to touch all of the watchdogs (on each cpu). Perhaps a corner case : will pop up (the scheduler?? to mimic touch_all_softlockup_watchdogs() ). : : But this does address an issue where if a system is locked up and one cpu : is spewing out useful debug messages (or error messages), the hard lockup : will fail to go off. We have seen this on RHEL also. 
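A userspace sketch of the behavioural change, with _Thread_local standing in for a per-cpu variable (an analogy only, not the kernel per-cpu API): the touch sets only the caller's own flag, so one CPU spewing to the console can no longer keep resetting every other CPU's hard-lockup detector.

    #include <stdbool.h>

    /* One flag per CPU; _Thread_local approximates a per-cpu variable here. */
    static _Thread_local bool nmi_touch;

    static void touch_nmi_watchdog_local(void)
    {
            /* Raw access, no preemption check: if we race onto the
             * "wrong" CPU it means preemption, and therefore interrupts,
             * were enabled, so the hard lockup detector had nothing to
             * report on that CPU anyway. */
            nmi_touch = true;
    }

    /* NMI side, per CPU: consume the flag and skip one detection pass. */
    static bool watchdog_should_skip(void)
    {
            if (nmi_touch) {
                    nmi_touch = false;
                    return true;
            }
            return false;
    }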
Signed-off-by: Don Zickus Signed-off-by: Ben Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 01c6f979486f..e90089fd78e0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -158,14 +158,14 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - if (watchdog_user_enabled) { - unsigned cpu; - - for_each_present_cpu(cpu) { - if (per_cpu(watchdog_nmi_touch, cpu) != true) - per_cpu(watchdog_nmi_touch, cpu) = true; - } - } + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + __raw_get_cpu_var(watchdog_nmi_touch) = true; touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); -- cgit v1.2.3 From 99120b772b52853f9a2b829a21dd44d9b20558f1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 3 Apr 2014 14:47:19 -0700 Subject: mm: vmscan: respect NUMA policy mask when shrinking slab on direct reclaim When direct reclaim is executed by a process bound to a set of NUMA nodes, we should scan only those nodes when possible, but currently we will scan kmem from all online nodes even if the kmem shrinker is NUMA aware. That said, binding a process to a particular NUMA node won't prevent it from shrinking inode/dentry caches from other nodes, which is not good. Fix this. Signed-off-by: Vladimir Davydov Cc: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Dave Chinner Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a9c74b409681..4acee0fe65e7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2425,8 +2425,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long lru_pages = 0; nodes_clear(shrink->nodes_to_scan); - for_each_zone_zonelist(zone, z, zonelist, - gfp_zone(sc->gfp_mask)) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; -- cgit v1.2.3 From 65ec02cb9ad440957777a2da6058956a52806460 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 3 Apr 2014 14:47:20 -0700 Subject: mm: vmscan: move call to shrink_slab() to shrink_zones() This reduces the indentation level of do_try_to_free_pages() and removes extra loop over all eligible zones counting the number of on-LRU pages. Signed-off-by: Vladimir Davydov Reviewed-by: Glauber Costa Cc: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 56 +++++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4acee0fe65e7..73d6fa747443 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2291,13 +2291,16 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * the caller that it should consider retrying the allocation instead of * further reclaim. 
*/ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc, + struct shrink_control *shrink) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long lru_pages = 0; bool aborted_reclaim = false; + struct reclaim_state *reclaim_state = current->reclaim_state; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2307,6 +2310,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; + nodes_clear(shrink->nodes_to_scan); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) @@ -2318,6 +2323,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; + + lru_pages += zone_reclaimable_pages(zone); + node_set(zone_to_nid(zone), shrink->nodes_to_scan); + if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ @@ -2354,6 +2363,20 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) shrink_zone(zone, sc); } + /* + * Don't shrink slabs when reclaiming memory from over limit cgroups + * but do shrink slab at least once when aborting reclaim for + * compaction to avoid unevenly scanning file/anon LRU pages over slab + * pages. + */ + if (global_reclaim(sc)) { + shrink_slab(shrink, sc->nr_scanned, lru_pages); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + return aborted_reclaim; } @@ -2398,9 +2421,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct shrink_control *shrink) { unsigned long total_scanned = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct zoneref *z; - struct zone *zone; unsigned long writeback_threshold; bool aborted_reclaim; @@ -2413,34 +2433,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); - - /* - * Don't shrink slabs when reclaiming memory from over limit - * cgroups but do shrink slab at least once when aborting - * reclaim for compaction to avoid unevenly scanning file/anon - * LRU pages over slab pages. 
- */ - if (global_reclaim(sc)) { - unsigned long lru_pages = 0; - - nodes_clear(shrink->nodes_to_scan); - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_reclaimable_pages(zone); - node_set(zone_to_nid(zone), - shrink->nodes_to_scan); - } + aborted_reclaim = shrink_zones(zonelist, sc, shrink); - shrink_slab(shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; -- cgit v1.2.3 From 3115cd9145511c9159a19b118836dd5115d0a781 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 3 Apr 2014 14:47:22 -0700 Subject: mm: vmscan: remove shrink_control arg from do_try_to_free_pages() There is no need passing on a shrink_control struct from try_to_free_pages() and friends to do_try_to_free_pages() and then to shrink_zones(), because it is only used in shrink_zones() and the only field initialized on the top level is gfp_mask, which is always equal to scan_control.gfp_mask. So let's move shrink_control initialization to shrink_zones(). Signed-off-by: Vladimir Davydov Cc: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Dave Chinner Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 73d6fa747443..1cd48519f506 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2291,8 +2291,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * the caller that it should consider retrying the allocation instead of * further reclaim. */ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc, - struct shrink_control *shrink) +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2301,6 +2300,9 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc, unsigned long lru_pages = 0; bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2310,7 +2312,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc, if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; - nodes_clear(shrink->nodes_to_scan); + nodes_clear(shrink.nodes_to_scan); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2325,7 +2327,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc, continue; lru_pages += zone_reclaimable_pages(zone); - node_set(zone_to_nid(zone), shrink->nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) @@ -2370,7 +2372,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc, * pages. 
*/ if (global_reclaim(sc)) { - shrink_slab(shrink, sc->nr_scanned, lru_pages); + shrink_slab(&shrink, sc->nr_scanned, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -2417,8 +2419,7 @@ static bool all_unreclaimable(struct zonelist *zonelist, * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc, - struct shrink_control *shrink) + struct scan_control *sc) { unsigned long total_scanned = 0; unsigned long writeback_threshold; @@ -2433,7 +2434,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc, shrink); + aborted_reclaim = shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) @@ -2596,9 +2597,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .target_mem_cgroup = NULL, .nodemask = nodemask, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; /* * Do not enter reclaim if fatal signal was delivered while throttled. @@ -2612,7 +2610,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, sc.may_writepage, gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); @@ -2679,9 +2677,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't @@ -2696,7 +2691,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.may_writepage, sc.gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3331,9 +3326,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .order = 0, .priority = DEF_PRIORITY, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; @@ -3343,7 +3335,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); -- cgit v1.2.3 From 91ca9186484809c57303b33778d841cc28f696ed Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 3 Apr 2014 14:47:23 -0700 Subject: mm, compaction: ignore pageblock skip when manually invoking compaction The cached pageblock hint should be ignored when triggering compaction through /proc/sys/vm/compact_memory so all eligible memory is isolated. Manually invoking compaction is known to be expensive, there's no need to skip pageblocks based on heuristics (mainly for debugging). 
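A small illustrative scanner (not mm/compaction.c; names and layout are invented) showing how an ignore_skip_hint flag bypasses the cached per-pageblock skip bits, which is all the one-line change does for the manual /proc trigger:

    #include <stdbool.h>
    #include <stddef.h>

    struct scan_ctl {
            bool ignore_skip_hint;
    };

    static bool should_skip_block(const struct scan_ctl *cc,
                                  const bool *skip_bits, size_t block)
    {
            if (cc->ignore_skip_hint)
                    return false;           /* manual trigger: scan everything */
            return skip_bits[block];        /* trust the cached hint */
    }

    static size_t count_scanned(const struct scan_ctl *cc,
                                const bool *skip_bits, size_t nr_blocks)
    {
            size_t scanned = 0;

            for (size_t i = 0; i < nr_blocks; i++)
                    if (!should_skip_block(cc, skip_bits, i))
                            scanned++;
            return scanned;
    }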
Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/compaction.c b/mm/compaction.c index 918577595ea8..37b3799d948c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1186,6 +1186,7 @@ static void compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, + .ignore_skip_hint = true, }; __compact_pgdat(NODE_DATA(nid), &cc); -- cgit v1.2.3 From d26914d11751b23ca2e8747725f2cae10c2f2c1b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 3 Apr 2014 14:47:24 -0700 Subject: mm: optimize put_mems_allowed() usage Since put_mems_allowed() is strictly optional, its a seqcount retry, we don't need to evaluate the function if the allocation was in fact successful, saving a smp_rmb some loads and comparisons on some relative fast-paths. Since the naming, get/put_mems_allowed() does suggest a mandatory pairing, rename the interface, as suggested by Mel, to resemble the seqcount interface. This gives us: read_mems_allowed_begin() and read_mems_allowed_retry(), where it is important to note that the return value of the latter call is inverted from its previous incarnation. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 27 ++++++++++++++------------- kernel/cpuset.c | 2 +- mm/filemap.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 12 ++++++------ mm/page_alloc.c | 8 ++++---- mm/slab.c | 4 ++-- mm/slub.c | 16 +++++++--------- 8 files changed, 38 insertions(+), 39 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 3fe661fe96d1..b19d3dc2e651 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); /* - * get_mems_allowed is required when making decisions involving mems_allowed - * such as during page allocation. mems_allowed can be updated in parallel - * and depending on the new value an operation can fail potentially causing - * process failure. A retry loop with get_mems_allowed and put_mems_allowed - * prevents these artificial failures. + * read_mems_allowed_begin is required when making decisions involving + * mems_allowed such as during page allocation. mems_allowed can be updated in + * parallel and depending on the new value an operation can fail potentially + * causing process failure. A retry loop with read_mems_allowed_begin and + * read_mems_allowed_retry prevents these artificial failures. */ -static inline unsigned int get_mems_allowed(void) +static inline unsigned int read_mems_allowed_begin(void) { return read_seqcount_begin(¤t->mems_allowed_seq); } /* - * If this returns false, the operation that took place after get_mems_allowed - * may have failed. It is up to the caller to retry the operation if + * If this returns true, the operation that took place after + * read_mems_allowed_begin may have failed artificially due to a concurrent + * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. 
*/ -static inline bool put_mems_allowed(unsigned int seq) +static inline bool read_mems_allowed_retry(unsigned int seq) { - return !read_seqcount_retry(¤t->mems_allowed_seq, seq); + return read_seqcount_retry(¤t->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) @@ -225,14 +226,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline unsigned int get_mems_allowed(void) +static inline unsigned int read_mems_allowed_begin(void) { return 0; } -static inline bool put_mems_allowed(unsigned int seq) +static inline bool read_mems_allowed_retry(unsigned int seq) { - return true; + return false; } #endif /* !CONFIG_CPUSETS */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e6b1b66afe52..f6fc7475f1a1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing - * get_mems_allowed(). If at least one node remains unchanged and + * read_mems_allowed_begin(). If at least one node remains unchanged and * tsk does not have a mempolicy, then an empty nodemask will not be * possible when mems_allowed is larger than a word. */ diff --git a/mm/filemap.c b/mm/filemap.c index 7a13f6ac5421..068cd2a63d32 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -520,10 +520,10 @@ struct page *__page_cache_alloc(gfp_t gfp) if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c01cb9fedb18..139b7462203b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -540,7 +540,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, goto err; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask(h), &mpol, &nodemask); @@ -562,7 +562,7 @@ retry_cpuset: } mpol_cond_put(mpol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4755c8576942..e3ab02822799 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1899,7 +1899,7 @@ int node_random(const nodemask_t *maskp) * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. 
* - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -2063,7 +2063,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, retry_cpuset: pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -2071,7 +2071,7 @@ retry_cpuset: nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2081,7 +2081,7 @@ retry_cpuset: policy_nodemask(gfp, pol)); if (unlikely(mpol_needs_cond_ref(pol))) __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -2115,7 +2115,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* * No reference counting needed for current->mempolicy @@ -2128,7 +2128,7 @@ retry_cpuset: policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3bac76ae4b30..979378deccbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2739,7 +2739,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, return NULL; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, @@ -2777,7 +2777,7 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. 
*/ - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; memcg_kmem_commit_charge(page, memcg, order); @@ -3045,9 +3045,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) goto out; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); out: return ret; } diff --git a/mm/slab.c b/mm/slab.c index b264214c77ea..9153c802e2fe 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3073,7 +3073,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(slab_node(), flags); retry: @@ -3131,7 +3131,7 @@ retry: } } - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index 25f14ad8f817..7611f148ee81 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, object = get_partial_node(s, n, c, flags); if (object) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. It's a - * harmless race between the alloc - * and the cpuset update. + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); return object; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } -- cgit v1.2.3 From 9119a41e9091fb3a8204039d595bcdae24193c57 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 3 Apr 2014 14:47:25 -0700 Subject: mm, hugetlb: unify region structure handling Currently, to track reserved and allocated regions, we use two different ways, depending on the mapping. For MAP_SHARED, we use address_mapping's private_list and, while for MAP_PRIVATE, we use a resv_map. Now, we are preparing to change a coarse grained lock which protect a region structure to fine grained lock, and this difference hinder it. So, before changing it, unify region structure handling, consistently using a resv_map regardless of the kind of mapping. 
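The shape of the unification can be sketched in a few lines of standalone C (illustrative names only, not the hugetlb code): a refcounted resv_map that shared mappings hang off the address space and private mappings hang off the vma, freed through one release path instead of the old split between a bare list_head and a resv_map.

    #include <stdlib.h>

    struct resv_map_sketch {
            int refs;
            /* the region list itself would live here */
    };

    static struct resv_map_sketch *resv_map_alloc_sketch(void)
    {
            struct resv_map_sketch *map = calloc(1, sizeof(*map));

            if (!map)
                    return NULL;
            map->refs = 1;          /* owner (inode mapping or vma) holds it */
            return map;
    }

    static void resv_map_put_sketch(struct resv_map_sketch *map)
    {
            if (map && --map->refs == 0)
                    free(map);      /* kref_put()-style release */
    }

Because both mapping types now reach the region list through the same structure, the region bookkeeping callers no longer need to know which kind of mapping they are serving.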
Signed-off-by: Joonsoo Kim Signed-off-by: Davidlohr Bueso Reviewed-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 17 +++++++++++++++-- include/linux/hugetlb.h | 9 +++++++++ mm/hugetlb.c | 37 +++++++++++++++++++++---------------- 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d19b30ababf1..204027520937 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -366,7 +366,13 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) static void hugetlbfs_evict_inode(struct inode *inode) { + struct resv_map *resv_map; + truncate_hugepages(inode, 0); + resv_map = (struct resv_map *)inode->i_mapping->private_data; + /* root inode doesn't have the resv_map, so we should check it */ + if (resv_map) + resv_map_release(&resv_map->refs); clear_inode(inode); } @@ -476,6 +482,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev) { struct inode *inode; + struct resv_map *resv_map; + + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; inode = new_inode(sb); if (inode) { @@ -487,7 +498,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - INIT_LIST_HEAD(&inode->i_mapping->private_list); + inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); /* * The policy is initialized here even if we are creating a @@ -517,7 +528,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } + } else + kref_put(&resv_map->refs, resv_map_release); + return inode; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8c43cc469d78..f62c2f6c6059 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include struct ctl_table; struct user_struct; @@ -23,6 +25,13 @@ struct hugepage_subpool { long max_hpages, used_hpages; }; +struct resv_map { + struct kref refs; + struct list_head regions; +}; +extern struct resv_map *resv_map_alloc(void); +void resv_map_release(struct kref *ref); + extern spinlock_t hugetlb_lock; extern int hugetlb_max_hstate __read_mostly; #define for_each_hstate(h) \ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 139b7462203b..63699afc7b7f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -376,12 +376,7 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } -struct resv_map { - struct kref refs; - struct list_head regions; -}; - -static struct resv_map *resv_map_alloc(void) +struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); if (!resv_map) @@ -393,7 +388,7 @@ static struct resv_map *resv_map_alloc(void) return resv_map; } -static void resv_map_release(struct kref *ref) +void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); @@ -1155,8 +1150,9 @@ static long vma_needs_reservation(struct hstate *h, if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); - return region_chg(&inode->i_mapping->private_list, - idx, idx + 1); + struct resv_map *resv = inode->i_mapping->private_data; + + return region_chg(&resv->regions, idx, idx + 1); } 
else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { return 1; @@ -1180,7 +1176,9 @@ static void vma_commit_reservation(struct hstate *h, if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); - region_add(&inode->i_mapping->private_list, idx, idx + 1); + struct resv_map *resv = inode->i_mapping->private_data; + + region_add(&resv->regions, idx, idx + 1); } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); @@ -3161,6 +3159,7 @@ int hugetlb_reserve_pages(struct inode *inode, long ret, chg; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); + struct resv_map *resv_map; /* * Only apply hugepage reservation if asked. At fault time, an @@ -3176,10 +3175,13 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) - chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); + if (!vma || vma->vm_flags & VM_MAYSHARE) { + resv_map = inode->i_mapping->private_data; + + chg = region_chg(&resv_map->regions, from, to); + + } else { + resv_map = resv_map_alloc(); if (!resv_map) return -ENOMEM; @@ -3222,7 +3224,7 @@ int hugetlb_reserve_pages(struct inode *inode, * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(&inode->i_mapping->private_list, from, to); + region_add(&resv_map->regions, from, to); return 0; out_err: if (vma) @@ -3233,9 +3235,12 @@ out_err: void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); - long chg = region_truncate(&inode->i_mapping->private_list, offset); + struct resv_map *resv_map = inode->i_mapping->private_data; + long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); + if (resv_map) + chg = region_truncate(&resv_map->regions, offset); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); -- cgit v1.2.3 From 1406ec9ba6c65cb69e9243bff07ca3f51e2525e0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 3 Apr 2014 14:47:26 -0700 Subject: mm, hugetlb: improve, cleanup resv_map parameters To change a protection method for region tracking to find grained one, we pass the resv_map, instead of list_head, to region manipulation functions. This doesn't introduce any functional change, and it is just for preparing a next step. [davidlohr@hp.com: update changelog] Signed-off-by: Joonsoo Kim Signed-off-by: Davidlohr Bueso Reviewed-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 63699afc7b7f..454b311373c5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -151,8 +151,9 @@ struct file_region { long to; }; -static long region_add(struct list_head *head, long f, long t) +static long region_add(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; /* Locate the region we are either in or before. 
*/ @@ -187,8 +188,9 @@ static long region_add(struct list_head *head, long f, long t) return 0; } -static long region_chg(struct list_head *head, long f, long t) +static long region_chg(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg, *nrg; long chg = 0; @@ -236,8 +238,9 @@ static long region_chg(struct list_head *head, long f, long t) return chg; } -static long region_truncate(struct list_head *head, long end) +static long region_truncate(struct resv_map *resv, long end) { + struct list_head *head = &resv->regions; struct file_region *rg, *trg; long chg = 0; @@ -266,8 +269,9 @@ static long region_truncate(struct list_head *head, long end) return chg; } -static long region_count(struct list_head *head, long f, long t) +static long region_count(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; @@ -393,7 +397,7 @@ void resv_map_release(struct kref *ref) struct resv_map *resv_map = container_of(ref, struct resv_map, refs); /* Clear out any active regions before we release the map. */ - region_truncate(&resv_map->regions, 0); + region_truncate(resv_map, 0); kfree(resv_map); } @@ -1152,7 +1156,7 @@ static long vma_needs_reservation(struct hstate *h, pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *resv = inode->i_mapping->private_data; - return region_chg(&resv->regions, idx, idx + 1); + return region_chg(resv, idx, idx + 1); } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { return 1; @@ -1162,7 +1166,7 @@ static long vma_needs_reservation(struct hstate *h, pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *resv = vma_resv_map(vma); - err = region_chg(&resv->regions, idx, idx + 1); + err = region_chg(resv, idx, idx + 1); if (err < 0) return err; return 0; @@ -1178,14 +1182,14 @@ static void vma_commit_reservation(struct hstate *h, pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *resv = inode->i_mapping->private_data; - region_add(&resv->regions, idx, idx + 1); + region_add(resv, idx, idx + 1); } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *resv = vma_resv_map(vma); /* Mark this page used in the map. 
*/ - region_add(&resv->regions, idx, idx + 1); + region_add(resv, idx, idx + 1); } } @@ -2276,7 +2280,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - - region_count(&resv->regions, start, end); + region_count(resv, start, end); resv_map_put(vma); @@ -3178,7 +3182,7 @@ int hugetlb_reserve_pages(struct inode *inode, if (!vma || vma->vm_flags & VM_MAYSHARE) { resv_map = inode->i_mapping->private_data; - chg = region_chg(&resv_map->regions, from, to); + chg = region_chg(resv_map, from, to); } else { resv_map = resv_map_alloc(); @@ -3224,7 +3228,7 @@ int hugetlb_reserve_pages(struct inode *inode, * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(&resv_map->regions, from, to); + region_add(resv_map, from, to); return 0; out_err: if (vma) @@ -3240,7 +3244,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) struct hugepage_subpool *spool = subpool_inode(inode); if (resv_map) - chg = region_truncate(&resv_map->regions, offset); + chg = region_truncate(resv_map, offset); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); -- cgit v1.2.3 From 7b24d8616be33616efd41ff67d3c76362c60ca84 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Apr 2014 14:47:27 -0700 Subject: mm, hugetlb: fix race in region tracking There is a race condition if we map a same file on different processes. Region tracking is protected by mmap_sem and hugetlb_instantiation_mutex. When we do mmap, we don't grab a hugetlb_instantiation_mutex, but only mmap_sem (exclusively). This doesn't prevent other tasks from modifying the region structure, so it can be modified by two processes concurrently. To solve this, introduce a spinlock to resv_map and make region manipulation function grab it before they do actual work. [davidlohr@hp.com: updated changelog] Signed-off-by: Davidlohr Bueso Signed-off-by: Joonsoo Kim Suggested-by: Joonsoo Kim Acked-by: David Gibson Cc: David Gibson Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 58 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f62c2f6c6059..5b337cf8fb86 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -27,6 +27,7 @@ struct hugepage_subpool { struct resv_map { struct kref refs; + spinlock_t lock; struct list_head regions; }; extern struct resv_map *resv_map_alloc(void); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 454b311373c5..5a2515a774b5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -135,15 +135,8 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * - * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantiation_mutex. 
To access or modify a region the caller - * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation_mutex: - * - * down_write(&mm->mmap_sem); - * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * The region data structures are embedded into a resv_map and + * protected by a resv_map's lock */ struct file_region { struct list_head link; @@ -156,6 +149,7 @@ static long region_add(struct resv_map *resv, long f, long t) struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -185,15 +179,18 @@ static long region_add(struct resv_map *resv, long f, long t) } nrg->from = f; nrg->to = t; + spin_unlock(&resv->lock); return 0; } static long region_chg(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; - struct file_region *rg, *nrg; + struct file_region *rg, *nrg = NULL; long chg = 0; +retry: + spin_lock(&resv->lock); /* Locate the region we are before or in. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -203,15 +200,21 @@ static long region_chg(struct resv_map *resv, long f, long t) * Subtle, allocate a new region at the position but make it zero * size such that we can guarantee to record the reservation. */ if (&rg->link == head || t < rg->from) { - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - list_add(&nrg->link, rg->link.prev); + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + goto retry; + } - return t - f; + list_add(&nrg->link, rg->link.prev); + chg = t - f; + goto out_nrg; } /* Round our left edge to the current segment if it encloses us. */ @@ -224,7 +227,7 @@ static long region_chg(struct resv_map *resv, long f, long t) if (&rg->link == head) break; if (rg->from > t) - return chg; + goto out; /* We overlap with this area, if it extends further than * us then we must extend ourselves. Account for its @@ -235,6 +238,14 @@ static long region_chg(struct resv_map *resv, long f, long t) } chg -= rg->to - rg->from; } + +out: + spin_unlock(&resv->lock); + /* We already know we raced and no longer need the new region */ + kfree(nrg); + return chg; +out_nrg: + spin_unlock(&resv->lock); return chg; } @@ -244,12 +255,13 @@ static long region_truncate(struct resv_map *resv, long end) struct file_region *rg, *trg; long chg = 0; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (end <= rg->to) break; if (&rg->link == head) - return 0; + goto out; /* If we are in the middle of a region then adjust it. */ if (end > rg->from) { @@ -266,6 +278,9 @@ static long region_truncate(struct resv_map *resv, long end) list_del(&rg->link); kfree(rg); } + +out: + spin_unlock(&resv->lock); return chg; } @@ -275,6 +290,7 @@ static long region_count(struct resv_map *resv, long f, long t) struct file_region *rg; long chg = 0; + spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. 
*/ list_for_each_entry(rg, head, link) { long seg_from; @@ -290,6 +306,7 @@ static long region_count(struct resv_map *resv, long f, long t) chg += seg_to - seg_from; } + spin_unlock(&resv->lock); return chg; } @@ -387,6 +404,7 @@ struct resv_map *resv_map_alloc(void) return NULL; kref_init(&resv_map->refs); + spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); return resv_map; -- cgit v1.2.3 From f031dd274ccb7069012ede73f537cc81c42fc80b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 3 Apr 2014 14:47:28 -0700 Subject: mm, hugetlb: remove resv_map_put This is a preparation patch to unify the use of vma_resv_map() regardless of the map type. This patch prepares it by removing resv_map_put(), which only works for HPAGE_RESV_OWNER's resv_map, not for all resv_maps. [davidlohr@hp.com: update changelog] Signed-off-by: Joonsoo Kim Signed-off-by: Davidlohr Bueso Reviewed-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5a2515a774b5..c7918cd3a153 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2275,15 +2275,6 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) kref_get(&resv->refs); } -static void resv_map_put(struct vm_area_struct *vma) -{ - struct resv_map *resv = vma_resv_map(vma); - - if (!resv) - return; - kref_put(&resv->refs, resv_map_release); -} - static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); @@ -2300,7 +2291,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) reserve = (end - start) - region_count(resv, start, end); - resv_map_put(vma); + kref_put(&resv->refs, resv_map_release); if (reserve) { hugetlb_acct_memory(h, -reserve); @@ -3249,8 +3240,8 @@ int hugetlb_reserve_pages(struct inode *inode, region_add(resv_map, from, to); return 0; out_err: - if (vma) - resv_map_put(vma); + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&resv_map->refs, resv_map_release); return ret; } -- cgit v1.2.3 From 4e35f483850ba46b838adfd312b3052416e15204 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 3 Apr 2014 14:47:30 -0700 Subject: mm, hugetlb: use vma_resv_map() map types Util now, we get a resv_map by two ways according to each mapping type. This makes code dirty and unreadable. Unify it. 
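To make the goal concrete, here is a minimal sketch of the unified lookup this patch introduces; it mirrors the vma_resv_map()/inode_resv_map() pair in the diff below and is not a verbatim copy.

	static struct resv_map *sketch_resv_map_of(struct vm_area_struct *vma)
	{
		VM_BUG_ON(!is_vm_hugetlb_page(vma));

		if (vma->vm_flags & VM_MAYSHARE) {
			/* shared: the map hangs off the hugetlbfs inode */
			struct inode *inode = vma->vm_file->f_mapping->host;

			return inode->i_mapping->private_data;
		}
		/* private: the HPAGE_RESV_OWNER keeps it in vm_private_data */
		return (struct resv_map *)(get_vma_private_data(vma) &
					   ~HPAGE_RESV_MASK);
	}

Callers such as vma_needs_reservation() then only have to handle the NULL case, which now simply means a private mapping that owns no reserves.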
[davidlohr@hp.com: code cleanups] Signed-off-by: Joonsoo Kim Signed-off-by: Davidlohr Bueso Reviewed-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 95 ++++++++++++++++++++++++++++-------------------------------- 1 file changed, 45 insertions(+), 50 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c7918cd3a153..1c7baff65f9d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -419,13 +419,24 @@ void resv_map_release(struct kref *ref) kfree(resv_map); } +static inline struct resv_map *inode_resv_map(struct inode *inode) +{ + return inode->i_mapping->private_data; +} + static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (vma->vm_flags & VM_MAYSHARE) { + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + return inode_resv_map(inode); + + } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); - return NULL; + } } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) @@ -1167,48 +1178,34 @@ static void return_unused_surplus_pages(struct hstate *h, static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = inode->i_mapping->private_data; - - return region_chg(resv, idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; + long chg; - } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv = vma_resv_map(vma); + if (!resv) return 1; - } else { - long err; - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = vma_resv_map(vma); + idx = vma_hugecache_offset(h, vma, addr); + chg = region_chg(resv, idx, idx + 1); - err = region_chg(resv, idx, idx + 1); - if (err < 0) - return err; - return 0; - } + if (vma->vm_flags & VM_MAYSHARE) + return chg; + else + return chg < 0 ? chg : 0; } static void vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = inode->i_mapping->private_data; - - region_add(resv, idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = vma_resv_map(vma); + resv = vma_resv_map(vma); + if (!resv) + return; - /* Mark this page used in the map. */ - region_add(resv, idx, idx + 1); - } + idx = vma_hugecache_offset(h, vma, addr); + region_add(resv, idx, idx + 1); } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -2271,7 +2268,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. 
*/ - if (resv) + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_get(&resv->refs); } @@ -2280,23 +2277,21 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) struct hstate *h = hstate_vma(vma); struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); - unsigned long reserve; - unsigned long start; - unsigned long end; + unsigned long reserve, start, end; - if (resv) { - start = vma_hugecache_offset(h, vma, vma->vm_start); - end = vma_hugecache_offset(h, vma, vma->vm_end); + if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return; - reserve = (end - start) - - region_count(resv, start, end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); - kref_put(&resv->refs, resv_map_release); + reserve = (end - start) - region_count(resv, start, end); - if (reserve) { - hugetlb_acct_memory(h, -reserve); - hugepage_subpool_put_pages(spool, reserve); - } + kref_put(&resv->refs, resv_map_release); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); + hugepage_subpool_put_pages(spool, reserve); } } @@ -3189,7 +3184,7 @@ int hugetlb_reserve_pages(struct inode *inode, * called to make the mapping read-write. Assume !vma is a shm mapping */ if (!vma || vma->vm_flags & VM_MAYSHARE) { - resv_map = inode->i_mapping->private_data; + resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to); @@ -3248,7 +3243,7 @@ out_err: void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); - struct resv_map *resv_map = inode->i_mapping->private_data; + struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); -- cgit v1.2.3 From 8382d914ebf72092aa15cdc2a5dcedb2daa0209d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Apr 2014 14:47:31 -0700 Subject: mm, hugetlb: improve page-fault scalability The kernel can currently only handle a single hugetlb page fault at a time. This is due to a single mutex that serializes the entire path. This lock protects from spurious OOM errors under conditions of low availability of free hugepages. This problem is specific to hugepages, because it is normal to want to use every single hugepage in the system - with normal pages we simply assume there will always be a few spare pages which can be used temporarily until the race is resolved. Address this problem by using a table of mutexes, allowing a better chance of parallelization, where each hugepage is individually serialized. The hash key is selected depending on the mapping type. For shared ones it consists of the address space and file offset being faulted; while for private ones the mm and virtual address are used. The size of the table is selected based on a compromise of collisions and memory footprint of a series of database workloads. Large database workloads that make heavy use of hugepages can be particularly exposed to this issue, causing start-up times to be painfully slow. This patch reduces the startup time of a 10 Gb Oracle DB (with ~5000 faults) from 37.5 secs to 25.7 secs. Larger workloads will naturally benefit even more. NOTE: The only downside to this patch, detected by Joonsoo Kim, is that a small race is possible in private mappings: A child process (with its own mm, after cow) can instantiate a page that is already being handled by the parent in a cow fault. When low on pages, can trigger spurious OOMs. 
I have not been able to think of a efficient way of handling this... but do we really care about such a tiny window? We already maintain another theoretical race with normal pages. If not, one possible way to is to maintain the single hash for private mappings -- any workloads that *really* suffer from this scaling problem should already use shared mappings. [akpm@linux-foundation.org: remove stray + characters, go BUG if hugetlb_init() kmalloc fails] Signed-off-by: Davidlohr Bueso Cc: Aneesh Kumar K.V Cc: David Gibson Cc: Joonsoo Kim Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1c7baff65f9d..38d9bed88dc8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,13 @@ static unsigned long __initdata default_hstate_size; */ DEFINE_SPINLOCK(hugetlb_lock); +/* + * Serializes faults on the same logical page. This is used to + * prevent spurious OOMs when the hugepage pool is fully utilized. + */ +static int num_fault_mutexes; +static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { bool free = (spool->count == 0) && (spool->used_hpages == 0); @@ -1961,11 +1969,14 @@ static void __exit hugetlb_exit(void) } kobject_put(hugepages_kobj); + kfree(htlb_fault_mutex_table); } module_exit(hugetlb_exit); static int __init hugetlb_init(void) { + int i; + /* Some platform decide whether they support huge pages at boot * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when * there is no such support @@ -1990,6 +2001,17 @@ static int __init hugetlb_init(void) hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); +#ifdef CONFIG_SMP + num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); +#else + num_fault_mutexes = 1; +#endif + htlb_fault_mutex_table = + kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); + BUG_ON(!htlb_fault_mutex_table); + + for (i = 0; i < num_fault_mutexes; i++) + mutex_init(&htlb_fault_mutex_table[i]); return 0; } module_init(hugetlb_init); @@ -2767,15 +2789,14 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags) + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; int anon_rmap = 0; - pgoff_t idx; unsigned long size; struct page *page; - struct address_space *mapping; pte_t new_pte; spinlock_t *ptl; @@ -2790,9 +2811,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); - /* * Use page lock to guard against racing truncation * before we get page_table_lock. 
@@ -2902,17 +2920,53 @@ backout_unlocked: goto out; } +#ifdef CONFIG_SMP +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + unsigned long key[2]; + u32 hash; + + if (vma->vm_flags & VM_SHARED) { + key[0] = (unsigned long) mapping; + key[1] = idx; + } else { + key[0] = (unsigned long) mm; + key[1] = address >> huge_page_shift(h); + } + + hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + + return hash & (num_fault_mutexes - 1); +} +#else +/* + * For uniprocesor systems we always use a single mutex, so just + * return 0 and avoid the hashing overhead. + */ +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + return 0; +} +#endif + int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep; - pte_t entry; + pte_t *ptep, entry; spinlock_t *ptl; int ret; + u32 hash; + pgoff_t idx; struct page *page = NULL; struct page *pagecache_page = NULL; - static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + struct address_space *mapping; address &= huge_page_mask(h); @@ -2931,15 +2985,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!ptep) return VM_FAULT_OOM; + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - mutex_lock(&hugetlb_instantiation_mutex); + hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&htlb_fault_mutex_table[hash]); + entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, flags); + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); goto out_mutex; } @@ -3008,8 +3067,7 @@ out_ptl: put_page(page); out_mutex: - mutex_unlock(&hugetlb_instantiation_mutex); - + mutex_unlock(&htlb_fault_mutex_table[hash]); return ret; } -- cgit v1.2.3 From d5bc5fd3fcb7b8dfb431694a8c8052466504c10c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 3 Apr 2014 14:47:32 -0700 Subject: mm: vmscan: shrink_slab: rename max_pass -> freeable The name `max_pass' is misleading, because this variable actually keeps the estimate number of freeable objects, not the maximal number of objects we can scan in this pass, which can be twice that. Rename it to reflect its actual meaning. Signed-off-by: Vladimir Davydov Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1cd48519f506..c53d1a54964c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, unsigned long freed = 0; unsigned long long delta; long total_scan; - long max_pass; + long freeable; long nr; long new_nr; int nid = shrinkctl->nid; long batch_size = shrinker->batch ? 
shrinker->batch : SHRINK_BATCH; - max_pass = shrinker->count_objects(shrinker, shrinkctl); - if (max_pass == 0) + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) return 0; /* @@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; + delta *= freeable; do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); - total_scan = max_pass; + total_scan = freeable; } /* @@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * shrinkers to return -1 all the time. This results in a large * nr being built up so when a shrink that can do some work * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in + * freeable. This is bad for sustaining a working set in * memory. * * Hence only allow the shrinker to scan the entire cache when * a large delta change is calculated directly. */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; + if (total_scan > freeable * 2) + total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); + freeable, delta, total_scan); /* * Normally, we should not scan less than batch_size objects in one @@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater - * than the total number of objects on slab (max_pass), we must be + * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || - total_scan >= max_pass) { + total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); -- cgit v1.2.3 From 6a3ed2123a78de22a9e2b2855068a8d89f8e14f4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:34 -0700 Subject: mm: vmstat: fix UP zone state accounting Summary: The VM maintains cached filesystem pages on two types of lists. One list holds the pages recently faulted into the cache, the other list holds pages that have been referenced repeatedly on that first list. The idea is to prefer reclaiming young pages over those that have shown to benefit from caching in the past. We call the recently used list "inactive list" and the frequently used list "active list". Currently, the VM aims for a 1:1 ratio between the lists, which is the "perfect" trade-off between the ability to *protect* frequently used pages and the ability to *detect* frequently used pages. This means that working set changes bigger than half of cache memory go undetected and thrash indefinitely, whereas working sets bigger than half of cache memory are unprotected against used-once streams that don't even need caching. 
This happens on file servers and media streaming servers, where the popular files and file sections change over time. Even though the individual files might be smaller than half of memory, concurrent access to many of them may still result in their inter-reference distance being greater than half of memory. It's also been reported as a problem on database workloads that switch back and forth between tables that are bigger than half of memory. In these cases the VM never recognizes the new working set and will for the remainder of the workload thrash disk data which could easily live in memory. Historically, every reclaim scan of the inactive list also took a smaller number of pages from the tail of the active list and moved them to the head of the inactive list. This model gave established working sets more gracetime in the face of temporary use-once streams, but ultimately was not significantly better than a FIFO policy and still thrashed cache based on eviction speed, rather than actual demand for cache. This series solves the problem by maintaining a history of pages evicted from the inactive list, enabling the VM to detect frequently used pages regardless of inactive list size and facilitate working set transitions. Tests: The reported database workload is easily demonstrated on a 8G machine with two filesets a 6G. This fio workload operates on one set first, then switches to the other. The VM should obviously always cache the set that the workload is currently using. This test is based on a problem encountered by Citus Data customers: http://citusdata.com/blog/72-linux-memory-manager-and-your-big-data unpatched: db1: READ: io=98304MB, aggrb=885559KB/s, minb=885559KB/s, maxb=885559KB/s, mint= 113672msec, maxt= 113672msec db2: READ: io=98304MB, aggrb= 66169KB/s, minb= 66169KB/s, maxb= 66169KB/s, mint=1521302msec, maxt=1521302msec sdb: ios=835750/4, merge=2/1, ticks=4659739/60016, in_queue=4719203, util=98.92% real 27m15.541s user 0m19.059s sys 0m51.459s patched: db1: READ: io=98304MB, aggrb=877783KB/s, minb=877783KB/s, maxb=877783KB/s, mint=114679msec, maxt=114679msec db2: READ: io=98304MB, aggrb=397449KB/s, minb=397449KB/s, maxb=397449KB/s, mint=253273msec, maxt=253273msec sdb: ios=170587/4, merge=2/1, ticks=954910/61123, in_queue=1015923, util=90.40% real 6m8.630s user 0m14.714s sys 0m31.233s As can be seen, the unpatched kernel simply never adapts to the workingset change and db2 is stuck indefinitely with secondary storage speed. The patched kernel needs 2-3 iterations over db2 before it replaces db1 and reaches full memory speed. Given the unbounded negative affect of the existing VM behavior, these patches should be considered correctness fixes rather than performance optimizations. Another test resembles a fileserver or streaming server workload, where data in excess of memory size is accessed at different frequencies. There is very hot data accessed at a high frequency. Machines should be fitted so that the hot set of such a workload can be fully cached or all bets are off. Then there is a very big (compared to available memory) set of data that is used-once or at a very low frequency; this is what drives the inactive list and does not really benefit from caching. Lastly, there is a big set of warm data in between that is accessed at medium frequencies and benefits from caching the pages between the first and last streamer of each burst. 
unpatched: hot: READ: io=128000MB, aggrb=160693KB/s, minb=160693KB/s, maxb=160693KB/s, mint=815665msec, maxt=815665msec warm: READ: io= 81920MB, aggrb=109853KB/s, minb= 27463KB/s, maxb= 29244KB/s, mint=717110msec, maxt=763617msec cold: READ: io= 30720MB, aggrb= 35245KB/s, minb= 35245KB/s, maxb= 35245KB/s, mint=892530msec, maxt=892530msec sdb: ios=797960/4, merge=11763/1, ticks=4307910/796, in_queue=4308380, util=100.00% patched: hot: READ: io=128000MB, aggrb=160678KB/s, minb=160678KB/s, maxb=160678KB/s, mint=815740msec, maxt=815740msec warm: READ: io= 81920MB, aggrb=147747KB/s, minb= 36936KB/s, maxb= 40960KB/s, mint=512000msec, maxt=567767msec cold: READ: io= 30720MB, aggrb= 40960KB/s, minb= 40960KB/s, maxb= 40960KB/s, mint=768000msec, maxt=768000msec sdb: ios=596514/4, merge=9341/1, ticks=2395362/997, in_queue=2396484, util=79.18% In both kernels, the hot set is propagated to the active list and then served from cache. In both kernels, the beginning of the warm set is propagated to the active list as well, but in the unpatched case the active list eventually takes up half of memory and no new pages from the warm set get activated, despite repeated access, and despite most of the active list soon being stale. The patched kernel on the other hand detects the thrashing and manages to keep this cache window rolling through the data set. This frees up enough IO bandwidth that the cold set is served at full speed as well and disk utilization even drops by 20%. For reference, this same test was performed with the traditional demotion mechanism, where deactivation is coupled to inactive list reclaim. However, this had the same outcome as the unpatched kernel: while the warm set does indeed get activated continuously, it is forced out of the active list by inactive list pressure, which is dictated primarily by the unrelated cold set. The warm set is evicted before subsequent streamers can benefit from it, even though there would be enough space available to cache the pages of interest. Costs: Page reclaim used to shrink the radix trees but now the tree nodes are reused for shadow entries, where the cost depends heavily on the page cache access patterns. However, with workloads that maintain spatial or temporal locality, the shadow entries are either refaulted quickly or reclaimed along with the inode object itself. Workloads that will experience a memory cost increase are those that don't really benefit from caching in the first place. A more predictable alternative would be a fixed-cost separate pool of shadow entries, but this would incur relatively higher memory cost for well-behaved workloads at the benefit of cornercases. It would also make the shadow entry lookup more costly compared to storing them directly in the cache structure. Future: To simplify the merging process, this patch set is implementing thrash detection on a global per-zone level only for now, but the design is such that it can be extended to memory cgroups as well. All we need to do is store the unique cgroup ID along the node and zone identifier inside the eviction cookie to identify the lruvec. Right now we have a fixed ratio (50:50) between inactive and active list but we already have complaints about working sets exceeding half of memory being pushed out of the cache by simple streaming in the background. Ultimately, we want to adjust this ratio and allow for a much smaller inactive list. 
These patches are an essential step in this direction because they decouple the VMs ability to detect working set changes from the inactive list size. This would allow us to base the inactive list size on the combined readahead window size for example and potentially protect a much bigger working set. It's also a big step towards activating pages with a reuse distance larger than memory, as long as they are the most frequently used pages in the workload. This will require knowing more about the access frequency of active pages than what we measure right now, so it's also deferred in this series. Another possibility of having thrashing information would be to revisit the idea of local reclaim in the form of zero-config memory control groups. Instead of having allocating tasks go straight to global reclaim, they could try to reclaim the pages in the memcg they are part of first as long as the group is not thrashing. This would allow a user to drop e.g. a back-up job in an otherwise unconfigured memcg and it would only inflate (and possibly do global reclaim) until it has enough memory to do proper readahead. But once it reaches that point and stops thrashing it would just recycle its own used-once pages without kicking out the cache of any other tasks in the system more than necessary. This patch (of 10): Fengguang Wu's build testing spotted problems with inc_zone_state() and dec_zone_state() on UP configurations in out-of-tree patches. inc_zone_state() is declared but not defined, dec_zone_state() is missing entirely. Just like with *_zone_page_state(), they can be defined like their preemption-unsafe counterparts on UP. [akpm@linux-foundation.org: make it build] Signed-off-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Minchan Kim Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Rik van Riel Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 67ce70c8279b..ea4476157e00 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -187,8 +187,6 @@ extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) -extern void inc_zone_state(struct zone *, enum zone_stat_item); - #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, int); void __inc_zone_page_state(struct page *, enum zone_stat_item); @@ -230,18 +228,18 @@ static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) atomic_long_inc(&vm_stat[item]); } -static inline void __inc_zone_page_state(struct page *page, - enum zone_stat_item item) -{ - __inc_zone_state(page_zone(page), item); -} - static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&vm_stat[item]); } +static inline void __inc_zone_page_state(struct page *page, + enum zone_stat_item item) +{ + __inc_zone_state(page_zone(page), item); +} + static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { @@ -256,6 +254,9 @@ static 
inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +#define inc_zone_state __inc_zone_state +#define dec_zone_state __dec_zone_state + #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } -- cgit v1.2.3 From 55881bc76faf54653a1ba71770150f13fc85739c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:36 -0700 Subject: fs: cachefiles: use add_to_page_cache_lru() This code used to have its own lru cache pagevec up until a0b8cab3 ("mm: remove lru parameter from __pagevec_lru_add and remove parts of pagevec API"). Now it's just add_to_page_cache() followed by lru_cache_add(), might as well use add_to_page_cache_lru() directly. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cachefiles/rdwr.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index ebaff368120d..4b1fb5ca65b8 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto nomem_monitor; } - ret = add_to_page_cache(newpage, bmapping, - netpage->index, cachefiles_gfp); + ret = add_to_page_cache_lru(newpage, bmapping, + netpage->index, cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) goto nomem_page; } - /* we've installed a new backing page, so now we need to add it - * to the LRU list and start it reading */ + /* we've installed a new backing page, so now we need to start + * it reading */ installed_new_backing_page: _debug("- new %p", newpage); backpage = newpage; newpage = NULL; - lru_cache_add_file(backpage); - read_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) @@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - ret = add_to_page_cache(newpage, bmapping, - netpage->index, cachefiles_gfp); + ret = add_to_page_cache_lru(newpage, bmapping, + netpage->index, + cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) goto nomem; } - /* we've installed a new backing page, so now we need to add it - * to the LRU list and start it reading */ + /* we've installed a new backing page, so now we need + * to start it reading */ installed_new_backing_page: _debug("- new %p", newpage); backpage = newpage; newpage = NULL; - lru_cache_add_file(backpage); - reread_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) @@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, monitor_backing_page: _debug("- monitor add"); - ret = add_to_page_cache(netpage, op->mapping, netpage->index, - cachefiles_gfp); + ret = add_to_page_cache_lru(netpage, op->mapping, + netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); @@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object 
*object, goto nomem; } - lru_cache_add_file(netpage); - /* install a monitor */ page_cache_get(netpage); monitor->netfs_page = netpage; @@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, backing_page_already_uptodate: _debug("- uptodate"); - ret = add_to_page_cache(netpage, op->mapping, netpage->index, - cachefiles_gfp); + ret = add_to_page_cache_lru(netpage, op->mapping, + netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); @@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, fscache_mark_page_cached(op, netpage); - lru_cache_add_file(netpage); - /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); page_cache_release(netpage); -- cgit v1.2.3 From 53c59f262d747ea82e7414774c59a489501186a0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:39 -0700 Subject: lib: radix-tree: add radix_tree_delete_item() Provide a function that does not just delete an entry at a given index, but also allows passing in an expected item. Delete only if that item is still located at the specified index. This is handy when lockless tree traversals want to delete entries as well because they don't have to do an second, locked lookup to verify the slot has not changed under them before deleting the entry. Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + lib/radix-tree.c | 31 +++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 403940787be1..1bf0a9c388d9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index bd4a8dfdf0b8..d832117e7d94 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1337,15 +1337,18 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) } /** - * radix_tree_delete - delete an item from a radix tree + * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root * @index: index key + * @item: expected item * - * Remove the item at @index from the radix tree rooted at @root. + * Remove @item at @index from the radix tree rooted at @root. * - * Returns the address of the deleted item, or NULL if it was not present. + * Returns the address of the deleted item, or NULL if it was not present + * or the entry at the given @index was not @item. 
*/ -void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) +void *radix_tree_delete_item(struct radix_tree_root *root, + unsigned long index, void *item) { struct radix_tree_node *node = NULL; struct radix_tree_node *slot = NULL; @@ -1380,6 +1383,11 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) if (slot == NULL) goto out; + if (item && slot != item) { + slot = NULL; + goto out; + } + /* * Clear all tags associated with the item to be deleted. * This way of doing it would be inefficient, but seldom is any set. @@ -1424,6 +1432,21 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) out: return slot; } +EXPORT_SYMBOL(radix_tree_delete_item); + +/** + * radix_tree_delete - delete an item from a radix tree + * @root: radix tree root + * @index: index key + * + * Remove the item at @index from the radix tree rooted at @root. + * + * Returns the address of the deleted item, or NULL if it was not present. + */ +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) +{ + return radix_tree_delete_item(root, index, NULL); +} EXPORT_SYMBOL(radix_tree_delete); /** -- cgit v1.2.3 From 6dbaf22ce1f1dfba33313198eb5bd989ae76dd87 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:41 -0700 Subject: mm: shmem: save one radix tree lookup when truncating swapped pages Page cache radix tree slots are usually stabilized by the page lock, but shmem's swap cookies have no such thing. Because the overall truncation loop is lockless, the swap entry is currently confirmed by a tree lookup and then deleted by another tree lookup under the same tree lock region. Use radix_tree_delete_item() instead, which does the verification and deletion with only one lookup. This also allows removing the delete-only special case from shmem_radix_tree_replace(). 
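A minimal usage sketch of the new primitive, with hypothetical function and lock names (the real conversion of shmem_free_swap() is in the diff below): the caller learns in a single lookup whether the expected entry was still present and has been removed.

	/* Returns true only if @expected was still at @index and is now deleted. */
	static bool sketch_remove_if_unchanged(struct radix_tree_root *root,
					       spinlock_t *lock,
					       unsigned long index,
					       void *expected)
	{
		void *old;

		spin_lock(lock);
		old = radix_tree_delete_item(root, index, expected);
		spin_unlock(lock);

		/* NULL means the slot was empty or no longer held @expected */
		return old == expected;
	}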
Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1f18c9d0d93e..e470997010cd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -242,19 +242,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { void **pslot; - void *item = NULL; + void *item; VM_BUG_ON(!expected); + VM_BUG_ON(!replacement); pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (pslot) - item = radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock); + if (!pslot) + return -ENOENT; + item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - if (replacement) - radix_tree_replace_slot(pslot, replacement); - else - radix_tree_delete(&mapping->page_tree, index); + radix_tree_replace_slot(pslot, replacement); return 0; } @@ -386,14 +384,15 @@ export: static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { - int error; + void *old; spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + old = radix_tree_delete_item(&mapping->page_tree, index, radswap); spin_unlock_irq(&mapping->tree_lock); - if (!error) - free_swap_and_cache(radix_to_swp_entry(radswap)); - return error; + if (old != radswap) + return -ENOENT; + free_swap_and_cache(radix_to_swp_entry(radswap)); + return 0; } /* -- cgit v1.2.3 From e7b563bb2a6f4d974208da46200784b9c5b5a47e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:44 -0700 Subject: mm: filemap: move radix tree hole searching here The radix tree hole searching code is only used for page cache, for example the readahead code trying to get a picture of the area surrounding a fault. It sufficed to rely on the radix tree definition of holes, which is "empty tree slot". But this is about to change, as shadow page descriptors will be stored in the page cache after the actual pages get evicted from memory. Move the functions over to mm/filemap.c and make them native page cache operations, where they can later be adapted to handle the new definition of "page cache hole".
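A brief usage sketch of the relocated helpers, condensed from the mm/readahead.c hunk at the end of this patch: they are called under rcu_read_lock() to size a window of already-cached pages.

	/* How many pages immediately preceding @offset are already cached? */
	static pgoff_t sketch_cached_history(struct address_space *mapping,
					     pgoff_t offset, unsigned long max)
	{
		pgoff_t head;

		rcu_read_lock();
		head = page_cache_prev_hole(mapping, offset - 1, max);
		rcu_read_unlock();

		return offset - 1 - head;
	}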
Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/blocklayout/blocklayout.c | 2 +- include/linux/pagemap.h | 5 +++ include/linux/radix-tree.h | 4 --- lib/radix-tree.c | 75 --------------------------------------- mm/filemap.c | 76 ++++++++++++++++++++++++++++++++++++++++ mm/readahead.c | 4 +-- 6 files changed, 84 insertions(+), 82 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 56ff823ca82e..65d849bdf77a 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1213,7 +1213,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); if (end != NFS_I(inode)->npages) { rcu_read_lock(); - end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); + end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1710d1b060ba..52d56872fe26 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -243,6 +243,11 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x) typedef int filler_t(void *, struct page *); +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan); +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan); + extern struct page * find_get_page(struct address_space *mapping, pgoff_t index); extern struct page * find_lock_page(struct address_space *mapping, diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 1bf0a9c388d9..e8be53ecfc45 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -227,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); -unsigned long radix_tree_next_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan); -unsigned long radix_tree_prev_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d832117e7d94..7e30d2a7f346 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -946,81 +946,6 @@ next: } EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); - -/** - * radix_tree_next_hole - find the next hole (not-present entry) - * @root: tree root - * @index: index key - * @max_scan: maximum range to search - * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest - * indexed hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= max_scan' - * will be true). In rare cases of index wrap-around, 0 will be returned. - * - * radix_tree_next_hole may be called under rcu_read_lock. 
However, like - * radix_tree_gang_lookup, this will not atomically search a snapshot of - * the tree at a single point in time. For example, if a hole is created - * at index 5, then subsequently a hole is created at index 10, - * radix_tree_next_hole covering both indexes may return 10 if called - * under rcu_read_lock. - */ -unsigned long radix_tree_next_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan) -{ - unsigned long i; - - for (i = 0; i < max_scan; i++) { - if (!radix_tree_lookup(root, index)) - break; - index++; - if (index == 0) - break; - } - - return index; -} -EXPORT_SYMBOL(radix_tree_next_hole); - -/** - * radix_tree_prev_hole - find the prev hole (not-present entry) - * @root: tree root - * @index: index key - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] - * for the first hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= max_scan' - * will be true). In rare cases of wrap-around, ULONG_MAX will be returned. - * - * radix_tree_next_hole may be called under rcu_read_lock. However, like - * radix_tree_gang_lookup, this will not atomically search a snapshot of - * the tree at a single point in time. For example, if a hole is created - * at index 10, then subsequently a hole is created at index 5, - * radix_tree_prev_hole covering both indexes may return 5 if called under - * rcu_read_lock. - */ -unsigned long radix_tree_prev_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan) -{ - unsigned long i; - - for (i = 0; i < max_scan; i++) { - if (!radix_tree_lookup(root, index)) - break; - index--; - if (index == ULONG_MAX) - break; - } - - return index; -} -EXPORT_SYMBOL(radix_tree_prev_hole); - /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root diff --git a/mm/filemap.c b/mm/filemap.c index 068cd2a63d32..40115c6c0791 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -685,6 +685,82 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } } +/** + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. 
+ */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(&mapping->page_tree, index)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(&mapping->page_tree, index)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + /** * find_get_page - find and get a page reference * @mapping: the address_space to search diff --git a/mm/readahead.c b/mm/readahead.c index 0de2360d65f3..c62d85ace0cc 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -347,7 +347,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -427,7 +427,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset + 1, max); rcu_read_unlock(); if (!start || start - offset > max) -- cgit v1.2.3 From 0cd6144aadd2afd19d1aca880153530c52957604 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:46 -0700 Subject: mm + fs: prepare for non-page entries in page cache radix trees shmem mappings already contain exceptional entries where swap slot information is remembered. To be able to store eviction information for regular page cache, prepare every site dealing with the radix trees directly to handle entries other than pages. The common lookup functions will filter out non-page entries and return NULL for page cache holes, just as before. But provide a raw version of the API which returns non-page entries as well, and switch shmem over to use it. 
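A minimal caller-side sketch (not part of the patch; lookup_page_or_swap is a made-up name) of how the raw entry API is meant to be used, following the pattern this patch introduces in mincore and shmem. It assumes a shmem mapping with CONFIG_SWAP, where an exceptional entry encodes a swap slot:

static struct page *lookup_page_or_swap(struct address_space *mapping,
                                        pgoff_t index)
{
        struct page *page = find_get_entry(mapping, index);

        if (radix_tree_exceptional_entry(page)) {
                /* shmem stores a swp_entry_t here instead of a page */
                swp_entry_t swap = radix_to_swp_entry(page);

                return find_get_page(swap_address_space(swap), swap.val);
        }
        return page;    /* a real page with elevated refcount, or NULL */
}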
Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/compression.c | 2 +- include/linux/mm.h | 8 ++ include/linux/pagemap.h | 15 ++-- include/linux/pagevec.h | 5 ++ include/linux/shmem_fs.h | 1 + mm/filemap.c | 202 +++++++++++++++++++++++++++++++++++++++++------ mm/mincore.c | 20 +++-- mm/readahead.c | 2 +- mm/shmem.c | 99 +++++------------------ mm/swap.c | 51 ++++++++++++ mm/truncate.c | 74 ++++++++++++++--- 11 files changed, 349 insertions(+), 130 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b01fb6c527e3..d43c544d3b68 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, pg_index); rcu_read_unlock(); - if (page) { + if (page && !radix_tree_exceptional_entry(page)) { misses++; if (misses > 4) break; diff --git a/include/linux/mm.h b/include/linux/mm.h index 2eec61fe75c9..b1331aff769c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1041,6 +1041,14 @@ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); int shmem_zero_setup(struct vm_area_struct *); +#ifdef CONFIG_SHMEM +bool shmem_mapping(struct address_space *mapping); +#else +static inline bool shmem_mapping(struct address_space *mapping) +{ + return false; +} +#endif extern int can_do_mlock(void); extern int user_shm_lock(size_t, struct user_struct *); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 52d56872fe26..493bfd85214e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -248,12 +248,15 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, pgoff_t page_cache_prev_hole(struct address_space *mapping, pgoff_t index, unsigned long max_scan); -extern struct page * find_get_page(struct address_space *mapping, - pgoff_t index); -extern struct page * find_lock_page(struct address_space *mapping, - pgoff_t index); -extern struct page * find_or_create_page(struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); +struct page *find_get_page(struct address_space *mapping, pgoff_t offset); +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset); +struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, + gfp_t gfp_mask); +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + unsigned int nr_entries, struct page **entries, + pgoff_t *indices); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index e4dbfab37729..b45d391b4540 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -22,6 +22,11 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec 
*pvec); +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_entries, + pgoff_t *indices); +void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 9d55438bc4ad..4d1771c2d29f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -51,6 +51,7 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); +extern bool shmem_mapping(struct address_space *mapping); extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); diff --git a/mm/filemap.c b/mm/filemap.c index 40115c6c0791..efc63876477f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -446,6 +446,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page) +{ + void **slot; + int error; + + slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); + if (slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + return 0; + } + error = radix_tree_insert(&mapping->page_tree, page->index, page); + if (!error) + mapping->nrpages++; + return error; +} + /** * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add @@ -480,11 +503,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->index = offset; spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); + error = page_cache_tree_insert(mapping, page); radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); trace_mm_filemap_add_to_page_cache(page); @@ -712,7 +734,10 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, unsigned long i; for (i = 0; i < max_scan; i++) { - if (!radix_tree_lookup(&mapping->page_tree, index)) + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) break; index++; if (index == 0) @@ -750,7 +775,10 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, unsigned long i; for (i = 0; i < max_scan; i++) { - if (!radix_tree_lookup(&mapping->page_tree, index)) + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) break; index--; if (index == ULONG_MAX) @@ -762,14 +790,19 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, EXPORT_SYMBOL(page_cache_prev_hole); /** - * find_get_page - find and get a page reference + * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. 
If there is a + * page cache page, it is returned with an increased refcount. * - * Is there a pagecache struct page at the given (mapping, offset) tuple? - * If yes, increment its refcount and return it; if no, return NULL. + * If the slot holds a shadow entry of a previously evicted page, it + * is returned. + * + * Otherwise, %NULL is returned. */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; @@ -810,24 +843,50 @@ out: return page; } -EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_get_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. * - * Returns zero if the page was not present. find_lock_page() may sleep. + * Otherwise, %NULL is returned. */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page = find_get_entry(mapping, offset); + + if (radix_tree_exceptional_entry(page)) + page = NULL; + return page; +} +EXPORT_SYMBOL(find_get_page); + +/** + * find_lock_entry - locate, pin and lock a page cache entry + * @mapping: the address_space to search + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * If the slot holds a shadow entry of a previously evicted page, it + * is returned. + * + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. + */ +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - page = find_get_page(mapping, offset); + page = find_get_entry(mapping, offset); if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ @@ -840,6 +899,29 @@ repeat: } return page; } +EXPORT_SYMBOL(find_lock_entry); + +/** + * find_lock_page - locate, pin and lock a pagecache page + * @mapping: the address_space to search + * @offset: the page index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * Otherwise, %NULL is returned. + * + * find_lock_page() may sleep. + */ +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page = find_lock_entry(mapping, offset); + + if (radix_tree_exceptional_entry(page)) + page = NULL; + return page; +} EXPORT_SYMBOL(find_lock_page); /** @@ -848,16 +930,18 @@ EXPORT_SYMBOL(find_lock_page); * @index: the page's index into the mapping * @gfp_mask: page allocation mode * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. 
+ * + * If the page is not present, a new page is allocated using @gfp_mask + * and added to the page cache and the VM's LRU list. The page is + * returned locked and with an increased refcount. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! + * On memory exhaustion, %NULL is returned. * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. + * find_or_create_page() may sleep, even if @gfp_flags specifies an + * atomic allocation! */ struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) @@ -889,6 +973,76 @@ repeat: } EXPORT_SYMBOL(find_or_create_page); +/** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages are included in the returned + * array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found. + */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * Otherwise, we must be storing a swap entry + * here as an exceptional entry: so return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} + /** * find_get_pages - gang pagecache lookup * @mapping: The address_space to search diff --git a/mm/mincore.c b/mm/mincore.c index 101623378fbf..725c80961048 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP - /* shmem/tmpfs may return swap: account for swapcache page too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. 
+ */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif if (page) { present = PageUptodate(page); diff --git a/mm/readahead.c b/mm/readahead.c index c62d85ace0cc..62c500a088a7 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -179,7 +179,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); rcu_read_unlock(); - if (page) + if (page && !radix_tree_exceptional_entry(page)) continue; page = page_cache_alloc_readahead(mapping); diff --git a/mm/shmem.c b/mm/shmem.c index e470997010cd..a3ba988ec946 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -328,56 +328,6 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) BUG_ON(error); } -/* - * Like find_get_pages, but collecting swap entries as well as pages. - */ -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages, pgoff_t *indices) -{ - void **slot; - unsigned int ret = 0; - struct radix_tree_iter iter; - - if (!nr_pages) - return 0; - - rcu_read_lock(); -restart: - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; - /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. - */ - goto export; - } - if (!page_cache_get_speculative(page)) - goto repeat; - - /* Has the page moved? */ - if (unlikely(page != *slot)) { - page_cache_release(page); - goto repeat; - } -export: - indices[ret] = iter.index; - pages[ret] = page; - if (++ret == nr_pages) - break; - } - rcu_read_unlock(); - return ret; -} - /* * Remove swap entry from radix tree, free the swap and its page cache. */ @@ -395,21 +345,6 @@ static int shmem_free_swap(struct address_space *mapping, return 0; } -/* - * Pagevec may contain swap entries, so shuffle up pages before releasing. - */ -static void shmem_deswap_pagevec(struct pagevec *pvec) -{ - int i, j; - - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) - pvec->pages[j++] = page; - } - pvec->nr = j; -} - /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ @@ -428,12 +363,12 @@ void shmem_unlock_mapping(struct address_space *mapping) * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 
*/ - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) break; index = indices[pvec.nr - 1] + 1; - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(pvec.pages, pvec.nr); pagevec_release(&pvec); cond_resched(); @@ -465,9 +400,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec, 0); index = start; while (index < end) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + pvec.pages, indices); if (!pvec.nr) break; mem_cgroup_uncharge_start(); @@ -496,7 +431,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -534,9 +469,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index = start; for ( ; ; ) { cond_resched(); - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + + pvec.nr = find_get_entries(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.pages, indices); if (!pvec.nr) { if (index == start || unfalloc) break; @@ -544,7 +480,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; } if ((index == start || unfalloc) && indices[0] >= end) { - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -573,7 +509,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -1079,7 +1015,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return -EFBIG; repeat: swap.val = 0; - page = find_lock_page(mapping, index); + page = find_lock_entry(mapping, index); if (radix_tree_exceptional_entry(page)) { swap = radix_to_swp_entry(page); page = NULL; @@ -1416,6 +1352,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->backing_dev_info == &shmem_backing_dev_info; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -1728,7 +1669,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pagevec_init(&pvec, 0); pvec.nr = 1; /* start small: we may be there already */ while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr = find_get_entries(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { if (whence == SEEK_DATA) @@ -1755,7 +1696,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, break; } } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); pvec.nr = PAGEVEC_SIZE; cond_resched(); diff --git a/mm/swap.c b/mm/swap.c index 0092097b3f4c..c8048d71c642 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -947,6 +947,57 @@ void __pagevec_lru_add(struct pagevec *pvec) } EXPORT_SYMBOL(__pagevec_lru_add); +/** + * pagevec_lookup_entries - gang 
pagecache lookup + * @pvec: Where the resulting entries are placed + * @mapping: The address_space to search + * @start: The starting entry index + * @nr_entries: The maximum number of entries + * @indices: The cache indices corresponding to the entries in @pvec + * + * pagevec_lookup_entries() will search for and return a group of up + * to @nr_entries pages and shadow entries in the mapping. All + * entries are placed in @pvec. pagevec_lookup_entries() takes a + * reference against actual pages in @pvec. + * + * The search returns a group of mapping-contiguous entries with + * ascending indexes. There may be holes in the indices due to + * not-present entries. + * + * pagevec_lookup_entries() returns the number of entries which were + * found. + */ +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_pages, + pgoff_t *indices) +{ + pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->pages, indices); + return pagevec_count(pvec); +} + +/** + * pagevec_remove_exceptionals - pagevec exceptionals pruning + * @pvec: The pagevec to prune + * + * pagevec_lookup_entries() fills both pages and exceptional radix + * tree entries into the pagevec. This function prunes all + * exceptionals from @pvec without leaving holes, so that it can be + * passed on to page-only pagevec operations. + */ +void pagevec_remove_exceptionals(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; +} + /** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed diff --git a/mm/truncate.c b/mm/truncate.c index 353b683afd6e..2e84fe59190b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -22,6 +22,22 @@ #include #include "internal.h" +static void clear_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. 
+ */ + radix_tree_delete_item(&mapping->page_tree, index, entry); + spin_unlock_irq(&mapping->tree_lock); +} /** * do_invalidatepage - invalidate part or all of a page @@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; @@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec, 0); index = start; - while (index < end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -307,14 +331,16 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { if (index == start) break; index = start; continue; } - if (index == start && pvec.pages[0]->index >= end) { + if (index == start && indices[0] >= end) { + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -323,16 +349,22 @@ void truncate_inode_pages_range(struct address_space *mapping, struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -375,6 +407,7 @@ EXPORT_SYMBOL(truncate_inode_pages); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; @@ -390,17 +423,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, */ pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if 
(!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -414,6 +453,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, deactivate_page(page); count += ret; } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -481,6 +521,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index; int i; @@ -491,17 +532,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); if (page->mapping != mapping) { @@ -539,6 +586,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret = ret2; unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); -- cgit v1.2.3 From 91b0abe36a7b2b3b02d7500925a5f8455334f0e5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:49 -0700 Subject: mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. 
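A short sketch of what the conversion below asks of every filesystem's ->evict_inode() (myfs_evict_inode is a made-up example; real filesystems wrap their own teardown around it):

static void myfs_evict_inode(struct inode *inode)
{
        /*
         * Final truncate: empties the page tree and, via AS_EXITING,
         * tells reclaim to stop installing shadow entries for this
         * mapping from now on.
         */
        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);
        /* ...filesystem-specific cleanup goes here... */
}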
Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/porting | 6 +-- drivers/staging/lustre/lustre/llite/llite_lib.c | 2 +- fs/9p/vfs_inode.c | 2 +- fs/affs/inode.c | 2 +- fs/afs/inode.c | 2 +- fs/bfs/inode.c | 2 +- fs/block_dev.c | 4 +- fs/btrfs/inode.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/coda/inode.c | 2 +- fs/ecryptfs/super.c | 2 +- fs/exofs/inode.c | 2 +- fs/ext2/inode.c | 2 +- fs/ext3/inode.c | 2 +- fs/ext4/inode.c | 4 +- fs/f2fs/inode.c | 2 +- fs/fat/inode.c | 2 +- fs/freevxfs/vxfs_inode.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/super.c | 2 +- fs/hfs/inode.c | 2 +- fs/hfsplus/super.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/inode.c | 2 +- fs/inode.c | 4 +- fs/jffs2/fs.c | 2 +- fs/jfs/inode.c | 4 +- fs/kernfs/inode.c | 2 +- fs/logfs/readwrite.c | 2 +- fs/minix/inode.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/nfs/inode.c | 2 +- fs/nfs/nfs4super.c | 2 +- fs/nilfs2/inode.c | 6 +-- fs/ntfs/inode.c | 2 +- fs/ocfs2/inode.c | 4 +- fs/omfs/inode.c | 2 +- fs/proc/inode.c | 2 +- fs/reiserfs/inode.c | 2 +- fs/sysv/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/udf/inode.c | 4 +- fs/ufs/inode.c | 2 +- fs/xfs/xfs_super.c | 2 +- include/linux/fs.h | 1 + include/linux/mm.h | 1 + include/linux/pagemap.h | 13 +++++- mm/filemap.c | 33 ++++++++++++--- mm/truncate.c | 54 +++++++++++++++++++++++-- mm/vmscan.c | 2 +- 50 files changed, 147 insertions(+), 65 deletions(-) diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index fe2b7ae6f962..0f3a1390bf00 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -295,9 +295,9 @@ in the beginning of ->setattr unconditionally. ->clear_inode() and ->delete_inode() are gone; ->evict_inode() should be used instead. It gets called whenever the inode is evicted, whether it has remaining links or not. Caller does *not* evict the pagecache or inode-associated -metadata buffers; getting rid of those is responsibility of method, as it had -been for ->delete_inode(). Caller makes sure async writeback cannot be running -for the inode while (or after) ->evict_inode() is called. +metadata buffers; the method has to use truncate_inode_pages_final() to get rid +of those. Caller makes sure async writeback cannot be running for the inode while +(or after) ->evict_inode() is called. 
->drop_inode() returns int now; it's called on final iput() with inode->i_lock held and it returns true if filesystems wants the inode to be diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index 26003d3c1be7..7c4fd97a7fa0 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -1877,7 +1877,7 @@ void ll_delete_inode(struct inode *inode) cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_DISCARD, 1); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* Workaround for LU-118 */ if (inode->i_data.nrpages) { diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index bb7991c7e5c7..53161ec058a7 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -451,7 +451,7 @@ void v9fs_evict_inode(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - truncate_inode_pages(inode->i_mapping, 0); + truncate_inode_pages_final(inode->i_mapping); clear_inode(inode); filemap_fdatawrite(inode->i_mapping); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0e092d08680e..96df91e8c334 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -259,7 +259,7 @@ affs_evict_inode(struct inode *inode) { unsigned long cache_page; pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index ce25d755b7aa..294671288449 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); afs_give_up_callback(vnode); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8defc6b3f9a2..29aa5cf6639b 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode) dprintf("ino=%08lx\n", ino); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); diff --git a/fs/block_dev.c b/fs/block_dev.c index e4cba21a627e..ba0d2b05bb78 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -83,7 +83,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0) + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; invalidate_bh_lrus(); @@ -419,7 +419,7 @@ static void bdev_evict_inode(struct inode *inode) { struct block_device *bdev = &BDEV_I(inode)->bdev; struct list_head *p; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? 
*/ clear_inode(inode); spin_lock(&bdev_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3d44486290b..49ec1398879f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4593,7 +4593,7 @@ static void evict_inode_truncate_pages(struct inode *inode) struct rb_node *node; ASSERT(inode->i_state & I_FREEING); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); write_lock(&map_tree->lock); while (!RB_EMPTY_ROOT(&map_tree->map)) { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e8ae8323c058..ab8ad2546c3e 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -286,7 +286,7 @@ cifs_destroy_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); cifs_fscache_release_inode_cookie(inode); } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 506de34a4ef3..62618ec9356c 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -250,7 +250,7 @@ static void coda_put_super(struct super_block *sb) static void coda_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); coda_cache_clear_inode(inode); } diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index e879cf8ff0b1..afa1b81c3418 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static void ecryptfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); iput(ecryptfs_inode_to_lower(inode)); } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index ee4317faccb1..d1c244d67667 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1486,7 +1486,7 @@ void exofs_evict_inode(struct inode *inode) struct ore_io_state *ios; int ret; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* TODO: should do better here */ if (inode->i_nlink || is_bad_inode(inode)) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 94ed36849b71..b1d2a4675d42 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode) dquot_drop(inode); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (want_delete) { sb_start_intwrite(inode->i_sb); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 384b6ebb655f..efce2bbfb5e5 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode) log_wait_commit(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); ext3_discard_reservation(inode); rsv = ei->i_block_alloc_info; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 24bfd7ff3049..175c3f933816 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -215,7 +215,7 @@ void ext4_evict_inode(struct inode *inode) jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; @@ -226,7 +226,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); - truncate_inode_pages(&inode->i_data, 0); + 
truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4d67ed736dca..28cea76d78c6 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -260,7 +260,7 @@ void f2fs_evict_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); trace_f2fs_evict_inode(inode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 854b578f6695..c68d9f27135e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode); static void fat_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index f47df72cef17..363e3ae25f6b 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head) void vxfs_evict_inode(struct inode *ip) { - truncate_inode_pages(&ip->i_data, 0); + truncate_inode_pages_final(&ip->i_data); clear_inode(ip); call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d468643a68b2..9c761b611c54 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode) static void fuse_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & MS_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 60f60f6181f3..24410cd9a82a 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1558,7 +1558,7 @@ out_unlock: fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: /* Case 3 starts here */ - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); gfs2_rs_delete(ip, NULL); gfs2_ordered_del_inode(ip); clear_inode(inode); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 380ab31b5e0f..9e2fecd62f62 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -547,7 +547,7 @@ out: void hfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 80875aa640ef..a6abf87d79d0 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -161,7 +161,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index fe649d325b1f..9c470fde9878 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -230,7 +230,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) static void hostfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + 
truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HOSTFS_I(inode)->fd != -1) { close_file(&HOSTFS_I(inode)->fd); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 9edeeb0ea97e..50a427313835 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -304,7 +304,7 @@ void hpfs_write_if_changed(struct inode *inode) void hpfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (!inode->i_nlink) { hpfs_lock(inode->i_sb); diff --git a/fs/inode.c b/fs/inode.c index 4bcdad3c9361..e6905152c39f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -503,6 +503,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); + BUG_ON(inode->i_data.nrshadows); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -548,8 +549,7 @@ static void evict(struct inode *inode) if (op->evict_inode) { op->evict_inode(inode); } else { - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISBLK(inode->i_mode) && inode->i_bdev) diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index a69e426435dd..a012e16a8bb3 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -242,7 +242,7 @@ void jffs2_evict_inode (struct inode *inode) jffs2_dbg(1, "%s(): ino #%lu mode %o\n", __func__, inode->i_ino, inode->i_mode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); jffs2_do_clear_inode(c, f); } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index f4aab719add5..6f8fe72c2a7a 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode) dquot_initialize(inode); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); @@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode) dquot_free_inode(inode); } } else { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } clear_inode(inode); dquot_drop(inode); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index e55126f85bd2..abb0f1f53d93 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -355,7 +355,7 @@ void kernfs_evict_inode(struct inode *inode) { struct kernfs_node *kn = inode->i_private; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); kernfs_put(kn); } diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 9a59cbade2fb..48140315f627 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode) do_delete_inode(inode); } } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Cheaper version of write_inode. 
All changes are concealed in diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 0332109162a5..03aaeb1a694a 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data); static void minix_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; minix_truncate(inode); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 2cf2ebecb55f..ee59d35ff069 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -296,7 +296,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) static void ncp_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (S_ISDIR(inode->i_mode)) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 360114ae8b82..c4702baa22b8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(nfs_clear_inode); void nfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nfs_clear_inode(inode); } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 808f29574412..6f340f02f2ba 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -90,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) */ static void nfs4_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); pnfs_return_layout(inode); pnfs_destroy_layout(NFS_I(inode)); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7e350c562e0e..b9c5726120e3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -783,16 +783,14 @@ void nilfs_evict_inode(struct inode *inode) int ret; if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nilfs_clear_inode(inode); return; } nilfs_transaction_begin(sb, &ti, 0); /* never fails */ - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* TODO: some of the following operations may fail. 
*/ nilfs_truncate_bmap(ii, 0); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index ffb9b3675736..9d8153ebacfb 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2259,7 +2259,7 @@ void ntfs_evict_big_inode(struct inode *vi) { ntfs_inode *ni = NTFS_I(vi); - truncate_inode_pages(&vi->i_data, 0); + truncate_inode_pages_final(&vi->i_data); clear_inode(vi); #ifdef NTFS_RW diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d437f3ba90b0..437de7f768c6 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -964,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) filemap_write_and_wait(inode->i_mapping); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } static void ocfs2_delete_inode(struct inode *inode) @@ -1181,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode) (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); } else { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } ocfs2_clear_inode(inode); } diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index d8b0afde2179..ec58c7659183 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode) */ static void omfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_nlink) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 124fc43c7090..8f20e3404fd2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode) const struct proc_ns_operations *ns_ops; void *ns; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ad62bdbb451e..bc8b8009897d 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -35,7 +35,7 @@ void reiserfs_evict_inode(struct inode *inode) if (!inode->i_nlink && !is_bad_inode(inode)) dquot_initialize(inode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto no_delete; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index c327d4ee1235..5625ca920f5e 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -295,7 +295,7 @@ int sysv_sync_inode(struct inode *inode) static void sysv_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; sysv_truncate(inode); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5ded8490c0c6..48f943f7f5d5 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode) dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(!atomic_read(&inode->i_count)); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto done; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 982ce05c87ed..5d643706212f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode) want_delete = 1; udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); - } else - truncate_inode_pages(&inode->i_data, 0); + } + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); 
clear_inode(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index c8ca96086784..61e8a9b021dd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode) if (!inode->i_nlink && !is_bad_inode(inode)) want_delete = 1; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (want_delete) { loff_t old_i_size; /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d971f4932b5d..0ef599218991 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -996,7 +996,7 @@ xfs_fs_evict_inode( trace_xfs_evict_inode(ip); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e765d28841b..3ca9420f627e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -419,6 +419,7 @@ struct address_space { struct mutex i_mmap_mutex; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ + unsigned long nrshadows; /* number of shadow entries */ pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b1331aff769c..58d6ddba5db3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1834,6 +1834,7 @@ vm_unmapped_area(struct vm_unmapped_area_info *info) extern void truncate_inode_pages(struct address_space *, loff_t); extern void truncate_inode_pages_range(struct address_space *, loff_t lstart, loff_t lend); +extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 493bfd85214e..ff43253f568a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -25,6 +25,7 @@ enum mapping_flags { AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */ + AS_EXITING = __GFP_BITS_SHIFT + 5, /* final truncate in progress */ }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -69,6 +70,16 @@ static inline int mapping_balloon(struct address_space *mapping) return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags); } +static inline void mapping_set_exiting(struct address_space *mapping) +{ + set_bit(AS_EXITING, &mapping->flags); +} + +static inline int mapping_exiting(struct address_space *mapping) +{ + return test_bit(AS_EXITING, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; @@ -547,7 +558,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); -extern void __delete_from_page_cache(struct page *page); +extern void __delete_from_page_cache(struct page *page, void *shadow); int replace_page_cache_page(struct page *old, struct page *new, gfp_t 
gfp_mask); /* diff --git a/mm/filemap.c b/mm/filemap.c index efc63876477f..05c44aa44188 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -107,12 +107,33 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static void page_cache_tree_delete(struct address_space *mapping, + struct page *page, void *shadow) +{ + if (shadow) { + void **slot; + + slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); + radix_tree_replace_slot(slot, shadow); + mapping->nrshadows++; + /* + * Make sure the nrshadows update is committed before + * the nrpages update so that final truncate racing + * with reclaim does not see both counters 0 at the + * same time and miss a shadow entry. + */ + smp_wmb(); + } else + radix_tree_delete(&mapping->page_tree, page->index); + mapping->nrpages--; +} + /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -127,10 +148,11 @@ void __delete_from_page_cache(struct page *page) else cleancache_invalidate_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); + page_cache_tree_delete(mapping, page, shadow); + page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); @@ -166,7 +188,7 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -426,7 +448,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->index = offset; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -460,6 +482,7 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!radix_tree_exceptional_entry(p)) return -EEXIST; radix_tree_replace_slot(slot, page); + mapping->nrshadows--; mapping->nrpages++; return 0; } diff --git a/mm/truncate.c b/mm/truncate.c index 2e84fe59190b..0db9258319f0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -35,7 +35,8 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - radix_tree_delete_item(&mapping->page_tree, index, entry); + if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry) + mapping->nrshadows--; spin_unlock_irq(&mapping->tree_lock); } @@ -229,7 +230,7 @@ void truncate_inode_pages_range(struct address_space *mapping, int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0) + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; /* Offsets within partial pages */ @@ -391,6 +392,53 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); +/** + * truncate_inode_pages_final - truncate *all* pages before inode dies + * @mapping: mapping to truncate + * + * Called under (and serialized by) inode->i_mutex. 
+ * + * Filesystems have to use this in the .evict_inode path to inform the + * VM that this is the final truncate and the inode is going away. + */ +void truncate_inode_pages_final(struct address_space *mapping) +{ + unsigned long nrshadows; + unsigned long nrpages; + + /* + * Page reclaim can not participate in regular inode lifetime + * management (can't call iput()) and thus can race with the + * inode teardown. Tell it when the address space is exiting, + * so that it does not install eviction information after the + * final truncate has begun. + */ + mapping_set_exiting(mapping); + + /* + * When reclaim installs eviction entries, it increases + * nrshadows first, then decreases nrpages. Make sure we see + * this in the right order or we might miss an entry. + */ + nrpages = mapping->nrpages; + smp_rmb(); + nrshadows = mapping->nrshadows; + + if (nrpages || nrshadows) { + /* + * As truncation uses a lockless tree lookup, cycle + * the tree lock to make sure any ongoing tree + * modification that does not see AS_EXITING is + * completed before starting the final truncate. + */ + spin_lock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + + truncate_inode_pages(mapping, 0); + } +} +EXPORT_SYMBOL(truncate_inode_pages_final); + /** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode * @mapping: the address_space which holds the pages to invalidate @@ -484,7 +532,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index c53d1a54964c..2a0bb8fdb259 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -572,7 +572,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) freepage = mapping->a_ops->freepage; - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); -- cgit v1.2.3 From a528910e12ec7ee203095eb1711468a66b9b60b0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:51 -0700 Subject: mm: thrash detection-based file cache sizing The VM maintains cached filesystem pages on two types of lists. One list holds the pages recently faulted into the cache, the other list holds pages that have been referenced repeatedly on that first list. The idea is to prefer reclaiming young pages over those that have shown to benefit from caching in the past. We call the recently used list the "inactive list" and the frequently used list the "active list". The previous balancing scheme between the two lists, however, ultimately was not significantly better than a FIFO policy and still thrashed cache based on eviction speed, rather than actual demand for cache. This patch solves one half of the problem by decoupling the ability to detect working set changes from the inactive list size. By maintaining a history of recently evicted file pages it can detect frequently used pages with an arbitrarily small inactive list size, and subsequently apply pressure on the active list based on actual demand for cache, not just overall eviction speed. Every zone maintains a counter that tracks inactive list aging speed. When a page is evicted, a snapshot of this counter is stored in the now-empty page cache radix tree slot. On refault, the minimum access distance of the page can be assessed, to evaluate whether the page should be part of the active list or not.
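[Editorial illustration, not part of the patch: the refault-distance test described above boils down to comparing the counter delta between eviction and refault against the size of the active file list. The standalone C sketch below models that arithmetic with made-up numbers; "inactive_age", "eviction" and "nr_active_file" here are plain variables standing in for the kernel counters, not kernel API.]

#include <stdio.h>

/* Model of the refault-distance decision: activate a refaulting page only if
 * it could have stayed resident had the active pages not held those slots. */
int main(void)
{
	unsigned long inactive_age = 1000;		/* counter snapshot at eviction (E) */
	unsigned long eviction = inactive_age;

	/* ... the page is evicted, other evictions/activations keep aging the list ... */
	inactive_age = 1300;				/* counter reading at refault (R) */

	unsigned long nr_active_file = 500;		/* pages on the active file list */
	unsigned long refault_distance = inactive_age - eviction;	/* R - E = 300 */

	if (refault_distance <= nr_active_file)
		printf("activate: distance %lu <= active %lu\n",
		       refault_distance, nr_active_file);
	else
		printf("keep inactive: distance %lu > active %lu\n",
		       refault_distance, nr_active_file);
	return 0;
}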
This fixes the VM's blindness towards working set changes in excess of the inactive list. And it's the foundation to further improve the protection ability and reduce the minimum inactive list size of 50%. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Bob Liu Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 + include/linux/swap.h | 5 + mm/Makefile | 2 +- mm/filemap.c | 61 ++++++++---- mm/swap.c | 2 + mm/vmscan.c | 24 ++++- mm/vmstat.c | 2 + mm/workingset.c | 253 +++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 331 insertions(+), 23 deletions(-) create mode 100644 mm/workingset.c diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9b61b9bf81ac..f25db1d74a21 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -142,6 +142,8 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + WORKINGSET_REFAULT, + WORKINGSET_ACTIVATE, NR_ANON_TRANSPARENT_HUGEPAGES, NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; @@ -392,6 +394,9 @@ struct zone { spinlock_t lru_lock; struct lruvec lruvec; + /* Evictions & activations on the inactive file list */ + atomic_long_t inactive_age; + unsigned long pages_scanned; /* since last reclaim */ unsigned long flags; /* zone flags, see below */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 46ba0c6c219f..b83cf61403ed 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -260,6 +260,11 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; +/* linux/mm/workingset.c */ +void *workingset_eviction(struct address_space *mapping, struct page *page); +bool workingset_refault(void *shadow); +void workingset_activation(struct page *page); + /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; diff --git a/mm/Makefile b/mm/Makefile index 310c90a09264..cdd741519ee0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o balloon_compaction.o \ - interval_tree.o list_lru.o $(mmu-y) + interval_tree.o list_lru.o workingset.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap.c b/mm/filemap.c index 05c44aa44188..a603c4d7d3c9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -469,7 +469,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) EXPORT_SYMBOL_GPL(replace_page_cache_page); static int page_cache_tree_insert(struct address_space *mapping, - struct page *page) + struct page *page, void **shadowp) { void **slot; int error; @@ -484,6 +484,8 @@ static int page_cache_tree_insert(struct address_space *mapping, radix_tree_replace_slot(slot, page); mapping->nrshadows--; mapping->nrpages++; + if (shadowp) + *shadowp = p; return 0; } error = radix_tree_insert(&mapping->page_tree, page->index, page); @@ -492,18 +494,10 @@ static int page_cache_tree_insert(struct address_space *mapping, return error; } -/** - * add_to_page_cache_locked - add a locked page to the pagecache - * @page: page to add - * @mapping: the page's 
address_space - * @offset: page index - * @gfp_mask: page allocation mode - * - * This function is used to add a page to the pagecache. It must be locked. - * This function does not add the page to the LRU. The caller must do that. - */ -int add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) +static int __add_to_page_cache_locked(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, + void **shadowp) { int error; @@ -526,7 +520,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->index = offset; spin_lock_irq(&mapping->tree_lock); - error = page_cache_tree_insert(mapping, page); + error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) goto err_insert; @@ -542,16 +536,49 @@ err_insert: page_cache_release(page); return error; } + +/** + * add_to_page_cache_locked - add a locked page to the pagecache + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add a page to the pagecache. It must be locked. + * This function does not add the page to the LRU. The caller must do that. + */ +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + return __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, NULL); +} EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { + void *shadow = NULL; int ret; - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add_file(page); + __set_page_locked(page); + ret = __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, &shadow); + if (unlikely(ret)) + __clear_page_locked(page); + else { + /* + * The page might have been evicted from cache only + * recently, in which case it should be activated like + * any other repeatedly accessed page. + */ + if (shadow && workingset_refault(shadow)) { + SetPageActive(page); + workingset_activation(page); + } else + ClearPageActive(page); + lru_cache_add(page); + } return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); diff --git a/mm/swap.c b/mm/swap.c index c8048d71c642..9ce43ba4498b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -574,6 +574,8 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); + if (page_is_file_cache(page)) + workingset_activation(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 2a0bb8fdb259..1f56a80a7c41 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -523,7 +523,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. 
*/ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -569,10 +570,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swapcache_free(swap, page); } else { void (*freepage)(struct page *); + void *shadow = NULL; freepage = mapping->a_ops->freepage; - - __delete_from_page_cache(page, NULL); + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optizimation, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. + */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -595,7 +609,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page)) { + if (__remove_mapping(mapping, page, false)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -1065,7 +1079,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!mapping || !__remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index def5dd2fbe61..843125a0e255 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -770,6 +770,8 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "workingset_refault", + "workingset_activate", "nr_anon_transparent_hugepages", "nr_free_cma", "nr_dirty_threshold", diff --git a/mm/workingset.c b/mm/workingset.c new file mode 100644 index 000000000000..8a6c7cff4923 --- /dev/null +++ b/mm/workingset.c @@ -0,0 +1,253 @@ +/* + * Workingset detection + * + * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Double CLOCK lists + * + * Per zone, two clock lists are maintained for file pages: the + * inactive and the active list. Freshly faulted pages start out at + * the head of the inactive list and page reclaim scans pages from the + * tail. Pages that are accessed multiple times on the inactive list + * are promoted to the active list, to protect them from reclaim, + * whereas active pages are demoted to the inactive list when the + * active list grows too big. + * + * fault ------------------------+ + * | + * +--------------+ | +-------------+ + * reclaim <- | inactive | <-+-- demotion | active | <--+ + * +--------------+ +-------------+ | + * | | + * +-------------- promotion ------------------+ + * + * + * Access frequency and refault distance + * + * A workload is thrashing when its pages are frequently used but they + * are evicted from the inactive list every time before another access + * would have promoted them to the active list. + * + * In cases where the average access distance between thrashing pages + * is bigger than the size of memory there is nothing that can be + * done - the thrashing set could never fit into memory under any + * circumstance. 
+ * + * However, the average access distance could be bigger than the + * inactive list, yet smaller than the size of memory. In this case, + * the set could fit into memory if it weren't for the currently + * active pages - which may be used more, hopefully less frequently: + * + * +-memory available to cache-+ + * | | + * +-inactive------+-active----+ + * a b | c d e f g h i | J K L M N | + * +---------------+-----------+ + * + * It is prohibitively expensive to accurately track access frequency + * of pages. But a reasonable approximation can be made to measure + * thrashing on the inactive list, after which refaulting pages can be + * activated optimistically to compete with the existing active pages. + * + * Approximating inactive page access frequency - Observations: + * + * 1. When a page is accessed for the first time, it is added to the + * head of the inactive list, slides every existing inactive page + * towards the tail by one slot, and pushes the current tail page + * out of memory. + * + * 2. When a page is accessed for the second time, it is promoted to + * the active list, shrinking the inactive list by one slot. This + * also slides all inactive pages that were faulted into the cache + * more recently than the activated page towards the tail of the + * inactive list. + * + * Thus: + * + * 1. The sum of evictions and activations between any two points in + * time indicate the minimum number of inactive pages accessed in + * between. + * + * 2. Moving one inactive page N page slots towards the tail of the + * list requires at least N inactive page accesses. + * + * Combining these: + * + * 1. When a page is finally evicted from memory, the number of + * inactive pages accessed while the page was in cache is at least + * the number of page slots on the inactive list. + * + * 2. In addition, measuring the sum of evictions and activations (E) + * at the time of a page's eviction, and comparing it to another + * reading (R) at the time the page faults back into memory tells + * the minimum number of accesses while the page was not cached. + * This is called the refault distance. + * + * Because the first access of the page was the fault and the second + * access the refault, we combine the in-cache distance with the + * out-of-cache distance to get the complete minimum access distance + * of this page: + * + * NR_inactive + (R - E) + * + * And knowing the minimum access distance of a page, we can easily + * tell if the page would be able to stay in cache assuming all page + * slots in the cache were available: + * + * NR_inactive + (R - E) <= NR_inactive + NR_active + * + * which can be further simplified to + * + * (R - E) <= NR_active + * + * Put into words, the refault distance (out-of-cache) can be seen as + * a deficit in inactive list space (in-cache). If the inactive list + * had (R - E) more page slots, the page would not have been evicted + * in between accesses, but activated instead. And on a full system, + * the only thing eating into inactive list space is active pages. + * + * + * Activating refaulting pages + * + * All that is known about the active list is that the pages have been + * accessed more than once in the past. This means that at any given + * time there is actually a good chance that pages on the active list + * are no longer in active use. 
+ * + * So when a refault distance of (R - E) is observed and there are at + * least (R - E) active pages, the refaulting page is activated + * optimistically in the hope that (R - E) active pages are actually + * used less frequently than the refaulting page - or even not used at + * all anymore. + * + * If this is wrong and demotion kicks in, the pages which are truly + * used more frequently will be reactivated while the less frequently + * used once will be evicted from memory. + * + * But if this is right, the stale pages will be pushed out of memory + * and the used pages get to stay in cache. + * + * + * Implementation + * + * For each zone's file LRU lists, a counter for inactive evictions + * and activations is maintained (zone->inactive_age). + * + * On eviction, a snapshot of this counter (along with some bits to + * identify the zone) is stored in the now empty page cache radix tree + * slot of the evicted page. This is called a shadow entry. + * + * On cache misses for which there are shadow entries, an eligible + * refault distance will immediately activate the refaulting page. + */ + +static void *pack_shadow(unsigned long eviction, struct zone *zone) +{ + eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); + eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); + eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + + return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); +} + +static void unpack_shadow(void *shadow, + struct zone **zone, + unsigned long *distance) +{ + unsigned long entry = (unsigned long)shadow; + unsigned long eviction; + unsigned long refault; + unsigned long mask; + int zid, nid; + + entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + zid = entry & ((1UL << ZONES_SHIFT) - 1); + entry >>= ZONES_SHIFT; + nid = entry & ((1UL << NODES_SHIFT) - 1); + entry >>= NODES_SHIFT; + eviction = entry; + + *zone = NODE_DATA(nid)->node_zones + zid; + + refault = atomic_long_read(&(*zone)->inactive_age); + mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + + RADIX_TREE_EXCEPTIONAL_SHIFT); + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + *distance = (refault - eviction) & mask; +} + +/** + * workingset_eviction - note the eviction of a page from memory + * @mapping: address space the page was backing + * @page: the page being evicted + * + * Returns a shadow entry to be stored in @mapping->page_tree in place + * of the evicted @page so that a later refault can be detected. 
+ */ +void *workingset_eviction(struct address_space *mapping, struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long eviction; + + eviction = atomic_long_inc_return(&zone->inactive_age); + return pack_shadow(eviction, zone); +} + +/** + * workingset_refault - evaluate the refault of a previously evicted page + * @shadow: shadow entry of the evicted page + * + * Calculates and evaluates the refault distance of the previously + * evicted page in the context of the zone it was allocated in. + * + * Returns %true if the page should be activated, %false otherwise. + */ +bool workingset_refault(void *shadow) +{ + unsigned long refault_distance; + struct zone *zone; + + unpack_shadow(shadow, &zone, &refault_distance); + inc_zone_state(zone, WORKINGSET_REFAULT); + + if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + inc_zone_state(zone, WORKINGSET_ACTIVATE); + return true; + } + return false; +} + +/** + * workingset_activation - note a page activation + * @page: page that is being activated + */ +void workingset_activation(struct page *page) +{ + atomic_long_inc(&page_zone(page)->inactive_age); +} -- cgit v1.2.3 From 139e561660fe11e0fc35e142a800df3dd7d03e9d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:54 -0700 Subject: lib: radix_tree: tree node interface Make struct radix_tree_node part of the public interface and provide API functions to create, look up, and delete whole nodes. Refactor the existing insert, look up, delete functions on top of these new node primitives. This will allow the VM to track and garbage collect page cache radix tree nodes. [sasha.levin@oracle.com: return correct error code on insertion failure] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 34 ++++++ lib/radix-tree.c | 263 +++++++++++++++++++++++++-------------------- 2 files changed, 182 insertions(+), 115 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index e8be53ecfc45..13636c40bc42 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -60,6 +60,33 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_TAGS 3 +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 
4 : 6) +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int height; /* Height from the bottom */ + unsigned int count; + union { + struct radix_tree_node *parent; /* Used when ascending tree */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; +}; + +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) + /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { unsigned int height; @@ -101,6 +128,7 @@ do { \ * concurrently with other readers. * * The notable exceptions to this rule are the following functions: + * __radix_tree_lookup * radix_tree_lookup * radix_tree_lookup_slot * radix_tree_tag_get @@ -216,9 +244,15 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) rcu_assign_pointer(*pslot, item); } +int __radix_tree_create(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp); int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); +void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7e30d2a7f346..d60be40c111b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -35,33 +35,6 @@ #include /* in_interrupt() */ -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) - -struct radix_tree_node { - unsigned int height; /* Height from the bottom */ - unsigned int count; - union { - struct radix_tree_node *parent; /* Used when ascending tree */ - struct rcu_head rcu_head; /* Used when freeing node */ - }; - void __rcu *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; -}; - -#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ - RADIX_TREE_MAP_SHIFT)) - /* * The height_to_maxindex array needs to be one deeper than the maximum * path as height 0 holds only 1 entry. @@ -387,23 +360,28 @@ out: } /** - * radix_tree_insert - insert into a radix tree + * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key - * @item: item to insert + * @nodep: returns node + * @slotp: returns slot * - * Insert an item into the radix tree at position @index. 
+ * Create, if necessary, and return the node and slot for an item + * at position @index in the radix tree @root. + * + * Until there is more than one item in the tree, no nodes are + * allocated and @root->rnode is used as a direct slot instead of + * pointing to a node, in which case *@nodep will be NULL. + * + * Returns -ENOMEM, or 0 for success. */ -int radix_tree_insert(struct radix_tree_root *root, - unsigned long index, void *item) +int __radix_tree_create(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp) { struct radix_tree_node *node = NULL, *slot; - unsigned int height, shift; - int offset; + unsigned int height, shift, offset; int error; - BUG_ON(radix_tree_is_indirect_ptr(item)); - /* Make sure the tree is high enough. */ if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); @@ -439,16 +417,42 @@ int radix_tree_insert(struct radix_tree_root *root, height--; } - if (slot != NULL) + if (nodep) + *nodep = node; + if (slotp) + *slotp = node ? node->slots + offset : (void **)&root->rnode; + return 0; +} + +/** + * radix_tree_insert - insert into a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Insert an item into the radix tree at position @index. + */ +int radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + struct radix_tree_node *node; + void **slot; + int error; + + BUG_ON(radix_tree_is_indirect_ptr(item)); + + error = __radix_tree_create(root, index, &node, &slot); + if (error) + return error; + if (*slot != NULL) return -EEXIST; + rcu_assign_pointer(*slot, item); if (node) { node->count++; - rcu_assign_pointer(node->slots[offset], item); - BUG_ON(tag_get(node, 0, offset)); - BUG_ON(tag_get(node, 1, offset)); + BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK)); + BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK)); } else { - rcu_assign_pointer(root->rnode, item); BUG_ON(root_tag_get(root, 0)); BUG_ON(root_tag_get(root, 1)); } @@ -457,15 +461,26 @@ int radix_tree_insert(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_insert); -/* - * is_slot == 1 : search for the slot. - * is_slot == 0 : search for the node. +/** + * __radix_tree_lookup - lookup an item in a radix tree + * @root: radix tree root + * @index: index key + * @nodep: returns node + * @slotp: returns slot + * + * Lookup and return the item at position @index in the radix + * tree @root. + * + * Until there is more than one item in the tree, no nodes are + * allocated and @root->rnode is used as a direct slot instead of + * pointing to a node, in which case *@nodep will be NULL. */ -static void *radix_tree_lookup_element(struct radix_tree_root *root, - unsigned long index, int is_slot) +void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp) { + struct radix_tree_node *node, *parent; unsigned int height, shift; - struct radix_tree_node *node, **slot; + void **slot; node = rcu_dereference_raw(root->rnode); if (node == NULL) @@ -474,7 +489,12 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, if (!radix_tree_is_indirect_ptr(node)) { if (index > 0) return NULL; - return is_slot ? 
(void *)&root->rnode : node; + + if (nodep) + *nodep = NULL; + if (slotp) + *slotp = (void **)&root->rnode; + return node; } node = indirect_to_ptr(node); @@ -485,8 +505,8 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, shift = (height-1) * RADIX_TREE_MAP_SHIFT; do { - slot = (struct radix_tree_node **) - (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); + parent = node; + slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK); node = rcu_dereference_raw(*slot); if (node == NULL) return NULL; @@ -495,7 +515,11 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, height--; } while (height > 0); - return is_slot ? (void *)slot : indirect_to_ptr(node); + if (nodep) + *nodep = parent; + if (slotp) + *slotp = slot; + return node; } /** @@ -513,7 +537,11 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { - return (void **)radix_tree_lookup_element(root, index, 1); + void **slot; + + if (!__radix_tree_lookup(root, index, NULL, &slot)) + return NULL; + return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); @@ -531,7 +559,7 @@ EXPORT_SYMBOL(radix_tree_lookup_slot); */ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) { - return radix_tree_lookup_element(root, index, 0); + return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); @@ -1261,6 +1289,56 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) } } +/** + * __radix_tree_delete_node - try to free node after clearing a slot + * @root: radix tree root + * @index: index key + * @node: node containing @index + * + * After clearing the slot at @index in @node from radix tree + * rooted at @root, call this function to attempt freeing the + * node and shrinking the tree. + * + * Returns %true if @node was freed, %false otherwise. 
+ */ +bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node *node) +{ + bool deleted = false; + + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == indirect_to_ptr(root->rnode)) { + radix_tree_shrink(root); + if (root->height == 0) + deleted = true; + } + return deleted; + } + + parent = node->parent; + if (parent) { + index >>= RADIX_TREE_MAP_SHIFT; + + parent->slots[index & RADIX_TREE_MAP_MASK] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->height = 0; + root->rnode = NULL; + } + + radix_tree_node_free(node); + deleted = true; + + node = parent; + } while (node); + + return deleted; +} + /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root @@ -1275,43 +1353,26 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { - struct radix_tree_node *node = NULL; - struct radix_tree_node *slot = NULL; - struct radix_tree_node *to_free; - unsigned int height, shift; + struct radix_tree_node *node; + unsigned int offset; + void **slot; + void *entry; int tag; - int uninitialized_var(offset); - height = root->height; - if (index > radix_tree_maxindex(height)) - goto out; + entry = __radix_tree_lookup(root, index, &node, &slot); + if (!entry) + return NULL; - slot = root->rnode; - if (height == 0) { + if (item && entry != item) + return NULL; + + if (!node) { root_tag_clear_all(root); root->rnode = NULL; - goto out; + return entry; } - slot = indirect_to_ptr(slot); - shift = height * RADIX_TREE_MAP_SHIFT; - - do { - if (slot == NULL) - goto out; - - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = slot; - slot = slot->slots[offset]; - } while (shift); - - if (slot == NULL) - goto out; - if (item && slot != item) { - slot = NULL; - goto out; - } + offset = index & RADIX_TREE_MAP_MASK; /* * Clear all tags associated with the item to be deleted. @@ -1322,40 +1383,12 @@ void *radix_tree_delete_item(struct radix_tree_root *root, radix_tree_tag_clear(root, index, tag); } - to_free = NULL; - /* Now free the nodes we do not need anymore */ - while (node) { - node->slots[offset] = NULL; - node->count--; - /* - * Queue the node for deferred freeing after the - * last reference to it disappears (set NULL, above). - */ - if (to_free) - radix_tree_node_free(to_free); - - if (node->count) { - if (node == indirect_to_ptr(root->rnode)) - radix_tree_shrink(root); - goto out; - } - - /* Node with zero slots in use so free it */ - to_free = node; - - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - node = node->parent; - } + node->slots[offset] = NULL; + node->count--; - root_tag_clear_all(root); - root->height = 0; - root->rnode = NULL; - if (to_free) - radix_tree_node_free(to_free); + __radix_tree_delete_node(root, index, node); -out: - return slot; + return entry; } EXPORT_SYMBOL(radix_tree_delete_item); -- cgit v1.2.3 From 449dd6984d0e47643c04c807f609dd56d48d5bcc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:56 -0700 Subject: mm: keep page cache radix tree nodes in check Previously, page cache radix tree nodes were freed after reclaim emptied out their page pointers. But now reclaim stores shadow entries in their place, which are only reclaimed when the inodes themselves are reclaimed. 
This is problematic for bigger files that are still in use after they have a significant amount of their cache reclaimed, without any of those pages actually refaulting. The shadow entries will just sit there and waste memory. In the worst case, the shadow entries will accumulate until the machine runs out of memory. To get this under control, the VM will track radix tree nodes exclusively containing shadow entries on a per-NUMA node list. Per-NUMA rather than global because we expect the radix tree nodes themselves to be allocated node-locally and we want to reduce cross-node references of otherwise independent cache workloads. A simple shrinker will then reclaim these nodes on memory pressure. A few things need to be stored in the radix tree node to implement the shadow node LRU and allow tree deletions coming from the list: 1. There is no index available that would describe the reverse path from the node up to the tree root, which is needed to perform a deletion. To solve this, encode in each node its offset inside the parent. This can be stored in the unused upper bits of the same member that stores the node's height at no extra space cost. 2. The number of shadow entries needs to be counted in addition to the regular entries, to quickly detect when the node is ready to go to the shadow node LRU list. The current entry count is an unsigned int but the maximum number of entries is 64, so a shadow counter can easily be stored in the unused upper bits. 3. Tree modification needs tree lock and tree root, which are located in the address space, so store an address_space backpointer in the node. The parent pointer of the node is in a union with the 2-word rcu_head, so the backpointer comes at no extra cost as well. 4. The node needs to be linked to an LRU list, which requires a list head inside the node. This does increase the size of the node, but it does not change the number of objects that fit into a slab page. [akpm@linux-foundation.org: export the right function] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 8 ++- include/linux/mmzone.h | 1 + include/linux/radix-tree.h | 32 ++++++--- include/linux/swap.h | 31 +++++++++ lib/radix-tree.c | 36 ++++++---- mm/filemap.c | 90 ++++++++++++++++++++----- mm/list_lru.c | 16 ++++- mm/truncate.c | 26 +++++++- mm/vmstat.c | 1 + mm/workingset.c | 161 +++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 359 insertions(+), 43 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 3ce541753c88..f3434533fbf8 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -13,6 +13,8 @@ /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ + LRU_REMOVED_RETRY, /* item removed, but lock has been + dropped and reacquired */ LRU_ROTATE, /* item referenced, give another pass */ LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. 
May drop the lock @@ -32,7 +34,11 @@ struct list_lru { }; void list_lru_destroy(struct list_lru *lru); -int list_lru_init(struct list_lru *lru); +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key); +static inline int list_lru_init(struct list_lru *lru) +{ + return list_lru_init_key(lru, NULL); +} /** * list_lru_add: add an element to the lru list's tail diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f25db1d74a21..fac5509c18f0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -144,6 +144,7 @@ enum zone_stat_item { #endif WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, + WORKINGSET_NODERECLAIM, NR_ANON_TRANSPARENT_HUGEPAGES, NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 13636c40bc42..33170dbd9db4 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_TAG_LONGS \ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) + +/* Height component in node->path */ +#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1) +#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1) + +/* Internally used bits of node->count */ +#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) +#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) + struct radix_tree_node { - unsigned int height; /* Height from the bottom */ + unsigned int path; /* Offset in parent & height from the bottom */ unsigned int count; union { - struct radix_tree_node *parent; /* Used when ascending tree */ - struct rcu_head rcu_head; /* Used when freeing node */ + struct { + /* Used when ascending tree */ + struct radix_tree_node *parent; + /* For tree user */ + void *private_data; + }; + /* Used when freeing node */ + struct rcu_head rcu_head; }; + /* For tree user */ + struct list_head private_list; void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ - RADIX_TREE_MAP_SHIFT)) - /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { unsigned int height; @@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); -bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, +bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); diff --git a/include/linux/swap.h b/include/linux/swap.h index b83cf61403ed..350711560753 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -264,6 +264,37 @@ struct swap_list_t { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); +extern struct list_lru workingset_shadow_nodes; + +static inline unsigned int 
workingset_node_pages(struct radix_tree_node *node) +{ + return node->count & RADIX_TREE_COUNT_MASK; +} + +static inline void workingset_node_pages_inc(struct radix_tree_node *node) +{ + node->count++; +} + +static inline void workingset_node_pages_dec(struct radix_tree_node *node) +{ + node->count--; +} + +static inline unsigned int workingset_node_shadows(struct radix_tree_node *node) +{ + return node->count >> RADIX_TREE_COUNT_SHIFT; +} + +static inline void workingset_node_shadows_inc(struct radix_tree_node *node) +{ + node->count += 1U << RADIX_TREE_COUNT_SHIFT; +} + +static inline void workingset_node_shadows_dec(struct radix_tree_node *node) +{ + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; +} /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d60be40c111b..9599aa72d7a0 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -342,7 +342,8 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) /* Increase the height. */ newheight = root->height+1; - node->height = newheight; + BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK); + node->path = newheight; node->count = 1; node->parent = NULL; slot = root->rnode; @@ -400,11 +401,12 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, /* Have to add a child node. */ if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM; - slot->height = height; + slot->path = height; slot->parent = node; if (node) { rcu_assign_pointer(node->slots[offset], slot); node->count++; + slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT; } else rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); } @@ -498,7 +500,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, } node = indirect_to_ptr(node); - height = node->height; + height = node->path & RADIX_TREE_HEIGHT_MASK; if (index > radix_tree_maxindex(height)) return NULL; @@ -704,7 +706,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, return (index == 0); node = indirect_to_ptr(node); - height = node->height; + height = node->path & RADIX_TREE_HEIGHT_MASK; if (index > radix_tree_maxindex(height)) return 0; @@ -741,7 +743,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, { unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *rnode, *node; - unsigned long index, offset; + unsigned long index, offset, height; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; @@ -772,7 +774,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, return NULL; restart: - shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT; + height = rnode->path & RADIX_TREE_HEIGHT_MASK; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; offset = index >> shift; /* Index outside of the tree */ @@ -1142,7 +1145,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, unsigned int shift, height; unsigned long i; - height = slot->height; + height = slot->path & RADIX_TREE_HEIGHT_MASK; shift = (height-1) * RADIX_TREE_MAP_SHIFT; for ( ; height > 1; height--) { @@ -1205,7 +1208,8 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) } node = indirect_to_ptr(node); - max_index = radix_tree_maxindex(node->height); + max_index = radix_tree_maxindex(node->path & + RADIX_TREE_HEIGHT_MASK); if (cur_index > max_index) { rcu_read_unlock(); break; @@ -1301,7 +1305,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * * Returns %true if @node was freed, %false otherwise. 
*/ -bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, +bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { bool deleted = false; @@ -1320,9 +1324,10 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, parent = node->parent; if (parent) { - index >>= RADIX_TREE_MAP_SHIFT; + unsigned int offset; - parent->slots[index & RADIX_TREE_MAP_MASK] = NULL; + offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; + parent->slots[offset] = NULL; parent->count--; } else { root_tag_clear_all(root); @@ -1386,7 +1391,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node->slots[offset] = NULL; node->count--; - __radix_tree_delete_node(root, index, node); + __radix_tree_delete_node(root, node); return entry; } @@ -1419,9 +1424,12 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) EXPORT_SYMBOL(radix_tree_tagged); static void -radix_tree_node_ctor(void *node) +radix_tree_node_ctor(void *arg) { - memset(node, 0, sizeof(struct radix_tree_node)); + struct radix_tree_node *node = arg; + + memset(node, 0, sizeof(*node)); + INIT_LIST_HEAD(&node->private_list); } static __init unsigned long __maxindex(unsigned int height) diff --git a/mm/filemap.c b/mm/filemap.c index a603c4d7d3c9..d6df3bacb0fb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -110,11 +110,17 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - if (shadow) { - void **slot; + struct radix_tree_node *node; + unsigned long index; + unsigned int offset; + unsigned int tag; + void **slot; - slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); - radix_tree_replace_slot(slot, shadow); + VM_BUG_ON(!PageLocked(page)); + + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + + if (shadow) { mapping->nrshadows++; /* * Make sure the nrshadows update is committed before @@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping, * same time and miss a shadow entry. */ smp_wmb(); - } else - radix_tree_delete(&mapping->page_tree, page->index); + } mapping->nrpages--; + + if (!node) { + /* Clear direct pointer tags in root node */ + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; + radix_tree_replace_slot(slot, shadow); + return; + } + + /* Clear tree tags for the removed page */ + index = page->index; + offset = index & RADIX_TREE_MAP_MASK; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (test_bit(offset, node->tags[tag])) + radix_tree_tag_clear(&mapping->page_tree, index, tag); + } + + /* Delete page, swap shadow entry */ + radix_tree_replace_slot(slot, shadow); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + return; + + /* + * Track node that only contains shadow entries. + * + * Avoid acquiring the list_lru lock if already tracked. The + * list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. 
+ */ + if (!workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, &node->private_list); + } } /* @@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page); static int page_cache_tree_insert(struct address_space *mapping, struct page *page, void **shadowp) { + struct radix_tree_node *node; void **slot; int error; - slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); - if (slot) { + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { void *p; p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; - radix_tree_replace_slot(slot, page); - mapping->nrshadows--; - mapping->nrpages++; if (shadowp) *shadowp = p; - return 0; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); } - error = radix_tree_insert(&mapping->page_tree, page->index, page); - if (!error) - mapping->nrpages++; - return error; + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; } static int __add_to_page_cache_locked(struct page *page, diff --git a/mm/list_lru.c b/mm/list_lru.c index 72f9decb0104..f1a0db194173 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -87,11 +87,20 @@ restart: ret = isolate(item, &nlru->lock, cb_arg); switch (ret) { + case LRU_REMOVED_RETRY: + assert_spin_locked(&nlru->lock); case LRU_REMOVED: if (--nlru->nr_items == 0) node_clear(nid, lru->active_nodes); WARN_ON_ONCE(nlru->nr_items < 0); isolated++; + /* + * If the lru lock has been dropped, our list + * traversal is now invalid and so we have to + * restart from scratch. + */ + if (ret == LRU_REMOVED_RETRY) + goto restart; break; case LRU_ROTATE: list_move_tail(item, &nlru->list); @@ -103,6 +112,7 @@ restart: * The lru lock has been dropped, our list traversal is * now invalid and so we have to restart from scratch. */ + assert_spin_locked(&nlru->lock); goto restart; default: BUG(); @@ -114,7 +124,7 @@ restart: } EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init(struct list_lru *lru) +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; @@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru) nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); + if (key) + lockdep_set_class(&lru->node[i].lock, key); INIT_LIST_HEAD(&lru->node[i].list); lru->node[i].nr_items = 0; } return 0; } -EXPORT_SYMBOL_GPL(list_lru_init); +EXPORT_SYMBOL_GPL(list_lru_init_key); void list_lru_destroy(struct list_lru *lru) { diff --git a/mm/truncate.c b/mm/truncate.c index 0db9258319f0..e5cc39ab0751 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -25,6 +25,9 @@ static void clear_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { + struct radix_tree_node *node; + void **slot; + /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; @@ -35,8 +38,27 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. 
These unlocked entries * need verification under the tree lock. */ - if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry) - mapping->nrshadows--; + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrshadows--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); +unlock: spin_unlock_irq(&mapping->tree_lock); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 843125a0e255..f3155d51acfd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -772,6 +772,7 @@ const char * const vmstat_text[] = { #endif "workingset_refault", "workingset_activate", + "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", "nr_dirty_threshold", diff --git a/mm/workingset.c b/mm/workingset.c index 8a6c7cff4923..f7216fa7da27 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -251,3 +251,164 @@ void workingset_activation(struct page *page) { atomic_long_inc(&page_zone(page)->inactive_age); } + +/* + * Shadow entries reflect the share of the working set that does not + * fit into memory, so their number depends on the access pattern of + * the workload. In most cases, they will refault or get reclaimed + * along with the inode, but a (malicious) workload that streams + * through files with a total size several times that of available + * memory, while preventing the inodes from being reclaimed, can + * create excessive amounts of shadow nodes. To keep a lid on this, + * track shadow nodes and reclaim them when they grow way past the + * point where they would still be useful. + */ + +struct list_lru workingset_shadow_nodes; + +static unsigned long count_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long shadow_nodes; + unsigned long max_nodes; + unsigned long pages; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + local_irq_enable(); + + pages = node_present_pages(sc->nid); + /* + * Active cache pages are limited to 50% of memory, and shadow + * entries that represent a refault distance bigger than that + * do not have any effect. Limit the number of shadow nodes + * such that shadow entries do not exceed the number of active + * cache pages, assuming a worst-case node population density + * of 1/8th on average. 
+ * + * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * each, this will reclaim shadow entries when they consume + * ~2% of available memory: + * + * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + */ + max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + + if (shadow_nodes <= max_nodes) + return 0; + + return shadow_nodes - max_nodes; +} + +static enum lru_status shadow_lru_isolate(struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct address_space *mapping; + struct radix_tree_node *node; + unsigned int i; + int ret; + + /* + * Page cache insertions and deletions synchroneously maintain + * the shadow node LRU under the mapping->tree_lock and the + * lru_lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the lru_lock pins any + * address_space that has radix tree nodes on the LRU. + * + * We can then safely transition to the mapping->tree_lock to + * pin only the address_space of the particular node we want + * to reclaim, take the node off-LRU, and drop the lru_lock. + */ + + node = container_of(item, struct radix_tree_node, private_list); + mapping = node->private_data; + + /* Coming from the list, invert the lock order */ + if (!spin_trylock(&mapping->tree_lock)) { + spin_unlock(lru_lock); + ret = LRU_RETRY; + goto out; + } + + list_del_init(item); + spin_unlock(lru_lock); + + /* + * The nodes should only contain one or more shadow entries, + * no pages, so we expect to be able to remove them all and + * delete and free the empty node afterwards. + */ + + BUG_ON(!node->count); + BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[i]) { + BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + node->slots[i] = NULL; + BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + BUG_ON(!mapping->nrshadows); + mapping->nrshadows--; + } + } + BUG_ON(node->count); + inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + if (!__radix_tree_delete_node(&mapping->page_tree, node)) + BUG(); + + spin_unlock(&mapping->tree_lock); + ret = LRU_REMOVED_RETRY; +out: + local_irq_enable(); + cond_resched(); + local_irq_disable(); + spin_lock(lru_lock); + return ret; +} + +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long ret; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, + shadow_lru_isolate, NULL, &sc->nr_to_scan); + local_irq_enable(); + return ret; +} + +static struct shrinker workingset_shadow_shrinker = { + .count_objects = count_shadow_nodes, + .scan_objects = scan_shadow_nodes, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +/* + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe + * mapping->tree_lock. 
+ */ +static struct lock_class_key shadow_nodes_key; + +static int __init workingset_init(void) +{ + int ret; + + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + if (ret) + goto err; + ret = register_shrinker(&workingset_shadow_shrinker); + if (ret) + goto err_list_lru; + return 0; +err_list_lru: + list_lru_destroy(&workingset_shadow_nodes); +err: + return ret; +} +module_init(workingset_init); -- cgit v1.2.3 From f412c97abef71026d8192ca8efca231f1e3906b3 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 3 Apr 2014 14:47:59 -0700 Subject: mm, hugetlb: mark some bootstrap functions as __init Both prep_compound_huge_page() and prep_compound_gigantic_page() are only called at bootstrap and can be marked as __init. The __SetPageTail(page) in prep_compound_gigantic_page() happening before page->first_page is initialized is not concerning since this is bootstrap. Signed-off-by: David Rientjes Reviewed-by: Michal Hocko Cc: Joonsoo Kim Reviewed-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 38d9bed88dc8..7c02b9dadfb0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -689,7 +689,8 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void prep_compound_gigantic_page(struct page *page, unsigned long order) +static void __init prep_compound_gigantic_page(struct page *page, + unsigned long order) { int i; int nr_pages = 1 << order; @@ -1319,7 +1320,7 @@ found: return 1; } -static void prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); -- cgit v1.2.3 From 119d6d59dcc0980dcd581fdadb6b2033b512a473 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 3 Apr 2014 14:48:00 -0700 Subject: mm, compaction: avoid isolating pinned pages Page migration will fail for memory that is pinned in memory with, for example, get_user_pages(). In this case, it is unnecessary to take zone->lru_lock or isolating the page and passing it to page migration which will ultimately fail. This is a racy check, the page can still change from under us, but in that case we'll just fail later when attempting to move the page. This avoids very expensive memory compaction when faulting transparent hugepages after pinning a lot of memory with a Mellanox driver. On a 128GB machine and pinning ~120GB of memory, before this patch we see the enormous disparity in the number of page migration failures because of the pinning (from /proc/vmstat): compact_pages_moved 8450 compact_pagemigrate_failed 15614415 0.05% of pages isolated are successfully migrated and explicitly triggering memory compaction takes 102 seconds. After the patch: compact_pages_moved 9197 compact_pagemigrate_failed 7 99.9% of pages isolated are now successfully migrated in this configuration and memory compaction takes less than one second. 
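As a reading aid, the racy pin check described above can be condensed into a small predicate. The sketch below is not part of the patch; the helper name compaction_skip_pinned() is invented for illustration, while page_mapping(), page_count() and page_mapcount() are the existing kernel accessors the hunk itself relies on.

#include <linux/mm.h>

/*
 * Illustrative only: mirrors the condition added to
 * isolate_migratepages_range() in the hunk below.  A page with no
 * ->mapping whose reference count exceeds its map count carries extra
 * references, typically taken by get_user_pages(), so migration would
 * fail anyway and compaction may as well skip isolating it.  The check
 * is racy by design; a wrong guess only costs a later migration failure.
 */
static inline bool compaction_skip_pinned(struct page *page)
{
	return !page_mapping(page) && page_count(page) > page_mapcount(page);
}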
Signed-off-by: David Rientjes Acked-by: Hugh Dickins Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Rik van Riel Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 37b3799d948c..212f41c6278b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -584,6 +584,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } + /* + * Migration will fail if an anonymous page is pinned in memory, + * so avoid taking lru_lock and isolating it unnecessarily in an + * admittedly racy check. + */ + if (!page_mapping(page) && + page_count(page) > page_mapcount(page)) + continue; + /* Check if it is ok to still hold the lock */ locked = compact_checklock_irqsave(&zone->lru_lock, &flags, locked, cc); -- cgit v1.2.3 From 74e77fb9a23813f5c2864d8e11cfec268a45c4cc Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:01 -0700 Subject: mm/compaction.c: mark function as static Mark function as static in compaction.c because it is not used outside this file. This eliminates the following warning from mm/compaction.c: mm/compaction.c:1190:9: warning: no previous prototype for `sysfs_compact_node' [-Wmissing-prototypes Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Reviewed-by: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 212f41c6278b..b6ab77160068 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1235,7 +1235,7 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write, } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) -ssize_t sysfs_compact_node(struct device *dev, +static ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { -- cgit v1.2.3 From b19a99392a6344e34a82c5e33b9c9d25d0d0fda7 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:02 -0700 Subject: mm/memory.c: mark functions as static mark functions as static in memory.c because they are not used outside this file. 
This eliminates the following warnings in mm/memory.c: mm/memory.c:3530:5: warning: no previous prototype for `numa_migrate_prep' [-Wmissing-prototypes] mm/memory.c:3545:5: warning: no previous prototype for `do_numa_page' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Reviewed-by: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 22dfa617bddb..57ff3510ddbd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3531,7 +3531,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) { @@ -3546,7 +3546,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) { struct page *page = NULL; -- cgit v1.2.3 From eafd4dc4d76a9c6f184cc1de5a7891b7f6f76f2c Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:03 -0700 Subject: mm/mmap.c: mark function as static Mark function as static in mmap.c because they are not used outside this file. This eliminates the following warning in mm/mmap.c: mm/mmap.c:407:6: warning: no previous prototype for `validate_mm' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Reviewed-by: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 81ba54ff96c7..ac1d6671b1ed 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -405,7 +405,7 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) } } -void validate_mm(struct mm_struct *mm) +static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; -- cgit v1.2.3 From 2eb2e141c018b768c88ff84eafa1279f11f2e248 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:04 -0700 Subject: mm/process_vm_access.c: mark function as static Mark function as static in process_vm_access.c because it is not used outside this file. 
This eliminates the following warning in mm/process_vm_access.c: mm/process_vm_access.c:416:1: warning: no previous prototype for `compat_process_vm_rw' [-Wmissing-prototypes] [akpm@linux-foundation.org: remove unneeded asmlinkage - compat_process_vm_rw isn't referenced from asm] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/process_vm_access.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 3c5cf68566ec..cb79065c19e5 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -412,7 +412,7 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, #ifdef CONFIG_COMPAT -asmlinkage ssize_t +static ssize_t compat_process_vm_rw(compat_pid_t pid, const struct compat_iovec __user *lvec, unsigned long liovcnt, -- cgit v1.2.3 From d20199e1f5930c4819e63bd21be240f317a53815 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:05 -0700 Subject: mm/page_cgroup.c: mark functions as static Mark functions as static in page_cgroup.c because they are not used outside this file. This eliminates the following warning in mm/page_cgroup.c: mm/page_cgroup.c:177:6: warning: no previous prototype for `__free_page_cgroup' [-Wmissing-prototypes] mm/page_cgroup.c:190:15: warning: no previous prototype for `online_page_cgroup' [-Wmissing-prototypes] mm/page_cgroup.c:225:15: warning: no previous prototype for `offline_page_cgroup' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index cfd162882c00..3708264d2833 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -175,7 +175,7 @@ static void free_page_cgroup(void *addr) } } -void __free_page_cgroup(unsigned long pfn) +static void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; struct page_cgroup *base; @@ -188,9 +188,9 @@ void __free_page_cgroup(unsigned long pfn) ms->page_cgroup = NULL; } -int __meminit online_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, - int nid) +static int __meminit online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) { unsigned long start, end, pfn; int fail = 0; @@ -223,8 +223,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, return -ENOMEM; } -int __meminit offline_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, int nid) +static int __meminit offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) { unsigned long start, end, pfn; -- cgit v1.2.3 From de498507258210b06a008138abbb24d148dee49d Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:06 -0700 Subject: mm/nobootmem.c: mark function as static Mark function as static in nobootmem.c because it is not used outside this file. 
This eliminates the following warning in mm/nobootmem.c: mm/nobootmem.c:324:15: warning: no previous prototype for `___alloc_bootmem_node' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index f73f2987a852..04a9d94333a5 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -334,7 +334,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } -void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, +static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -- cgit v1.2.3 From c558784fa9e65363ed32cf6ff137c89bc8b54f81 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:07 -0700 Subject: include/linux/mm.h: remove ifdef condition The ifdef conditions in include/linux/mm.h presents three cases: - !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) There is no actual definition of function but include/linux/mm.h has a static inline stub defined. - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) linux/mm.h does not define a prototype, but mm/page_alloc.c defines the function. Hence, compiler reports the following warning: mm/page_alloc.c:4300:15: warning: no previous prototype for `__early_pfn_to_nid' [-Wmissing-prototypes] - defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) The architecture defines the function, and linux/mm.h has a prototype. Thus, join the conditions of Case 2 and 3 ie eliminate the ifdef condition of CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID to eliminate the missing prototype warning from file mm/page_alloc.c. Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Reviewed-by: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 58d6ddba5db3..35300f390eb6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1666,10 +1666,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn) #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); -#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* there is a per-arch backend function. */ extern int __meminit __early_pfn_to_nid(unsigned long pfn); -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -- cgit v1.2.3 From 80d7ef66142b0b4358223790e7a4cb153b48a05c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 3 Apr 2014 14:48:08 -0700 Subject: mm: rename __do_fault() -> do_fault() Current __do_fault() is awful and unmaintainable. These patches try to sort it out by split __do_fault() into three destinct codepaths: - to handle read page fault; - to handle write page fault to private mappings; - to handle write page fault to shared mappings; I also found page refcount leak in PageHWPoison() path of __do_fault(). This patch (of 7): do_fault() is unused: no reason for underscores. Signed-off-by: Kirill A. 
Shutemov Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 57ff3510ddbd..144e8cd07805 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2748,7 +2748,7 @@ reuse: * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * - * __do_fault is protected similarly. + * do_fault is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); @@ -3287,7 +3287,7 @@ oom: } /* - * __do_fault() tries to create a new page mapping. It aggressively + * do_fault() tries to create a new page mapping. It aggressively * tries to share with existing pages, but makes a separate copy if * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid * the next page fault. @@ -3299,7 +3299,7 @@ oom: * but allow concurrent faults), and pte neither mapped nor locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { @@ -3496,7 +3496,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3528,7 +3528,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } pgoff = pte_to_pgoff(orig_pte); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, -- cgit v1.2.3 From 7eae74af32d2048d27c38bad1c767a8f3ce4ddb6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 3 Apr 2014 14:48:10 -0700 Subject: mm: do_fault(): extract to call vm_ops->do_fault() to separate function Extract code to vm_ops->do_fault() and basic error handling to separate function. The code will be reused. Signed-off-by: Kirill A. 
Shutemov Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 76 ++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 144e8cd07805..af76397c2c54 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3286,6 +3286,37 @@ oom: return VM_FAULT_OOM; } +static int __do_fault(struct vm_area_struct *vma, unsigned long address, + pgoff_t pgoff, unsigned int flags, struct page **page) +{ + struct vm_fault vmf; + int ret; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.page = NULL; + + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + if (unlikely(PageHWPoison(vmf.page))) { + if (ret & VM_FAULT_LOCKED) + unlock_page(vmf.page); + page_cache_release(vmf.page); + return VM_FAULT_HWPOISON; + } + + if (unlikely(!(ret & VM_FAULT_LOCKED))) + lock_page(vmf.page); + else + VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + + *page = vmf.page; + return ret; +} + /* * do_fault() tries to create a new page mapping. It aggressively * tries to share with existing pages, but makes a separate copy if @@ -3305,12 +3336,11 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *page_table; spinlock_t *ptl; - struct page *page; + struct page *page, *fault_page; struct page *cow_page; pte_t entry; int anon = 0; struct page *dirty_page = NULL; - struct vm_fault vmf; int ret; int page_mkwrite = 0; @@ -3334,42 +3364,19 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else cow_page = NULL; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.page = NULL; - - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | - VM_FAULT_RETRY))) + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - if (unlikely(PageHWPoison(vmf.page))) { - if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - ret = VM_FAULT_HWPOISON; - page_cache_release(vmf.page); - goto uncharge_out; - } - - /* - * For consistency in subsequent calls, make the faulted page always - * locked. - */ - if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); - else - VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); - /* * Should we do an early C-O-W break? 
*/ - page = vmf.page; + page = fault_page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { page = cow_page; anon = 1; - copy_user_highpage(page, vmf.page, address, vma); + copy_user_highpage(page, fault_page, address, vma); __SetPageUptodate(page); } else { /* @@ -3378,8 +3385,15 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; int tmp; + vmf.virtual_address = + (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.page = fault_page; + unlock_page(page); vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; tmp = vma->vm_ops->page_mkwrite(vma, &vmf); @@ -3469,9 +3483,9 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_file && !page_mkwrite) file_update_time(vma->vm_file); } else { - unlock_page(vmf.page); + unlock_page(fault_page); if (anon) - page_cache_release(vmf.page); + page_cache_release(fault_page); } return ret; -- cgit v1.2.3 From e655fb29074a7aa471bfc9f51a0139c6f636a649 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 3 Apr 2014 14:48:11 -0700 Subject: mm: introduce do_read_fault() Introduce do_read_fault(). The function does what do_fault() does for read page faults. Unlike do_fault(), do_read_fault() is pretty clean and straightforward. Signed-off-by: Kirill A. Shutemov Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index af76397c2c54..56784e9a7151 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3317,6 +3317,43 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, return ret; } +static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + spinlock_t *ptl; + pte_t entry, *pte; + int ret; + + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; + } + + flush_icache_page(vma, fault_page); + entry = mk_pte(fault_page, vma->vm_page_prot); + if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) + pte_mksoft_dirty(entry); + inc_mm_counter_fast(mm, MM_FILEPAGES); + page_add_file_rmap(fault_page); + set_pte_at(mm, address, pte, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + + return ret; +} + /* * do_fault() tries to create a new page mapping. 
It aggressively * tries to share with existing pages, but makes a separate copy if @@ -3510,6 +3547,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } @@ -3542,6 +3582,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } pgoff = pte_to_pgoff(orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -- cgit v1.2.3 From ec47c3b9543054f6f255d027100fa8214e637003 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 3 Apr 2014 14:48:12 -0700 Subject: mm: introduce do_cow_fault() Introduce do_cow_fault(). The function does what do_fault() does for write page faults to private mappings. Unlike do_fault(), do_read_fault() is relatively clean and straight-forward. Signed-off-by: Kirill A. Shutemov Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index 56784e9a7151..5be13e794a7c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3354,6 +3354,62 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page, *new_page; + spinlock_t *ptl; + pte_t entry, *pte; + int ret; + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + return VM_FAULT_OOM; + + if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { + page_cache_release(new_page); + return VM_FAULT_OOM; + } + + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + goto uncharge_out; + + copy_user_highpage(new_page, fault_page, address, vma); + __SetPageUptodate(new_page); + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + goto uncharge_out; + } + + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + inc_mm_counter_fast(mm, MM_ANONPAGES); + page_add_new_anon_rmap(new_page, vma, address); + set_pte_at(mm, address, pte, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); + + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; +uncharge_out: + mem_cgroup_uncharge_page(new_page); + page_cache_release(new_page); + return ret; +} + /* * do_fault() tries to create a new page mapping. 
It aggressively * tries to share with existing pages, but makes a separate copy if @@ -3550,6 +3606,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!(flags & FAULT_FLAG_WRITE)) return do_read_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } @@ -3585,6 +3644,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!(flags & FAULT_FLAG_WRITE)) return do_read_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -- cgit v1.2.3 From f0c6d4d295e4ea9a47375304420baa38ca279542 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 3 Apr 2014 14:48:13 -0700 Subject: mm: introduce do_shared_fault() and drop do_fault() Introduce do_shared_fault(). The function does what do_fault() does for write faults to shared mappings Unlike do_fault(), do_shared_fault() is relatively clean and straight-forward. Old do_fault() is not needed anymore. Let it die. [lliubbo@gmail.com: fix NULL pointer dereference] Signed-off-by: Kirill A. Shutemov Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Signed-off-by: Bob Liu Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 226 +++++++++++++++++------------------------------------------- 1 file changed, 62 insertions(+), 164 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5be13e794a7c..d4320e42989d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2748,7 +2748,7 @@ reuse: * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * - * do_fault is protected similarly. + * do_shared_fault is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); @@ -3410,188 +3410,86 @@ uncharge_out: return ret; } -/* - * do_fault() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid - * the next page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte neither mapped nor locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { - pte_t *page_table; + struct page *fault_page; + struct address_space *mapping; spinlock_t *ptl; - struct page *page, *fault_page; - struct page *cow_page; - pte_t entry; - int anon = 0; - struct page *dirty_page = NULL; - int ret; - int page_mkwrite = 0; - - /* - * If we do COW later, allocate page befor taking lock_page() - * on the file cache page. This will reduce lock holding time. 
- */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - - cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!cow_page) - return VM_FAULT_OOM; - - if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { - page_cache_release(cow_page); - return VM_FAULT_OOM; - } - } else - cow_page = NULL; + pte_t entry, *pte; + int dirtied = 0; + struct vm_fault vmf; + int ret, tmp; ret = __do_fault(vma, address, pgoff, flags, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - goto uncharge_out; + return ret; /* - * Should we do an early C-O-W break? + * Check if the backing address space wants to know that the page is + * about to become writable */ - page = fault_page; - if (flags & FAULT_FLAG_WRITE) { - if (!(vma->vm_flags & VM_SHARED)) { - page = cow_page; - anon = 1; - copy_user_highpage(page, fault_page, address, vma); - __SetPageUptodate(page); - } else { - /* - * If the page will be shareable, see if the backing - * address space wants to know that the page is about - * to become writable - */ - if (vma->vm_ops->page_mkwrite) { - struct vm_fault vmf; - int tmp; - - vmf.virtual_address = - (void __user *)(address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.page = fault_page; - - unlock_page(page); - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; - } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(page); - if (!page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(page), page); - page_mkwrite = 1; - } - } + if (!vma->vm_ops->page_mkwrite) + goto set_pte; - } + unlock_page(fault_page); + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = fault_page; - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + page_cache_release(fault_page); + return tmp; + } - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if FAULT_FLAG_WRITE is set, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. - */ - /* Only go through if we didn't race with anybody else... 
*/ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) - pte_mksoft_dirty(entry); - if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(page); - if (flags & FAULT_FLAG_WRITE) { - dirty_page = page; - get_page(dirty_page); - } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(fault_page); + if (!fault_page->mapping) { + unlock_page(fault_page); + page_cache_release(fault_page); + return 0; /* retry */ } - set_pte_at(mm, address, page_table, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); - } else { - if (cow_page) - mem_cgroup_uncharge_page(cow_page); - if (anon) - page_cache_release(page); - else - anon = 1; /* no anon but release faulted_page */ + } else + VM_BUG_ON_PAGE(!PageLocked(fault_page), fault_page); +set_pte: + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; } - pte_unmap_unlock(page_table, ptl); - - if (dirty_page) { - struct address_space *mapping = page->mapping; - int dirtied = 0; + flush_icache_page(vma, fault_page); + entry = mk_pte(fault_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + inc_mm_counter_fast(mm, MM_FILEPAGES); + page_add_file_rmap(fault_page); + set_pte_at(mm, address, pte, entry); - if (set_page_dirty(dirty_page)) - dirtied = 1; - unlock_page(dirty_page); - put_page(dirty_page); - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); + pte_unmap_unlock(pte, ptl); - /* file_update_time outside page_lock */ - if (vma->vm_file && !page_mkwrite) - file_update_time(vma->vm_file); - } else { - unlock_page(fault_page); - if (anon) - page_cache_release(fault_page); + if (set_page_dirty(fault_page)) + dirtied = 1; + mapping = fault_page->mapping; + unlock_page(fault_page); + if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); } - return ret; + /* file_update_time outside page_lock */ + if (vma->vm_file && !vma->vm_ops->page_mkwrite) + file_update_time(vma->vm_file); -unwritable_page: - page_cache_release(page); - return ret; -uncharge_out: - /* fs's fault handler get error */ - if (cow_page) { - mem_cgroup_uncharge_page(cow_page); - page_cache_release(cow_page); - } return ret; } @@ -3609,7 +3507,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!(vma->vm_flags & VM_SHARED)) return do_cow_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); - return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3647,7 +3545,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!(vma->vm_flags & VM_SHARED)) return do_cow_fault(mm, vma, address, pmd, pgoff, flags, 
orig_pte); - return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, -- cgit v1.2.3 From fb09a46425823604bf337d2c9999756f9b753cf1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 3 Apr 2014 14:48:15 -0700 Subject: mm: consolidate code to call vm_ops->page_mkwrite() There are two functions which need to call vm_ops->page_mkwrite(): do_shared_fault() and do_wp_page(). We can consolidate preparation code. Signed-off-by: Kirill A. Shutemov Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 105 ++++++++++++++++++++++++++---------------------------------- 1 file changed, 45 insertions(+), 60 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d4320e42989d..ac3990d77ec9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2586,6 +2586,38 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo copy_user_highpage(dst, src, va, vma); } +/* + * Notify the address space that the page is about to become writable so that + * it can prohibit this or wait for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can sleep if it needs to. + */ +static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + struct vm_fault vmf; + int ret; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = page; + + ret = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + if (unlikely(!(ret & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + unlock_page(page); + return 0; /* retry */ + } + ret |= VM_FAULT_LOCKED; + } else + VM_BUG_ON_PAGE(!PageLocked(page), page); + return ret; +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -2668,42 +2700,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - struct vm_fault vmf; int tmp; - - vmf.virtual_address = (void __user *)(address & - PAGE_MASK); - vmf.pgoff = old_page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.page = old_page; - - /* - * Notify the address space that the page is about to - * become writable so that it can prohibit this or wait - * for the page to get into an appropriate state. - * - * We do this without the lock held, so that it can - * sleep if it needs to. 
- */ page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; + tmp = do_page_mkwrite(vma, old_page, address); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(old_page); + return tmp; } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(old_page); - if (!old_page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(old_page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(old_page), old_page); - /* * Since we dropped the lock we need to revalidate * the PTE as someone else may have changed it. If @@ -2892,10 +2897,6 @@ oom: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; - -unwritable_page: - page_cache_release(old_page); - return ret; } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -3419,7 +3420,6 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry, *pte; int dirtied = 0; - struct vm_fault vmf; int ret, tmp; ret = __do_fault(vma, address, pgoff, flags, &fault_page); @@ -3430,31 +3430,16 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Check if the backing address space wants to know that the page is * about to become writable */ - if (!vma->vm_ops->page_mkwrite) - goto set_pte; - - unlock_page(fault_page); - vmf.virtual_address = (void __user *)(address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.page = fault_page; - - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - page_cache_release(fault_page); - return tmp; - } - - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(fault_page); - if (!fault_page->mapping) { - unlock_page(fault_page); + if (vma->vm_ops->page_mkwrite) { + unlock_page(fault_page); + tmp = do_page_mkwrite(vma, fault_page, address); + if (unlikely(!tmp || + (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { page_cache_release(fault_page); - return 0; /* retry */ + return tmp; } - } else - VM_BUG_ON_PAGE(!PageLocked(fault_page), fault_page); -set_pte: + } + pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); -- cgit v1.2.3 From 3bb977946998ae0d756279c5a108435d04636e2b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 3 Apr 2014 14:48:16 -0700 Subject: mm: consolidate code to setup pte Extract and consolidate code to setup pte from do_read_fault(), do_cow_fault() and do_shared_fault(). Signed-off-by: Kirill A. 
Shutemov Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 66 ++++++++++++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ac3990d77ec9..90cea22001ef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3318,13 +3318,37 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, return ret; } +static void do_set_pte(struct vm_area_struct *vma, unsigned long address, + struct page *page, pte_t *pte, bool write, bool anon) +{ + pte_t entry; + + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) + pte_mksoft_dirty(entry); + if (anon) { + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + page_add_file_rmap(page); + } + set_pte_at(vma->vm_mm, address, pte, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); +} + static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { struct page *fault_page; spinlock_t *ptl; - pte_t entry, *pte; + pte_t *pte; int ret; ret = __do_fault(vma, address, pgoff, flags, &fault_page); @@ -3338,20 +3362,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(fault_page); return ret; } - - flush_icache_page(vma, fault_page); - entry = mk_pte(fault_page, vma->vm_page_prot); - if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) - pte_mksoft_dirty(entry); - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(fault_page); - set_pte_at(mm, address, pte, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, pte); + do_set_pte(vma, address, fault_page, pte, false, false); pte_unmap_unlock(pte, ptl); unlock_page(fault_page); - return ret; } @@ -3361,7 +3374,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *fault_page, *new_page; spinlock_t *ptl; - pte_t entry, *pte; + pte_t *pte; int ret; if (unlikely(anon_vma_prepare(vma))) @@ -3390,17 +3403,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(fault_page); goto uncharge_out; } - - flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(new_page, vma, address); - set_pte_at(mm, address, pte, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, pte); - + do_set_pte(vma, address, new_page, pte, true, true); pte_unmap_unlock(pte, ptl); unlock_page(fault_page); page_cache_release(fault_page); @@ -3418,7 +3421,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *fault_page; struct address_space *mapping; spinlock_t *ptl; - pte_t entry, *pte; + pte_t *pte; int dirtied = 0; int ret, tmp; @@ -3447,16 +3450,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(fault_page); return ret; } - - flush_icache_page(vma, 
fault_page); - entry = mk_pte(fault_page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(fault_page); - set_pte_at(mm, address, pte, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, pte); + do_set_pte(vma, address, fault_page, pte, true, false); pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) -- cgit v1.2.3 From e9b71ca91aedb295097bd47066a06542751ecca8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 3 Apr 2014 14:48:17 -0700 Subject: mm, thp: drop do_huge_pmd_wp_zero_page_fallback() I've realized that there's no need for do_huge_pmd_wp_zero_page_fallback(). We can just split zero page with split_huge_page_pmd() and return VM_FAULT_FALLBACK. handle_pte_fault() will handle write-protection fault for us. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 79 ++------------------------------------------------------ 1 file changed, 2 insertions(+), 77 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1546655a2d78..6ac89e9f82ef 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -941,81 +941,6 @@ unlock: spin_unlock(ptl); } -static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) -{ - spinlock_t *ptl; - pgtable_t pgtable; - pmd_t _pmd; - struct page *page; - int i, ret = 0; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!page) { - ret |= VM_FAULT_OOM; - goto out; - } - - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - put_page(page); - ret |= VM_FAULT_OOM; - goto out; - } - - clear_user_highpage(page, address); - __SetPageUptodate(page); - - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) - goto out_free_page; - - pmdp_clear_flush(vma, haddr, pmd); - /* leave pmd empty until pte is filled */ - - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); - - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - if (haddr == (address & PAGE_MASK)) { - entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); - } else { - entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); - entry = pte_mkspecial(entry); - } - pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); - } - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); - spin_unlock(ptl); - put_huge_zero_page(); - inc_mm_counter(mm, MM_ANONPAGES); - - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - ret |= VM_FAULT_WRITE; -out: - return ret; -out_free_page: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_page(page); - put_page(page); - goto out; -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1161,8 +1086,8 @@ alloc: if (unlikely(!new_page)) { if (!page) { - ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, - address, pmd, 
orig_pmd, haddr); + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); -- cgit v1.2.3 From 67f9fd91f93c582b7de2ab9325b6e179db77e4d5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Apr 2014 14:48:18 -0700 Subject: mm: remove read_cache_page_async() This patch removes read_cache_page_async() which wasn't really needed anywhere and simplifies the code around it a bit. read_cache_page_async() is useful when we want to read a page into the cache without waiting for it to complete. This happens when the appropriate callback 'filler' doesn't complete its read operation and releases the page lock immediately, and instead queues a different completion routine to do that. This never actually happened anywhere in the code. read_cache_page_async() had 3 different callers: - read_cache_page() which is the sync version, it would just wait for the requested read to complete using wait_on_page_read(). - JFFS2 would call it from jffs2_gc_fetch_page(), but the filler function it supplied doesn't do any async reads, and would complete before the filler function returns - making it actually a sync read. - CRAMFS would call it using the read_mapping_page_async() wrapper, with a similar story to JFFS2 - the filler function doesn't do anything that reminds async reads and would always complete before the filler function returns. To sum it up, the code in mm/filemap.c never took advantage of having read_cache_page_async(). While there are filler callbacks that do async reads (such as the block one), we always called it with the read_cache_page(). This patch adds a mandatory wait for read to complete when adding a new page to the cache, and removes read_cache_page_async() and its wrappers. Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cramfs/inode.c | 3 +-- fs/jffs2/fs.c | 2 +- include/linux/pagemap.h | 10 -------- mm/filemap.c | 64 ++++++++++++++++++------------------------------- 4 files changed, 25 insertions(+), 54 deletions(-) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 06610cf94d57..a1f801c14fbc 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -195,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i struct page *page = NULL; if (blocknr + i < devsize) { - page = read_mapping_page_async(mapping, blocknr + i, - NULL); + page = read_mapping_page(mapping, blocknr + i, NULL); /* synchronous error? 
*/ if (IS_ERR(page)) page = NULL; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index a012e16a8bb3..f73991522672 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -687,7 +687,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, struct inode *inode = OFNI_EDONI_2SFFJ(f); struct page *pg; - pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, + pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, (void *)jffs2_do_readpage_unlock, inode); if (IS_ERR(pg)) return (void *)pg; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ff43253f568a..45598f1e9aa3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -289,8 +289,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping, extern struct page * grab_cache_page_nowait(struct address_space *mapping, pgoff_t index); -extern struct page * read_cache_page_async(struct address_space *mapping, - pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, @@ -298,14 +296,6 @@ extern struct page * read_cache_page_gfp(struct address_space *mapping, extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); -static inline struct page *read_mapping_page_async( - struct address_space *mapping, - pgoff_t index, void *data) -{ - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return read_cache_page_async(mapping, index, filler, data); -} - static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { diff --git a/mm/filemap.c b/mm/filemap.c index d6df3bacb0fb..21781f1fe52b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2133,6 +2133,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), @@ -2159,6 +2171,8 @@ repeat: if (err < 0) { page_cache_release(page); page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); } } return page; @@ -2195,6 +2209,10 @@ retry: if (err < 0) { page_cache_release(page); return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; } out: mark_page_accessed(page); @@ -2202,40 +2220,25 @@ out: } /** - * read_cache_page_async - read into page cache, fill it if needed + * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read * @data: first arg to filler(data, page) function, often left as NULL * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. + * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. 
*/ -struct page *read_cache_page_async(struct address_space *mapping, +struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } -EXPORT_SYMBOL(read_cache_page_async); - -static struct page *wait_on_page_read(struct page *page) -{ - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - } - return page; -} +EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. @@ -2254,31 +2257,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, { filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); + return do_read_cache_page(mapping, index, filler, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: first arg to filler(data, page) function, often left as NULL - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page then wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) -{ - return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -} -EXPORT_SYMBOL(read_cache_page); - static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { -- cgit v1.2.3 From 5509a5d27b971a90b940e148ca9ca53312e4fa7a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 3 Apr 2014 14:48:19 -0700 Subject: drop_caches: add some documentation and info message There is plenty of anecdotal evidence and a load of blog posts suggesting that using "drop_caches" periodically keeps your system running in "tip top shape". Perhaps adding some kernel documentation will increase the amount of accurate data on its use. If we are not shrinking caches effectively, then we have real bugs. Using drop_caches will simply mask the bugs and make them harder to find, but certainly does not fix them, nor is it an appropriate "workaround" to limit the size of the caches. On the contrary, there have been bug reports on issues that turned out to be misguided use of cache dropping. Dropping caches is a very drastic and disruptive operation that is good for debugging and running tests, but if it creates bug reports from production use, kernel developers should be aware of its use. Add a bit more documentation about it, a syslog message to track down abusers, and vmstat drop counters to help analyze problem reports. 
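As a minimal illustration of the interface this patch documents (this sketch is not part of the patch; only the /proc/sys/vm/drop_caches path, the values 1-3, and the advice to sync first come from the documentation hunk below, the rest is assumed boilerplate), a userspace tool would typically look like:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd;

		sync();		/* minimize dirty, non-freeable objects first, as the docs advise */

		fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, "3", 1) != 1)	/* 3 = drop pagecache and reclaimable slab */
			perror("write");
		close(fd);
		return 0;
	}

With this patch applied, such a write also bumps the new drop_pagecache/drop_slab vmstat counters and logs an informational "<comm> (<pid>): drop_caches: 3" line unless the suppression value 4 (bit 3) has been written, as described in the documentation below.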
[akpm@linux-foundation.org: checkpatch fixes] [hannes@cmpxchg.org: add runtime suppression control] Signed-off-by: Dave Hansen Signed-off-by: Michal Hocko Acked-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 33 +++++++++++++++++++++++++++------ fs/drop_caches.c | 16 ++++++++++++++-- include/linux/vm_event_item.h | 1 + kernel/sysctl.c | 4 ++-- mm/vmstat.c | 3 +++ 5 files changed, 47 insertions(+), 10 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index d614a9b6a280..dd9d0e33b443 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -175,18 +175,39 @@ Setting this to zero disables periodic writeback altogether. drop_caches -Writing to this will cause the kernel to drop clean caches, dentries and -inodes from memory, causing that memory to become free. +Writing to this will cause the kernel to drop clean caches, as well as +reclaimable slab objects like dentries and inodes. Once dropped, their +memory becomes free. To free pagecache: echo 1 > /proc/sys/vm/drop_caches -To free dentries and inodes: +To free reclaimable slab objects (includes dentries and inodes): echo 2 > /proc/sys/vm/drop_caches -To free pagecache, dentries and inodes: +To free slab objects and pagecache: echo 3 > /proc/sys/vm/drop_caches -As this is a non-destructive operation and dirty objects are not freeable, the -user should run `sync' first. +This is a non-destructive operation and will not free any dirty objects. +To increase the number of objects freed by this operation, the user may run +`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the +number of dirty objects on the system and create more candidates to be +dropped. + +This file is not a means to control the growth of the various kernel caches +(inodes, dentries, pagecache, etc...) These objects are automatically +reclaimed by the kernel when memory is needed elsewhere on the system. + +Use of this file can cause performance problems. Since it discards cached +objects, it may cost a significant amount of I/O and CPU to recreate the +dropped objects, especially if they were under heavy use. Because of this, +use outside of a testing or debugging environment is not recommended. + +You may see informational messages in your kernel log when this file is +used: + + cat (1234): drop_caches: 3 + +These are informational only. They do not mean that anything is wrong +with your system. To disable them, echo 4 (bit 3) into drop_caches. 
============================================================== diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 9fd702f5bfb2..9280202e488c 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, if (ret) return ret; if (write) { - if (sysctl_drop_caches & 1) + static int stfu; + + if (sysctl_drop_caches & 1) { iterate_supers(drop_pagecache_sb, NULL); - if (sysctl_drop_caches & 2) + count_vm_event(DROP_PAGECACHE); + } + if (sysctl_drop_caches & 2) { drop_slab(); + count_vm_event(DROP_SLAB); + } + if (!stfu) { + pr_info("%s (%d): drop_caches: %d\n", + current->comm, task_pid_nr(current), + sysctl_drop_caches); + } + stfu |= sysctl_drop_caches & 4; } return 0; } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3a712e2e7d76..486c3972c0be 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -37,6 +37,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, PAGEOUTRUN, ALLOCSTALL, PGROTATED, + DROP_PAGECACHE, DROP_SLAB, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, NUMA_HUGE_PTE_UPDATES, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09d2e2413605..5c14b547882e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -123,7 +123,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; -static int __maybe_unused three = 3; +static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -1264,7 +1264,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, .extra1 = &one, - .extra2 = &three, + .extra2 = &four, }, #ifdef CONFIG_COMPACTION { diff --git a/mm/vmstat.c b/mm/vmstat.c index f3155d51acfd..197b4c4a9587 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -813,6 +813,9 @@ const char * const vmstat_text[] = { "pgrotated", + "drop_pagecache", + "drop_slab", + #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", "numa_huge_pte_updates", -- cgit v1.2.3 From bcccff93af359533683603255124fc19eb12613d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 3 Apr 2014 14:48:21 -0700 Subject: kobject: don't block for each kobject_uevent Currently kobject_uevent has somewhat unpredictable semantics. The point is, since it may call a usermode helper and wait for it to execute (UMH_WAIT_EXEC), it is impossible to say for sure what lock dependencies it will introduce for the caller - strictly speaking it depends on what fs the binary is located on and the set of locks fork may take. There are quite a few kobject_uevent's users that do not take this into account and call it with various mutexes taken, e.g. rtnl_mutex, net_mutex, which might potentially lead to a deadlock. Since there is actually no reason to wait for the usermode helper to execute there, let's make kobject_uevent start the helper asynchronously with the aid of the UMH_NO_WAIT flag. Personally, I'm interested in this, because I really want kobject_uevent to be called under the slab_mutex in the slub implementation as it used to be some time ago, because it greatly simplifies synchronization and automatically fixes a kmemcg-related race. 
However, there was a deadlock detected on an attempt to call kobject_uevent under the slab_mutex (see https://lkml.org/lkml/2012/1/14/45), which was reported to be fixed by releasing the slab_mutex for kobject_uevent. Unfortunately, there was no information about who exactly blocked on the slab_mutex causing the usermode helper to stall, neither have I managed to find this out or reproduce the issue. BTW, this is not the first attempt to make kobject_uevent use UMH_NO_WAIT. Previous one was made by commit f520360d93cd ("kobject: don't block for each kobject_uevent"), but it was wrong (it passed arguments allocated on stack to async thread) so it was reverted in 05f54c13cd0c ("Revert "kobject: don't block for each kobject_uevent"."). It targeted on speeding up the boot process though. Signed-off-by: Vladimir Davydov Cc: Greg KH Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kobject.h | 1 + lib/kobject_uevent.c | 42 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 926afb6f6b5f..f896a33e8341 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -119,6 +119,7 @@ struct kobj_type { }; struct kobj_uevent_env { + char *argv[3]; char *envp[UEVENT_NUM_ENVP]; int envp_idx; char buf[UEVENT_BUFFER_SIZE]; diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 5f72767ddd9b..4e3bd71bd949 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -124,6 +124,30 @@ static int kobj_usermode_filter(struct kobject *kobj) return 0; } +static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem) +{ + int len; + + len = strlcpy(&env->buf[env->buflen], subsystem, + sizeof(env->buf) - env->buflen); + if (len >= (sizeof(env->buf) - env->buflen)) { + WARN(1, KERN_ERR "init_uevent_argv: buffer size too small\n"); + return -ENOMEM; + } + + env->argv[0] = uevent_helper; + env->argv[1] = &env->buf[env->buflen]; + env->argv[2] = NULL; + + env->buflen += len + 1; + return 0; +} + +static void cleanup_uevent_env(struct subprocess_info *info) +{ + kfree(info->data); +} + /** * kobject_uevent_env - send an uevent with environmental data * @@ -301,11 +325,8 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, /* call uevent_helper, usually only enabled during early boot */ if (uevent_helper[0] && !kobj_usermode_filter(kobj)) { - char *argv [3]; + struct subprocess_info *info; - argv [0] = uevent_helper; - argv [1] = (char *)subsystem; - argv [2] = NULL; retval = add_uevent_var(env, "HOME=/"); if (retval) goto exit; @@ -313,9 +334,18 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, "PATH=/sbin:/bin:/usr/sbin:/usr/bin"); if (retval) goto exit; + retval = init_uevent_argv(env, subsystem); + if (retval) + goto exit; - retval = call_usermodehelper(argv[0], argv, - env->envp, UMH_WAIT_EXEC); + retval = -ENOMEM; + info = call_usermodehelper_setup(env->argv[0], env->argv, + env->envp, GFP_KERNEL, + NULL, cleanup_uevent_env, env); + if (info) { + retval = call_usermodehelper_exec(info, UMH_NO_WAIT); + env = NULL; /* freed by cleanup_uevent_env */ + } } exit: -- cgit v1.2.3 From 421af243b1a5ec2fe62f070d3b84d925301cc008 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 3 Apr 2014 14:48:22 -0700 Subject: slub: do not drop slab_mutex for sysfs_slab_add We release the slab_mutex while calling sysfs_slab_add from __kmem_cache_create since commit 
66c4c35c6bc5 ("slub: Do not hold slub_lock when calling sysfs_slab_add()"), because kobject_uevent called by sysfs_slab_add might block waiting for the usermode helper to exec, which would result in a deadlock if we took the slab_mutex while executing it. However, apart from complicating synchronization rules, releasing the slab_mutex on kmem cache creation can result in a kmemcg-related race. The point is that we check if the memcg cache exists before going to __kmem_cache_create, but register the new cache in memcg subsys after it. Since we can drop the mutex there, several threads can see that the memcg cache does not exist and proceed to creating it, which is wrong. Fortunately, recently kobject_uevent was patched to call the usermode helper with the UMH_NO_WAIT flag, making the deadlock impossible. Therefore there is no point in releasing the slab_mutex while calling sysfs_slab_add, so let's simplify kmem_cache_create synchronization and fix the kmemcg-race mentioned above by holding the slab_mutex during the whole cache creation path. Signed-off-by: Vladimir Davydov Acked-by: Christoph Lameter Cc: Greg KH Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 7611f148ee81..fe6d7be22ef0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3237,8 +3237,9 @@ int __kmem_cache_shutdown(struct kmem_cache *s) if (!rc) { /* - * We do the same lock strategy around sysfs_slab_add, see - * __kmem_cache_create. Because this is pretty much the last + * Since slab_attr_store may take the slab_mutex, we should + * release the lock while removing the sysfs entry in order to + * avoid a deadlock. Because this is pretty much the last * operation we do and the lock will be released shortly after * that in slab_common.c, we could just move sysfs_slab_remove * to a later point in common code. We should do that when we @@ -3778,10 +3779,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) return 0; memcg_propagate_slab_attrs(s); - mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); - mutex_lock(&slab_mutex); - if (err) kmem_cache_close(s); -- cgit v1.2.3 From 6d2be915e589b58cb11418cbe1f22ff90732b6ac Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Thu, 3 Apr 2014 14:48:23 -0700 Subject: mm/readahead.c: fix readahead failure for memoryless NUMA nodes and limit readahead pages Currently max_sane_readahead() returns zero on the cpu whose NUMA node has no local memory which leads to readahead failure. Fix this readahead failure by returning minimum of (requested pages, 512). Users running applications on a memory-less cpu which needs readahead such as streaming application see considerable boost in the performance. Result: fadvise experiment with FADV_WILLNEED on a PPC machine having memoryless CPU with 1GB testfile (12 iterations) yielded around 46.66% improvement. fadvise experiment with FADV_WILLNEED on a x240 machine with 1GB testfile 32GB* 4G RAM numa machine (12 iterations) showed no impact on the normal NUMA cases w/ patch. 
Kernel Avg Stddev base 7.4975 3.92% patched 7.4174 3.26% [Andrew: making return value PAGE_SIZE independent] Suggested-by: Linus Torvalds Signed-off-by: Raghavendra K T Acked-by: Jan Kara Cc: Wu Fengguang Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 62c500a088a7..29c5e1af5a0c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -233,14 +233,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return 0; } +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); + return min(nr, MAX_READAHEAD); } /* -- cgit v1.2.3 From e3a0cfdc8c8904236f91a506c1e760b0d60dd918 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:24 -0700 Subject: include/linux/syscalls.h: add sys32_quotactl() prototype This eliminates the following warning in quota/compat.c: fs/quota/compat.c:43:17: warning: no previous prototype for `sys32_quotactl' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1e67b7a5968c..2aa8b749f13d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -200,6 +200,8 @@ extern struct trace_event_functions exit_syscall_print_funcs; } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr); asmlinkage long sys_time(time_t __user *tloc); asmlinkage long sys_stime(time_t __user *tptr); asmlinkage long sys_gettimeofday(struct timeval __user *tv, -- cgit v1.2.3 From 6af9f7bf3c399e0ab1eee048e13572c6d4e15fe9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:48:25 -0700 Subject: sys_sysfs: Add CONFIG_SYSFS_SYSCALL sys_sysfs is an obsolete system call no longer supported by libc. - This patch adds a default CONFIG_SYSFS_SYSCALL=y - Option can be turned off in expert mode. - cond_syscall added to kernel/sys_ni.c [akpm@linux-foundation.org: tweak Kconfig help text] Signed-off-by: Fabian Frederick Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/filesystems.c | 2 ++ init/Kconfig | 10 ++++++++++ kernel/sys_ni.c | 1 + 3 files changed, 13 insertions(+) diff --git a/fs/filesystems.c b/fs/filesystems.c index 92567d95ba6a..5797d45a78cb 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs) EXPORT_SYMBOL(unregister_filesystem); +#ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; @@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) } return retval; } +#endif int __init get_filesystem_list(char *buf) { diff --git a/init/Kconfig b/init/Kconfig index d56cb03c1b49..e45cc62904b3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1290,6 +1290,16 @@ config UID16 help This enables the legacy 16-bit UID syscall wrappers. 
+config SYSFS_SYSCALL + bool "Sysfs syscall support" if EXPERT + default y + ---help--- + sys_sysfs is an obsolete system call no longer supported in libc. + Note that disabling this option is more secure but might break + compatibility with some systems. + + If unsure say Y here. + config SYSCTL_SYSCALL bool "Sysctl syscall support" if EXPERT depends on PROC_SYSCTL diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7078052284fd..74395a95b7e9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -146,6 +146,7 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); cond_syscall(sys_process_vm_writev); -- cgit v1.2.3 From 8f6c5ffc8987f4f5b5a3e9d557d94bbf3a9bf216 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Thu, 3 Apr 2014 14:48:26 -0700 Subject: kernel/groups.c: remove return value of set_groups After commit 6307f8fee295 ("security: remove dead hook task_setgroups"), set_groups will always return zero, so we could just remove return value of set_groups. This patch reduces code size, and simplfies code to use set_groups, because we don't need to check its return value any more. [akpm@linux-foundation.org: remove obsolete claims from set_groups() comment] Signed-off-by: Wang YanQing Cc: "Eric W. Biederman" Cc: Serge Hallyn Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/auth.c | 5 +---- include/linux/cred.h | 2 +- kernel/groups.c | 14 ++------------ 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 06cddd572264..2645be435e75 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -71,10 +71,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) if (gid_eq(new->fsgid, INVALID_GID)) new->fsgid = exp->ex_anon_gid; - ret = set_groups(new, gi); + set_groups(new, gi); put_group_info(gi); - if (ret < 0) - goto error; if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) new->cap_effective = cap_drop_nfsd_set(new->cap_effective); @@ -89,7 +87,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) oom: ret = -ENOMEM; -error: abort_creds(new); return ret; } diff --git a/include/linux/cred.h b/include/linux/cred.h index 04421e825365..f61d6c8f5ef3 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -66,7 +66,7 @@ extern struct group_info *groups_alloc(int); extern struct group_info init_groups; extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); -extern int set_groups(struct cred *, struct group_info *); +extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); /* access the groups "array" with this macro */ diff --git a/kernel/groups.c b/kernel/groups.c index 90cf1c38c8ea..451698f86cfa 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, kgid_t grp) * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install - * - * Validate a group subscription and, if valid, insert it into a set - * of credentials. 
*/ -int set_groups(struct cred *new, struct group_info *group_info) +void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; - return 0; } EXPORT_SYMBOL(set_groups); @@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups); int set_current_groups(struct group_info *group_info) { struct cred *new; - int ret; new = prepare_creds(); if (!new) return -ENOMEM; - ret = set_groups(new, group_info); - if (ret < 0) { - abort_creds(new); - return ret; - } - + set_groups(new, group_info); return commit_creds(new); } -- cgit v1.2.3 From 69369a7003735d0d8ef22097e27a55a8bad9557a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 3 Apr 2014 14:48:27 -0700 Subject: fs, kernel: permit disabling the uselib syscall uselib hasn't been used since libc5; glibc does not use it. Support turning it off. When disabled, also omit the load_elf_library implementation from binfmt_elf.c, which only uselib invokes. bloat-o-meter: add/remove: 0/4 grow/shrink: 0/1 up/down: 0/-785 (-785) function old new delta padzero 39 36 -3 uselib_flags 20 - -20 sys_uselib 168 - -168 SyS_uselib 168 - -168 load_elf_library 426 - -426 The new CONFIG_USELIB defaults to `y'. Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 9 ++++++++- fs/exec.c | 2 ++ init/Kconfig | 10 ++++++++++ kernel/sys_ni.c | 1 + 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 67be2951b98a..0f59799fa105 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,10 +46,15 @@ #endif static int load_elf_binary(struct linux_binprm *bprm); -static int load_elf_library(struct file *); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); +#ifdef CONFIG_USELIB +static int load_elf_library(struct file *); +#else +#define load_elf_library NULL +#endif + /* * If we don't support core dumping, then supply a NULL so we * don't even try. @@ -1005,6 +1010,7 @@ out_free_ph: goto out; } +#ifdef CONFIG_USELIB /* This is really simpleminded and specialized - we are loading an a.out library that is given an ELF header. */ static int load_elf_library(struct file *file) @@ -1083,6 +1089,7 @@ out_free_ph: out: return error; } +#endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_ELF_CORE /* diff --git a/fs/exec.c b/fs/exec.c index 4f59402fdda5..25dfeba6d55f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -97,6 +97,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt) module_put(fmt->module); } +#ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. @@ -156,6 +157,7 @@ exit: out: return error; } +#endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* diff --git a/init/Kconfig b/init/Kconfig index e45cc62904b3..8114a06117e3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -273,6 +273,16 @@ config FHANDLE get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) syscalls. +config USELIB + bool "uselib syscall" + default y + help + This option enables the uselib syscall, a system call used in the + dynamic linker from libc5 and earlier. glibc does not use this + system call. If you intend to run programs built on libc5 or + earlier, you may need to enable this syscall. Current systems + running glibc can safely disable this. 
+ config AUDIT bool "Auditing support" depends on NET diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 74395a95b7e9..bc8d1b74a6b9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -152,6 +152,7 @@ cond_syscall(sys_process_vm_readv); cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); +cond_syscall(sys_uselib); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3 From 74a475acea49459721ae4b062d3da68c74259009 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 3 Apr 2014 14:48:28 -0700 Subject: SubmittingPatches: add style recommendation to use imperative descriptions Most commit messages use this style, and the recommendation frequently comes up in discussions (especially in response to patches that don't use it), but that recommendation doesn't actually appear anywhere in Documentation. Add this style guideline to SubmittingPatches, using the description from git's SubmittingPatches. Signed-off-by: Josh Triplett Acked-by: Borislav Petkov Cc: Rob Landley Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/SubmittingPatches | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index 26b1e31d5a13..c74e73c37dcc 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -106,6 +106,11 @@ I.e., the patch (series) and its description should be self-contained. This benefits both the patch merger(s) and reviewers. Some reviewers probably didn't even receive earlier versions of the patch. +Describe your changes in imperative mood, e.g. "make xyzzy do frotz" +instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy +to do frotz", as if you are giving orders to the codebase to change +its behaviour. + If the patch fixes a logged bug entry, refer to that bug entry by number and URL. -- cgit v1.2.3 From 9547c706d279392d53922c0f9d57a0b37a4dfcdc Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 3 Apr 2014 14:48:29 -0700 Subject: SubmittingPatches: add recommendation for mailing list references SubmittingPatches already mentions referencing bugs fixed by a commit, but doesn't mention citing relevant mailing list discussions. Add a note to that effect, along with a recommendation to use the https://lkml.kernel.org/ redirector. Portions based on text from git's SubmittingPatches. Signed-off-by: Josh Triplett Acked-by: Borislav Petkov Cc: Rob Landley Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/SubmittingPatches | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index c74e73c37dcc..53e6590263a1 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -112,7 +112,15 @@ to do frotz", as if you are giving orders to the codebase to change its behaviour. If the patch fixes a logged bug entry, refer to that bug entry by -number and URL. +number and URL. If the patch follows from a mailing list discussion, +give a URL to the mailing list archive; use the https://lkml.kernel.org/ +redirector with a Message-Id, to ensure that the links cannot become +stale. + +However, try to make your explanation understandable without external +resources. 
In addition to giving a URL to a mailing list archive or +bug, summarize the relevant points of the discussion that led to the +patch as submitted. If you want to refer to a specific commit, don't just refer to the SHA-1 ID of the commit. Please also include the oneline summary of -- cgit v1.2.3 From 8e3072a23ff8cf69b3e654a7cd64eae04f06a0e6 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 3 Apr 2014 14:48:30 -0700 Subject: SubmittingPatches: document the use of git Most of the mechanical portions of SubmittingPatches exist to help patch submitters replicate the output of git. Mention this explicitly, both as a reminder that git will help with this process, and as signposting to let git users know what they can safely skip. Signed-off-by: Josh Triplett Acked-by: Borislav Petkov Cc: Rob Landley Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/SubmittingPatches | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index 53e6590263a1..fdad7d197062 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -14,7 +14,10 @@ Read Documentation/SubmitChecklist for a list of items to check before submitting code. If you are submitting a driver, also read Documentation/SubmittingDrivers. - +Many of these steps describe the default behavior of the git version +control system; if you use git to prepare your patches, you'll find much +of the mechanical work done for you, though you'll still need to prepare +and document a sensible set of patches. -------------------------------------------- SECTION 1 - CREATING AND SENDING YOUR CHANGE @@ -25,7 +28,9 @@ SECTION 1 - CREATING AND SENDING YOUR CHANGE 1) "diff -up" ------------ -Use "diff -up" or "diff -uprN" to create patches. +Use "diff -up" or "diff -uprN" to create patches. git generates patches +in this form by default; if you're using git, you can skip this section +entirely. All changes to the Linux kernel occur in the form of patches, as generated by diff(1). When creating your patch, make sure to create it @@ -66,19 +71,14 @@ Make sure your patch does not include any extra files which do not belong in a patch submission. Make sure to review your patch -after- generated it with diff(1), to ensure accuracy. -If your changes produce a lot of deltas, you may want to look into -splitting them into individual patches which modify things in -logical stages. This will facilitate easier reviewing by other -kernel developers, very important if you want your patch accepted. -There are a number of scripts which can aid in this: - -Quilt: -http://savannah.nongnu.org/projects/quilt +If your changes produce a lot of deltas, you need to split them into +individual patches which modify things in logical stages; see section +#3. This will facilitate easier reviewing by other kernel developers, +very important if you want your patch accepted. -Andrew Morton's patch scripts: -http://userweb.kernel.org/~akpm/stuff/patch-scripts.tar.gz -Instead of these scripts, quilt is the recommended patch management -tool (see above). +If you're using git, "git rebase -i" can help you with this process. If +you're not using git, quilt +is another popular alternative. @@ -607,7 +607,8 @@ patch. 
If you are going to include a diffstat after the "---" marker, please use diffstat options "-p 1 -w 70" so that filenames are listed from the top of the kernel source tree and don't use too much horizontal -space (easily fit in 80 columns, maybe with some indentation). +space (easily fit in 80 columns, maybe with some indentation). (git +generates appropriate diffstats by default.) See more details on the proper patch format in the following references. -- cgit v1.2.3 From a5ed3cee944abf2c80ae146b06f9055d7da03f94 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:48:31 -0700 Subject: err.h: use bool for IS_ERR and IS_ERR_OR_NULL Use the more natural return of bool for these tests. No difference observed in .o files produced by gcc for x86. Remove the dentry description of kernel pointers left over from the 90's and 2002's cleanup move of parts of fs.h to err.h. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/err.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/err.h b/include/linux/err.h index 15f92e072450..a729120644d5 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -2,12 +2,13 @@ #define _LINUX_ERR_H #include +#include #include /* * Kernel pointers have redundant information, so we can use a - * scheme where we can return either an error code or a dentry + * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different @@ -29,12 +30,12 @@ static inline long __must_check PTR_ERR(__force const void *ptr) return (long) ptr; } -static inline long __must_check IS_ERR(__force const void *ptr) +static inline bool __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } -static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr) +static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); } -- cgit v1.2.3 From e9107f88c985bcda5a8ec692cd692005738136f1 Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Thu, 3 Apr 2014 14:48:32 -0700 Subject: samples/seccomp/Makefile: do not build tests if cross-compiling for MIPS The Makefile is designed to use the host toolchain so it may be unsafe to build the tests if the kernel has been configured and built for another architecture. This fixes a build problem when the kernel has been configured and built for the MIPS architecture but the host is not MIPS (cross-compiled). The MIPS syscalls are only defined if one of the following is true: 1) _MIPS_SIM == _MIPS_SIM_ABI64 2) _MIPS_SIM == _MIPS_SIM_ABI32 3) _MIPS_SIM == _MIPS_SIM_NABI32 Of course, none of these make sense on a non-MIPS toolchain and the following build problem occurs when building on a non-MIPS host. 
linux/usr/include/linux/kexec.h:50: userspace cannot reference function or variable defined in the kernel samples/seccomp/bpf-direct.c: In function `emulator': samples/seccomp/bpf-direct.c:76:17: error: `__NR_write' undeclared (first use in this function) Signed-off-by: Markos Chandras Reported-by: Paul Gortmaker Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- samples/seccomp/Makefile | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 7203e66dcd6f..1b4e4b8f5e47 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -18,8 +18,8 @@ HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include bpf-direct-objs := bpf-direct.o # Try to match the kernel target. -ifndef CONFIG_64BIT ifndef CROSS_COMPILE +ifndef CONFIG_64BIT # s390 has -m31 flag to build 31 bit binaries ifndef CONFIG_S390 @@ -36,7 +36,13 @@ HOSTLOADLIBES_bpf-direct += $(MFLAG) HOSTLOADLIBES_bpf-fancy += $(MFLAG) HOSTLOADLIBES_dropper += $(MFLAG) endif -endif - -# Tell kbuild to always build the programs always := $(hostprogs-y) +else +# MIPS system calls are defined based on the -mabi that is passed +# to the toolchain which may or may not be a valid option +# for the host toolchain. So disable tests if target architecture +# is MIPS but the host isn't. +ifndef CONFIG_MIPS +always := $(hostprogs-y) +endif +endif -- cgit v1.2.3 From ea1a8217b06b41b31a2b60b0b83f75c77ef9c873 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Thu, 3 Apr 2014 14:48:33 -0700 Subject: xattr: guard against simultaneous glibc header inclusion If the glibc xattr.h header is included after the uapi header, compilation fails due to an enum re-using a #define from the uapi header. Protect against this by guarding the define and enum inclusions against each other. (See https://lists.debian.org/debian-glibc/2014/03/msg00029.html and https://sourceware.org/glibc/wiki/Synchronizing_Headers for more information.) Signed-off-by: Serge Hallyn Cc: Andrew Morton Cc: Allan McRae Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/libc-compat.h | 9 +++++++++ include/uapi/linux/xattr.h | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h index 335e8a7cad39..c140620dad92 100644 --- a/include/uapi/linux/libc-compat.h +++ b/include/uapi/linux/libc-compat.h @@ -85,6 +85,12 @@ #endif /* _NETINET_IN_H */ +/* Definitions for xattr.h */ +#if defined(_SYS_XATTR_H) +#define __UAPI_DEF_XATTR 0 +#else +#define __UAPI_DEF_XATTR 1 +#endif /* If we did not see any headers from any supported C libraries, * or we are being included in the kernel, then define everything @@ -98,6 +104,9 @@ #define __UAPI_DEF_IPV6_MREQ 1 #define __UAPI_DEF_IPPROTO_V6 1 +/* Definitions for xattr.h */ +#define __UAPI_DEF_XATTR 1 + #endif /* __GLIBC__ */ #endif /* _UAPI_LIBC_COMPAT_H */ diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index 40bbc04b6f81..c38355c1f3c9 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -7,11 +7,18 @@ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. 
Copyright (c) 2004 Red Hat, Inc., James Morris */ + +#include + #ifndef _UAPI_LINUX_XATTR_H #define _UAPI_LINUX_XATTR_H +#ifdef __UAPI_DEF_XATTR +#define __USE_KERNEL_XATTR_DEFS + #define XATTR_CREATE 0x1 /* set value, fail if attr already exists */ #define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */ +#endif /* Namespaces */ #define XATTR_OS2_PREFIX "os2." -- cgit v1.2.3 From d300d59ca179fd1de427cde2a96eb2871347491f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 3 Apr 2014 14:48:34 -0700 Subject: lib/syscall.c: unexport task_current_syscall() It is only used by procfs and procfs cannot be a module. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/syscall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/syscall.c b/lib/syscall.c index 58710eefeac8..e30e03932480 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -72,4 +72,3 @@ int task_current_syscall(struct task_struct *target, long *callno, return 0; } -EXPORT_SYMBOL_GPL(task_current_syscall); -- cgit v1.2.3 From c96d6660dc65b0a90aea9834bfd8be1d5656da18 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 3 Apr 2014 14:48:35 -0700 Subject: kernel: audit/fix non-modular users of module_init in core code Code that is obj-y (always built-in) or dependent on a bool Kconfig (built-in or absent) can never be modular. So using module_init as an alias for __initcall can be somewhat misleading. Fix these up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. The audit targets the following module_init users for change: kernel/user.c obj-y kernel/kexec.c bool KEXEC (one instance per arch) kernel/profile.c bool PROFILING kernel/hung_task.c bool DETECT_HUNG_TASK kernel/sched/stats.c bool SCHEDSTATS kernel/user_namespace.c bool USER_NS Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of subsys_initcall (which makes sense for these files) will thus change this registration from level 6-device to level 4-subsys (i.e. slightly earlier). However no observable impact of that difference has been observed during testing. Also, two instances of missing ";" at EOL are fixed in kexec. 
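A minimal sketch of the conversion pattern this audit applies (illustrative only; the feature name is made up, the macros are the real ones discussed above):

	#include <linux/init.h>

	/* Always-built-in code (obj-y or a bool Kconfig) can never be modular,
	 * so register its initcall at an explicit level instead of via the
	 * module_init() alias. */
	static int __init example_builtin_init(void)
	{
		/* one-time setup for the built-in feature */
		return 0;
	}
	/* was: module_init(example_builtin_init);  maps to level 6 (device) when built in */
	subsys_initcall(example_builtin_init);	/* now registered at level 4 (subsys) */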
Signed-off-by: Paul Gortmaker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 3 +-- kernel/kexec.c | 4 ++-- kernel/profile.c | 2 +- kernel/sched/stats.c | 2 +- kernel/user.c | 3 +-- kernel/user_namespace.c | 2 +- 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0b9c169d577f..06bb1417b063 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -246,5 +246,4 @@ static int __init hung_task_init(void) return 0; } - -module_init(hung_task_init); +subsys_initcall(hung_task_init); diff --git a/kernel/kexec.c b/kernel/kexec.c index 45601cf41bee..c0d261c7db7b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1235,7 +1235,7 @@ static int __init crash_notes_memory_init(void) } return 0; } -module_init(crash_notes_memory_init) +subsys_initcall(crash_notes_memory_init); /* @@ -1629,7 +1629,7 @@ static int __init crash_save_vmcoreinfo_init(void) return 0; } -module_init(crash_save_vmcoreinfo_init) +subsys_initcall(crash_save_vmcoreinfo_init); /* * Move into place and start executing a preloaded standalone diff --git a/kernel/profile.c b/kernel/profile.c index ebdd9c1a86b4..1b266dbe755a 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -604,5 +604,5 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ hotcpu_notifier(profile_cpu_callback, 0); return 0; } -module_init(create_proc_profile); +subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index da98af347e8b..a476bea17fbc 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void) proc_create("schedstat", 0, NULL, &proc_schedstat_operations); return 0; } -module_init(proc_schedstat_init); +subsys_initcall(proc_schedstat_init); diff --git a/kernel/user.c b/kernel/user.c index c006131beb77..294fc6a94168 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -222,5 +222,4 @@ static int __init uid_cache_init(void) return 0; } - -module_init(uid_cache_init); +subsys_initcall(uid_cache_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index dd06439b9c84..0d8f6023fd8d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -902,4 +902,4 @@ static __init int user_namespaces_init(void) user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); return 0; } -module_init(user_namespaces_init); +subsys_initcall(user_namespaces_init); -- cgit v1.2.3 From 28ab49ff7f3dcaf4df8d2bd0d4099b8c08285ed7 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Thu, 3 Apr 2014 14:48:36 -0700 Subject: kernel/resource.c: make reallocate_resource() static sparse says: kernel/resource.c:518:5: warning: symbol 'reallocate_resource' was not declared. Should it be static? Signed-off-by: Daeseok Youn Reviewed-by: Yasuaki Ishimatsu Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index 673061c06da1..8957d686e29b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -511,7 +511,7 @@ static int find_resource(struct resource *root, struct resource *new, * @newsize: new size of the resource descriptor * @constraint: the size and alignment constraints to be met. 
*/ -int reallocate_resource(struct resource *root, struct resource *old, +static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, struct resource_constraint *constraint) { -- cgit v1.2.3 From 708d96fd060bd1e729fc93048cea8901f8bacb7c Mon Sep 17 00:00:00 2001 From: Ryan Mallon Date: Thu, 3 Apr 2014 14:48:37 -0700 Subject: vsprintf: remove %n handling All in-kernel users of %n in format strings have now been removed and the %n directive is ignored. Remove the handling of %n so that it is treated the same as any other invalid format string directive. Keep a warning in place to deter new instances of %n in format strings. Signed-off-by: Ryan Mallon Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 45 +++++++++------------------------------------ 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 5e2cf6f342f8..0648291cdafe 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -364,7 +364,6 @@ enum format_type { FORMAT_TYPE_SHORT, FORMAT_TYPE_UINT, FORMAT_TYPE_INT, - FORMAT_TYPE_NRCHARS, FORMAT_TYPE_SIZE_T, FORMAT_TYPE_PTRDIFF }; @@ -1538,10 +1537,6 @@ qualifier: return fmt - start; /* skip alnum */ - case 'n': - spec->type = FORMAT_TYPE_NRCHARS; - return ++fmt - start; - case '%': spec->type = FORMAT_TYPE_PERCENT_CHAR; return ++fmt - start; @@ -1564,6 +1559,15 @@ qualifier: case 'u': break; + case 'n': + /* + * Since %n poses a greater security risk than utility, treat + * it as an invalid format specifier. Warn about its use so + * that new instances don't get added. + */ + WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", fmt); + /* Fall-through */ + default: spec->type = FORMAT_TYPE_INVALID; return fmt - start; @@ -1737,20 +1741,6 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) ++str; break; - case FORMAT_TYPE_NRCHARS: { - /* - * Since %n poses a greater security risk than - * utility, ignore %n and skip its argument. - */ - void *skip_arg; - - WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", - old_fmt); - - skip_arg = va_arg(args, void *); - break; - } - default: switch (spec.type) { case FORMAT_TYPE_LONG_LONG: @@ -2025,19 +2015,6 @@ do { \ fmt++; break; - case FORMAT_TYPE_NRCHARS: { - /* skip %n 's argument */ - u8 qualifier = spec.qualifier; - void *skip_arg; - if (qualifier == 'l') - skip_arg = va_arg(args, long *); - else if (_tolower(qualifier) == 'z') - skip_arg = va_arg(args, size_t *); - else - skip_arg = va_arg(args, int *); - break; - } - default: switch (spec.type) { @@ -2196,10 +2173,6 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) ++str; break; - case FORMAT_TYPE_NRCHARS: - /* skip */ - break; - default: { unsigned long long num; -- cgit v1.2.3 From 0185698c0f543d4e8677bb97227ebe46024e3eb8 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:38 -0700 Subject: printk: remove duplicated check for log level The check for the exact log level is already done in printk_get_level. We do not need to duplicate it in printk_skip_level. 
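For context, a small sketch (not taken from the patch) of how the two helpers pair up once this change is applied: printk_get_level() validates the "\001<level>" prefix, so printk_skip_level() only has to step past its two bytes.

	#include <linux/kernel.h>
	#include <linux/printk.h>

	static void example_split_level(const char *msg)
	{
		int level = printk_get_level(msg);		/* '0'..'7' or 'd', 0 if no prefix */
		const char *text = printk_skip_level(msg);	/* message text after the prefix */

		pr_debug("level=%c text=%s", level ? level : '-', text);
	}

Called with msg = KERN_INFO "hello\n" this yields level == '6' and text pointing at "hello\n"; with no prefix, printk_get_level() returns 0 and printk_skip_level() returns the buffer unchanged, which is exactly the behaviour the simplified helper relies on.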
Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index fa47e2708c01..8860575899f2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -24,13 +24,9 @@ static inline int printk_get_level(const char *buffer) static inline const char *printk_skip_level(const char *buffer) { - if (printk_get_level(buffer)) { - switch (buffer[1]) { - case '0' ... '7': - case 'd': /* KERN_DEFAULT */ - return buffer + 2; - } - } + if (printk_get_level(buffer)) + return buffer + 2; + return buffer; } -- cgit v1.2.3 From c64730b26f08cccfbc8fcbf169c304b4bd71dcac Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:39 -0700 Subject: printk: remove obsolete check for log level "c" The kernel log level "c" was removed in commit 61e99ab8e35a ("printk: remove the now unnecessary "C" annotation for KERN_CONT"). It is no longer detected in printk_get_level(). Hence we do not need to check it in vprintk_emit. Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4dae9cbe9259..db7a02e05241 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1560,8 +1560,6 @@ asmlinkage int vprintk_emit(int facility, int level, level = kern_level - '0'; case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; - case 'c': /* KERN_CONT */ - break; } text_len -= end_of_header - text; text = (char *)end_of_header; -- cgit v1.2.3 From e8c42d36ab86cf45f88c3a0e344233b1032fbf3d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:41 -0700 Subject: printk: add comment about tricky check for text buffer size There is no check for potential "text_len" overflow. It is not needed because only valid level is detected. It took me some time to understand why. It would deserve a comment ;-) Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index db7a02e05241..012f3e40671d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1561,6 +1561,11 @@ asmlinkage int vprintk_emit(int facility, int level, case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; } + /* + * No need to check length here because vscnprintf + * put '\0' at the end of the string. Only valid and + * newly printed level is detected. + */ text_len -= end_of_header - text; text = (char *)end_of_header; } -- cgit v1.2.3 From 39b25109b400ea397e64c417d8b965a53e2ee0f0 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:42 -0700 Subject: printk: use also the last bytes in the ring buffer It seems that we have newer used the last byte in the ring buffer. In fact, we have newer used the last 4 bytes because of padding. First problem is in the check for free space. The exact number of free bytes is enough to store the length of data. Second problem is in the check where the ring buffer is rotated. The left side counts the first unused index. 
It is unused, so it might be the same as the size of the buffer. Note that the first problem has to be fixed together with the second one. Otherwise, the buffer is rotated even when there is enough space on the end of the buffer. Then the beginning of the buffer is rewritten and valid entries get corrupted. Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 012f3e40671d..b3a1790f9e05 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -319,7 +319,7 @@ static void log_store(int facility, int level, else free = log_first_idx - log_next_idx; - if (free > size + sizeof(struct printk_log)) + if (free >= size + sizeof(struct printk_log)) break; /* drop old messages until we have enough contiuous space */ @@ -327,7 +327,7 @@ static void log_store(int facility, int level, log_first_seq++; } - if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { + if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* * This message + an additional empty header does not fit * at the end of the buffer. Add an empty header with len == 0 -- cgit v1.2.3 From fce6e0338abe910ba6d4db0657ae8adc6aa1a72b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:43 -0700 Subject: printk: do not compute the size of the message twice This is just a tiny optimization. It removes duplicate computation of the message size. Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b3a1790f9e05..ff9faf4e3cd5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -351,7 +351,7 @@ static void log_store(int facility, int level, else msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; + msg->len = size; /* insert message */ log_next_idx += msg->len; -- cgit v1.2.3 From d487d57581057abd271651334ea2996aa1b31e28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20K=C3=A5gstr=C3=B6m?= Date: Thu, 3 Apr 2014 14:48:44 -0700 Subject: include/linux/printk.h: remove double asmlinkage in printk_emit The double asmlinkage was introduced in commit 7ff9554bb578 ("printk: convert byte-buffer to variable-length record buffer"). 
Signed-off-by: Simon Kagstrom Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index 8860575899f2..8752f7595b27 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -120,9 +120,9 @@ asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); asmlinkage __printf(5, 6) __cold -asmlinkage int printk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, ...); +int printk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, ...); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); -- cgit v1.2.3 From 72581487a61f6304a7cc32e189eb65fb1c920a53 Mon Sep 17 00:00:00 2001 From: Jane Li Date: Thu, 3 Apr 2014 14:48:45 -0700 Subject: printk: fix one circular lockdep warning about console_lock Fix a warning about possible circular locking dependency. If do in following sequence: enter suspend -> resume -> plug-out CPUx (echo 0 > cpux/online) lockdep will show warning as following: ====================================================== [ INFO: possible circular locking dependency detected ] 3.10.0 #2 Tainted: G O ------------------------------------------------------- sh/1271 is trying to acquire lock: (console_lock){+.+.+.}, at: console_cpu_notify+0x20/0x2c but task is already holding lock: (cpu_hotplug.lock){+.+.+.}, at: cpu_hotplug_begin+0x2c/0x58 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (cpu_hotplug.lock){+.+.+.}: lock_acquire+0x98/0x12c mutex_lock_nested+0x50/0x3d8 cpu_hotplug_begin+0x2c/0x58 _cpu_up+0x24/0x154 cpu_up+0x64/0x84 smp_init+0x9c/0xd4 kernel_init_freeable+0x78/0x1c8 kernel_init+0x8/0xe4 ret_from_fork+0x14/0x2c -> #1 (cpu_add_remove_lock){+.+.+.}: lock_acquire+0x98/0x12c mutex_lock_nested+0x50/0x3d8 disable_nonboot_cpus+0x8/0xe8 suspend_devices_and_enter+0x214/0x448 pm_suspend+0x1e4/0x284 try_to_suspend+0xa4/0xbc process_one_work+0x1c4/0x4fc worker_thread+0x138/0x37c kthread+0xa4/0xb0 ret_from_fork+0x14/0x2c -> #0 (console_lock){+.+.+.}: __lock_acquire+0x1b38/0x1b80 lock_acquire+0x98/0x12c console_lock+0x54/0x68 console_cpu_notify+0x20/0x2c notifier_call_chain+0x44/0x84 __cpu_notify+0x2c/0x48 cpu_notify_nofail+0x8/0x14 _cpu_down+0xf4/0x258 cpu_down+0x24/0x40 store_online+0x30/0x74 dev_attr_store+0x18/0x24 sysfs_write_file+0x16c/0x19c vfs_write+0xb4/0x190 SyS_write+0x3c/0x70 ret_fast_syscall+0x0/0x48 Chain exists of: console_lock --> cpu_add_remove_lock --> cpu_hotplug.lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpu_hotplug.lock); lock(cpu_add_remove_lock); lock(cpu_hotplug.lock); lock(console_lock); *** DEADLOCK *** There are three locks involved in two sequence: a) pm suspend: console_lock (@suspend_console()) cpu_add_remove_lock (@disable_nonboot_cpus()) cpu_hotplug.lock (@_cpu_down()) b) Plug-out CPUx: cpu_add_remove_lock (@(cpu_down()) cpu_hotplug.lock (@_cpu_down()) console_lock (@console_cpu_notify()) => Lockdeps prints warning log. There should be not real deadlock, as flag of console_suspended can protect this. Although console_suspend() releases console_sem, it doesn't tell lockdep about it. 
That results in the lockdep warning about circular locking when doing the following: enter suspend -> resume -> plug-out CPUx (echo 0 > cpux/online) Fix the problem by telling lockdep we actually released the semaphore in console_suspend() and acquired it again in console_resume(). Signed-off-by: Jane Li Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ff9faf4e3cd5..a45b50962295 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1883,6 +1883,7 @@ void suspend_console(void) console_lock(); console_suspended = 1; up(&console_sem); + mutex_release(&console_lock_dep_map, 1, _RET_IP_); } void resume_console(void) @@ -1890,6 +1891,7 @@ void resume_console(void) if (!console_suspend_enabled) return; down(&console_sem); + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); console_suspended = 0; console_unlock(); } -- cgit v1.2.3 From 70d14fcf365c1b20d380d542ec53906f2b5e2435 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:48:46 -0700 Subject: MAINTAINERS: add backlight co-maintainers Bryan Wu and Lee Jones volunteer to maintain backlight drivers and help to setup git-tree for backlight subsystem. Thus, I add them as backlight co-maintainers. Signed-off-by: Jingoo Han Acked-by: Bryan Wu Acked-by: Lee Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 346744599b4f..4c0035e75ce1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1704,6 +1704,8 @@ F: drivers/net/wireless/b43legacy/ BACKLIGHT CLASS/SUBSYSTEM M: Jingoo Han +M: Bryan Wu +M: Lee Jones S: Maintained F: drivers/video/backlight/ F: include/linux/backlight.h -- cgit v1.2.3 From dcaa57d529a00983eb5fda511a4959bf44506670 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 3 Apr 2014 14:48:47 -0700 Subject: MAINTAINERS: mark SuperH orphan Paul Mundt's email address bounces regularly, and he hasn't taken any SuperH patches for about one year. Signed-off-by: Geert Uytterhoeven Suggested-by: Paul Bolle Cc: Paul Mundt Cc: Magnus Damm Cc: Kuninori Morimoto Cc: Laurent Pinchart Cc: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 4 ++++ MAINTAINERS | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CREDITS b/CREDITS index 4f2f45deeb70..cef042453fa4 100644 --- a/CREDITS +++ b/CREDITS @@ -2564,6 +2564,10 @@ N: Wolfgang Muees E: wolfgang@iksw-muees.de D: Auerswald USB driver +N: Paul Mundt +E: paul.mundt@gmail.com +D: SuperH maintainer + N: Ian A. Murdock E: imurdock@gnu.ai.mit.edu D: Creator of Debian distribution diff --git a/MAINTAINERS b/MAINTAINERS index 4c0035e75ce1..39f27f8620a0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8478,12 +8478,10 @@ S: Maintained F: drivers/net/ethernet/dlink/sundance.c SUPERH -M: Paul Mundt L: linux-sh@vger.kernel.org W: http://www.linux-sh.org Q: http://patchwork.kernel.org/project/linux-sh/list/ -T: git git://github.com/pmundt/linux-sh.git sh-latest -S: Supported +S: Orphan F: Documentation/sh/ F: arch/sh/ F: drivers/sh/ -- cgit v1.2.3 From 3dc9985705b5a6e60de533ff505ecaee2c020968 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Thu, 3 Apr 2014 14:48:48 -0700 Subject: MAINTAINERS: add xtensa irqchips to xtensa port entry Now that irqchip drivers for xtensa live outside arch/xtensa we'd like to add them to our maintenance list. 
Signed-off-by: Max Filippov Cc: Chris Zankel Cc: Marc Gauthier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 39f27f8620a0..273fbd4b51ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8765,6 +8765,7 @@ M: Max Filippov L: linux-xtensa@linux-xtensa.org S: Maintained F: arch/xtensa/ +F: drivers/irqchip/irq-xtensa-* THANKO'S RAREMONO AM/FM/SW RADIO RECEIVER USB DRIVER M: Hans Verkuil -- cgit v1.2.3 From 63fae2192a8996d0d2e7cc13d1d7c576a853e6ad Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:48:49 -0700 Subject: MAINTAINERS: wimax@linuxwimax.org is subscribers-only Mark it so. Signed-off-by: Joe Perches Acked-by: Geert Uytterhoeven Cc: Inaky Perez-Gonzalez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 273fbd4b51ab..140084a227f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4622,7 +4622,7 @@ F: arch/x86/kernel/tboot.c INTEL WIRELESS WIMAX CONNECTION 2400 M: Inaky Perez-Gonzalez M: linux-wimax@intel.com -L: wimax@linuxwimax.org +L: wimax@linuxwimax.org (subscribers-only) S: Supported W: http://linuxwimax.org F: Documentation/wimax/README.i2400m @@ -9657,7 +9657,7 @@ F: drivers/media/rc/winbond-cir.c WIMAX STACK M: Inaky Perez-Gonzalez M: linux-wimax@intel.com -L: wimax@linuxwimax.org +L: wimax@linuxwimax.org (subscribers-only) S: Supported W: http://linuxwimax.org F: Documentation/wimax/README.wimax -- cgit v1.2.3 From c0d995aafc1df69a3ae2f7976f35372b16ea915f Mon Sep 17 00:00:00 2001 From: "Opensource [Steve Twiss]" Date: Thu, 3 Apr 2014 14:48:51 -0700 Subject: MAINTAINERS: addition of Dialog Semiconductor files Dialog Semiconductor Ltd would like to add a new section called DIALOG SEMICONDUCTOR DRIVERS which contains a new e-mail address that can cover all Dialog supported drivers: support.opensource@diasemi.com. Signed-off-by: Opensource [Steve Twiss] Signed-off-by: David Dajun Chen Cc: Mark Brown Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 140084a227f0..4b2734b35843 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2730,6 +2730,31 @@ F: include/linux/device-mapper.h F: include/linux/dm-*.h F: include/uapi/linux/dm-*.h +DIALOG SEMICONDUCTOR DRIVERS +M: Support Opensource +W: http://www.dialog-semiconductor.com/products +S: Supported +F: Documentation/hwmon/da90?? 
+F: drivers/gpio/gpio-da90??.c +F: drivers/hwmon/da90??-hwmon.c +F: drivers/input/misc/da90??_onkey.c +F: drivers/input/touchscreen/da9052_tsi.c +F: drivers/leds/leds-da90??.c +F: drivers/mfd/da903x.c +F: drivers/mfd/da90??-*.c +F: drivers/power/da9052-battery.c +F: drivers/regulator/da903x.c +F: drivers/regulator/da9???-regulator.[ch] +F: drivers/rtc/rtc-da90??.c +F: drivers/video/backlight/da90??_bl.c +F: drivers/watchdog/da90??_wdt.c +F: include/linux/mfd/da903x.h +F: include/linux/mfd/da9052/ +F: include/linux/mfd/da9055/ +F: include/linux/mfd/da9063/ +F: include/sound/da[79]*.h +F: sound/soc/codecs/da[79]*.[ch] + DIGI NEO AND CLASSIC PCI PRODUCTS M: Lidza Louina L: driverdev-devel@linuxdriverproject.org -- cgit v1.2.3 From 8a72f57efa1b0a12502f152231e94c8254a5ecd9 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 3 Apr 2014 14:48:52 -0700 Subject: MAINTAINERS: microblaze: use LKML as mailing list microblaze-uclinux mailing list is almost dead and it is just causing troubles for non subscribers which are getting email about waiting for moderator. Approval never happens. Move it to LKML. Signed-off-by: Michal Simek Reported-by: Richard Guy Briggs Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4b2734b35843..ad3b5a459ebc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5761,7 +5761,6 @@ F: fs/imgdafs/ MICROBLAZE ARCHITECTURE M: Michal Simek -L: microblaze-uclinux@itee.uq.edu.au (moderated for non-subscribers) W: http://www.monstr.eu/fdt/ T: git git://git.monstr.eu/linux-2.6-microblaze.git S: Supported -- cgit v1.2.3 From 9e06f631ecef84a949a89765e0787c61d63c3f56 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:48:53 -0700 Subject: MAINTAINERS: remove Venkatesh from HPET, move to CREDITS Seems he's gone off to bigger/better things. So long, etc... Signed-off-by: Joe Perches Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 3 +++ MAINTAINERS | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index cef042453fa4..61eabf7ca376 100644 --- a/CREDITS +++ b/CREDITS @@ -2711,6 +2711,9 @@ N: Greg Page E: gpage@sovereign.org D: IPX development and support +N: Venkatesh Pallipadi (Venki) +D: x86/HPET + N: David Parsons E: orc@pell.chi.il.us D: improved memory detection code. diff --git a/MAINTAINERS b/MAINTAINERS index ad3b5a459ebc..2f2675493b3a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4155,8 +4155,7 @@ F: include/linux/hpet.h F: include/uapi/linux/hpet.h HPET: x86 -M: "Venkatesh Pallipadi (Venki)" -S: Maintained +S: Orphan F: arch/x86/kernel/hpet.c F: arch/x86/include/asm/hpet.h -- cgit v1.2.3 From a55944ca82d287ca099ca90413af857af9086773 Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Thu, 3 Apr 2014 14:48:54 -0700 Subject: backlight: update bd state & fb_blank properties when necessary We don't have to update the state and fb_blank properties of a backlight device every time a blanking or unblanking event comes because they may have already been what we want. Another thought is that one backlight device may be shared by multiple framebuffers. The backlight driver should take the backlight device as a resource shared by all the associated framebuffers. This patch adds some logic to record each framebuffer's backlight usage to determine the backlight device use count and whether the two properties should be updated or not. 
To be more specific, only one unblank operation on a certain blanked framebuffer may increase the backlight device's use count by one, while one blank operation on a certain unblanked framebuffer may decrease the use count by one, because the userspace is likely to unblank an unblanked framebuffer or blank a blanked framebuffer. Signed-off-by: Liu Ying Cc: Jingoo Han Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Tomi Valkeinen Cc: Jani Nikula Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/backlight.c | 23 ++++++++++++++++++----- include/linux/backlight.h | 6 ++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 5d05555fe841..efc9a2da918d 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -41,6 +41,8 @@ static int fb_notifier_callback(struct notifier_block *self, { struct backlight_device *bd; struct fb_event *evdata = data; + int node = evdata->info->node; + int fb_blank = 0; /* If we aren't interested in this event, skip it immediately ... */ if (event != FB_EVENT_BLANK && event != FB_EVENT_CONBLANK) @@ -51,11 +53,22 @@ static int fb_notifier_callback(struct notifier_block *self, if (bd->ops) if (!bd->ops->check_fb || bd->ops->check_fb(bd, evdata->info)) { - bd->props.fb_blank = *(int *)evdata->data; - if (bd->props.fb_blank == FB_BLANK_UNBLANK) - bd->props.state &= ~BL_CORE_FBBLANK; - else - bd->props.state |= BL_CORE_FBBLANK; + fb_blank = *(int *)evdata->data; + if (fb_blank == FB_BLANK_UNBLANK && + !bd->fb_bl_on[node]) { + bd->fb_bl_on[node] = true; + if (!bd->use_count++) { + bd->props.state &= ~BL_CORE_FBBLANK; + bd->props.fb_blank = FB_BLANK_UNBLANK; + } + } else if (fb_blank != FB_BLANK_UNBLANK && + bd->fb_bl_on[node]) { + bd->fb_bl_on[node] = false; + if (!(--bd->use_count)) { + bd->props.state |= BL_CORE_FBBLANK; + bd->props.fb_blank = fb_blank; + } + } backlight_update_status(bd); } mutex_unlock(&bd->ops_lock); diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 5f9cd963213d..72647429adf6 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -9,6 +9,7 @@ #define _LINUX_BACKLIGHT_H #include +#include #include #include @@ -104,6 +105,11 @@ struct backlight_device { struct list_head entry; struct device dev; + + /* Multiple framebuffers may share one backlight device */ + bool fb_bl_on[FB_MAX]; + + int use_count; }; static inline void backlight_update_status(struct backlight_device *bd) -- cgit v1.2.3 From 8c16f3303c3410d4af19c8bdc73b74c83c042e87 Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Thu, 3 Apr 2014 14:48:55 -0700 Subject: backlight: update backlight status when necessary We don't have to update a backlight status every time a blanking or unblanking event comes because the backlight status may have already been what we want. Another thought is that one backlight device may be shared by multiple framebuffers. We don't hope blanking one of the framebuffers may turn the backlight off for all the other framebuffers, since they are likely being active to display something. This patch makes the backlight status be updated only when the relevant backlight device's use count changes from zero to one or from one to zero. 
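The bookkeeping described in these two patches reduces to a per-framebuffer flag plus a shared use count, with the backlight touched only on the 0-to-1 and 1-to-0 transitions. A condensed sketch of that logic follows; it mirrors the diffs below but is not the patch itself, and turn_backlight_on()/turn_backlight_off() are placeholders for the property updates plus the backlight_update_status() call in the real code:

        if (fb_blank == FB_BLANK_UNBLANK && !bd->fb_bl_on[node]) {
                bd->fb_bl_on[node] = true;
                if (!bd->use_count++)           /* first framebuffer to unblank */
                        turn_backlight_on(bd);
        } else if (fb_blank != FB_BLANK_UNBLANK && bd->fb_bl_on[node]) {
                bd->fb_bl_on[node] = false;
                if (!--bd->use_count)           /* last active framebuffer blanked */
                        turn_backlight_off(bd);
        }

Blanking an already-blanked framebuffer (or unblanking an already-unblanked one) leaves both the flag and the count untouched, which is what keeps repeated events from switching off a backlight that other framebuffers still need.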
Signed-off-by: Liu Ying Cc: Jingoo Han Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Tomi Valkeinen Cc: Jani Nikula Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/backlight.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index efc9a2da918d..27d3cf255e78 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -34,7 +34,7 @@ static const char *const backlight_types[] = { defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE)) /* This callback gets called when something important happens inside a * framebuffer driver. We're looking if that important event is blanking, - * and if it is, we're switching backlight power as well ... + * and if it is and necessary, we're switching backlight power as well ... */ static int fb_notifier_callback(struct notifier_block *self, unsigned long event, void *data) @@ -60,6 +60,7 @@ static int fb_notifier_callback(struct notifier_block *self, if (!bd->use_count++) { bd->props.state &= ~BL_CORE_FBBLANK; bd->props.fb_blank = FB_BLANK_UNBLANK; + backlight_update_status(bd); } } else if (fb_blank != FB_BLANK_UNBLANK && bd->fb_bl_on[node]) { @@ -67,9 +68,9 @@ static int fb_notifier_callback(struct notifier_block *self, if (!(--bd->use_count)) { bd->props.state |= BL_CORE_FBBLANK; bd->props.fb_blank = fb_blank; + backlight_update_status(bd); } } - backlight_update_status(bd); } mutex_unlock(&bd->ops_lock); return 0; -- cgit v1.2.3 From 78b35cf81e5d8b449f09aef0a65ac2d521dcce36 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:48:56 -0700 Subject: backlight: aat2870: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Acked-by: Jinyoung Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/aat2870_bl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/video/backlight/aat2870_bl.c b/drivers/video/backlight/aat2870_bl.c index ee0c0a982e4e..ec5350f2c28a 100644 --- a/drivers/video/backlight/aat2870_bl.c +++ b/drivers/video/backlight/aat2870_bl.c @@ -149,8 +149,6 @@ static int aat2870_bl_probe(struct platform_device *pdev) sizeof(struct aat2870_bl_driver_data), GFP_KERNEL); if (!aat2870_bl) { - dev_err(&pdev->dev, - "Failed to allocate memory for aat2870 backlight\n"); ret = -ENOMEM; goto out; } -- cgit v1.2.3 From ba464d0c88af59ccd35929eff010747de0fb0dec Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:48:57 -0700 Subject: backlight: adp8860: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. 
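The shape converged on in this and the surrounding cleanups is the bare allocation check: the allocator core already emits a generic OOM warning (unless __GFP_NOWARN is passed), so a driver-local dev_err() adds nothing. A hedged sketch of the preferred pattern, with foo and pdev standing in for each driver's own names:

        foo = devm_kzalloc(&pdev->dev, sizeof(*foo), GFP_KERNEL);
        if (!foo)
                return -ENOMEM;         /* no extra message needed */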
Signed-off-by: Jingoo Han Acked-by: Michael Hennerich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/adp8860_bl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/backlight/adp8860_bl.c b/drivers/video/backlight/adp8860_bl.c index 9d656717d0f7..be8d83deca7d 100644 --- a/drivers/video/backlight/adp8860_bl.c +++ b/drivers/video/backlight/adp8860_bl.c @@ -224,10 +224,8 @@ static int adp8860_led_probe(struct i2c_client *client) led = devm_kzalloc(&client->dev, sizeof(*led) * pdata->num_leds, GFP_KERNEL); - if (led == NULL) { - dev_err(&client->dev, "failed to alloc memory\n"); + if (led == NULL) return -ENOMEM; - } ret = adp8860_write(client, ADP8860_ISCFR, pdata->led_fade_law); ret = adp8860_write(client, ADP8860_ISCT1, -- cgit v1.2.3 From 0167a956984e4ef1963fb76aa8bcd9481491d2f4 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:48:58 -0700 Subject: backlight: adp8870: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Acked-by: Michael Hennerich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/adp8870_bl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/backlight/adp8870_bl.c b/drivers/video/backlight/adp8870_bl.c index 63707205326b..251af4d38d86 100644 --- a/drivers/video/backlight/adp8870_bl.c +++ b/drivers/video/backlight/adp8870_bl.c @@ -246,10 +246,8 @@ static int adp8870_led_probe(struct i2c_client *client) led = devm_kzalloc(&client->dev, pdata->num_leds * sizeof(*led), GFP_KERNEL); - if (led == NULL) { - dev_err(&client->dev, "failed to alloc memory\n"); + if (led == NULL) return -ENOMEM; - } ret = adp8870_write(client, ADP8870_ISCLAW, pdata->led_fade_law); if (ret) -- cgit v1.2.3 From 3b20b894d4b3424824c8138e564d6f97bea0f114 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:48:59 -0700 Subject: backlight: corgi_lcd: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/corgi_lcd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/backlight/corgi_lcd.c b/drivers/video/backlight/corgi_lcd.c index db8db5fa6583..51d18d637e2b 100644 --- a/drivers/video/backlight/corgi_lcd.c +++ b/drivers/video/backlight/corgi_lcd.c @@ -543,10 +543,8 @@ static int corgi_lcd_probe(struct spi_device *spi) } lcd = devm_kzalloc(&spi->dev, sizeof(struct corgi_lcd), GFP_KERNEL); - if (!lcd) { - dev_err(&spi->dev, "failed to allocate memory\n"); + if (!lcd) return -ENOMEM; - } lcd->spi_dev = spi; -- cgit v1.2.3 From aff70f9ab1c3c1ce47ccc1b65880195be1dab8d4 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:00 -0700 Subject: backlight: hx8357: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. 
Signed-off-by: Jingoo Han Acked-by: Maxime Ripard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/hx8357.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c index 985e854e244b..23f50b92a930 100644 --- a/drivers/video/backlight/hx8357.c +++ b/drivers/video/backlight/hx8357.c @@ -587,10 +587,8 @@ static int hx8357_probe(struct spi_device *spi) int i, ret; lcd = devm_kzalloc(&spi->dev, sizeof(*lcd), GFP_KERNEL); - if (!lcd) { - dev_err(&spi->dev, "Couldn't allocate lcd internal structure!\n"); + if (!lcd) return -ENOMEM; - } ret = spi_setup(spi); if (ret < 0) { -- cgit v1.2.3 From 619e1b491b7065d5db6d547be12a8d073825e992 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:01 -0700 Subject: backlight: ili922x: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Acked-by: Stefano Babic Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/ili922x.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/backlight/ili922x.c b/drivers/video/backlight/ili922x.c index 73464e4b4c74..ea67fe199e34 100644 --- a/drivers/video/backlight/ili922x.c +++ b/drivers/video/backlight/ili922x.c @@ -482,10 +482,8 @@ static int ili922x_probe(struct spi_device *spi) u16 reg = 0; ili = devm_kzalloc(&spi->dev, sizeof(*ili), GFP_KERNEL); - if (!ili) { - dev_err(&spi->dev, "cannot alloc priv data\n"); + if (!ili) return -ENOMEM; - } ili->spi = spi; spi_set_drvdata(spi, ili); -- cgit v1.2.3 From a82bdcfb3ba25843320cde0aa624532e90fa2c43 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:02 -0700 Subject: backlight: ili9320: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/ili9320.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/backlight/ili9320.c b/drivers/video/backlight/ili9320.c index e2b8b40a9bd9..2cf39e6d519d 100644 --- a/drivers/video/backlight/ili9320.c +++ b/drivers/video/backlight/ili9320.c @@ -219,10 +219,8 @@ int ili9320_probe_spi(struct spi_device *spi, /* allocate and initialse our state */ ili = devm_kzalloc(&spi->dev, sizeof(struct ili9320), GFP_KERNEL); - if (ili == NULL) { - dev_err(dev, "no memory for device\n"); + if (ili == NULL) return -ENOMEM; - } ili->access.spi.id = ILI9320_SPI_IDCODE | ILI9320_SPI_ID(1); -- cgit v1.2.3 From 1bd04820b921b4aac9a8793388b0c14607cd860a Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:02 -0700 Subject: backlight: l4f00242t03: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. 
Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/l4f00242t03.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/video/backlight/l4f00242t03.c b/drivers/video/backlight/l4f00242t03.c index 63e763828e0e..5fa2649c9631 100644 --- a/drivers/video/backlight/l4f00242t03.c +++ b/drivers/video/backlight/l4f00242t03.c @@ -181,11 +181,8 @@ static int l4f00242t03_probe(struct spi_device *spi) priv = devm_kzalloc(&spi->dev, sizeof(struct l4f00242t03_priv), GFP_KERNEL); - - if (priv == NULL) { - dev_err(&spi->dev, "No memory for this device.\n"); + if (priv == NULL) return -ENOMEM; - } spi_set_drvdata(spi, priv); spi->bits_per_word = 9; -- cgit v1.2.3 From 44d516f99b96d6e55868fd4e87679f1cef6f1344 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:03 -0700 Subject: backlight: lm3533_bl: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Acked-by: Johan Hovold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/lm3533_bl.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/video/backlight/lm3533_bl.c b/drivers/video/backlight/lm3533_bl.c index 187d1c283c1d..cff1fbe89a1b 100644 --- a/drivers/video/backlight/lm3533_bl.c +++ b/drivers/video/backlight/lm3533_bl.c @@ -296,11 +296,8 @@ static int lm3533_bl_probe(struct platform_device *pdev) } bl = devm_kzalloc(&pdev->dev, sizeof(*bl), GFP_KERNEL); - if (!bl) { - dev_err(&pdev->dev, - "failed to allocate memory for backlight\n"); + if (!bl) return -ENOMEM; - } bl->lm3533 = lm3533; bl->id = pdev->id; -- cgit v1.2.3 From e1094f9f74cc728c06160a2600e77a25bc5fac68 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:04 -0700 Subject: backlight: lms283gf05: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Acked-by: Marek Vasut Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/lms283gf05.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/backlight/lms283gf05.c b/drivers/video/backlight/lms283gf05.c index de8832504f68..14590c54aedf 100644 --- a/drivers/video/backlight/lms283gf05.c +++ b/drivers/video/backlight/lms283gf05.c @@ -168,10 +168,8 @@ static int lms283gf05_probe(struct spi_device *spi) st = devm_kzalloc(&spi->dev, sizeof(struct lms283gf05_state), GFP_KERNEL); - if (st == NULL) { - dev_err(&spi->dev, "No memory for device state\n"); + if (st == NULL) return -ENOMEM; - } ld = devm_lcd_device_register(&spi->dev, "lms283gf05", &spi->dev, st, &lms_ops); -- cgit v1.2.3 From c6915be435f93d725cd158d1a70251d3ff70737e Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:05 -0700 Subject: backlight: platform_lcd: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. 
Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/platform_lcd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/backlight/platform_lcd.c b/drivers/video/backlight/platform_lcd.c index d01884d4f1bf..c3d2e209fc8f 100644 --- a/drivers/video/backlight/platform_lcd.c +++ b/drivers/video/backlight/platform_lcd.c @@ -94,10 +94,8 @@ static int platform_lcd_probe(struct platform_device *pdev) plcd = devm_kzalloc(&pdev->dev, sizeof(struct platform_lcd), GFP_KERNEL); - if (!plcd) { - dev_err(dev, "no memory for state\n"); + if (!plcd) return -ENOMEM; - } plcd->us = dev; plcd->pdata = pdata; -- cgit v1.2.3 From 3d8e4b401cdf6fcfdd1a897f585c88afc2e386c7 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:06 -0700 Subject: backlight: tps65217_bl: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/tps65217_bl.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/video/backlight/tps65217_bl.c b/drivers/video/backlight/tps65217_bl.c index cbba37e6836e..595dcf561020 100644 --- a/drivers/video/backlight/tps65217_bl.c +++ b/drivers/video/backlight/tps65217_bl.c @@ -200,7 +200,6 @@ tps65217_bl_parse_dt(struct platform_device *pdev) pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) { - dev_err(&pdev->dev, "failed to allocate platform data\n"); err = ERR_PTR(-ENOMEM); goto err; } @@ -296,10 +295,8 @@ static int tps65217_bl_probe(struct platform_device *pdev) tps65217_bl = devm_kzalloc(&pdev->dev, sizeof(*tps65217_bl), GFP_KERNEL); - if (tps65217_bl == NULL) { - dev_err(&pdev->dev, "allocation of struct tps65217_bl failed\n"); + if (tps65217_bl == NULL) return -ENOMEM; - } tps65217_bl->tps = tps; tps65217_bl->dev = &pdev->dev; -- cgit v1.2.3 From b104d6a5a82a56dbba8f743144e21d63ad181773 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Apr 2014 14:49:07 -0700 Subject: lib/devres.c: fix some sparse warnings Having a discussion about sparse warnings in the kernel, and that we should clean them up, I decided to pick a random file to do so. This happened to be devres.c which gives the following warnings: CHECK lib/devres.c lib/devres.c:83:9: warning: cast removes address space of expression lib/devres.c:117:31: warning: incorrect type in return expression (different address spaces) lib/devres.c:117:31: expected void [noderef] * lib/devres.c:117:31: got void * lib/devres.c:125:31: warning: incorrect type in return expression (different address spaces) lib/devres.c:125:31: expected void [noderef] * lib/devres.c:125:31: got void * lib/devres.c:136:26: warning: incorrect type in assignment (different address spaces) lib/devres.c:136:26: expected void [noderef] *[assigned] dest_ptr lib/devres.c:136:26: got void * lib/devres.c:226:9: warning: cast removes address space of expression Mostly it's just the use of typecasting to void * without adding __force, or returning ERR_PTR(-ESOMEERR) without typecasting to a __iomem type. I added a helper macro IOMEM_ERR_PTR() that does the typecast to make the code a little nicer than adding ugly typecasts to the code. 
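The macro's job is to produce an error pointer without dropping sparse's __iomem address-space annotation. A rough usage sketch built around the definition added by the patch; example_map() is an invented name, and the real call sites are in the diff below:

        #define IOMEM_ERR_PTR(err) (__force void __iomem *)ERR_PTR(err)

        static void __iomem *example_map(struct resource *res)
        {
                if (!res || resource_type(res) != IORESOURCE_MEM)
                        return IOMEM_ERR_PTR(-EINVAL); /* still typed __iomem */

                return ioremap(res->start, resource_size(res));
        }

Callers keep testing the result with IS_ERR()/PTR_ERR() as usual, and sparse stays quiet because no address-space annotation is silently cast away.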
Signed-off-by: Steven Rostedt Cc: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/devres.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/devres.c b/lib/devres.c index 823533138fa0..48cb3c7bd7de 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -81,11 +81,13 @@ EXPORT_SYMBOL(devm_ioremap_nocache); void devm_iounmap(struct device *dev, void __iomem *addr) { WARN_ON(devres_destroy(dev, devm_ioremap_release, devm_ioremap_match, - (void *)addr)); + (__force void *)addr)); iounmap(addr); } EXPORT_SYMBOL(devm_iounmap); +#define IOMEM_ERR_PTR(err) (__force void __iomem *)ERR_PTR(err) + /** * devm_ioremap_resource() - check, request region, and ioremap resource * @dev: generic device to handle the resource for @@ -114,7 +116,7 @@ void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res) if (!res || resource_type(res) != IORESOURCE_MEM) { dev_err(dev, "invalid resource\n"); - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } size = resource_size(res); @@ -122,7 +124,7 @@ void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res) if (!devm_request_mem_region(dev, res->start, size, name)) { dev_err(dev, "can't request region for resource %pR\n", res); - return ERR_PTR(-EBUSY); + return IOMEM_ERR_PTR(-EBUSY); } if (res->flags & IORESOURCE_CACHEABLE) @@ -133,7 +135,7 @@ void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res) if (!dest_ptr) { dev_err(dev, "ioremap failed for resource %pR\n", res); devm_release_mem_region(dev, res->start, size); - dest_ptr = ERR_PTR(-ENOMEM); + dest_ptr = IOMEM_ERR_PTR(-ENOMEM); } return dest_ptr; @@ -224,7 +226,7 @@ void devm_ioport_unmap(struct device *dev, void __iomem *addr) { ioport_unmap(addr); WARN_ON(devres_destroy(dev, devm_ioport_map_release, - devm_ioport_map_match, (void *)addr)); + devm_ioport_map_match, (__force void *)addr)); } EXPORT_SYMBOL(devm_ioport_unmap); #endif /* CONFIG_HAS_IOPORT */ -- cgit v1.2.3 From d3d47eb265c2bc6f4a6f1bc6971b84d27035b964 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Apr 2014 14:49:08 -0700 Subject: lib/random32.c: minor cleanups and kdoc fix These are just some very minor and misc cleanups in the PRNG. In prandom_u32() we store the result in an unsigned long which is unnecessary as it should be u32 instead that we get from prandom_u32_state(). prandom_bytes_state()'s comment is in kdoc format, so change it into such as it's done everywhere else. Also, use the normal comment style for the header comment. Last but not least for readability, add some newlines. 
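For reference, the kernel-doc format the comment is converted to opens with /** and documents each parameter on an @name: line. A minimal invented example (example_fill is not a real function):

        /**
         * example_fill - fill a buffer with pseudo-random bytes
         * @buf:   destination buffer
         * @bytes: number of bytes to write
         *
         * Not suitable for cryptographic use.
         */
        void example_fill(void *buf, size_t bytes);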
Signed-off-by: Daniel Borkmann Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/random32.c | 76 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/lib/random32.c b/lib/random32.c index 614896778700..fa5da61ce7ad 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -1,37 +1,35 @@ /* - This is a maximally equidistributed combined Tausworthe generator - based on code from GNU Scientific Library 1.5 (30 Jun 2004) - - lfsr113 version: - - x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n) - - s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n << 6) ^ s1_n) >> 13)) - s2_{n+1} = (((s2_n & 4294967288) << 2) ^ (((s2_n << 2) ^ s2_n) >> 27)) - s3_{n+1} = (((s3_n & 4294967280) << 7) ^ (((s3_n << 13) ^ s3_n) >> 21)) - s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n << 3) ^ s4_n) >> 12)) - - The period of this generator is about 2^113 (see erratum paper). - - From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe - Generators", Mathematics of Computation, 65, 213 (1996), 203--213: - http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps - ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps - - There is an erratum in the paper "Tables of Maximally - Equidistributed Combined LFSR Generators", Mathematics of - Computation, 68, 225 (1999), 261--269: - http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps - - ... the k_j most significant bits of z_j must be non- - zero, for each j. (Note: this restriction also applies to the - computer code given in [4], but was mistakenly not mentioned in - that paper.) - - This affects the seeding procedure by imposing the requirement - s1 > 1, s2 > 7, s3 > 15, s4 > 127. - -*/ + * This is a maximally equidistributed combined Tausworthe generator + * based on code from GNU Scientific Library 1.5 (30 Jun 2004) + * + * lfsr113 version: + * + * x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n) + * + * s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n << 6) ^ s1_n) >> 13)) + * s2_{n+1} = (((s2_n & 4294967288) << 2) ^ (((s2_n << 2) ^ s2_n) >> 27)) + * s3_{n+1} = (((s3_n & 4294967280) << 7) ^ (((s3_n << 13) ^ s3_n) >> 21)) + * s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n << 3) ^ s4_n) >> 12)) + * + * The period of this generator is about 2^113 (see erratum paper). + * + * From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe + * Generators", Mathematics of Computation, 65, 213 (1996), 203--213: + * http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps + * ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps + * + * There is an erratum in the paper "Tables of Maximally Equidistributed + * Combined LFSR Generators", Mathematics of Computation, 68, 225 (1999), + * 261--269: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps + * + * ... the k_j most significant bits of z_j must be non-zero, + * for each j. (Note: this restriction also applies to the + * computer code given in [4], but was mistakenly not mentioned + * in that paper.) + * + * This affects the seeding procedure by imposing the requirement + * s1 > 1, s2 > 7, s3 > 15, s4 > 127. 
+ */ #include #include @@ -75,15 +73,17 @@ EXPORT_SYMBOL(prandom_u32_state); */ u32 prandom_u32(void) { - unsigned long r; struct rnd_state *state = &get_cpu_var(net_rand_state); - r = prandom_u32_state(state); + u32 res; + + res = prandom_u32_state(state); put_cpu_var(state); - return r; + + return res; } EXPORT_SYMBOL(prandom_u32); -/* +/** * prandom_bytes_state - get the requested number of pseudo-random bytes * * @state: pointer to state structure holding seeded state. @@ -204,6 +204,7 @@ static int __init prandom_init(void) prandom_seed_very_weak(state, (i + jiffies) ^ random_get_entropy()); prandom_warmup(state); } + return 0; } core_initcall(prandom_init); @@ -259,6 +260,7 @@ static void __prandom_reseed(bool late) if (latch && !late) goto out; + latch = true; for_each_possible_cpu(i) { -- cgit v1.2.3 From 3c516cdd02633bda90759ea227e6abb003f8c20a Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:49:09 -0700 Subject: lib/clz_ctz.c: add prototype declarations in lib/clz_ctz.c Add prototype declarations of functions in lib/clz_ctz.c. These functions are required by GCC builtins and hence can not be removed despite of their unreferenced appearance in kernel source. This eliminates the following warning in lib/clz_ctz.c: lib/clz_ctz.c:16:12: warning: no previous prototype for `__ctzsi2' [-Wmissing-prototypes] lib/clz_ctz.c:22:12: warning: no previous prototype for `__clzsi2' [-Wmissing-prototypes] lib/clz_ctz.c:44:12: warning: no previous prototype for `__clzdi2' [-Wmissing-prototypes] lib/clz_ctz.c:50:12: warning: no previous prototype for `__ctzdi2' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: Chanho Min Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/clz_ctz.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c index a8f8379eb49f..2e11e48446ab 100644 --- a/lib/clz_ctz.c +++ b/lib/clz_ctz.c @@ -6,6 +6,9 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. + * The functions in this file aren't called directly, but are required by + * GCC builtins such as __builtin_ctz, and therefore they can't be removed + * despite appearing unreferenced in kernel source. * * __c[lt]z[sd]i2 can be overridden by linking arch-specific versions. */ @@ -13,18 +16,22 @@ #include #include +int __weak __ctzsi2(int val); int __weak __ctzsi2(int val) { return __ffs(val); } EXPORT_SYMBOL(__ctzsi2); +int __weak __clzsi2(int val); int __weak __clzsi2(int val) { return 32 - fls(val); } EXPORT_SYMBOL(__clzsi2); +int __weak __clzdi2(long val); +int __weak __ctzdi2(long val); #if BITS_PER_LONG == 32 int __weak __clzdi2(long val) -- cgit v1.2.3 From af661e836039cbc2aadb3bccfd7d3ddfc1bcc456 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:49:10 -0700 Subject: lib/decompress_inflate.c: include appropriate header file Include appropriate header file include/linux/decompress/inflate.h in lib/decompress_inflate.c because it has prototype declaration of function defined in lib/decompress_inflate.c. Also, fix the guard around the header file include/linux/decompress/inflate.h to use a more unique guard symbol. This avoids conflict with the INFLATE_H defined by zlib_inflate/inflate.h. 
This eliminates the following warning in lib/decompress_inflate.c: lib/decompress_inflate.c:35:17: warning: no previous prototype for `gunzip' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/inflate.h | 4 ++-- lib/decompress_inflate.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/decompress/inflate.h b/include/linux/decompress/inflate.h index 8c0aef1ba5f5..1d0aedef9822 100644 --- a/include/linux/decompress/inflate.h +++ b/include/linux/decompress/inflate.h @@ -1,5 +1,5 @@ -#ifndef INFLATE_H -#define INFLATE_H +#ifndef LINUX_DECOMPRESS_INFLATE_H +#define LINUX_DECOMPRESS_INFLATE_H int gunzip(unsigned char *inbuf, int len, int(*fill)(void*, unsigned int), diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index d619b28c456f..0edfd742a154 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -19,6 +19,7 @@ #include "zlib_inflate/inflate.h" #include "zlib_inflate/infutil.h" +#include #endif /* STATIC */ -- cgit v1.2.3 From 43c1d77c393205645cdd6337e657341a5216a7a8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:11 -0700 Subject: checkpatch: add test for long udelay Holger reported: : The macro udelay cannot handle large values because of loss-of-precision. : : IMHO udelay on ARM is broken, because it also cannot work with fast : ARM processors (where bogomips >= 3355, which is in sight now). It's : just not broken enough that someone did something against it ... so : the current kludge is good enough. Until then, warn on long udelay uses. Also fix uses of $line that should have been $herecurr. Signed-off-by: Joe Perches Reported-by: Holger Schurig Cc: Sujith Manoharan Cc: John Linville Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 464dcef79b35..7d3bc2f3c326 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3912,10 +3912,15 @@ sub process { # prefer usleep_range over udelay if ($line =~ /\budelay\s*\(\s*(\d+)\s*\)/) { + my $delay = $1; # ignore udelay's < 10, however - if (! ($1 < 10) ) { + if (! ($delay < 10) ) { CHK("USLEEP_RANGE", - "usleep_range is preferred over udelay; see Documentation/timers/timers-howto.txt\n" . $line); + "usleep_range is preferred over udelay; see Documentation/timers/timers-howto.txt\n" . $herecurr); + } + if ($delay > 2000) { + WARN("LONG_UDELAY", + "long udelay - prefer mdelay; see arch/arm/include/asm/delay.h\n" . $herecurr); } } @@ -3923,7 +3928,7 @@ sub process { if ($line =~ /\bmsleep\s*\((\d+)\);/) { if ($1 < 20) { WARN("MSLEEP", - "msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.txt\n" . $line); + "msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.txt\n" . $herecurr); } } -- cgit v1.2.3 From 91f72e9c6ed1433b65c2944a2953968088607987 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:12 -0700 Subject: checkpatch: don't warn on some function pointer return styles Checks for some function pointer return styles are too strict. Fix them. Multiple spaces after function pointer return types are allowed. int (*foo)(int bar) Spaces after function pointer returns of pointer types are not required. 
int *(*foo)(int bar) Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7d3bc2f3c326..19591af206c5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2848,10 +2848,7 @@ sub process { # Function pointer declarations # check spacing between type, funcptr, and args # canonical declaration is "type (*funcptr)(args...)" -# -# the $Declare variable will capture all spaces after the type -# so check it for trailing missing spaces or multiple spaces - if ($line =~ /^.\s*($Declare)\((\s*)\*(\s*)$Ident(\s*)\)(\s*)\(/) { + if ($line =~ /^.\s*($Declare)\((\s*)\*(\s*)($Ident)(\s*)\)(\s*)\(/) { my $declare = $1; my $pre_pointer_space = $2; my $post_pointer_space = $3; @@ -2859,16 +2856,30 @@ sub process { my $post_funcname_space = $5; my $pre_args_space = $6; - if ($declare !~ /\s$/) { +# the $Declare variable will capture all spaces after the type +# so check it for a missing trailing missing space but pointer return types +# don't need a space so don't warn for those. + my $post_declare_space = ""; + if ($declare =~ /(\s+)$/) { + $post_declare_space = $1; + $declare = rtrim($declare); + } + if ($declare !~ /\*$/ && $post_declare_space =~ /^$/) { WARN("SPACING", "missing space after return type\n" . $herecurr); + $post_declare_space = " "; } # unnecessary space "type (*funcptr)(args...)" - elsif ($declare =~ /\s{2,}$/) { - WARN("SPACING", - "Multiple spaces after return type\n" . $herecurr); - } +# This test is not currently implemented because these declarations are +# equivalent to +# int foo(int bar, ...) +# and this is form shouldn't/doesn't generate a checkpatch warning. +# +# elsif ($declare =~ /\s{2,}$/) { +# WARN("SPACING", +# "Multiple spaces after return type\n" . $herecurr); +# } # unnecessary space "type ( *funcptr)(args...)" if (defined $pre_pointer_space && @@ -2900,7 +2911,7 @@ sub process { if (show_type("SPACING") && $fix) { $fixed[$linenr - 1] =~ - s/^(.\s*$Declare)\(\s*\*\s*($Ident)\s*\)\s*\(/rtrim($1) . " " . "\(\*$2\)\("/ex; + s/^(.\s*)$Declare\s*\(\s*\*\s*$Ident\s*\)\s*\(/$1 . $declare . $post_declare_space . '(*' . $funcname . ')('/ex; } } -- cgit v1.2.3 From 2435880fe5cd51cd73c403aa4c07eadc3de799db Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:13 -0700 Subject: checkpatch: add checks for constant non-octal permissions umode_t permissions are sometimes mistakenly written with decimal constants. Verify that numeric permissions are using octal. Add a list of the most commonly used functions and macros that have umode_t permissions and the argument position. Add a $Octal type to $Constant. Allow $LvalOrFunc to be a pointer indirection too. 
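The mistake this check catches is a mode written as a decimal literal, which encodes different permission bits than intended (decimal 644 is octal 01204, not rw-r--r--). A short hedged illustration with an invented parameter name; the two calls are shown side by side only for contrast:

        static int example_param;

        /* flagged: decimal constant used as a mode */
        module_param(example_param, int, 644);

        /* preferred: octal */
        module_param(example_param, int, 0644);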
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 19591af206c5..0b40af57e71e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -289,11 +289,12 @@ our $Int_type = qr{(?i)llu|ull|ll|lu|ul|l|u}; our $Binary = qr{(?i)0b[01]+$Int_type?}; our $Hex = qr{(?i)0x[0-9a-f]+$Int_type?}; our $Int = qr{[0-9]+$Int_type?}; +our $Octal = qr{0[0-7]+$Int_type?}; our $Float_hex = qr{(?i)0x[0-9a-f]+p-?[0-9]+[fl]?}; our $Float_dec = qr{(?i)(?:[0-9]+\.[0-9]*|[0-9]*\.[0-9]+)(?:e-?[0-9]+)?[fl]?}; our $Float_int = qr{(?i)[0-9]+e-?[0-9]+[fl]?}; our $Float = qr{$Float_hex|$Float_dec|$Float_int}; -our $Constant = qr{$Float|$Binary|$Hex|$Int}; +our $Constant = qr{$Float|$Binary|$Octal|$Hex|$Int}; our $Assignment = qr{\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Arithmetic = qr{\+|-|\*|\/|%}; @@ -378,6 +379,15 @@ our @modifierList = ( qr{fastcall}, ); +our @mode_permission_funcs = ( + ["module_param", 3], + ["module_param_(?:array|named|string)", 4], + ["module_param_array_named", 5], + ["debugfs_create_(?:file|u8|u16|u32|u64|x8|x16|x32|x64|size_t|atomic_t|bool|blob|regset32|u32_array)", 2], + ["proc_create(?:_data|)", 2], + ["(?:CLASS|DEVICE|SENSOR)_ATTR", 2], +); + our $allowed_asm_includes = qr{(?x: irq| memory @@ -423,7 +433,7 @@ our $Typecast = qr{\s*(\(\s*$NonptrType\s*\)){0,1}\s*}; # Any use must be runtime checked with $^V our $balanced_parens = qr/(\((?:[^\(\)]++|(?-1))*\))/; -our $LvalOrFunc = qr{($Lval)\s*($balanced_parens{0,1})\s*}; +our $LvalOrFunc = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*}; our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)}; sub deparenthesize { @@ -4473,6 +4483,28 @@ sub process { WARN("EXPORTED_WORLD_WRITABLE", "Exporting world writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr); } + + foreach my $entry (@mode_permission_funcs) { + my $func = $entry->[0]; + my $arg_pos = $entry->[1]; + + my $skip_args = ""; + if ($arg_pos > 1) { + $arg_pos--; + $skip_args = "(?:\\s*$FuncArg\\s*,\\s*){$arg_pos,$arg_pos}"; + } + my $test = "\\b$func\\s*\\(${skip_args}([\\d]+)\\s*[,\\)]"; + if ($^V && $^V ge 5.10.0 && + $line =~ /$test/) { + my $val = $1; + $val = $6 if ($skip_args ne ""); + + if ($val =~ /^$Int$/ && $val !~ /^$Octal$/) { + ERROR("NON_OCTAL_PERMISSIONS", + "Use octal not decimal permissions\n" . $herecurr); + } + } + } } # If we have no input at all, then there is nothing to report on -- cgit v1.2.3 From fbdb8138cf0c75a0cf21991ca05ecc9fdff6e070 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:14 -0700 Subject: checkpatch: warn on uses of __constant_ functions Emit a warning when using any of these __constant_ forms: __constant_cpu_to_be[x] __constant_cpu_to_le[x] __constant_be[x]_to_cpu __constant_le[x]_to_cpu __constant_htons __constant_ntohs Using any of these outside of include/uapi/ isn't preferred as using the function without __constant_ is identical when the argument is a constant. 
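The point is that the unprefixed helpers already fold to a compile-time constant when given a constant argument, so outside the UAPI headers the __constant_ spelling is pure noise. A small sketch, with EXAMPLE_MAGIC as an invented constant:

        #define EXAMPLE_MAGIC 0x1234

        __le16 a = __constant_cpu_to_le16(EXAMPLE_MAGIC);  /* now warned about */
        __le16 b = cpu_to_le16(EXAMPLE_MAGIC);             /* preferred, same value */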
Signed-off-by: Joe Perches Cc: Andy Whitcroft Cc: Simon Wunderlich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0b40af57e71e..1054283c6e70 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3931,6 +3931,19 @@ sub process { } } +# don't use __constant_ functions outside of include/uapi/ + if ($realfile !~ m@^include/uapi/@ && + $line =~ /(__constant_(?:htons|ntohs|[bl]e(?:16|32|64)_to_cpu|cpu_to_[bl]e(?:16|32|64)))\s*\(/) { + my $constant_func = $1; + my $func = $constant_func; + $func =~ s/^__constant_//; + if (WARN("CONSTANT_CONVERSION", + "$constant_func should be $func\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b$constant_func\b/$func/g; + } + } + # prefer usleep_range over udelay if ($line =~ /\budelay\s*\(\s*(\d+)\s*\)/) { my $delay = $1; -- cgit v1.2.3 From 1727cc70451017e6d9c0129681792c18f166af75 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:15 -0700 Subject: checkpatch: update octal permissions warning When checking permissions, make sure 4 octal digits are used, but allow a single 0 too. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1054283c6e70..9f12213d81cf 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4512,9 +4512,11 @@ sub process { my $val = $1; $val = $6 if ($skip_args ne ""); - if ($val =~ /^$Int$/ && $val !~ /^$Octal$/) { + if ($val !~ /^0$/ && + (($val =~ /^$Int$/ && $val !~ /^$Octal$/) || + length($val) ne 4)) { ERROR("NON_OCTAL_PERMISSIONS", - "Use octal not decimal permissions\n" . $herecurr); + "Use 4 digit octal (0777) not decimal permissions\n" . $herecurr); } } } -- cgit v1.2.3 From 6c8bd7076d79687183126fa7ffaa700fc2007d81 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:16 -0700 Subject: checkpatch: avoid sscanf test duplicated messages Change a test of $dstat to $line to avoid possibly emitting the sscanf warning multiple times. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9f12213d81cf..e7c50977fe3d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4188,7 +4188,7 @@ sub process { # check for naked sscanf if ($^V && $^V ge 5.10.0 && defined $stat && - $stat =~ /\bsscanf\b/ && + $line =~ /\bsscanf\b/ && ($stat !~ /$Ident\s*=\s*sscanf\s*$balanced_parens/ && $stat !~ /\bsscanf\s*$balanced_parens\s*(?:$Compare)/ && $stat !~ /(?:$Compare)\s*\bsscanf\s*$balanced_parens/)) { -- cgit v1.2.3 From 447432f32328dcd081ae5e23f27d2ffa19f2ecf8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:17 -0700 Subject: checkpatch: fix jiffies comparison and others checkpatch could not distinguish between a variable in a struct named jiffies and the normal jiffies. foo->jiffies would emit a "Comparing jiffies" arning. Update the $Compare variable to do a negative look-behind for "-" when finding a ">" so that a pointer dereference like -> isn't a comparison. 
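A sketch of the false positive being silenced: the member below merely shares the name jiffies, yet the old $Compare pattern saw the '>' inside '->' and reported a jiffies comparison even though no comparison is present. The struct and function names are invented, and kernel context is assumed for the global jiffies:

        struct example_stats {
                unsigned long jiffies;  /* private field that only shares the name */
        };

        static void example_touch(struct example_stats *s)
        {
                s->jiffies = jiffies;   /* plain assignment, no comparison involved */
        }

With the (?<!-) look-behind in place, the '>' that is part of '->' no longer counts as a comparison operator, so only genuine comparisons against jiffies are reported.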
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e7c50977fe3d..646295cd9990 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -296,7 +296,7 @@ our $Float_int = qr{(?i)[0-9]+e-?[0-9]+[fl]?}; our $Float = qr{$Float_hex|$Float_dec|$Float_int}; our $Constant = qr{$Float|$Binary|$Octal|$Hex|$Int}; our $Assignment = qr{\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=}; -our $Compare = qr{<=|>=|==|!=|<|>}; +our $Compare = qr{<=|>=|==|!=|<|(?}; our $Arithmetic = qr{\+|-|\*|\/|%}; our $Operators = qr{ <=|>=|==|!=| -- cgit v1.2.3 From 9b0fa60d9be0a82bc9e49bdd2dcacdfe4d1d5192 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:18 -0700 Subject: checkpatch: add test for char * arrays that could be static const static const char* arrays create smaller text as each function call does not have to populate the array. Emit a warning when char *arrays aren't static const and the array is not apparently global by being declared in the first column. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 646295cd9990..91308bec89b9 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2675,6 +2675,13 @@ sub process { $herecurr); } +# check for non-global char *foo[] = {"bar", ...} declarations. + if ($line =~ /^.\s+(?:static\s+|const\s+)?char\s+\*\s*\w+\s*\[\s*\]\s*=\s*\{/) { + WARN("STATIC_CONST_CHAR_ARRAY", + "char * array declaration might be better as static const\n" . + $herecurr); + } + # check for function declarations without arguments like "int foo()" if ($line =~ /(\b$Type\s+$Ident)\s*\(\s*\)/) { if (ERROR("FUNCTION_WITHOUT_ARGS", -- cgit v1.2.3 From cbec18afcc82a9b49953aeaa39ecb5ed553e3509 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:19 -0700 Subject: checkpatch: use a more consistent function argument style Instead of array indexing $_, use temporary variables like all the other subroutines in the script use. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 91308bec89b9..ae9f71d27a85 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1431,21 +1431,25 @@ sub possible { my $prefix = ''; sub show_type { - return defined $use_type{$_[0]} if (scalar keys %use_type > 0); + my ($type) = @_; - return !defined $ignore_type{$_[0]}; + return defined $use_type{$type} if (scalar keys %use_type > 0); + + return !defined $ignore_type{$type}; } sub report { - if (!show_type($_[1]) || - (defined $tst_only && $_[2] !~ /\Q$tst_only\E/)) { + my ($level, $type, $msg) = @_; + + if (!show_type($type) || + (defined $tst_only && $msg !~ /\Q$tst_only\E/)) { return 0; } my $line; if ($show_types) { - $line = "$prefix$_[0]:$_[1]: $_[2]\n"; + $line = "$prefix$level:$type: $msg\n"; } else { - $line = "$prefix$_[0]: $_[2]\n"; + $line = "$prefix$level: $msg\n"; } $line = (split('\n', $line))[0] . 
"\n" if ($terse); @@ -1453,12 +1457,15 @@ sub report { return 1; } + sub report_dump { our @report; } sub ERROR { - if (report("ERROR", $_[0], $_[1])) { + my ($type, $msg) = @_; + + if (report("ERROR", $type, $msg)) { our $clean = 0; our $cnt_error++; return 1; @@ -1466,7 +1473,9 @@ sub ERROR { return 0; } sub WARN { - if (report("WARNING", $_[0], $_[1])) { + my ($type, $msg) = @_; + + if (report("WARNING", $type, $msg)) { our $clean = 0; our $cnt_warn++; return 1; @@ -1474,7 +1483,9 @@ sub WARN { return 0; } sub CHK { - if ($check && report("CHECK", $_[0], $_[1])) { + my ($type, $msg) = @_; + + if ($check && report("CHECK", $type, $msg)) { our $clean = 0; our $cnt_chk++; return 1; -- cgit v1.2.3 From 85ad978c626a425c0bacd137bf15403e867c9134 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:20 -0700 Subject: checkpatch: ignore networking block comment style first lines in file It's very common to have normal block comments for the initial comments of a file description preface. So for files in drivers/net and net/ don't emit a warning when the first comment block in the file uses the normal block comment style and not the networking block comment style. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ae9f71d27a85..c624b04d0070 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2219,7 +2219,8 @@ sub process { if ($realfile =~ m@^(drivers/net/|net/)@ && $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ && - $rawline =~ /^\+[ \t]*\*/) { + $rawline =~ /^\+[ \t]*\*/ && + $realline > 2) { WARN("NETWORKING_BLOCK_COMMENT_STYLE", "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev); } -- cgit v1.2.3 From 5b9553abfc97f923b99868a04c3b3d99a6014163 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:21 -0700 Subject: checkpatch: make "return is not a function" test quieter This test is a bit noisy and opinions seem to agree that it should not warn in a lot more situations. It seems people agree that: return (foo || bar); and return foo || bar; are both acceptable style and checkpatch should be silent about them. For now, it warns on parentheses around a simple constant or a single function or a ternary. return (foo); return (foo(bar)); return (foo ? bar : baz); The last ternary test may be quieted in the future. Modify the deparenthesize function to only strip balanced leading and trailing parentheses. Signed-off-by: Joe Perches Cc: Julia Lawall Reviewed-by: Josh Triplett Cc: Monam Agarwal Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c624b04d0070..f0ea8a037a6d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -439,9 +439,14 @@ our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)}; sub deparenthesize { my ($string) = @_; return "" if (!defined($string)); - $string =~ s@^\s*\(\s*@@g; - $string =~ s@\s*\)\s*$@@g; + + while ($string =~ /^\s*\(.*\)\s*$/) { + $string =~ s@^\s*\(\s*@@; + $string =~ s@\s*\)\s*$@@; + } + $string =~ s@\s+@ @g; + return $string; } @@ -3374,14 +3379,17 @@ sub process { } } -# Return is not a function. 
+# return is not a function if (defined($stat) && $stat =~ /^.\s*return(\s*)\(/s) { my $spacing = $1; if ($^V && $^V ge 5.10.0 && - $stat =~ /^.\s*return\s*$balanced_parens\s*;\s*$/) { - ERROR("RETURN_PARENTHESES", - "return is not a function, parentheses are not required\n" . $herecurr); - + $stat =~ /^.\s*return\s*($balanced_parens)\s*;\s*$/) { + my $value = $1; + $value = deparenthesize($value); + if ($value =~ m/^\s*$FuncArg\s*(?:\?|$)/) { + ERROR("RETURN_PARENTHESES", + "return is not a function, parentheses are not required\n" . $herecurr); + } } elsif ($spacing !~ /\s+/) { ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr); -- cgit v1.2.3 From daa8b0592ee062583b2532bf62450c15fbcc7f98 Mon Sep 17 00:00:00 2001 From: Yogesh Chaudhari Date: Thu, 3 Apr 2014 14:49:23 -0700 Subject: checkpatch.pl: modify warning message for printk usage Modify warning message when printk is used in a patch. It mentions to use subsystem_dbg instead of netdev_dbg as the first preferred format of logging debug messages. Signed-off-by: Yogesh Chaudhari Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f0ea8a037a6d..3d3ef2ff62b2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2833,7 +2833,7 @@ sub process { my $level2 = $level; $level2 = "dbg" if ($level eq "debug"); WARN("PREFER_PR_LEVEL", - "Prefer netdev_$level2(netdev, ... then dev_$level2(dev, ... then pr_$level(... to printk(KERN_$orig ...\n" . $herecurr); + "Prefer [subsystem eg: netdev]_$level2([subsystem]dev, ... then dev_$level2(dev, ... then pr_$level(... to printk(KERN_$orig ...\n" . $herecurr); } if ($line =~ /\bpr_warning\s*\(/) { -- cgit v1.2.3 From 515a235ef9bcd31faa672740720936b22fa99c0e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:24 -0700 Subject: checkpatch: improve octal permissions test speed The current octal permissions test is very slow. When patch ("checkpatch: add checks for constant non-octal permissions") was added, processing time approximately tripled. Regain almost all of the performance by not looping through all the possible functions unless the line contains one of the functions. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3d3ef2ff62b2..9f03345a1c5c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -388,6 +388,13 @@ our @mode_permission_funcs = ( ["(?:CLASS|DEVICE|SENSOR)_ATTR", 2], ); +#Create a search pattern for all these functions to speed up a loop below +our $mode_perms_search = ""; +foreach my $entry (@mode_permission_funcs) { + $mode_perms_search .= '|' if ($mode_perms_search ne ""); + $mode_perms_search .= $entry->[0]; +} + our $allowed_asm_includes = qr{(?x: irq| memory @@ -4524,26 +4531,30 @@ sub process { "Exporting world writable files is usually an error. Consider more restrictive permissions.\n" . 
$herecurr); } - foreach my $entry (@mode_permission_funcs) { - my $func = $entry->[0]; - my $arg_pos = $entry->[1]; - - my $skip_args = ""; - if ($arg_pos > 1) { - $arg_pos--; - $skip_args = "(?:\\s*$FuncArg\\s*,\\s*){$arg_pos,$arg_pos}"; - } - my $test = "\\b$func\\s*\\(${skip_args}([\\d]+)\\s*[,\\)]"; - if ($^V && $^V ge 5.10.0 && - $line =~ /$test/) { - my $val = $1; - $val = $6 if ($skip_args ne ""); - - if ($val !~ /^0$/ && - (($val =~ /^$Int$/ && $val !~ /^$Octal$/) || - length($val) ne 4)) { - ERROR("NON_OCTAL_PERMISSIONS", - "Use 4 digit octal (0777) not decimal permissions\n" . $herecurr); +# Mode permission misuses where it seems decimal should be octal +# This uses a shortcut match to avoid unnecessary uses of a slow foreach loop + if ($^V && $^V ge 5.10.0 && + $line =~ /$mode_perms_search/) { + foreach my $entry (@mode_permission_funcs) { + my $func = $entry->[0]; + my $arg_pos = $entry->[1]; + + my $skip_args = ""; + if ($arg_pos > 1) { + $arg_pos--; + $skip_args = "(?:\\s*$FuncArg\\s*,\\s*){$arg_pos,$arg_pos}"; + } + my $test = "\\b$func\\s*\\(${skip_args}([\\d]+)\\s*[,\\)]"; + if ($line =~ /$test/) { + my $val = $1; + $val = $6 if ($skip_args ne ""); + + if ($val !~ /^0$/ && + (($val =~ /^$Int$/ && $val !~ /^$Octal$/) || + length($val) ne 4)) { + ERROR("NON_OCTAL_PERMISSIONS", + "Use 4 digit octal (0777) not decimal permissions\n" . $herecurr); + } } } } -- cgit v1.2.3 From 8f0dbfaf2715b80f491c3e23c318c4202abf89e2 Mon Sep 17 00:00:00 2001 From: Florian Vaussard Date: Thu, 3 Apr 2014 14:49:24 -0700 Subject: checkpatch: check vendor compatible with dashes The current vendor compatible check will not match vendors with dashes, like: compatible="asahi-kasei" Signed-off-by: Florian Vaussard Reported-by: Joe Perches Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9f03345a1c5c..3b268b3f2362 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2091,7 +2091,7 @@ sub process { my $vendor = $compat; my $vendor_path = $dt_path . "vendor-prefixes.txt"; next if (! -f $vendor_path); - $vendor =~ s/^([a-zA-Z0-9]+)\,.*/$1/; + $vendor =~ s/^([a-zA-Z0-9\-]+)\,.*/$1/; `grep -Eq "$vendor" $vendor_path`; if ( $? >> 8 ) { WARN("UNDOCUMENTED_DT_STRING", -- cgit v1.2.3 From 4fbf32a69346afc87ac1ddceb92c860d644433f9 Mon Sep 17 00:00:00 2001 From: Florian Vaussard Date: Thu, 3 Apr 2014 14:49:25 -0700 Subject: checkpatch: fix spurious vendor compatible warnings With a compatible string like compatible = "foo"; checkpatch will currently try to find "foo" in vendor-prefixes.txt, which is wrong since the vendor prefix is empty in this specific case. Skip the vendor test if the compatible is not like compatible = "vendor,something"; Signed-off-by: Florian Vaussard Cc: Joe Perches Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3b268b3f2362..75b587e8ddfd 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2088,10 +2088,10 @@ sub process { "DT compatible string \"$compat\" appears un-documented -- check $dt_path\n" . $herecurr); } - my $vendor = $compat; my $vendor_path = $dt_path . "vendor-prefixes.txt"; next if (! 
-f $vendor_path); - $vendor =~ s/^([a-zA-Z0-9\-]+)\,.*/$1/; + next if $compat !~ /^([a-zA-Z0-9\-]+)\,/; + my $vendor = $1; `grep -Eq "$vendor" $vendor_path`; if ( $? >> 8 ) { WARN("UNDOCUMENTED_DT_STRING", -- cgit v1.2.3 From 7dd05b38e5b729f412b617baad5c3363519cf1d4 Mon Sep 17 00:00:00 2001 From: Florian Vaussard Date: Thu, 3 Apr 2014 14:49:26 -0700 Subject: checkpatch: check compatible strings in .c and .h too Look for ".compatible = "foo" strings not only in .dts files, but in .c and .h too. Signed-off-by: Florian Vaussard Cc: Joe Perches Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 75b587e8ddfd..271d2f96b407 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2074,8 +2074,10 @@ sub process { } # check for DT compatible documentation - if (defined $root && $realfile =~ /\.dts/ && - $rawline =~ /^\+\s*compatible\s*=/) { + if (defined $root && + (($realfile =~ /\.dtsi?$/ && $line =~ /^\+\s*compatible\s*=\s*\"/) || + ($realfile =~ /\.[ch]$/ && $line =~ /^\+.*\.compatible\s*=\s*\"/))) { + my @compats = $rawline =~ /\"([a-zA-Z0-9\-\,\.\+_]+)\"/g; foreach my $compat (@compats) { -- cgit v1.2.3 From cc93319b5f2471ff2042ce62ea6da1f3bb3ce394 Mon Sep 17 00:00:00 2001 From: Florian Vaussard Date: Thu, 3 Apr 2014 14:49:27 -0700 Subject: checkpatch: improve the compatible vendor match Improve the vendor name match in vendor-prefix.txt by only matching the exact vendor name at the beginning of lines. Signed-off-by: Florian Vaussard Cc: Joe Perches Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 271d2f96b407..921c037f1968 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2080,9 +2080,11 @@ sub process { my @compats = $rawline =~ /\"([a-zA-Z0-9\-\,\.\+_]+)\"/g; + my $dt_path = $root . "/Documentation/devicetree/bindings/"; + my $vp_file = $dt_path . "vendor-prefixes.txt"; + foreach my $compat (@compats) { my $compat2 = $compat; - my $dt_path = $root . "/Documentation/devicetree/bindings/"; $compat2 =~ s/\,[a-z]*\-/\,<\.\*>\-/; `grep -Erq "$compat|$compat2" $dt_path`; if ( $? >> 8 ) { @@ -2090,14 +2092,12 @@ sub process { "DT compatible string \"$compat\" appears un-documented -- check $dt_path\n" . $herecurr); } - my $vendor_path = $dt_path . "vendor-prefixes.txt"; - next if (! -f $vendor_path); next if $compat !~ /^([a-zA-Z0-9\-]+)\,/; my $vendor = $1; - `grep -Eq "$vendor" $vendor_path`; + `grep -Eq "^$vendor\\b" $vp_file`; if ( $? >> 8 ) { WARN("UNDOCUMENTED_DT_STRING", - "DT compatible string vendor \"$vendor\" appears un-documented -- check $vendor_path\n" . $herecurr); + "DT compatible string vendor \"$vendor\" appears un-documented -- check $vp_file\n" . $herecurr); } } } -- cgit v1.2.3 From 3b617e3b80548096eda92ebd8382f76b139e011c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:28 -0700 Subject: checkpatch: net and drivers/net: warn on missing blank line after variable declaration Networking prefers this style, so warn when it's not used. Networking uses: void foo(int bar) { int baz; code... } not void foo(int bar) { int baz; code... 
} There are a limited number of false positives when using macros to declare variables like: WARNING: networking uses a blank line after declarations #330: FILE: net/ipv4/inet_hashtables.c:330: + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) Signed-off-by: Joe Perches Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 921c037f1968..33072d6c1b5a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2257,6 +2257,21 @@ sub process { "networking block comments put the trailing */ on a separate line\n" . $herecurr); } +# check for missing blank lines after declarations + if ($realfile =~ m@^(drivers/net/|net/)@ && + $prevline =~ /^\+\s+$Declare\s+$Ident/ && + !($prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || + $prevline =~ /(?:\{\s*|\\)$/) && #extended lines + $sline =~ /^\+\s+/ && #Not at char 1 + !($sline =~ /^\+\s+$Declare/ || + $sline =~ /^\+\s+$Ident\s+$Ident/ || #eg: typedef foo + $sline =~ /^\+\s+(?:union|struct|enum|typedef)\b/ || + $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(])/ || + $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/)) { + WARN("SPACING", + "networking uses a blank line after declarations\n" . $hereprev); + } + # check for spaces at the beginning of a line. # Exceptions: # 1) within comments -- cgit v1.2.3 From 74915c7dd0e144cc735058a77a3901c98f1e7039 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 3 Apr 2014 14:49:29 -0700 Subject: scripts/checkpatch.pl: __GFP_NOFAIL isn't going away Revert commit 7e4915e78992 ("checkpatch: add warning of future __GFP_NOFAIL use"). There are no plans to remove __GFP_NOFAIL. __GFP_NOFAIL exists to a) centralise the retry-allocation-for-ever operation into the core allocator, which is the appropriate implementation site and b) permit us to identify code sites which aren't handling memory exhaustion appropriately. Cc: David Rientjes Cc: Joe Perches Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 33072d6c1b5a..d9b09eb767bb 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4330,12 +4330,6 @@ sub process { "$1 uses number as first arg, sizeof is generally wrong\n" . $herecurr); } -# check for GFP_NOWAIT use - if ($line =~ /\b__GFP_NOFAIL\b/) { - WARN("__GFP_NOFAIL", - "Use of __GFP_NOFAIL is deprecated, no new users should be added\n" . $herecurr); - } - # check for multiple semicolons if ($line =~ /;\s*;\s*$/) { if (WARN("ONE_SEMICOLON", -- cgit v1.2.3 From 7ebd05ef1646e8cbef54e38343722741a4744626 Mon Sep 17 00:00:00 2001 From: Christopher Covington Date: Thu, 3 Apr 2014 14:49:31 -0700 Subject: checkpatch.pl: add check for Change-Id A commit hook for the Gerrit code review server [1] inserts change identifiers so Gerrit can track patches through multiple revisions. These identifiers are noise in the context of the upstream kernel. (Many Gerrit servers are private. Even given a public instance, given only a Change-Id, one must guess which server a change was tracked on. Patches submitted to the Linux kernel mailing lists should be able to stand on their own. If it's truly useful to reference code review on a Gerrit server, a URL is a much clearer way to do so.) 
Thus, issue an error when a Change-Id line is encountered before the Signed-off-by. 1. https://gerrit.googlesource.com/gerrit/+/master/gerrit-server/src/main/resources/com/google/gerrit/server/tools/root/hooks/commit-msg Signed-off-by: Christopher Covington Cc: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d9b09eb767bb..fbb1b7e88e87 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1924,6 +1924,12 @@ sub process { } } +# Check for unwanted Gerrit info + if ($in_commit_log && $line =~ /^\s*change-id:/i) { + ERROR("GERRIT_CHANGE_ID", + "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr); + } + # Check for wrappage within a valid hunk of the file if ($realcnt != 0 && $line !~ m{^(?:\+|-| |\\ No newline|$)}) { ERROR("CORRUPTED_PATCH", -- cgit v1.2.3 From 91cb5195ff224dd9044cf927f80d9c633cdcffec Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:32 -0700 Subject: checkpatch: expand parenthesis alignment test to declarations, functions and assignments Currently the parenthesis alignment test works only on misalignments of if statements like if (foo(bar, baz) Expand the test to find misalignments like: static inline int foo(int bar, int baz) and foo(bar, baz); and foo = bar(baz, qux); Expand the $Inline keyword for __inline and __inline__ too. Add $Inline to $Declare so it also matches "static inline ". These checks are only performed with --strict. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index fbb1b7e88e87..d3dcb370fc37 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -281,7 +281,7 @@ our $Attribute = qr{ __weak }x; our $Modifier; -our $Inline = qr{inline|__always_inline|noinline}; +our $Inline = qr{inline|__always_inline|noinline|__inline|__inline__}; our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; @@ -304,6 +304,8 @@ our $Operators = qr{ &&|\|\||,|\^|\+\+|--|&|\||$Arithmetic }x; +our $c90_Keywords = qr{do|for|while|if|else|return|goto|continue|switch|default|case|break}x; + our $NonptrType; our $NonptrTypeWithAttr; our $Type; @@ -429,7 +431,7 @@ sub build_types { (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)? 
(?:\s+$Inline|\s+$Modifier)* }x; - $Declare = qr{(?:$Storage\s+)?$Type}; + $Declare = qr{(?:$Storage\s+(?:$Inline\s+)?)?$Type}; } build_types(); @@ -1607,7 +1609,7 @@ sub pos_last_openparen { } } - return $last_openparen + 1; + return length(expand_tabs(substr($line, 0, $last_openparen))) + 1; } sub process { @@ -2200,7 +2202,7 @@ sub process { # check multi-line statement indentation matches previous line if ($^V && $^V ge 5.10.0 && - $prevline =~ /^\+(\t*)(if \(|$Ident\().*(\&\&|\|\||,)\s*$/) { + $prevline =~ /^\+([ \t]*)((?:$c90_Keywords(?:\s+if)\s*)|(?:$Declare\s*)?(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*|$Ident\s*=\s*$Ident\s*)\(.*(\&\&|\|\||,)\s*$/) { $prevline =~ /^\+(\t*)(.*)$/; my $oldindent = $1; my $rest = $2; -- cgit v1.2.3 From b00e48148e99a20c3d81346390d60c7d23826f61 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:49:33 -0700 Subject: checkpatch: don't warn on bitfield spaces around : This test prevents code from being aligned around the : for easy visual counting of bitfield lengths. ie: int foo : 1, int bar : 2, int foobar :29; should be acceptable so remove the test. Signed-off-by: Joe Perches Suggested-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d3dcb370fc37..34eb2160489d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3138,10 +3138,13 @@ sub process { # // is a comment } elsif ($op eq '//') { + # : when part of a bitfield + } elsif ($opv eq ':B') { + # skip the bitfield test for now + # No spaces for: # -> - # : when part of a bitfield - } elsif ($op eq '->' || $opv eq ':B') { + } elsif ($op eq '->') { if ($ctx =~ /Wx.|.xW/) { if (ERROR("SPACING", "spaces prohibited around that '$op' $at\n" . $hereptr)) { -- cgit v1.2.3 From 7a42d4b6a10c93ff15cff4f09c5a2d65877fa6ba Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:49:33 -0700 Subject: fs/efs/super.c: add __init to init_inodecache() init_inodecache is only called by __init init_efs_fs. 
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/efs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/efs/super.c b/fs/efs/super.c index 50215bbd6463..f8def1acf08c 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -91,7 +91,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), -- cgit v1.2.3 From b003f9650210fa1b3caea43201852bf2d9df7689 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Thu, 3 Apr 2014 14:49:34 -0700 Subject: binfmt_misc: add missing 'break' statement A missing 'break' statement in bm_status_write() results in a user program receiving '3' when doing the following: write(fd, "-1", 2); Signed-off-by: Luis Henriques Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1c740e152f38..b60500300dd7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -656,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, mutex_unlock(&root->d_inode->i_mutex); dput(root); + break; default: return res; } return count; -- cgit v1.2.3 From a68b31080912dae377bee4994716b357dc74286d Mon Sep 17 00:00:00 2001 From: chishanmingshen Date: Thu, 3 Apr 2014 14:49:35 -0700 Subject: init/do_mounts.c: fix comment error Signed-off-by: chishanmingshen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 8e5addc45874..82f22885c87e 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -102,13 +102,13 @@ no_match: /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID - * @uuid: char array containing ascii UUID + * @uuid_str: char array containing ascii UUID * * The function will return the first partition which contains a matching * UUID value in its partition_meta_info struct. This does not search * by filesystem UUIDs. * - * If @uuid is followed by a "/PARTNROFF=%d", then the number will be + * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be * extracted and used as an offset from the partition identified by the UUID. * * Returns the matching dev_t on success or 0 on failure. -- cgit v1.2.3 From 4071ea25cc08d41002746cca2d69ac700d67a2ac Mon Sep 17 00:00:00 2001 From: Alessandro Zummo Date: Thu, 3 Apr 2014 14:49:36 -0700 Subject: rtc: fix potential race condition RTC drivers must not return an error after device registration. 
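Concretely, the probe() shape this patch moves the drivers towards looks like the minimal sketch below. It is not taken from any of the drivers touched here, and every foo_* identifier is invented for illustration: errors stay fatal up to and including rtc_device registration, while failures of optional extras afterwards (alarm IRQ, sysfs attributes) are only logged. The ds1305/ds1307 hunks that follow apply the same idea, e.g. clearing client->irq when the IRQ request fails so later code knows the alarm is unavailable.

#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/rtc.h>
#include <linux/slab.h>

/* All foo_* names are hypothetical. */
struct foo_rtc {
        struct rtc_device *rtc;
        int irq;
};

static const struct rtc_class_ops foo_rtc_ops = {
        /* .read_time, .set_time, ... */
};

static irqreturn_t foo_rtc_irq(int irq, void *dev_id)
{
        return IRQ_HANDLED;
}

static int foo_rtc_probe(struct platform_device *pdev)
{
        struct foo_rtc *priv;
        int ret;

        priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;

        /* Failures up to and including registration may still abort probe(). */
        priv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
                                             &foo_rtc_ops, THIS_MODULE);
        if (IS_ERR(priv->rtc))
                return PTR_ERR(priv->rtc);

        /* Past this point, optional setup failures are logged, not returned. */
        priv->irq = platform_get_irq(pdev, 0);
        if (priv->irq > 0) {
                ret = devm_request_irq(&pdev->dev, priv->irq, foo_rtc_irq, 0,
                                       pdev->name, priv);
                if (ret)
                        dev_err(&pdev->dev, "alarm IRQ unavailable (%d)\n", ret);
        }

        platform_set_drvdata(pdev, priv);
        return 0;
}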
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alessandro Zummo Reported-by: Ales Novak Cc: Alexander Shiyan Cc: Atsushi Nemoto Cc: Jiri Kosina Cc: Srikanth Srinivasan Cc: Lee Jones Cc: Sascha Hauer Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1305.c | 10 ++++---- drivers/rtc/rtc-ds1307.c | 60 ++++++++++++++++++++++++++---------------------- drivers/rtc/rtc-ds1511.c | 18 ++++++++------- drivers/rtc/rtc-ds1553.c | 18 ++++++++------- drivers/rtc/rtc-ds1672.c | 11 ++++----- drivers/rtc/rtc-ds1742.c | 5 +++- drivers/rtc/rtc-ds3232.c | 2 -- drivers/rtc/rtc-test.c | 9 +++----- drivers/rtc/rtc-x1205.c | 2 +- 9 files changed, 70 insertions(+), 65 deletions(-) diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index 2dd586a19b59..129add77065d 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -756,19 +756,17 @@ static int ds1305_probe(struct spi_device *spi) status = devm_request_irq(&spi->dev, spi->irq, ds1305_irq, 0, dev_name(&ds1305->rtc->dev), ds1305); if (status < 0) { - dev_dbg(&spi->dev, "request_irq %d --> %d\n", + dev_err(&spi->dev, "request_irq %d --> %d\n", spi->irq, status); - return status; + } else { + device_set_wakeup_capable(&spi->dev, 1); } - - device_set_wakeup_capable(&spi->dev, 1); } /* export NVRAM */ status = sysfs_create_bin_file(&spi->dev.kobj, &nvram); if (status < 0) { - dev_dbg(&spi->dev, "register nvram --> %d\n", status); - return status; + dev_err(&spi->dev, "register nvram --> %d\n", status); } return 0; diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 4e75345a559a..9e2aad68f96d 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -930,52 +930,58 @@ read_rtc: ds1307->rtc = devm_rtc_device_register(&client->dev, client->name, &ds13xx_rtc_ops, THIS_MODULE); if (IS_ERR(ds1307->rtc)) { - err = PTR_ERR(ds1307->rtc); - dev_err(&client->dev, - "unable to register the class device\n"); - goto exit; + return PTR_ERR(ds1307->rtc); } if (want_irq) { err = request_irq(client->irq, ds1307_irq, IRQF_SHARED, ds1307->rtc->name, client); if (err) { - dev_err(&client->dev, - "unable to request IRQ!\n"); - goto exit; - } + client->irq = 0; + dev_err(&client->dev, "unable to request IRQ!\n"); + } else { - device_set_wakeup_capable(&client->dev, 1); - set_bit(HAS_ALARM, &ds1307->flags); - dev_dbg(&client->dev, "got IRQ %d\n", client->irq); + device_set_wakeup_capable(&client->dev, 1); + set_bit(HAS_ALARM, &ds1307->flags); + dev_dbg(&client->dev, "got IRQ %d\n", client->irq); + } } if (chip->nvram_size) { + ds1307->nvram = devm_kzalloc(&client->dev, sizeof(struct bin_attribute), GFP_KERNEL); if (!ds1307->nvram) { - err = -ENOMEM; - goto err_irq; + dev_err(&client->dev, "cannot allocate memory for nvram sysfs\n"); + } else { + + ds1307->nvram->attr.name = "nvram"; + ds1307->nvram->attr.mode = S_IRUGO | S_IWUSR; + + sysfs_bin_attr_init(ds1307->nvram); + + ds1307->nvram->read = ds1307_nvram_read; + ds1307->nvram->write = ds1307_nvram_write; + ds1307->nvram->size = chip->nvram_size; + ds1307->nvram_offset = chip->nvram_offset; + + err = sysfs_create_bin_file(&client->dev.kobj, + ds1307->nvram); + if (err) { + dev_err(&client->dev, + "unable to create sysfs file: %s\n", + ds1307->nvram->attr.name); + } else { + set_bit(HAS_NVRAM, &ds1307->flags); + dev_info(&client->dev, "%zu bytes nvram\n", + ds1307->nvram->size); + } } - ds1307->nvram->attr.name = "nvram"; - ds1307->nvram->attr.mode = S_IRUGO | S_IWUSR; - sysfs_bin_attr_init(ds1307->nvram); - 
ds1307->nvram->read = ds1307_nvram_read; - ds1307->nvram->write = ds1307_nvram_write; - ds1307->nvram->size = chip->nvram_size; - ds1307->nvram_offset = chip->nvram_offset; - err = sysfs_create_bin_file(&client->dev.kobj, ds1307->nvram); - if (err) - goto err_irq; - set_bit(HAS_NVRAM, &ds1307->flags); - dev_info(&client->dev, "%zu bytes nvram\n", ds1307->nvram->size); } return 0; -err_irq: - free_irq(client->irq, client); exit: return err; } diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index bc7b4fcf603c..4ff20f9dc212 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -473,7 +473,6 @@ static struct bin_attribute ds1511_nvram_attr = { static int ds1511_rtc_probe(struct platform_device *pdev) { - struct rtc_device *rtc; struct resource *res; struct rtc_plat_data *pdata; int ret = 0; @@ -512,6 +511,12 @@ static int ds1511_rtc_probe(struct platform_device *pdev) spin_lock_init(&pdata->lock); platform_set_drvdata(pdev, pdata); + + pdata->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, + &ds1511_rtc_ops, THIS_MODULE); + if (IS_ERR(pdata->rtc)) + return PTR_ERR(pdata->rtc); + /* * if the platform has an interrupt in mind for this device, * then by all means, set it @@ -526,15 +531,12 @@ static int ds1511_rtc_probe(struct platform_device *pdev) } } - rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &ds1511_rtc_ops, - THIS_MODULE); - if (IS_ERR(rtc)) - return PTR_ERR(rtc); - pdata->rtc = rtc; - ret = sysfs_create_bin_file(&pdev->dev.kobj, &ds1511_nvram_attr); + if (ret) + dev_err(&pdev->dev, "Unable to create sysfs entry: %s\n", + ds1511_nvram_attr.attr.name); - return ret; + return 0; } static int ds1511_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index fd31571941f5..9ac2fa07dffa 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -278,7 +278,6 @@ static struct bin_attribute ds1553_nvram_attr = { static int ds1553_rtc_probe(struct platform_device *pdev) { - struct rtc_device *rtc; struct resource *res; unsigned int cen, sec; struct rtc_plat_data *pdata; @@ -311,6 +310,12 @@ static int ds1553_rtc_probe(struct platform_device *pdev) spin_lock_init(&pdata->lock); pdata->last_jiffies = jiffies; platform_set_drvdata(pdev, pdata); + + pdata->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, + &ds1553_rtc_ops, THIS_MODULE); + if (IS_ERR(pdata->rtc)) + return PTR_ERR(pdata->rtc); + if (pdata->irq > 0) { writeb(0, ioaddr + RTC_INTERRUPTS); if (devm_request_irq(&pdev->dev, pdata->irq, @@ -321,15 +326,12 @@ static int ds1553_rtc_probe(struct platform_device *pdev) } } - rtc = devm_rtc_device_register(&pdev->dev, pdev->name, - &ds1553_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) - return PTR_ERR(rtc); - pdata->rtc = rtc; - ret = sysfs_create_bin_file(&pdev->dev.kobj, &ds1553_nvram_attr); + if (ret) + dev_err(&pdev->dev, "unable to create sysfs file: %s\n", + ds1553_nvram_attr.attr.name); - return ret; + return 0; } static int ds1553_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-ds1672.c b/drivers/rtc/rtc-ds1672.c index 18e2d8471472..a4888dbca2e1 100644 --- a/drivers/rtc/rtc-ds1672.c +++ b/drivers/rtc/rtc-ds1672.c @@ -177,8 +177,9 @@ static int ds1672_probe(struct i2c_client *client, /* read control register */ err = ds1672_get_control(client, &control); - if (err) - goto exit_devreg; + if (err) { + dev_warn(&client->dev, "Unable to read the control register\n"); + } if (control & DS1672_REG_CONTROL_EOSC) dev_warn(&client->dev, "Oscillator not 
enabled. " @@ -187,12 +188,10 @@ static int ds1672_probe(struct i2c_client *client, /* Register sysfs hooks */ err = device_create_file(&client->dev, &dev_attr_control); if (err) - goto exit_devreg; + dev_err(&client->dev, "Unable to create sysfs entry: %s\n", + dev_attr_control.attr.name); return 0; - - exit_devreg: - return err; } static struct i2c_device_id ds1672_id[] = { diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 5a1f3b2a8f1e..942103dac30f 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -204,8 +204,11 @@ static int ds1742_rtc_probe(struct platform_device *pdev) return PTR_ERR(rtc); ret = sysfs_create_bin_file(&pdev->dev.kobj, &pdata->nvram_attr); + if (ret) + dev_err(&pdev->dev, "Unable to create sysfs entry: %s\n", + pdata->nvram_attr.attr.name); - return ret; + return 0; } static int ds1742_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index b83bb5a527f8..15497c578af5 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -414,7 +414,6 @@ static int ds3232_probe(struct i2c_client *client, ds3232->rtc = devm_rtc_device_register(&client->dev, client->name, &ds3232_rtc_ops, THIS_MODULE); if (IS_ERR(ds3232->rtc)) { - dev_err(&client->dev, "unable to register the class device\n"); return PTR_ERR(ds3232->rtc); } @@ -423,7 +422,6 @@ static int ds3232_probe(struct i2c_client *client, "ds3232", client); if (ret) { dev_err(&client->dev, "unable to request IRQ\n"); - return ret; } } diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c index 7746e65b93f2..6599c20bc454 100644 --- a/drivers/rtc/rtc-test.c +++ b/drivers/rtc/rtc-test.c @@ -104,20 +104,17 @@ static int test_probe(struct platform_device *plat_dev) rtc = devm_rtc_device_register(&plat_dev->dev, "test", &test_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { - err = PTR_ERR(rtc); - return err; + return PTR_ERR(rtc); } err = device_create_file(&plat_dev->dev, &dev_attr_irq); if (err) - goto err; + dev_err(&plat_dev->dev, "Unable to create sysfs entry: %s\n", + dev_attr_irq.attr.name); platform_set_drvdata(plat_dev, rtc); return 0; - -err: - return err; } static int test_remove(struct platform_device *plat_dev) diff --git a/drivers/rtc/rtc-x1205.c b/drivers/rtc/rtc-x1205.c index 365dc6505148..b1de58e0b3d0 100644 --- a/drivers/rtc/rtc-x1205.c +++ b/drivers/rtc/rtc-x1205.c @@ -660,7 +660,7 @@ static int x1205_probe(struct i2c_client *client, err = x1205_sysfs_register(&client->dev); if (err) - return err; + dev_err(&client->dev, "Unable to create sysfs entries\n"); return 0; } -- cgit v1.2.3 From 3378f73da2d7fb765240dbf348cad11648c5b775 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 3 Apr 2014 14:49:38 -0700 Subject: drivers/rtc/rtc-imxdi.c: check the return value from clk_prepare_enable() clk_prepare_enable() may fail, so let's check its return value and propagate it in the case of error. 
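As a general pattern (with hypothetical foo_* names, not the rtc-imxdi code itself), the check looks like the sketch below; whoever enables the clock is also responsible for the matching clk_disable_unprepare() on later error paths and in remove().

#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static int foo_enable_clock(struct platform_device *pdev, struct clk **clkp)
{
        struct clk *clk;
        int ret;

        clk = devm_clk_get(&pdev->dev, NULL);
        if (IS_ERR(clk))
                return PTR_ERR(clk);

        /* clk_prepare_enable() can fail, so check and propagate the error. */
        ret = clk_prepare_enable(clk);
        if (ret)
                return ret;

        *clkp = clk;
        return 0;
}

/* Matching teardown, e.g. in remove(): clk_disable_unprepare(clk); */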
Signed-off-by: Fabio Estevam Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-imxdi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index abd7f9091f34..cd741c77e085 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -401,7 +401,9 @@ static int __init dryice_rtc_probe(struct platform_device *pdev) imxdi->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(imxdi->clk)) return PTR_ERR(imxdi->clk); - clk_prepare_enable(imxdi->clk); + rc = clk_prepare_enable(imxdi->clk); + if (rc) + return rc; /* * Initialize dryice hardware -- cgit v1.2.3 From 61dba62506491d83ee9bfd86aee7f7b6e9622d49 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:38 -0700 Subject: rtc: rtc-at32ap700x: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Acked-by: Hans-Christian Egtvedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-at32ap700x.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-at32ap700x.c b/drivers/rtc/rtc-at32ap700x.c index 3161ab5263ed..aee3387fb099 100644 --- a/drivers/rtc/rtc-at32ap700x.c +++ b/drivers/rtc/rtc-at32ap700x.c @@ -204,10 +204,8 @@ static int __init at32_rtc_probe(struct platform_device *pdev) rtc = devm_kzalloc(&pdev->dev, sizeof(struct rtc_at32ap700x), GFP_KERNEL); - if (!rtc) { - dev_dbg(&pdev->dev, "out of memory\n"); + if (!rtc) return -ENOMEM; - } regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!regs) { -- cgit v1.2.3 From a3080fa7ec028dca0ac24399e19292bcf4feaf6c Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:39 -0700 Subject: rtc: rtc-davinci: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-davinci.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 24677ef8c39a..1f3495e4e634 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -486,10 +486,8 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) int ret = 0; davinci_rtc = devm_kzalloc(&pdev->dev, sizeof(struct davinci_rtc), GFP_KERNEL); - if (!davinci_rtc) { - dev_dbg(dev, "could not allocate memory for private data\n"); + if (!davinci_rtc) return -ENOMEM; - } davinci_rtc->irq = platform_get_irq(pdev, 0); if (davinci_rtc->irq < 0) { -- cgit v1.2.3 From 7841fe5396a33eaa7fd180dbed6f551a92657435 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:40 -0700 Subject: rtc: rtc-ds1390: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. 
Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1390.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-ds1390.c b/drivers/rtc/rtc-ds1390.c index be9d8c0a7e3a..e67bfcb3a1aa 100644 --- a/drivers/rtc/rtc-ds1390.c +++ b/drivers/rtc/rtc-ds1390.c @@ -132,10 +132,9 @@ static int ds1390_probe(struct spi_device *spi) spi_setup(spi); chip = devm_kzalloc(&spi->dev, sizeof(*chip), GFP_KERNEL); - if (!chip) { - dev_err(&spi->dev, "unable to allocate device memory\n"); + if (!chip) return -ENOMEM; - } + spi_set_drvdata(spi, chip); res = ds1390_get_reg(&spi->dev, DS1390_REG_SECONDS, &tmp); -- cgit v1.2.3 From 4410af6e7eef106d21b6698d507661b546b25cfa Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:41 -0700 Subject: rtc: rtc-moxart: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-moxart.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-moxart.c b/drivers/rtc/rtc-moxart.c index c29dee0946e6..c31846238871 100644 --- a/drivers/rtc/rtc-moxart.c +++ b/drivers/rtc/rtc-moxart.c @@ -247,10 +247,8 @@ static int moxart_rtc_probe(struct platform_device *pdev) int ret = 0; moxart_rtc = devm_kzalloc(&pdev->dev, sizeof(*moxart_rtc), GFP_KERNEL); - if (!moxart_rtc) { - dev_err(&pdev->dev, "devm_kzalloc failed\n"); + if (!moxart_rtc) return -ENOMEM; - } moxart_rtc->gpio_data = of_get_named_gpio(pdev->dev.of_node, "gpio-rtc-data", 0); -- cgit v1.2.3 From ee567199941245e1d3f7b8df0113d1077e78ce0b Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:42 -0700 Subject: rtc: rtc-nuc900: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-nuc900.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index 248653c74b80..a53da0958e95 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -229,10 +229,9 @@ static int __init nuc900_rtc_probe(struct platform_device *pdev) nuc900_rtc = devm_kzalloc(&pdev->dev, sizeof(struct nuc900_rtc), GFP_KERNEL); - if (!nuc900_rtc) { - dev_err(&pdev->dev, "kzalloc nuc900_rtc failed\n"); + if (!nuc900_rtc) return -ENOMEM; - } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); nuc900_rtc->rtc_reg = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(nuc900_rtc->rtc_reg)) -- cgit v1.2.3 From 49ae425b8273d0856ffef41df51a1d1bdd06d0d0 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:43 -0700 Subject: rtc: rtc-pm8xxx: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. 
Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pm8xxx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index 03f8f75d5af2..bd76ffe92784 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -396,10 +396,8 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) rtc_write_enable = pdata->rtc_write_enable; rtc_dd = devm_kzalloc(&pdev->dev, sizeof(*rtc_dd), GFP_KERNEL); - if (rtc_dd == NULL) { - dev_err(&pdev->dev, "Unable to allocate memory!\n"); + if (rtc_dd == NULL) return -ENOMEM; - } /* Initialise spinlock to protect RTC control register */ spin_lock_init(&rtc_dd->ctrl_reg_lock); -- cgit v1.2.3 From 147a9855364dcd5271338c4fe0707b2d670d103d Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:44 -0700 Subject: rtc: rtc-rx8025: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-rx8025.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c index 8fa23eabcb68..e6298e02b400 100644 --- a/drivers/rtc/rtc-rx8025.c +++ b/drivers/rtc/rtc-rx8025.c @@ -551,7 +551,6 @@ static int rx8025_probe(struct i2c_client *client, rx8025 = devm_kzalloc(&client->dev, sizeof(*rx8025), GFP_KERNEL); if (!rx8025) { - dev_err(&adapter->dev, "failed to alloc memory\n"); err = -ENOMEM; goto errout; } -- cgit v1.2.3 From 98e2d21faaf96d879e2746cedd96f5c38c04f9b3 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:45 -0700 Subject: rtc: rtc-sirfsoc: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Acked-by: Barry Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-sirfsoc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c index 3eb3642ae299..9e3cbce56ec1 100644 --- a/drivers/rtc/rtc-sirfsoc.c +++ b/drivers/rtc/rtc-sirfsoc.c @@ -264,12 +264,8 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev) rtcdrv = devm_kzalloc(&pdev->dev, sizeof(struct sirfsoc_rtc_drv), GFP_KERNEL); - if (rtcdrv == NULL) { - dev_err(&pdev->dev, - "%s: can't alloc mem for drv struct\n", - pdev->name); + if (rtcdrv == NULL) return -ENOMEM; - } err = of_property_read_u32(np, "reg", &rtcdrv->rtc_base); if (err) { -- cgit v1.2.3 From 5c336b0a7a7636d81e9960bea29574e561df4d74 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:46 -0700 Subject: rtc: rtc-lpc32xx: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. 
Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-lpc32xx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index bfdbcb82d069..f130c08c98f8 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -211,10 +211,9 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) } rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); - if (unlikely(!rtc)) { - dev_err(&pdev->dev, "Can't allocate memory\n"); + if (unlikely(!rtc)) return -ENOMEM; - } + rtc->irq = rtcirq; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); -- cgit v1.2.3 From 681acc9f53210f8a4c1b280b077f2cd73e63ac4c Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:46 -0700 Subject: rtc: rtc-spear: remove unnecessary OOM messages The site-specific OOM messages are unnecessary, because they duplicate the MM subsystem generic OOM message. Signed-off-by: Jingoo Han Acked-by: Viresh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-spear.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c index c492cf0ab8cd..d2cdb9823a15 100644 --- a/drivers/rtc/rtc-spear.c +++ b/drivers/rtc/rtc-spear.c @@ -365,10 +365,8 @@ static int spear_rtc_probe(struct platform_device *pdev) } config = devm_kzalloc(&pdev->dev, sizeof(*config), GFP_KERNEL); - if (!config) { - dev_err(&pdev->dev, "out of memory\n"); + if (!config) return -ENOMEM; - } /* alarm irqs */ irq = platform_get_irq(pdev, 0); -- cgit v1.2.3 From daaf90f0b014106e5672381c7b576a149b346dd9 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:47 -0700 Subject: rtc: rtc-coh901331: use devm_ioremap_resource() Use devm_ioremap_resource() in order to make the code simpler, and remove redundant return value check of platform_get_resource() because the value is checked by devm_ioremap_resource(). 
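This and the following devm_ioremap_resource() conversions all collapse the same three steps (NULL-checking platform_get_resource(), devm_request_mem_region() and devm_ioremap()) into a single call with one error check. A hypothetical sketch of the resulting shape, with an invented foo_probe():

#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
        struct resource *res;
        void __iomem *base;

        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        /*
         * No explicit NULL check of 'res' is needed here:
         * devm_ioremap_resource() validates it, requests the memory
         * region and maps it, returning an ERR_PTR() value on failure.
         */
        base = devm_ioremap_resource(&pdev->dev, res);
        if (IS_ERR(base))
                return PTR_ERR(base);

        /* ... use 'base' ... */
        return 0;
}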
Signed-off-by: Jingoo Han Acked-by: Linus Walleij Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-coh901331.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/drivers/rtc/rtc-coh901331.c b/drivers/rtc/rtc-coh901331.c index 73f157519dff..869cae273799 100644 --- a/drivers/rtc/rtc-coh901331.c +++ b/drivers/rtc/rtc-coh901331.c @@ -43,8 +43,6 @@ struct coh901331_port { struct rtc_device *rtc; struct clk *clk; - u32 phybase; - u32 physize; void __iomem *virtbase; int irq; #ifdef CONFIG_PM_SLEEP @@ -173,19 +171,9 @@ static int __init coh901331_probe(struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENOENT; - - rtap->phybase = res->start; - rtap->physize = resource_size(res); - - if (devm_request_mem_region(&pdev->dev, rtap->phybase, rtap->physize, - "rtc-coh901331") == NULL) - return -EBUSY; - - rtap->virtbase = devm_ioremap(&pdev->dev, rtap->phybase, rtap->physize); - if (!rtap->virtbase) - return -ENOMEM; + rtap->virtbase = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(rtap->virtbase)) + return PTR_ERR(rtap->virtbase); rtap->irq = platform_get_irq(pdev, 0); if (devm_request_irq(&pdev->dev, rtap->irq, coh901331_interrupt, 0, -- cgit v1.2.3 From 1e6789f6e2e2b1645527d08832bfb95982da17ff Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:48 -0700 Subject: rtc: rtc-davinci: use devm_ioremap_resource() Use devm_ioremap_resource() in order to make the code simpler, and remove redundant return value check of platform_get_resource() because the value is checked by devm_ioremap_resource(). Signed-off-by: Jingoo Han Cc: Kevin Hilman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-davinci.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 1f3495e4e634..c0a3b59f65a2 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -119,8 +119,6 @@ static DEFINE_SPINLOCK(davinci_rtc_lock); struct davinci_rtc { struct rtc_device *rtc; void __iomem *base; - resource_size_t pbase; - size_t base_size; int irq; }; @@ -482,7 +480,7 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct davinci_rtc *davinci_rtc; - struct resource *res, *mem; + struct resource *res; int ret = 0; davinci_rtc = devm_kzalloc(&pdev->dev, sizeof(struct davinci_rtc), GFP_KERNEL); @@ -496,28 +494,9 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(dev, "no mem resource\n"); - return -EINVAL; - } - - davinci_rtc->pbase = res->start; - davinci_rtc->base_size = resource_size(res); - - mem = devm_request_mem_region(dev, davinci_rtc->pbase, - davinci_rtc->base_size, pdev->name); - if (!mem) { - dev_err(dev, "RTC registers at %08x are not free\n", - davinci_rtc->pbase); - return -EBUSY; - } - - davinci_rtc->base = devm_ioremap(dev, davinci_rtc->pbase, - davinci_rtc->base_size); - if (!davinci_rtc->base) { - dev_err(dev, "unable to ioremap MEM resource\n"); - return -ENOMEM; - } + davinci_rtc->base = devm_ioremap_resource(dev, res); + if (IS_ERR(davinci_rtc->base)) + return PTR_ERR(davinci_rtc->base); platform_set_drvdata(pdev, davinci_rtc); -- cgit v1.2.3 From caa3af259d5439e28c6ed1fc0efbed994146e403 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:49 -0700 Subject: 
rtc: rtc-vt8500: use devm_ioremap_resource() Use devm_ioremap_resource() in order to make the code simpler, and move 'struct resource *res' from 'struct vt8500_rtc' to vt8500_rtc_probe() because the 'res' variable is used only in vt8500_rtc_probe(). Signed-off-by: Jingoo Han Cc: Tony Prisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-vt8500.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/drivers/rtc/rtc-vt8500.c b/drivers/rtc/rtc-vt8500.c index df2ef3eba7cd..051da968da6d 100644 --- a/drivers/rtc/rtc-vt8500.c +++ b/drivers/rtc/rtc-vt8500.c @@ -79,7 +79,6 @@ struct vt8500_rtc { void __iomem *regbase; - struct resource *res; int irq_alarm; struct rtc_device *rtc; spinlock_t lock; /* Protects this structure */ @@ -209,6 +208,7 @@ static const struct rtc_class_ops vt8500_rtc_ops = { static int vt8500_rtc_probe(struct platform_device *pdev) { struct vt8500_rtc *vt8500_rtc; + struct resource *res; int ret; vt8500_rtc = devm_kzalloc(&pdev->dev, @@ -219,34 +219,16 @@ static int vt8500_rtc_probe(struct platform_device *pdev) spin_lock_init(&vt8500_rtc->lock); platform_set_drvdata(pdev, vt8500_rtc); - vt8500_rtc->res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!vt8500_rtc->res) { - dev_err(&pdev->dev, "No I/O memory resource defined\n"); - return -ENXIO; - } - vt8500_rtc->irq_alarm = platform_get_irq(pdev, 0); if (vt8500_rtc->irq_alarm < 0) { dev_err(&pdev->dev, "No alarm IRQ resource defined\n"); return vt8500_rtc->irq_alarm; } - vt8500_rtc->res = devm_request_mem_region(&pdev->dev, - vt8500_rtc->res->start, - resource_size(vt8500_rtc->res), - "vt8500-rtc"); - if (vt8500_rtc->res == NULL) { - dev_err(&pdev->dev, "failed to request I/O memory\n"); - return -EBUSY; - } - - vt8500_rtc->regbase = devm_ioremap(&pdev->dev, vt8500_rtc->res->start, - resource_size(vt8500_rtc->res)); - if (!vt8500_rtc->regbase) { - dev_err(&pdev->dev, "Unable to map RTC I/O memory\n"); - ret = -EBUSY; - goto err_return; - } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + vt8500_rtc->regbase = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(vt8500_rtc->regbase)) + return PTR_ERR(vt8500_rtc->regbase); /* Enable RTC and set it to 24-hour mode */ writel(VT8500_RTC_CR_ENABLE, -- cgit v1.2.3 From 3b6aa907f316a51345987327083aa266870d06cc Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:49:50 -0700 Subject: rtc: rtc-jz4740: use devm_ioremap_resource() Use devm_ioremap_resource() in order to make the code simpler, and move 'struct resource *mem' from 'struct jz4740_rtc' to jz4740_rtc_probe() because the 'mem' variable is used only in jz4740_rtc_probe(). Also the redundant return value check of platform_get_resource() is removed, because the value is checked by devm_ioremap_resource(). 
Signed-off-by: Jingoo Han Cc: Lars-Peter Clausen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-jz4740.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 1b126d2513de..08f5160fb6d4 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -38,7 +38,6 @@ #define JZ_RTC_CTRL_ENABLE BIT(0) struct jz4740_rtc { - struct resource *mem; void __iomem *base; struct rtc_device *rtc; @@ -216,6 +215,7 @@ static int jz4740_rtc_probe(struct platform_device *pdev) int ret; struct jz4740_rtc *rtc; uint32_t scratchpad; + struct resource *mem; rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); if (!rtc) @@ -227,25 +227,10 @@ static int jz4740_rtc_probe(struct platform_device *pdev) return -ENOENT; } - rtc->mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!rtc->mem) { - dev_err(&pdev->dev, "Failed to get platform mmio memory\n"); - return -ENOENT; - } - - rtc->mem = devm_request_mem_region(&pdev->dev, rtc->mem->start, - resource_size(rtc->mem), pdev->name); - if (!rtc->mem) { - dev_err(&pdev->dev, "Failed to request mmio memory region\n"); - return -EBUSY; - } - - rtc->base = devm_ioremap_nocache(&pdev->dev, rtc->mem->start, - resource_size(rtc->mem)); - if (!rtc->base) { - dev_err(&pdev->dev, "Failed to ioremap mmio memory\n"); - return -EBUSY; - } + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + rtc->base = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(rtc->base)) + return PTR_ERR(rtc->base); spin_lock_init(&rtc->lock); -- cgit v1.2.3 From 7b8dc3eb678304420c09135fe0576bea65b7b08f Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 3 Apr 2014 14:49:51 -0700 Subject: drivers/rtc/rtc-isl12057.c: remove duplicate include linux/rtc.h was included twice. Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-isl12057.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-isl12057.c b/drivers/rtc/rtc-isl12057.c index 7854a656628f..7e5ead936a04 100644 --- a/drivers/rtc/rtc-isl12057.c +++ b/drivers/rtc/rtc-isl12057.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From c2c0eed7f20cbfb20b346f1854afaf485bc6c207 Mon Sep 17 00:00:00 2001 From: Anthony Olech Date: Thu, 3 Apr 2014 14:49:52 -0700 Subject: drivers/rtc/rtc-da9052.c: remove redundant private structure field Remove redundant irq field in private rtc structure. 
Signed-off-by: Anthony Olech Acked-by: David Dajun Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-da9052.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index 4385ca4503da..a1cbf64242a5 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -26,7 +26,6 @@ struct da9052_rtc { struct rtc_device *rtc; struct da9052 *da9052; - int irq; }; static int da9052_rtc_enable_alarm(struct da9052 *da9052, bool enable) @@ -240,8 +239,7 @@ static int da9052_rtc_probe(struct platform_device *pdev) rtc->da9052 = dev_get_drvdata(pdev->dev.parent); platform_set_drvdata(pdev, rtc); - rtc->irq = DA9052_IRQ_ALARM; - ret = da9052_request_irq(rtc->da9052, rtc->irq, "ALM", + ret = da9052_request_irq(rtc->da9052, DA9052_IRQ_ALARM, "ALM", da9052_rtc_irq, rtc); if (ret != 0) { rtc_err(rtc->da9052, "irq registration failed: %d\n", ret); -- cgit v1.2.3 From 3916b09eed268705f738957aba6fd380470e994a Mon Sep 17 00:00:00 2001 From: Xianglong Du Date: Thu, 3 Apr 2014 14:49:53 -0700 Subject: drivers/rtc/rtc-sirfsoc.c: fix kernel panic of backing from hibernation RTC settings will be lost if power supply is cut off after hibernation finished, but the current "restore" function does not restore RTC related settings, this causes rtc_read_time failure and kernel panic: rtc rtc0: **** DPM device timeout **** Stack trace: unwind_backtrace+0x0/0xf4 show_stack+0x10/0x14 dpm_wd_handler+0x24/0x28 call_timer_fn.isra.33+0x24/0x88 run_timer_softirq+0x178/0x1f0 __do_softirq+0x120/0x200 do_softirq+0x54/0x5c irq_exit+0x9c/0xd0 handle_IRQ+0x44/0x90 __irq_svc+0x40/0x70 _raw_spin_unlock_irqrestore+0x10/0x48 sirfsoc_rtc_iobrg_readl+0x34/0x3c sirfsoc_rtc_read_time+0x24/0x48 __rtc_read_time.isra.3+0x48/0x5c rtc_read_time+0x30/0x44 rtc_resume.part.9+0x20/0x104 rtc_resume+0x5c/0x64 dpm_run_callback.isra.4+0x2c/0x74 device_resume+0x9c/0x144 dpm_resume+0x100/0x224 hibernation_snapshot+0x170/0x398 hibernate+0x13c/0x1d8 state_store+0xb4/0xb8 kobj_attr_store+0x14/0x20 sysfs_write_file+0x160/0x190 vfs_write+0xb4/0x194 SyS_write+0x3c/0x78 this patch uses SIMPLE_DEV_PM_OPS() to make restore() execute the existing resume() function which will restore the set of RTC. 
Signed-off-by: Xianglong Du Signed-off-by: Barry Song Cc: Grant Likely Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-sirfsoc.c | 62 +++++++---------------------------------------- 1 file changed, 9 insertions(+), 53 deletions(-) diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c index 9e3cbce56ec1..76e38007ba90 100644 --- a/drivers/rtc/rtc-sirfsoc.c +++ b/drivers/rtc/rtc-sirfsoc.c @@ -331,39 +331,29 @@ static int sirfsoc_rtc_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM - +#ifdef CONFIG_PM_SLEEP static int sirfsoc_rtc_suspend(struct device *dev) { - struct platform_device *pdev = to_platform_device(dev); - struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); + struct sirfsoc_rtc_drv *rtcdrv = dev_get_drvdata(dev); rtcdrv->overflow_rtc = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_SW_VALUE); rtcdrv->saved_counter = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN); rtcdrv->saved_overflow_rtc = rtcdrv->overflow_rtc; - if (device_may_wakeup(&pdev->dev) && !enable_irq_wake(rtcdrv->irq)) + if (device_may_wakeup(dev) && !enable_irq_wake(rtcdrv->irq)) rtcdrv->irq_wake = 1; return 0; } -static int sirfsoc_rtc_freeze(struct device *dev) -{ - sirfsoc_rtc_suspend(dev); - - return 0; -} - -static int sirfsoc_rtc_thaw(struct device *dev) +static int sirfsoc_rtc_resume(struct device *dev) { u32 tmp; - struct sirfsoc_rtc_drv *rtcdrv; - rtcdrv = dev_get_drvdata(dev); + struct sirfsoc_rtc_drv *rtcdrv = dev_get_drvdata(dev); /* - * if resume from snapshot and the rtc power is losed, + * if resume from snapshot and the rtc power is lost, * restroe the rtc settings */ if (SIRFSOC_RTC_CLK != sirfsoc_rtc_iobrg_readl( @@ -403,57 +393,23 @@ static int sirfsoc_rtc_thaw(struct device *dev) sirfsoc_rtc_iobrg_writel(rtcdrv->overflow_rtc, rtcdrv->rtc_base + RTC_SW_VALUE); - return 0; -} - -static int sirfsoc_rtc_resume(struct device *dev) -{ - struct platform_device *pdev = to_platform_device(dev); - struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); - sirfsoc_rtc_thaw(dev); - if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) { + if (device_may_wakeup(dev) && rtcdrv->irq_wake) { disable_irq_wake(rtcdrv->irq); rtcdrv->irq_wake = 0; } return 0; } - -static int sirfsoc_rtc_restore(struct device *dev) -{ - struct platform_device *pdev = to_platform_device(dev); - struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); - - if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) { - disable_irq_wake(rtcdrv->irq); - rtcdrv->irq_wake = 0; - } - return 0; -} - -#else -#define sirfsoc_rtc_suspend NULL -#define sirfsoc_rtc_resume NULL -#define sirfsoc_rtc_freeze NULL -#define sirfsoc_rtc_thaw NULL -#define sirfsoc_rtc_restore NULL #endif -static const struct dev_pm_ops sirfsoc_rtc_pm_ops = { - .suspend = sirfsoc_rtc_suspend, - .resume = sirfsoc_rtc_resume, - .freeze = sirfsoc_rtc_freeze, - .thaw = sirfsoc_rtc_thaw, - .restore = sirfsoc_rtc_restore, -}; +static SIMPLE_DEV_PM_OPS(sirfsoc_rtc_pm_ops, + sirfsoc_rtc_suspend, sirfsoc_rtc_resume); static struct platform_driver sirfsoc_rtc_driver = { .driver = { .name = "sirfsoc-rtc", .owner = THIS_MODULE, -#ifdef CONFIG_PM .pm = &sirfsoc_rtc_pm_ops, -#endif .of_match_table = sirfsoc_rtc_of_match, }, .probe = sirfsoc_rtc_probe, -- cgit v1.2.3 From 5ea735144dd4d2716557ff62dbcc940d62d62208 Mon Sep 17 00:00:00 2001 From: Simon Guinot Date: Thu, 3 Apr 2014 14:49:54 -0700 Subject: drivers/rtc/rtc-ds1307.c: fix sysfs wakealarm attribute creation In order to allow the creation 
of the sysfs attribute wakealarm, this patch moves the device_set_wakeup_capable() call above the RTC device registration. Signed-off-by: Simon Guinot Cc: Jason Cooper Cc: Andrew Lunn Cc: Gregory Clement Cc: Sebastian Hesselbarth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1307.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 9e2aad68f96d..f739be96cbc0 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -927,6 +927,7 @@ read_rtc: bin2bcd(tmp)); } + device_set_wakeup_capable(&client->dev, want_irq); ds1307->rtc = devm_rtc_device_register(&client->dev, client->name, &ds13xx_rtc_ops, THIS_MODULE); if (IS_ERR(ds1307->rtc)) { @@ -941,7 +942,6 @@ read_rtc: dev_err(&client->dev, "unable to request IRQ!\n"); } else { - device_set_wakeup_capable(&client->dev, 1); set_bit(HAS_ALARM, &ds1307->flags); dev_dbg(&client->dev, "got IRQ %d\n", client->irq); } -- cgit v1.2.3 From 1d1945d261a2af4e1ddaf609308c30b83143c2da Mon Sep 17 00:00:00 2001 From: Simon Guinot Date: Thu, 3 Apr 2014 14:49:55 -0700 Subject: drivers/rtc/rtc-ds1307.c: add alarm support for mcp7941x chips Add alarm support for the Microchip RTC devices MCP794xx. Note that two programmable alarms are provided by the chip but only one is used by the driver. Signed-off-by: Simon Guinot Cc: Jason Cooper Cc: Andrew Lunn Cc: Gregory Clement Cc: Sebastian Hesselbarth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1307.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 182 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index f739be96cbc0..f03d5ba96db1 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -154,6 +154,7 @@ static const struct chip_desc chips[last_ds_type] = { .alarm = 1, }, [mcp7941x] = { + .alarm = 1, /* this is battery backed SRAM */ .nvram_offset = 0x20, .nvram_size = 0x40, @@ -606,6 +607,178 @@ static const struct rtc_class_ops ds13xx_rtc_ops = { /*----------------------------------------------------------------------*/ +/* + * Alarm support for mcp7941x devices. + */ + +#define MCP7941X_REG_CONTROL 0x07 +# define MCP7941X_BIT_ALM0_EN 0x10 +# define MCP7941X_BIT_ALM1_EN 0x20 +#define MCP7941X_REG_ALARM0_BASE 0x0a +#define MCP7941X_REG_ALARM0_CTRL 0x0d +#define MCP7941X_REG_ALARM1_BASE 0x11 +#define MCP7941X_REG_ALARM1_CTRL 0x14 +# define MCP7941X_BIT_ALMX_IF (1 << 3) +# define MCP7941X_BIT_ALMX_C0 (1 << 4) +# define MCP7941X_BIT_ALMX_C1 (1 << 5) +# define MCP7941X_BIT_ALMX_C2 (1 << 6) +# define MCP7941X_BIT_ALMX_POL (1 << 7) +# define MCP7941X_MSK_ALMX_MATCH (MCP7941X_BIT_ALMX_C0 | \ + MCP7941X_BIT_ALMX_C1 | \ + MCP7941X_BIT_ALMX_C2) + +static void mcp7941x_work(struct work_struct *work) +{ + struct ds1307 *ds1307 = container_of(work, struct ds1307, work); + struct i2c_client *client = ds1307->client; + int reg, ret; + + mutex_lock(&ds1307->rtc->ops_lock); + + /* Check and clear alarm 0 interrupt flag. */ + reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_ALARM0_CTRL); + if (reg < 0) + goto out; + if (!(reg & MCP7941X_BIT_ALMX_IF)) + goto out; + reg &= ~MCP7941X_BIT_ALMX_IF; + ret = i2c_smbus_write_byte_data(client, MCP7941X_REG_ALARM0_CTRL, reg); + if (ret < 0) + goto out; + + /* Disable alarm 0. 
*/ + reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_CONTROL); + if (reg < 0) + goto out; + reg &= ~MCP7941X_BIT_ALM0_EN; + ret = i2c_smbus_write_byte_data(client, MCP7941X_REG_CONTROL, reg); + if (ret < 0) + goto out; + + rtc_update_irq(ds1307->rtc, 1, RTC_AF | RTC_IRQF); + +out: + if (test_bit(HAS_ALARM, &ds1307->flags)) + enable_irq(client->irq); + mutex_unlock(&ds1307->rtc->ops_lock); +} + +static int mcp7941x_read_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ds1307 *ds1307 = i2c_get_clientdata(client); + u8 *regs = ds1307->regs; + int ret; + + if (!test_bit(HAS_ALARM, &ds1307->flags)) + return -EINVAL; + + /* Read control and alarm 0 registers. */ + ret = ds1307->read_block_data(client, MCP7941X_REG_CONTROL, 10, regs); + if (ret < 0) + return ret; + + t->enabled = !!(regs[0] & MCP7941X_BIT_ALM0_EN); + + /* Report alarm 0 time assuming 24-hour and day-of-month modes. */ + t->time.tm_sec = bcd2bin(ds1307->regs[3] & 0x7f); + t->time.tm_min = bcd2bin(ds1307->regs[4] & 0x7f); + t->time.tm_hour = bcd2bin(ds1307->regs[5] & 0x3f); + t->time.tm_wday = bcd2bin(ds1307->regs[6] & 0x7) - 1; + t->time.tm_mday = bcd2bin(ds1307->regs[7] & 0x3f); + t->time.tm_mon = bcd2bin(ds1307->regs[8] & 0x1f) - 1; + t->time.tm_year = -1; + t->time.tm_yday = -1; + t->time.tm_isdst = -1; + + dev_dbg(dev, "%s, sec=%d min=%d hour=%d wday=%d mday=%d mon=%d " + "enabled=%d polarity=%d irq=%d match=%d\n", __func__, + t->time.tm_sec, t->time.tm_min, t->time.tm_hour, + t->time.tm_wday, t->time.tm_mday, t->time.tm_mon, t->enabled, + !!(ds1307->regs[6] & MCP7941X_BIT_ALMX_POL), + !!(ds1307->regs[6] & MCP7941X_BIT_ALMX_IF), + (ds1307->regs[6] & MCP7941X_MSK_ALMX_MATCH) >> 4); + + return 0; +} + +static int mcp7941x_set_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ds1307 *ds1307 = i2c_get_clientdata(client); + unsigned char *regs = ds1307->regs; + int ret; + + if (!test_bit(HAS_ALARM, &ds1307->flags)) + return -EINVAL; + + dev_dbg(dev, "%s, sec=%d min=%d hour=%d wday=%d mday=%d mon=%d " + "enabled=%d pending=%d\n", __func__, + t->time.tm_sec, t->time.tm_min, t->time.tm_hour, + t->time.tm_wday, t->time.tm_mday, t->time.tm_mon, + t->enabled, t->pending); + + /* Read control and alarm 0 registers. */ + ret = ds1307->read_block_data(client, MCP7941X_REG_CONTROL, 10, regs); + if (ret < 0) + return ret; + + /* Set alarm 0, using 24-hour and day-of-month modes. */ + regs[3] = bin2bcd(t->time.tm_sec); + regs[4] = bin2bcd(t->time.tm_min); + regs[5] = bin2bcd(t->time.tm_hour); + regs[6] = bin2bcd(t->time.tm_wday) + 1; + regs[7] = bin2bcd(t->time.tm_mday); + regs[8] = bin2bcd(t->time.tm_mon) + 1; + + /* Clear the alarm 0 interrupt flag. */ + regs[6] &= ~MCP7941X_BIT_ALMX_IF; + /* Set alarm match: second, minute, hour, day, date, month. 
*/ + regs[6] |= MCP7941X_MSK_ALMX_MATCH; + + if (t->enabled) + regs[0] |= MCP7941X_BIT_ALM0_EN; + else + regs[0] &= ~MCP7941X_BIT_ALM0_EN; + + ret = ds1307->write_block_data(client, MCP7941X_REG_CONTROL, 10, regs); + if (ret < 0) + return ret; + + return 0; +} + +static int mcp7941x_alarm_irq_enable(struct device *dev, unsigned int enabled) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ds1307 *ds1307 = i2c_get_clientdata(client); + int reg; + + if (!test_bit(HAS_ALARM, &ds1307->flags)) + return -EINVAL; + + reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_CONTROL); + if (reg < 0) + return reg; + + if (enabled) + reg |= MCP7941X_BIT_ALM0_EN; + else + reg &= ~MCP7941X_BIT_ALM0_EN; + + return i2c_smbus_write_byte_data(client, MCP7941X_REG_CONTROL, reg); +} + +static const struct rtc_class_ops mcp7941x_rtc_ops = { + .read_time = ds1307_get_time, + .set_time = ds1307_set_time, + .read_alarm = mcp7941x_read_alarm, + .set_alarm = mcp7941x_set_alarm, + .alarm_irq_enable = mcp7941x_alarm_irq_enable, +}; + +/*----------------------------------------------------------------------*/ + static ssize_t ds1307_nvram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, @@ -678,6 +851,7 @@ static int ds1307_probe(struct i2c_client *client, [ds_1339] = DS1339_BIT_BBSQI, [ds_3231] = DS3231_BIT_BBSQW, }; + const struct rtc_class_ops *rtc_ops = &ds13xx_rtc_ops; if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA) && !i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK)) @@ -816,6 +990,13 @@ static int ds1307_probe(struct i2c_client *client, case ds_1388: ds1307->offset = 1; /* Seconds starts at 1 */ break; + case mcp7941x: + rtc_ops = &mcp7941x_rtc_ops; + if (ds1307->client->irq > 0 && chip->alarm) { + INIT_WORK(&ds1307->work, mcp7941x_work); + want_irq = true; + } + break; default: break; } @@ -929,7 +1110,7 @@ read_rtc: device_set_wakeup_capable(&client->dev, want_irq); ds1307->rtc = devm_rtc_device_register(&client->dev, client->name, - &ds13xx_rtc_ops, THIS_MODULE); + rtc_ops, THIS_MODULE); if (IS_ERR(ds1307->rtc)) { return PTR_ERR(ds1307->rtc); } -- cgit v1.2.3 From 77bf2ea8de78ce7dd842409a847b888e720f62d8 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 3 Apr 2014 14:49:56 -0700 Subject: rtc: mc13xxx: remove __exit_p() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we no longer allow building without hotplug, the mc13xxx_rtc_remove() function is always present and we should not use __exit_p() to refer to it. 
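For context, __exit_p() keeps the function pointer only in modular builds and evaluates to NULL otherwise, so wrapping a remove callback that is now always compiled in would silently drop it for built-in kernels. A minimal sketch of the behaviour (paraphrasing include/linux/init.h, not the exact header text):

/* Sketch only: how __exit_p() resolves depending on the build. */
#ifdef MODULE
# define __exit_p(x)	x	/* modular build: keep the pointer */
#else
# define __exit_p(x)	NULL	/* built-in: .remove silently becomes NULL */
#endif

Since mc13xxx_rtc_remove() is always present now, the driver can reference it directly in .remove, which is exactly what the diff below does.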
Signed-off-by: Alexander Shiyan Cc: Uwe Kleine-König Cc: Sascha Hauer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 77ea9896b5ba..480d51f71064 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -375,7 +375,7 @@ err_reset_irq_request: return ret; } -static int __exit mc13xxx_rtc_remove(struct platform_device *pdev) +static int mc13xxx_rtc_remove(struct platform_device *pdev) { struct mc13xxx_rtc *priv = platform_get_drvdata(pdev); @@ -404,7 +404,7 @@ MODULE_DEVICE_TABLE(platform, mc13xxx_rtc_idtable); static struct platform_driver mc13xxx_rtc_driver = { .id_table = mc13xxx_rtc_idtable, - .remove = __exit_p(mc13xxx_rtc_remove), + .remove = mc13xxx_rtc_remove, .driver = { .name = DRIVER_NAME, .owner = THIS_MODULE, -- cgit v1.2.3 From 589e501439f9866b4bc415743b0ae0bb9a461c69 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 3 Apr 2014 14:49:57 -0700 Subject: rtc: mc13xxx: request IRQs after RTC registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interrupts can appear after request_irq and interrupt handlers can use the RTC device, but currently we register RTC after IRQs. This patch changes this order and simplify error path a bit. Signed-off-by: Alexander Shiyan Cc: Uwe Kleine-König Cc: Sascha Hauer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 480d51f71064..6b5d7d43af89 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -325,6 +325,11 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, priv); + priv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, + &mc13xxx_rtc_ops, THIS_MODULE); + if (IS_ERR(priv->rtc)) + return PTR_ERR(priv->rtc); + mc13xxx_lock(mc13xxx); ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_RTCRST, @@ -342,35 +347,20 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) ret = mc13xxx_irq_request_nounmask(mc13xxx, MC13XXX_IRQ_1HZ, mc13xxx_rtc_update_handler, DRIVER_NAME, priv); if (ret) - goto err_update_irq_request; + goto err_reset_irq_status; ret = mc13xxx_irq_request_nounmask(mc13xxx, MC13XXX_IRQ_TODA, mc13xxx_rtc_alarm_handler, DRIVER_NAME, priv); - if (ret) - goto err_alarm_irq_request; - - mc13xxx_unlock(mc13xxx); - - priv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, - &mc13xxx_rtc_ops, THIS_MODULE); - if (IS_ERR(priv->rtc)) { - ret = PTR_ERR(priv->rtc); - - mc13xxx_lock(mc13xxx); - - mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_TODA, priv); -err_alarm_irq_request: + if (!ret) + goto err_reset_irq_request; - mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_1HZ, priv); -err_update_irq_request: + mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_1HZ, priv); err_reset_irq_status: + mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_RTCRST, priv); - mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_RTCRST, priv); err_reset_irq_request: - - mc13xxx_unlock(mc13xxx); - } + mc13xxx_unlock(mc13xxx); return ret; } -- cgit v1.2.3 From 2fb004a9b00e34908e59a1c1c5770910690a7588 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 3 Apr 2014 14:49:58 -0700 Subject: rtc: mc13xxx: simplify alarm_irq_enable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit This patch removes an excess layer of indirection for the alarm_irq_enable() function. Signed-off-by: Alexander Shiyan Cc: Uwe Kleine-König Cc: Sascha Hauer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 6b5d7d43af89..1fa2748ab40e 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -42,15 +42,15 @@ static int mc13xxx_rtc_irq_enable_unlocked(struct device *dev, return func(priv->mc13xxx, irq); } -static int mc13xxx_rtc_irq_enable(struct device *dev, - unsigned int enabled, int irq) +static int mc13xxx_rtc_alarm_irq_enable(struct device *dev, + unsigned int enabled) { struct mc13xxx_rtc *priv = dev_get_drvdata(dev); int ret; mc13xxx_lock(priv->mc13xxx); - ret = mc13xxx_rtc_irq_enable_unlocked(dev, enabled, irq); + ret = mc13xxx_rtc_irq_enable_unlocked(dev, enabled, MC13XXX_IRQ_TODA); mc13xxx_unlock(priv->mc13xxx); @@ -282,12 +282,6 @@ static irqreturn_t mc13xxx_rtc_update_handler(int irq, void *dev) return IRQ_HANDLED; } -static int mc13xxx_rtc_alarm_irq_enable(struct device *dev, - unsigned int enabled) -{ - return mc13xxx_rtc_irq_enable(dev, enabled, MC13XXX_IRQ_TODA); -} - static const struct rtc_class_ops mc13xxx_rtc_ops = { .read_time = mc13xxx_rtc_read_time, .set_mmss = mc13xxx_rtc_set_mmss, -- cgit v1.2.3 From fb1bd9a22da3d0f2436ffa95fdc0382b89659288 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 3 Apr 2014 14:49:59 -0700 Subject: rtc: mc13xxx: fix 1Hz interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 1Hz interrupt is never unmasked, so no interrupts appear. This patch fixes the issue. Signed-off-by: Alexander Shiyan Cc: Uwe Kleine-König Cc: Sascha Hauer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 1fa2748ab40e..0d60dd5273d1 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -338,7 +338,7 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) priv->valid = !rtcrst_pending; - ret = mc13xxx_irq_request_nounmask(mc13xxx, MC13XXX_IRQ_1HZ, + ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_1HZ, mc13xxx_rtc_update_handler, DRIVER_NAME, priv); if (ret) goto err_reset_irq_status; -- cgit v1.2.3 From 12de362108d5ec24cce1bbe520570dc8fdccca9c Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 3 Apr 2014 14:50:00 -0700 Subject: rtc: mc13xxx: change RTC validation scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Datasheet says: "When the VSRTC voltage drops to the range of 0.9 - 0.8V, the RTCPORB reset signal is generated and the contents of the RTC will be reset. To inform the processor that the contents of the RTC are no longer valid due to the reset, a timer reset interrupt function is implemented with the RTCRSTI bit." This patch makes the RTC valid by default until RTCRST interrupt occurs.
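In outline, the scheme keeps a driver-side valid flag: it is set at probe time, cleared by the RTCRST handler, and reads fail fast until a new time is written. A condensed sketch of the two ends of that flow (the real hunks are in the diff that follows; the sketch_ prefix is illustrative):

/* Reset interrupt: the supply dropped, stop trusting the hardware. */
static irqreturn_t sketch_rtc_reset_handler(int irq, void *dev)
{
	struct mc13xxx_rtc *priv = dev;

	priv->valid = 0;
	mc13xxx_irq_mask(priv->mc13xxx, irq);
	return IRQ_HANDLED;
}

/* Reads refuse to return stale data until set_mmss() marks it valid again. */
static int sketch_rtc_read_time(struct device *dev, struct rtc_time *tm)
{
	struct mc13xxx_rtc *priv = dev_get_drvdata(dev);

	if (!priv->valid)
		return -ENODATA;
	/* ... proceed with the register reads ... */
	return 0;
}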
Signed-off-by: Alexander Shiyan Cc: Uwe Kleine-König Cc: Sascha Hauer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 0d60dd5273d1..7b39852eb542 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -64,12 +64,10 @@ static int mc13xxx_rtc_read_time(struct device *dev, struct rtc_time *tm) unsigned long s1970; int ret; - mc13xxx_lock(priv->mc13xxx); + if (!priv->valid) + return -ENODATA; - if (!priv->valid) { - ret = -ENODATA; - goto out; - } + mc13xxx_lock(priv->mc13xxx); ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCDAY, &days1); if (unlikely(ret)) @@ -154,11 +152,14 @@ static int mc13xxx_rtc_set_mmss(struct device *dev, unsigned long secs) goto out; } - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_RTCRST); - if (unlikely(ret)) - goto out; + if (!priv->valid) { + ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_RTCRST); + if (unlikely(ret)) + goto out; + + ret = mc13xxx_irq_unmask(priv->mc13xxx, MC13XXX_IRQ_RTCRST); + } - ret = mc13xxx_irq_unmask(priv->mc13xxx, MC13XXX_IRQ_RTCRST); out: priv->valid = !ret; @@ -295,7 +296,7 @@ static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev) struct mc13xxx_rtc *priv = dev; struct mc13xxx *mc13xxx = priv->mc13xxx; - dev_dbg(&priv->rtc->dev, "RTCRST\n"); + dev_warn(&priv->rtc->dev, "Contents of the RTC are no longer valid\n"); priv->valid = 0; mc13xxx_irq_mask(mc13xxx, irq); @@ -308,7 +309,6 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) int ret; struct mc13xxx_rtc *priv; struct mc13xxx *mc13xxx; - int rtcrst_pending; priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -316,6 +316,7 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) mc13xxx = dev_get_drvdata(pdev->dev.parent); priv->mc13xxx = mc13xxx; + priv->valid = 1; platform_set_drvdata(pdev, priv); @@ -326,18 +327,13 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) mc13xxx_lock(mc13xxx); + mc13xxx_irq_ack(mc13xxx, MC13XXX_IRQ_RTCRST); + ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_RTCRST, mc13xxx_rtc_reset_handler, DRIVER_NAME, priv); if (ret) goto err_reset_irq_request; - ret = mc13xxx_irq_status(mc13xxx, MC13XXX_IRQ_RTCRST, - NULL, &rtcrst_pending); - if (ret) - goto err_reset_irq_status; - - priv->valid = !rtcrst_pending; - ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_1HZ, mc13xxx_rtc_update_handler, DRIVER_NAME, priv); if (ret) -- cgit v1.2.3 From 6a2b342228b98432a585c83d59ed6aa0620be7c4 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 3 Apr 2014 14:50:01 -0700 Subject: rtc: mc13xxx: make rtc_read_time() more readable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unnecessary locks when reading the time and make the read operation until the values of day matched between reading the seconds, it will make the mc13xxx_rtc_read_time() procedure more readable. Additionally, patch introduced a "seconds in a day" definition. 
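The awkward part of the old code was the midnight corner case: the day register can roll over between reading the day and reading the seconds. The rework simply re-reads until the day value is stable. A minimal sketch of that loop, inside mc13xxx_rtc_read_time() with days1, days2, seconds and ret declared as in the driver (this mirrors the new function body in the diff below):

#define SEC_PER_DAY	(24 * 60 * 60)

	do {
		ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCDAY, &days1);
		if (ret)
			return ret;

		ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCTOD, &seconds);
		if (ret)
			return ret;

		ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCDAY, &days2);
		if (ret)
			return ret;
	} while (days1 != days2);	/* midnight hit between reads: retry */

	rtc_time_to_tm(days1 * SEC_PER_DAY + seconds, tm);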
Signed-off-by: Alexander Shiyan Cc: Uwe Kleine-König Cc: Sascha Hauer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 59 ++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 7b39852eb542..9b1a77eb91a9 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -23,6 +23,8 @@ #define MC13XXX_RTCDAY 22 #define MC13XXX_RTCDAYA 23 +#define SEC_PER_DAY (24 * 60 * 60) + struct mc13xxx_rtc { struct rtc_device *rtc; struct mc13xxx *mc13xxx; @@ -61,42 +63,27 @@ static int mc13xxx_rtc_read_time(struct device *dev, struct rtc_time *tm) { struct mc13xxx_rtc *priv = dev_get_drvdata(dev); unsigned int seconds, days1, days2; - unsigned long s1970; - int ret; if (!priv->valid) return -ENODATA; - mc13xxx_lock(priv->mc13xxx); + do { + int ret; - ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCDAY, &days1); - if (unlikely(ret)) - goto out; - - ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCTOD, &seconds); - if (unlikely(ret)) - goto out; - - ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCDAY, &days2); -out: - mc13xxx_unlock(priv->mc13xxx); - - if (ret) - return ret; - - if (days2 == days1 + 1) { - if (seconds >= 86400 / 2) - days2 = days1; - else - days1 = days2; - } + ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCDAY, &days1); + if (ret) + return ret; - if (days1 != days2) - return -EIO; + ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCTOD, &seconds); + if (ret) + return ret; - s1970 = days1 * 86400 + seconds; + ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCDAY, &days2); + if (ret) + return ret; + } while (days1 != days2); - rtc_time_to_tm(s1970, tm); + rtc_time_to_tm(days1 * SEC_PER_DAY + seconds, tm); return rtc_valid_tm(tm); } @@ -108,8 +95,8 @@ static int mc13xxx_rtc_set_mmss(struct device *dev, unsigned long secs) unsigned int alarmseconds; int ret; - seconds = secs % 86400; - days = secs / 86400; + seconds = secs % SEC_PER_DAY; + days = secs / SEC_PER_DAY; mc13xxx_lock(priv->mc13xxx); @@ -121,7 +108,7 @@ static int mc13xxx_rtc_set_mmss(struct device *dev, unsigned long secs) if (unlikely(ret)) goto out; - if (alarmseconds < 86400) { + if (alarmseconds < SEC_PER_DAY) { ret = mc13xxx_reg_write(priv->mc13xxx, MC13XXX_RTCTODA, 0x1ffff); if (unlikely(ret)) @@ -145,7 +132,7 @@ static int mc13xxx_rtc_set_mmss(struct device *dev, unsigned long secs) goto out; /* restore alarm */ - if (alarmseconds < 86400) { + if (alarmseconds < SEC_PER_DAY) { ret = mc13xxx_reg_write(priv->mc13xxx, MC13XXX_RTCTODA, alarmseconds); if (unlikely(ret)) @@ -181,7 +168,7 @@ static int mc13xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) ret = mc13xxx_reg_read(priv->mc13xxx, MC13XXX_RTCTODA, &seconds); if (unlikely(ret)) goto out; - if (seconds >= 86400) { + if (seconds >= SEC_PER_DAY) { ret = -ENODATA; goto out; } @@ -202,7 +189,7 @@ out: alarm->enabled = enabled; alarm->pending = pending; - s1970 = days * 86400 + seconds; + s1970 = days * SEC_PER_DAY + seconds; rtc_time_to_tm(s1970, &alarm->time); dev_dbg(dev, "%s: %lu\n", __func__, s1970); @@ -240,8 +227,8 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) if (unlikely(ret)) goto out; - seconds = s1970 % 86400; - days = s1970 / 86400; + seconds = s1970 % SEC_PER_DAY; + days = s1970 / SEC_PER_DAY; ret = mc13xxx_reg_write(priv->mc13xxx, MC13XXX_RTCDAYA, days); if (unlikely(ret)) -- cgit v1.2.3 From f49bd06e1d423e045f36a85fb3935fab706b1bf6 Mon Sep 17 
00:00:00 2001 From: Maxime Ripard Date: Thu, 3 Apr 2014 14:50:02 -0700 Subject: rtc: sunxi: change compatibles The Allwinner A10 compatibles were following a slightly different compatible patterns than the rest of the SoCs for historical reasons. Change the compatibles to match the other pattern in the RTC driver for consistency. Signed-off-by: Maxime Ripard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devicetree/bindings/rtc/sunxi-rtc.txt | 4 ++-- drivers/rtc/rtc-sunxi.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt b/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt index 7cb9dbf34878..6983aad376c3 100644 --- a/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt +++ b/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt @@ -3,7 +3,7 @@ RTC controller for the Allwinner A10/A20 Required properties: -- compatible : Should be "allwinner,sun4i-rtc" or "allwinner,sun7i-a20-rtc" +- compatible : Should be "allwinner,sun4i-a10-rtc" or "allwinner,sun7i-a20-rtc" - reg: physical base address of the controller and length of memory mapped region. - interrupts: IRQ line for the RTC. @@ -11,7 +11,7 @@ Required properties: Example: rtc: rtc@01c20d00 { - compatible = "allwinner,sun4i-rtc"; + compatible = "allwinner,sun4i-a10-rtc"; reg = <0x01c20d00 0x20>; interrupts = <24>; }; diff --git a/drivers/rtc/rtc-sunxi.c b/drivers/rtc/rtc-sunxi.c index 68a35284e5ad..b6f21f73d508 100644 --- a/drivers/rtc/rtc-sunxi.c +++ b/drivers/rtc/rtc-sunxi.c @@ -428,7 +428,7 @@ static const struct rtc_class_ops sunxi_rtc_ops = { }; static const struct of_device_id sunxi_rtc_dt_ids[] = { - { .compatible = "allwinner,sun4i-rtc", .data = &data_year_param[0] }, + { .compatible = "allwinner,sun4i-a10-rtc", .data = &data_year_param[0] }, { .compatible = "allwinner,sun7i-a20-rtc", .data = &data_year_param[1] }, { /* sentinel */ }, }; -- cgit v1.2.3 From 5fc4bc8919b217d6fe6046f6d81a783c73facbac Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 3 Apr 2014 14:50:03 -0700 Subject: arch/arm/boot/dts/sun4i-a10.dtsi: convert to the new RTC compatibles Switch the device tree to the new compatibles introduced in the RTC drivers to have a common pattern accross all Allwinner SoCs. Signed-off-by: Maxime Ripard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/boot/dts/sun4i-a10.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi index 7753be0c86d7..9321681cc45a 100644 --- a/arch/arm/boot/dts/sun4i-a10.dtsi +++ b/arch/arm/boot/dts/sun4i-a10.dtsi @@ -415,7 +415,7 @@ }; rtc: rtc@01c20d00 { - compatible = "allwinner,sun4i-rtc"; + compatible = "allwinner,sun4i-a10-rtc"; reg = <0x01c20d00 0x20>; interrupts = <24>; }; -- cgit v1.2.3 From a8a15eb19d885bec172c396178c148264987922f Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Thu, 3 Apr 2014 14:50:04 -0700 Subject: drivers/rtc/rtc-ds3232.c: make it possible to share an irq It's possible to have RTC irq shared with other device (e.g. t4240qds board shares ds3232irq with phy one). Handle this in driver. 
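Two things matter when a line is shared: the request must pass IRQF_SHARED with a unique dev_id, and the handler must return IRQ_NONE when the interrupt was not raised by its own chip so the other sharer gets a chance to handle it. A small hypothetical sketch of the handler side (example_chip and example_irq_is_ours() are illustrative names; the actual request-side change is the one-liner in the diff below):

static irqreturn_t example_shared_irq(int irq, void *dev_id)
{
	struct example_chip *chip = dev_id;

	if (!example_irq_is_ours(chip))	/* hypothetical status check */
		return IRQ_NONE;	/* let the other sharer handle it */

	/* ... acknowledge the source and process it ... */
	return IRQ_HANDLED;
}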
Signed-off-by: Bharat Bhushan Cc: Scott Wood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds3232.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 15497c578af5..813c6aa70d3d 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -418,8 +418,8 @@ static int ds3232_probe(struct i2c_client *client, } if (client->irq >= 0) { - ret = devm_request_irq(&client->dev, client->irq, ds3232_irq, 0, - "ds3232", client); + ret = devm_request_irq(&client->dev, client->irq, ds3232_irq, + IRQF_SHARED, "ds3232", client); if (ret) { dev_err(&client->dev, "unable to request IRQ\n"); } -- cgit v1.2.3 From b5ada4600dfd4f07f570a6dda7b5b020d96f8591 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 3 Apr 2014 14:50:05 -0700 Subject: drivers/rtc/rtc-cmos.c: fix compilation warning when !CONFIG_PM_SLEEP CONFIG_PM will be set also if only CONFIG_PM_RUNTIME is set which causes the compiler to emit the following warning: drivers/rtc/rtc-cmos.c:845:12: warning: 'cmos_resume' defined but not used [-Wunused-function] Fix this by using CONFIG_PM_SLEEP instead of CONFIG_PM, and by removing the #ifdef around the driver pm ops, as that case is already taken care of by SIMPLE_DEV_PM_OPS(). Signed-off-by: Mika Westerberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-cmos.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index cae212f30d65..0963c9309c74 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -837,7 +837,7 @@ static void __exit cmos_do_remove(struct device *dev) cmos->dev = NULL; } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int cmos_suspend(struct device *dev) { @@ -935,8 +935,6 @@ static int cmos_resume(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(cmos_pm_ops, cmos_suspend, cmos_resume); - #else static inline int cmos_poweroff(struct device *dev) @@ -946,6 +944,8 @@ static inline int cmos_poweroff(struct device *dev) #endif +static SIMPLE_DEV_PM_OPS(cmos_pm_ops, cmos_suspend, cmos_resume); + /*----------------------------------------------------------------*/ /* On non-x86 systems, a "CMOS" RTC lives most naturally on platform_bus. @@ -1088,11 +1088,9 @@ static struct pnp_driver cmos_pnp_driver = { /* flag ensures resume() gets called, and stops syslog spam */ .flags = PNP_DRIVER_RES_DO_NOT_CHANGE, -#ifdef CONFIG_PM_SLEEP .driver = { .pm = &cmos_pm_ops, }, -#endif }; #endif /* CONFIG_PNP */ -- cgit v1.2.3 From d55255cb11ca2220d09c2a08adee7ca3e14e0ba1 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:50:06 -0700 Subject: drivers/rtc/rtc-as3722.c: use SIMPLE_DEV_PM_OPS macro Use the SIMPLE_DEV_PM_OPS macro in order to make the code simpler.
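SIMPLE_DEV_PM_OPS() is what lets these drivers drop both the hand-rolled dev_pm_ops tables and the #ifdefs around them. Roughly, and paraphrasing include/linux/pm.h rather than quoting it, the macro expands to:

#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
	const struct dev_pm_ops name = { \
		SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
	}

With CONFIG_PM_SLEEP enabled, SET_SYSTEM_SLEEP_PM_OPS() wires the same pair of callbacks into .suspend/.resume, .freeze/.thaw and .poweroff/.restore; without it the structure is simply left empty, so the .pm assignment in the driver structure no longer needs its own guard.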
Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-as3722.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-as3722.c b/drivers/rtc/rtc-as3722.c index 4af016985890..9f38eda69154 100644 --- a/drivers/rtc/rtc-as3722.c +++ b/drivers/rtc/rtc-as3722.c @@ -242,9 +242,8 @@ static int as3722_rtc_resume(struct device *dev) } #endif -static const struct dev_pm_ops as3722_rtc_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(as3722_rtc_suspend, as3722_rtc_resume) -}; +static SIMPLE_DEV_PM_OPS(as3722_rtc_pm_ops, as3722_rtc_suspend, + as3722_rtc_resume); static struct platform_driver as3722_rtc_driver = { .probe = as3722_rtc_probe, -- cgit v1.2.3 From 3f93822d36fc8413658a67f5fb853bbe1b2a1f4d Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 3 Apr 2014 14:50:07 -0700 Subject: drivers/rtc/rtc-palmas.c: use SIMPLE_DEV_PM_OPS macro Use SIMPLE_DEV_PM_OPS macro in order to make the code simpler. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-palmas.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c index fffb7d3449d7..c360d62fb3f6 100644 --- a/drivers/rtc/rtc-palmas.c +++ b/drivers/rtc/rtc-palmas.c @@ -348,9 +348,8 @@ static int palmas_rtc_resume(struct device *dev) } #endif -static const struct dev_pm_ops palmas_rtc_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(palmas_rtc_suspend, palmas_rtc_resume) -}; +static SIMPLE_DEV_PM_OPS(palmas_rtc_pm_ops, palmas_rtc_suspend, + palmas_rtc_resume); #ifdef CONFIG_OF static struct of_device_id of_palmas_rtc_match[] = { -- cgit v1.2.3 From c93a3ae2d213ff75a279fe6e28d8f41ca7f01483 Mon Sep 17 00:00:00 2001 From: Wang Dongsheng Date: Thu, 3 Apr 2014 14:50:08 -0700 Subject: drivers/rtc/rtc-ds3232.c: enable ds3232 to work as wakeup source Add suspend/resume and device_init_wakeup to enable ds3232 as wakeup source, /sys/class/rtc/rtcX/wakealarm for set wakeup alarm. Signed-off-by: Wang Dongsheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds3232.c | 94 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 24 deletions(-) diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 813c6aa70d3d..adaf06c41479 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -57,6 +57,7 @@ struct ds3232 { * in the remove function. */ struct mutex mutex; + bool suspended; int exiting; }; @@ -345,7 +346,15 @@ static irqreturn_t ds3232_irq(int irq, void *dev_id) struct ds3232 *ds3232 = i2c_get_clientdata(client); disable_irq_nosync(irq); - schedule_work(&ds3232->work); + + /* + * If rtc as a wakeup source, can't schedule the work + * at system resume flow, because at this time the i2c bus + * has not been resumed. + */ + if (!ds3232->suspended) + schedule_work(&ds3232->work); + return IRQ_HANDLED; } @@ -363,22 +372,26 @@ static void ds3232_work(struct work_struct *work) if (stat & DS3232_REG_SR_A1F) { control = i2c_smbus_read_byte_data(client, DS3232_REG_CR); - if (control < 0) - goto out; - /* disable alarm1 interrupt */ - control &= ~(DS3232_REG_CR_A1IE); - i2c_smbus_write_byte_data(client, DS3232_REG_CR, control); - - /* clear the alarm pend flag */ - stat &= ~DS3232_REG_SR_A1F; - i2c_smbus_write_byte_data(client, DS3232_REG_SR, stat); - - rtc_update_irq(ds3232->rtc, 1, RTC_AF | RTC_IRQF); + if (control < 0) { + pr_warn("Read DS3232 Control Register error." 
+ "Disable IRQ%d.\n", client->irq); + } else { + /* disable alarm1 interrupt */ + control &= ~(DS3232_REG_CR_A1IE); + i2c_smbus_write_byte_data(client, DS3232_REG_CR, + control); + + /* clear the alarm pend flag */ + stat &= ~DS3232_REG_SR_A1F; + i2c_smbus_write_byte_data(client, DS3232_REG_SR, stat); + + rtc_update_irq(ds3232->rtc, 1, RTC_AF | RTC_IRQF); + + if (!ds3232->exiting) + enable_irq(client->irq); + } } -out: - if (!ds3232->exiting) - enable_irq(client->irq); unlock: mutex_unlock(&ds3232->mutex); } @@ -411,21 +424,17 @@ static int ds3232_probe(struct i2c_client *client, if (ret) return ret; - ds3232->rtc = devm_rtc_device_register(&client->dev, client->name, - &ds3232_rtc_ops, THIS_MODULE); - if (IS_ERR(ds3232->rtc)) { - return PTR_ERR(ds3232->rtc); - } - - if (client->irq >= 0) { + if (client->irq > 0) { ret = devm_request_irq(&client->dev, client->irq, ds3232_irq, IRQF_SHARED, "ds3232", client); if (ret) { dev_err(&client->dev, "unable to request IRQ\n"); } + device_init_wakeup(&client->dev, 1); } - - return 0; + ds3232->rtc = devm_rtc_device_register(&client->dev, client->name, + &ds3232_rtc_ops, THIS_MODULE); + return PTR_ERR_OR_ZERO(ds3232->rtc); } static int ds3232_remove(struct i2c_client *client) @@ -444,6 +453,42 @@ static int ds3232_remove(struct i2c_client *client) return 0; } +#ifdef CONFIG_PM_SLEEP +static int ds3232_suspend(struct device *dev) +{ + struct ds3232 *ds3232 = dev_get_drvdata(dev); + struct i2c_client *client = to_i2c_client(dev); + + if (device_can_wakeup(dev)) { + ds3232->suspended = true; + irq_set_irq_wake(client->irq, 1); + } + + return 0; +} + +static int ds3232_resume(struct device *dev) +{ + struct ds3232 *ds3232 = dev_get_drvdata(dev); + struct i2c_client *client = to_i2c_client(dev); + + if (ds3232->suspended) { + ds3232->suspended = false; + + /* Clear the hardware alarm pend flag */ + schedule_work(&ds3232->work); + + irq_set_irq_wake(client->irq, 0); + } + + return 0; +} +#endif + +static const struct dev_pm_ops ds3232_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(ds3232_suspend, ds3232_resume) +}; + static const struct i2c_device_id ds3232_id[] = { { "ds3232", 0 }, { } @@ -454,6 +499,7 @@ static struct i2c_driver ds3232_driver = { .driver = { .name = "rtc-ds3232", .owner = THIS_MODULE, + .pm = &ds3232_pm_ops, }, .probe = ds3232_probe, .remove = ds3232_remove, -- cgit v1.2.3 From 131c9cc832162bfe45be1065fa88d1ec2c165a3d Mon Sep 17 00:00:00 2001 From: Alessandro Zummo Date: Thu, 3 Apr 2014 14:50:09 -0700 Subject: rtc: verify a critical argument to rtc_update_irq() before using it This small addition to the core simplifies code in the drivers and makes them more robust when handling shared IRQs. 
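With the check in the core, a driver whose interrupt line is shared, or whose interrupt can fire before registration has completed, can forward the event unconditionally instead of guarding every call site. A minimal hypothetical handler, with example_rtc as an illustrative name:

/* Safe even if priv->rtc is still NULL or an ERR_PTR at this point,
 * because rtc_update_irq() now simply ignores such a pointer. */
static irqreturn_t example_rtc_irq(int irq, void *dev_id)
{
	struct example_rtc *priv = dev_id;

	rtc_update_irq(priv->rtc, 1, RTC_IRQF | RTC_AF);
	return IRQ_HANDLED;
}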
Signed-off-by: Alessandro Zummo Cc: Alexander Shiyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/interface.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 544be722937c..c2eff6082363 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -584,6 +584,9 @@ enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer) void rtc_update_irq(struct rtc_device *rtc, unsigned long num, unsigned long events) { + if (unlikely(IS_ERR_OR_NULL(rtc))) + return; + pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); } -- cgit v1.2.3 From 5bed811d668ca042b5ee711e27a3c4496179145c Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 3 Apr 2014 14:50:10 -0700 Subject: rtc: pm8xxx: fixup checkpatch/style issues This patchset is based on Stephen Boyd's PM8921 modernization/cleanups (http://lkml.kernel.org/g/1393441166-32692-1-git-send-email-sboyd@codeaurora.org), and allows for this RTC driver to be usable again. This patch (of 6): Before performing additional cleanups to this driver, do the easy cleanups first. Signed-off-by: Josh Cartwright Reviewed-by: Stephen Boyd Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pm8xxx.c | 97 ++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index bd76ffe92784..af60ee46075d 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -53,7 +53,7 @@ struct pm8xxx_rtc { int rtc_read_base; int rtc_write_base; int alarm_rw_base; - u8 ctrl_reg; + u8 ctrl_reg; struct device *rtc_dev; spinlock_t ctrl_reg_lock; }; @@ -63,7 +63,7 @@ struct pm8xxx_rtc { * hardware limitation. 
*/ static int pm8xxx_read_wrapper(struct pm8xxx_rtc *rtc_dd, u8 *rtc_val, - int base, int count) + int base, int count) { int i, rc; struct device *parent = rtc_dd->rtc_dev->parent; @@ -80,7 +80,7 @@ static int pm8xxx_read_wrapper(struct pm8xxx_rtc *rtc_dd, u8 *rtc_val, } static int pm8xxx_write_wrapper(struct pm8xxx_rtc *rtc_dd, u8 *rtc_val, - int base, int count) + int base, int count) { int i, rc; struct device *parent = rtc_dd->rtc_dev->parent; @@ -126,15 +126,15 @@ static int pm8xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) alarm_enabled = 1; ctrl_reg &= ~PM8xxx_RTC_ALARM_ENABLE; rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, - 1); + 1); if (rc < 0) { - dev_err(dev, "Write to RTC control register " - "failed\n"); + dev_err(dev, "Write to RTC control register failed\n"); goto rtc_rw_fail; } rtc_dd->ctrl_reg = ctrl_reg; - } else + } else { spin_unlock_irqrestore(&rtc_dd->ctrl_reg_lock, irq_flags); + } /* Write 0 to Byte[0] */ reg = 0; @@ -146,7 +146,7 @@ static int pm8xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) /* Write Byte[1], Byte[2], Byte[3] */ rc = pm8xxx_write_wrapper(rtc_dd, value + 1, - rtc_dd->rtc_write_base + 1, 3); + rtc_dd->rtc_write_base + 1, 3); if (rc < 0) { dev_err(dev, "Write to RTC write data register failed\n"); goto rtc_rw_fail; @@ -162,10 +162,9 @@ static int pm8xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) if (alarm_enabled) { ctrl_reg |= PM8xxx_RTC_ALARM_ENABLE; rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, - 1); + 1); if (rc < 0) { - dev_err(dev, "Write to RTC control register " - "failed\n"); + dev_err(dev, "Write to RTC control register failed\n"); goto rtc_rw_fail; } rtc_dd->ctrl_reg = ctrl_reg; @@ -186,7 +185,7 @@ static int pm8xxx_rtc_read_time(struct device *dev, struct rtc_time *tm) struct pm8xxx_rtc *rtc_dd = dev_get_drvdata(dev); rc = pm8xxx_read_wrapper(rtc_dd, value, rtc_dd->rtc_read_base, - NUM_8_BIT_RTC_REGS); + NUM_8_BIT_RTC_REGS); if (rc < 0) { dev_err(dev, "RTC read data register failed\n"); return rc; @@ -204,7 +203,8 @@ static int pm8xxx_rtc_read_time(struct device *dev, struct rtc_time *tm) if (unlikely(reg < value[0])) { rc = pm8xxx_read_wrapper(rtc_dd, value, - rtc_dd->rtc_read_base, NUM_8_BIT_RTC_REGS); + rtc_dd->rtc_read_base, + NUM_8_BIT_RTC_REGS); if (rc < 0) { dev_err(dev, "RTC read data register failed\n"); return rc; @@ -222,8 +222,8 @@ static int pm8xxx_rtc_read_time(struct device *dev, struct rtc_time *tm) } dev_dbg(dev, "secs = %lu, h:m:s == %d:%d:%d, d/m/y = %d/%d/%d\n", - secs, tm->tm_hour, tm->tm_min, tm->tm_sec, - tm->tm_mday, tm->tm_mon, tm->tm_year); + secs, tm->tm_hour, tm->tm_min, tm->tm_sec, + tm->tm_mday, tm->tm_mon, tm->tm_year); return 0; } @@ -245,15 +245,18 @@ static int pm8xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) spin_lock_irqsave(&rtc_dd->ctrl_reg_lock, irq_flags); rc = pm8xxx_write_wrapper(rtc_dd, value, rtc_dd->alarm_rw_base, - NUM_8_BIT_RTC_REGS); + NUM_8_BIT_RTC_REGS); if (rc < 0) { dev_err(dev, "Write to RTC ALARM register failed\n"); goto rtc_rw_fail; } ctrl_reg = rtc_dd->ctrl_reg; - ctrl_reg = alarm->enabled ? 
(ctrl_reg | PM8xxx_RTC_ALARM_ENABLE) : - (ctrl_reg & ~PM8xxx_RTC_ALARM_ENABLE); + + if (alarm->enabled) + ctrl_reg |= PM8xxx_RTC_ALARM_ENABLE; + else + ctrl_reg &= ~PM8xxx_RTC_ALARM_ENABLE; rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, 1); if (rc < 0) { @@ -264,9 +267,9 @@ static int pm8xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) rtc_dd->ctrl_reg = ctrl_reg; dev_dbg(dev, "Alarm Set for h:r:s=%d:%d:%d, d/m/y=%d/%d/%d\n", - alarm->time.tm_hour, alarm->time.tm_min, - alarm->time.tm_sec, alarm->time.tm_mday, - alarm->time.tm_mon, alarm->time.tm_year); + alarm->time.tm_hour, alarm->time.tm_min, + alarm->time.tm_sec, alarm->time.tm_mday, + alarm->time.tm_mon, alarm->time.tm_year); rtc_rw_fail: spin_unlock_irqrestore(&rtc_dd->ctrl_reg_lock, irq_flags); return rc; @@ -280,7 +283,7 @@ static int pm8xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) struct pm8xxx_rtc *rtc_dd = dev_get_drvdata(dev); rc = pm8xxx_read_wrapper(rtc_dd, value, rtc_dd->alarm_rw_base, - NUM_8_BIT_RTC_REGS); + NUM_8_BIT_RTC_REGS); if (rc < 0) { dev_err(dev, "RTC alarm time read failed\n"); return rc; @@ -297,9 +300,9 @@ static int pm8xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) } dev_dbg(dev, "Alarm set for - h:r:s=%d:%d:%d, d/m/y=%d/%d/%d\n", - alarm->time.tm_hour, alarm->time.tm_min, - alarm->time.tm_sec, alarm->time.tm_mday, - alarm->time.tm_mon, alarm->time.tm_year); + alarm->time.tm_hour, alarm->time.tm_min, + alarm->time.tm_sec, alarm->time.tm_mday, + alarm->time.tm_mon, alarm->time.tm_year); return 0; } @@ -312,9 +315,13 @@ static int pm8xxx_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) u8 ctrl_reg; spin_lock_irqsave(&rtc_dd->ctrl_reg_lock, irq_flags); + ctrl_reg = rtc_dd->ctrl_reg; - ctrl_reg = (enable) ? 
(ctrl_reg | PM8xxx_RTC_ALARM_ENABLE) : - (ctrl_reg & ~PM8xxx_RTC_ALARM_ENABLE); + + if (enable) + ctrl_reg |= PM8xxx_RTC_ALARM_ENABLE; + else + ctrl_reg &= ~PM8xxx_RTC_ALARM_ENABLE; rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, 1); if (rc < 0) { @@ -354,8 +361,8 @@ static irqreturn_t pm8xxx_alarm_trigger(int irq, void *dev_id) rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, 1); if (rc < 0) { spin_unlock_irqrestore(&rtc_dd->ctrl_reg_lock, irq_flags); - dev_err(rtc_dd->rtc_dev, "Write to RTC control register " - "failed\n"); + dev_err(rtc_dd->rtc_dev, + "Write to RTC control register failed\n"); goto rtc_alarm_handled; } @@ -364,19 +371,19 @@ static irqreturn_t pm8xxx_alarm_trigger(int irq, void *dev_id) /* Clear RTC alarm register */ rc = pm8xxx_read_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base + - PM8XXX_ALARM_CTRL_OFFSET, 1); + PM8XXX_ALARM_CTRL_OFFSET, 1); if (rc < 0) { - dev_err(rtc_dd->rtc_dev, "RTC Alarm control register read " - "failed\n"); + dev_err(rtc_dd->rtc_dev, + "RTC Alarm control register read failed\n"); goto rtc_alarm_handled; } ctrl_reg &= ~PM8xxx_RTC_ALARM_CLEAR; rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base + - PM8XXX_ALARM_CTRL_OFFSET, 1); + PM8XXX_ALARM_CTRL_OFFSET, 1); if (rc < 0) - dev_err(rtc_dd->rtc_dev, "Write to RTC Alarm control register" - " failed\n"); + dev_err(rtc_dd->rtc_dev, + "Write to RTC Alarm control register failed\n"); rtc_alarm_handled: return IRQ_HANDLED; @@ -409,7 +416,7 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) } rtc_resource = platform_get_resource_byname(pdev, IORESOURCE_IO, - "pmic_rtc_base"); + "pmic_rtc_base"); if (!(rtc_resource && rtc_resource->start)) { dev_err(&pdev->dev, "RTC IO resource absent!\n"); return -ENXIO; @@ -436,31 +443,31 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, 1); if (rc < 0) { - dev_err(&pdev->dev, "Write to RTC control register " - "failed\n"); + dev_err(&pdev->dev, + "Write to RTC control register failed\n"); return rc; } } rtc_dd->ctrl_reg = ctrl_reg; - if (rtc_write_enable == true) + if (rtc_write_enable) pm8xxx_rtc_ops.set_time = pm8xxx_rtc_set_time; platform_set_drvdata(pdev, rtc_dd); /* Register the RTC device */ rtc_dd->rtc = devm_rtc_device_register(&pdev->dev, "pm8xxx_rtc", - &pm8xxx_rtc_ops, THIS_MODULE); + &pm8xxx_rtc_ops, THIS_MODULE); if (IS_ERR(rtc_dd->rtc)) { dev_err(&pdev->dev, "%s: RTC registration failed (%ld)\n", - __func__, PTR_ERR(rtc_dd->rtc)); + __func__, PTR_ERR(rtc_dd->rtc)); return PTR_ERR(rtc_dd->rtc); } /* Request the alarm IRQ */ rc = request_any_context_irq(rtc_dd->rtc_alarm_irq, - pm8xxx_alarm_trigger, IRQF_TRIGGER_RISING, - "pm8xxx_rtc_alarm", rtc_dd); + pm8xxx_alarm_trigger, IRQF_TRIGGER_RISING, + "pm8xxx_rtc_alarm", rtc_dd); if (rc < 0) { dev_err(&pdev->dev, "Request IRQ failed (%d)\n", rc); return rc; @@ -505,7 +512,9 @@ static int pm8xxx_rtc_suspend(struct device *dev) } #endif -static SIMPLE_DEV_PM_OPS(pm8xxx_rtc_pm_ops, pm8xxx_rtc_suspend, pm8xxx_rtc_resume); +static SIMPLE_DEV_PM_OPS(pm8xxx_rtc_pm_ops, + pm8xxx_rtc_suspend, + pm8xxx_rtc_resume); static struct platform_driver pm8xxx_rtc_driver = { .probe = pm8xxx_rtc_probe, -- cgit v1.2.3 From 5d7dc4cf14da184fb7aca8f298e9667746e9f980 Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 3 Apr 2014 14:50:11 -0700 Subject: rtc: pm8xxx: use regmap API for register accesses Now that the parent mfd driver has been made to work again, and has been reworked to create a regmap instance 
intended for its children to use, rework the pm8xxx driver to use the regmap API for its register accesses. Signed-off-by: Josh Cartwright Reviewed-by: Stephen Boyd Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pm8xxx.c | 145 +++++++++++++++++++---------------------------- 1 file changed, 57 insertions(+), 88 deletions(-) diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index af60ee46075d..cdc9dc211eb0 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -13,11 +13,12 @@ #include #include #include +#include #include +#include #include #include -#include #include @@ -37,6 +38,7 @@ /** * struct pm8xxx_rtc - rtc driver internal structure * @rtc: rtc device for this driver. + * @regmap: regmap used to access RTC registers * @rtc_alarm_irq: rtc alarm irq number. * @rtc_base: address of rtc control register. * @rtc_read_base: base address of read registers. @@ -48,6 +50,7 @@ */ struct pm8xxx_rtc { struct rtc_device *rtc; + struct regmap *regmap; int rtc_alarm_irq; int rtc_base; int rtc_read_base; @@ -58,44 +61,6 @@ struct pm8xxx_rtc { spinlock_t ctrl_reg_lock; }; -/* - * The RTC registers need to be read/written one byte at a time. This is a - * hardware limitation. - */ -static int pm8xxx_read_wrapper(struct pm8xxx_rtc *rtc_dd, u8 *rtc_val, - int base, int count) -{ - int i, rc; - struct device *parent = rtc_dd->rtc_dev->parent; - - for (i = 0; i < count; i++) { - rc = pm8xxx_readb(parent, base + i, &rtc_val[i]); - if (rc < 0) { - dev_err(rtc_dd->rtc_dev, "PMIC read failed\n"); - return rc; - } - } - - return 0; -} - -static int pm8xxx_write_wrapper(struct pm8xxx_rtc *rtc_dd, u8 *rtc_val, - int base, int count) -{ - int i, rc; - struct device *parent = rtc_dd->rtc_dev->parent; - - for (i = 0; i < count; i++) { - rc = pm8xxx_writeb(parent, base + i, rtc_val[i]); - if (rc < 0) { - dev_err(rtc_dd->rtc_dev, "PMIC write failed\n"); - return rc; - } - } - - return 0; -} - /* * Steps to write the RTC registers. * 1. Disable alarm if enabled. 
@@ -107,7 +72,7 @@ static int pm8xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) { int rc, i; unsigned long secs, irq_flags; - u8 value[NUM_8_BIT_RTC_REGS], reg = 0, alarm_enabled = 0, ctrl_reg; + u8 value[NUM_8_BIT_RTC_REGS], alarm_enabled = 0, ctrl_reg; struct pm8xxx_rtc *rtc_dd = dev_get_drvdata(dev); rtc_tm_to_time(tm, &secs); @@ -125,9 +90,8 @@ static int pm8xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) if (ctrl_reg & PM8xxx_RTC_ALARM_ENABLE) { alarm_enabled = 1; ctrl_reg &= ~PM8xxx_RTC_ALARM_ENABLE; - rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, - 1); - if (rc < 0) { + rc = regmap_write(rtc_dd->regmap, rtc_dd->rtc_base, ctrl_reg); + if (rc) { dev_err(dev, "Write to RTC control register failed\n"); goto rtc_rw_fail; } @@ -137,33 +101,31 @@ static int pm8xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) } /* Write 0 to Byte[0] */ - reg = 0; - rc = pm8xxx_write_wrapper(rtc_dd, ®, rtc_dd->rtc_write_base, 1); - if (rc < 0) { + rc = regmap_write(rtc_dd->regmap, rtc_dd->rtc_write_base, 0); + if (rc) { dev_err(dev, "Write to RTC write data register failed\n"); goto rtc_rw_fail; } /* Write Byte[1], Byte[2], Byte[3] */ - rc = pm8xxx_write_wrapper(rtc_dd, value + 1, - rtc_dd->rtc_write_base + 1, 3); - if (rc < 0) { + rc = regmap_bulk_write(rtc_dd->regmap, rtc_dd->rtc_write_base + 1, + &value[1], sizeof(value) - 1); + if (rc) { dev_err(dev, "Write to RTC write data register failed\n"); goto rtc_rw_fail; } /* Write Byte[0] */ - rc = pm8xxx_write_wrapper(rtc_dd, value, rtc_dd->rtc_write_base, 1); - if (rc < 0) { + rc = regmap_write(rtc_dd->regmap, rtc_dd->rtc_write_base, value[0]); + if (rc) { dev_err(dev, "Write to RTC write data register failed\n"); goto rtc_rw_fail; } if (alarm_enabled) { ctrl_reg |= PM8xxx_RTC_ALARM_ENABLE; - rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, - 1); - if (rc < 0) { + rc = regmap_write(rtc_dd->regmap, rtc_dd->rtc_base, ctrl_reg); + if (rc) { dev_err(dev, "Write to RTC control register failed\n"); goto rtc_rw_fail; } @@ -180,13 +142,14 @@ rtc_rw_fail: static int pm8xxx_rtc_read_time(struct device *dev, struct rtc_time *tm) { int rc; - u8 value[NUM_8_BIT_RTC_REGS], reg; + u8 value[NUM_8_BIT_RTC_REGS]; unsigned long secs; + unsigned int reg; struct pm8xxx_rtc *rtc_dd = dev_get_drvdata(dev); - rc = pm8xxx_read_wrapper(rtc_dd, value, rtc_dd->rtc_read_base, - NUM_8_BIT_RTC_REGS); - if (rc < 0) { + rc = regmap_bulk_read(rtc_dd->regmap, rtc_dd->rtc_read_base, + value, sizeof(value)); + if (rc) { dev_err(dev, "RTC read data register failed\n"); return rc; } @@ -195,17 +158,16 @@ static int pm8xxx_rtc_read_time(struct device *dev, struct rtc_time *tm) * Read the LSB again and check if there has been a carry over. * If there is, redo the read operation. 
*/ - rc = pm8xxx_read_wrapper(rtc_dd, ®, rtc_dd->rtc_read_base, 1); + rc = regmap_read(rtc_dd->regmap, rtc_dd->rtc_read_base, ®); if (rc < 0) { dev_err(dev, "RTC read data register failed\n"); return rc; } if (unlikely(reg < value[0])) { - rc = pm8xxx_read_wrapper(rtc_dd, value, - rtc_dd->rtc_read_base, - NUM_8_BIT_RTC_REGS); - if (rc < 0) { + rc = regmap_bulk_read(rtc_dd->regmap, rtc_dd->rtc_read_base, + value, sizeof(value)); + if (rc) { dev_err(dev, "RTC read data register failed\n"); return rc; } @@ -244,9 +206,9 @@ static int pm8xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) spin_lock_irqsave(&rtc_dd->ctrl_reg_lock, irq_flags); - rc = pm8xxx_write_wrapper(rtc_dd, value, rtc_dd->alarm_rw_base, - NUM_8_BIT_RTC_REGS); - if (rc < 0) { + rc = regmap_bulk_write(rtc_dd->regmap, rtc_dd->alarm_rw_base, value, + sizeof(value)); + if (rc) { dev_err(dev, "Write to RTC ALARM register failed\n"); goto rtc_rw_fail; } @@ -258,8 +220,8 @@ static int pm8xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) else ctrl_reg &= ~PM8xxx_RTC_ALARM_ENABLE; - rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, 1); - if (rc < 0) { + rc = regmap_write(rtc_dd->regmap, rtc_dd->rtc_base, ctrl_reg); + if (rc) { dev_err(dev, "Write to RTC control register failed\n"); goto rtc_rw_fail; } @@ -282,9 +244,9 @@ static int pm8xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) unsigned long secs; struct pm8xxx_rtc *rtc_dd = dev_get_drvdata(dev); - rc = pm8xxx_read_wrapper(rtc_dd, value, rtc_dd->alarm_rw_base, - NUM_8_BIT_RTC_REGS); - if (rc < 0) { + rc = regmap_bulk_read(rtc_dd->regmap, rtc_dd->alarm_rw_base, value, + sizeof(value)); + if (rc) { dev_err(dev, "RTC alarm time read failed\n"); return rc; } @@ -323,8 +285,8 @@ static int pm8xxx_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) else ctrl_reg &= ~PM8xxx_RTC_ALARM_ENABLE; - rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, 1); - if (rc < 0) { + rc = regmap_write(rtc_dd->regmap, rtc_dd->rtc_base, ctrl_reg); + if (rc) { dev_err(dev, "Write to RTC control register failed\n"); goto rtc_rw_fail; } @@ -346,7 +308,7 @@ static struct rtc_class_ops pm8xxx_rtc_ops = { static irqreturn_t pm8xxx_alarm_trigger(int irq, void *dev_id) { struct pm8xxx_rtc *rtc_dd = dev_id; - u8 ctrl_reg; + unsigned int ctrl_reg; int rc; unsigned long irq_flags; @@ -358,8 +320,8 @@ static irqreturn_t pm8xxx_alarm_trigger(int irq, void *dev_id) ctrl_reg = rtc_dd->ctrl_reg; ctrl_reg &= ~PM8xxx_RTC_ALARM_ENABLE; - rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, 1); - if (rc < 0) { + rc = regmap_write(rtc_dd->regmap, rtc_dd->rtc_base, ctrl_reg); + if (rc) { spin_unlock_irqrestore(&rtc_dd->ctrl_reg_lock, irq_flags); dev_err(rtc_dd->rtc_dev, "Write to RTC control register failed\n"); @@ -370,18 +332,20 @@ static irqreturn_t pm8xxx_alarm_trigger(int irq, void *dev_id) spin_unlock_irqrestore(&rtc_dd->ctrl_reg_lock, irq_flags); /* Clear RTC alarm register */ - rc = pm8xxx_read_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base + - PM8XXX_ALARM_CTRL_OFFSET, 1); - if (rc < 0) { + rc = regmap_read(rtc_dd->regmap, + rtc_dd->rtc_base + PM8XXX_ALARM_CTRL_OFFSET, + &ctrl_reg); + if (rc) { dev_err(rtc_dd->rtc_dev, "RTC Alarm control register read failed\n"); goto rtc_alarm_handled; } ctrl_reg &= ~PM8xxx_RTC_ALARM_CLEAR; - rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base + - PM8XXX_ALARM_CTRL_OFFSET, 1); - if (rc < 0) + rc = regmap_write(rtc_dd->regmap, + rtc_dd->rtc_base + PM8XXX_ALARM_CTRL_OFFSET, + ctrl_reg); + if 
(rc) dev_err(rtc_dd->rtc_dev, "Write to RTC Alarm control register failed\n"); @@ -392,7 +356,7 @@ rtc_alarm_handled: static int pm8xxx_rtc_probe(struct platform_device *pdev) { int rc; - u8 ctrl_reg; + unsigned int ctrl_reg; bool rtc_write_enable = false; struct pm8xxx_rtc *rtc_dd; struct resource *rtc_resource; @@ -409,6 +373,12 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) /* Initialise spinlock to protect RTC control register */ spin_lock_init(&rtc_dd->ctrl_reg_lock); + rtc_dd->regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!rtc_dd->regmap) { + dev_err(&pdev->dev, "Parent regmap unavailable.\n"); + return -ENXIO; + } + rtc_dd->rtc_alarm_irq = platform_get_irq(pdev, 0); if (rtc_dd->rtc_alarm_irq < 0) { dev_err(&pdev->dev, "Alarm IRQ resource absent!\n"); @@ -432,17 +402,16 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) rtc_dd->rtc_dev = &pdev->dev; /* Check if the RTC is on, else turn it on */ - rc = pm8xxx_read_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, 1); - if (rc < 0) { + rc = regmap_read(rtc_dd->regmap, rtc_dd->rtc_base, &ctrl_reg); + if (rc) { dev_err(&pdev->dev, "RTC control register read failed!\n"); return rc; } if (!(ctrl_reg & PM8xxx_RTC_ENABLE)) { ctrl_reg |= PM8xxx_RTC_ENABLE; - rc = pm8xxx_write_wrapper(rtc_dd, &ctrl_reg, rtc_dd->rtc_base, - 1); - if (rc < 0) { + rc = regmap_write(rtc_dd->regmap, rtc_dd->rtc_base, ctrl_reg); + if (rc) { dev_err(&pdev->dev, "Write to RTC control register failed\n"); return rc; -- cgit v1.2.3 From bffcbc0887a7496c19639d55ff0e7483ee17aa42 Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 3 Apr 2014 14:50:12 -0700 Subject: rtc: pm8xxx: use devm_request_any_context_irq Make use of the devm_* variant of request_any_context_irq to allow for elimination of remove(). Signed-off-by: Josh Cartwright Reviewed-by: Stephen Boyd Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pm8xxx.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index cdc9dc211eb0..91ac2e346aea 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -434,9 +434,10 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) } /* Request the alarm IRQ */ - rc = request_any_context_irq(rtc_dd->rtc_alarm_irq, - pm8xxx_alarm_trigger, IRQF_TRIGGER_RISING, - "pm8xxx_rtc_alarm", rtc_dd); + rc = devm_request_any_context_irq(&pdev->dev, rtc_dd->rtc_alarm_irq, + pm8xxx_alarm_trigger, + IRQF_TRIGGER_RISING, + "pm8xxx_rtc_alarm", rtc_dd); if (rc < 0) { dev_err(&pdev->dev, "Request IRQ failed (%d)\n", rc); return rc; @@ -449,16 +450,6 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) return 0; } -static int pm8xxx_rtc_remove(struct platform_device *pdev) -{ - struct pm8xxx_rtc *rtc_dd = platform_get_drvdata(pdev); - - device_init_wakeup(&pdev->dev, 0); - free_irq(rtc_dd->rtc_alarm_irq, rtc_dd); - - return 0; -} - #ifdef CONFIG_PM_SLEEP static int pm8xxx_rtc_resume(struct device *dev) { @@ -487,7 +478,6 @@ static SIMPLE_DEV_PM_OPS(pm8xxx_rtc_pm_ops, static struct platform_driver pm8xxx_rtc_driver = { .probe = pm8xxx_rtc_probe, - .remove = pm8xxx_rtc_remove, .driver = { .name = PM8XXX_RTC_DEV_NAME, .owner = THIS_MODULE, -- cgit v1.2.3 From 5a418558cdae4fd0699b7b6a960f03a8bb38a338 Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 3 Apr 2014 14:50:13 -0700 Subject: rtc: pm8xxx: add support for devicetree Add support for describing the PM8921/PM8058 RTC in device tree. 
Additionally: - drop support for describing the RTC using platform data, as there are no current in tree users who do so. - make allow_set_time a device-specific flag, instead of mucking with the rtc_ops Signed-off-by: Josh Cartwright Reviewed-by: Stephen Boyd Acked-by: Lee Jones Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pm8xxx.c | 52 +++++++++++++++++++++++------------------- include/linux/mfd/pm8xxx/rtc.h | 25 -------------------- 2 files changed, 29 insertions(+), 48 deletions(-) delete mode 100644 include/linux/mfd/pm8xxx/rtc.h diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index 91ac2e346aea..6e3cd345379d 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -9,7 +9,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - +#include #include #include #include @@ -19,9 +19,6 @@ #include #include -#include - - /* RTC Register offsets from RTC CTRL REG */ #define PM8XXX_ALARM_CTRL_OFFSET 0x01 #define PM8XXX_RTC_WRITE_OFFSET 0x02 @@ -39,6 +36,7 @@ * struct pm8xxx_rtc - rtc driver internal structure * @rtc: rtc device for this driver. * @regmap: regmap used to access RTC registers + * @allow_set_time: indicates whether writing to the RTC is allowed * @rtc_alarm_irq: rtc alarm irq number. * @rtc_base: address of rtc control register. * @rtc_read_base: base address of read registers. @@ -51,6 +49,7 @@ struct pm8xxx_rtc { struct rtc_device *rtc; struct regmap *regmap; + bool allow_set_time; int rtc_alarm_irq; int rtc_base; int rtc_read_base; @@ -75,6 +74,9 @@ static int pm8xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) u8 value[NUM_8_BIT_RTC_REGS], alarm_enabled = 0, ctrl_reg; struct pm8xxx_rtc *rtc_dd = dev_get_drvdata(dev); + if (!rtc_dd->allow_set_time) + return -EACCES; + rtc_tm_to_time(tm, &secs); for (i = 0; i < NUM_8_BIT_RTC_REGS; i++) { @@ -298,8 +300,9 @@ rtc_rw_fail: return rc; } -static struct rtc_class_ops pm8xxx_rtc_ops = { +static const struct rtc_class_ops pm8xxx_rtc_ops = { .read_time = pm8xxx_rtc_read_time, + .set_time = pm8xxx_rtc_set_time, .set_alarm = pm8xxx_rtc_set_alarm, .read_alarm = pm8xxx_rtc_read_alarm, .alarm_irq_enable = pm8xxx_rtc_alarm_irq_enable, @@ -353,18 +356,26 @@ rtc_alarm_handled: return IRQ_HANDLED; } +/* + * Hardcoded RTC bases until IORESOURCE_REG mapping is figured out + */ +static const struct of_device_id pm8xxx_id_table[] = { + { .compatible = "qcom,pm8921-rtc", .data = (void *) 0x11D }, + { .compatible = "qcom,pm8058-rtc", .data = (void *) 0x1E8 }, + { }, +}; +MODULE_DEVICE_TABLE(of, pm8xxx_id_table); + static int pm8xxx_rtc_probe(struct platform_device *pdev) { int rc; unsigned int ctrl_reg; - bool rtc_write_enable = false; struct pm8xxx_rtc *rtc_dd; - struct resource *rtc_resource; - const struct pm8xxx_rtc_platform_data *pdata = - dev_get_platdata(&pdev->dev); + const struct of_device_id *match; - if (pdata != NULL) - rtc_write_enable = pdata->rtc_write_enable; + match = of_match_node(pm8xxx_id_table, pdev->dev.of_node); + if (!match) + return -ENXIO; rtc_dd = devm_kzalloc(&pdev->dev, sizeof(*rtc_dd), GFP_KERNEL); if (rtc_dd == NULL) @@ -385,14 +396,10 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) return -ENXIO; } - rtc_resource = platform_get_resource_byname(pdev, IORESOURCE_IO, - "pmic_rtc_base"); - if (!(rtc_resource && rtc_resource->start)) { - dev_err(&pdev->dev, "RTC IO resource absent!\n"); - return -ENXIO; - } + rtc_dd->allow_set_time = 
of_property_read_bool(pdev->dev.of_node, + "allow-set-time"); - rtc_dd->rtc_base = rtc_resource->start; + rtc_dd->rtc_base = (long) match->data; /* Setup RTC register addresses */ rtc_dd->rtc_write_base = rtc_dd->rtc_base + PM8XXX_RTC_WRITE_OFFSET; @@ -419,8 +426,6 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) } rtc_dd->ctrl_reg = ctrl_reg; - if (rtc_write_enable) - pm8xxx_rtc_ops.set_time = pm8xxx_rtc_set_time; platform_set_drvdata(pdev, rtc_dd); @@ -479,9 +484,10 @@ static SIMPLE_DEV_PM_OPS(pm8xxx_rtc_pm_ops, static struct platform_driver pm8xxx_rtc_driver = { .probe = pm8xxx_rtc_probe, .driver = { - .name = PM8XXX_RTC_DEV_NAME, - .owner = THIS_MODULE, - .pm = &pm8xxx_rtc_pm_ops, + .name = "rtc-pm8xxx", + .owner = THIS_MODULE, + .pm = &pm8xxx_rtc_pm_ops, + .of_match_table = pm8xxx_id_table, }, }; diff --git a/include/linux/mfd/pm8xxx/rtc.h b/include/linux/mfd/pm8xxx/rtc.h deleted file mode 100644 index 14f1983eaecc..000000000000 --- a/include/linux/mfd/pm8xxx/rtc.h +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __RTC_PM8XXX_H__ -#define __RTC_PM8XXX_H__ - -#define PM8XXX_RTC_DEV_NAME "rtc-pm8xxx" -/** - * struct pm8xxx_rtc_pdata - RTC driver platform data - * @rtc_write_enable: variable stating RTC write capability - */ -struct pm8xxx_rtc_platform_data { - bool rtc_write_enable; -}; - -#endif /* __RTC_PM8XXX_H__ */ -- cgit v1.2.3 From fda9909ddaab13fb7b35b94f204623c1f9571fc7 Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 3 Apr 2014 14:50:14 -0700 Subject: rtc: pm8xxx: move device_init_wakeup() before rtc_register Setup wakeup capability before rtc_register to ensure the rtc class core properly sets up our 'wakealarm' sysfs attribute. Signed-off-by: Josh Cartwright Reviewed-by: Stephen Boyd Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pm8xxx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index 6e3cd345379d..197699f358c7 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -429,6 +429,8 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc_dd); + device_init_wakeup(&pdev->dev, 1); + /* Register the RTC device */ rtc_dd->rtc = devm_rtc_device_register(&pdev->dev, "pm8xxx_rtc", &pm8xxx_rtc_ops, THIS_MODULE); @@ -448,8 +450,6 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) return rc; } - device_init_wakeup(&pdev->dev, 1); - dev_dbg(&pdev->dev, "Probe success !!\n"); return 0; -- cgit v1.2.3 From 5ab9a52a3af613b74a7cda033c1623c7913c50d9 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 3 Apr 2014 14:50:15 -0700 Subject: drivers/rtc/rtc-mc13xxx.c: fix potential race condition RTC drivers must not return an error after device registration. This patch makes RTC registration as the last action. 
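The underlying rule is that once the RTC device is registered, userspace and the class core may start using it, so probe must not fail afterwards; ordering the devm registration last keeps every error path trivial. A schematic sketch, with the example_ names standing in for the driver's own helpers:

static int example_rtc_probe(struct platform_device *pdev)
{
	struct example_rtc *priv = platform_get_drvdata(pdev);	/* illustrative */
	int ret;

	ret = example_request_irqs(priv);	/* hypothetical helper */
	if (ret)
		return ret;		/* nothing registered yet: trivial unwind */

	/* Registration is the last action; after this, probe cannot fail. */
	priv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
					     &example_rtc_ops, THIS_MODULE);
	return PTR_ERR_OR_ZERO(priv->rtc);
}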
Signed-off-by: Alexander Shiyan Acked-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 9b1a77eb91a9..0765606a2d14 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -247,8 +247,6 @@ static irqreturn_t mc13xxx_rtc_alarm_handler(int irq, void *dev) struct mc13xxx_rtc *priv = dev; struct mc13xxx *mc13xxx = priv->mc13xxx; - dev_dbg(&priv->rtc->dev, "Alarm\n"); - rtc_update_irq(priv->rtc, 1, RTC_IRQF | RTC_AF); mc13xxx_irq_ack(mc13xxx, irq); @@ -261,8 +259,6 @@ static irqreturn_t mc13xxx_rtc_update_handler(int irq, void *dev) struct mc13xxx_rtc *priv = dev; struct mc13xxx *mc13xxx = priv->mc13xxx; - dev_dbg(&priv->rtc->dev, "1HZ\n"); - rtc_update_irq(priv->rtc, 1, RTC_IRQF | RTC_UF); mc13xxx_irq_ack(mc13xxx, irq); @@ -283,7 +279,6 @@ static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev) struct mc13xxx_rtc *priv = dev; struct mc13xxx *mc13xxx = priv->mc13xxx; - dev_warn(&priv->rtc->dev, "Contents of the RTC are no longer valid\n"); priv->valid = 0; mc13xxx_irq_mask(mc13xxx, irq); @@ -307,11 +302,6 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, priv); - priv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, - &mc13xxx_rtc_ops, THIS_MODULE); - if (IS_ERR(priv->rtc)) - return PTR_ERR(priv->rtc); - mc13xxx_lock(mc13xxx); mc13xxx_irq_ack(mc13xxx, MC13XXX_IRQ_RTCRST); @@ -319,24 +309,30 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_RTCRST, mc13xxx_rtc_reset_handler, DRIVER_NAME, priv); if (ret) - goto err_reset_irq_request; + goto err_irq_request; ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_1HZ, mc13xxx_rtc_update_handler, DRIVER_NAME, priv); if (ret) - goto err_reset_irq_status; + goto err_irq_request; ret = mc13xxx_irq_request_nounmask(mc13xxx, MC13XXX_IRQ_TODA, mc13xxx_rtc_alarm_handler, DRIVER_NAME, priv); - if (!ret) - goto err_reset_irq_request; + if (ret) + goto err_irq_request; - mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_1HZ, priv); + mc13xxx_unlock(mc13xxx); + + priv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, + &mc13xxx_rtc_ops, THIS_MODULE); -err_reset_irq_status: + return 0; + +err_irq_request: + mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_TODA, priv); + mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_1HZ, priv); mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_RTCRST, priv); -err_reset_irq_request: mc13xxx_unlock(mc13xxx); return ret; -- cgit v1.2.3 From 617b26a0e1cf8f183328763b6f1a33a422b60614 Mon Sep 17 00:00:00 2001 From: Raghavendra Ganiga Date: Thu, 3 Apr 2014 14:50:16 -0700 Subject: rtc: add support for maxim dallas rtc ds1347 Add support for maxim dallas rtc ds1347 Signed-off-by: Raghavendra Chandra Ganiga Acked-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/Kconfig | 12 ++++ drivers/rtc/Makefile | 1 + drivers/rtc/rtc-ds1347.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+) create mode 100644 drivers/rtc/rtc-ds1347.c diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index db933decc39c..2e565f8e5165 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -573,6 +573,18 @@ config RTC_DRV_DS1305 This driver can also be built as a module. If so, the module will be called rtc-ds1305. 
+config RTC_DRV_DS1347 + tristate "Dallas/Maxim DS1347" + help + If you say yes here you get support for the + Dallas/Maxim DS1347 chips. + + This driver only supports the RTC feature, and not other chip + features such as alarms. + + This driver can also be built as a module. If so, the module + will be called rtc-ds1347. + config RTC_DRV_DS1390 tristate "Dallas/Maxim DS1390/93/94" help diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index b427bf7dd20d..40a09915c8f6 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o obj-$(CONFIG_RTC_DRV_DS1305) += rtc-ds1305.o obj-$(CONFIG_RTC_DRV_DS1307) += rtc-ds1307.o +obj-$(CONFIG_RTC_DRV_DS1347) += rtc-ds1347.o obj-$(CONFIG_RTC_DRV_DS1374) += rtc-ds1374.o obj-$(CONFIG_RTC_DRV_DS1390) += rtc-ds1390.o obj-$(CONFIG_RTC_DRV_DS1511) += rtc-ds1511.o diff --git a/drivers/rtc/rtc-ds1347.c b/drivers/rtc/rtc-ds1347.c new file mode 100644 index 000000000000..c82b4c050326 --- /dev/null +++ b/drivers/rtc/rtc-ds1347.c @@ -0,0 +1,166 @@ +/* rtc-ds1347.c + * + * Driver for Dallas Semiconductor DS1347 Low Current, SPI Compatible + * Real Time Clock + * + * Author : Raghavendra Chandra Ganiga + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +/* Registers in ds1347 rtc */ + +#define DS1347_SECONDS_REG 0x01 +#define DS1347_MINUTES_REG 0x03 +#define DS1347_HOURS_REG 0x05 +#define DS1347_DATE_REG 0x07 +#define DS1347_MONTH_REG 0x09 +#define DS1347_DAY_REG 0x0B +#define DS1347_YEAR_REG 0x0D +#define DS1347_CONTROL_REG 0x0F +#define DS1347_STATUS_REG 0x17 +#define DS1347_CLOCK_BURST 0x3F + +static int ds1347_read_reg(struct device *dev, unsigned char address, + unsigned char *data) +{ + struct spi_device *spi = to_spi_device(dev); + + *data = address | 0x80; + + return spi_write_then_read(spi, data, 1, data, 1); +} + +static int ds1347_write_reg(struct device *dev, unsigned char address, + unsigned char data) +{ + struct spi_device *spi = to_spi_device(dev); + unsigned char buf[2]; + + buf[0] = address & 0x7F; + buf[1] = data; + + return spi_write_then_read(spi, buf, 2, NULL, 0); +} + +static int ds1347_read_time(struct device *dev, struct rtc_time *dt) +{ + struct spi_device *spi = to_spi_device(dev); + int err; + unsigned char buf[8]; + + buf[0] = DS1347_CLOCK_BURST | 0x80; + + err = spi_write_then_read(spi, buf, 1, buf, 8); + if (err) + return err; + + dt->tm_sec = bcd2bin(buf[0]); + dt->tm_min = bcd2bin(buf[1]); + dt->tm_hour = bcd2bin(buf[2] & 0x3F); + dt->tm_mday = bcd2bin(buf[3]); + dt->tm_mon = bcd2bin(buf[4]) - 1; + dt->tm_wday = bcd2bin(buf[5]) - 1; + dt->tm_year = bcd2bin(buf[6]) + 100; + + return rtc_valid_tm(dt); +} + +static int ds1347_set_time(struct device *dev, struct rtc_time *dt) +{ + struct spi_device *spi = to_spi_device(dev); + unsigned char buf[9]; + + buf[0] = DS1347_CLOCK_BURST & 0x7F; + buf[1] = bin2bcd(dt->tm_sec); + buf[2] = bin2bcd(dt->tm_min); + buf[3] = (bin2bcd(dt->tm_hour) & 0x3F); + buf[4] = bin2bcd(dt->tm_mday); + buf[5] = bin2bcd(dt->tm_mon + 1); + buf[6] = bin2bcd(dt->tm_wday + 1); + + /* year in linux is from 1900 i.e in range of 100 + in rtc it is from 00 to 99 */ + dt->tm_year = dt->tm_year % 100; + + buf[7] = bin2bcd(dt->tm_year); + buf[8] = bin2bcd(0x00); + + /* write the rtc settings 
*/ + return spi_write_then_read(spi, buf, 9, NULL, 0); +} + +static const struct rtc_class_ops ds1347_rtc_ops = { + .read_time = ds1347_read_time, + .set_time = ds1347_set_time, +}; + +static int ds1347_probe(struct spi_device *spi) +{ + struct rtc_device *rtc; + unsigned char data; + int res; + + /* spi setup with ds1347 in mode 3 and bits per word as 8 */ + spi->mode = SPI_MODE_3; + spi->bits_per_word = 8; + spi_setup(spi); + + /* RTC Settings */ + res = ds1347_read_reg(&spi->dev, DS1347_SECONDS_REG, &data); + if (res) + return res; + + /* Disable the write protect of rtc */ + ds1347_read_reg(&spi->dev, DS1347_CONTROL_REG, &data); + data = data & ~(1<<7); + ds1347_write_reg(&spi->dev, DS1347_CONTROL_REG, data); + + /* Enable the oscillator , disable the oscillator stop flag, + and glitch filter to reduce current consumption */ + ds1347_read_reg(&spi->dev, DS1347_STATUS_REG, &data); + data = data & 0x1B; + ds1347_write_reg(&spi->dev, DS1347_STATUS_REG, data); + + /* display the settings */ + ds1347_read_reg(&spi->dev, DS1347_CONTROL_REG, &data); + dev_info(&spi->dev, "DS1347 RTC CTRL Reg = 0x%02x\n", data); + + ds1347_read_reg(&spi->dev, DS1347_STATUS_REG, &data); + dev_info(&spi->dev, "DS1347 RTC Status Reg = 0x%02x\n", data); + + rtc = devm_rtc_device_register(&spi->dev, "ds1347", + &ds1347_rtc_ops, THIS_MODULE); + + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + spi_set_drvdata(spi, rtc); + + return 0; +} + +static struct spi_driver ds1347_driver = { + .driver = { + .name = "ds1347", + .owner = THIS_MODULE, + }, + .probe = ds1347_probe, +}; + +module_spi_driver(ds1347_driver); + +MODULE_DESCRIPTION("DS1347 SPI RTC DRIVER"); +MODULE_AUTHOR("Raghavendra C Ganiga "); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 67ab2440b40610b3b91068844640df6d3de3464f Mon Sep 17 00:00:00 2001 From: Gregory Hermant Date: Thu, 3 Apr 2014 14:50:17 -0700 Subject: drivers/rtc/rtc-rv3029c2.c: fix potential race condition RTC drivers must not return an error after device registration. Signed-off-by: Gregory Hermant Acked-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-rv3029c2.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-rv3029c2.c b/drivers/rtc/rtc-rv3029c2.c index 1a779a67ff66..e9ac5a43be1a 100644 --- a/drivers/rtc/rtc-rv3029c2.c +++ b/drivers/rtc/rtc-rv3029c2.c @@ -395,6 +395,12 @@ static int rv3029c2_probe(struct i2c_client *client, if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_EMUL)) return -ENODEV; + rc = rv3029c2_i2c_get_sr(client, buf); + if (rc < 0) { + dev_err(&client->dev, "reading status failed\n"); + return rc; + } + rtc = devm_rtc_device_register(&client->dev, client->name, &rv3029c2_rtc_ops, THIS_MODULE); @@ -403,12 +409,6 @@ static int rv3029c2_probe(struct i2c_client *client, i2c_set_clientdata(client, rtc); - rc = rv3029c2_i2c_get_sr(client, buf); - if (rc < 0) { - dev_err(&client->dev, "reading status failed\n"); - return rc; - } - return 0; } -- cgit v1.2.3 From dc9d8887408ac3dfd907b826c5dfdfe953c21a0f Mon Sep 17 00:00:00 2001 From: Adam Thomson Date: Thu, 3 Apr 2014 14:50:18 -0700 Subject: drivers/rtc/rtc-da9055.c: remove use of regmap_irq_get_virq() Using platform_get_irq_byname() to retrieve the IRQ number returns the VIRQ number rather than the local IRQ number for the device. Passing that value then into regmap_irq_get_virq() causes a failure because the function is expecting the local IRQ number (e.g. 0, 1, 2, 3, etc). 
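Before this patch the probe path effectively did the following, feeding an already-mapped virq back into a lookup that wants the local index (trimmed from the driver for illustration):

	alm_irq = platform_get_irq_byname(pdev, "ALM");	/* already a virq */
	alm_irq = regmap_irq_get_virq(rtc->da9055->irq_data, alm_irq);
						/* expects 0, 1, 2, ... so this fails */
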
This patch removes use of regmap_irq_get_virq() to prevent this failure from happening Signed-off-by: Adam Thomson Cc: Alessandro Zummo Cc: Dmitry Torokhov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-da9055.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-da9055.c b/drivers/rtc/rtc-da9055.c index 48cb2ac3bd3e..a825491331c8 100644 --- a/drivers/rtc/rtc-da9055.c +++ b/drivers/rtc/rtc-da9055.c @@ -302,7 +302,9 @@ static int da9055_rtc_probe(struct platform_device *pdev) } alm_irq = platform_get_irq_byname(pdev, "ALM"); - alm_irq = regmap_irq_get_virq(rtc->da9055->irq_data, alm_irq); + if (alm_irq < 0) + return alm_irq; + ret = devm_request_threaded_irq(&pdev->dev, alm_irq, NULL, da9055_rtc_alm_irq, IRQF_TRIGGER_HIGH | IRQF_ONESHOT, -- cgit v1.2.3 From ebe753357ecd10a6dc521dc89f2bfea6c0ee0ca2 Mon Sep 17 00:00:00 2001 From: Pankaj Dubey Date: Thu, 3 Apr 2014 14:50:19 -0700 Subject: drivers/rtc/rtc-s3c.c: remove NO_IRQ macro NO_IRQ may be defined as '(unsigned int) -1' in some architectures (arm, sh ...), and either may not be defined in some architectures (arm64) which can enable RTC_DRV_S3C. Also since platform_get_irq returns err-code in case of any error, we do not need to intialize s3c_rtc_alarmno and s3c_rtc_tickno. Signed-off-by: Pankaj Dubey Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-s3c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index c4cde9c08f1f..4958a363b2c7 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -48,8 +48,8 @@ struct s3c_rtc_drv_data { static struct clk *rtc_clk; static void __iomem *s3c_rtc_base; -static int s3c_rtc_alarmno = NO_IRQ; -static int s3c_rtc_tickno = NO_IRQ; +static int s3c_rtc_alarmno; +static int s3c_rtc_tickno; static enum s3c_cpu_type s3c_rtc_cpu_type; static DEFINE_SPINLOCK(s3c_rtc_pie_lock); -- cgit v1.2.3 From 0d71915d27a6bfd896513c5aa3bae4a09ca81c2c Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 3 Apr 2014 14:50:20 -0700 Subject: rtc: treewide: remove excess rtc_device validation The patch "rtc: verify a critical argument to rtc_update_irq() before using it" introduces validation for rtc_device in the RTC core, so there are no need to check this argument for rtc_update_irq() from the drivers. This patch removes such check for the existing rtc_update_irq() users. Signed-off-by: Alexander Shiyan Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1511.c | 3 +-- drivers/rtc/rtc-ds1553.c | 3 +-- drivers/rtc/rtc-stk17ta8.c | 3 +-- drivers/rtc/rtc-tx4939.c | 4 ++-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 4ff20f9dc212..b13d1399b81a 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -371,8 +371,7 @@ ds1511_interrupt(int irq, void *dev_id) events |= RTC_UF; else events |= RTC_AF; - if (likely(pdata->rtc)) - rtc_update_irq(pdata->rtc, 1, events); + rtc_update_irq(pdata->rtc, 1, events); } spin_unlock(&pdata->lock); return events ? 
IRQ_HANDLED : IRQ_NONE; diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 9ac2fa07dffa..ab56893aac73 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -206,8 +206,7 @@ static irqreturn_t ds1553_rtc_interrupt(int irq, void *dev_id) events |= RTC_UF; else events |= RTC_AF; - if (likely(pdata->rtc)) - rtc_update_irq(pdata->rtc, 1, events); + rtc_update_irq(pdata->rtc, 1, events); } spin_unlock(&pdata->lock); return events ? IRQ_HANDLED : IRQ_NONE; diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index a176ba614683..35ed49ea1f81 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -214,8 +214,7 @@ static irqreturn_t stk17ta8_rtc_interrupt(int irq, void *dev_id) events |= RTC_UF; else events |= RTC_AF; - if (likely(pdata->rtc)) - rtc_update_irq(pdata->rtc, 1, events); + rtc_update_irq(pdata->rtc, 1, events); } spin_unlock(&pdata->lock); return events ? IRQ_HANDLED : IRQ_NONE; diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index 4f87234e0dee..2e678c681b13 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -176,8 +176,8 @@ static irqreturn_t tx4939_rtc_interrupt(int irq, void *dev_id) tx4939_rtc_cmd(rtcreg, TX4939_RTCCTL_COMMAND_NOP); } spin_unlock(&pdata->lock); - if (likely(pdata->rtc)) - rtc_update_irq(pdata->rtc, 1, events); + rtc_update_irq(pdata->rtc, 1, events); + return IRQ_HANDLED; } -- cgit v1.2.3 From 84ee353df07e9beec200da44ac70f583449fffb1 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:50:21 -0700 Subject: fs/minix/inode.c: add __init to init_inodecache() init_inodecache is only called by __init init_minix_fs. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/minix/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 03aaeb1a694a..0ad2ec9601de 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -86,7 +86,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), -- cgit v1.2.3 From 2d4319ef570d44a8957b2af0a35d3e92b844b1b5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:50:21 -0700 Subject: befs: replace kmalloc/memset 0 by kzalloc Use kzalloc for clean fs_info allocation like other filesystems. 
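The change is the standard substitution: kzalloc(size, flags) behaves like kmalloc(size, flags) followed by zeroing the buffer, so the separate memset() can go away. In generic terms:

	/* before */
	ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
	if (!ptr)
		return -ENOMEM;
	memset(ptr, 0, sizeof(*ptr));

	/* after */
	ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
	if (!ptr)
		return -ENOMEM;
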
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/befs/linuxvfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 845d2d690ce2..a1a09462fe47 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -791,7 +791,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) save_mount_options(sb, data); - sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); + sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); if (sb->s_fs_info == NULL) { printk(KERN_ERR "BeFS(%s): Unable to allocate memory for private " @@ -799,7 +799,6 @@ befs_fill_super(struct super_block *sb, void *data, int silent) goto unacquire_none; } befs_sb = BEFS_SB(sb); - memset(befs_sb, 0, sizeof(befs_sb_info)); if (!parse_options((char *) data, &befs_sb->mount_opts)) { befs_error(sb, "cannot parse mount options"); -- cgit v1.2.3 From 91a52ab7d664a1c8972a0ecb30955d34aea54d7f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:50:22 -0700 Subject: fs/befs/linuxvfs.c: add __init to befs_init_inodecache() init_inodecache is only called by __init init_befs_fs. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/befs/linuxvfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a1a09462fe47..079872d61f75 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -39,7 +39,6 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int) static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); -static int befs_init_inodecache(void); static void befs_destroy_inodecache(void); static void *befs_follow_link(struct dentry *, struct nameidata *); static void *befs_fast_follow_link(struct dentry *, struct nameidata *); @@ -445,7 +444,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) * * Taken from NFS implementation by Al Viro. 
*/ -static int +static int __init befs_init_inodecache(void) { befs_inode_cachep = kmem_cache_create("befs_inode_cache", -- cgit v1.2.3 From dac52fc1826a788d2591a4f77e3c482b30f577e2 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:50:23 -0700 Subject: BEFS: logging cleanup Summary: - all printk(KERN_foo converted to pr_foo() - add pr_fmt and remove redundant prefixes - convert befs_() to va_format (based on patch by Joe Perches) - remove non standard %Lu - use __func__ for all debugging [akpm@linux-foundation.org: fix printk warnings, reported by Fengguang] Signed-off-by: Fabian Frederick Cc: Joe Perches Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/befs/Makefile | 2 +- fs/befs/befs.h | 3 ++ fs/befs/btree.c | 93 ++++++++++++++++++++++---------------------- fs/befs/datastream.c | 87 ++++++++++++++++++++++-------------------- fs/befs/debug.c | 74 ++++++++++++----------------------- fs/befs/inode.c | 10 +++-- fs/befs/io.c | 24 ++++++------ fs/befs/linuxvfs.c | 106 ++++++++++++++++++++++++++------------------------- 8 files changed, 195 insertions(+), 204 deletions(-) diff --git a/fs/befs/Makefile b/fs/befs/Makefile index 2f370bd7a50d..8b9f66642a83 100644 --- a/fs/befs/Makefile +++ b/fs/befs/Makefile @@ -3,5 +3,5 @@ # obj-$(CONFIG_BEFS_FS) += befs.o - +ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o diff --git a/fs/befs/befs.h b/fs/befs/befs.h index b26642839156..3a7813ab8c95 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -88,8 +88,11 @@ enum befs_err { /****************************/ /* debug.c */ +__printf(2, 3) void befs_error(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) void befs_warning(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) void befs_debug(const struct super_block *sb, const char *fmt, ...); void befs_dump_super_block(const struct super_block *sb, befs_super_block *); diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 74e397db0b8b..a2cd305a993a 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, struct buffer_head *bh = NULL; befs_disk_btree_super *od_sup = NULL; - befs_debug(sb, "---> befs_btree_read_super()"); + befs_debug(sb, "---> %s", __func__); bh = befs_read_datastream(sb, ds, 0, NULL); @@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, goto error; } - befs_debug(sb, "<--- befs_btree_read_super()"); + befs_debug(sb, "<--- %s", __func__); return BEFS_OK; error: - befs_debug(sb, "<--- befs_btree_read_super() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, { uint off = 0; - befs_debug(sb, "---> befs_bt_read_node()"); + befs_debug(sb, "---> %s", __func__); if (node->bh) brelse(node->bh); node->bh = befs_read_datastream(sb, ds, node_off, &off); if (!node->bh) { - befs_error(sb, "befs_bt_read_node() failed to read " - "node at %Lu", node_off); - befs_debug(sb, "<--- befs_bt_read_node() ERROR"); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, node_off); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, node->head.all_key_length = fs16_to_cpu(sb, node->od_node->all_key_length); - befs_debug(sb, "<--- befs_btree_read_node()"); + befs_debug(sb, "<--- %s", 
__func__); return BEFS_OK; } @@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, befs_off_t node_off; int res; - befs_debug(sb, "---> befs_btree_find() Key: %s", key); + befs_debug(sb, "---> %s Key: %s", __func__, key); if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { befs_error(sb, @@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS); if (!this_node) { - befs_error(sb, "befs_btree_find() failed to allocate %u " + befs_error(sb, "befs_btree_find() failed to allocate %zu " "bytes of memory", sizeof (befs_btree_node)); goto error; } @@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, node_off = bt_super.root_node_ptr; if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " - "node at %Lu", node_off); + "node at %llu", node_off); goto error_alloc; } @@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, /* if no match, go to overflow node */ if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " - "node at %Lu", node_off); + "node at %llu", node_off); goto error_alloc; } } @@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, kfree(this_node); if (res != BEFS_BT_MATCH) { - befs_debug(sb, "<--- befs_btree_find() Key %s not found", key); + befs_debug(sb, "<--- %s Key %s not found", __func__, key); *value = 0; return BEFS_BT_NOT_FOUND; } - befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu", + befs_debug(sb, "<--- %s Found key %s, value %llu", __func__, key, *value); return BEFS_OK; @@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, kfree(this_node); error: *value = 0; - befs_debug(sb, "<--- befs_btree_find() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, char *thiskey; fs64 *valarray; - befs_debug(sb, "---> befs_find_key() %s", findkey); + befs_debug(sb, "---> %s %s", __func__, findkey); *value = 0; @@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); if (eq < 0) { - befs_debug(sb, "<--- befs_find_key() %s not found", findkey); + befs_debug(sb, "<--- %s %s not found", __func__, findkey); return BEFS_BT_NOT_FOUND; } @@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, findkey_len); if (eq == 0) { - befs_debug(sb, "<--- befs_find_key() found %s at %d", - thiskey, mid); + befs_debug(sb, "<--- %s found %s at %d", + __func__, thiskey, mid); *value = fs64_to_cpu(sb, valarray[mid]); return BEFS_BT_MATCH; @@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, *value = fs64_to_cpu(sb, valarray[mid + 1]); else *value = fs64_to_cpu(sb, valarray[mid]); - befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid); + befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid); return BEFS_BT_PARMATCH; } @@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, uint key_sum = 0; - befs_debug(sb, "---> befs_btree_read()"); + befs_debug(sb, "---> %s", __func__); if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { befs_error(sb, @@ -437,7 +437,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, } if ((this_node = kmalloc(sizeof 
(befs_btree_node), GFP_NOFS)) == NULL) { - befs_error(sb, "befs_btree_read() failed to allocate %u " + befs_error(sb, "befs_btree_read() failed to allocate %zu " "bytes of memory", sizeof (befs_btree_node)); goto error; } @@ -452,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, kfree(this_node); *value = 0; *keysize = 0; - befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY"); + befs_debug(sb, "<--- %s Tree is EMPTY", __func__); return BEFS_BT_EMPTY; } else if (res == BEFS_ERR) { goto error_alloc; @@ -467,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, *keysize = 0; *value = 0; befs_debug(sb, - "<--- befs_btree_read() END of keys at %Lu", + "<--- %s END of keys at %llu", __func__, + (unsigned long long) key_sum + this_node->head.all_key_count); brelse(this_node->bh); kfree(this_node); @@ -478,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, node_off = this_node->head.right; if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_read() failed to read " - "node at %Lu", node_off); + befs_error(sb, "%s failed to read node at %llu", + __func__, (unsigned long long)node_off); goto error_alloc; } } @@ -492,11 +493,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen); - befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen); + befs_debug(sb, "Read [%llu,%d]: keysize %d", + (long long unsigned int)node_off, (int)cur_key, + (int)keylen); if (bufsize < keylen + 1) { - befs_error(sb, "befs_btree_read() keybuf too small (%u) " - "for key of size %d", bufsize, keylen); + befs_error(sb, "%s keybuf too small (%zu) " + "for key of size %d", __func__, bufsize, keylen); brelse(this_node->bh); goto error_alloc; }; @@ -506,13 +509,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, *keysize = keylen; keybuf[keylen] = '\0'; - befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off, + befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off, cur_key, keylen, keybuf, *value); brelse(this_node->bh); kfree(this_node); - befs_debug(sb, "<--- befs_btree_read()"); + befs_debug(sb, "<--- %s", __func__); return BEFS_OK; @@ -522,7 +525,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, error: *keysize = 0; *value = 0; - befs_debug(sb, "<--- befs_btree_read() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -547,26 +550,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, befs_off_t * node_off) { - befs_debug(sb, "---> befs_btree_seekleaf()"); + befs_debug(sb, "---> %s", __func__); if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_seekleaf() failed to read " - "node at %Lu", *node_off); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, *node_off); goto error; } - befs_debug(sb, "Seekleaf to root node %Lu", *node_off); + befs_debug(sb, "Seekleaf to root node %llu", *node_off); if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) { - befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY"); + befs_debug(sb, "<--- %s Tree is EMPTY", __func__); return BEFS_BT_EMPTY; } while (!befs_leafnode(this_node)) { if (this_node->head.all_key_count == 0) { - befs_debug(sb, "befs_btree_seekleaf() encountered " - "an empty interior node: %Lu. Using Overflow " - "node: %Lu", *node_off, + befs_debug(sb, "%s encountered " + "an empty interior node: %llu. 
Using Overflow " + "node: %llu", __func__, *node_off, this_node->head.overflow); *node_off = this_node->head.overflow; } else { @@ -574,19 +577,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, *node_off = fs64_to_cpu(sb, valarray[0]); } if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_seekleaf() failed to read " - "node at %Lu", *node_off); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, *node_off); goto error; } - befs_debug(sb, "Seekleaf to child node %Lu", *node_off); + befs_debug(sb, "Seekleaf to child node %llu", *node_off); } - befs_debug(sb, "Node %Lu is a leaf node", *node_off); + befs_debug(sb, "Node %llu is a leaf node", *node_off); return BEFS_OK; error: - befs_debug(sb, "<--- befs_btree_seekleaf() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index 59096b5e0fc7..c467bebd50af 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds, befs_block_run run; befs_blocknr_t block; /* block coresponding to pos */ - befs_debug(sb, "---> befs_read_datastream() %Lu", pos); + befs_debug(sb, "---> %s %llu", __func__, pos); block = pos >> BEFS_SB(sb)->block_shift; if (off) *off = pos - (block << BEFS_SB(sb)->block_shift); if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) { befs_error(sb, "BeFS: Error finding disk addr of block %lu", - block); - befs_debug(sb, "<--- befs_read_datastream() ERROR"); + (unsigned long)block); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } bh = befs_bread_iaddr(sb, run); if (!bh) { befs_error(sb, "BeFS: Error reading block %lu from datastream", - block); + (unsigned long)block); return NULL; } - befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu", - pos); + befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos); return bh; } @@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data, } else { befs_error(sb, "befs_fblock2brun() was asked to find block %lu, " - "which is not mapped by the datastream\n", fblock); + "which is not mapped by the datastream\n", + (unsigned long)fblock); err = BEFS_ERR; } return err; @@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, befs_off_t bytes_read = 0; /* bytes readed */ u16 plen; struct buffer_head *bh = NULL; - befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len); + befs_debug(sb, "---> %s length: %llu", __func__, len); while (bytes_read < len) { bh = befs_read_datastream(sb, ds, bytes_read, NULL); if (!bh) { befs_error(sb, "BeFS: Error reading datastream block " - "starting from %Lu", bytes_read); - befs_debug(sb, "<--- befs_read_lsymlink() ERROR"); + "starting from %llu", bytes_read); + befs_debug(sb, "<--- %s ERROR", __func__); return bytes_read; } @@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, bytes_read += plen; } - befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read); + befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int) + bytes_read); return bytes_read; } @@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds) befs_blocknr_t metablocks; /* FS metadata blocks */ befs_sb_info *befs_sb = BEFS_SB(sb); - befs_debug(sb, "---> befs_count_blocks()"); + befs_debug(sb, "---> %s", __func__); datablocks = ds->size >> 
befs_sb->block_shift; if (ds->size & (befs_sb->block_size - 1)) @@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds) } blocks = datablocks + metablocks; - befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks); + befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks); return blocks; } @@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, befs_blocknr_t max_block = data->max_direct_range >> BEFS_SB(sb)->block_shift; - befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno); + befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); if (blockno > max_block) { - befs_error(sb, "befs_find_brun_direct() passed block outside of" - "direct region"); + befs_error(sb, "%s passed block outside of direct region", + __func__); return BEFS_ERR; } @@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, run->start = array[i].start + offset; run->len = array[i].len - offset; - befs_debug(sb, "---> befs_find_brun_direct(), " - "found %lu at direct[%d]", blockno, i); + befs_debug(sb, "---> %s, " + "found %lu at direct[%d]", __func__, + (unsigned long)blockno, i); return BEFS_OK; } } - befs_debug(sb, "---> befs_find_brun_direct() ERROR"); + befs_debug(sb, "---> %s ERROR", __func__); return BEFS_ERR; } @@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb, befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect); int arraylen = befs_iaddrs_per_block(sb); - befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno); + befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift; search_blk = blockno - indir_start_blk; @@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb, for (i = 0; i < indirect.len; i++) { indirblock = befs_bread(sb, indirblockno + i); if (indirblock == NULL) { - befs_debug(sb, - "---> befs_find_brun_indirect() failed to " - "read disk block %lu from the indirect brun", - indirblockno + i); + befs_debug(sb, "---> %s failed to read " + "disk block %lu from the indirect brun", + __func__, (unsigned long)indirblockno + i); return BEFS_ERR; } @@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb, brelse(indirblock); befs_debug(sb, - "<--- befs_find_brun_indirect() found " - "file block %lu at indirect[%d]", - blockno, j + (i * arraylen)); + "<--- %s found file block " + "%lu at indirect[%d]", __func__, + (unsigned long)blockno, + j + (i * arraylen)); return BEFS_OK; } sum += len; @@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb, } /* Only fallthrough is an error */ - befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find " - "file block %lu", blockno); + befs_error(sb, "BeFS: %s failed to find " + "file block %lu", __func__, (unsigned long)blockno); - befs_debug(sb, "<--- befs_find_brun_indirect() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb, size_t diblklen = iblklen * befs_iaddrs_per_block(sb) * BEFS_DBLINDIR_BRUN_LEN; - befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno); + befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno); /* First, discover which of the double_indir->indir blocks * contains pos. 
Then figure out how much of pos that @@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb, dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb); if (dbl_which_block > data->double_indirect.len) { befs_error(sb, "The double-indirect index calculated by " - "befs_read_brun_dblindirect(), %d, is outside the range " - "of the double-indirect block", dblindir_indx); + "%s, %d, is outside the range " + "of the double-indirect block", __func__, + dblindir_indx); return BEFS_ERR; } @@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb, befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); if (dbl_indir_block == NULL) { - befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " - "double-indirect block at blockno %lu", - iaddr2blockno(sb, - &data->double_indirect) + + befs_error(sb, "%s couldn't read the " + "double-indirect block at blockno %lu", __func__, + (unsigned long) + iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); brelse(dbl_indir_block); return BEFS_ERR; @@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb, which_block = indir_indx / befs_iaddrs_per_block(sb); if (which_block > indir_run.len) { befs_error(sb, "The indirect index calculated by " - "befs_read_brun_dblindirect(), %d, is outside the range " - "of the indirect block", indir_indx); + "%s, %d, is outside the range " + "of the indirect block", __func__, indir_indx); return BEFS_ERR; } indir_block = befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); if (indir_block == NULL) { - befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " - "indirect block at blockno %lu", + befs_error(sb, "%s couldn't read the indirect block " + "at blockno %lu", __func__, (unsigned long) iaddr2blockno(sb, &indir_run) + which_block); brelse(indir_block); return BEFS_ERR; @@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb, run->len -= offset; befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," - " double_indirect_leftover = %lu", + " double_indirect_leftover = %lu", (unsigned long) blockno, dblindir_indx, indir_indx, dblindir_leftover); return BEFS_OK; diff --git a/fs/befs/debug.c b/fs/befs/debug.c index 622e73775c83..4de7cffcd662 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -10,6 +10,7 @@ * debug functions */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #ifdef __KERNEL__ #include @@ -23,43 +24,30 @@ #include "befs.h" -#define ERRBUFSIZE 1024 - void befs_error(const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; - char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); - return; - } va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("(%s): %pV\n", sb->s_id, &vaf); va_end(args); - - printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf); - kfree(err_buf); } void befs_warning(const struct super_block *sb, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; - char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); - return; - } va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(%s): %pV\n", sb->s_id, &vaf); va_end(args); - - printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf); - - kfree(err_buf); } void @@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...) { #ifdef CONFIG_BEFS_DEBUG + struct va_format vaf; va_list args; - char *err_buf = NULL; - - if (BEFS_SB(sb)->mount_opts.debug) { - err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", - ERRBUFSIZE); - return; - } - - va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); - va_end(args); - - printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf); - - kfree(err_buf); - } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s): %pV\n", sb->s_id, &vaf); + va_end(args); #endif //CONFIG_BEFS_DEBUG } @@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); - befs_debug(sb, " create_time %Lu", + befs_debug(sb, " create_time %llu", fs64_to_cpu(sb, inode->create_time)); - befs_debug(sb, " last_modified_time %Lu", + befs_debug(sb, " last_modified_time %llu", fs64_to_cpu(sb, inode->last_modified_time)); tmp_run = fsrun_to_cpu(sb, inode->parent); @@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); } - befs_debug(sb, " max_direct_range %Lu", + befs_debug(sb, " max_direct_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_direct_range)); @@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " max_indirect_range %Lu", + befs_debug(sb, " max_indirect_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_indirect_range)); @@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " max_double_indirect_range %Lu", + befs_debug(sb, " max_double_indirect_range %llu", fs64_to_cpu(sb, inode->data.datastream. 
max_double_indirect_range)); - befs_debug(sb, " size %Lu", + befs_debug(sb, " size %llu", fs64_to_cpu(sb, inode->data.datastream.size)); } @@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); - befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); - befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); + befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks)); + befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks)); befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); befs_debug(sb, " blocks_per_ag %u", @@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " log_blocks %u, %hu, %hu", tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); - befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); + befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start)); + befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end)); befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); diff --git a/fs/befs/inode.c b/fs/befs/inode.c index 94c17f9a9576..fa4b718de597 100644 --- a/fs/befs/inode.c +++ b/fs/befs/inode.c @@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, /* check magic header. */ if (magic1 != BEFS_INODE_MAGIC1) { befs_error(sb, - "Inode has a bad magic header - inode = %lu", inode); + "Inode has a bad magic header - inode = %lu", + (unsigned long)inode); return BEFS_BAD_INODE; } @@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, */ if (inode != iaddr2blockno(sb, &ino_num)) { befs_error(sb, "inode blocknr field disagrees with vfs " - "VFS: %lu, Inode %lu", - inode, iaddr2blockno(sb, &ino_num)); + "VFS: %lu, Inode %lu", (unsigned long) + inode, (unsigned long)iaddr2blockno(sb, &ino_num)); return BEFS_BAD_INODE; } @@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, */ if (!(flags & BEFS_INODE_IN_USE)) { - befs_error(sb, "inode is not used - inode = %lu", inode); + befs_error(sb, "inode is not used - inode = %lu", + (unsigned long)inode); return BEFS_BAD_INODE; } diff --git a/fs/befs/io.c b/fs/befs/io.c index ddef98aa255d..0408a3d601d0 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) befs_blocknr_t block = 0; befs_sb_info *befs_sb = BEFS_SB(sb); - befs_debug(sb, "---> Enter befs_read_iaddr() " - "[%u, %hu, %hu]", - iaddr.allocation_group, iaddr.start, iaddr.len); + befs_debug(sb, "---> Enter %s " + "[%u, %hu, %hu]", __func__, iaddr.allocation_group, + iaddr.start, iaddr.len); if (iaddr.allocation_group > befs_sb->num_ags) { befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", @@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) block = iaddr2blockno(sb, &iaddr); - befs_debug(sb, "befs_read_iaddr: offset = %lu", block); + befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block); bh = sb_bread(sb, block); if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", block); + befs_error(sb, "Failed to read block %lu", + (unsigned long)block); goto error; } - befs_debug(sb, "<--- befs_read_iaddr()"); + befs_debug(sb, "<--- %s", __func__); return bh; error: - befs_debug(sb, "<--- 
befs_read_iaddr() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } @@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block) { struct buffer_head *bh = NULL; - befs_debug(sb, "---> Enter befs_read() %Lu", block); + befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); bh = sb_bread(sb, block); if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", block); + befs_error(sb, "Failed to read block %lu", + (unsigned long)block); goto error; } - befs_debug(sb, "<--- befs_read()"); + befs_debug(sb, "<--- %s", __func__); return bh; error: - befs_debug(sb, "<--- befs_read() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 079872d61f75..5188f1222987 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -5,6 +5,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -130,26 +132,28 @@ befs_get_block(struct inode *inode, sector_t block, ulong disk_off; befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", - inode->i_ino, block); + (unsigned long)inode->i_ino, (long)block); if (block < 0) { befs_error(sb, "befs_get_block() was asked for a block " "number less than zero: block %ld in inode %lu", - block, inode->i_ino); + (long)block, (unsigned long)inode->i_ino); return -EIO; } if (create) { befs_error(sb, "befs_get_block() was asked to write to " - "block %ld in inode %lu", block, inode->i_ino); + "block %ld in inode %lu", (long)block, + (unsigned long)inode->i_ino); return -EPERM; } res = befs_fblock2brun(sb, ds, block, &run); if (res != BEFS_OK) { befs_error(sb, - "<--- befs_get_block() for inode %lu, block " - "%ld ERROR", inode->i_ino, block); + "<--- %s for inode %lu, block %ld ERROR", + __func__, (unsigned long)inode->i_ino, + (long)block); return -EFBIG; } @@ -157,8 +161,9 @@ befs_get_block(struct inode *inode, sector_t block, map_bh(bh_result, inode->i_sb, disk_off); - befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " - "disk address %lu", inode->i_ino, block, disk_off); + befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu", + __func__, (unsigned long)inode->i_ino, (long)block, + (unsigned long)disk_off); return 0; } @@ -175,15 +180,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) char *utfname; const char *name = dentry->d_name.name; - befs_debug(sb, "---> befs_lookup() " - "name %s inode %ld", dentry->d_name.name, dir->i_ino); + befs_debug(sb, "---> %s name %s inode %ld", __func__, + dentry->d_name.name, dir->i_ino); /* Convert to UTF-8 */ if (BEFS_SB(sb)->nls) { ret = befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); if (ret < 0) { - befs_debug(sb, "<--- befs_lookup() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return ERR_PTR(ret); } ret = befs_btree_find(sb, ds, utfname, &offset); @@ -194,12 +199,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } if (ret == BEFS_BT_NOT_FOUND) { - befs_debug(sb, "<--- befs_lookup() %s not found", + befs_debug(sb, "<--- %s %s not found", __func__, dentry->d_name.name); return ERR_PTR(-ENOENT); } else if (ret != BEFS_OK || offset == 0) { - befs_warning(sb, "<--- befs_lookup() Error"); + befs_warning(sb, "<--- %s Error", __func__); return ERR_PTR(-ENODATA); } @@ -209,7 +214,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) d_add(dentry, inode); - befs_debug(sb, "<--- befs_lookup()"); + befs_debug(sb, "<--- %s", __func__); return NULL; 
} @@ -227,26 +232,25 @@ befs_readdir(struct file *file, struct dir_context *ctx) char keybuf[BEFS_NAME_LEN + 1]; const char *dirname = file->f_path.dentry->d_name.name; - befs_debug(sb, "---> befs_readdir() " - "name %s, inode %ld, ctx->pos %Ld", - dirname, inode->i_ino, ctx->pos); + befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld", + __func__, dirname, inode->i_ino, ctx->pos); more: result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, keybuf, &keysize, &value); if (result == BEFS_ERR) { - befs_debug(sb, "<--- befs_readdir() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); befs_error(sb, "IO error reading %s (inode %lu)", dirname, inode->i_ino); return -EIO; } else if (result == BEFS_BT_END) { - befs_debug(sb, "<--- befs_readdir() END"); + befs_debug(sb, "<--- %s END", __func__); return 0; } else if (result == BEFS_BT_EMPTY) { - befs_debug(sb, "<--- befs_readdir() Empty directory"); + befs_debug(sb, "<--- %s Empty directory", __func__); return 0; } @@ -259,7 +263,7 @@ more: result = befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); if (result < 0) { - befs_debug(sb, "<--- befs_readdir() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return result; } if (!dir_emit(ctx, nlsname, nlsnamelen, @@ -276,7 +280,7 @@ more: ctx->pos++; goto more; - befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); + befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos); return 0; } @@ -320,7 +324,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) struct inode *inode; long ret = -EIO; - befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); + befs_debug(sb, "---> %s inode = %lu", __func__, ino); inode = iget_locked(sb, ino); if (!inode) @@ -427,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) } brelse(bh); - befs_debug(sb, "<--- befs_read_inode()"); + befs_debug(sb, "<--- %s", __func__); unlock_new_inode(inode); return inode; @@ -436,7 +440,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) unacquire_none: iget_failed(inode); - befs_debug(sb, "<--- befs_read_inode() - Bad inode"); + befs_debug(sb, "<--- %s - Bad inode", __func__); return ERR_PTR(ret); } @@ -453,11 +457,9 @@ befs_init_inodecache(void) SLAB_MEM_SPREAD), init_once); if (befs_inode_cachep == NULL) { - printk(KERN_ERR "befs_init_inodecache: " - "Couldn't initialize inode slabcache\n"); + pr_err("%s: Couldn't initialize inode slabcache\n", __func__); return -ENOMEM; } - return 0; } @@ -543,16 +545,16 @@ befs_utf2nls(struct super_block *sb, const char *in, */ int maxlen = in_len + 1; - befs_debug(sb, "---> utf2nls()"); + befs_debug(sb, "---> %s", __func__); if (!nls) { - befs_error(sb, "befs_utf2nls called with no NLS table loaded"); + befs_error(sb, "%s called with no NLS table loaded", __func__); return -EINVAL; } *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "befs_utf2nls() cannot allocate memory"); + befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -574,14 +576,14 @@ befs_utf2nls(struct super_block *sb, const char *in, result[o] = '\0'; *out_len = o; - befs_debug(sb, "<--- utf2nls()"); + befs_debug(sb, "<--- %s", __func__); return o; conv_err: befs_error(sb, "Name using character set %s contains a character that " "cannot be converted to unicode.", nls->charset); - befs_debug(sb, "<--- utf2nls()"); + befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; } @@ -622,16 +624,17 @@ befs_nls2utf(struct super_block *sb, const char *in, 
* in special cases */ int maxlen = (3 * in_len) + 1; - befs_debug(sb, "---> nls2utf()\n"); + befs_debug(sb, "---> %s\n", __func__); if (!nls) { - befs_error(sb, "befs_nls2utf called with no NLS table loaded."); + befs_error(sb, "%s called with no NLS table loaded.", + __func__); return -EINVAL; } *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "befs_nls2utf() cannot allocate memory"); + befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -652,14 +655,14 @@ befs_nls2utf(struct super_block *sb, const char *in, result[o] = '\0'; *out_len = o; - befs_debug(sb, "<--- nls2utf()"); + befs_debug(sb, "<--- %s", __func__); return i; conv_err: befs_error(sb, "Name using charecter set %s contains a charecter that " "cannot be converted to unicode.", nls->charset); - befs_debug(sb, "<--- nls2utf()"); + befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; } @@ -714,8 +717,8 @@ parse_options(char *options, befs_mount_options * opts) if (option >= 0) uid = make_kuid(current_user_ns(), option); if (!uid_valid(uid)) { - printk(KERN_ERR "BeFS: Invalid uid %d, " - "using default\n", option); + pr_err("Invalid uid %d, " + "using default\n", option); break; } opts->uid = uid; @@ -728,8 +731,8 @@ parse_options(char *options, befs_mount_options * opts) if (option >= 0) gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) { - printk(KERN_ERR "BeFS: Invalid gid %d, " - "using default\n", option); + pr_err("Invalid gid %d, " + "using default\n", option); break; } opts->gid = gid; @@ -739,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts) kfree(opts->iocharset); opts->iocharset = match_strdup(&args[0]); if (!opts->iocharset) { - printk(KERN_ERR "BeFS: allocation failure for " - "iocharset string\n"); + pr_err("allocation failure for " + "iocharset string\n"); return 0; } break; @@ -748,8 +751,8 @@ parse_options(char *options, befs_mount_options * opts) opts->debug = 1; break; default: - printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " - "or missing value\n", p); + pr_err("Unrecognized mount option \"%s\" " + "or missing value\n", p); return 0; } } @@ -792,8 +795,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); if (sb->s_fs_info == NULL) { - printk(KERN_ERR - "BeFS(%s): Unable to allocate memory for private " + pr_err("(%s): Unable to allocate memory for private " "portion of superblock. 
Bailing.\n", sb->s_id); goto unacquire_none; } @@ -804,7 +806,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) goto unacquire_priv_sbp; } - befs_debug(sb, "---> befs_fill_super()"); + befs_debug(sb, "---> %s", __func__); #ifndef CONFIG_BEFS_RW if (!(sb->s_flags & MS_RDONLY)) { @@ -852,7 +854,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) goto unacquire_priv_sbp; if( befs_sb->num_blocks > ~((sector_t)0) ) { - befs_error(sb, "blocks count: %Lu " + befs_error(sb, "blocks count: %llu " "is larger than the host can use", befs_sb->num_blocks); goto unacquire_priv_sbp; @@ -922,7 +924,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - befs_debug(sb, "---> befs_statfs()"); + befs_debug(sb, "---> %s", __func__); buf->f_type = BEFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -935,7 +937,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = BEFS_NAME_LEN; - befs_debug(sb, "<--- befs_statfs()"); + befs_debug(sb, "<--- %s", __func__); return 0; } @@ -961,7 +963,7 @@ init_befs_fs(void) { int err; - printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); + pr_info("version: %s\n", BEFS_VERSION); err = befs_init_inodecache(); if (err) -- cgit v1.2.3 From 5f356fd4d75fd66e62c803fc7a684863c4b38e59 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:50:24 -0700 Subject: fs/coda/inode.c: add __init to init_inodecache() init_inodecache is only called by __init init_coda Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/coda_int.h | 2 +- fs/coda/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index b7143cf783ac..381c993b1427 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -10,7 +10,7 @@ extern int coda_hard; extern int coda_fake_statfs; void coda_destroy_inodecache(void); -int coda_init_inodecache(void); +int __init coda_init_inodecache(void); int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync); void coda_sysctl_init(void); void coda_sysctl_clean(void); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 62618ec9356c..626abc02b694 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -73,7 +73,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -int coda_init_inodecache(void) +int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", sizeof(struct coda_inode_info), -- cgit v1.2.3 From e21269355eb18aa7be72d5dd92f421da0b50abdb Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 3 Apr 2014 14:50:25 -0700 Subject: nilfs2: update MAINTAINERS file entries Update git repository entry of nilfs2 file system and maintainer's email description. 
Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2f2675493b3a..3bf8bbbbee75 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6181,10 +6181,10 @@ F: include/uapi/linux/nfs* F: include/uapi/linux/sunrpc/ NILFS2 FILESYSTEM -M: KONISHI Ryusuke +M: Ryusuke Konishi L: linux-nilfs@vger.kernel.org W: http://www.nilfs.org/en/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/ryusuke/nilfs2.git +T: git git://github.com/konis/nilfs2.git S: Supported F: Documentation/filesystems/nilfs2.txt F: fs/nilfs2/ -- cgit v1.2.3 From 90ccf7dde9527672553c3a809aa93654c95672d0 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Thu, 3 Apr 2014 14:50:26 -0700 Subject: nilfs2: add struct nilfs_suinfo_update and flags Add the nilfs_suinfo_update structure, which contains the information needed to update one segment usage entry. The flags specify, which fields need to be updated. Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nilfs2_fs.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 98755767c7b0..252657874a19 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -710,6 +710,48 @@ static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) } /* ioctl */ +/** + * nilfs_suinfo_update - segment usage information update + * @sup_segnum: segment number + * @sup_flags: flags for which fields are active in sup_sui + * @sup_reserved: reserved necessary for alignment + * @sup_sui: segment usage information + */ +struct nilfs_suinfo_update { + __u64 sup_segnum; + __u32 sup_flags; + __u32 sup_reserved; + struct nilfs_suinfo sup_sui; +}; + +enum { + NILFS_SUINFO_UPDATE_LASTMOD, + NILFS_SUINFO_UPDATE_NBLOCKS, + NILFS_SUINFO_UPDATE_FLAGS, + __NR_NILFS_SUINFO_UPDATE_FIELDS, +}; + +#define NILFS_SUINFO_UPDATE_FNS(flag, name) \ +static inline void \ +nilfs_suinfo_update_set_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags |= 1UL << NILFS_SUINFO_UPDATE_##flag; \ +} \ +static inline void \ +nilfs_suinfo_update_clear_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags &= ~(1UL << NILFS_SUINFO_UPDATE_##flag); \ +} \ +static inline int \ +nilfs_suinfo_update_##name(const struct nilfs_suinfo_update *sup) \ +{ \ + return !!(sup->sup_flags & (1UL << NILFS_SUINFO_UPDATE_##flag));\ +} + +NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) +NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) +NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) + enum { NILFS_CHECKPOINT, NILFS_SNAPSHOT, -- cgit v1.2.3 From 00e9ffcd27cc5d0af9076383c6242c32335546f8 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Thu, 3 Apr 2014 14:50:27 -0700 Subject: nilfs2: add nilfs_sufile_set_suinfo to update segment usage Introduce nilfs_sufile_set_suinfo(), which expects an array of nilfs_suinfo_update structures and updates the segment usage information accordingly. This is basically a helper function for the newly introduced NILFS_IOCTL_SET_SUINFO ioctl. 
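For illustration, a caller that wants the kernel to apply only a new last-modified time and block count for one segment would fill an entry along these lines. The structure, flag helpers and field names are the ones introduced above; the header path, the wrapper function and its name are assumptions made for this sketch:

    #include <string.h>
    #include <time.h>
    #include <linux/nilfs2_fs.h>   /* struct nilfs_suinfo_update + flag helpers */

    /* hypothetical helper: prepare one update entry for segment @segnum */
    static void fill_suinfo_update(struct nilfs_suinfo_update *sup, __u64 segnum)
    {
            memset(sup, 0, sizeof(*sup));
            sup->sup_segnum = segnum;

            /* only the fields flagged in sup_flags are applied by the kernel */
            nilfs_suinfo_update_set_lastmod(sup);
            nilfs_suinfo_update_set_nblocks(sup);

            sup->sup_sui.sui_lastmod = time(NULL);
            sup->sup_sui.sui_nblocks = 0;   /* e.g. claim no live blocks remain */
    }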
[konishi.ryusuke@lab.ntt.co.jp: use put_bh() instead of brelse() because we know bh != NULL] Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/sufile.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/sufile.h | 1 + 2 files changed, 132 insertions(+) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 3127e9f438a7..5628b99d454e 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -869,6 +869,137 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, return ret; } +/** + * nilfs_sufile_set_suinfo - sets segment usage info + * @sufile: inode of segment usage file + * @buf: array of suinfo_update + * @supsz: byte size of suinfo_update + * @nsup: size of suinfo_update array + * + * Description: Takes an array of nilfs_suinfo_update structs and updates + * segment usage accordingly. Only the fields indicated by the sup_flags + * are updated. + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + */ +ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, + unsigned supsz, size_t nsup) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh, *bh; + struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup; + struct nilfs_segment_usage *su; + void *kaddr; + unsigned long blkoff, prev_blkoff; + int cleansi, cleansu, dirtysi, dirtysu; + long ncleaned = 0, ndirtied = 0; + int ret = 0; + + if (unlikely(nsup == 0)) + return ret; + + for (sup = buf; sup < supend; sup = (void *)sup + supsz) { + if (sup->sup_segnum >= nilfs->ns_nsegments + || (sup->sup_flags & + (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS)) + || (nilfs_suinfo_update_nblocks(sup) && + sup->sup_sui.sui_nblocks > + nilfs->ns_blocks_per_segment)) + return -EINVAL; + } + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + sup = buf; + blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); + ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); + if (ret < 0) + goto out_header; + + for (;;) { + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, sup->sup_segnum, bh, kaddr); + + if (nilfs_suinfo_update_lastmod(sup)) + su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod); + + if (nilfs_suinfo_update_nblocks(sup)) + su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks); + + if (nilfs_suinfo_update_flags(sup)) { + /* + * Active flag is a virtual flag projected by running + * nilfs kernel code - drop it not to write it to + * disk. 
+ */ + sup->sup_sui.sui_flags &= + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + + cleansi = nilfs_suinfo_clean(&sup->sup_sui); + cleansu = nilfs_segment_usage_clean(su); + dirtysi = nilfs_suinfo_dirty(&sup->sup_sui); + dirtysu = nilfs_segment_usage_dirty(su); + + if (cleansi && !cleansu) + ++ncleaned; + else if (!cleansi && cleansu) + --ncleaned; + + if (dirtysi && !dirtysu) + ++ndirtied; + else if (!dirtysi && dirtysu) + --ndirtied; + + su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); + } + + kunmap_atomic(kaddr); + + sup = (void *)sup + supsz; + if (sup >= supend) + break; + + prev_blkoff = blkoff; + blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); + if (blkoff == prev_blkoff) + continue; + + /* get different block */ + mark_buffer_dirty(bh); + put_bh(bh); + ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); + if (unlikely(ret < 0)) + goto out_mark; + } + mark_buffer_dirty(bh); + put_bh(bh); + + out_mark: + if (ncleaned || ndirtied) { + nilfs_sufile_mod_counter(header_bh, (u64)ncleaned, + (u64)ndirtied); + NILFS_SUI(sufile)->ncleansegs += ncleaned; + } + nilfs_mdt_mark_dirty(sufile); + out_header: + put_bh(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + /** * nilfs_sufile_read - read or get sufile inode * @sb: super block instance diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index e84bc5b51fc1..366003c93ec9 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -44,6 +44,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, size_t); +ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t); int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, void (*dofunc)(struct inode *, __u64, -- cgit v1.2.3 From 2cc88f3a5f16ae9a3c8f54de1b2fd4a397b36075 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Thu, 3 Apr 2014 14:50:28 -0700 Subject: nilfs2: implementation of NILFS_IOCTL_SET_SUINFO ioctl With this ioctl the segment usage entries in the SUFILE can be updated from userspace. This is useful, because it allows the userspace GC to modify and update segment usage entries for specific segments, which enables it to avoid unnecessary write operations. If a segment needs to be cleaned, but there is no or very little reclaimable space in it, the cleaning operation basically degrades to a useless moving operation. In the end the only thing that changes is the location of the data and a timestamp in the segment usage information. With this ioctl the GC can skip the cleaning and update the segment usage entries directly instead. This is basically a shortcut to cleaning the segment. It is still necessary to read the segment summary information, but the writing of the live blocks can be skipped if it's not worth it. 
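Building on the entry-filling sketch above, a hypothetical userspace call hands such an array to the kernel wrapped in struct nilfs_argv. The v_base/v_nmembs/v_size fields and the ioctl number are the ones consumed by the handler added here; the wrapper name and the choice of file descriptor (a descriptor on the mounted, writable nilfs2 volume, as nilfs_cleanerd would obtain) are illustrative:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/nilfs2_fs.h>   /* struct nilfs_argv, NILFS_IOCTL_SET_SUINFO */

    /* hypothetical wrapper: requires CAP_SYS_ADMIN and a writable mount */
    static int set_suinfo(int fs_fd, struct nilfs_suinfo_update *sups, size_t nsups)
    {
            struct nilfs_argv argv;

            memset(&argv, 0, sizeof(argv));
            argv.v_base = (unsigned long)sups;                 /* user pointer, as __u64 */
            argv.v_nmembs = nsups;                             /* number of array entries */
            argv.v_size = sizeof(struct nilfs_suinfo_update);  /* size of one entry */

            return ioctl(fs_fd, NILFS_IOCTL_SET_SUINFO, &argv);
    }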
[konishi.ryusuke@lab.ntt.co.jp: add description of NILFS_IOCTL_SET_SUINFO ioctl] Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 7 +++ fs/nilfs2/ioctl.c | 92 ++++++++++++++++++++++++++++++++++++ include/linux/nilfs2_fs.h | 2 + 3 files changed, 101 insertions(+) diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 06887d46ccf2..8b887ae4e39e 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -111,6 +111,13 @@ Table of NILFS2 specific ioctls nilfs_resize utilities and by nilfs_cleanerd daemon. + NILFS_IOCTL_SET_SUINFO Modify segment usage info of requested + segments. This ioctl is used by + nilfs_cleanerd daemon to skip unnecessary + cleaning operation of segments and reduce + performance penalty or wear of flash device + due to redundant move of in-use blocks. + NILFS_IOCTL_GET_SUSTAT Return segment usage statistics. This ioctl is used in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon. diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 2b34021948e4..c19a23158487 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1163,6 +1163,95 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_set_suinfo - set segment usage info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: Expects an array of nilfs_suinfo_update structures + * encapsulated in nilfs_argv and updates the segment usage info + * according to the flags in nilfs_suinfo_update. + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EPERM - Not enough permissions + * + * %-EFAULT - Error copying input data + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ * + * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + */ +static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + struct nilfs_argv argv; + size_t len; + void __user *base; + void *kbuf; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&argv, argp, sizeof(argv))) + goto out; + + ret = -EINVAL; + if (argv.v_size < sizeof(struct nilfs_suinfo_update)) + goto out; + + if (argv.v_nmembs > nilfs->ns_nsegments) + goto out; + + if (argv.v_nmembs >= UINT_MAX / argv.v_size) + goto out; + + len = argv.v_size * argv.v_nmembs; + if (!len) { + ret = 0; + goto out; + } + + base = (void __user *)(unsigned long)argv.v_base; + kbuf = vmalloc(len); + if (!kbuf) { + ret = -ENOMEM; + goto out; + } + + if (copy_from_user(kbuf, base, len)) { + ret = -EFAULT; + goto out_free; + } + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size, + argv.v_nmembs); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + +out_free: + vfree(kbuf); +out: + mnt_drop_write_file(filp); + return ret; +} + long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1189,6 +1278,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return nilfs_ioctl_get_info(inode, filp, cmd, argp, sizeof(struct nilfs_suinfo), nilfs_ioctl_do_get_suinfo); + case NILFS_IOCTL_SET_SUINFO: + return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp); case NILFS_IOCTL_GET_SUSTAT: return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); case NILFS_IOCTL_GET_VINFO: @@ -1228,6 +1319,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_GET_CPINFO: case NILFS_IOCTL_GET_CPSTAT: case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_SET_SUINFO: case NILFS_IOCTL_GET_SUSTAT: case NILFS_IOCTL_GET_VINFO: case NILFS_IOCTL_GET_BDESCS: diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 252657874a19..1fb465f9baf2 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -905,5 +905,7 @@ struct nilfs_bdesc { _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) #define NILFS_IOCTL_SET_ALLOC_RANGE \ _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) +#define NILFS_IOCTL_SET_SUINFO \ + _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) #endif /* _LINUX_NILFS_FS_H */ -- cgit v1.2.3 From 82e11e857be3ffd2a0a952c9db8aa2379e2b9e44 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Thu, 3 Apr 2014 14:50:29 -0700 Subject: nilfs2: add nilfs_sufile_trim_fs to trim clean segs Add nilfs_sufile_trim_fs(), which takes an fstrim_range structure and calls blkdev_issue_discard for every clean segment in the specified range. The range is truncated to file system block boundaries. 
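The byte-to-block conversion it performs can be restated in plain C as below; the function name is made up, and the clamp against the total device size applied by the real code is left out:

    /* mirrors the start_block/end_block computation in nilfs_sufile_trim_fs() */
    static void trim_range_to_blocks(unsigned long long start,
                                     unsigned long long len,
                                     unsigned int blocksize_bits,
                                     unsigned long long *start_block,
                                     unsigned long long *end_block)
    {
            unsigned long long blocksize = 1ULL << blocksize_bits;

            /* first byte of the range, rounded up to the next block boundary */
            *start_block = (start + blocksize - 1) >> blocksize_bits;
            /* length truncated to whole blocks; callers must reject ranges
             * shorter than one block first (the real code returns -EINVAL) */
            *end_block = *start_block + (len >> blocksize_bits) - 1;
    }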
Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/sufile.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/sufile.h | 1 + 2 files changed, 153 insertions(+) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 5628b99d454e..84e384dae663 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1000,6 +1000,158 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, return ret; } +/** + * nilfs_sufile_trim_fs() - trim ioctl handle function + * @sufile: inode of segment usage file + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * + * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes + * from start to start+len. start is rounded up to the next block boundary + * and start+len is rounded down. For each clean segment blkdev_issue_discard + * function is invoked. + * + * Return Value: On success, 0 is returned or negative error code, otherwise. + */ +int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + void *kaddr; + size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; + sector_t seg_start, seg_end, start_block, end_block; + sector_t start = 0, nblocks = 0; + u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0; + int ret = 0; + unsigned int sects_per_block; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + len = range->len >> nilfs->ns_blocksize_bits; + minlen = range->minlen >> nilfs->ns_blocksize_bits; + max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment); + + if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits) + return -EINVAL; + + start_block = (range->start + nilfs->ns_blocksize - 1) >> + nilfs->ns_blocksize_bits; + + /* + * range->len can be very large (actually, it is set to + * ULLONG_MAX by default) - truncate upper end of the range + * carefully so as not to overflow. 
+ */ + if (max_blocks - start_block < len) + end_block = max_blocks - 1; + else + end_block = start_block + len - 1; + + segnum = nilfs_get_segnum_of_block(nilfs, start_block); + segnum_end = nilfs_get_segnum_of_block(nilfs, end_block); + + down_read(&NILFS_MDT(sufile)->mi_sem); + + while (segnum <= segnum_end) { + n = nilfs_sufile_segment_usages_in_block(sufile, segnum, + segnum_end); + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_sem; + /* hole */ + segnum += n; + continue; + } + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, + su_bh, kaddr); + for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { + if (!nilfs_segment_usage_clean(su)) + continue; + + nilfs_get_segment_range(nilfs, segnum, &seg_start, + &seg_end); + + if (!nblocks) { + /* start new extent */ + start = seg_start; + nblocks = seg_end - seg_start + 1; + continue; + } + + if (start + nblocks == seg_start) { + /* add to previous extent */ + nblocks += seg_end - seg_start + 1; + continue; + } + + /* discard previous extent */ + if (start < start_block) { + nblocks -= start_block - start; + start = start_block; + } + + if (nblocks >= minlen) { + kunmap_atomic(kaddr); + + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (ret < 0) { + put_bh(su_bh); + goto out_sem; + } + + ndiscarded += nblocks; + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + } + + /* start new extent */ + start = seg_start; + nblocks = seg_end - seg_start + 1; + } + kunmap_atomic(kaddr); + put_bh(su_bh); + } + + + if (nblocks) { + /* discard last extent */ + if (start < start_block) { + nblocks -= start_block - start; + start = start_block; + } + if (start + nblocks > end_block + 1) + nblocks = end_block - start + 1; + + if (nblocks >= minlen) { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (!ret) + ndiscarded += nblocks; + } + } + +out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + + range->len = ndiscarded << nilfs->ns_blocksize_bits; + return ret; +} + /** * nilfs_sufile_read - read or get sufile inode * @sb: super block instance diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 366003c93ec9..b8afd72f2379 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -66,6 +66,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep); +int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range); /** * nilfs_sufile_scrap - make a segment garbage -- cgit v1.2.3 From f9f32c44e7016c61f8c60afbe461fbc7d5a6c7cc Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Thu, 3 Apr 2014 14:50:30 -0700 Subject: nilfs2: add FITRIM ioctl support for nilfs2 Add support for the FITRIM ioctl, which enables user space tools to issue TRIM/DISCARD requests to the underlying device. Every clean segment within the specified range will be discarded. 
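For reference, exercising the new handler from userspace needs nothing nilfs2-specific; it is the same FITRIM interface that fstrim(8) uses on other file systems. A minimal program, with error handling kept short, looks like this:

    #include <stdio.h>
    #include <string.h>
    #include <limits.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>          /* FITRIM, struct fstrim_range */

    int main(int argc, char **argv)
    {
            struct fstrim_range range;
            int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            memset(&range, 0, sizeof(range));
            range.start = 0;
            range.len = ULLONG_MAX;   /* trim the whole file system */
            range.minlen = 0;         /* raised to discard_granularity by the kernel */

            if (ioctl(fd, FITRIM, &range) < 0) {
                    perror("FITRIM");
                    close(fd);
                    return 1;
            }

            printf("%llu bytes discarded\n", (unsigned long long)range.len);
            close(fd);
            return 0;
    }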
Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/ioctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index c19a23158487..422fb54b7377 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1071,6 +1071,48 @@ out: return ret; } +/** + * nilfs_ioctl_trim_fs() - trim ioctl handle function + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It + * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which + * performs the actual trim operation. + * + * Return Value: On success, 0 is returned or negative error code, otherwise. + */ +static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct request_queue *q = bdev_get_queue(nilfs->ns_bdev); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, argp, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity); + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range); + up_read(&nilfs->ns_segctor_sem); + + if (ret < 0) + return ret; + + if (copy_to_user(argp, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + /** * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated * @inode: inode object @@ -1296,6 +1338,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return nilfs_ioctl_resize(inode, filp, argp); case NILFS_IOCTL_SET_ALLOC_RANGE: return nilfs_ioctl_set_alloc_range(inode, argp); + case FITRIM: + return nilfs_ioctl_trim_fs(inode, argp); default: return -ENOTTY; } @@ -1327,6 +1371,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: + case FITRIM: break; default: return -ENOIOCTLCMD; -- cgit v1.2.3 From 0ec060d1881a24c270fdf0d6616e33e23a209ef2 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 3 Apr 2014 14:50:31 -0700 Subject: nilfs2: verify metadata sizes read from disk Add code to check sizes of on-disk data of metadata files such as inode size, segment usage size, DAT entry size, and checkpoint size. Although these sizes are read from disk, the current implementation doesn't check them. If these sizes are not sane on disk, it can cause out-of-range access to metadata or memory access overrun on metadata block buffers due to overflow in sundry calculations. Both lower limit and upper limit of metadata sizes are verified to prevent these issues. 
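Each of the touched files repeats the same two-sided test; an illustrative condensation of that pattern (the patch itself open-codes the check per metadata file rather than using a helper like this one) is:

    #include <linux/kernel.h>
    #include <linux/errno.h>

    /* hypothetical helper: reject on-disk sizes outside [min_size, blocksize] */
    static int nilfs_check_entry_size(unsigned long blocksize, size_t size,
                                      size_t min_size, const char *name)
    {
            if (size > blocksize) {
                    printk(KERN_ERR "NILFS: too large %s size: %zu bytes.\n",
                           name, size);
                    return -EINVAL;
            }
            if (size < min_size) {
                    printk(KERN_ERR "NILFS: too small %s size: %zu bytes.\n",
                           name, size);
                    return -EINVAL;
            }
            return 0;
    }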
Signed-off-by: Ryusuke Konishi Cc: Andreas Rohner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/cpfile.c | 12 ++++++++++++ fs/nilfs2/dat.c | 12 ++++++++++++ fs/nilfs2/sufile.c | 12 ++++++++++++ fs/nilfs2/the_nilfs.c | 10 ++++++++++ include/linux/nilfs2_fs.h | 8 ++++++++ 5 files changed, 54 insertions(+) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index deaa3d33a0aa..0d58075f34e2 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -942,6 +942,18 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct inode *cpfile; int err; + if (cpsize > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large checkpoint size: %zu bytes.\n", + cpsize); + return -EINVAL; + } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) { + printk(KERN_ERR + "NILFS: too small checkpoint size: %zu bytes.\n", + cpsize); + return -EINVAL; + } + cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); if (unlikely(!cpfile)) return -ENOMEM; diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index fa0f80308c2d..0d5fada91191 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -484,6 +484,18 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_dat_info *di; int err; + if (entry_size > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large DAT entry size: %zu bytes.\n", + entry_size); + return -EINVAL; + } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) { + printk(KERN_ERR + "NILFS: too small DAT entry size: %zu bytes.\n", + entry_size); + return -EINVAL; + } + dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); if (unlikely(!dat)) return -ENOMEM; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 84e384dae663..2a869c35c362 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1169,6 +1169,18 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, void *kaddr; int err; + if (susize > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large segment usage size: %zu bytes.\n", + susize); + return -EINVAL; + } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) { + printk(KERN_ERR + "NILFS: too small segment usage size: %zu bytes.\n", + susize); + return -EINVAL; + } + sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); if (unlikely(!sufile)) return -ENOMEM; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 94c451ce6d24..8ba8229ba076 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -399,6 +399,16 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, return -EINVAL; nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + if (nilfs->ns_inode_size > nilfs->ns_blocksize) { + printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n", + nilfs->ns_inode_size); + return -EINVAL; + } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) { + printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n", + nilfs->ns_inode_size); + return -EINVAL; + } + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 1fb465f9baf2..ff3fea3194c6 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -82,6 +82,8 @@ struct nilfs_inode { __le32 i_pad; }; +#define NILFS_MIN_INODE_SIZE 128 + /** * struct nilfs_super_root - structure of super root * @sr_sum: check sum @@ -482,6 +484,8 @@ struct nilfs_dat_entry { __le64 de_rsv; }; +#define NILFS_MIN_DAT_ENTRY_SIZE 32 + /** * struct nilfs_snapshot_list - snapshot list * @ssl_next: next checkpoint number on snapshot list 
@@ -520,6 +524,8 @@ struct nilfs_checkpoint { struct nilfs_inode cp_ifile_inode; }; +#define NILFS_MIN_CHECKPOINT_SIZE (64 + NILFS_MIN_INODE_SIZE) + /* checkpoint flags */ enum { NILFS_CHECKPOINT_SNAPSHOT, @@ -615,6 +621,8 @@ struct nilfs_segment_usage { __le32 su_flags; }; +#define NILFS_MIN_SEGMENT_USAGE_SIZE 16 + /* segment usage flag */ enum { NILFS_SEGMENT_USAGE_ACTIVE, -- cgit v1.2.3 From f94722653c577546b9b20015ec08aa67c99a999c Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 3 Apr 2014 14:50:32 -0700 Subject: nilfs2: update MAINTAINERS file entries fix Also, web-page entry is updated according to relocation of project's web site. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3bf8bbbbee75..e1af73272fd7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6183,7 +6183,7 @@ F: include/uapi/linux/sunrpc/ NILFS2 FILESYSTEM M: Ryusuke Konishi L: linux-nilfs@vger.kernel.org -W: http://www.nilfs.org/en/ +W: http://nilfs.sourceforge.net/ T: git git://github.com/konis/nilfs2.git S: Supported F: Documentation/filesystems/nilfs2.txt -- cgit v1.2.3 From 7fac376d78da96cc8debdd3f8e817c9136321486 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 3 Apr 2014 14:50:32 -0700 Subject: nilfs2: update project's web site in nilfs2.txt Project's web site was moved to nilfs.sourceforge.net from www.nilfs.org. This updates the site information in Documentation/filesystems/nilfs2.txt with the new location. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 8b887ae4e39e..41c3d332acc9 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -25,9 +25,8 @@ available from the following download page. At least "mkfs.nilfs2", cleaner or garbage collector) are required. Details on the tools are described in the man pages included in the package. -Project web page: http://www.nilfs.org/en/ -Download page: http://www.nilfs.org/en/download.html -Git tree web page: http://www.nilfs.org/git/ +Project web page: http://nilfs.sourceforge.net/ +Download page: http://nilfs.sourceforge.net/en/download.html List info: http://vger.kernel.org/vger-lists.html#linux-nilfs Caveats -- cgit v1.2.3 From abfeb724b43f371ea70ec2c1290ed33e6f65db60 Mon Sep 17 00:00:00 2001 From: Sougata Santra Date: Thu, 3 Apr 2014 14:50:33 -0700 Subject: fs/hfsplus/extents.c: remove unused variable in hfsplus_get_block The variable is defined but not used. Generally it compiles away with -O2 optimization hence it does not show a warning. 
Signed-off-by: Sougata Santra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/extents.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index fbb212fbb1ef..136d860cd746 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -227,10 +227,8 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, u32 ablock, dblock, mask; sector_t sector; int was_dirty = 0; - int shift; /* Convert inode block to disk allocation block */ - shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; ablock = iblock >> sbi->fs_shift; if (iblock >= hip->fs_blocks) { -- cgit v1.2.3 From d7bdb996aef67ea24c62707ca4e29b07025e9683 Mon Sep 17 00:00:00 2001 From: Sougata Santra Date: Thu, 3 Apr 2014 14:50:34 -0700 Subject: fs/hfsplus/extents.c: fix concurrent acess of alloc_blocks Concurrent access to alloc_blocks in hfsplus_inode_info() is protected by extents_lock mutex. This patch fixes two instances where alloc_blocks modification was not protected with this lock. This fixes possible allocation bitmap corruption in race conditions while extending and truncating files. [akpm@linux-foundation.org: take extents_lock before taking a copy of ->alloc_blocks] [akpm@linux-foundation.org: remove now-unused label `out'] Signed-off-by: Sougata Santra Reviewed-by: Christoph Hellwig Cc: Vyacheslav Dubeyko Cc: Alexey Khoroshilov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/extents.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 136d860cd746..a7aafb35b624 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -496,11 +496,13 @@ int hfsplus_file_extend(struct inode *inode) goto insert_extent; } out: - mutex_unlock(&hip->extents_lock); if (!res) { hip->alloc_blocks += len; + mutex_unlock(&hip->extents_lock); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); + return 0; } + mutex_unlock(&hip->extents_lock); return res; insert_extent: @@ -554,11 +556,13 @@ void hfsplus_file_truncate(struct inode *inode) blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> HFSPLUS_SB(sb)->alloc_blksz_shift; + + mutex_lock(&hip->extents_lock); + alloc_cnt = hip->alloc_blocks; if (blk_cnt == alloc_cnt) - goto out; + goto out_unlock; - mutex_lock(&hip->extents_lock); res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); if (res) { mutex_unlock(&hip->extents_lock); @@ -590,10 +594,10 @@ void hfsplus_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - mutex_unlock(&hip->extents_lock); hip->alloc_blocks = blk_cnt; -out: +out_unlock: + mutex_unlock(&hip->extents_lock); hip->phys_size = inode->i_size; hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; -- cgit v1.2.3 From c11e614d712c4f3e89f31a0119111c01163f1508 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:50:35 -0700 Subject: fs/hfsplus/attributes.c: add __init to hfsplus_create_attr_tree_cache() hfsplus_create_attr_tree_cache is only called by __init init_hfsplus_fs Signed-off-by: Fabian Frederick Reviewed-by: Vyacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/attributes.c | 2 +- fs/hfsplus/hfsplus_fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 0f47890299c4..caf89a7be0a1 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -11,7 +11,7 @@ static struct kmem_cache 
*hfsplus_attr_tree_cachep; -int hfsplus_create_attr_tree_cache(void) +int __init hfsplus_create_attr_tree_cache(void) { if (hfsplus_attr_tree_cachep) return -EEXIST; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 62d571eb69ba..83dc29286b10 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -367,7 +367,7 @@ typedef int (*search_strategy_t)(struct hfs_bnode *, */ /* attributes.c */ -int hfsplus_create_attr_tree_cache(void); +int __init hfsplus_create_attr_tree_cache(void); void hfsplus_destroy_attr_tree_cache(void); hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); -- cgit v1.2.3 From ea0856cd503e06d4e86a9171491ab9fcf7a2e741 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:50:36 -0700 Subject: fs/reiserfs: move prototype declaration to header file Move prototype declaration to header file reiserfs/reiserfs.h from reiserfs/super.c because they are used by more than one file. This eliminates the following warning in reiserfs/bitmap.c: fs/reiserfs/bitmap.c:647:6: warning: no previous prototype for `show_alloc_options' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/reiserfs.h | 1 + fs/reiserfs/super.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 8d06adf89948..83d4eac8059a 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -2831,6 +2831,7 @@ void reiserfs_init_alloc_options(struct super_block *s); */ __le32 reiserfs_choose_packing(struct inode *dir); +void show_alloc_options(struct seq_file *seq, struct super_block *s); int reiserfs_init_bitmap_cache(struct super_block *sb); void reiserfs_free_bitmap_cache(struct super_block *sb); void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 2c803353f8ac..16d533597065 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -62,7 +62,6 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) static int reiserfs_remount(struct super_block *s, int *flags, char *data); static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); -void show_alloc_options(struct seq_file *seq, struct super_block *s); static int reiserfs_sync_fs(struct super_block *s, int wait) { -- cgit v1.2.3 From 31e143686a39bff19a72d1806f9b8392c03dca92 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:50:37 -0700 Subject: fs/reiserfs/super.c: add __init to init_inodecache init_inodecache is only called by __init init_reiserfs_fs. 
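The reasoning behind these __init annotations (coda, hfsplus and reiserfs alike) is the same: a function reachable only from __init code can itself live in .init.text and be discarded once initialization is done. A minimal, self-contained illustration, not taken from any of the patches above:

    #include <linux/init.h>
    #include <linux/module.h>

    /* called only from example_init(), so it may also be marked __init
     * and freed together with the rest of .init.text after loading */
    static int __init example_setup_caches(void)
    {
            return 0;
    }

    static int __init example_init(void)
    {
            return example_setup_caches();
    }

    static void __exit example_exit(void)
    {
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");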
Signed-off-by: Fabian Frederick Acked-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 16d533597065..ed54a04c33bd 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -596,7 +596,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", sizeof(struct -- cgit v1.2.3 From 4762c98413836dfc3bff2857647f8d673a86d210 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Thu, 3 Apr 2014 14:50:38 -0700 Subject: Documentation/kmemleak.txt: updates Update Documentatin/kmemleak.txt to reflect the following changes: Commit b69ec42b1b19 ("Kconfig: clean up the long arch list for the DEBUG_KMEMLEAK config option") made it so that we can't check supported architectures by read Kconfig.debug. Commit 85d3a316c71 ("kmemleak: use rbtree instead of prio tree") converted kmemleak to use rbtree instead of prio tree. Signed-off-by: Wang YanQing Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kmemleak.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index 6c18be97f3dd..a7563ec4ea7b 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt @@ -11,9 +11,7 @@ with the difference that the orphan objects are not freed but only reported via /sys/kernel/debug/kmemleak. A similar method is used by the Valgrind tool (memcheck --leak-check) to detect the memory leaks in user-space applications. - -Please check DEBUG_KMEMLEAK dependencies in lib/Kconfig.debug for supported -architectures. +Kmemleak is supported on x86, arm, powerpc, sparc, sh, microblaze, ppc, mips, s390, metag and tile. Usage ----- @@ -69,7 +67,7 @@ Basic Algorithm The memory allocations via kmalloc, vmalloc, kmem_cache_alloc and friends are traced and the pointers, together with additional -information like size and stack trace, are stored in a prio search tree. +information like size and stack trace, are stored in a rbtree. The corresponding freeing function calls are tracked and the pointers removed from the kmemleak data structures. @@ -85,7 +83,7 @@ The scanning algorithm steps: 1. mark all objects as white (remaining white objects will later be considered orphan) 2. scan the memory starting with the data section and stacks, checking - the values against the addresses stored in the prio search tree. If + the values against the addresses stored in the rbtree. If a pointer to a white object is found, the object is added to the gray list 3. scan the gray objects for matching addresses (some white objects -- cgit v1.2.3 From 4adeacdf364640544ceecfc7b184af2eed91f183 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:50:39 -0700 Subject: Documentation/filesystems/ntfs.txt: remove changelog reference File was removed in commit 7c821a179f91 ("Remove fs/ntfs/ChangeLog"). 
Signed-off-by: Fabian Frederick Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/ntfs.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index 791af8dac065..61947facfc07 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt @@ -455,8 +455,6 @@ not have this problem with odd numbers of sectors. ChangeLog ========= -Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. - 2.1.30: - Fix writev() (it kept writing the first segment over and over again instead of moving onto subsequent segments). -- cgit v1.2.3 From 37c703f40dd8b35095f7f8d564bc57afa9a42e5f Mon Sep 17 00:00:00 2001 From: Mitchel Humpherys Date: Thu, 3 Apr 2014 14:50:40 -0700 Subject: Documentation/SubmittingPatches: update some dead URLs The links to "The perfect patch" and "NO!!!! No more huge patch bombs..." have gone stale. Update them to some working locations. Signed-off-by: Mitchel Humpherys Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/SubmittingPatches | 4 ++-- Documentation/ja_JP/SubmittingPatches | 4 ++-- Documentation/zh_CN/SubmittingPatches | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index fdad7d197062..2a8e89e13e45 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -739,7 +739,7 @@ SECTION 3 - REFERENCES ---------------------- Andrew Morton, "The perfect patch" (tpp). - + Jeff Garzik, "Linux kernel patch submission format". @@ -752,7 +752,7 @@ Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer". NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people! - + Kernel Documentation/CodingStyle: diff --git a/Documentation/ja_JP/SubmittingPatches b/Documentation/ja_JP/SubmittingPatches index 97f78dd0c085..ff6cb5729c84 100644 --- a/Documentation/ja_JP/SubmittingPatches +++ b/Documentation/ja_JP/SubmittingPatches @@ -695,7 +695,7 @@ gcc においては、マクロと同じくらい軽いです。 ---------------------- Andrew Morton, "The perfect patch" (tpp). - + Jeff Garzik, "Linux kernel patch submission format". @@ -707,7 +707,7 @@ Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer". NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people! - + Kernel Documentation/CodingStyle: diff --git a/Documentation/zh_CN/SubmittingPatches b/Documentation/zh_CN/SubmittingPatches index be0bd4725062..788ab4730372 100644 --- a/Documentation/zh_CN/SubmittingPatches +++ b/Documentation/zh_CN/SubmittingPatches @@ -394,7 +394,7 @@ Static inline 函数相比宏来说,是好得多的选择。Static inline 函 ---------------- Andrew Morton, "The perfect patch" (tpp). - + Jeff Garzik, "Linux kernel patch submission format". @@ -406,7 +406,7 @@ Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer". NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people! - + Kernel Documentation/CodingStyle: -- cgit v1.2.3 From 8d81e29fae18b93ab51f308b31babe1a6eb04314 Mon Sep 17 00:00:00 2001 From: Mitchel Humpherys Date: Thu, 3 Apr 2014 14:50:40 -0700 Subject: Documentation/SubmittingPatches: remove references to patch-scripts The link to the tarball for Andrew Morton's patch scripts is dead. These scripts don't seem to be used for kernel development these days anyways so just rip out all references to them. 
Signed-off-by: Mitchel Humpherys Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ja_JP/SubmittingPatches | 5 ----- Documentation/zh_CN/SubmittingPatches | 4 ---- 2 files changed, 9 deletions(-) diff --git a/Documentation/ja_JP/SubmittingPatches b/Documentation/ja_JP/SubmittingPatches index ff6cb5729c84..5d6ae639bfa0 100644 --- a/Documentation/ja_JP/SubmittingPatches +++ b/Documentation/ja_JP/SubmittingPatches @@ -98,11 +98,6 @@ dontdiff ファイルには Linux カーネルのビルドプロセスの過程 Quilt: http://savannah.nongnu.org/projects/quilt -Andrew Morton's patch scripts: -http://userweb.kernel.org/~akpm/stuff/patch-scripts.tar.gz -このリンクの先のスクリプトの代わりとして、quilt がパッチマネジメント -ツールとして推奨されています(上のリンクを見てください)。 - 2) パッチに対する説明 パッチの中の変更点に対する技術的な詳細について説明してください。 diff --git a/Documentation/zh_CN/SubmittingPatches b/Documentation/zh_CN/SubmittingPatches index 788ab4730372..1d3a10f8746b 100644 --- a/Documentation/zh_CN/SubmittingPatches +++ b/Documentation/zh_CN/SubmittingPatches @@ -82,10 +82,6 @@ Documentation/SubmittingDrivers 。 Quilt: http://savannah.nongnu.org/projects/quilt -Andrew Morton 的补丁脚本: -http://userweb.kernel.org/~akpm/stuff/patch-scripts.tar.gz -作为这些脚本的替代,quilt 是值得推荐的补丁管理工具(看上面的链接)。 - 2)描述你的改动。 描述你的改动包含的技术细节。 -- cgit v1.2.3