From 4fc828e24cd9c385d3a44e1b499ec7fc70239d8a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 2 May 2014 11:24:15 -0700 Subject: locking/rwsem: Support optimistic spinning We have reached the point where our mutexes are quite fine tuned for a number of situations. This includes the use of heuristics and optimistic spinning, based on MCS locking techniques. Exclusive ownership of read-write semaphores are, conceptually, just about the same as mutexes, making them close cousins. To this end we need to make them both perform similarly, and right now, rwsems are simply not up to it. This was discovered by both reverting commit 4fc3f1d6 (mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable) and similarly, converting some other mutexes (ie: i_mmap_mutex) to rwsems. This creates a situation where users have to choose between a rwsem and mutex taking into account this important performance difference. Specifically, biggest difference between both locks is when we fail to acquire a mutex in the fastpath, optimistic spinning comes in to play and we can avoid a large amount of unnecessary sleeping and overhead of moving tasks in and out of wait queue. Rwsems do not have such logic. This patch, based on the work from Tim Chen and I, adds support for write-side optimistic spinning when the lock is contended. It also includes support for the recently added cancelable MCS locking for adaptive spinning. Note that is is only applicable to the xadd method, and the spinlock rwsem variant remains intact. Allowing optimistic spinning before putting the writer on the wait queue reduces wait queue contention and provided greater chance for the rwsem to get acquired. With these changes, rwsem is on par with mutex. The performance benefits can be seen on a number of workloads. For instance, on a 8 socket, 80 core 64bit Westmere box, aim7 shows the following improvements in throughput: +--------------+---------------------+-----------------+ | Workload | throughput-increase | number of users | +--------------+---------------------+-----------------+ | alltests | 20% | >1000 | | custom | 27%, 60% | 10-100, >1000 | | high_systime | 36%, 30% | >100, >1000 | | shared | 58%, 29% | 10-100, >1000 | +--------------+---------------------+-----------------+ There was also improvement on smaller systems, such as a quad-core x86-64 laptop running a 30Gb PostgreSQL (pgbench) workload for up to +60% in throughput for over 50 clients. Additionally, benefits were also noticed in exim (mail server) workloads. Furthermore, no performance regression have been seen at all. Based-on-work-from: Tim Chen Signed-off-by: Davidlohr Bueso [peterz: rej fixup due to comment patches, sched/rt.h header] Signed-off-by: Peter Zijlstra Cc: Alex Shi Cc: Andi Kleen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Peter Hurley Cc: "Paul E.McKenney" Cc: Jason Low Cc: Aswin Chandramouleeswaran Cc: Andrew Morton Cc: Linus Torvalds Cc: "Scott J Norton" Cc: Andrea Arcangeli Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/1399055055.6275.15.camel@buesod1.americas.hpqcorp.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 225 ++++++++++++++++++++++++++++++++++++++------ kernel/locking/rwsem.c | 31 +++++- 2 files changed, 226 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b4219ff87b8c..4a75278142cd 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -5,11 +5,17 @@ * * Writer lock-stealing by Alex Shi * and Michel Lespinasse + * + * Optimistic spinning by Tim Chen + * and Davidlohr Bueso . Based on mutexes. */ #include #include #include #include +#include + +#include "mcs_spinlock.h" /* * Guide to the rw_semaphore's count field for common values. @@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_SMP + sem->owner = NULL; + sem->osq = NULL; +#endif } EXPORT_SYMBOL(__init_rwsem); @@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) } /* - * wait for the read lock to be granted + * Wait for the read lock to be granted */ __visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) @@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + if (!(count & RWSEM_ACTIVE_MASK)) { + /* try acquiring the write lock */ + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; + } + } + return false; +} + +#ifdef CONFIG_SMP /* - * wait until we successfully acquire the write lock + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long old, count = ACCESS_ONCE(sem->count); + + while (true) { + if (!(count == 0 || count == RWSEM_WAITING_BIAS)) + return false; + + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + if (old == count) + return true; + + count = old; + } +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool on_cpu = true; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = ACCESS_ONCE(sem->owner); + if (owner) + on_cpu = owner->on_cpu; + rcu_read_unlock(); + + /* + * If sem->owner is not set, the rwsem owner may have + * just acquired it and not set the owner yet or the rwsem + * has been released. + */ + return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, + struct task_struct *owner) +{ + if (sem->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * sem->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(sem, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() or when the + * owner changed, which is a sign for heavy contention. Return + * success only when sem->owner is NULL. + */ + return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + while (true) { + owner = ACCESS_ONCE(sem->owner); + if (owner && !rwsem_spin_on_owner(sem, owner)) + break; + + /* wait_lock will be acquired if write_lock is obtained */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock */ __visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count; + bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; - struct task_struct *tsk = current; - /* set up my own style of waitqueue */ - waiter.task = tsk; + /* undo write bias from down_write operation, stop active locking */ + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; + waiting = false; + list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + if (waiting) { + count = ACCESS_ONCE(sem->count); - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + /* + * If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. + */ + if (count > RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + } else + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - + if (rwsem_try_write_lock(count, sem)) + break; raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ do { schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } + __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..42f806de49d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,6 +12,27 @@ #include +#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + /* * lock for reading */ @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_owner(sem); + } + return ret; } @@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_clear_owner(sem); __up_write(sem); } @@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ + rwsem_clear_owner(sem); __downgrade_write(sem); } @@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); -- cgit v1.2.3 From 0cc3d01164aba483edd8232aa5c781136843c367 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 20:19:48 +0200 Subject: locking/rwsem: Fix checkpatch.pl warnings WARNING: line over 80 characters #205: FILE: kernel/locking/rwsem-xadd.c:275: + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); WARNING: line over 80 characters #376: FILE: kernel/locking/rwsem-xadd.c:434: + * If there were already threads queued before us and there are no WARNING: line over 80 characters #377: FILE: kernel/locking/rwsem-xadd.c:435: + * active writers, the lock must be read owned; so we try to wake total: 0 errors, 3 warnings, 417 lines checked Signed-off-by: Andrew Morton Signed-off-by: Peter Zijlstra Cc: "H. Peter Anvin" Cc: Davidlohr Bueso Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-pn6pslaplw031lykweojsn8c@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 4a75278142cd..dacc32142fcc 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,9 +433,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = ACCESS_ONCE(sem->count); /* - * If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. + * If there were already threads queued before us and there are + * no active writers, the lock must be read owned; so we try to + * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); -- cgit v1.2.3 From 70af2f8a4f48d6cebdf92d533d3aef37853ce6de Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 3 Feb 2014 13:18:49 +0100 Subject: locking/rwlocks: Introduce 'qrwlocks' - fair, queued rwlocks This rwlock uses the arch_spin_lock_t as a waitqueue, and assuming the arch_spin_lock_t is a fair lock (ticket,mcs etc..) the resulting rwlock is a fair lock. It fits in the same 8 bytes as the regular rwlock_t by folding the reader and writer count into a single integer, using the remaining 4 bytes for the arch_spinlock_t. Architectures that can single-copy adress bytes can optimize queue_write_unlock() with a 0 write to the LSB (the write count). Performance as measured by Davidlohr Bueso (rwlock_t -> qrwlock_t): +--------------+-------------+---------------+ | Workload | #users | delta | +--------------+-------------+---------------+ | alltests | > 1400 | -4.83% | | custom | 0-100,> 100 | +1.43%,-1.57% | | high_systime | > 1000 | -2.61 | | shared | all | +0.32 | +--------------+-------------+---------------+ http://www.stgolabs.net/qrwlock-stuff/aim7-results-vs-rwsem_optsin/ Signed-off-by: Waiman Long [peterz: near complete rewrite] Signed-off-by: Peter Zijlstra Cc: Arnd Bergmann Cc: Linus Torvalds Cc: "Paul E.McKenney" Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-gac1nnl3wvs2ij87zv2xkdzq@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 7 +++ kernel/locking/Makefile | 1 + kernel/locking/qrwlock.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 kernel/locking/qrwlock.c (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index d2b32ac27a39..35536d9c0964 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -223,3 +223,10 @@ endif config MUTEX_SPIN_ON_OWNER def_bool y depends on SMP && !DEBUG_MUTEXES + +config ARCH_USE_QUEUE_RWLOCK + bool + +config QUEUE_RWLOCK + def_bool y if ARCH_USE_QUEUE_RWLOCK + depends on SMP diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index b8bdcd4785b7..8541bfdfd232 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c new file mode 100644 index 000000000000..fb5b8ac411a5 --- /dev/null +++ b/kernel/locking/qrwlock.c @@ -0,0 +1,133 @@ +/* + * Queue read/write lock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long + */ +#include +#include +#include +#include +#include +#include +#include + +/** + * rspin_until_writer_unlock - inc reader count & spin until writer is gone + * @lock : Pointer to queue rwlock structure + * @writer: Current queue rwlock writer status byte + * + * In interrupt context or at the head of the queue, the reader will just + * increment the reader count & wait until the writer releases the lock. + */ +static __always_inline void +rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) +{ + while ((cnts & _QW_WMASK) == _QW_LOCKED) { + arch_mutex_cpu_relax(); + cnts = smp_load_acquire((u32 *)&lock->cnts); + } +} + +/** + * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * @lock: Pointer to queue rwlock structure + */ +void queue_read_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* + * Readers come here when they cannot get the lock without waiting + */ + if (unlikely(in_interrupt())) { + /* + * Readers in interrupt context will spin until the lock is + * available without waiting in the queue. + */ + cnts = smp_load_acquire((u32 *)&lock->cnts); + rspin_until_writer_unlock(lock, cnts); + return; + } + atomic_sub(_QR_BIAS, &lock->cnts); + + /* + * Put the reader into the wait queue + */ + arch_spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state + * goes to 0 and then try to increment the reader count and get + * the lock. It is possible that an incoming writer may steal the + * lock in the interim, so it is necessary to check the writer byte + * to make sure that the write lock isn't taken. + */ + while (atomic_read(&lock->cnts) & _QW_WMASK) + arch_mutex_cpu_relax(); + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + rspin_until_writer_unlock(lock, cnts); + + /* + * Signal the next one in queue to become queue head + */ + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_read_lock_slowpath); + +/** + * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +void queue_write_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* Put the writer into the wait queue */ + arch_spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present */ + if (!atomic_read(&lock->cnts) && + (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + goto unlock; + + /* + * Set the waiting flag to notify readers that a writer is pending, + * or wait for a previous writer to go away. + */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if (!(cnts & _QW_WMASK) && + (atomic_cmpxchg(&lock->cnts, cnts, + cnts | _QW_WAITING) == cnts)) + break; + + arch_mutex_cpu_relax(); + } + + /* When no more readers, set the locked flag */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if ((cnts == _QW_WAITING) && + (atomic_cmpxchg(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) + break; + + arch_mutex_cpu_relax(); + } +unlock: + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_write_lock_slowpath); -- cgit v1.2.3