summaryrefslogtreecommitdiff
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c171
1 files changed, 129 insertions, 42 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7fa3cd3fff4d..4127ea027a14 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -267,6 +267,9 @@ struct io_ring_ctx {
#if defined(CONFIG_UNIX)
struct socket *ring_sock;
#endif
+
+ struct list_head task_list;
+ spinlock_t task_lock;
};
struct sqe_submit {
@@ -276,6 +279,7 @@ struct sqe_submit {
bool has_user;
bool needs_lock;
bool needs_fixed_file;
+ u8 opcode;
};
/*
@@ -331,14 +335,18 @@ struct io_kiocb {
#define REQ_F_ISREG 2048 /* regular file */
#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
+#define REQ_F_CANCEL 16384 /* cancel request */
unsigned long fsize;
u64 user_data;
u32 result;
u32 sequence;
+ struct files_struct *files;
struct fs_struct *fs;
struct work_struct work;
+ struct task_struct *work_task;
+ struct list_head task_list;
};
#define IO_PLUG_THRESHOLD 2
@@ -425,6 +433,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cancel_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
+ INIT_LIST_HEAD(&ctx->task_list);
+ spin_lock_init(&ctx->task_lock);
return ctx;
}
@@ -492,10 +502,11 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
static inline void io_queue_async_work(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
+ unsigned long flags;
int rw = 0;
if (req->submit.sqe) {
- switch (req->submit.sqe->opcode) {
+ switch (req->submit.opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
rw = !(req->rw.ki_flags & IOCB_DIRECT);
@@ -503,6 +514,15 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx,
}
}
+ if (req->work.func == io_sq_wq_submit_work) {
+ req->files = current->files;
+
+ spin_lock_irqsave(&ctx->task_lock, flags);
+ list_add(&req->task_list, &ctx->task_list);
+ req->work_task = NULL;
+ spin_unlock_irqrestore(&ctx->task_lock, flags);
+ }
+
queue_work(ctx->sqo_wq[rw], &req->work);
}
@@ -650,6 +670,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
state->cur_req++;
}
+ INIT_LIST_HEAD(&req->task_list);
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
@@ -1237,23 +1258,15 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
}
static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
- const struct sqe_submit *s, struct iovec **iovec,
+ struct io_kiocb *req, struct iovec **iovec,
struct iov_iter *iter)
{
- const struct io_uring_sqe *sqe = s->sqe;
+ const struct io_uring_sqe *sqe = req->submit.sqe;
void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
size_t sqe_len = READ_ONCE(sqe->len);
u8 opcode;
- /*
- * We're reading ->opcode for the second time, but the first read
- * doesn't care whether it's _FIXED or not, so it doesn't matter
- * whether ->opcode changes concurrently. The first read does care
- * about whether it is a READ or a WRITE, so we don't trust this read
- * for that purpose and instead let the caller pass in the read/write
- * flag.
- */
- opcode = READ_ONCE(sqe->opcode);
+ opcode = req->submit.opcode;
if (opcode == IORING_OP_READ_FIXED ||
opcode == IORING_OP_WRITE_FIXED) {
ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
@@ -1261,7 +1274,7 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
return ret;
}
- if (!s->has_user)
+ if (!req->submit.has_user)
return -EFAULT;
#ifdef CONFIG_COMPAT
@@ -1408,7 +1421,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+ ret = io_import_iovec(req->ctx, READ, req, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1423,8 +1436,10 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
if (file->f_op->read_iter)
ret2 = call_read_iter(file, kiocb, &iter);
- else
+ else if (req->file->f_op->read)
ret2 = loop_rw_iter(READ, file, kiocb, &iter);
+ else
+ ret2 = -EINVAL;
/*
* In case of a short read, punt to async. This can happen
@@ -1473,7 +1488,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
if (unlikely(!(file->f_mode & FMODE_WRITE)))
return -EBADF;
- ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+ ret = io_import_iovec(req->ctx, WRITE, req, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1514,8 +1529,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
if (file->f_op->write_iter)
ret2 = call_write_iter(file, kiocb, &iter);
- else
+ else if (req->file->f_op->write)
ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
+ else
+ ret2 = -EINVAL;
if (!force_nonblock)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
@@ -2092,15 +2109,14 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct sqe_submit *s, bool force_nonblock)
{
- int ret, opcode;
+ int ret;
req->user_data = READ_ONCE(s->sqe->user_data);
if (unlikely(s->index >= ctx->sq_entries))
return -EINVAL;
- opcode = READ_ONCE(s->sqe->opcode);
- switch (opcode) {
+ switch (req->submit.opcode) {
case IORING_OP_NOP:
ret = io_nop(req, req->user_data);
break;
@@ -2164,10 +2180,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return 0;
}
-static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
- const struct io_uring_sqe *sqe)
+static struct async_list *io_async_list_from_req(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
- switch (sqe->opcode) {
+ switch (req->submit.opcode) {
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
return &ctx->pending_async[READ];
@@ -2179,12 +2195,10 @@ static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
}
}
-static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+static inline bool io_req_needs_user(struct io_kiocb *req)
{
- u8 opcode = READ_ONCE(sqe->opcode);
-
- return !(opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED);
+ return !(req->submit.opcode == IORING_OP_READ_FIXED ||
+ req->submit.opcode == IORING_OP_WRITE_FIXED);
}
static void io_sq_wq_submit_work(struct work_struct *work)
@@ -2200,7 +2214,9 @@ static void io_sq_wq_submit_work(struct work_struct *work)
int ret;
old_cred = override_creds(ctx->creds);
- async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
+ async_list = io_async_list_from_req(ctx, req);
+
+ allow_kernel_signal(SIGINT);
restart:
do {
struct sqe_submit *s = &req->submit;
@@ -2220,9 +2236,10 @@ restart:
}
ret = 0;
- if (io_sqe_needs_user(sqe) && !cur_mm) {
+ if (io_req_needs_user(req) && !cur_mm) {
if (!mmget_not_zero(ctx->sqo_mm)) {
ret = -EFAULT;
+ goto end_req;
} else {
cur_mm = ctx->sqo_mm;
use_mm(cur_mm);
@@ -2232,6 +2249,18 @@ restart:
}
if (!ret) {
+ req->work_task = current;
+
+ /*
+ * Pairs with the smp_store_mb() (B) in
+ * io_cancel_async_work().
+ */
+ smp_mb(); /* A */
+ if (req->flags & REQ_F_CANCEL) {
+ ret = -ECANCELED;
+ goto end_req;
+ }
+
s->has_user = cur_mm != NULL;
s->needs_lock = true;
do {
@@ -2247,6 +2276,10 @@ restart:
cond_resched();
} while (1);
}
+end_req:
+ spin_lock_irq(&ctx->task_lock);
+ list_del_init(&req->task_list);
+ spin_unlock_irq(&ctx->task_lock);
/* drop submission reference */
io_put_req(req);
@@ -2311,6 +2344,7 @@ restart:
}
out:
+ disallow_signal(SIGINT);
if (cur_mm) {
set_fs(old_fs);
unuse_mm(cur_mm);
@@ -2351,15 +2385,24 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
list_del_init(&req->list);
ret = false;
}
+
+ if (ret) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ req->files = current->files;
+
+ spin_lock_irq(&ctx->task_lock);
+ list_add(&req->task_list, &ctx->task_list);
+ req->work_task = NULL;
+ spin_unlock_irq(&ctx->task_lock);
+ }
spin_unlock(&list->lock);
return ret;
}
-static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+static bool io_op_needs_file(struct io_kiocb *req)
{
- int op = READ_ONCE(sqe->opcode);
-
- switch (op) {
+ switch (req->submit.opcode) {
case IORING_OP_NOP:
case IORING_OP_POLL_REMOVE:
case IORING_OP_TIMEOUT:
@@ -2387,7 +2430,7 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
*/
req->sequence = s->sequence;
- if (!io_op_needs_file(s->sqe))
+ if (!io_op_needs_file(req))
return 0;
if (flags & IOSQE_FIXED_FILE) {
@@ -2428,7 +2471,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
s->sqe = sqe_copy;
memcpy(&req->submit, s, sizeof(*s));
- list = io_async_list_from_sqe(ctx, s->sqe);
+ list = io_async_list_from_req(ctx, req);
if (!io_add_to_prev_work(list, req)) {
if (list)
atomic_inc(&list->cnt);
@@ -2538,6 +2581,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
goto err;
}
+ memcpy(&req->submit, s, sizeof(*s));
ret = io_req_set_file(ctx, s, state, req);
if (unlikely(ret)) {
err_req:
@@ -2550,7 +2594,7 @@ err:
req->user_data = s->sqe->user_data;
#if defined(CONFIG_NET)
- switch (READ_ONCE(s->sqe->opcode)) {
+ switch (req->submit.opcode) {
case IORING_OP_SENDMSG:
case IORING_OP_RECVMSG:
spin_lock(&current->fs->lock);
@@ -2665,6 +2709,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
if (head < ctx->sq_entries) {
s->index = head;
s->sqe = &ctx->sq_sqes[head];
+ s->opcode = READ_ONCE(s->sqe->opcode);
s->sequence = ctx->cached_sq_head;
ctx->cached_sq_head++;
return true;
@@ -3368,6 +3413,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
return SIZE_MAX;
#endif
+ if (sq_offset)
+ *sq_offset = off;
+
sq_array_size = array_size(sizeof(u32), sq_entries);
if (sq_array_size == SIZE_MAX)
return SIZE_MAX;
@@ -3375,9 +3423,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
if (check_add_overflow(off, sq_array_size, &off))
return SIZE_MAX;
- if (sq_offset)
- *sq_offset = off;
-
return off;
}
@@ -3675,12 +3720,41 @@ static int io_uring_fasync(int fd, struct file *file, int on)
return fasync_helper(fd, file, on, &ctx->cq_fasync);
}
+static void io_cancel_async_work(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+{
+ struct io_kiocb *req;
+
+ if (list_empty(&ctx->task_list))
+ return;
+
+ spin_lock_irq(&ctx->task_lock);
+
+ list_for_each_entry(req, &ctx->task_list, task_list) {
+ if (files && req->files != files)
+ continue;
+
+ /*
+ * The below executes an smp_mb(), which pairs with the
+ * smp_mb() (A) in io_sq_wq_submit_work(), so that either the
+ * worker sees the REQ_F_CANCEL flag stored in req->flags or
+ * we see the req->work_task set in io_sq_wq_submit_work().
+ */
+ smp_store_mb(req->flags, req->flags | REQ_F_CANCEL); /* B */
+
+ if (req->work_task)
+ send_sig(SIGINT, req->work_task, 1);
+ }
+ spin_unlock_irq(&ctx->task_lock);
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
mutex_unlock(&ctx->uring_lock);
+ io_cancel_async_work(ctx, NULL);
io_kill_timeouts(ctx);
io_poll_remove_all(ctx);
io_iopoll_reap_events(ctx);
@@ -3688,6 +3762,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_ring_ctx_free(ctx);
}
+static int io_uring_flush(struct file *file, void *data)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+ io_cancel_async_work(ctx, data);
+
+ return 0;
+}
+
static int io_uring_release(struct inode *inode, struct file *file)
{
struct io_ring_ctx *ctx = file->private_data;
@@ -3792,6 +3876,7 @@ out_fput:
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
+ .flush = io_uring_flush,
.mmap = io_uring_mmap,
.poll = io_uring_poll,
.fasync = io_uring_fasync,
@@ -3803,6 +3888,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_rings *rings;
size_t size, sq_array_offset;
+ /* make sure these are sane, as we already accounted them */
+ ctx->sq_entries = p->sq_entries;
+ ctx->cq_entries = p->cq_entries;
+
size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@@ -3819,8 +3908,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
rings->cq_ring_entries = p->cq_entries;
ctx->sq_mask = rings->sq_ring_mask;
ctx->cq_mask = rings->cq_ring_mask;
- ctx->sq_entries = rings->sq_ring_entries;
- ctx->cq_entries = rings->cq_ring_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {