* [RFC 1/8] io_uring: modularize io_sqe_buffer_register
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
@ 2020-10-22 23:13 ` Bijan Mottahedeh
2020-10-22 23:13 ` [RFC 2/8] io_uring: modularize io_sqe_buffers_register Bijan Mottahedeh
` (7 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-22 23:13 UTC (permalink / raw)
To: axboe, io-uring
Split io_sqe_buffer_register into two routines:
- io_sqe_buffer_register() registers a single buffer
- io_sqe_buffers_register() iterates over all user-specified buffers
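In outline, the split looks like this (a condensed sketch of the code below,
with validation and cleanup elided, not a drop-in version):

	static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
					  struct io_mapped_ubuf *imu,
					  struct page **last_hpage)
	{
		/* pin the pages backing *iov and record them in *imu */
	}

	static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
					   unsigned int nr_args)
	{
		for (i = 0; i < nr_args; i++) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
						     &last_hpage);
			if (ret)
				break;
			ctx->nr_user_bufs++;
		}
		if (ret)
			io_sqe_buffers_unregister(ctx);
		return ret;
	}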
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 210 ++++++++++++++++++++++++++++++----------------------------
1 file changed, 107 insertions(+), 103 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2356488..69a28d8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7946,7 +7946,7 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
return pages;
}
-static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
int i, j;
@@ -8064,14 +8064,103 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+ struct io_mapped_ubuf *imu,
+ struct page **last_hpage)
{
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
+ unsigned long off, start, end, ubuf;
+ size_t size;
+ int ret, pret, nr_pages, i;
+
+ ubuf = (unsigned long) iov->iov_base;
+ end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ubuf >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ ret = -ENOMEM;
+
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ goto done;
+
+ vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas)
+ goto done;
+
+ imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ if (!imu->bvec)
+ goto done;
+
+ ret = 0;
+ mmap_read_lock(current->mm);
+ pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ pages, vmas);
+ if (pret == nr_pages) {
+ /* don't support file backed memory */
+ for (i = 0; i < nr_pages; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma->vm_file &&
+ !is_file_hugepages(vma->vm_file)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ }
+ } else {
+ ret = pret < 0 ? pret : -EFAULT;
+ }
+ mmap_read_unlock(current->mm);
+ if (ret) {
+ /*
+ * if we did partial map, or found file backed vmas,
+ * release any pages we did get
+ */
+ if (pret > 0)
+ unpin_user_pages(pages, pret);
+ kvfree(imu->bvec);
+ goto done;
+ }
+
+ ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
+ if (ret) {
+ unpin_user_pages(pages, pret);
+ kvfree(imu->bvec);
+ goto done;
+ }
+
+ off = ubuf & ~PAGE_MASK;
+ size = iov->iov_len;
+ for (i = 0; i < nr_pages; i++) {
+ size_t vec_len;
+
+ vec_len = min_t(size_t, size, PAGE_SIZE - off);
+ imu->bvec[i].bv_page = pages[i];
+ imu->bvec[i].bv_len = vec_len;
+ imu->bvec[i].bv_offset = off;
+ off = 0;
+ size -= vec_len;
+ }
+ /* store original address for later verification */
+ imu->ubuf = ubuf;
+ imu->len = iov->iov_len;
+ imu->nr_bvecs = nr_pages;
+ ret = 0;
+done:
+ kvfree(pages);
+ kvfree(vmas);
+ return ret;
+}
+
+static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int nr_args)
+{
+ int i, ret;
+ struct iovec iov;
struct page *last_hpage = NULL;
- int i, j, got_pages = 0;
- int ret = -EINVAL;
if (ctx->user_bufs)
return -EBUSY;
@@ -8085,14 +8174,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
for (i = 0; i < nr_args; i++) {
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
- unsigned long off, start, end, ubuf;
- int pret, nr_pages;
- struct iovec iov;
- size_t size;
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
- goto err;
+ break;
/*
* Don't impose further limits on the size and buffer
@@ -8101,103 +8186,22 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
*/
ret = -EFAULT;
if (!iov.iov_base || !iov.iov_len)
- goto err;
+ break;
/* arbitrary limit, but we need something */
if (iov.iov_len > SZ_1G)
- goto err;
-
- ubuf = (unsigned long) iov.iov_base;
- end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ubuf >> PAGE_SHIFT;
- nr_pages = end - start;
-
- ret = 0;
- if (!pages || nr_pages > got_pages) {
- kvfree(vmas);
- kvfree(pages);
- pages = kvmalloc_array(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- vmas = kvmalloc_array(nr_pages,
- sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!pages || !vmas) {
- ret = -ENOMEM;
- goto err;
- }
- got_pages = nr_pages;
- }
-
- imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
- GFP_KERNEL);
- ret = -ENOMEM;
- if (!imu->bvec)
- goto err;
-
- ret = 0;
- mmap_read_lock(current->mm);
- pret = pin_user_pages(ubuf, nr_pages,
- FOLL_WRITE | FOLL_LONGTERM,
- pages, vmas);
- if (pret == nr_pages) {
- /* don't support file backed memory */
- for (j = 0; j < nr_pages; j++) {
- struct vm_area_struct *vma = vmas[j];
-
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
- ret = -EOPNOTSUPP;
- break;
- }
- }
- } else {
- ret = pret < 0 ? pret : -EFAULT;
- }
- mmap_read_unlock(current->mm);
- if (ret) {
- /*
- * if we did partial map, or found file backed vmas,
- * release any pages we did get
- */
- if (pret > 0)
- unpin_user_pages(pages, pret);
- kvfree(imu->bvec);
- goto err;
- }
-
- ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
- if (ret) {
- unpin_user_pages(pages, pret);
- kvfree(imu->bvec);
- goto err;
- }
+ break;
- off = ubuf & ~PAGE_MASK;
- size = iov.iov_len;
- for (j = 0; j < nr_pages; j++) {
- size_t vec_len;
-
- vec_len = min_t(size_t, size, PAGE_SIZE - off);
- imu->bvec[j].bv_page = pages[j];
- imu->bvec[j].bv_len = vec_len;
- imu->bvec[j].bv_offset = off;
- off = 0;
- size -= vec_len;
- }
- /* store original address for later verification */
- imu->ubuf = ubuf;
- imu->len = iov.iov_len;
- imu->nr_bvecs = nr_pages;
+ ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
+ if (ret)
+ break;
ctx->nr_user_bufs++;
}
- kvfree(pages);
- kvfree(vmas);
- return 0;
-err:
- kvfree(pages);
- kvfree(vmas);
- io_sqe_buffer_unregister(ctx);
+
+ if (ret)
+ io_sqe_buffers_unregister(ctx);
+
return ret;
}
@@ -8251,7 +8255,7 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
- io_sqe_buffer_unregister(ctx);
+ io_sqe_buffers_unregister(ctx);
if (ctx->sqo_task) {
put_task_struct(ctx->sqo_task);
@@ -9581,13 +9585,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
switch (opcode) {
case IORING_REGISTER_BUFFERS:
- ret = io_sqe_buffer_register(ctx, arg, nr_args);
+ ret = io_sqe_buffers_register(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_BUFFERS:
ret = -EINVAL;
if (arg || nr_args)
break;
- ret = io_sqe_buffer_unregister(ctx);
+ ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
ret = io_sqe_files_register(ctx, arg, nr_args);
--
1.8.3.1
* [RFC 2/8] io_uring: modularize io_sqe_buffers_register
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
2020-10-22 23:13 ` [RFC 1/8] io_uring: modularize io_sqe_buffer_register Bijan Mottahedeh
@ 2020-10-22 23:13 ` Bijan Mottahedeh
2020-10-22 23:13 ` [RFC 3/8] io_uring: generalize fixed file functionality Bijan Mottahedeh
` (6 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-22 23:13 UTC (permalink / raw)
To: axboe, io-uring
Move allocation of buffer management structures and validation of buffers
into separate routines.
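With the helpers factored out, the register loop reduces to (condensed
sketch of the code below):

	ret = io_buffers_map_alloc(ctx, nr_args);	/* allocates ctx->user_bufs */
	if (ret)
		return ret;

	for (i = 0; i < nr_args; i++) {
		ret = io_copy_iov(ctx, &iov, arg, i);
		if (ret)
			break;
		ret = io_buffer_validate(&iov);		/* NULL/zero-length, > SZ_1G */
		if (ret)
			break;
		ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
		if (ret)
			break;
		ctx->nr_user_bufs++;
	}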
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 51 ++++++++++++++++++++++++++++++++++-----------------
1 file changed, 34 insertions(+), 17 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 69a28d8..e635da2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8155,13 +8155,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
return ret;
}
-static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
- int i, ret;
- struct iovec iov;
- struct page *last_hpage = NULL;
-
if (ctx->user_bufs)
return -EBUSY;
if (!nr_args || nr_args > UIO_MAXIOV)
@@ -8172,6 +8167,37 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
if (!ctx->user_bufs)
return -ENOMEM;
+ return 0;
+}
+
+static int io_buffer_validate(struct iovec *iov)
+{
+ /*
+ * Don't impose further limits on the size and buffer
+ * constraints here, we'll -EINVAL later when IO is
+ * submitted if they are wrong.
+ */
+ if (!iov->iov_base || !iov->iov_len)
+ return -EFAULT;
+
+ /* arbitrary limit, but we need something */
+ if (iov->iov_len > SZ_1G)
+ return -EFAULT;
+
+ return 0;
+}
+
+static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int nr_args)
+{
+ int i, ret;
+ struct iovec iov;
+ struct page *last_hpage = NULL;
+
+ ret = io_buffers_map_alloc(ctx, nr_args);
+ if (ret)
+ return ret;
+
for (i = 0; i < nr_args; i++) {
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
@@ -8179,17 +8205,8 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
if (ret)
break;
- /*
- * Don't impose further limits on the size and buffer
- * constraints here, we'll -EINVAL later when IO is
- * submitted if they are wrong.
- */
- ret = -EFAULT;
- if (!iov.iov_base || !iov.iov_len)
- break;
-
- /* arbitrary limit, but we need something */
- if (iov.iov_len > SZ_1G)
+ ret = io_buffer_validate(&iov);
+ if (ret)
break;
ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
--
1.8.3.1
* [RFC 3/8] io_uring: generalize fixed file functionality
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
2020-10-22 23:13 ` [RFC 1/8] io_uring: modularize io_sqe_buffer_register Bijan Mottahedeh
2020-10-22 23:13 ` [RFC 2/8] io_uring: modularize io_sqe_buffers_register Bijan Mottahedeh
@ 2020-10-22 23:13 ` Bijan Mottahedeh
2020-10-22 23:13 ` [RFC 4/8] io_uring: implement fixed buffers registration similar to fixed files Bijan Mottahedeh
` (5 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-22 23:13 UTC (permalink / raw)
To: axboe, io-uring
Generalize fixed_file functionality to fixed_rsrc in order to leverage
it for fixed buffers.
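The heart of the change is a resource-agnostic ref node carrying a per-type
put callback, so files and (in a later patch) buffers share the same teardown
machinery; condensed from the diff below:

	struct fixed_rsrc_table {
		union {
			struct file **files;
			struct io_mapped_ubuf *bufs;
		};
	};

	struct fixed_rsrc_ref_node {
		struct percpu_ref refs;
		struct list_head node;
		struct list_head rsrc_list;
		struct fixed_rsrc_data *rsrc_data;
		void (*rsrc_put)(struct io_ring_ctx *ctx,
				 struct io_rsrc_put *prsrc);
		struct llist_node llist;
	};

	/* the file path plugs in its own put routine: */
	ref_node->rsrc_put = io_ring_file_put;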
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 161 +++++++++++++++++++++++++++++++++-------------------------
1 file changed, 92 insertions(+), 69 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e635da2..1996c15 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -195,23 +195,39 @@ struct io_mapped_ubuf {
unsigned long acct_pages;
};
-struct fixed_file_table {
- struct file **files;
+struct io_ring_ctx;
+
+struct io_rsrc_put {
+ struct list_head list;
+ union {
+ void *rsrc;
+ struct file *file;
+ struct io_mapped_ubuf *buf;
+ };
};
-struct fixed_file_ref_node {
+struct fixed_rsrc_table {
+ union {
+ struct file **files;
+ struct io_mapped_ubuf *bufs;
+ };
+};
+
+struct fixed_rsrc_ref_node {
struct percpu_ref refs;
struct list_head node;
- struct list_head file_list;
- struct fixed_file_data *file_data;
+ struct list_head rsrc_list;
+ struct fixed_rsrc_data *rsrc_data;
+ void (*rsrc_put)(struct io_ring_ctx *ctx,
+ struct io_rsrc_put *prsrc);
struct llist_node llist;
};
-struct fixed_file_data {
- struct fixed_file_table *table;
+struct fixed_rsrc_data {
+ struct fixed_rsrc_table *table;
struct io_ring_ctx *ctx;
- struct fixed_file_ref_node *node;
+ struct fixed_rsrc_ref_node *node;
struct percpu_ref refs;
struct completion done;
struct list_head ref_list;
@@ -317,7 +333,7 @@ struct io_ring_ctx {
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct fixed_file_data *file_data;
+ struct fixed_rsrc_data *file_data;
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
@@ -6051,7 +6067,7 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
int index)
{
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
return table->files[index & IORING_FILE_TABLE_MASK];
@@ -6938,18 +6954,18 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
#endif
}
-static void io_file_ref_kill(struct percpu_ref *ref)
+static void io_rsrc_ref_kill(struct percpu_ref *ref)
{
- struct fixed_file_data *data;
+ struct fixed_rsrc_data *data;
- data = container_of(ref, struct fixed_file_data, refs);
+ data = container_of(ref, struct fixed_rsrc_data, refs);
complete(&data->done);
}
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
- struct fixed_file_data *data = ctx->file_data;
- struct fixed_file_ref_node *ref_node = NULL;
+ struct fixed_rsrc_data *data = ctx->file_data;
+ struct fixed_rsrc_ref_node *ref_node = NULL;
unsigned nr_tables, i;
if (!data)
@@ -6958,7 +6974,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
spin_lock(&data->lock);
if (!list_empty(&data->ref_list))
ref_node = list_first_entry(&data->ref_list,
- struct fixed_file_ref_node, node);
+ struct fixed_rsrc_ref_node, node);
spin_unlock(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
@@ -7201,13 +7217,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
-static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
+static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
unsigned nr_tables, unsigned nr_files)
{
int i;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &file_data->table[i];
+ struct fixed_rsrc_table *table = &file_data->table[i];
unsigned this_files;
this_files = min(nr_files, IORING_MAX_FILES_TABLE);
@@ -7222,14 +7238,15 @@ static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
return 0;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &file_data->table[i];
+ struct fixed_rsrc_table *table = &file_data->table[i];
kfree(table->files);
}
return 1;
}
-static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
+ struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
@@ -7290,30 +7307,38 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
#endif
}
-struct io_file_put {
- struct list_head list;
- struct file *file;
-};
-
-static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
+static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
{
- struct fixed_file_data *file_data = ref_node->file_data;
- struct io_ring_ctx *ctx = file_data->ctx;
- struct io_file_put *pfile, *tmp;
+ struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
+ struct io_ring_ctx *ctx = rsrc_data->ctx;
+ struct io_rsrc_put *prsrc, *tmp;
- list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
- list_del(&pfile->list);
- io_ring_file_put(ctx, pfile->file);
- kfree(pfile);
+ list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
+ list_del(&prsrc->list);
+ ref_node->rsrc_put(ctx, prsrc);
+ kfree(prsrc);
}
- spin_lock(&file_data->lock);
+ spin_lock(&rsrc_data->lock);
list_del(&ref_node->node);
- spin_unlock(&file_data->lock);
+ spin_unlock(&rsrc_data->lock);
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
- percpu_ref_put(&file_data->refs);
+ percpu_ref_put(&rsrc_data->refs);
+}
+
+static void io_rsrc_put_work(struct llist_node *node)
+{
+ struct fixed_rsrc_ref_node *ref_node;
+ struct llist_node *next;
+
+ while (node) {
+ next = node->next;
+ ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
+ __io_rsrc_put_work(ref_node);
+ node = next;
+ }
}
static void io_file_put_work(struct work_struct *work)
@@ -7323,26 +7348,18 @@ static void io_file_put_work(struct work_struct *work)
ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
node = llist_del_all(&ctx->file_put_llist);
-
- while (node) {
- struct fixed_file_ref_node *ref_node;
- struct llist_node *next = node->next;
-
- ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
- __io_file_put_work(ref_node);
- node = next;
- }
+ io_rsrc_put_work(node);
}
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_ref_node *ref_node;
+ struct fixed_rsrc_ref_node *ref_node;
struct io_ring_ctx *ctx;
bool first_add;
int delay = HZ;
- ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- ctx = ref_node->file_data->ctx;
+ ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
+ ctx = ref_node->rsrc_data->ctx;
if (percpu_ref_is_dying(&ctx->file_data->refs))
delay = 0;
@@ -7354,10 +7371,10 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
queue_delayed_work(system_wq, &ctx->file_put_work, delay);
}
-static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
struct io_ring_ctx *ctx)
{
- struct fixed_file_ref_node *ref_node;
+ struct fixed_rsrc_ref_node *ref_node;
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
if (!ref_node)
@@ -7369,12 +7386,13 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
return ERR_PTR(-ENOMEM);
}
INIT_LIST_HEAD(&ref_node->node);
- INIT_LIST_HEAD(&ref_node->file_list);
- ref_node->file_data = ctx->file_data;
+ INIT_LIST_HEAD(&ref_node->rsrc_list);
+ ref_node->rsrc_data = ctx->file_data;
+ ref_node->rsrc_put = io_ring_file_put;
return ref_node;
}
-static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+static void destroy_fixed_file_ref_node(struct fixed_rsrc_ref_node *ref_node)
{
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
@@ -7387,8 +7405,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_tables, i;
struct file *file;
int fd, ret = -ENOMEM;
- struct fixed_file_ref_node *ref_node;
- struct fixed_file_data *file_data;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct fixed_rsrc_data *file_data;
if (ctx->file_data)
return -EBUSY;
@@ -7411,7 +7429,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (!file_data->table)
goto out_free;
- if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
+ if (percpu_ref_init(&file_data->refs, io_rsrc_ref_kill,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto out_free;
@@ -7420,7 +7438,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
ctx->file_data = file_data;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
unsigned index;
if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
@@ -7531,28 +7549,33 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static int io_queue_file_removal(struct fixed_file_data *data,
- struct file *file)
+static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
{
- struct io_file_put *pfile;
- struct fixed_file_ref_node *ref_node = data->node;
+ struct io_rsrc_put *prsrc;
+ struct fixed_rsrc_ref_node *ref_node = data->node;
- pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
- if (!pfile)
+ prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
+ if (!prsrc)
return -ENOMEM;
- pfile->file = file;
- list_add(&pfile->list, &ref_node->file_list);
+ prsrc->rsrc = rsrc;
+ list_add(&prsrc->list, &ref_node->rsrc_list);
return 0;
}
+static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
+ struct file *file)
+{
+ return io_queue_rsrc_removal(data, (void *)file);
+}
+
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *up,
unsigned nr_args)
{
- struct fixed_file_data *data = ctx->file_data;
- struct fixed_file_ref_node *ref_node;
+ struct fixed_rsrc_data *data = ctx->file_data;
+ struct fixed_rsrc_ref_node *ref_node;
struct file *file;
__s32 __user *fds;
int fd, i, err;
@@ -7571,7 +7594,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
done = 0;
fds = u64_to_user_ptr(up->fds);
while (nr_args) {
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
unsigned index;
err = 0;
@@ -9021,7 +9044,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
struct file *f;
table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
--
1.8.3.1
* [RFC 4/8] io_uring: implement fixed buffers registration similar to fixed files
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (2 preceding siblings ...)
2020-10-22 23:13 ` [RFC 3/8] io_uring: generalize fixed file functionality Bijan Mottahedeh
@ 2020-10-22 23:13 ` Bijan Mottahedeh
2020-10-22 23:14 ` [RFC 5/8] io_uring: generalize files_update functionality to rsrc_update Bijan Mottahedeh
` (4 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-22 23:13 UTC (permalink / raw)
To: axboe, io-uring
Apply fixed_rsrc functionality for fixed buffers support.
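From userspace this remains IORING_REGISTER_BUFFERS; the visible difference
is that sparse sets (NULL/zero-length iovec entries) are now accepted. A
minimal sketch using the raw syscall interface (ring_fd and the helper name
are illustrative; error handling omitted):

	#include <sys/syscall.h>
	#include <sys/uio.h>
	#include <unistd.h>
	#include <linux/io_uring.h>

	static int register_sparse_bufs(int ring_fd)
	{
		static char buf[65536];
		struct iovec iovs[4] = { 0 };	/* iovs[1..3] stay zeroed: sparse */

		iovs[0].iov_base = buf;
		iovs[0].iov_len  = sizeof(buf);

		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_BUFFERS, iovs, 4);
	}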
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 294 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 258 insertions(+), 36 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1996c15..daf32d3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -104,6 +104,14 @@
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
+/*
+ * Shift of 7 is 128 entries, or exactly one page on 64-bit archs
+ */
+#define IORING_BUF_TABLE_SHIFT 7 /* struct io_mapped_ubuf */
+#define IORING_MAX_BUFS_TABLE (1U << IORING_BUF_TABLE_SHIFT)
+#define IORING_BUF_TABLE_MASK (IORING_MAX_BUFS_TABLE - 1)
+#define IORING_MAX_FIXED_BUFS UIO_MAXIOV
+
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
@@ -337,8 +345,8 @@ struct io_ring_ctx {
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
+ struct fixed_rsrc_data *buf_data;
unsigned nr_user_bufs;
- struct io_mapped_ubuf *user_bufs;
struct user_struct *user;
@@ -400,6 +408,9 @@ struct io_ring_ctx {
struct delayed_work file_put_work;
struct llist_head file_put_llist;
+ struct delayed_work buf_put_work;
+ struct llist_head buf_put_llist;
+
struct work_struct exit_work;
struct io_restriction restrictions;
};
@@ -971,6 +982,7 @@ static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
static void io_file_put_work(struct work_struct *work);
+static void io_buf_put_work(struct work_struct *work);
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
@@ -1179,6 +1191,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
init_llist_head(&ctx->file_put_llist);
+ INIT_DELAYED_WORK(&ctx->buf_put_work, io_buf_put_work);
+ init_llist_head(&ctx->buf_put_llist);
return ctx;
err:
if (ctx->fallback_req)
@@ -2855,6 +2869,15 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
io_rw_done(kiocb, ret);
}
+static inline struct io_mapped_ubuf *io_buf_from_index(struct io_ring_ctx *ctx,
+ int index)
+{
+ struct fixed_rsrc_table *table;
+
+ table = &ctx->buf_data->table[index >> IORING_BUF_TABLE_SHIFT];
+ return &table->bufs[index & IORING_BUF_TABLE_MASK];
+}
+
static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
struct iov_iter *iter)
{
@@ -2865,10 +2888,15 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
size_t offset;
u64 buf_addr;
+ /* attempt to use fixed buffers without having provided iovecs */
+ if (unlikely(!ctx->buf_data))
+ return -EFAULT;
+
+ buf_index = req->buf_index;
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
- imu = &ctx->user_bufs[index];
+ imu = io_buf_from_index(ctx, index);
buf_addr = req->rw.addr;
/* overflow */
@@ -7969,28 +7997,73 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
return pages;
}
-static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
- int i, j;
+ unsigned i;
- if (!ctx->user_bufs)
- return -ENXIO;
+ for (i = 0; i < imu->nr_bvecs; i++)
+ unpin_user_page(imu->bvec[i].bv_page);
- for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ if (imu->acct_pages)
+ io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
+ kvfree(imu->bvec);
+ imu->nr_bvecs = 0;
+}
- for (j = 0; j < imu->nr_bvecs; j++)
- unpin_user_page(imu->bvec[j].bv_page);
+static void io_buffers_unmap(struct io_ring_ctx *ctx)
+{
+ unsigned i;
+ struct io_mapped_ubuf *imu;
- if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
- kvfree(imu->bvec);
- imu->nr_bvecs = 0;
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ imu = io_buf_from_index(ctx, i);
+ io_buffer_unmap(ctx, imu);
}
+}
+
+static void io_buffers_map_free(struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_data *data = ctx->buf_data;
+ unsigned nr_tables, i;
+
+ if (!data)
+ return;
- kfree(ctx->user_bufs);
- ctx->user_bufs = NULL;
+ nr_tables = DIV_ROUND_UP(ctx->nr_user_bufs, IORING_MAX_BUFS_TABLE);
+ for (i = 0; i < nr_tables; i++)
+ kfree(data->table[i].bufs);
+ kfree(data->table);
+ percpu_ref_exit(&data->refs);
+ kfree(data);
+ ctx->buf_data = NULL;
ctx->nr_user_bufs = 0;
+}
+
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_data *data = ctx->buf_data;
+ struct fixed_rsrc_ref_node *ref_node = NULL;
+
+ if (!data)
+ return -ENXIO;
+
+ spin_lock(&data->lock);
+ if (!list_empty(&data->ref_list))
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_rsrc_ref_node, node);
+ spin_unlock(&data->lock);
+ if (ref_node)
+ percpu_ref_kill(&ref_node->refs);
+
+ percpu_ref_kill(&data->refs);
+
+ /* wait for all refs nodes to complete */
+ flush_delayed_work(&ctx->buf_put_work);
+ wait_for_completion(&data->done);
+
+ io_buffers_unmap(ctx);
+ io_buffers_map_free(ctx);
+
return 0;
}
@@ -8043,7 +8116,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
/* check previously registered pages */
for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ struct fixed_rsrc_table *table;
+ struct io_mapped_ubuf *imu;
+ unsigned index;
+
+ table = &ctx->buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
+ index = i & IORING_BUF_TABLE_MASK;
+ imu = &table->bufs[index];
for (j = 0; j < imu->nr_bvecs; j++) {
if (!PageCompound(imu->bvec[j].bv_page))
@@ -8178,19 +8257,82 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
return ret;
}
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
+static void io_free_buf_tables(struct fixed_rsrc_data *buf_data,
+ unsigned nr_tables)
{
- if (ctx->user_bufs)
- return -EBUSY;
- if (!nr_args || nr_args > UIO_MAXIOV)
- return -EINVAL;
+ int i;
- ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
- GFP_KERNEL);
- if (!ctx->user_bufs)
- return -ENOMEM;
+ for (i = 0; i < nr_tables; i++) {
+ struct fixed_rsrc_table *table = &buf_data->table[i];
+ kfree(table->bufs);
+ }
+}
- return 0;
+static int io_alloc_buf_tables(struct fixed_rsrc_data *buf_data,
+ unsigned nr_tables, unsigned nr_bufs)
+{
+ int i;
+
+ for (i = 0; i < nr_tables; i++) {
+ struct fixed_rsrc_table *table = &buf_data->table[i];
+ unsigned this_bufs;
+
+ this_bufs = min(nr_bufs, IORING_MAX_BUFS_TABLE);
+ table->bufs = kcalloc(this_bufs, sizeof(struct io_mapped_ubuf),
+ GFP_KERNEL);
+ if (!table->bufs)
+ break;
+ nr_bufs -= this_bufs;
+ }
+
+ if (i == nr_tables)
+ return 0;
+
+ io_free_buf_tables(buf_data, nr_tables);
+ return 1;
+}
+
+static struct fixed_rsrc_data *io_buffers_map_alloc(struct io_ring_ctx *ctx,
+ unsigned int nr_args)
+{
+ unsigned nr_tables;
+ struct fixed_rsrc_data *buf_data;
+ int ret = -ENOMEM;
+
+ if (ctx->buf_data)
+ return ERR_PTR(-EBUSY);
+ if (!nr_args || nr_args > IORING_MAX_FIXED_BUFS)
+ return ERR_PTR(-EINVAL);
+
+ buf_data = kzalloc(sizeof(*ctx->buf_data), GFP_KERNEL);
+ if (!buf_data)
+ return ERR_PTR(-ENOMEM);
+ buf_data->ctx = ctx;
+ init_completion(&buf_data->done);
+ INIT_LIST_HEAD(&buf_data->ref_list);
+ spin_lock_init(&buf_data->lock);
+
+ nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_BUFS_TABLE);
+ buf_data->table = kcalloc(nr_tables, sizeof(buf_data->table),
+ GFP_KERNEL);
+ if (!buf_data->table)
+ goto out_free;
+
+ if (percpu_ref_init(&buf_data->refs, io_rsrc_ref_kill,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ goto out_free;
+
+ if (io_alloc_buf_tables(buf_data, nr_tables, nr_args))
+ goto out_ref;
+
+ return buf_data;
+
+out_ref:
+ percpu_ref_exit(&buf_data->refs);
+out_free:
+ kfree(buf_data->table);
+ kfree(buf_data);
+ return ERR_PTR(ret);
}
static int io_buffer_validate(struct iovec *iov)
@@ -8210,39 +8352,119 @@ static int io_buffer_validate(struct iovec *iov)
return 0;
}
+static void io_buf_put_work(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx;
+ struct llist_node *node;
+
+ ctx = container_of(work, struct io_ring_ctx, buf_put_work.work);
+ node = llist_del_all(&ctx->buf_put_llist);
+ io_rsrc_put_work(node);
+}
+
+static void io_buf_data_ref_zero(struct percpu_ref *ref)
+{
+ struct fixed_rsrc_ref_node *ref_node;
+ struct io_ring_ctx *ctx;
+ bool first_add;
+ int delay = HZ;
+
+ ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
+ ctx = ref_node->rsrc_data->ctx;
+
+ if (percpu_ref_is_dying(&ctx->buf_data->refs))
+ delay = 0;
+
+ first_add = llist_add(&ref_node->llist, &ctx->buf_put_llist);
+ if (!delay)
+ mod_delayed_work(system_wq, &ctx->buf_put_work, 0);
+ else if (first_add)
+ queue_delayed_work(system_wq, &ctx->buf_put_work, delay);
+}
+
+static void io_ring_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+{
+ io_buffer_unmap(ctx, prsrc->buf);
+}
+
+static struct fixed_rsrc_ref_node *alloc_fixed_buf_ref_node(
+ struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_ref_node *ref_node;
+
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return ERR_PTR(-ENOMEM);
+
+ if (percpu_ref_init(&ref_node->refs, io_buf_data_ref_zero,
+ 0, GFP_KERNEL)) {
+ kfree(ref_node);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ref_node->node);
+ INIT_LIST_HEAD(&ref_node->rsrc_list);
+ ref_node->rsrc_data = ctx->buf_data;
+ ref_node->rsrc_put = io_ring_buf_put;
+ return ref_node;
+}
+
static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args)
{
int i, ret;
struct iovec iov;
struct page *last_hpage = NULL;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct fixed_rsrc_data *buf_data;
- ret = io_buffers_map_alloc(ctx, nr_args);
- if (ret)
- return ret;
+ buf_data = io_buffers_map_alloc(ctx, nr_args);
+ if (IS_ERR(buf_data))
+ return PTR_ERR(buf_data);
- for (i = 0; i < nr_args; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+ struct fixed_rsrc_table *table;
+ struct io_mapped_ubuf *imu;
+ unsigned index;
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
break;
+ /* allow sparse sets */
+ if (!iov.iov_base && !iov.iov_len)
+ continue;
+
ret = io_buffer_validate(&iov);
if (ret)
break;
+ table = &buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
+ index = i & IORING_BUF_TABLE_MASK;
+ imu = &table->bufs[index];
+
ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
if (ret)
break;
+ }
- ctx->nr_user_bufs++;
+ ctx->buf_data = buf_data;
+ if (ret) {
+ io_sqe_buffers_unregister(ctx);
+ return ret;
}
- if (ret)
+ ref_node = alloc_fixed_buf_ref_node(ctx);
+ if (IS_ERR(ref_node)) {
io_sqe_buffers_unregister(ctx);
+ return PTR_ERR(ref_node);
+ }
- return ret;
+ buf_data->node = ref_node;
+ spin_lock(&buf_data->lock);
+ list_add(&ref_node->node, &buf_data->ref_list);
+ spin_unlock(&buf_data->lock);
+ percpu_ref_get(&buf_data->refs);
+ return 0;
}
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
@@ -9056,7 +9278,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
}
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+ struct io_mapped_ubuf *buf = io_buf_from_index(ctx, i);
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
(unsigned int) buf->len);
--
1.8.3.1
* [RFC 5/8] io_uring: generalize files_update functionality to rsrc_update
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (3 preceding siblings ...)
2020-10-22 23:13 ` [RFC 4/8] io_uring: implement fixed buffers registration similar to fixed files Bijan Mottahedeh
@ 2020-10-22 23:14 ` Bijan Mottahedeh
2020-10-22 23:14 ` [RFC 6/8] io_uring: support buffer registration updates Bijan Mottahedeh
` (3 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-22 23:14 UTC (permalink / raw)
To: axboe, io-uring
Generalize files_update functionality to rsrc_update in order to leverage
it for buffer updates.
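The generic helper takes the concrete update routine as a callback, and
io_files_update() becomes a thin wrapper (condensed from the diff below):

	static int io_rsrc_update(struct io_kiocb *req, bool force_nonblock,
				  struct io_comp_state *cs,
				  int (*update)(struct io_ring_ctx *ctx,
						struct io_uring_rsrc_update *up,
						unsigned nr_args));

	static int io_files_update(struct io_kiocb *req, bool force_nonblock,
				   struct io_comp_state *cs)
	{
		return io_rsrc_update(req, force_nonblock, cs,
				      __io_sqe_files_update);
	}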
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 57 ++++++++++++++++++++++++++++---------------
include/uapi/linux/io_uring.h | 10 ++++++++
2 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index daf32d3..f8fcf00 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -510,7 +510,7 @@ struct io_open {
unsigned long nofile;
};
-struct io_files_update {
+struct io_rsrc_update {
struct file *file;
u64 arg;
u32 nr_args;
@@ -685,7 +685,7 @@ struct io_kiocb {
struct io_sr_msg sr_msg;
struct io_open open;
struct io_close close;
- struct io_files_update files_update;
+ struct io_rsrc_update rsrc_update;
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
@@ -975,7 +975,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_files_update *ip,
+ struct io_uring_rsrc_update *ip,
unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
@@ -5680,8 +5680,8 @@ static int io_async_cancel(struct io_kiocb *req)
return 0;
}
-static int io_files_update_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_rsrc_update_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
return -EINVAL;
@@ -5690,29 +5690,32 @@ static int io_files_update_prep(struct io_kiocb *req,
if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
- req->files_update.offset = READ_ONCE(sqe->off);
- req->files_update.nr_args = READ_ONCE(sqe->len);
- if (!req->files_update.nr_args)
+ req->rsrc_update.offset = READ_ONCE(sqe->off);
+ req->rsrc_update.nr_args = READ_ONCE(sqe->len);
+ if (!req->rsrc_update.nr_args)
return -EINVAL;
- req->files_update.arg = READ_ONCE(sqe->addr);
+ req->rsrc_update.arg = READ_ONCE(sqe->addr);
return 0;
}
-static int io_files_update(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_rsrc_update(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs,
+ int (*update)(struct io_ring_ctx *ctx,
+ struct io_uring_rsrc_update *up,
+ unsigned nr_args))
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_files_update up;
+ struct io_uring_rsrc_update up;
int ret;
if (force_nonblock)
return -EAGAIN;
- up.offset = req->files_update.offset;
- up.fds = req->files_update.arg;
+ up.offset = req->rsrc_update.offset;
+ up.rsrc = req->rsrc_update.arg;
mutex_lock(&ctx->uring_lock);
- ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
+ ret = (*update)(ctx, &up, req->rsrc_update.nr_args);
mutex_unlock(&ctx->uring_lock);
if (ret < 0)
@@ -5721,6 +5724,23 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock,
return 0;
}
+static int io_files_update_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return io_rsrc_update_prep(req, sqe);
+}
+
+static int io_files_update(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
+{
+ return io_rsrc_update(req, force_nonblock, cs, __io_sqe_files_update);
+}
+
+static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
+{
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
+}
static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
switch (req->opcode) {
@@ -7422,8 +7442,7 @@ static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
static void destroy_fixed_file_ref_node(struct fixed_rsrc_ref_node *ref_node)
{
- percpu_ref_exit(&ref_node->refs);
- kfree(ref_node);
+ destroy_fixed_rsrc_ref_node(ref_node);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -7599,7 +7618,7 @@ static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_files_update *up,
+ struct io_uring_rsrc_update *up,
unsigned nr_args)
{
struct fixed_rsrc_data *data = ctx->file_data;
@@ -7689,7 +7708,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
- struct io_uring_files_update up;
+ struct io_uring_rsrc_update up;
if (!ctx->file_data)
return -ENXIO;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 98d8e06..16344da 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -282,6 +282,16 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
+struct io_uring_rsrc_update {
+ __u32 offset;
+ __u32 resv;
+ union {
+ __aligned_u64 /* __s32 * */ fds;
+ __aligned_u64 /* __s32 * */ iovs;
+ __aligned_u64 /* __s32 * */ rsrc;
+ };
+};
+
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
--
1.8.3.1
* [RFC 6/8] io_uring: support buffer registration updates
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (4 preceding siblings ...)
2020-10-22 23:14 ` [RFC 5/8] io_uring: generalize files_update functionality to rsrc_update Bijan Mottahedeh
@ 2020-10-22 23:14 ` Bijan Mottahedeh
2020-10-22 23:14 ` [RFC 7/8] io_uring: support readv/writev with fixed buffers Bijan Mottahedeh
` (2 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-22 23:14 UTC (permalink / raw)
To: axboe, io-uring
Introduce IORING_REGISTER_BUFFERS_UPDATE and IORING_OP_BUFFERS_UPDATE,
consistent with file registration update.
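For the registration-based variant, a minimal userspace sketch (raw syscall
interface, updated uapi header assumed; ring_fd and the helper name are
illustrative, error handling omitted):

	#include <string.h>
	#include <sys/syscall.h>
	#include <sys/uio.h>
	#include <unistd.h>
	#include <linux/io_uring.h>

	/* replace the registered buffer at 'offset' with *iov */
	static int update_buf(int ring_fd, unsigned offset, struct iovec *iov)
	{
		struct io_uring_rsrc_update up;

		memset(&up, 0, sizeof(up));
		up.offset = offset;
		up.iovs = (unsigned long) iov;

		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_BUFFERS_UPDATE, &up, 1);
	}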
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 139 +++++++++++++++++++++++++++++++++++++++++-
include/uapi/linux/io_uring.h | 8 +--
2 files changed, 140 insertions(+), 7 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8fcf00..d69d642 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -958,6 +958,9 @@ struct io_op_def {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
+ [IORING_OP_BUFFERS_UPDATE] = {
+ .needs_mm = 1,
+ },
};
enum io_mem_account {
@@ -977,6 +980,9 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update *ip,
unsigned nr_args);
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+ struct io_uring_rsrc_update *up,
+ unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed);
@@ -5741,6 +5747,19 @@ static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
}
+
+static int io_buffers_update_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return io_rsrc_update_prep(req, sqe);
+}
+
+static int io_buffers_update(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
+{
+ return io_rsrc_update(req, force_nonblock, cs, __io_sqe_buffers_update);
+}
+
static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
switch (req->opcode) {
@@ -5806,11 +5825,13 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_remove_buffers_prep(req, sqe);
case IORING_OP_TEE:
return io_tee_prep(req, sqe);
+ case IORING_OP_BUFFERS_UPDATE:
+ return io_buffers_update_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
- return-EINVAL;
+ return -EINVAL;
}
static int io_req_defer_prep(struct io_kiocb *req,
@@ -6049,6 +6070,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
case IORING_OP_TEE:
ret = io_tee(req, force_nonblock);
break;
+ case IORING_OP_BUFFERS_UPDATE:
+ ret = io_buffers_update(req, force_nonblock, cs);
+ break;
default:
ret = -EINVAL;
break;
@@ -8026,6 +8050,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
kvfree(imu->bvec);
+ imu->bvec = NULL;
imu->nr_bvecs = 0;
}
@@ -8243,6 +8268,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
if (pret > 0)
unpin_user_pages(pages, pret);
kvfree(imu->bvec);
+ imu->bvec = NULL;
goto done;
}
@@ -8404,6 +8430,8 @@ static void io_buf_data_ref_zero(struct percpu_ref *ref)
static void io_ring_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
io_buffer_unmap(ctx, prsrc->buf);
+ kvfree(prsrc->buf);
+ prsrc->buf = NULL;
}
static struct fixed_rsrc_ref_node *alloc_fixed_buf_ref_node(
@@ -8486,6 +8514,111 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return 0;
}
+static inline int io_queue_buffer_removal(struct fixed_rsrc_data *data,
+ struct io_mapped_ubuf *imu)
+{
+ return io_queue_rsrc_removal(data, (void *)imu);
+}
+
+static void destroy_fixed_buf_ref_node(struct fixed_rsrc_ref_node *ref_node)
+{
+ destroy_fixed_rsrc_ref_node(ref_node);
+}
+
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+ struct io_uring_rsrc_update *up,
+ unsigned nr_args)
+{
+ struct fixed_rsrc_data *data = ctx->buf_data;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct io_mapped_ubuf *imu;
+ struct iovec iov;
+ struct iovec __user *iovs;
+ struct page *last_hpage = NULL;
+ __u32 done;
+ int i, err;
+ bool needs_switch = false;
+
+ if (check_add_overflow(up->offset, nr_args, &done))
+ return -EOVERFLOW;
+ if (done > ctx->nr_user_bufs)
+ return -EINVAL;
+
+ ref_node = alloc_fixed_buf_ref_node(ctx);
+ if (IS_ERR(ref_node))
+ return PTR_ERR(ref_node);
+
+ done = 0;
+ iovs = u64_to_user_ptr(up->iovs);
+ while (nr_args) {
+ struct fixed_rsrc_table *table;
+ unsigned index;
+
+ err = 0;
+ if (copy_from_user(&iov, &iovs[done], sizeof(iov))) {
+ err = -EFAULT;
+ break;
+ }
+ i = array_index_nospec(up->offset, ctx->nr_user_bufs);
+ table = &ctx->buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
+ index = i & IORING_BUF_TABLE_MASK;
+ imu = &table->bufs[index];
+ if (table->bufs[index].ubuf) {
+ struct io_mapped_ubuf *dup;
+ dup = kmemdup(imu, sizeof(*imu), GFP_KERNEL);
+ if (!dup) {
+ err = -ENOMEM;
+ break;
+ }
+ err = io_queue_buffer_removal(data, dup);
+ if (err)
+ break;
+ memset(imu, 0, sizeof(*imu));
+ needs_switch = true;
+ }
+ if (!io_buffer_validate(&iov)) {
+ err = io_sqe_buffer_register(ctx, &iov, imu,
+ &last_hpage);
+ if (err) {
+ memset(imu, 0, sizeof(*imu));
+ break;
+ }
+ }
+ nr_args--;
+ done++;
+ up->offset++;
+ }
+
+ if (needs_switch) {
+ percpu_ref_kill(&data->node->refs);
+ spin_lock(&data->lock);
+ list_add(&ref_node->node, &data->ref_list);
+ data->node = ref_node;
+ spin_unlock(&data->lock);
+ percpu_ref_get(&ctx->buf_data->refs);
+ } else
+ destroy_fixed_buf_ref_node(ref_node);
+
+ return done ? done : err;
+}
+
+static int io_sqe_buffers_update(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update up;
+
+ if (!ctx->buf_data)
+ return -ENXIO;
+ if (!nr_args)
+ return -EINVAL;
+ if (copy_from_user(&up, arg, sizeof(up)))
+ return -EFAULT;
+ if (up.resv)
+ return -EINVAL;
+
+ return __io_sqe_buffers_update(ctx, &up, nr_args);
+}
+
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
__s32 __user *fds = arg;
@@ -9799,6 +9932,7 @@ static bool io_register_op_must_quiesce(int op)
switch (op) {
case IORING_UNREGISTER_FILES:
case IORING_REGISTER_FILES_UPDATE:
+ case IORING_REGISTER_BUFFERS_UPDATE:
case IORING_REGISTER_PROBE:
case IORING_REGISTER_PERSONALITY:
case IORING_UNREGISTER_PERSONALITY:
@@ -9874,6 +10008,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_sqe_buffers_unregister(ctx);
break;
+ case IORING_REGISTER_BUFFERS_UPDATE:
+ ret = io_sqe_buffers_update(ctx, arg, nr_args);
+ break;
case IORING_REGISTER_FILES:
ret = io_sqe_files_register(ctx, arg, nr_args);
break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 16344da..85de6b8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -132,6 +132,7 @@ enum {
IORING_OP_PROVIDE_BUFFERS,
IORING_OP_REMOVE_BUFFERS,
IORING_OP_TEE,
+ IORING_OP_BUFFERS_UPDATE,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -271,17 +272,12 @@ enum {
IORING_UNREGISTER_PERSONALITY = 10,
IORING_REGISTER_RESTRICTIONS = 11,
IORING_REGISTER_ENABLE_RINGS = 12,
+ IORING_REGISTER_BUFFERS_UPDATE = 13,
/* this goes last */
IORING_REGISTER_LAST
};
-struct io_uring_files_update {
- __u32 offset;
- __u32 resv;
- __aligned_u64 /* __s32 * */ fds;
-};
-
struct io_uring_rsrc_update {
__u32 offset;
__u32 resv;
--
1.8.3.1
* [RFC 7/8] io_uring: support readv/writev with fixed buffers
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (5 preceding siblings ...)
2020-10-22 23:14 ` [RFC 6/8] io_uring: support buffer registration updates Bijan Mottahedeh
@ 2020-10-22 23:14 ` Bijan Mottahedeh
2020-10-22 23:14 ` [RFC 8/8] io_uring: support buffer registration sharing Bijan Mottahedeh
2020-10-30 17:11 ` [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-22 23:14 UTC (permalink / raw)
To: axboe, io-uring
Support readv/writev with fixed buffers, and introduce IOSQE_FIXED_BUFFER,
consistent with fixed files.
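With IOSQE_FIXED_BUFFER set, each iovec in the array selects a registered
buffer by index: iov_base carries the buffer index rather than a pointer,
and iov_len the length to use. A schematic SQE setup (sqe and file_fd are
placeholders, updated uapi header assumed):

	struct iovec iovs[2];

	iovs[0].iov_base = (void *)(unsigned long) 0;	/* registered buffer 0 */
	iovs[0].iov_len  = 4096;
	iovs[1].iov_base = (void *)(unsigned long) 1;	/* registered buffer 1 */
	iovs[1].iov_len  = 4096;

	sqe->opcode = IORING_OP_READV;
	sqe->fd     = file_fd;
	sqe->addr   = (unsigned long) iovs;
	sqe->len    = 2;
	sqe->flags |= IOSQE_FIXED_BUFFER;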
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 61 +++++++++++++++++++++++++++++++++++++++----
include/uapi/linux/io_uring.h | 3 +++
2 files changed, 59 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d69d642..a8d8871 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -599,6 +599,7 @@ enum {
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_FIXED_BUFFER_BIT = IOSQE_FIXED_BUFFER_BIT,
REQ_F_LINK_HEAD_BIT,
REQ_F_FAIL_LINK_BIT,
@@ -658,8 +659,12 @@ enum {
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* linked timeout is active, i.e. prepared by link's head */
REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
+ /* ctx owns buffer */
+ REQ_F_FIXED_BUFFER = BIT(REQ_F_FIXED_BUFFER_BIT),
};
+#define REQ_F_FIXED_RSRC (REQ_F_FIXED_FILE | REQ_F_FIXED_BUFFER)
+
struct async_poll {
struct io_poll_iocb poll;
struct io_poll_iocb *double_poll;
@@ -959,7 +964,7 @@ struct io_op_def {
.unbound_nonreg_file = 1,
},
[IORING_OP_BUFFERS_UPDATE] = {
- .needs_mm = 1,
+ .work_flags = IO_WQ_WORK_MM,
},
};
@@ -3097,6 +3102,46 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
+static ssize_t io_import_iovec_fixed(int rw, struct io_kiocb *req, void *buf,
+ unsigned segs, unsigned fast_segs,
+ struct iovec **iovec,
+ struct iov_iter *iter)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_mapped_ubuf *imu;
+ struct iovec *iov;
+ u16 index, buf_index;
+ ssize_t ret;
+ unsigned long seg;
+
+ if (unlikely(!ctx->buf_data))
+ return -EFAULT;
+
+ ret = import_iovec(rw, buf, segs, fast_segs, iovec, iter);
+ if (ret < 0)
+ return ret;
+
+ iov = (struct iovec *)iter->iov;
+
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ buf_index = *(u16 *)(&iov[seg].iov_base);
+ if (unlikely(buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+
+ index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+ imu = io_buf_from_index(ctx, index);
+ if (!imu->ubuf || !imu->len)
+ return -EFAULT;
+ if (iov[seg].iov_len > imu->len)
+ return -EFAULT;
+
+ iov[seg].iov_base = (void *)imu->ubuf;
+ ret += iov[seg].iov_len;
+ }
+
+ return ret;
+}
+
static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
@@ -3107,6 +3152,12 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
u8 opcode;
opcode = req->opcode;
+
+ if ((opcode == IORING_OP_READV || opcode == IORING_OP_WRITEV) &&
+ req->flags & REQ_F_FIXED_BUFFER)
+ return (io_import_iovec_fixed(rw, req, buf, sqe_len,
+ UIO_FASTIOV, iovec, iter));
+
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
return io_import_fixed(req, rw, iter);
@@ -5494,7 +5545,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
return -EINVAL;
@@ -5669,7 +5720,7 @@ static int io_async_cancel_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
return -EINVAL;
@@ -5691,7 +5742,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
@@ -6546,7 +6597,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
+ IOSQE_BUFFER_SELECT | IOSQE_FIXED_BUFFER)
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 85de6b8..8f92555 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -68,6 +68,7 @@ enum {
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
IOSQE_BUFFER_SELECT_BIT,
+ IOSQE_FIXED_BUFFER_BIT,
};
/*
@@ -85,6 +86,8 @@ enum {
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
+/* use fixed buffer set */
+#define IOSQE_FIXED_BUFFER (1U << IOSQE_FIXED_BUFFER_BIT)
/*
* io_uring_setup() flags
--
1.8.3.1
* [RFC 8/8] io_uring: support buffer registration sharing
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (6 preceding siblings ...)
2020-10-22 23:14 ` [RFC 7/8] io_uring: support readv/writev with fixed buffers Bijan Mottahedeh
@ 2020-10-22 23:14 ` Bijan Mottahedeh
2020-10-30 17:11 ` [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-22 23:14 UTC (permalink / raw)
To: axboe, io-uring
Implement buffer sharing among multiple rings.
A ring shares its (future) buffer registrations at setup time with
IORING_SETUP_SHARE_BUF. A ring attaches to another ring's buffer
registration at setup time with IORING_SETUP_ATTACH_BUF, after
authenticating with the buffer registration owner's fd. Any updates to
the owner's buffer registrations become immediately available to the
attached rings.
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 140 +++++++++++++++++++++++++++++++++++-------
include/uapi/linux/io_uring.h | 2 +
2 files changed, 119 insertions(+), 23 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a8d8871..050f18e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8134,6 +8134,12 @@ static void io_buffers_map_free(struct io_ring_ctx *ctx)
ctx->nr_user_bufs = 0;
}
+static void io_detach_buf_data(struct io_ring_ctx *ctx)
+{
+ percpu_ref_put(&ctx->buf_data->refs);
+ ctx->buf_data = NULL;
+}
+
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
struct fixed_rsrc_data *data = ctx->buf_data;
@@ -8142,6 +8148,12 @@ static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
if (!data)
return -ENXIO;
+ if (ctx->flags & IORING_SETUP_ATTACH_BUF) {
+ io_detach_buf_data(ctx);
+ ctx->nr_user_bufs = 0;
+ return 0;
+ }
+
spin_lock(&data->lock);
if (!list_empty(&data->ref_list))
ref_node = list_first_entry(&data->ref_list,
@@ -8388,46 +8400,67 @@ static int io_alloc_buf_tables(struct fixed_rsrc_data *buf_data,
return 1;
}
+static struct fixed_rsrc_data *io_alloc_buf_data(struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_data *buf_data;
+
+ buf_data = kzalloc(sizeof(*buf_data), GFP_KERNEL);
+ if (!buf_data)
+ return ERR_PTR(-ENOMEM);
+
+ buf_data->ctx = ctx;
+ init_completion(&buf_data->done);
+ INIT_LIST_HEAD(&buf_data->ref_list);
+ spin_lock_init(&buf_data->lock);
+
+ if (percpu_ref_init(&buf_data->refs, io_rsrc_ref_kill,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ kfree(buf_data);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return buf_data;
+}
+
+static void io_free_buf_data(struct io_ring_ctx *ctx)
+{
+ percpu_ref_exit(&ctx->buf_data->refs);
+ kfree(ctx->buf_data->table);
+ kfree(ctx->buf_data);
+}
+
static struct fixed_rsrc_data *io_buffers_map_alloc(struct io_ring_ctx *ctx,
unsigned int nr_args)
{
- unsigned nr_tables;
struct fixed_rsrc_data *buf_data;
+ unsigned nr_tables;
int ret = -ENOMEM;
- if (ctx->buf_data)
+ if (ctx->nr_user_bufs)
return ERR_PTR(-EBUSY);
if (!nr_args || nr_args > IORING_MAX_FIXED_BUFS)
return ERR_PTR(-EINVAL);
- buf_data = kzalloc(sizeof(*ctx->buf_data), GFP_KERNEL);
- if (!buf_data)
- return ERR_PTR(-ENOMEM);
- buf_data->ctx = ctx;
- init_completion(&buf_data->done);
- INIT_LIST_HEAD(&buf_data->ref_list);
- spin_lock_init(&buf_data->lock);
+ if (ctx->buf_data)
+ buf_data = ctx->buf_data;
+ else {
+ buf_data = io_alloc_buf_data(ctx);
+ if (IS_ERR(buf_data))
+ return buf_data;
+ }
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_BUFS_TABLE);
- buf_data->table = kcalloc(nr_tables, sizeof(buf_data->table),
+ buf_data->table = kcalloc(nr_tables, sizeof(struct fixed_rsrc_table),
GFP_KERNEL);
if (!buf_data->table)
- goto out_free;
-
- if (percpu_ref_init(&buf_data->refs, io_rsrc_ref_kill,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
- goto out_free;
+ goto out;
if (io_alloc_buf_tables(buf_data, nr_tables, nr_args))
- goto out_ref;
+ goto out;
return buf_data;
-
-out_ref:
- percpu_ref_exit(&buf_data->refs);
-out_free:
- kfree(buf_data->table);
- kfree(buf_data);
+out:
+ io_free_buf_data(ctx);
return ERR_PTR(ret);
}
@@ -8515,9 +8548,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
struct fixed_rsrc_ref_node *ref_node;
struct fixed_rsrc_data *buf_data;
+ if (ctx->flags & IORING_SETUP_ATTACH_BUF) {
+ if (!ctx->buf_data)
+ return -EFAULT;
+ ctx->nr_user_bufs = ctx->buf_data->ctx->nr_user_bufs;
+ return 0;
+ }
+
buf_data = io_buffers_map_alloc(ctx, nr_args);
if (IS_ERR(buf_data))
return PTR_ERR(buf_data);
+ ctx->buf_data = buf_data;
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
struct fixed_rsrc_table *table;
@@ -8545,7 +8586,6 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
break;
}
- ctx->buf_data = buf_data;
if (ret) {
io_sqe_buffers_unregister(ctx);
return ret;
@@ -9623,6 +9663,55 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
return ret;
}
+static int io_attach_buf_data(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ struct io_ring_ctx *ctx_attach;
+ struct fd f;
+
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return -EBADF;
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
+ return -EINVAL;
+ }
+
+ ctx_attach = f.file->private_data;
+ if (!ctx_attach->buf_data) {
+ fdput(f);
+ return -EINVAL;
+ }
+ ctx->buf_data = ctx_attach->buf_data;
+
+ percpu_ref_get(&ctx->buf_data->refs);
+ fdput(f);
+ return 0;
+}
+
+static int io_init_buf_data(struct io_ring_ctx *ctx, struct io_uring_params *p)
+{
+ if ((p->flags & (IORING_SETUP_SHARE_BUF | IORING_SETUP_ATTACH_BUF)) ==
+ (IORING_SETUP_SHARE_BUF | IORING_SETUP_ATTACH_BUF))
+ return -EINVAL;
+
+ if (p->flags & IORING_SETUP_SHARE_BUF) {
+ struct fixed_rsrc_data *buf_data;
+
+ buf_data = io_alloc_buf_data(ctx);
+ if (IS_ERR(buf_data))
+ return PTR_ERR(buf_data);
+
+ ctx->buf_data = buf_data;
+ return 0;
+ }
+
+ if (p->flags & IORING_SETUP_ATTACH_BUF)
+ return io_attach_buf_data(ctx, p);
+
+ return 0;
+}
+
static int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params __user *params)
{
@@ -9737,6 +9826,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
+ ret = io_init_buf_data(ctx, p);
+ if (ret)
+ goto err;
+
ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
@@ -9807,6 +9900,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
+ IORING_SETUP_SHARE_BUF | IORING_SETUP_ATTACH_BUF |
IORING_SETUP_R_DISABLED))
return -EINVAL;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8f92555..1079048 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -99,6 +99,8 @@ enum {
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
+#define IORING_SETUP_SHARE_BUF (1U << 7) /* share buffer registration */
+#define IORING_SETUP_ATTACH_BUF (1U << 8) /* attach buffer registration */
enum {
IORING_OP_NOP,
--
1.8.3.1
* Re: [RFC 0/8] io_uring: buffer registration enhancements
2020-10-22 23:13 [RFC 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (7 preceding siblings ...)
2020-10-22 23:14 ` [RFC 8/8] io_uring: support buffer registration sharing Bijan Mottahedeh
@ 2020-10-30 17:11 ` Bijan Mottahedeh
8 siblings, 0 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-10-30 17:11 UTC (permalink / raw)
To: axboe, io-uring
A gentle ping.
On 10/22/2020 4:13 PM, Bijan Mottahedeh wrote:
> Hi Jens,
>
> This RFC implements a set of enhancements to buffer registration consistent
> with existing file registration functionality:
>
> - buffer registration updates IORING_REGISTER_BUFFERS_UPDATE
> IORING_OP_BUFFERS_UPDATE
>
> - readv/writev with fixed buffers IOSQE_FIXED_BUFFER
>
> - buffer registration sharing IORING_SETUP_SHARE_BUF
> IORING_SETUP_ATTACH_BUF
>
> Patches 1,2 modularize existing buffer registration code.
>
> Patch 3 generalizes fixed_file functionality to fixed_rsrc.
>
> Patch 4 applies fixed_rsrc functionality for fixed buffers support.
>
> Patch 5 generalizes files_update functionality to rsrc_update.
>
> Patch 6 implements buffer registration update, and introduces
> IORING_REGISTER_BUFFERS_UPDATE and IORING_OP_BUFFERS_UPDATE, consistent
> with file registration update.
>
> Patch 7 implements readv/writev support with fixed buffers, and introduces
> IOSQE_FIXED_BUFFER, consistent with fixed files.
>
> Patch 8 implements buffer sharing among multiple rings; it works as
> follows based on previous conversations:
>
> - A new ring, A, is set up. Since no buffers have been registered, the
> registered buffer state is an empty set, Z. That's different from the
> NULL state in the current implementation.
>
> - Ring B is set up, attaching to Ring A. It also attaches to Ring A's
> buffer registrations; now we have two references to the same empty
> set, Z.
>
> - Ring A registers buffers into set Z, which is no longer empty.
>
> - Ring B sees this immediately, since it's already sharing that set.
>
> TBD
>
> - I think I have to add IORING_UNREGISTER_BUFFERS to
> io_register_op_must_quiesce() but wanted to confirm.
>
> - IORING_OP_SHUTDOWN has been removed but is still in liburing; not sure why.
> I wanted to verify before removing that functionality.
>
> I have used liburing file-{register,update} tests as models for
> buffer-{register,update}, and it seems to work ok.
>
> Bijan Mottahedeh (8):
> io_uring: modularize io_sqe_buffer_register
> io_uring: modularize io_sqe_buffers_register
> io_uring: generalize fixed file functionality
> io_uring: implement fixed buffers registration similar to fixed files
> io_uring: generalize files_update functionality to rsrc_update
> io_uring: support buffer registration updates
> io_uring: support readv/writev with fixed buffers
> io_uring: support buffer registration sharing
>
> fs/io_uring.c | 1021 ++++++++++++++++++++++++++++++++---------
> include/uapi/linux/io_uring.h | 15 +-
> 2 files changed, 807 insertions(+), 229 deletions(-)
>