* [PATCH 1/8] io_uring: modularize io_sqe_buffer_register
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
@ 2020-11-12 23:00 ` Bijan Mottahedeh
2020-11-12 23:00 ` [PATCH 2/8] io_uring: modularize io_sqe_buffers_register Bijan Mottahedeh
` (7 subsequent siblings)
8 siblings, 0 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-12 23:00 UTC (permalink / raw)
To: axboe; +Cc: io-uring
Split io_sqe_buffer_register into two routines:
- io_sqe_buffer_register() registers a single buffer
- io_sqe_buffers_register() iterates over all user-specified buffers
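For context, a minimal userspace sketch of exercising this path through the
raw io_uring_register(2) syscall (illustrative only; the register_bufs()
helper name and the lack of error handling are not part of this series, and
it assumes kernel headers that define __NR_io_uring_register):

	#include <unistd.h>
	#include <sys/uio.h>
	#include <sys/syscall.h>
	#include <linux/io_uring.h>

	/* Register two fixed buffers on an existing ring fd. */
	static int register_bufs(int ring_fd, void *b0, size_t l0,
				 void *b1, size_t l1)
	{
		struct iovec iov[2] = {
			{ .iov_base = b0, .iov_len = l0 },
			{ .iov_base = b1, .iov_len = l1 },
		};

		/* io_sqe_buffers_register() walks the iovec array and calls
		 * io_sqe_buffer_register() once per entry. */
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_BUFFERS, iov, 2);
	}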
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 210 ++++++++++++++++++++++++++++++----------------------------
1 file changed, 107 insertions(+), 103 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c2688b6..29aa3e1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8144,7 +8144,7 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
return pages;
}
-static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
int i, j;
@@ -8262,14 +8262,103 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+ struct io_mapped_ubuf *imu,
+ struct page **last_hpage)
{
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
+ unsigned long off, start, end, ubuf;
+ size_t size;
+ int ret, pret, nr_pages, i;
+
+ ubuf = (unsigned long) iov->iov_base;
+ end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ubuf >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ ret = -ENOMEM;
+
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ goto done;
+
+ vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas)
+ goto done;
+
+ imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ if (!imu->bvec)
+ goto done;
+
+ ret = 0;
+ mmap_read_lock(current->mm);
+ pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ pages, vmas);
+ if (pret == nr_pages) {
+ /* don't support file backed memory */
+ for (i = 0; i < nr_pages; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma->vm_file &&
+ !is_file_hugepages(vma->vm_file)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ }
+ } else {
+ ret = pret < 0 ? pret : -EFAULT;
+ }
+ mmap_read_unlock(current->mm);
+ if (ret) {
+ /*
+ * if we did partial map, or found file backed vmas,
+ * release any pages we did get
+ */
+ if (pret > 0)
+ unpin_user_pages(pages, pret);
+ kvfree(imu->bvec);
+ goto done;
+ }
+
+ ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
+ if (ret) {
+ unpin_user_pages(pages, pret);
+ kvfree(imu->bvec);
+ goto done;
+ }
+
+ off = ubuf & ~PAGE_MASK;
+ size = iov->iov_len;
+ for (i = 0; i < nr_pages; i++) {
+ size_t vec_len;
+
+ vec_len = min_t(size_t, size, PAGE_SIZE - off);
+ imu->bvec[i].bv_page = pages[i];
+ imu->bvec[i].bv_len = vec_len;
+ imu->bvec[i].bv_offset = off;
+ off = 0;
+ size -= vec_len;
+ }
+ /* store original address for later verification */
+ imu->ubuf = ubuf;
+ imu->len = iov->iov_len;
+ imu->nr_bvecs = nr_pages;
+ ret = 0;
+done:
+ kvfree(pages);
+ kvfree(vmas);
+ return ret;
+}
+
+static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int nr_args)
+{
+ int i, ret;
+ struct iovec iov;
struct page *last_hpage = NULL;
- int i, j, got_pages = 0;
- int ret = -EINVAL;
if (ctx->user_bufs)
return -EBUSY;
@@ -8283,14 +8372,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
for (i = 0; i < nr_args; i++) {
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
- unsigned long off, start, end, ubuf;
- int pret, nr_pages;
- struct iovec iov;
- size_t size;
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
- goto err;
+ break;
/*
* Don't impose further limits on the size and buffer
@@ -8299,103 +8384,22 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
*/
ret = -EFAULT;
if (!iov.iov_base || !iov.iov_len)
- goto err;
+ break;
/* arbitrary limit, but we need something */
if (iov.iov_len > SZ_1G)
- goto err;
-
- ubuf = (unsigned long) iov.iov_base;
- end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ubuf >> PAGE_SHIFT;
- nr_pages = end - start;
-
- ret = 0;
- if (!pages || nr_pages > got_pages) {
- kvfree(vmas);
- kvfree(pages);
- pages = kvmalloc_array(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- vmas = kvmalloc_array(nr_pages,
- sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!pages || !vmas) {
- ret = -ENOMEM;
- goto err;
- }
- got_pages = nr_pages;
- }
-
- imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
- GFP_KERNEL);
- ret = -ENOMEM;
- if (!imu->bvec)
- goto err;
-
- ret = 0;
- mmap_read_lock(current->mm);
- pret = pin_user_pages(ubuf, nr_pages,
- FOLL_WRITE | FOLL_LONGTERM,
- pages, vmas);
- if (pret == nr_pages) {
- /* don't support file backed memory */
- for (j = 0; j < nr_pages; j++) {
- struct vm_area_struct *vma = vmas[j];
-
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
- ret = -EOPNOTSUPP;
- break;
- }
- }
- } else {
- ret = pret < 0 ? pret : -EFAULT;
- }
- mmap_read_unlock(current->mm);
- if (ret) {
- /*
- * if we did partial map, or found file backed vmas,
- * release any pages we did get
- */
- if (pret > 0)
- unpin_user_pages(pages, pret);
- kvfree(imu->bvec);
- goto err;
- }
-
- ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
- if (ret) {
- unpin_user_pages(pages, pret);
- kvfree(imu->bvec);
- goto err;
- }
+ break;
- off = ubuf & ~PAGE_MASK;
- size = iov.iov_len;
- for (j = 0; j < nr_pages; j++) {
- size_t vec_len;
-
- vec_len = min_t(size_t, size, PAGE_SIZE - off);
- imu->bvec[j].bv_page = pages[j];
- imu->bvec[j].bv_len = vec_len;
- imu->bvec[j].bv_offset = off;
- off = 0;
- size -= vec_len;
- }
- /* store original address for later verification */
- imu->ubuf = ubuf;
- imu->len = iov.iov_len;
- imu->nr_bvecs = nr_pages;
+ ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
+ if (ret)
+ break;
ctx->nr_user_bufs++;
}
- kvfree(pages);
- kvfree(vmas);
- return 0;
-err:
- kvfree(pages);
- kvfree(vmas);
- io_sqe_buffer_unregister(ctx);
+
+ if (ret)
+ io_sqe_buffers_unregister(ctx);
+
return ret;
}
@@ -8449,7 +8453,7 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
- io_sqe_buffer_unregister(ctx);
+ io_sqe_buffers_unregister(ctx);
if (ctx->sqo_task) {
put_task_struct(ctx->sqo_task);
@@ -9743,13 +9747,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
switch (opcode) {
case IORING_REGISTER_BUFFERS:
- ret = io_sqe_buffer_register(ctx, arg, nr_args);
+ ret = io_sqe_buffers_register(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_BUFFERS:
ret = -EINVAL;
if (arg || nr_args)
break;
- ret = io_sqe_buffer_unregister(ctx);
+ ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
ret = io_sqe_files_register(ctx, arg, nr_args);
--
1.8.3.1
* [PATCH 2/8] io_uring: modularize io_sqe_buffers_register
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
2020-11-12 23:00 ` [PATCH 1/8] io_uring: modularize io_sqe_buffer_register Bijan Mottahedeh
@ 2020-11-12 23:00 ` Bijan Mottahedeh
2020-11-12 23:00 ` [PATCH 3/8] io_uring: generalize fixed file functionality Bijan Mottahedeh
` (6 subsequent siblings)
8 siblings, 0 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-12 23:00 UTC (permalink / raw)
To: axboe; +Cc: io-uring
Move allocation of buffer management structures and validation of
buffers into separate routines.
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 51 ++++++++++++++++++++++++++++++++++-----------------
1 file changed, 34 insertions(+), 17 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 29aa3e1..c6aa8bf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8353,13 +8353,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
return ret;
}
-static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
- int i, ret;
- struct iovec iov;
- struct page *last_hpage = NULL;
-
if (ctx->user_bufs)
return -EBUSY;
if (!nr_args || nr_args > UIO_MAXIOV)
@@ -8370,6 +8365,37 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
if (!ctx->user_bufs)
return -ENOMEM;
+ return 0;
+}
+
+static int io_buffer_validate(struct iovec *iov)
+{
+ /*
+ * Don't impose further limits on the size and buffer
+ * constraints here, we'll -EINVAL later when IO is
+ * submitted if they are wrong.
+ */
+ if (!iov->iov_base || !iov->iov_len)
+ return -EFAULT;
+
+ /* arbitrary limit, but we need something */
+ if (iov->iov_len > SZ_1G)
+ return -EFAULT;
+
+ return 0;
+}
+
+static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int nr_args)
+{
+ int i, ret;
+ struct iovec iov;
+ struct page *last_hpage = NULL;
+
+ ret = io_buffers_map_alloc(ctx, nr_args);
+ if (ret)
+ return ret;
+
for (i = 0; i < nr_args; i++) {
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
@@ -8377,17 +8403,8 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
if (ret)
break;
- /*
- * Don't impose further limits on the size and buffer
- * constraints here, we'll -EINVAL later when IO is
- * submitted if they are wrong.
- */
- ret = -EFAULT;
- if (!iov.iov_base || !iov.iov_len)
- break;
-
- /* arbitrary limit, but we need something */
- if (iov.iov_len > SZ_1G)
+ ret = io_buffer_validate(&iov);
+ if (ret)
break;
ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
--
1.8.3.1
* [PATCH 3/8] io_uring: generalize fixed file functionality
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
2020-11-12 23:00 ` [PATCH 1/8] io_uring: modularize io_sqe_buffer_register Bijan Mottahedeh
2020-11-12 23:00 ` [PATCH 2/8] io_uring: modularize io_sqe_buffers_register Bijan Mottahedeh
@ 2020-11-12 23:00 ` Bijan Mottahedeh
2020-11-12 23:00 ` [PATCH 4/8] io_uring: implement fixed buffers registration similar to fixed files Bijan Mottahedeh
` (5 subsequent siblings)
8 siblings, 0 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-12 23:00 UTC (permalink / raw)
To: axboe; +Cc: io-uring
Generalize fixed_file functionality to fixed_rsrc in order to leverage
it for fixed buffers.
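The core of the generalization (paraphrasing the diff below, not new code)
is that each ref node now carries a type-specific put callback, so the
common teardown path no longer needs to know whether it is releasing a file
or a buffer:

	/* common path (__io_rsrc_put_work() below) just invokes the hook: */
	ref_node->rsrc_put(ctx, prsrc);

	/* the file flavour plugs in its own release routine
	 * (alloc_fixed_file_ref_node() below): */
	ref_node->rsrc_put = io_ring_file_put;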
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 161 +++++++++++++++++++++++++++++++++-------------------------
1 file changed, 92 insertions(+), 69 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c6aa8bf..974a619 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -195,23 +195,39 @@ struct io_mapped_ubuf {
unsigned long acct_pages;
};
-struct fixed_file_table {
- struct file **files;
+struct io_ring_ctx;
+
+struct io_rsrc_put {
+ struct list_head list;
+ union {
+ void *rsrc;
+ struct file *file;
+ struct io_mapped_ubuf *buf;
+ };
};
-struct fixed_file_ref_node {
+struct fixed_rsrc_table {
+ union {
+ struct file **files;
+ struct io_mapped_ubuf *bufs;
+ };
+};
+
+struct fixed_rsrc_ref_node {
struct percpu_ref refs;
struct list_head node;
- struct list_head file_list;
- struct fixed_file_data *file_data;
+ struct list_head rsrc_list;
+ struct fixed_rsrc_data *rsrc_data;
+ void (*rsrc_put)(struct io_ring_ctx *ctx,
+ struct io_rsrc_put *prsrc);
struct llist_node llist;
};
-struct fixed_file_data {
- struct fixed_file_table *table;
+struct fixed_rsrc_data {
+ struct fixed_rsrc_table *table;
struct io_ring_ctx *ctx;
- struct fixed_file_ref_node *node;
+ struct fixed_rsrc_ref_node *node;
struct percpu_ref refs;
struct completion done;
struct list_head ref_list;
@@ -318,7 +334,7 @@ struct io_ring_ctx {
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct fixed_file_data *file_data;
+ struct fixed_rsrc_data *file_data;
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
@@ -6270,7 +6286,7 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
int index)
{
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
return table->files[index & IORING_FILE_TABLE_MASK];
@@ -7136,18 +7152,18 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
#endif
}
-static void io_file_ref_kill(struct percpu_ref *ref)
+static void io_rsrc_ref_kill(struct percpu_ref *ref)
{
- struct fixed_file_data *data;
+ struct fixed_rsrc_data *data;
- data = container_of(ref, struct fixed_file_data, refs);
+ data = container_of(ref, struct fixed_rsrc_data, refs);
complete(&data->done);
}
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
- struct fixed_file_data *data = ctx->file_data;
- struct fixed_file_ref_node *ref_node = NULL;
+ struct fixed_rsrc_data *data = ctx->file_data;
+ struct fixed_rsrc_ref_node *ref_node = NULL;
unsigned nr_tables, i;
if (!data)
@@ -7156,7 +7172,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
spin_lock(&data->lock);
if (!list_empty(&data->ref_list))
ref_node = list_first_entry(&data->ref_list,
- struct fixed_file_ref_node, node);
+ struct fixed_rsrc_ref_node, node);
spin_unlock(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
@@ -7398,13 +7414,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
-static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
+static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
unsigned nr_tables, unsigned nr_files)
{
int i;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &file_data->table[i];
+ struct fixed_rsrc_table *table = &file_data->table[i];
unsigned this_files;
this_files = min(nr_files, IORING_MAX_FILES_TABLE);
@@ -7419,14 +7435,15 @@ static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
return 0;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &file_data->table[i];
+ struct fixed_rsrc_table *table = &file_data->table[i];
kfree(table->files);
}
return 1;
}
-static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
+ struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
@@ -7487,30 +7504,38 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
#endif
}
-struct io_file_put {
- struct list_head list;
- struct file *file;
-};
-
-static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
+static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
{
- struct fixed_file_data *file_data = ref_node->file_data;
- struct io_ring_ctx *ctx = file_data->ctx;
- struct io_file_put *pfile, *tmp;
+ struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
+ struct io_ring_ctx *ctx = rsrc_data->ctx;
+ struct io_rsrc_put *prsrc, *tmp;
- list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
- list_del(&pfile->list);
- io_ring_file_put(ctx, pfile->file);
- kfree(pfile);
+ list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
+ list_del(&prsrc->list);
+ ref_node->rsrc_put(ctx, prsrc);
+ kfree(prsrc);
}
- spin_lock(&file_data->lock);
+ spin_lock(&rsrc_data->lock);
list_del(&ref_node->node);
- spin_unlock(&file_data->lock);
+ spin_unlock(&rsrc_data->lock);
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
- percpu_ref_put(&file_data->refs);
+ percpu_ref_put(&rsrc_data->refs);
+}
+
+static void io_rsrc_put_work(struct llist_node *node)
+{
+ struct fixed_rsrc_ref_node *ref_node;
+ struct llist_node *next;
+
+ while (node) {
+ next = node->next;
+ ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
+ __io_rsrc_put_work(ref_node);
+ node = next;
+ }
}
static void io_file_put_work(struct work_struct *work)
@@ -7520,26 +7545,18 @@ static void io_file_put_work(struct work_struct *work)
ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
node = llist_del_all(&ctx->file_put_llist);
-
- while (node) {
- struct fixed_file_ref_node *ref_node;
- struct llist_node *next = node->next;
-
- ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
- __io_file_put_work(ref_node);
- node = next;
- }
+ io_rsrc_put_work(node);
}
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_ref_node *ref_node;
+ struct fixed_rsrc_ref_node *ref_node;
struct io_ring_ctx *ctx;
bool first_add;
int delay = HZ;
- ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- ctx = ref_node->file_data->ctx;
+ ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
+ ctx = ref_node->rsrc_data->ctx;
if (percpu_ref_is_dying(&ctx->file_data->refs))
delay = 0;
@@ -7551,10 +7568,10 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
queue_delayed_work(system_wq, &ctx->file_put_work, delay);
}
-static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
struct io_ring_ctx *ctx)
{
- struct fixed_file_ref_node *ref_node;
+ struct fixed_rsrc_ref_node *ref_node;
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
if (!ref_node)
@@ -7566,12 +7583,13 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
return ERR_PTR(-ENOMEM);
}
INIT_LIST_HEAD(&ref_node->node);
- INIT_LIST_HEAD(&ref_node->file_list);
- ref_node->file_data = ctx->file_data;
+ INIT_LIST_HEAD(&ref_node->rsrc_list);
+ ref_node->rsrc_data = ctx->file_data;
+ ref_node->rsrc_put = io_ring_file_put;
return ref_node;
}
-static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+static void destroy_fixed_file_ref_node(struct fixed_rsrc_ref_node *ref_node)
{
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
@@ -7584,8 +7602,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_tables, i;
struct file *file;
int fd, ret = -ENOMEM;
- struct fixed_file_ref_node *ref_node;
- struct fixed_file_data *file_data;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct fixed_rsrc_data *file_data;
if (ctx->file_data)
return -EBUSY;
@@ -7608,7 +7626,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (!file_data->table)
goto out_free;
- if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
+ if (percpu_ref_init(&file_data->refs, io_rsrc_ref_kill,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto out_free;
@@ -7617,7 +7635,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
ctx->file_data = file_data;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
unsigned index;
if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
@@ -7728,28 +7746,33 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static int io_queue_file_removal(struct fixed_file_data *data,
- struct file *file)
+static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
{
- struct io_file_put *pfile;
- struct fixed_file_ref_node *ref_node = data->node;
+ struct io_rsrc_put *prsrc;
+ struct fixed_rsrc_ref_node *ref_node = data->node;
- pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
- if (!pfile)
+ prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
+ if (!prsrc)
return -ENOMEM;
- pfile->file = file;
- list_add(&pfile->list, &ref_node->file_list);
+ prsrc->rsrc = rsrc;
+ list_add(&prsrc->list, &ref_node->rsrc_list);
return 0;
}
+static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
+ struct file *file)
+{
+ return io_queue_rsrc_removal(data, (void *)file);
+}
+
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *up,
unsigned nr_args)
{
- struct fixed_file_data *data = ctx->file_data;
- struct fixed_file_ref_node *ref_node;
+ struct fixed_rsrc_data *data = ctx->file_data;
+ struct fixed_rsrc_ref_node *ref_node;
struct file *file;
__s32 __user *fds;
int fd, i, err;
@@ -7768,7 +7791,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
done = 0;
fds = u64_to_user_ptr(up->fds);
while (nr_args) {
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
unsigned index;
err = 0;
@@ -9182,7 +9205,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
struct file *f;
table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
--
1.8.3.1
* [PATCH 4/8] io_uring: implement fixed buffers registration similar to fixed files
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (2 preceding siblings ...)
2020-11-12 23:00 ` [PATCH 3/8] io_uring: generalize fixed file functionality Bijan Mottahedeh
@ 2020-11-12 23:00 ` Bijan Mottahedeh
2020-11-15 13:33 ` Pavel Begunkov
2020-11-12 23:00 ` [PATCH 5/8] io_uring: generalize files_update functionality to rsrc_update Bijan Mottahedeh
` (4 subsequent siblings)
8 siblings, 1 reply; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-12 23:00 UTC (permalink / raw)
To: axboe; +Cc: io-uring
Apply fixed_rsrc functionality for fixed buffers support.
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 294 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 258 insertions(+), 36 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 974a619..de0019e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -104,6 +104,14 @@
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
+/*
+ * Shift of 7 is 128 entries, or exactly one page on 64-bit archs
+ */
+#define IORING_BUF_TABLE_SHIFT 7 /* struct io_mapped_ubuf */
+#define IORING_MAX_BUFS_TABLE (1U << IORING_BUF_TABLE_SHIFT)
+#define IORING_BUF_TABLE_MASK (IORING_MAX_BUFS_TABLE - 1)
+#define IORING_MAX_FIXED_BUFS UIO_MAXIOV
+
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
@@ -338,8 +346,8 @@ struct io_ring_ctx {
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
+ struct fixed_rsrc_data *buf_data;
unsigned nr_user_bufs;
- struct io_mapped_ubuf *user_bufs;
struct user_struct *user;
@@ -401,6 +409,9 @@ struct io_ring_ctx {
struct delayed_work file_put_work;
struct llist_head file_put_llist;
+ struct delayed_work buf_put_work;
+ struct llist_head buf_put_llist;
+
struct work_struct exit_work;
struct io_restriction restrictions;
};
@@ -1019,6 +1030,7 @@ static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
static void io_file_put_work(struct work_struct *work);
+static void io_buf_put_work(struct work_struct *work);
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
@@ -1318,6 +1330,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
init_llist_head(&ctx->file_put_llist);
+ INIT_DELAYED_WORK(&ctx->buf_put_work, io_buf_put_work);
+ init_llist_head(&ctx->buf_put_llist);
return ctx;
err:
if (ctx->fallback_req)
@@ -2949,6 +2963,15 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
io_rw_done(kiocb, ret);
}
+static inline struct io_mapped_ubuf *io_buf_from_index(struct io_ring_ctx *ctx,
+ int index)
+{
+ struct fixed_rsrc_table *table;
+
+ table = &ctx->buf_data->table[index >> IORING_BUF_TABLE_SHIFT];
+ return &table->bufs[index & IORING_BUF_TABLE_MASK];
+}
+
static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
struct iov_iter *iter)
{
@@ -2959,10 +2982,15 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
size_t offset;
u64 buf_addr;
+ /* attempt to use fixed buffers without having provided iovecs */
+ if (unlikely(!ctx->buf_data))
+ return -EFAULT;
+
+ buf_index = req->buf_index;
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
- imu = &ctx->user_bufs[index];
+ imu = io_buf_from_index(ctx, index);
buf_addr = req->rw.addr;
/* overflow */
@@ -8167,28 +8195,73 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
return pages;
}
-static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
- int i, j;
+ unsigned i;
- if (!ctx->user_bufs)
- return -ENXIO;
+ for (i = 0; i < imu->nr_bvecs; i++)
+ unpin_user_page(imu->bvec[i].bv_page);
- for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ if (imu->acct_pages)
+ io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
+ kvfree(imu->bvec);
+ imu->nr_bvecs = 0;
+}
- for (j = 0; j < imu->nr_bvecs; j++)
- unpin_user_page(imu->bvec[j].bv_page);
+static void io_buffers_unmap(struct io_ring_ctx *ctx)
+{
+ unsigned i;
+ struct io_mapped_ubuf *imu;
- if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
- kvfree(imu->bvec);
- imu->nr_bvecs = 0;
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ imu = io_buf_from_index(ctx, i);
+ io_buffer_unmap(ctx, imu);
}
+}
+
+static void io_buffers_map_free(struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_data *data = ctx->buf_data;
+ unsigned nr_tables, i;
+
+ if (!data)
+ return;
- kfree(ctx->user_bufs);
- ctx->user_bufs = NULL;
+ nr_tables = DIV_ROUND_UP(ctx->nr_user_bufs, IORING_MAX_BUFS_TABLE);
+ for (i = 0; i < nr_tables; i++)
+ kfree(data->table[i].bufs);
+ kfree(data->table);
+ percpu_ref_exit(&data->refs);
+ kfree(data);
+ ctx->buf_data = NULL;
ctx->nr_user_bufs = 0;
+}
+
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_data *data = ctx->buf_data;
+ struct fixed_rsrc_ref_node *ref_node = NULL;
+
+ if (!data)
+ return -ENXIO;
+
+ spin_lock(&data->lock);
+ if (!list_empty(&data->ref_list))
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_rsrc_ref_node, node);
+ spin_unlock(&data->lock);
+ if (ref_node)
+ percpu_ref_kill(&ref_node->refs);
+
+ percpu_ref_kill(&data->refs);
+
+ /* wait for all refs nodes to complete */
+ flush_delayed_work(&ctx->buf_put_work);
+ wait_for_completion(&data->done);
+
+ io_buffers_unmap(ctx);
+ io_buffers_map_free(ctx);
+
return 0;
}
@@ -8241,7 +8314,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
/* check previously registered pages */
for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ struct fixed_rsrc_table *table;
+ struct io_mapped_ubuf *imu;
+ unsigned index;
+
+ table = &ctx->buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
+ index = i & IORING_BUF_TABLE_MASK;
+ imu = &table->bufs[index];
for (j = 0; j < imu->nr_bvecs; j++) {
if (!PageCompound(imu->bvec[j].bv_page))
@@ -8376,19 +8455,82 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
return ret;
}
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
+static void io_free_buf_tables(struct fixed_rsrc_data *buf_data,
+ unsigned nr_tables)
{
- if (ctx->user_bufs)
- return -EBUSY;
- if (!nr_args || nr_args > UIO_MAXIOV)
- return -EINVAL;
+ int i;
- ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
- GFP_KERNEL);
- if (!ctx->user_bufs)
- return -ENOMEM;
+ for (i = 0; i < nr_tables; i++) {
+ struct fixed_rsrc_table *table = &buf_data->table[i];
+ kfree(table->bufs);
+ }
+}
- return 0;
+static int io_alloc_buf_tables(struct fixed_rsrc_data *buf_data,
+ unsigned nr_tables, unsigned nr_bufs)
+{
+ int i;
+
+ for (i = 0; i < nr_tables; i++) {
+ struct fixed_rsrc_table *table = &buf_data->table[i];
+ unsigned this_bufs;
+
+ this_bufs = min(nr_bufs, IORING_MAX_BUFS_TABLE);
+ table->bufs = kcalloc(this_bufs, sizeof(struct io_mapped_ubuf),
+ GFP_KERNEL);
+ if (!table->bufs)
+ break;
+ nr_bufs -= this_bufs;
+ }
+
+ if (i == nr_tables)
+ return 0;
+
+ io_free_buf_tables(buf_data, nr_tables);
+ return 1;
+}
+
+static struct fixed_rsrc_data *io_buffers_map_alloc(struct io_ring_ctx *ctx,
+ unsigned int nr_args)
+{
+ unsigned nr_tables;
+ struct fixed_rsrc_data *buf_data;
+ int ret = -ENOMEM;
+
+ if (ctx->buf_data)
+ return ERR_PTR(-EBUSY);
+ if (!nr_args || nr_args > IORING_MAX_FIXED_BUFS)
+ return ERR_PTR(-EINVAL);
+
+ buf_data = kzalloc(sizeof(*ctx->buf_data), GFP_KERNEL);
+ if (!buf_data)
+ return ERR_PTR(-ENOMEM);
+ buf_data->ctx = ctx;
+ init_completion(&buf_data->done);
+ INIT_LIST_HEAD(&buf_data->ref_list);
+ spin_lock_init(&buf_data->lock);
+
+ nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_BUFS_TABLE);
+ buf_data->table = kcalloc(nr_tables, sizeof(buf_data->table),
+ GFP_KERNEL);
+ if (!buf_data->table)
+ goto out_free;
+
+ if (percpu_ref_init(&buf_data->refs, io_rsrc_ref_kill,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ goto out_free;
+
+ if (io_alloc_buf_tables(buf_data, nr_tables, nr_args))
+ goto out_ref;
+
+ return buf_data;
+
+out_ref:
+ percpu_ref_exit(&buf_data->refs);
+out_free:
+ kfree(buf_data->table);
+ kfree(buf_data);
+ return ERR_PTR(ret);
}
static int io_buffer_validate(struct iovec *iov)
@@ -8408,39 +8550,119 @@ static int io_buffer_validate(struct iovec *iov)
return 0;
}
+static void io_buf_put_work(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx;
+ struct llist_node *node;
+
+ ctx = container_of(work, struct io_ring_ctx, buf_put_work.work);
+ node = llist_del_all(&ctx->buf_put_llist);
+ io_rsrc_put_work(node);
+}
+
+static void io_buf_data_ref_zero(struct percpu_ref *ref)
+{
+ struct fixed_rsrc_ref_node *ref_node;
+ struct io_ring_ctx *ctx;
+ bool first_add;
+ int delay = HZ;
+
+ ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
+ ctx = ref_node->rsrc_data->ctx;
+
+ if (percpu_ref_is_dying(&ctx->buf_data->refs))
+ delay = 0;
+
+ first_add = llist_add(&ref_node->llist, &ctx->buf_put_llist);
+ if (!delay)
+ mod_delayed_work(system_wq, &ctx->buf_put_work, 0);
+ else if (first_add)
+ queue_delayed_work(system_wq, &ctx->buf_put_work, delay);
+}
+
+static void io_ring_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+{
+ io_buffer_unmap(ctx, prsrc->buf);
+}
+
+static struct fixed_rsrc_ref_node *alloc_fixed_buf_ref_node(
+ struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_ref_node *ref_node;
+
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return ERR_PTR(-ENOMEM);
+
+ if (percpu_ref_init(&ref_node->refs, io_buf_data_ref_zero,
+ 0, GFP_KERNEL)) {
+ kfree(ref_node);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ref_node->node);
+ INIT_LIST_HEAD(&ref_node->rsrc_list);
+ ref_node->rsrc_data = ctx->buf_data;
+ ref_node->rsrc_put = io_ring_buf_put;
+ return ref_node;
+}
+
static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args)
{
int i, ret;
struct iovec iov;
struct page *last_hpage = NULL;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct fixed_rsrc_data *buf_data;
- ret = io_buffers_map_alloc(ctx, nr_args);
- if (ret)
- return ret;
+ buf_data = io_buffers_map_alloc(ctx, nr_args);
+ if (IS_ERR(buf_data))
+ return PTR_ERR(buf_data);
- for (i = 0; i < nr_args; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+ struct fixed_rsrc_table *table;
+ struct io_mapped_ubuf *imu;
+ unsigned index;
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
break;
+ /* allow sparse sets */
+ if (!iov.iov_base && !iov.iov_len)
+ continue;
+
ret = io_buffer_validate(&iov);
if (ret)
break;
+ table = &buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
+ index = i & IORING_BUF_TABLE_MASK;
+ imu = &table->bufs[index];
+
ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
if (ret)
break;
+ }
- ctx->nr_user_bufs++;
+ ctx->buf_data = buf_data;
+ if (ret) {
+ io_sqe_buffers_unregister(ctx);
+ return ret;
}
- if (ret)
+ ref_node = alloc_fixed_buf_ref_node(ctx);
+ if (IS_ERR(ref_node)) {
io_sqe_buffers_unregister(ctx);
+ return PTR_ERR(ref_node);
+ }
- return ret;
+ buf_data->node = ref_node;
+ spin_lock(&buf_data->lock);
+ list_add(&ref_node->node, &buf_data->ref_list);
+ spin_unlock(&buf_data->lock);
+ percpu_ref_get(&buf_data->refs);
+ return 0;
}
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
@@ -9217,7 +9439,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
}
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+ struct io_mapped_ubuf *buf = io_buf_from_index(ctx, i);
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
(unsigned int) buf->len);
--
1.8.3.1
* Re: [PATCH 4/8] io_uring: implement fixed buffers registration similar to fixed files
2020-11-12 23:00 ` [PATCH 4/8] io_uring: implement fixed buffers registration similar to fixed files Bijan Mottahedeh
@ 2020-11-15 13:33 ` Pavel Begunkov
2020-11-16 21:24 ` Bijan Mottahedeh
0 siblings, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-11-15 13:33 UTC (permalink / raw)
To: Bijan Mottahedeh, axboe; +Cc: io-uring
On 12/11/2020 23:00, Bijan Mottahedeh wrote:
> Apply fixed_rsrc functionality for fixed buffers support.
I don't get it, requests with fixed files take a ref to a node (see
fixed_file_refs) and put it on free, but I don't see anything similar
here. Did you work around it somehow?
That's not critical for this particular patch as you still do a full
quiesce in __io_uring_register(), but IIRC it was essential for
update/remove requests.
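For reference, the fixed-file scheme being referred to is roughly the
following (paraphrased from the existing file path, not a literal quote of
the current code):

	/* submit side, io_file_get() for a fixed file: pin the current node */
	req->fixed_file_refs = &ctx->file_data->node->refs;
	percpu_ref_get(req->fixed_file_refs);

	/* completion/free side, io_put_file(): drop it again */
	percpu_ref_put(req->fixed_file_refs);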
>
> Signed-off-by: Bijan Mottahedeh <[email protected]>
> ---
> fs/io_uring.c | 294 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 258 insertions(+), 36 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 974a619..de0019e 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -104,6 +104,14 @@
> #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
> IORING_REGISTER_LAST + IORING_OP_LAST)
>
> +/*
> + * Shift of 7 is 128 entries, or exactly one page on 64-bit archs
> + */
> +#define IORING_BUF_TABLE_SHIFT 7 /* struct io_mapped_ubuf */
> +#define IORING_MAX_BUFS_TABLE (1U << IORING_BUF_TABLE_SHIFT)
> +#define IORING_BUF_TABLE_MASK (IORING_MAX_BUFS_TABLE - 1)
> +#define IORING_MAX_FIXED_BUFS UIO_MAXIOV
> +
> struct io_uring {
> u32 head ____cacheline_aligned_in_smp;
> u32 tail ____cacheline_aligned_in_smp;
> @@ -338,8 +346,8 @@ struct io_ring_ctx {
> unsigned nr_user_files;
>
> /* if used, fixed mapped user buffers */
> + struct fixed_rsrc_data *buf_data;
> unsigned nr_user_bufs;
> - struct io_mapped_ubuf *user_bufs;
>
> struct user_struct *user;
>
> @@ -401,6 +409,9 @@ struct io_ring_ctx {
> struct delayed_work file_put_work;
> struct llist_head file_put_llist;
>
> + struct delayed_work buf_put_work;
> + struct llist_head buf_put_llist;
> +
> struct work_struct exit_work;
> struct io_restriction restrictions;
> };
> @@ -1019,6 +1030,7 @@ static struct file *io_file_get(struct io_submit_state *state,
> struct io_kiocb *req, int fd, bool fixed);
> static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
> static void io_file_put_work(struct work_struct *work);
> +static void io_buf_put_work(struct work_struct *work);
>
> static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
> struct iovec **iovec, struct iov_iter *iter,
> @@ -1318,6 +1330,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> INIT_LIST_HEAD(&ctx->inflight_list);
> INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
> init_llist_head(&ctx->file_put_llist);
> + INIT_DELAYED_WORK(&ctx->buf_put_work, io_buf_put_work);
> + init_llist_head(&ctx->buf_put_llist);
> return ctx;
> err:
> if (ctx->fallback_req)
> @@ -2949,6 +2963,15 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
> io_rw_done(kiocb, ret);
> }
>
> +static inline struct io_mapped_ubuf *io_buf_from_index(struct io_ring_ctx *ctx,
> + int index)
> +{
> + struct fixed_rsrc_table *table;
> +
> + table = &ctx->buf_data->table[index >> IORING_BUF_TABLE_SHIFT];
> + return &table->bufs[index & IORING_BUF_TABLE_MASK];
> +}
> +
> static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
> struct iov_iter *iter)
> {
> @@ -2959,10 +2982,15 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
> size_t offset;
> u64 buf_addr;
>
> + /* attempt to use fixed buffers without having provided iovecs */
> + if (unlikely(!ctx->buf_data))
> + return -EFAULT;
> +
> + buf_index = req->buf_index;
> if (unlikely(buf_index >= ctx->nr_user_bufs))
> return -EFAULT;
> index = array_index_nospec(buf_index, ctx->nr_user_bufs);
> - imu = &ctx->user_bufs[index];
> + imu = io_buf_from_index(ctx, index);
> buf_addr = req->rw.addr;
>
> /* overflow */
> @@ -8167,28 +8195,73 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
> return pages;
> }
>
> -static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
> {
> - int i, j;
> + unsigned i;
>
> - if (!ctx->user_bufs)
> - return -ENXIO;
> + for (i = 0; i < imu->nr_bvecs; i++)
> + unpin_user_page(imu->bvec[i].bv_page);
>
> - for (i = 0; i < ctx->nr_user_bufs; i++) {
> - struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
> + if (imu->acct_pages)
> + io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
> + kvfree(imu->bvec);
> + imu->nr_bvecs = 0;
> +}
>
> - for (j = 0; j < imu->nr_bvecs; j++)
> - unpin_user_page(imu->bvec[j].bv_page);
> +static void io_buffers_unmap(struct io_ring_ctx *ctx)
> +{
> + unsigned i;
> + struct io_mapped_ubuf *imu;
>
> - if (imu->acct_pages)
> - io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
> - kvfree(imu->bvec);
> - imu->nr_bvecs = 0;
> + for (i = 0; i < ctx->nr_user_bufs; i++) {
> + imu = io_buf_from_index(ctx, i);
> + io_buffer_unmap(ctx, imu);
> }
> +}
> +
> +static void io_buffers_map_free(struct io_ring_ctx *ctx)
> +{
> + struct fixed_rsrc_data *data = ctx->buf_data;
> + unsigned nr_tables, i;
> +
> + if (!data)
> + return;
>
> - kfree(ctx->user_bufs);
> - ctx->user_bufs = NULL;
> + nr_tables = DIV_ROUND_UP(ctx->nr_user_bufs, IORING_MAX_BUFS_TABLE);
> + for (i = 0; i < nr_tables; i++)
> + kfree(data->table[i].bufs);
> + kfree(data->table);
> + percpu_ref_exit(&data->refs);
> + kfree(data);
> + ctx->buf_data = NULL;
> ctx->nr_user_bufs = 0;
> +}
> +
> +static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> +{
> + struct fixed_rsrc_data *data = ctx->buf_data;
> + struct fixed_rsrc_ref_node *ref_node = NULL;
> +
> + if (!data)
> + return -ENXIO;
> +
> + spin_lock(&data->lock);
> + if (!list_empty(&data->ref_list))
> + ref_node = list_first_entry(&data->ref_list,
> + struct fixed_rsrc_ref_node, node);
> + spin_unlock(&data->lock);
> + if (ref_node)
> + percpu_ref_kill(&ref_node->refs);
> +
> + percpu_ref_kill(&data->refs);
> +
> + /* wait for all refs nodes to complete */
> + flush_delayed_work(&ctx->buf_put_work);
> + wait_for_completion(&data->done);
> +
> + io_buffers_unmap(ctx);
> + io_buffers_map_free(ctx);
> +
> return 0;
> }
>
> @@ -8241,7 +8314,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
>
> /* check previously registered pages */
> for (i = 0; i < ctx->nr_user_bufs; i++) {
> - struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
> + struct fixed_rsrc_table *table;
> + struct io_mapped_ubuf *imu;
> + unsigned index;
> +
> + table = &ctx->buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
> + index = i & IORING_BUF_TABLE_MASK;
> + imu = &table->bufs[index];
>
> for (j = 0; j < imu->nr_bvecs; j++) {
> if (!PageCompound(imu->bvec[j].bv_page))
> @@ -8376,19 +8455,82 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
> return ret;
> }
>
> -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
> +static void io_free_buf_tables(struct fixed_rsrc_data *buf_data,
> + unsigned nr_tables)
> {
> - if (ctx->user_bufs)
> - return -EBUSY;
> - if (!nr_args || nr_args > UIO_MAXIOV)
> - return -EINVAL;
> + int i;
>
> - ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
> - GFP_KERNEL);
> - if (!ctx->user_bufs)
> - return -ENOMEM;
> + for (i = 0; i < nr_tables; i++) {
> + struct fixed_rsrc_table *table = &buf_data->table[i];
> + kfree(table->bufs);
> + }
> +}
>
> - return 0;
> +static int io_alloc_buf_tables(struct fixed_rsrc_data *buf_data,
> + unsigned nr_tables, unsigned nr_bufs)
> +{
> + int i;
> +
> + for (i = 0; i < nr_tables; i++) {
> + struct fixed_rsrc_table *table = &buf_data->table[i];
> + unsigned this_bufs;
> +
> + this_bufs = min(nr_bufs, IORING_MAX_BUFS_TABLE);
> + table->bufs = kcalloc(this_bufs, sizeof(struct io_mapped_ubuf),
> + GFP_KERNEL);
> + if (!table->bufs)
> + break;
> + nr_bufs -= this_bufs;
> + }
> +
> + if (i == nr_tables)
> + return 0;
> +
> + io_free_buf_tables(buf_data, nr_tables);
> + return 1;
> +}
> +
> +static struct fixed_rsrc_data *io_buffers_map_alloc(struct io_ring_ctx *ctx,
> + unsigned int nr_args)
> +{
> + unsigned nr_tables;
> + struct fixed_rsrc_data *buf_data;
> + int ret = -ENOMEM;
> +
> + if (ctx->buf_data)
> + return ERR_PTR(-EBUSY);
> + if (!nr_args || nr_args > IORING_MAX_FIXED_BUFS)
> + return ERR_PTR(-EINVAL);
> +
> + buf_data = kzalloc(sizeof(*ctx->buf_data), GFP_KERNEL);
> + if (!buf_data)
> + return ERR_PTR(-ENOMEM);
> + buf_data->ctx = ctx;
> + init_completion(&buf_data->done);
> + INIT_LIST_HEAD(&buf_data->ref_list);
> + spin_lock_init(&buf_data->lock);
> +
> + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_BUFS_TABLE);
> + buf_data->table = kcalloc(nr_tables, sizeof(buf_data->table),
> + GFP_KERNEL);
> + if (!buf_data->table)
> + goto out_free;
> +
> + if (percpu_ref_init(&buf_data->refs, io_rsrc_ref_kill,
> + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
> + goto out_free;
> +
> + if (io_alloc_buf_tables(buf_data, nr_tables, nr_args))
> + goto out_ref;
> +
> + return buf_data;
> +
> +out_ref:
> + percpu_ref_exit(&buf_data->refs);
> +out_free:
> + kfree(buf_data->table);
> + kfree(buf_data);
> + return ERR_PTR(ret);
> }
>
> static int io_buffer_validate(struct iovec *iov)
> @@ -8408,39 +8550,119 @@ static int io_buffer_validate(struct iovec *iov)
> return 0;
> }
>
> +static void io_buf_put_work(struct work_struct *work)
> +{
> + struct io_ring_ctx *ctx;
> + struct llist_node *node;
> +
> + ctx = container_of(work, struct io_ring_ctx, buf_put_work.work);
> + node = llist_del_all(&ctx->buf_put_llist);
> + io_rsrc_put_work(node);
> +}
> +
> +static void io_buf_data_ref_zero(struct percpu_ref *ref)
> +{
> + struct fixed_rsrc_ref_node *ref_node;
> + struct io_ring_ctx *ctx;
> + bool first_add;
> + int delay = HZ;
> +
> + ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
> + ctx = ref_node->rsrc_data->ctx;
> +
> + if (percpu_ref_is_dying(&ctx->buf_data->refs))
> + delay = 0;
> +
> + first_add = llist_add(&ref_node->llist, &ctx->buf_put_llist);
> + if (!delay)
> + mod_delayed_work(system_wq, &ctx->buf_put_work, 0);
> + else if (first_add)
> + queue_delayed_work(system_wq, &ctx->buf_put_work, delay);
> +}
> +
> +static void io_ring_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
> +{
> + io_buffer_unmap(ctx, prsrc->buf);
> +}
> +
> +static struct fixed_rsrc_ref_node *alloc_fixed_buf_ref_node(
> + struct io_ring_ctx *ctx)
> +{
> + struct fixed_rsrc_ref_node *ref_node;
> +
> + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
> + if (!ref_node)
> + return ERR_PTR(-ENOMEM);
> +
> + if (percpu_ref_init(&ref_node->refs, io_buf_data_ref_zero,
> + 0, GFP_KERNEL)) {
> + kfree(ref_node);
> + return ERR_PTR(-ENOMEM);
> + }
> + INIT_LIST_HEAD(&ref_node->node);
> + INIT_LIST_HEAD(&ref_node->rsrc_list);
> + ref_node->rsrc_data = ctx->buf_data;
> + ref_node->rsrc_put = io_ring_buf_put;
> + return ref_node;
> +}
> +
> static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> unsigned int nr_args)
> {
> int i, ret;
> struct iovec iov;
> struct page *last_hpage = NULL;
> + struct fixed_rsrc_ref_node *ref_node;
> + struct fixed_rsrc_data *buf_data;
>
> - ret = io_buffers_map_alloc(ctx, nr_args);
> - if (ret)
> - return ret;
> + buf_data = io_buffers_map_alloc(ctx, nr_args);
> + if (IS_ERR(buf_data))
> + return PTR_ERR(buf_data);
>
> - for (i = 0; i < nr_args; i++) {
> - struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
> + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
> + struct fixed_rsrc_table *table;
> + struct io_mapped_ubuf *imu;
> + unsigned index;
>
> ret = io_copy_iov(ctx, &iov, arg, i);
> if (ret)
> break;
>
> + /* allow sparse sets */
> + if (!iov.iov_base && !iov.iov_len)
> + continue;
> +
> ret = io_buffer_validate(&iov);
> if (ret)
> break;
>
> + table = &buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
> + index = i & IORING_BUF_TABLE_MASK;
> + imu = &table->bufs[index];
> +
> ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
> if (ret)
> break;
> + }
>
> - ctx->nr_user_bufs++;
> + ctx->buf_data = buf_data;
> + if (ret) {
> + io_sqe_buffers_unregister(ctx);
> + return ret;
> }
>
> - if (ret)
> + ref_node = alloc_fixed_buf_ref_node(ctx);
> + if (IS_ERR(ref_node)) {
> io_sqe_buffers_unregister(ctx);
> + return PTR_ERR(ref_node);
> + }
>
> - return ret;
> + buf_data->node = ref_node;
> + spin_lock(&buf_data->lock);
> + list_add(&ref_node->node, &buf_data->ref_list);
> + spin_unlock(&buf_data->lock);
> + percpu_ref_get(&buf_data->refs);
> + return 0;
> }
>
> static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
> @@ -9217,7 +9439,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> }
> seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
> for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
> - struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
> + struct io_mapped_ubuf *buf = io_buf_from_index(ctx, i);
>
> seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
> (unsigned int) buf->len);
>
--
Pavel Begunkov
* Re: [PATCH 4/8] io_uring: implement fixed buffers registration similar to fixed files
2020-11-15 13:33 ` Pavel Begunkov
@ 2020-11-16 21:24 ` Bijan Mottahedeh
2020-11-16 23:09 ` Pavel Begunkov
0 siblings, 1 reply; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-16 21:24 UTC (permalink / raw)
To: Pavel Begunkov, axboe; +Cc: io-uring
On 11/15/2020 5:33 AM, Pavel Begunkov wrote:
> On 12/11/2020 23:00, Bijan Mottahedeh wrote:
>> Apply fixed_rsrc functionality for fixed buffers support.
>
> I don't get it, requests with fixed files take a ref to a node (see
> fixed_file_refs) and put it on free, but I don't see anything similar
> here. Did you work around it somehow?
No, that's my oversight. I think I wrongly assumed that
io_import_*fixed() would take care of that.
Should I basically do something similar to io_file_get()/io_put_file()?
io_import_fixed()
io_import_iovec_fixed()
-> io_buf_get()
io_dismantle_io()
-> io_put_buf()
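Something along these lines, mirroring the fixed-file pair (io_buf_get(),
io_put_buf() and the fixed_buf_refs field would all be new, so this is only
a sketch of the idea, not tested code):

	static void io_buf_get(struct io_kiocb *req, struct io_ring_ctx *ctx)
	{
		/* pin the current buffer ref node for the request's lifetime */
		req->fixed_buf_refs = &ctx->buf_data->node->refs;
		percpu_ref_get(req->fixed_buf_refs);
	}

	static void io_put_buf(struct io_kiocb *req)
	{
		if (req->fixed_buf_refs)
			percpu_ref_put(req->fixed_buf_refs);
	}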
>
> That's not critical for this particular patch as you still do full
> quisce in __io_uring_register(), but IIRC was essential for
> update/remove requests.
That's something I'm not clear about. Currently we quiesce for the
following cases:
case IORING_UNREGISTER_FILES:
case IORING_REGISTER_FILES_UPDATE:
case IORING_REGISTER_BUFFERS_UPDATE:
I had assumed I have to add IORING_UNREGISTER_BUFFERS as well. But for
the cases above, do we in fact need the quiesce, given the ref counts?
Are you ok with the rest of the patches or should I address anything else?
>
>>
>> Signed-off-by: Bijan Mottahedeh <[email protected]>
>> ---
>> fs/io_uring.c | 294 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
>> 1 file changed, 258 insertions(+), 36 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 974a619..de0019e 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -104,6 +104,14 @@
>> #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
>> IORING_REGISTER_LAST + IORING_OP_LAST)
>>
>> +/*
>> + * Shift of 7 is 128 entries, or exactly one page on 64-bit archs
>> + */
>> +#define IORING_BUF_TABLE_SHIFT 7 /* struct io_mapped_ubuf */
>> +#define IORING_MAX_BUFS_TABLE (1U << IORING_BUF_TABLE_SHIFT)
>> +#define IORING_BUF_TABLE_MASK (IORING_MAX_BUFS_TABLE - 1)
>> +#define IORING_MAX_FIXED_BUFS UIO_MAXIOV
>> +
>> struct io_uring {
>> u32 head ____cacheline_aligned_in_smp;
>> u32 tail ____cacheline_aligned_in_smp;
>> @@ -338,8 +346,8 @@ struct io_ring_ctx {
>> unsigned nr_user_files;
>>
>> /* if used, fixed mapped user buffers */
>> + struct fixed_rsrc_data *buf_data;
>> unsigned nr_user_bufs;
>> - struct io_mapped_ubuf *user_bufs;
>>
>> struct user_struct *user;
>>
>> @@ -401,6 +409,9 @@ struct io_ring_ctx {
>> struct delayed_work file_put_work;
>> struct llist_head file_put_llist;
>>
>> + struct delayed_work buf_put_work;
>> + struct llist_head buf_put_llist;
>> +
>> struct work_struct exit_work;
>> struct io_restriction restrictions;
>> };
>> @@ -1019,6 +1030,7 @@ static struct file *io_file_get(struct io_submit_state *state,
>> struct io_kiocb *req, int fd, bool fixed);
>> static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
>> static void io_file_put_work(struct work_struct *work);
>> +static void io_buf_put_work(struct work_struct *work);
>>
>> static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
>> struct iovec **iovec, struct iov_iter *iter,
>> @@ -1318,6 +1330,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>> INIT_LIST_HEAD(&ctx->inflight_list);
>> INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
>> init_llist_head(&ctx->file_put_llist);
>> + INIT_DELAYED_WORK(&ctx->buf_put_work, io_buf_put_work);
>> + init_llist_head(&ctx->buf_put_llist);
>> return ctx;
>> err:
>> if (ctx->fallback_req)
>> @@ -2949,6 +2963,15 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
>> io_rw_done(kiocb, ret);
>> }
>>
>> +static inline struct io_mapped_ubuf *io_buf_from_index(struct io_ring_ctx *ctx,
>> + int index)
>> +{
>> + struct fixed_rsrc_table *table;
>> +
>> + table = &ctx->buf_data->table[index >> IORING_BUF_TABLE_SHIFT];
>> + return &table->bufs[index & IORING_BUF_TABLE_MASK];
>> +}
>> +
>> static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
>> struct iov_iter *iter)
>> {
>> @@ -2959,10 +2982,15 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
>> size_t offset;
>> u64 buf_addr;
>>
>> + /* attempt to use fixed buffers without having provided iovecs */
>> + if (unlikely(!ctx->buf_data))
>> + return -EFAULT;
>> +
>> + buf_index = req->buf_index;
>> if (unlikely(buf_index >= ctx->nr_user_bufs))
>> return -EFAULT;
>> index = array_index_nospec(buf_index, ctx->nr_user_bufs);
>> - imu = &ctx->user_bufs[index];
>> + imu = io_buf_from_index(ctx, index);
>> buf_addr = req->rw.addr;
>>
>> /* overflow */
>> @@ -8167,28 +8195,73 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
>> return pages;
>> }
>>
>> -static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
>> +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
>> {
>> - int i, j;
>> + unsigned i;
>>
>> - if (!ctx->user_bufs)
>> - return -ENXIO;
>> + for (i = 0; i < imu->nr_bvecs; i++)
>> + unpin_user_page(imu->bvec[i].bv_page);
>>
>> - for (i = 0; i < ctx->nr_user_bufs; i++) {
>> - struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
>> + if (imu->acct_pages)
>> + io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
>> + kvfree(imu->bvec);
>> + imu->nr_bvecs = 0;
>> +}
>>
>> - for (j = 0; j < imu->nr_bvecs; j++)
>> - unpin_user_page(imu->bvec[j].bv_page);
>> +static void io_buffers_unmap(struct io_ring_ctx *ctx)
>> +{
>> + unsigned i;
>> + struct io_mapped_ubuf *imu;
>>
>> - if (imu->acct_pages)
>> - io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
>> - kvfree(imu->bvec);
>> - imu->nr_bvecs = 0;
>> + for (i = 0; i < ctx->nr_user_bufs; i++) {
>> + imu = io_buf_from_index(ctx, i);
>> + io_buffer_unmap(ctx, imu);
>> }
>> +}
>> +
>> +static void io_buffers_map_free(struct io_ring_ctx *ctx)
>> +{
>> + struct fixed_rsrc_data *data = ctx->buf_data;
>> + unsigned nr_tables, i;
>> +
>> + if (!data)
>> + return;
>>
>> - kfree(ctx->user_bufs);
>> - ctx->user_bufs = NULL;
>> + nr_tables = DIV_ROUND_UP(ctx->nr_user_bufs, IORING_MAX_BUFS_TABLE);
>> + for (i = 0; i < nr_tables; i++)
>> + kfree(data->table[i].bufs);
>> + kfree(data->table);
>> + percpu_ref_exit(&data->refs);
>> + kfree(data);
>> + ctx->buf_data = NULL;
>> ctx->nr_user_bufs = 0;
>> +}
>> +
>> +static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
>> +{
>> + struct fixed_rsrc_data *data = ctx->buf_data;
>> + struct fixed_rsrc_ref_node *ref_node = NULL;
>> +
>> + if (!data)
>> + return -ENXIO;
>> +
>> + spin_lock(&data->lock);
>> + if (!list_empty(&data->ref_list))
>> + ref_node = list_first_entry(&data->ref_list,
>> + struct fixed_rsrc_ref_node, node);
>> + spin_unlock(&data->lock);
>> + if (ref_node)
>> + percpu_ref_kill(&ref_node->refs);
>> +
>> + percpu_ref_kill(&data->refs);
>> +
>> + /* wait for all refs nodes to complete */
>> + flush_delayed_work(&ctx->buf_put_work);
>> + wait_for_completion(&data->done);
>> +
>> + io_buffers_unmap(ctx);
>> + io_buffers_map_free(ctx);
>> +
>> return 0;
>> }
>>
>> @@ -8241,7 +8314,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
>>
>> /* check previously registered pages */
>> for (i = 0; i < ctx->nr_user_bufs; i++) {
>> - struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
>> + struct fixed_rsrc_table *table;
>> + struct io_mapped_ubuf *imu;
>> + unsigned index;
>> +
>> + table = &ctx->buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
>> + index = i & IORING_BUF_TABLE_MASK;
>> + imu = &table->bufs[index];
>>
>> for (j = 0; j < imu->nr_bvecs; j++) {
>> if (!PageCompound(imu->bvec[j].bv_page))
>> @@ -8376,19 +8455,82 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
>> return ret;
>> }
>>
>> -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
>> +static void io_free_buf_tables(struct fixed_rsrc_data *buf_data,
>> + unsigned nr_tables)
>> {
>> - if (ctx->user_bufs)
>> - return -EBUSY;
>> - if (!nr_args || nr_args > UIO_MAXIOV)
>> - return -EINVAL;
>> + int i;
>>
>> - ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
>> - GFP_KERNEL);
>> - if (!ctx->user_bufs)
>> - return -ENOMEM;
>> + for (i = 0; i < nr_tables; i++) {
>> + struct fixed_rsrc_table *table = &buf_data->table[i];
>> + kfree(table->bufs);
>> + }
>> +}
>>
>> - return 0;
>> +static int io_alloc_buf_tables(struct fixed_rsrc_data *buf_data,
>> + unsigned nr_tables, unsigned nr_bufs)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < nr_tables; i++) {
>> + struct fixed_rsrc_table *table = &buf_data->table[i];
>> + unsigned this_bufs;
>> +
>> + this_bufs = min(nr_bufs, IORING_MAX_BUFS_TABLE);
>> + table->bufs = kcalloc(this_bufs, sizeof(struct io_mapped_ubuf),
>> + GFP_KERNEL);
>> + if (!table->bufs)
>> + break;
>> + nr_bufs -= this_bufs;
>> + }
>> +
>> + if (i == nr_tables)
>> + return 0;
>> +
>> + io_free_buf_tables(buf_data, nr_tables);
>> + return 1;
>> +}
>> +
>> +static struct fixed_rsrc_data *io_buffers_map_alloc(struct io_ring_ctx *ctx,
>> + unsigned int nr_args)
>> +{
>> + unsigned nr_tables;
>> + struct fixed_rsrc_data *buf_data;
>> + int ret = -ENOMEM;
>> +
>> + if (ctx->buf_data)
>> + return ERR_PTR(-EBUSY);
>> + if (!nr_args || nr_args > IORING_MAX_FIXED_BUFS)
>> + return ERR_PTR(-EINVAL);
>> +
>> + buf_data = kzalloc(sizeof(*ctx->buf_data), GFP_KERNEL);
>> + if (!buf_data)
>> + return ERR_PTR(-ENOMEM);
>> + buf_data->ctx = ctx;
>> + init_completion(&buf_data->done);
>> + INIT_LIST_HEAD(&buf_data->ref_list);
>> + spin_lock_init(&buf_data->lock);
>> +
>> + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_BUFS_TABLE);
>> + buf_data->table = kcalloc(nr_tables, sizeof(buf_data->table),
>> + GFP_KERNEL);
>> + if (!buf_data->table)
>> + goto out_free;
>> +
>> + if (percpu_ref_init(&buf_data->refs, io_rsrc_ref_kill,
>> + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
>> + goto out_free;
>> +
>> + if (io_alloc_buf_tables(buf_data, nr_tables, nr_args))
>> + goto out_ref;
>> +
>> + return buf_data;
>> +
>> +out_ref:
>> + percpu_ref_exit(&buf_data->refs);
>> +out_free:
>> + kfree(buf_data->table);
>> + kfree(buf_data);
>> + return ERR_PTR(ret);
>> }
>>
>> static int io_buffer_validate(struct iovec *iov)
>> @@ -8408,39 +8550,119 @@ static int io_buffer_validate(struct iovec *iov)
>> return 0;
>> }
>>
>> +static void io_buf_put_work(struct work_struct *work)
>> +{
>> + struct io_ring_ctx *ctx;
>> + struct llist_node *node;
>> +
>> + ctx = container_of(work, struct io_ring_ctx, buf_put_work.work);
>> + node = llist_del_all(&ctx->buf_put_llist);
>> + io_rsrc_put_work(node);
>> +}
>> +
>> +static void io_buf_data_ref_zero(struct percpu_ref *ref)
>> +{
>> + struct fixed_rsrc_ref_node *ref_node;
>> + struct io_ring_ctx *ctx;
>> + bool first_add;
>> + int delay = HZ;
>> +
>> + ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
>> + ctx = ref_node->rsrc_data->ctx;
>> +
>> + if (percpu_ref_is_dying(&ctx->buf_data->refs))
>> + delay = 0;
>> +
>> + first_add = llist_add(&ref_node->llist, &ctx->buf_put_llist);
>> + if (!delay)
>> + mod_delayed_work(system_wq, &ctx->buf_put_work, 0);
>> + else if (first_add)
>> + queue_delayed_work(system_wq, &ctx->buf_put_work, delay);
>> +}
>> +
>> +static void io_ring_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
>> +{
>> + io_buffer_unmap(ctx, prsrc->buf);
>> +}
>> +
>> +static struct fixed_rsrc_ref_node *alloc_fixed_buf_ref_node(
>> + struct io_ring_ctx *ctx)
>> +{
>> + struct fixed_rsrc_ref_node *ref_node;
>> +
>> + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
>> + if (!ref_node)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + if (percpu_ref_init(&ref_node->refs, io_buf_data_ref_zero,
>> + 0, GFP_KERNEL)) {
>> + kfree(ref_node);
>> + return ERR_PTR(-ENOMEM);
>> + }
>> + INIT_LIST_HEAD(&ref_node->node);
>> + INIT_LIST_HEAD(&ref_node->rsrc_list);
>> + ref_node->rsrc_data = ctx->buf_data;
>> + ref_node->rsrc_put = io_ring_buf_put;
>> + return ref_node;
>> +}
>> +
>> static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>> unsigned int nr_args)
>> {
>> int i, ret;
>> struct iovec iov;
>> struct page *last_hpage = NULL;
>> + struct fixed_rsrc_ref_node *ref_node;
>> + struct fixed_rsrc_data *buf_data;
>>
>> - ret = io_buffers_map_alloc(ctx, nr_args);
>> - if (ret)
>> - return ret;
>> + buf_data = io_buffers_map_alloc(ctx, nr_args);
>> + if (IS_ERR(buf_data))
>> + return PTR_ERR(buf_data);
>>
>> - for (i = 0; i < nr_args; i++) {
>> - struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
>> + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
>> + struct fixed_rsrc_table *table;
>> + struct io_mapped_ubuf *imu;
>> + unsigned index;
>>
>> ret = io_copy_iov(ctx, &iov, arg, i);
>> if (ret)
>> break;
>>
>> + /* allow sparse sets */
>> + if (!iov.iov_base && !iov.iov_len)
>> + continue;
>> +
>> ret = io_buffer_validate(&iov);
>> if (ret)
>> break;
>>
>> + table = &buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
>> + index = i & IORING_BUF_TABLE_MASK;
>> + imu = &table->bufs[index];
>> +
>> ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
>> if (ret)
>> break;
>> + }
>>
>> - ctx->nr_user_bufs++;
>> + ctx->buf_data = buf_data;
>> + if (ret) {
>> + io_sqe_buffers_unregister(ctx);
>> + return ret;
>> }
>>
>> - if (ret)
>> + ref_node = alloc_fixed_buf_ref_node(ctx);
>> + if (IS_ERR(ref_node)) {
>> io_sqe_buffers_unregister(ctx);
>> + return PTR_ERR(ref_node);
>> + }
>>
>> - return ret;
>> + buf_data->node = ref_node;
>> + spin_lock(&buf_data->lock);
>> + list_add(&ref_node->node, &buf_data->ref_list);
>> + spin_unlock(&buf_data->lock);
>> + percpu_ref_get(&buf_data->refs);
>> + return 0;
>> }
>>
>> static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
>> @@ -9217,7 +9439,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>> }
>> seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
>> for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
>> - struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
>> + struct io_mapped_ubuf *buf = io_buf_from_index(ctx, i);
>>
>> seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
>> (unsigned int) buf->len);
>>
>
* Re: [PATCH 4/8] io_uring: implement fixed buffers registration similar to fixed files
2020-11-16 21:24 ` Bijan Mottahedeh
@ 2020-11-16 23:09 ` Pavel Begunkov
2020-11-17 0:41 ` Bijan Mottahedeh
0 siblings, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-11-16 23:09 UTC (permalink / raw)
To: Bijan Mottahedeh, axboe; +Cc: io-uring
On 16/11/2020 21:24, Bijan Mottahedeh wrote:
> On 11/15/2020 5:33 AM, Pavel Begunkov wrote:
>> On 12/11/2020 23:00, Bijan Mottahedeh wrote:
>>> Apply fixed_rsrc functionality for fixed buffers support.
>>
>> I don't get it, requests with fixed files take a ref to a node (see
>> fixed_file_refs) and put it on free, but I don't see anything similar
>> here. Did you work around it somehow?
>
> No that's my oversight. I think I wrongfully assumed that io_import_*fixed() would take care of that.
>
> Should I basically do something similar to io_file_get()/io_put_file()?
If done in a dumb way, that'd mean an extra pair of percpu get/put
and +8B in io_kiocb. Frankly, I don't like that idea.
However, if you don't split paths and instead make fixed_file_ref_node
support all types of resources at the same time, it should be
bearable. I.e. register removals of both types to a single node,
and use ->fixed_file_refs for all of a request's resources.
That way you don't grow io_kiocb and do at most one percpu_ref_get/put()
pair per request.
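To sketch the idea (rough and hypothetical; the helper names and the
"rsrc_refs" member are made up, only fixed_rsrc_ref_node and the
fixed_rsrc_data::node member come from this series):

	/*
	 * Hypothetical sketch: one node reference per request, shared by
	 * all fixed resource types. "rsrc_refs" replacing fixed_file_refs
	 * in io_kiocb is an assumption for illustration.
	 */
	static void io_req_set_rsrc_node(struct io_kiocb *req)
	{
		struct io_ring_ctx *ctx = req->ctx;

		if (!req->rsrc_refs) {
			req->rsrc_refs = &ctx->file_data->node->refs;
			percpu_ref_get(req->rsrc_refs);	/* one get per request */
		}
	}

	static void io_req_put_rsrc_node(struct io_kiocb *req)
	{
		if (req->rsrc_refs)
			percpu_ref_put(req->rsrc_refs);	/* one put on free */
	}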
I'll send a small patch preparing grounds, because there is actually
another nasty thing from past that needs to be reworked.
>
> io_import_fixed()
> io_import_iovec_fixed()
> -> io_buf_get()
>
> io_dismantle_io()
> -> io_put_buf()
>
>>
>> That's not critical for this particular patch as you still do full
>> quiesce in __io_uring_register(), but IIRC it was essential for
>> update/remove requests.
>
> That's something I'm not clear about. Currently we quiesce for the following cases:
>
> case IORING_UNREGISTER_FILES:
> case IORING_REGISTER_FILES_UPDATE:
> case IORING_REGISTER_BUFFERS_UPDATE:
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
case IORING_UNREGISTER_FILES:
case IORING_REGISTER_FILES_UPDATE:
case IORING_REGISTER_PROBE:
case IORING_REGISTER_PERSONALITY:
case IORING_UNREGISTER_PERSONALITY:
return false;
default:
return true;
}
}
It returns _false_ for these cases, so _doesn't_ quiesce for them.
>
> I had assumed I have to add IORING_UNREGISTER_BUFFERS as well. But for the cases above, do we in fact need the quiesce, given the ref counts?
>
> Are you ok with the rest of the patches or should I address anything else?
io_import_fixed() can currently be called twice, and that would give
you 2 different bvecs. Hence, after removing the full quiesce, io_read()
retrying short reads will probably be able to partially read into 2
different buffers. That really has to be fixed.
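Purely as an illustration of the concern (not a fix proposed in this
thread), one direction would be to import once and reuse the saved iter
on retry; io_async_rw and its iter member exist in fs/io_uring.c, while
the "fixed_imported" flag below is an assumption:

	/*
	 * Illustrative sketch only: import the registered buffer once,
	 * stash the iter in the request's async data, and reuse it for
	 * short-read retries so a retry cannot land in a different bvec.
	 * Return convention simplified for the sketch.
	 */
	static int io_setup_fixed_iter(struct io_kiocb *req, int rw,
				       struct iov_iter *iter)
	{
		struct io_async_rw *arw = req->async_data;

		if (arw && arw->fixed_imported) {
			*iter = arw->iter;	/* reuse the first import */
			return 0;
		}
		if (io_import_fixed(req, rw, iter) < 0)
			return -EFAULT;
		if (arw) {
			arw->iter = *iter;	/* remember for retries */
			arw->fixed_imported = true;
		}
		return 0;
	}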
I haven't looked at the patchset properly yet. I'll reply to the
cover letter + a small comment below.
>>> static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
>>> struct iov_iter *iter)
>>> {
>>> @@ -2959,10 +2982,15 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
>>> size_t offset;
>>> u64 buf_addr;
>>> + /* attempt to use fixed buffers without having provided iovecs */
>>> + if (unlikely(!ctx->buf_data))
>>> + return -EFAULT;
I removed it for files,
because (ctx->buf_data == NULL) iff (ctx->nr_user_bufs == 0),
so the following ctx->nr_user_bufs check is enough.
>>> +
>>> + buf_index = req->buf_index;
>>> if (unlikely(buf_index >= ctx->nr_user_bufs))
>>> return -EFAULT;
>>> index = array_index_nospec(buf_index, ctx->nr_user_bufs);
>>> - imu = &ctx->user_bufs[index];
>>> + imu = io_buf_from_index(ctx, index);
>>> buf_addr = req->rw.addr;
>>> /* overflow */
>>> @@ -8167,28 +8195,73 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
>>> return pages;
>>> }
--
Pavel Begunkov
* Re: [PATCH 4/8] io_uring: implement fixed buffers registration similar to fixed files
2020-11-16 23:09 ` Pavel Begunkov
@ 2020-11-17 0:41 ` Bijan Mottahedeh
0 siblings, 0 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-17 0:41 UTC (permalink / raw)
To: Pavel Begunkov, axboe; +Cc: io-uring
>>>> Apply fixed_rsrc functionality for fixed buffers support.
>>>
>>> I don't get it, requests with fixed files take a ref to a node (see
>>> fixed_file_refs) and put it on free, but I don't see anything similar
>>> here. Did you work around it somehow?
>>
>> No that's my oversight. I think I wrongfully assumed that io_import_*fixed() would take care of that.
>>
>> Should I basically do something similar to io_file_get()/io_put_file()?
>
> If done in a dumb way, that'd mean extra pair of percpu get/put
> and +8B in io_kiocb. Frankly, I don't like that idea.
>
> However, if you don't split paths and make fixed_file_ref_node to
> supports all types of resources at the same time, it should be
> bearable. I.e. register removals of both types to a single node,
> and use ->fixed_file_refs for all request's resources.
> So you don't grow io_kiocb and do maximum one percpu_ref_get/put()
> pair per request.
So something like promoting fixed_file_ref_node to fixed_rsrc_ref_node,
and ref counting all fixed resources for a given request together?
>
> I'll send a small patch preparing grounds, because there is actually
> another nasty thing from past that needs to be reworked.
Ok, I'll wait for your patch then.
>> Are you ok with the rest of the patches or should I address anything else?
>
> io_import_fixed() currently can be called twice, and that would give
> you 2 different bvecs. Hence after removing full quisce io_read()
> retrying short reads will probably be able to partially read into 2
> different buffers. That really have to be fixed.
Will you address that in your patch?
>>>> static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
>>>> struct iov_iter *iter)
>>>> {
>>>> @@ -2959,10 +2982,15 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
>>>> size_t offset;
>>>> u64 buf_addr;
>>>> + /* attempt to use fixed buffers without having provided iovecs */
>>>> + if (unlikely(!ctx->buf_data))
>>>> + return -EFAULT;
>
> I removed it for files,
> because (ctx->buf_data == NULL) iff (ctx->nr_user_bufs == 0),
> so the following ctx->nr_user_bufs check is enough.
Ok, will remove the check for buffers.
* [PATCH 5/8] io_uring: generalize files_update functionality to rsrc_update
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (3 preceding siblings ...)
2020-11-12 23:00 ` [PATCH 4/8] io_uring: implement fixed buffers registration similar to fixed files Bijan Mottahedeh
@ 2020-11-12 23:00 ` Bijan Mottahedeh
2020-11-12 23:00 ` [PATCH 6/8] io_uring: support buffer registration updates Bijan Mottahedeh
` (3 subsequent siblings)
8 siblings, 0 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-12 23:00 UTC (permalink / raw)
To: axboe; +Cc: io-uring
Generalize files_update functionality to rsrc_update in order to leverage
it for buffers updates.
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 57 ++++++++++++++++++++++++++++---------------
include/uapi/linux/io_uring.h | 10 ++++++++
2 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index de0019e..71f6d5c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -515,7 +515,7 @@ struct io_open {
unsigned long nofile;
};
-struct io_files_update {
+struct io_rsrc_update {
struct file *file;
u64 arg;
u32 nr_args;
@@ -709,7 +709,7 @@ struct io_kiocb {
struct io_sr_msg sr_msg;
struct io_open open;
struct io_close close;
- struct io_files_update files_update;
+ struct io_rsrc_update rsrc_update;
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
@@ -1023,7 +1023,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_files_update *ip,
+ struct io_uring_rsrc_update *ip,
unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
@@ -5878,8 +5878,8 @@ static int io_async_cancel(struct io_kiocb *req)
return 0;
}
-static int io_files_update_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_rsrc_update_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
return -EINVAL;
@@ -5888,29 +5888,32 @@ static int io_files_update_prep(struct io_kiocb *req,
if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
- req->files_update.offset = READ_ONCE(sqe->off);
- req->files_update.nr_args = READ_ONCE(sqe->len);
- if (!req->files_update.nr_args)
+ req->rsrc_update.offset = READ_ONCE(sqe->off);
+ req->rsrc_update.nr_args = READ_ONCE(sqe->len);
+ if (!req->rsrc_update.nr_args)
return -EINVAL;
- req->files_update.arg = READ_ONCE(sqe->addr);
+ req->rsrc_update.arg = READ_ONCE(sqe->addr);
return 0;
}
-static int io_files_update(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_rsrc_update(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs,
+ int (*update)(struct io_ring_ctx *ctx,
+ struct io_uring_rsrc_update *up,
+ unsigned nr_args))
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_files_update up;
+ struct io_uring_rsrc_update up;
int ret;
if (force_nonblock)
return -EAGAIN;
- up.offset = req->files_update.offset;
- up.fds = req->files_update.arg;
+ up.offset = req->rsrc_update.offset;
+ up.rsrc = req->rsrc_update.arg;
mutex_lock(&ctx->uring_lock);
- ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
+ ret = (*update)(ctx, &up, req->rsrc_update.nr_args);
mutex_unlock(&ctx->uring_lock);
if (ret < 0)
@@ -5919,6 +5922,23 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock,
return 0;
}
+static int io_files_update_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return io_rsrc_update_prep(req, sqe);
+}
+
+static int io_files_update(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
+{
+ return io_rsrc_update(req, force_nonblock, cs, __io_sqe_files_update);
+}
+
+static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
+{
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
+}
static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
switch (req->opcode) {
@@ -7619,8 +7639,7 @@ static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
static void destroy_fixed_file_ref_node(struct fixed_rsrc_ref_node *ref_node)
{
- percpu_ref_exit(&ref_node->refs);
- kfree(ref_node);
+ destroy_fixed_rsrc_ref_node(ref_node);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -7796,7 +7815,7 @@ static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_files_update *up,
+ struct io_uring_rsrc_update *up,
unsigned nr_args)
{
struct fixed_rsrc_data *data = ctx->file_data;
@@ -7886,7 +7905,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
- struct io_uring_files_update up;
+ struct io_uring_rsrc_update up;
if (!ctx->file_data)
return -ENXIO;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6bb8229..87f0f56 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -290,6 +290,16 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
+struct io_uring_rsrc_update {
+ __u32 offset;
+ __u32 resv;
+ union {
+ __aligned_u64 /* __s32 * */ fds;
+ __aligned_u64 /* __s32 * */ iovs;
+ __aligned_u64 /* __s32 * */ rsrc;
+ };
+};
+
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
--
1.8.3.1
* [PATCH 6/8] io_uring: support buffer registration updates
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (4 preceding siblings ...)
2020-11-12 23:00 ` [PATCH 5/8] io_uring: generalize files_update functionality to rsrc_update Bijan Mottahedeh
@ 2020-11-12 23:00 ` Bijan Mottahedeh
2020-11-18 20:17 ` Pavel Begunkov
2020-11-12 23:00 ` [PATCH 7/8] io_uring: support readv/writev with fixed buffers Bijan Mottahedeh
` (2 subsequent siblings)
8 siblings, 1 reply; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-12 23:00 UTC (permalink / raw)
To: axboe; +Cc: io-uring
Introduce IORING_REGISTER_BUFFERS_UPDATE and IORING_OP_BUFFERS_UPDATE,
consistent with file registration update.
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 139 +++++++++++++++++++++++++++++++++++++++++-
include/uapi/linux/io_uring.h | 8 +--
2 files changed, 140 insertions(+), 7 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 71f6d5c..6020fd2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1006,6 +1006,9 @@ struct io_op_def {
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
},
+ [IORING_OP_BUFFERS_UPDATE] = {
+ .work_flags = IO_WQ_WORK_MM,
+ },
};
enum io_mem_account {
@@ -1025,6 +1028,9 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update *ip,
unsigned nr_args);
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+ struct io_uring_rsrc_update *up,
+ unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed);
@@ -5939,6 +5945,19 @@ static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
}
+
+static int io_buffers_update_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return io_rsrc_update_prep(req, sqe);
+}
+
+static int io_buffers_update(struct io_kiocb *req, bool force_nonblock,
+ struct io_comp_state *cs)
+{
+ return io_rsrc_update(req, force_nonblock, cs, __io_sqe_buffers_update);
+}
+
static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
switch (req->opcode) {
@@ -6010,11 +6029,13 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_renameat_prep(req, sqe);
case IORING_OP_UNLINKAT:
return io_unlinkat_prep(req, sqe);
+ case IORING_OP_BUFFERS_UPDATE:
+ return io_buffers_update_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
- return-EINVAL;
+ return -EINVAL;
}
static int io_req_defer_prep(struct io_kiocb *req,
@@ -6268,6 +6289,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
case IORING_OP_UNLINKAT:
ret = io_unlinkat(req, force_nonblock);
break;
+ case IORING_OP_BUFFERS_UPDATE:
+ ret = io_buffers_update(req, force_nonblock, cs);
+ break;
default:
ret = -EINVAL;
break;
@@ -8224,6 +8248,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
kvfree(imu->bvec);
+ imu->bvec = NULL;
imu->nr_bvecs = 0;
}
@@ -8441,6 +8466,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
if (pret > 0)
unpin_user_pages(pages, pret);
kvfree(imu->bvec);
+ imu->bvec = NULL;
goto done;
}
@@ -8602,6 +8628,8 @@ static void io_buf_data_ref_zero(struct percpu_ref *ref)
static void io_ring_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
io_buffer_unmap(ctx, prsrc->buf);
+ kvfree(prsrc->buf);
+ prsrc->buf = NULL;
}
static struct fixed_rsrc_ref_node *alloc_fixed_buf_ref_node(
@@ -8684,6 +8712,111 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return 0;
}
+static inline int io_queue_buffer_removal(struct fixed_rsrc_data *data,
+ struct io_mapped_ubuf *imu)
+{
+ return io_queue_rsrc_removal(data, (void *)imu);
+}
+
+static void destroy_fixed_buf_ref_node(struct fixed_rsrc_ref_node *ref_node)
+{
+ destroy_fixed_rsrc_ref_node(ref_node);
+}
+
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+ struct io_uring_rsrc_update *up,
+ unsigned nr_args)
+{
+ struct fixed_rsrc_data *data = ctx->buf_data;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct io_mapped_ubuf *imu;
+ struct iovec iov;
+ struct iovec __user *iovs;
+ struct page *last_hpage = NULL;
+ __u32 done;
+ int i, err;
+ bool needs_switch = false;
+
+ if (check_add_overflow(up->offset, nr_args, &done))
+ return -EOVERFLOW;
+ if (done > ctx->nr_user_bufs)
+ return -EINVAL;
+
+ ref_node = alloc_fixed_buf_ref_node(ctx);
+ if (IS_ERR(ref_node))
+ return PTR_ERR(ref_node);
+
+ done = 0;
+ iovs = u64_to_user_ptr(up->iovs);
+ while (nr_args) {
+ struct fixed_rsrc_table *table;
+ unsigned index;
+
+ err = 0;
+ if (copy_from_user(&iov, &iovs[done], sizeof(iov))) {
+ err = -EFAULT;
+ break;
+ }
+ i = array_index_nospec(up->offset, ctx->nr_user_bufs);
+ table = &ctx->buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
+ index = i & IORING_BUF_TABLE_MASK;
+ imu = &table->bufs[index];
+ if (table->bufs[index].ubuf) {
+ struct io_mapped_ubuf *dup;
+ dup = kmemdup(imu, sizeof(*imu), GFP_KERNEL);
+ if (!dup) {
+ err = -ENOMEM;
+ break;
+ }
+ err = io_queue_buffer_removal(data, dup);
+ if (err)
+ break;
+ memset(imu, 0, sizeof(*imu));
+ needs_switch = true;
+ }
+ if (!io_buffer_validate(&iov)) {
+ err = io_sqe_buffer_register(ctx, &iov, imu,
+ &last_hpage);
+ if (err) {
+ memset(imu, 0, sizeof(*imu));
+ break;
+ }
+ }
+ nr_args--;
+ done++;
+ up->offset++;
+ }
+
+ if (needs_switch) {
+ percpu_ref_kill(&data->node->refs);
+ spin_lock(&data->lock);
+ list_add(&ref_node->node, &data->ref_list);
+ data->node = ref_node;
+ spin_unlock(&data->lock);
+ percpu_ref_get(&ctx->buf_data->refs);
+ } else
+ destroy_fixed_buf_ref_node(ref_node);
+
+ return done ? done : err;
+}
+
+static int io_sqe_buffers_update(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update up;
+
+ if (!ctx->buf_data)
+ return -ENXIO;
+ if (!nr_args)
+ return -EINVAL;
+ if (copy_from_user(&up, arg, sizeof(up)))
+ return -EFAULT;
+ if (up.resv)
+ return -EINVAL;
+
+ return __io_sqe_buffers_update(ctx, &up, nr_args);
+}
+
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
__s32 __user *fds = arg;
@@ -9961,6 +10094,7 @@ static bool io_register_op_must_quiesce(int op)
switch (op) {
case IORING_UNREGISTER_FILES:
case IORING_REGISTER_FILES_UPDATE:
+ case IORING_REGISTER_BUFFERS_UPDATE:
case IORING_REGISTER_PROBE:
case IORING_REGISTER_PERSONALITY:
case IORING_UNREGISTER_PERSONALITY:
@@ -10036,6 +10170,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_sqe_buffers_unregister(ctx);
break;
+ case IORING_REGISTER_BUFFERS_UPDATE:
+ ret = io_sqe_buffers_update(ctx, arg, nr_args);
+ break;
case IORING_REGISTER_FILES:
ret = io_sqe_files_register(ctx, arg, nr_args);
break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 87f0f56..17682b5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -137,6 +137,7 @@ enum {
IORING_OP_SHUTDOWN,
IORING_OP_RENAMEAT,
IORING_OP_UNLINKAT,
+ IORING_OP_BUFFERS_UPDATE,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -279,17 +280,12 @@ enum {
IORING_UNREGISTER_PERSONALITY = 10,
IORING_REGISTER_RESTRICTIONS = 11,
IORING_REGISTER_ENABLE_RINGS = 12,
+ IORING_REGISTER_BUFFERS_UPDATE = 13,
/* this goes last */
IORING_REGISTER_LAST
};
-struct io_uring_files_update {
- __u32 offset;
- __u32 resv;
- __aligned_u64 /* __s32 * */ fds;
-};
-
struct io_uring_rsrc_update {
__u32 offset;
__u32 resv;
--
1.8.3.1
* Re: [PATCH 6/8] io_uring: support buffer registration updates
2020-11-12 23:00 ` [PATCH 6/8] io_uring: support buffer registration updates Bijan Mottahedeh
@ 2020-11-18 20:17 ` Pavel Begunkov
2020-12-09 0:42 ` Bijan Mottahedeh
0 siblings, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-11-18 20:17 UTC (permalink / raw)
To: Bijan Mottahedeh, axboe; +Cc: io-uring
On 12/11/2020 23:00, Bijan Mottahedeh wrote:
> Introduce IORING_REGISTER_BUFFERS_UPDATE and IORING_OP_BUFFERS_UPDATE,
> consistent with file registration update.
I'd prefer to not add a new opcode for each new resource. Can we have
only IORING_OP_RESOURCE_UPDATE and multiplex inside? Even better if you
could fit all into IORING_OP_FILES_UPDATE and then
#define IORING_OP_RESOURCE_UPDATE IORING_OP_FILES_UPDATE
Jens, what do you think?
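To make that concrete, a rough sketch of the multiplexing (entirely
hypothetical: the IORING_RSRC_* constants and the "type" field do not
exist; only io_rsrc_update(), __io_sqe_files_update() and
__io_sqe_buffers_update() come from the patches in this thread):

	static int io_rsrc_update_mux(struct io_kiocb *req, bool force_nonblock,
				      struct io_comp_state *cs)
	{
		/* pick the resource type from a (hypothetical) request field */
		switch (req->rsrc_update.type) {
		case IORING_RSRC_FILE:
			return io_rsrc_update(req, force_nonblock, cs,
					      __io_sqe_files_update);
		case IORING_RSRC_BUFFER:
			return io_rsrc_update(req, force_nonblock, cs,
					      __io_sqe_buffers_update);
		default:
			return -EINVAL;
		}
	}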
>
> Signed-off-by: Bijan Mottahedeh <[email protected]>
> ---
> fs/io_uring.c | 139 +++++++++++++++++++++++++++++++++++++++++-
> include/uapi/linux/io_uring.h | 8 +--
> 2 files changed, 140 insertions(+), 7 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 71f6d5c..6020fd2 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -1006,6 +1006,9 @@ struct io_op_def {
> .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
> IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
> },
> + [IORING_OP_BUFFERS_UPDATE] = {
> + .work_flags = IO_WQ_WORK_MM,
> + },
> };
>
> enum io_mem_account {
> @@ -1025,6 +1028,9 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
> static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> struct io_uring_rsrc_update *ip,
> unsigned nr_args);
> +static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
> + struct io_uring_rsrc_update *up,
> + unsigned nr_args);
> static void __io_clean_op(struct io_kiocb *req);
> static struct file *io_file_get(struct io_submit_state *state,
> struct io_kiocb *req, int fd, bool fixed);
> @@ -5939,6 +5945,19 @@ static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
> percpu_ref_exit(&ref_node->refs);
> kfree(ref_node);
> }
> +
> +static int io_buffers_update_prep(struct io_kiocb *req,
> + const struct io_uring_sqe *sqe)
> +{
> + return io_rsrc_update_prep(req, sqe);
> +}
> +
> +static int io_buffers_update(struct io_kiocb *req, bool force_nonblock,
> + struct io_comp_state *cs)
> +{
> + return io_rsrc_update(req, force_nonblock, cs, __io_sqe_buffers_update);
> +}
> +
> static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> {
> switch (req->opcode) {
> @@ -6010,11 +6029,13 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> return io_renameat_prep(req, sqe);
> case IORING_OP_UNLINKAT:
> return io_unlinkat_prep(req, sqe);
> + case IORING_OP_BUFFERS_UPDATE:
> + return io_buffers_update_prep(req, sqe);
> }
>
> printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
> req->opcode);
> - return-EINVAL;
> + return -EINVAL;
> }
>
> static int io_req_defer_prep(struct io_kiocb *req,
> @@ -6268,6 +6289,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
> case IORING_OP_UNLINKAT:
> ret = io_unlinkat(req, force_nonblock);
> break;
> + case IORING_OP_BUFFERS_UPDATE:
> + ret = io_buffers_update(req, force_nonblock, cs);
> + break;
> default:
> ret = -EINVAL;
> break;
> @@ -8224,6 +8248,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
> if (imu->acct_pages)
> io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
> kvfree(imu->bvec);
> + imu->bvec = NULL;
> imu->nr_bvecs = 0;
> }
>
> @@ -8441,6 +8466,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
> if (pret > 0)
> unpin_user_pages(pages, pret);
> kvfree(imu->bvec);
> + imu->bvec = NULL;
> goto done;
> }
>
> @@ -8602,6 +8628,8 @@ static void io_buf_data_ref_zero(struct percpu_ref *ref)
> static void io_ring_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
> {
> io_buffer_unmap(ctx, prsrc->buf);
> + kvfree(prsrc->buf);
> + prsrc->buf = NULL;
> }
>
> static struct fixed_rsrc_ref_node *alloc_fixed_buf_ref_node(
> @@ -8684,6 +8712,111 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> return 0;
> }
>
> +static inline int io_queue_buffer_removal(struct fixed_rsrc_data *data,
> + struct io_mapped_ubuf *imu)
> +{
> + return io_queue_rsrc_removal(data, (void *)imu);
> +}
> +
> +static void destroy_fixed_buf_ref_node(struct fixed_rsrc_ref_node *ref_node)
> +{
> + destroy_fixed_rsrc_ref_node(ref_node);
> +}
> +
> +static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
> + struct io_uring_rsrc_update *up,
> + unsigned nr_args)
> +{
> + struct fixed_rsrc_data *data = ctx->buf_data;
> + struct fixed_rsrc_ref_node *ref_node;
> + struct io_mapped_ubuf *imu;
> + struct iovec iov;
> + struct iovec __user *iovs;
> + struct page *last_hpage = NULL;
> + __u32 done;
> + int i, err;
> + bool needs_switch = false;
> +
> + if (check_add_overflow(up->offset, nr_args, &done))
> + return -EOVERFLOW;
> + if (done > ctx->nr_user_bufs)
> + return -EINVAL;
> +
> + ref_node = alloc_fixed_buf_ref_node(ctx);
> + if (IS_ERR(ref_node))
> + return PTR_ERR(ref_node);
> +
> + done = 0;
> + iovs = u64_to_user_ptr(up->iovs);
> + while (nr_args) {
> + struct fixed_rsrc_table *table;
> + unsigned index;
> +
> + err = 0;
> + if (copy_from_user(&iov, &iovs[done], sizeof(iov))) {
> + err = -EFAULT;
> + break;
> + }
> + i = array_index_nospec(up->offset, ctx->nr_user_bufs);
> + table = &ctx->buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
> + index = i & IORING_BUF_TABLE_MASK;
> + imu = &table->bufs[index];
> + if (table->bufs[index].ubuf) {
> + struct io_mapped_ubuf *dup;
> + dup = kmemdup(imu, sizeof(*imu), GFP_KERNEL);
> + if (!dup) {
> + err = -ENOMEM;
> + break;
> + }
> + err = io_queue_buffer_removal(data, dup);
> + if (err)
> + break;
> + memset(imu, 0, sizeof(*imu));
> + needs_switch = true;
> + }
> + if (!io_buffer_validate(&iov)) {
> + err = io_sqe_buffer_register(ctx, &iov, imu,
> + &last_hpage);
> + if (err) {
> + memset(imu, 0, sizeof(*imu));
> + break;
> + }
> + }
> + nr_args--;
> + done++;
> + up->offset++;
> + }
> +
> + if (needs_switch) {
> + percpu_ref_kill(&data->node->refs);
> + spin_lock(&data->lock);
> + list_add(&ref_node->node, &data->ref_list);
> + data->node = ref_node;
> + spin_unlock(&data->lock);
> + percpu_ref_get(&ctx->buf_data->refs);
> + } else
> + destroy_fixed_buf_ref_node(ref_node);
> +
> + return done ? done : err;
> +}
> +
> +static int io_sqe_buffers_update(struct io_ring_ctx *ctx, void __user *arg,
> + unsigned nr_args)
> +{
> + struct io_uring_rsrc_update up;
> +
> + if (!ctx->buf_data)
> + return -ENXIO;
> + if (!nr_args)
> + return -EINVAL;
> + if (copy_from_user(&up, arg, sizeof(up)))
> + return -EFAULT;
> + if (up.resv)
> + return -EINVAL;
> +
> + return __io_sqe_buffers_update(ctx, &up, nr_args);
> +}
> +
> static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
> {
> __s32 __user *fds = arg;
> @@ -9961,6 +10094,7 @@ static bool io_register_op_must_quiesce(int op)
> switch (op) {
> case IORING_UNREGISTER_FILES:
> case IORING_REGISTER_FILES_UPDATE:
> + case IORING_REGISTER_BUFFERS_UPDATE:
> case IORING_REGISTER_PROBE:
> case IORING_REGISTER_PERSONALITY:
> case IORING_UNREGISTER_PERSONALITY:
> @@ -10036,6 +10170,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
> break;
> ret = io_sqe_buffers_unregister(ctx);
> break;
> + case IORING_REGISTER_BUFFERS_UPDATE:
> + ret = io_sqe_buffers_update(ctx, arg, nr_args);
> + break;
> case IORING_REGISTER_FILES:
> ret = io_sqe_files_register(ctx, arg, nr_args);
> break;
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 87f0f56..17682b5 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -137,6 +137,7 @@ enum {
> IORING_OP_SHUTDOWN,
> IORING_OP_RENAMEAT,
> IORING_OP_UNLINKAT,
> + IORING_OP_BUFFERS_UPDATE,
>
> /* this goes last, obviously */
> IORING_OP_LAST,
> @@ -279,17 +280,12 @@ enum {
> IORING_UNREGISTER_PERSONALITY = 10,
> IORING_REGISTER_RESTRICTIONS = 11,
> IORING_REGISTER_ENABLE_RINGS = 12,
> + IORING_REGISTER_BUFFERS_UPDATE = 13,
>
> /* this goes last */
> IORING_REGISTER_LAST
> };
>
> -struct io_uring_files_update {
> - __u32 offset;
> - __u32 resv;
> - __aligned_u64 /* __s32 * */ fds;
> -};
> -
> struct io_uring_rsrc_update {
> __u32 offset;
> __u32 resv;
>
--
Pavel Begunkov
* Re: [PATCH 6/8] io_uring: support buffer registration updates
2020-11-18 20:17 ` Pavel Begunkov
@ 2020-12-09 0:42 ` Bijan Mottahedeh
0 siblings, 0 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-12-09 0:42 UTC (permalink / raw)
To: axboe, Pavel Begunkov; +Cc: io-uring
On 11/18/2020 12:17 PM, Pavel Begunkov wrote:
> On 12/11/2020 23:00, Bijan Mottahedeh wrote:
>> Introduce IORING_REGISTER_BUFFERS_UPDATE and IORING_OP_BUFFERS_UPDATE,
>> consistent with file registration update.
>
> I'd prefer to not add a new opcode for each new resource. Can we have
> only IORING_OP_RESOURCE_UPDATE and multiplex inside? Even better if you
> could fit all into IORING_OP_FILES_UPDATE and then
>
> #define IORING_OP_RESOURCE_UPDATE IORING_OP_FILES_UPDATE
>
> Jens, what do you think?
Hi Jens,
What do you think the right approach is here?
Thanks.
--bijan
>
>>
>> Signed-off-by: Bijan Mottahedeh <[email protected]>
>> ---
>> fs/io_uring.c | 139 +++++++++++++++++++++++++++++++++++++++++-
>> include/uapi/linux/io_uring.h | 8 +--
>> 2 files changed, 140 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 71f6d5c..6020fd2 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -1006,6 +1006,9 @@ struct io_op_def {
>> .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
>> IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
>> },
>> + [IORING_OP_BUFFERS_UPDATE] = {
>> + .work_flags = IO_WQ_WORK_MM,
>> + },
>> };
>>
>> enum io_mem_account {
>> @@ -1025,6 +1028,9 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
>> static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>> struct io_uring_rsrc_update *ip,
>> unsigned nr_args);
>> +static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
>> + struct io_uring_rsrc_update *up,
>> + unsigned nr_args);
>> static void __io_clean_op(struct io_kiocb *req);
>> static struct file *io_file_get(struct io_submit_state *state,
>> struct io_kiocb *req, int fd, bool fixed);
>> @@ -5939,6 +5945,19 @@ static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
>> percpu_ref_exit(&ref_node->refs);
>> kfree(ref_node);
>> }
>> +
>> +static int io_buffers_update_prep(struct io_kiocb *req,
>> + const struct io_uring_sqe *sqe)
>> +{
>> + return io_rsrc_update_prep(req, sqe);
>> +}
>> +
>> +static int io_buffers_update(struct io_kiocb *req, bool force_nonblock,
>> + struct io_comp_state *cs)
>> +{
>> + return io_rsrc_update(req, force_nonblock, cs, __io_sqe_buffers_update);
>> +}
>> +
>> static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> {
>> switch (req->opcode) {
>> @@ -6010,11 +6029,13 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> return io_renameat_prep(req, sqe);
>> case IORING_OP_UNLINKAT:
>> return io_unlinkat_prep(req, sqe);
>> + case IORING_OP_BUFFERS_UPDATE:
>> + return io_buffers_update_prep(req, sqe);
>> }
>>
>> printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
>> req->opcode);
>> - return-EINVAL;
>> + return -EINVAL;
>> }
>>
>> static int io_req_defer_prep(struct io_kiocb *req,
>> @@ -6268,6 +6289,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
>> case IORING_OP_UNLINKAT:
>> ret = io_unlinkat(req, force_nonblock);
>> break;
>> + case IORING_OP_BUFFERS_UPDATE:
>> + ret = io_buffers_update(req, force_nonblock, cs);
>> + break;
>> default:
>> ret = -EINVAL;
>> break;
>> @@ -8224,6 +8248,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
>> if (imu->acct_pages)
>> io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
>> kvfree(imu->bvec);
>> + imu->bvec = NULL;
>> imu->nr_bvecs = 0;
>> }
>>
>> @@ -8441,6 +8466,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
>> if (pret > 0)
>> unpin_user_pages(pages, pret);
>> kvfree(imu->bvec);
>> + imu->bvec = NULL;
>> goto done;
>> }
>>
>> @@ -8602,6 +8628,8 @@ static void io_buf_data_ref_zero(struct percpu_ref *ref)
>> static void io_ring_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
>> {
>> io_buffer_unmap(ctx, prsrc->buf);
>> + kvfree(prsrc->buf);
>> + prsrc->buf = NULL;
>> }
>>
>> static struct fixed_rsrc_ref_node *alloc_fixed_buf_ref_node(
>> @@ -8684,6 +8712,111 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>> return 0;
>> }
>>
>> +static inline int io_queue_buffer_removal(struct fixed_rsrc_data *data,
>> + struct io_mapped_ubuf *imu)
>> +{
>> + return io_queue_rsrc_removal(data, (void *)imu);
>> +}
>> +
>> +static void destroy_fixed_buf_ref_node(struct fixed_rsrc_ref_node *ref_node)
>> +{
>> + destroy_fixed_rsrc_ref_node(ref_node);
>> +}
>> +
>> +static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
>> + struct io_uring_rsrc_update *up,
>> + unsigned nr_args)
>> +{
>> + struct fixed_rsrc_data *data = ctx->buf_data;
>> + struct fixed_rsrc_ref_node *ref_node;
>> + struct io_mapped_ubuf *imu;
>> + struct iovec iov;
>> + struct iovec __user *iovs;
>> + struct page *last_hpage = NULL;
>> + __u32 done;
>> + int i, err;
>> + bool needs_switch = false;
>> +
>> + if (check_add_overflow(up->offset, nr_args, &done))
>> + return -EOVERFLOW;
>> + if (done > ctx->nr_user_bufs)
>> + return -EINVAL;
>> +
>> + ref_node = alloc_fixed_buf_ref_node(ctx);
>> + if (IS_ERR(ref_node))
>> + return PTR_ERR(ref_node);
>> +
>> + done = 0;
>> + iovs = u64_to_user_ptr(up->iovs);
>> + while (nr_args) {
>> + struct fixed_rsrc_table *table;
>> + unsigned index;
>> +
>> + err = 0;
>> + if (copy_from_user(&iov, &iovs[done], sizeof(iov))) {
>> + err = -EFAULT;
>> + break;
>> + }
>> + i = array_index_nospec(up->offset, ctx->nr_user_bufs);
>> + table = &ctx->buf_data->table[i >> IORING_BUF_TABLE_SHIFT];
>> + index = i & IORING_BUF_TABLE_MASK;
>> + imu = &table->bufs[index];
>> + if (table->bufs[index].ubuf) {
>> + struct io_mapped_ubuf *dup;
>> + dup = kmemdup(imu, sizeof(*imu), GFP_KERNEL);
>> + if (!dup) {
>> + err = -ENOMEM;
>> + break;
>> + }
>> + err = io_queue_buffer_removal(data, dup);
>> + if (err)
>> + break;
>> + memset(imu, 0, sizeof(*imu));
>> + needs_switch = true;
>> + }
>> + if (!io_buffer_validate(&iov)) {
>> + err = io_sqe_buffer_register(ctx, &iov, imu,
>> + &last_hpage);
>> + if (err) {
>> + memset(imu, 0, sizeof(*imu));
>> + break;
>> + }
>> + }
>> + nr_args--;
>> + done++;
>> + up->offset++;
>> + }
>> +
>> + if (needs_switch) {
>> + percpu_ref_kill(&data->node->refs);
>> + spin_lock(&data->lock);
>> + list_add(&ref_node->node, &data->ref_list);
>> + data->node = ref_node;
>> + spin_unlock(&data->lock);
>> + percpu_ref_get(&ctx->buf_data->refs);
>> + } else
>> + destroy_fixed_buf_ref_node(ref_node);
>> +
>> + return done ? done : err;
>> +}
>> +
>> +static int io_sqe_buffers_update(struct io_ring_ctx *ctx, void __user *arg,
>> + unsigned nr_args)
>> +{
>> + struct io_uring_rsrc_update up;
>> +
>> + if (!ctx->buf_data)
>> + return -ENXIO;
>> + if (!nr_args)
>> + return -EINVAL;
>> + if (copy_from_user(&up, arg, sizeof(up)))
>> + return -EFAULT;
>> + if (up.resv)
>> + return -EINVAL;
>> +
>> + return __io_sqe_buffers_update(ctx, &up, nr_args);
>> +}
>> +
>> static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
>> {
>> __s32 __user *fds = arg;
>> @@ -9961,6 +10094,7 @@ static bool io_register_op_must_quiesce(int op)
>> switch (op) {
>> case IORING_UNREGISTER_FILES:
>> case IORING_REGISTER_FILES_UPDATE:
>> + case IORING_REGISTER_BUFFERS_UPDATE:
>> case IORING_REGISTER_PROBE:
>> case IORING_REGISTER_PERSONALITY:
>> case IORING_UNREGISTER_PERSONALITY:
>> @@ -10036,6 +10170,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>> break;
>> ret = io_sqe_buffers_unregister(ctx);
>> break;
>> + case IORING_REGISTER_BUFFERS_UPDATE:
>> + ret = io_sqe_buffers_update(ctx, arg, nr_args);
>> + break;
>> case IORING_REGISTER_FILES:
>> ret = io_sqe_files_register(ctx, arg, nr_args);
>> break;
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 87f0f56..17682b5 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -137,6 +137,7 @@ enum {
>> IORING_OP_SHUTDOWN,
>> IORING_OP_RENAMEAT,
>> IORING_OP_UNLINKAT,
>> + IORING_OP_BUFFERS_UPDATE,
>>
>> /* this goes last, obviously */
>> IORING_OP_LAST,
>> @@ -279,17 +280,12 @@ enum {
>> IORING_UNREGISTER_PERSONALITY = 10,
>> IORING_REGISTER_RESTRICTIONS = 11,
>> IORING_REGISTER_ENABLE_RINGS = 12,
>> + IORING_REGISTER_BUFFERS_UPDATE = 13,
>>
>> /* this goes last */
>> IORING_REGISTER_LAST
>> };
>>
>> -struct io_uring_files_update {
>> - __u32 offset;
>> - __u32 resv;
>> - __aligned_u64 /* __s32 * */ fds;
>> -};
>> -
>> struct io_uring_rsrc_update {
>> __u32 offset;
>> __u32 resv;
>>
>
* [PATCH 7/8] io_uring: support readv/writev with fixed buffers
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (5 preceding siblings ...)
2020-11-12 23:00 ` [PATCH 6/8] io_uring: support buffer registration updates Bijan Mottahedeh
@ 2020-11-12 23:00 ` Bijan Mottahedeh
2020-11-17 11:04 ` Pavel Begunkov
2020-11-12 23:00 ` [PATCH 8/8] io_uring: support buffer registration sharing Bijan Mottahedeh
2020-11-16 23:28 ` [PATCH 0/8] io_uring: buffer registration enhancements Pavel Begunkov
8 siblings, 1 reply; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-12 23:00 UTC (permalink / raw)
To: axboe; +Cc: io-uring
Support readv/writev with fixed buffers, and introduce IOSQE_FIXED_BUFFER,
consistent with fixed files.
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 59 ++++++++++++++++++++++++++++++++++++++++---
include/uapi/linux/io_uring.h | 3 +++
2 files changed, 58 insertions(+), 4 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6020fd2..12c4144 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -625,6 +625,7 @@ enum {
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_FIXED_BUFFER_BIT = IOSQE_FIXED_BUFFER_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
@@ -681,8 +682,12 @@ enum {
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* linked timeout is active, i.e. prepared by link's head */
REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
+ /* ctx owns buffer */
+ REQ_F_FIXED_BUFFER = BIT(REQ_F_FIXED_BUFFER_BIT),
};
+#define REQ_F_FIXED_RSRC (REQ_F_FIXED_FILE | REQ_F_FIXED_BUFFER)
+
struct async_poll {
struct io_poll_iocb poll;
struct io_poll_iocb *double_poll;
@@ -3191,6 +3196,46 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
+static ssize_t io_import_iovec_fixed(int rw, struct io_kiocb *req, void *buf,
+ unsigned segs, unsigned fast_segs,
+ struct iovec **iovec,
+ struct iov_iter *iter)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_mapped_ubuf *imu;
+ struct iovec *iov;
+ u16 index, buf_index;
+ ssize_t ret;
+ unsigned long seg;
+
+ if (unlikely(!ctx->buf_data))
+ return -EFAULT;
+
+ ret = import_iovec(rw, buf, segs, fast_segs, iovec, iter);
+ if (ret < 0)
+ return ret;
+
+ iov = (struct iovec *)iter->iov;
+
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ buf_index = *(u16 *)(&iov[seg].iov_base);
+ if (unlikely(buf_index < 0 || buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+
+ index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+ imu = io_buf_from_index(ctx, index);
+ if (!imu->ubuf || !imu->len)
+ return -EFAULT;
+ if (iov[seg].iov_len > imu->len)
+ return -EFAULT;
+
+ iov[seg].iov_base = (void *)imu->ubuf;
+ ret += iov[seg].iov_len;
+ }
+
+ return ret;
+}
+
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
@@ -3201,6 +3246,12 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
u8 opcode;
opcode = req->opcode;
+
+ if ((opcode == IORING_OP_READV || opcode == IORING_OP_WRITEV) &&
+ req->flags & REQ_F_FIXED_BUFFER)
+ return (io_import_iovec_fixed(rw, req, buf, sqe_len,
+ UIO_FASTIOV, iovec, iter));
+
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
return io_import_fixed(req, rw, iter);
@@ -5692,7 +5743,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
return -EINVAL;
@@ -5867,7 +5918,7 @@ static int io_async_cancel_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
return -EINVAL;
@@ -5889,7 +5940,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
@@ -6740,7 +6791,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
+ IOSQE_BUFFER_SELECT | IOSQE_FIXED_BUFFER)
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 17682b5..41da59c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -70,6 +70,7 @@ enum {
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
IOSQE_BUFFER_SELECT_BIT,
+ IOSQE_FIXED_BUFFER_BIT,
};
/*
@@ -87,6 +88,8 @@ enum {
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
+/* use fixed buffer set */
+#define IOSQE_FIXED_BUFFER (1U << IOSQE_FIXED_BUFFER_BIT)
/*
* io_uring_setup() flags
--
1.8.3.1
* Re: [PATCH 7/8] io_uring: support readv/writev with fixed buffers
2020-11-12 23:00 ` [PATCH 7/8] io_uring: support readv/writev with fixed buffers Bijan Mottahedeh
@ 2020-11-17 11:04 ` Pavel Begunkov
2020-11-17 22:59 ` Bijan Mottahedeh
0 siblings, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-11-17 11:04 UTC (permalink / raw)
To: Bijan Mottahedeh, axboe; +Cc: io-uring
On 12/11/2020 23:00, Bijan Mottahedeh wrote:
> Support readv/writev with fixed buffers, and introduce IOSQE_FIXED_BUFFER,
> consistent with fixed files.
I don't like it at all, see issues below. The actual implementation would
be much uglier.
I propose you split the series and push it in parts. Your first 6 patches
first; I don't have conceptual objections to them. Then registration sharing
(I still need to look at it). And then we can return to this, if you're not
yet convinced.
>
> Signed-off-by: Bijan Mottahedeh <[email protected]>
> ---
> fs/io_uring.c | 59 ++++++++++++++++++++++++++++++++++++++++---
> include/uapi/linux/io_uring.h | 3 +++
> 2 files changed, 58 insertions(+), 4 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 6020fd2..12c4144 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -625,6 +625,7 @@ enum {
> REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
> REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
> REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
> + REQ_F_FIXED_BUFFER_BIT = IOSQE_FIXED_BUFFER_BIT,
>
> REQ_F_FAIL_LINK_BIT,
> REQ_F_INFLIGHT_BIT,
> @@ -681,8 +682,12 @@ enum {
> REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
> /* linked timeout is active, i.e. prepared by link's head */
> REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
> + /* ctx owns buffer */
> + REQ_F_FIXED_BUFFER = BIT(REQ_F_FIXED_BUFFER_BIT),
> };
>
> +#define REQ_F_FIXED_RSRC (REQ_F_FIXED_FILE | REQ_F_FIXED_BUFFER)
> +
> struct async_poll {
> struct io_poll_iocb poll;
> struct io_poll_iocb *double_poll;
> @@ -3191,6 +3196,46 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
> return __io_iov_buffer_select(req, iov, needs_lock);
> }
>
> +static ssize_t io_import_iovec_fixed(int rw, struct io_kiocb *req, void *buf,
> + unsigned segs, unsigned fast_segs,
> + struct iovec **iovec,
> + struct iov_iter *iter)
> +{
> + struct io_ring_ctx *ctx = req->ctx;
> + struct io_mapped_ubuf *imu;
> + struct iovec *iov;
> + u16 index, buf_index;
> + ssize_t ret;
> + unsigned long seg;
> +
> + if (unlikely(!ctx->buf_data))
> + return -EFAULT;
> +
> + ret = import_iovec(rw, buf, segs, fast_segs, iovec, iter);
Did you test it? import_iovec() does access_ok() against each iov_base,
which in your case is an index.
> + if (ret < 0)
> + return ret;
> +
> + iov = (struct iovec *)iter->iov;
> +
> + for (seg = 0; seg < iter->nr_segs; seg++) {
> + buf_index = *(u16 *)(&iov[seg].iov_base);
That's ugly, and also not consistent with rw_fixed, because iov_base is
used to calculate offset.
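For comparison, this is how the existing READ_FIXED/WRITE_FIXED path
treats the address (condensed from io_import_fixed(); bounds and
overflow checks elided): sqe->addr points into the registered buffer
and the offset is derived from it; it is not an index.

	buf_addr = req->rw.addr;
	offset = buf_addr - imu->ubuf;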
> + if (unlikely(buf_index < 0 || buf_index >= ctx->nr_user_bufs))
> + return -EFAULT;
> +
> + index = array_index_nospec(buf_index, ctx->nr_user_bufs);
> + imu = io_buf_from_index(ctx, index);
> + if (!imu->ubuf || !imu->len)
> + return -EFAULT;
> + if (iov[seg].iov_len > imu->len)
> + return -EFAULT;
> +
> + iov[seg].iov_base = (void *)imu->ubuf;
Nope, that's no different from the non-registered version.
What import_fixed actually does is set up the iter argument to point
to a bvec (a vector of struct page *).
So it would either need to keep a vector of bvecs, i.e. a vector of vectors,
which is not supported by iter, etc., so you'd also need to iterate over them
in io_read/write and so on. Or flatten the 2D structure into 1D, but that's still ugly.
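Reduced to the essentials, what io_import_fixed() sets up is (simplified;
the real code skips segments manually instead of using iov_iter_advance()
for performance):

	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
	if (offset)
		iov_iter_advance(iter, offset);

so the iter walks the pinned pages of one registered buffer, not a user
iovec, and a readv-style request would need one such bvec per segment.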
> + ret += iov[seg].iov_len;
> + }
> +
> + return ret;
> +}
> +
> static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
> struct iovec **iovec, struct iov_iter *iter,
> bool needs_lock)
> @@ -3201,6 +3246,12 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
> u8 opcode;
>
> opcode = req->opcode;
> +
> + if ((opcode == IORING_OP_READV || opcode == IORING_OP_WRITEV) &&
> + req->flags & REQ_F_FIXED_BUFFER)
> + return (io_import_iovec_fixed(rw, req, buf, sqe_len,
> + UIO_FASTIOV, iovec, iter));
> +
> if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
> *iovec = NULL;
> return io_import_fixed(req, rw, iter);
> @@ -5692,7 +5743,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
> {
> if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> return -EINVAL;
> - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
> + if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
Why is it here?
#define REQ_F_FIXED_RSRC (REQ_F_FIXED_FILE | REQ_F_FIXED_BUFFER)
So, why do you | with REQ_F_BUFFER_SELECT again here?
> return -EINVAL;
> if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
> return -EINVAL;
> @@ -5867,7 +5918,7 @@ static int io_async_cancel_prep(struct io_kiocb *req,
> {
> if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> return -EINVAL;
> - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
> + if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
> return -EINVAL;
> if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
> return -EINVAL;
> @@ -5889,7 +5940,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
> {
> if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
> return -EINVAL;
> - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
> + if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
> return -EINVAL;
> if (sqe->ioprio || sqe->rw_flags)
> return -EINVAL;
> @@ -6740,7 +6791,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
>
> #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
> IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
> - IOSQE_BUFFER_SELECT)
> + IOSQE_BUFFER_SELECT | IOSQE_FIXED_BUFFER)
>
> static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> const struct io_uring_sqe *sqe,
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 17682b5..41da59c 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -70,6 +70,7 @@ enum {
> IOSQE_IO_HARDLINK_BIT,
> IOSQE_ASYNC_BIT,
> IOSQE_BUFFER_SELECT_BIT,
> + IOSQE_FIXED_BUFFER_BIT,
> };
>
> /*
> @@ -87,6 +88,8 @@ enum {
> #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
> /* select buffer from sqe->buf_group */
> #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
> +/* use fixed buffer set */
> +#define IOSQE_FIXED_BUFFER (1U << IOSQE_FIXED_BUFFER_BIT)
Unfortunately, we're almost out of flag bits -- it's a 1-byte
field and 6 bits are already taken. Let's not use it.
--
Pavel Begunkov
* Re: [PATCH 7/8] io_uring: support readv/writev with fixed buffers
2020-11-17 11:04 ` Pavel Begunkov
@ 2020-11-17 22:59 ` Bijan Mottahedeh
2020-11-18 9:14 ` Pavel Begunkov
2020-11-18 20:12 ` Pavel Begunkov
0 siblings, 2 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-17 22:59 UTC (permalink / raw)
To: Pavel Begunkov, axboe; +Cc: io-uring
>> Support readv/writev with fixed buffers, and introduce IOSQE_FIXED_BUFFER,
>> consistent with fixed files.
>
> I don't like it at all, see issues below. The actual implementation would
> be much uglier.
>
> I propose you to split the series and push separately. Your first 6 patches
> first, I don't have conceptual objections to them. Then registration sharing
> (I still need to look it up). And then we can return to this, if you're not
> yet convinced.
Ok. The sharing patch is actually the highest priority for us so it'd
be great to know if you think it's in the right direction.
Should I submit them as they are or address your fixed_file_ref comments
in Patch 4/8 as well? Would I need your prep patch beforehand?
>> +static ssize_t io_import_iovec_fixed(int rw, struct io_kiocb *req, void *buf,
>> + unsigned segs, unsigned fast_segs,
>> + struct iovec **iovec,
>> + struct iov_iter *iter)
>> +{
>> + struct io_ring_ctx *ctx = req->ctx;
>> + struct io_mapped_ubuf *imu;
>> + struct iovec *iov;
>> + u16 index, buf_index;
>> + ssize_t ret;
>> + unsigned long seg;
>> +
>> + if (unlikely(!ctx->buf_data))
>> + return -EFAULT;
>> +
>> + ret = import_iovec(rw, buf, segs, fast_segs, iovec, iter);
>
> Did you test it? import_iovec() does access_ok() against each iov_base,
> which in your case are an index.
I used liburing:test/file-{register,update} as models for the equivalent
buffer tests and they seem to work. I can send out the tests and the
liburing changes if you want.
The fixed io test registers an empty iov table first:
ret = io_uring_register_buffers(ring, iovs, UIO_MAXIOV);
It next updates the table with two actual buffers at offset 768:
ret = io_uring_register_buffers_update(ring, 768, ups, 2);
It later uses the buffer at index 768 for writev similar to the
file-register test (IOSQE_FIXED_BUFFER instead of IOSQE_FIXED_FILE):
iovs[768].iov_base = (void *)768;
iovs[768].iov_len = pagesize;
io_uring_prep_writev(sqe, fd, iovs, 1, 0);
sqe->flags |= IOSQE_FIXED_BUFFER;
ret = io_uring_submit(ring);
Below is the iovec returned from
io_import_iovec_fixed()
-> import_iovec()
{iov_base = 0x300 <dm_early_create+412>, iov_len = 4096}
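Putting those fragments together, the flow in the proposed test is
roughly the following (io_uring_register_buffers_update() and
IOSQE_FIXED_BUFFER are additions from this series, not existing
liburing/kernel API; ring, fd, buf0/buf1 and pagesize are assumed to be
set up elsewhere, and passing &iovs[768] is my reading of the fragment
above):

	struct iovec iovs[UIO_MAXIOV] = {};	/* sparse table */
	struct iovec ups[2];
	struct io_uring_sqe *sqe;

	/* register an empty table, then update two slots at offset 768 */
	io_uring_register_buffers(ring, iovs, UIO_MAXIOV);
	ups[0].iov_base = buf0;  ups[0].iov_len = pagesize;
	ups[1].iov_base = buf1;  ups[1].iov_len = pagesize;
	io_uring_register_buffers_update(ring, 768, ups, 2);

	/* writev using the fixed buffer at index 768 */
	sqe = io_uring_get_sqe(ring);
	iovs[768].iov_base = (void *)768;	/* index, not an address */
	iovs[768].iov_len = pagesize;
	io_uring_prep_writev(sqe, fd, &iovs[768], 1, 0);
	sqe->flags |= IOSQE_FIXED_BUFFER;
	io_uring_submit(ring);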
>> + if (ret < 0)
>> + return ret;
>> +
>> + iov = (struct iovec *)iter->iov;
>> +
>> + for (seg = 0; seg < iter->nr_segs; seg++) {
>> + buf_index = *(u16 *)(&iov[seg].iov_base);
>
> That's ugly, and also not consistent with rw_fixed, because iov_base is
> used to calculate offset.
Would offset be applicable when using readv/writev?
My thinking was that for those cases, each iovec should be used exactly
as registered.
>
>> + if (unlikely(buf_index < 0 || buf_index >= ctx->nr_user_bufs))
>> + return -EFAULT;
>> +
>> + index = array_index_nospec(buf_index, ctx->nr_user_bufs);
>> + imu = io_buf_from_index(ctx, index);
>> + if (!imu->ubuf || !imu->len)
>> + return -EFAULT;
>> + if (iov[seg].iov_len > imu->len)
>> + return -EFAULT;
>> +
>> + iov[seg].iov_base = (void *)imu->ubuf;
>
> Nope, that's not different from non registered version.
> What import_fixed actually do is setting up the iter argument to point
> to a bvec (a vector of struct page *).
So in fact, the buffers end up being pinned again because they are not
being passed as bvecs?
>
> So it either would need to keep a vector of bvecs, that's a vector of vectors,
> that's not supported by iter, etc., so you'll also need to iterate over them
> in io_read/write and so on. Or flat 2D structure into 1D, but that's still ugly.
So you're saying there's no clean way to create a readv/writev + fixed
buffers API? It would've been nice to have a consistent API between
files and buffers.
>> @@ -5692,7 +5743,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
>> {
>> if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>> return -EINVAL;
>> - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
>> + if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
>
> Why it's here?
>
> #define REQ_F_FIXED_RSRC (REQ_F_FIXED_FILE | REQ_F_FIXED_BUFFER)
> So, why do you | with REQ_F_BUFFER_SELECT again here?
I thought to group fixed files/buffers but distinguish them from
selected buffers.
>> @@ -87,6 +88,8 @@ enum {
>> #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
>> /* select buffer from sqe->buf_group */
>> #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
>> +/* use fixed buffer set */
>> +#define IOSQE_FIXED_BUFFER (1U << IOSQE_FIXED_BUFFER_BIT)
>
> Unfortunately, we're almost out of flags bits -- it's a 1-byte
> field and 6 bits are already taken. Let's not use it.
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 7/8] io_uring: support readv/writev with fixed buffers
2020-11-17 22:59 ` Bijan Mottahedeh
@ 2020-11-18 9:14 ` Pavel Begunkov
2020-11-18 20:12 ` Pavel Begunkov
1 sibling, 0 replies; 22+ messages in thread
From: Pavel Begunkov @ 2020-11-18 9:14 UTC (permalink / raw)
To: Bijan Mottahedeh, axboe; +Cc: io-uring
On 17/11/2020 22:59, Bijan Mottahedeh wrote:
>>> Support readv/writev with fixed buffers, and introduce IOSQE_FIXED_BUFFER,
>>> consistent with fixed files.
>>
>> I don't like it at all, see issues below. The actual implementation would
>> be much uglier.
>>
>> I propose you split the series and push separately. Your first 6 patches
>> first, I don't have conceptual objections to them. Then registration sharing
>> (I still need to look it up). And then we can return to this, if you're not
>> yet convinced.
>
> Ok. The sharing patch is actually the highest priority for us so it'd be great to know if you think it's in the right direction.
>
> Should I submit them as they are or address your fixed_file_ref comments in Patch 4/8 as well? Would I need your prep patch beforehand?
I remembered one more thing that I need to do for your patches to work.
I'll ping you afterwards.
>
>>> +static ssize_t io_import_iovec_fixed(int rw, struct io_kiocb *req, void *buf,
>>> + unsigned segs, unsigned fast_segs,
>>> + struct iovec **iovec,
>>> + struct iov_iter *iter)
>>> +{
>>> + struct io_ring_ctx *ctx = req->ctx;
>>> + struct io_mapped_ubuf *imu;
>>> + struct iovec *iov;
>>> + u16 index, buf_index;
>>> + ssize_t ret;
>>> + unsigned long seg;
>>> +
>>> + if (unlikely(!ctx->buf_data))
>>> + return -EFAULT;
>>> +
>>> + ret = import_iovec(rw, buf, segs, fast_segs, iovec, iter);
>>
>> Did you test it? import_iovec() does access_ok() against each iov_base,
>> which in your case are an index.
>
> I used liburing:test/file-{register,update} as models for the equivalent buffer tests and they seem to work. I can send out the tests and the liburing changes if you want.
Hmm, it seems access_ok() is a no-op on many architectures.
Still, IIRC it's not portable.
>
> The fixed io test registers an empty iov table first:
>
> ret = io_uring_register_buffers(ring, iovs, UIO_MAXIOV);
>
> It next updates the table with two actual buffers at offset 768:
>
> ret = io_uring_register_buffers_update(ring, 768, ups, 2);
>
> It later uses the buffer at index 768 for writev similar to the file-register test (IOSQE_FIXED_BUFFER instead of IOSQE_FIXED_FILE):
>
> iovs[768].iov_base = (void *)768;
> iovs[768].iov_len = pagesize;
>
>
> io_uring_prep_writev(sqe, fd, iovs, 1, 0);
> sqe->flags |= IOSQE_FIXED_BUFFER;
>
> ret = io_uring_submit(ring);
>
> Below is the iovec returned from
>
> io_import_iovec_fixed()
> -> import_iovec()
>
> {iov_base = 0x300 <dm_early_create+412>, iov_len = 4096}
>
>>> + if (ret < 0)
>>> + return ret;
>>> +
>>> + iov = (struct iovec *)iter->iov;
>>> +
>>> + for (seg = 0; seg < iter->nr_segs; seg++) {
>>> + buf_index = *(u16 *)(&iov[seg].iov_base);
>>
>> That's ugly, and also not consistent with rw_fixed, because iov_base is
>> used to calculate offset.
>
> Would offset be applicable when using readv/writev?
Not sure what you mean. You can register a huge chunk of memory,
but with buf_index alone you can't select a subchunk. E.g.
void *buf = alloc(4G);
idx = io_uring_register_buf(buf);
uring_read_fixed(reg_buf=idx,
data=buf+100, // offset 100
size=sz);
This writes [buf+100, buf+100+sz]. Without passing @buf you
wouldn't be able to specify an offset. Alternatively, the
API could have accepted an offset directly, but this option was chosen.
There is no such problem with the non-registered versions; they just
write/read the whole iov.
>
> My thinking was that for those cases, each iovec should be used exactly as registered.
It may be confusing, but read/write_fixed use iov_base to calculate offset.
i.e.
imu = ctx->bufs[req->buffer_index];
if (iov_base not in range(imu->original_addr, imu->len))
fail;
offset = iov_base - imu->original_addr;
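For comparison, mainline io_import_fixed() does roughly the following
(simplified sketch; the overflow/range checks are kept, bvec seeking and
the rest of the error handling are trimmed):

	buf_addr = req->rw.addr;	/* what userspace put in addr/iov_base */
	imu = &ctx->user_bufs[array_index_nospec(buf_index, ctx->nr_user_bufs)];

	if (buf_addr + len < buf_addr)		/* overflow */
		return -EFAULT;
	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
		return -EFAULT;			/* outside the registered buffer */

	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
	/* ...then seek forward by @offset within the bvec */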
>
>>
>>> + if (unlikely(buf_index < 0 || buf_index >= ctx->nr_user_bufs))
>>> + return -EFAULT;
>>> +
>>> + index = array_index_nospec(buf_index, ctx->nr_user_bufs);
>>> + imu = io_buf_from_index(ctx, index);
>>> + if (!imu->ubuf || !imu->len)
>>> + return -EFAULT;
>>> + if (iov[seg].iov_len > imu->len)
>>> + return -EFAULT;
>>> +
>>> + iov[seg].iov_base = (void *)imu->ubuf;
>>
>> Nope, that's not different from non registered version.
>> What import_fixed actually do is setting up the iter argument to point
>> to a bvec (a vector of struct page *).
>
> So in fact, the buffers end up being pinned again because they are not being passed as bvecs?
Right, in your case it just passes an iov with userspace virtual
addresses, and the layers below don't know that they're pinned. And as
they're virtual, in most cases they have to be translated to physical
(that's solved by having a vector of pages).
>
>>
>> So it either would need to keep a vector of bvecs, that's a vector of vectors,
>> that's not supported by iter, etc., so you'll also need to iterate over them
>> in io_read/write and so on. Or flat 2D structure into 1D, but that's still ugly.
>
> So you're saying there's no clean way to create a readv/writev + fixed buffers API? It would've been nice to have a consistent API between files and buffers.
I guess you can register an iov as a single bvec by flattening the
structure out into a single vector (bvec). Though, we'd need to drop the
assumption that all but the first and last entries are page sized.
Or do that online with extra overhead + allocations.
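Purely to illustrate the flattening idea, a hypothetical helper could
copy the bvecs of the selected registered buffers into one array that a
single iov_iter_bvec() can cover (not part of this series, and it
ignores the page-size assumption mentioned above):

	static struct bio_vec *flatten_bvecs(struct io_mapped_ubuf **imus,
					     unsigned nr, unsigned *nr_out)
	{
		unsigned i, total = 0;
		struct bio_vec *flat, *p;

		for (i = 0; i < nr; i++)
			total += imus[i]->nr_bvecs;

		flat = kvmalloc_array(total, sizeof(*flat), GFP_KERNEL);
		if (!flat)
			return NULL;

		/* concatenate each buffer's bvec array into the flat one */
		for (i = 0, p = flat; i < nr; p += imus[i]->nr_bvecs, i++)
			memcpy(p, imus[i]->bvec, imus[i]->nr_bvecs * sizeof(*p));

		*nr_out = total;
		return flat;
	}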
>
>
>>> @@ -5692,7 +5743,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
>>> {
>>> if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>> return -EINVAL;
>>> - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
>>> + if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
>>
>> Why it's here?
>>
>> #define REQ_F_FIXED_RSRC (REQ_F_FIXED_FILE | REQ_F_FIXED_BUFFER)
>> So, why do you | with REQ_F_BUFFER_SELECT again here?
>
> I thought to group fixed files/buffers but distinguish them from selected buffers.
Ah, I've got this one wrong.
>
>>> @@ -87,6 +88,8 @@ enum {
>>> #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
>>> /* select buffer from sqe->buf_group */
>>> #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
>>> +/* use fixed buffer set */
>>> +#define IOSQE_FIXED_BUFFER (1U << IOSQE_FIXED_BUFFER_BIT)
>>
>> Unfortunately, we're almost out of flags bits -- it's a 1-byte
>> field and 6 bits are already taken. Let's not use it.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 7/8] io_uring: support readv/writev with fixed buffers
2020-11-17 22:59 ` Bijan Mottahedeh
2020-11-18 9:14 ` Pavel Begunkov
@ 2020-11-18 20:12 ` Pavel Begunkov
[not found] ` <[email protected]>
1 sibling, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-11-18 20:12 UTC (permalink / raw)
To: Bijan Mottahedeh; +Cc: axboe, io-uring
On 17/11/2020 22:59, Bijan Mottahedeh wrote:
>
>>> Support readv/writev with fixed buffers, and introduce IOSQE_FIXED_BUFFER,
>>> consistent with fixed files.
>>
>> I don't like it at all, see issues below. The actual implementation would
>> be much uglier.
>>
>> I propose you split the series and push separately. Your first 6 patches
>> first, I don't have conceptual objections to them. Then registration sharing
>> (I still need to look it up). And then we can return to this, if you're not
>> yet convinced.
>
> Ok. The sharing patch is actually the highest priority for us so it'd be great to know if you think it's in the right direction.
>
> Should I submit them as they are or address your fixed_file_ref comments in Patch 4/8 as well? Would I need your prep patch beforehand?
Ok, there are 2 new patches in 5.10; you may wait for Jens to propagate
them to 5.11 or just cherry-pick them (no conflicts expected). On top,
apply ("io_uring: share fixed_file_refs b/w multiple rsrcs"), on which I
CC'ed you. It's simple enough, so there shouldn't be many problems with it.
With that you need to call io_set_resource_node() every time you take
a resource. That's it, _no_ extra ref_put for you to add in puts/free/etc.
Also, consider that all ref_nodes of all types should be hooked into a
single ->ref_list (e.g. file_data->ref_list).
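For reference, that helper is roughly the following (quoted from memory
from the patch above):

	static inline void io_set_resource_node(struct io_kiocb *req)
	{
		struct io_ring_ctx *ctx = req->ctx;

		if (!req->fixed_file_refs) {
			req->fixed_file_refs = &ctx->file_data->node->refs;
			percpu_ref_get(req->fixed_file_refs);
		}
	}

i.e. a request takes one reference on the current ref_node the first
time it touches a fixed resource, and request teardown drops it.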
>
>>> +static ssize_t io_import_iovec_fixed(int rw, struct io_kiocb *req, void *buf,
>>> + unsigned segs, unsigned fast_segs,
>>> + struct iovec **iovec,
>>> + struct iov_iter *iter)
>>> +{
>>> + struct io_ring_ctx *ctx = req->ctx;
>>> + struct io_mapped_ubuf *imu;
>>> + struct iovec *iov;
>>> + u16 index, buf_index;
>>> + ssize_t ret;
>>> + unsigned long seg;
>>> +
>>> + if (unlikely(!ctx->buf_data))
>>> + return -EFAULT;
>>> +
>>> + ret = import_iovec(rw, buf, segs, fast_segs, iovec, iter);
>>
>> Did you test it? import_iovec() does access_ok() against each iov_base,
>> which in your case are an index.
>
> I used liburing:test/file-{register,update} as models for the equivalent buffer tests and they seem to work. I can send out the tests and the liburing changes if you want.
>
> The fixed io test registers an empty iov table first:
>
> ret = io_uring_register_buffers(ring, iovs, UIO_MAXIOV);
>
> It next updates the table with two actual buffers at offset 768:
>
> ret = io_uring_register_buffers_update(ring, 768, ups, 2);
>
> It later uses the buffer at index 768 for writev similar to the file-register test (IOSQE_FIXED_BUFFER instead of IOSQE_FIXED_FILE):
>
> iovs[768].iov_base = (void *)768;
> iovs[768].iov_len = pagesize;
>
>
> io_uring_prep_writev(sqe, fd, iovs, 1, 0);
> sqe->flags |= IOSQE_FIXED_BUFFER;
>
> ret = io_uring_submit(ring);
>
> Below is the iovec returned from
>
> io_import_iovec_fixed()
> -> import_iovec()
>
> {iov_base = 0x300 <dm_early_create+412>, iov_len = 4096}
>
>>> + if (ret < 0)
>>> + return ret;
>>> +
>>> + iov = (struct iovec *)iter->iov;
>>> +
>>> + for (seg = 0; seg < iter->nr_segs; seg++) {
>>> + buf_index = *(u16 *)(&iov[seg].iov_base);
>>
>> That's ugly, and also not consistent with rw_fixed, because iov_base is
>> used to calculate offset.
>
> Would offset be applicable when using readv/writev?
>
> My thinking was that for those cases, each iovec should be used exactly as registered.
>
>>
>>> + if (unlikely(buf_index < 0 || buf_index >= ctx->nr_user_bufs))
>>> + return -EFAULT;
>>> +
>>> + index = array_index_nospec(buf_index, ctx->nr_user_bufs);
>>> + imu = io_buf_from_index(ctx, index);
>>> + if (!imu->ubuf || !imu->len)
>>> + return -EFAULT;
>>> + if (iov[seg].iov_len > imu->len)
>>> + return -EFAULT;
>>> +
>>> + iov[seg].iov_base = (void *)imu->ubuf;
>>
>> Nope, that's not different from non registered version.
>> What import_fixed actually do is setting up the iter argument to point
>> to a bvec (a vector of struct page *).
>
> So in fact, the buffers end up being pinned again because they are not being passed as bvecs?
>
>>
>> So it either would need to keep a vector of bvecs, that's a vector of vectors,
>> that's not supported by iter, etc., so you'll also need to iterate over them
>> in io_read/write and so on. Or flat 2D structure into 1D, but that's still ugly.
>
> So you're saying there's no clean way to create a readv/writev + fixed buffers API? It would've been nice to have a consistent API between files and buffers.
>
>
>>> @@ -5692,7 +5743,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
>>> {
>>> if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>>> return -EINVAL;
>>> - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
>>> + if (unlikely(req->flags & (REQ_F_FIXED_RSRC | REQ_F_BUFFER_SELECT)))
>>
>> Why it's here?
>>
>> #define REQ_F_FIXED_RSRC (REQ_F_FIXED_FILE | REQ_F_FIXED_BUFFER)
>> So, why do you | with REQ_F_BUFFER_SELECT again here?
>
> I thought to group fixed files/buffers but distinguish them from selected buffers.
>
>>> @@ -87,6 +88,8 @@ enum {
>>> #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
>>> /* select buffer from sqe->buf_group */
>>> #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
>>> +/* use fixed buffer set */
>>> +#define IOSQE_FIXED_BUFFER (1U << IOSQE_FIXED_BUFFER_BIT)
>>
>> Unfortunately, we're almost out of flags bits -- it's a 1-byte
>> field and 6 bits are already taken. Let's not use it.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 8/8] io_uring: support buffer registration sharing
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (6 preceding siblings ...)
2020-11-12 23:00 ` [PATCH 7/8] io_uring: support readv/writev with fixed buffers Bijan Mottahedeh
@ 2020-11-12 23:00 ` Bijan Mottahedeh
2020-11-16 23:28 ` [PATCH 0/8] io_uring: buffer registration enhancements Pavel Begunkov
8 siblings, 0 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-12 23:00 UTC (permalink / raw)
To: axboe; +Cc: io-uring
Implement buffer sharing among multiple rings.
A ring shares its (future) buffer registrations at setup time with
IORING_SETUP_SHARE_BUF. A ring attaches to another ring's buffer
registration at setup time with IORING_SETUP_ATTACH_BUF, after
authenticating with the buffer registration owner's fd. Any updates to
the owner's buffer registrations become immediately available to the
attached rings.
Signed-off-by: Bijan Mottahedeh <[email protected]>
---
fs/io_uring.c | 140 +++++++++++++++++++++++++++++++++++-------
include/uapi/linux/io_uring.h | 2 +
2 files changed, 119 insertions(+), 23 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 12c4144..aab7649 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8332,6 +8332,12 @@ static void io_buffers_map_free(struct io_ring_ctx *ctx)
ctx->nr_user_bufs = 0;
}
+static void io_detach_buf_data(struct io_ring_ctx *ctx)
+{
+ percpu_ref_put(&ctx->buf_data->refs);
+ ctx->buf_data = NULL;
+}
+
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
struct fixed_rsrc_data *data = ctx->buf_data;
@@ -8340,6 +8346,12 @@ static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
if (!data)
return -ENXIO;
+ if (ctx->flags & IORING_SETUP_ATTACH_BUF) {
+ io_detach_buf_data(ctx);
+ ctx->nr_user_bufs = 0;
+ return 0;
+ }
+
spin_lock(&data->lock);
if (!list_empty(&data->ref_list))
ref_node = list_first_entry(&data->ref_list,
@@ -8586,46 +8598,67 @@ static int io_alloc_buf_tables(struct fixed_rsrc_data *buf_data,
return 1;
}
+static struct fixed_rsrc_data *io_alloc_buf_data(struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_data *buf_data;
+
+ buf_data = kzalloc(sizeof(*buf_data), GFP_KERNEL);
+ if (!buf_data)
+ return ERR_PTR(-ENOMEM);
+
+ buf_data->ctx = ctx;
+ init_completion(&buf_data->done);
+ INIT_LIST_HEAD(&buf_data->ref_list);
+ spin_lock_init(&buf_data->lock);
+
+ if (percpu_ref_init(&buf_data->refs, io_rsrc_ref_kill,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ kfree(buf_data);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return buf_data;
+}
+
+static void io_free_buf_data(struct io_ring_ctx *ctx)
+{
+ percpu_ref_exit(&ctx->buf_data->refs);
+ kfree(ctx->buf_data->table);
+ kfree(ctx->buf_data);
+}
+
static struct fixed_rsrc_data *io_buffers_map_alloc(struct io_ring_ctx *ctx,
unsigned int nr_args)
{
- unsigned nr_tables;
struct fixed_rsrc_data *buf_data;
+ unsigned nr_tables;
int ret = -ENOMEM;
- if (ctx->buf_data)
+ if (ctx->nr_user_bufs)
return ERR_PTR(-EBUSY);
if (!nr_args || nr_args > IORING_MAX_FIXED_BUFS)
return ERR_PTR(-EINVAL);
- buf_data = kzalloc(sizeof(*ctx->buf_data), GFP_KERNEL);
- if (!buf_data)
- return ERR_PTR(-ENOMEM);
- buf_data->ctx = ctx;
- init_completion(&buf_data->done);
- INIT_LIST_HEAD(&buf_data->ref_list);
- spin_lock_init(&buf_data->lock);
+ if (ctx->buf_data)
+ buf_data = ctx->buf_data;
+ else {
+ buf_data = io_alloc_buf_data(ctx);
+ if (IS_ERR(buf_data))
+ return buf_data;
+ }
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_BUFS_TABLE);
- buf_data->table = kcalloc(nr_tables, sizeof(buf_data->table),
+ buf_data->table = kcalloc(nr_tables, sizeof(struct fixed_rsrc_table),
GFP_KERNEL);
if (!buf_data->table)
- goto out_free;
-
- if (percpu_ref_init(&buf_data->refs, io_rsrc_ref_kill,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
- goto out_free;
+ goto out;
if (io_alloc_buf_tables(buf_data, nr_tables, nr_args))
- goto out_ref;
+ goto out;
return buf_data;
-
-out_ref:
- percpu_ref_exit(&buf_data->refs);
-out_free:
- kfree(buf_data->table);
- kfree(buf_data);
+out:
+ io_free_buf_data(ctx);
return ERR_PTR(ret);
}
@@ -8713,9 +8746,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
struct fixed_rsrc_ref_node *ref_node;
struct fixed_rsrc_data *buf_data;
+ if (ctx->flags & IORING_SETUP_ATTACH_BUF) {
+ if (!ctx->buf_data)
+ return -EFAULT;
+ ctx->nr_user_bufs = ctx->buf_data->ctx->nr_user_bufs;
+ return 0;
+ }
+
buf_data = io_buffers_map_alloc(ctx, nr_args);
if (IS_ERR(buf_data))
return PTR_ERR(buf_data);
+ ctx->buf_data = buf_data;
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
struct fixed_rsrc_table *table;
@@ -8743,7 +8784,6 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
break;
}
- ctx->buf_data = buf_data;
if (ret) {
io_sqe_buffers_unregister(ctx);
return ret;
@@ -9784,6 +9824,55 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
return ret;
}
+static int io_attach_buf_data(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ struct io_ring_ctx *ctx_attach;
+ struct fd f;
+
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return -EBADF;
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
+ return -EINVAL;
+ }
+
+ ctx_attach = f.file->private_data;
+ if (!ctx_attach->buf_data) {
+ fdput(f);
+ return -EINVAL;
+ }
+ ctx->buf_data = ctx_attach->buf_data;
+
+ percpu_ref_get(&ctx->buf_data->refs);
+ fdput(f);
+ return 0;
+}
+
+static int io_init_buf_data(struct io_ring_ctx *ctx, struct io_uring_params *p)
+{
+ if ((p->flags & (IORING_SETUP_SHARE_BUF | IORING_SETUP_ATTACH_BUF)) ==
+ (IORING_SETUP_SHARE_BUF | IORING_SETUP_ATTACH_BUF))
+ return -EINVAL;
+
+ if (p->flags & IORING_SETUP_SHARE_BUF) {
+ struct fixed_rsrc_data *buf_data;
+
+ buf_data = io_alloc_buf_data(ctx);
+ if (IS_ERR(buf_data))
+ return PTR_ERR(buf_data);
+
+ ctx->buf_data = buf_data;
+ return 0;
+ }
+
+ if (p->flags & IORING_SETUP_ATTACH_BUF)
+ return io_attach_buf_data(ctx, p);
+
+ return 0;
+}
+
static int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params __user *params)
{
@@ -9898,6 +9987,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
+ ret = io_init_buf_data(ctx, p);
+ if (ret)
+ goto err;
+
ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
@@ -9969,6 +10062,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
+ IORING_SETUP_SHARE_BUF | IORING_SETUP_ATTACH_BUF |
IORING_SETUP_R_DISABLED))
return -EINVAL;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 41da59c..1d5cd02 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -101,6 +101,8 @@ enum {
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
+#define IORING_SETUP_SHARE_BUF (1U << 7) /* share buffer registration */
+#define IORING_SETUP_ATTACH_BUF (1U << 8) /* attach buffer registration */
enum {
IORING_OP_NOP,
--
1.8.3.1
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH 0/8] io_uring: buffer registration enhancements
2020-11-12 23:00 [PATCH 0/8] io_uring: buffer registration enhancements Bijan Mottahedeh
` (7 preceding siblings ...)
2020-11-12 23:00 ` [PATCH 8/8] io_uring: support buffer registration sharing Bijan Mottahedeh
@ 2020-11-16 23:28 ` Pavel Begunkov
2020-11-17 0:21 ` Bijan Mottahedeh
8 siblings, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-11-16 23:28 UTC (permalink / raw)
To: Bijan Mottahedeh, axboe; +Cc: io-uring
On 12/11/2020 23:00, Bijan Mottahedeh wrote:
> This patchset is the follow-on to my previous RFC which implements a
> set of enhancements to buffer registration consistent with existing file
> registration functionality:
I like the idea of generic resource handling
>
> - buffer registration updates IORING_REGISTER_BUFFERS_UPDATE
> IORING_OP_BUFFERS_UPDATE
Do you need it for something specific?
>
> - readv/writev with fixed buffers IOSQE_FIXED_BUFFER
Why do we need it?
>
> - buffer registration sharing IORING_SETUP_SHARE_BUF
> IORING_SETUP_ATTACH_BUF
I haven't looked it up. What's the overhead on that?
And again, do you really need it?
The set is +600 lines, so I just want to know that there is
a real benefit from having it.
>
> Patches 1,2 modularize existing buffer registration code.
>
> Patch 3 generalizes fixed_file functionality to fixed_rsrc.
>
> Patch 4 applies fixed_rsrc functionality for fixed buffers support.
>
> Patch 5 generalizes files_update functionality to rsrc_update.
>
> Patch 6 implements buffer registration update, and introduces
> IORING_REGISTER_BUFFERS_UPDATE and IORING_OP_BUFFERS_UPDATE, consistent
> with file registration update.
>
> Patch 7 implements readv/writev support with fixed buffers, and introduces
> IOSQE_FIXED_BUFFER, consistent with fixed files.
>
> Patch 8 implements buffer sharing among multiple rings; it works as follows:
>
> - A new ring, A, is setup. Since no buffers have been registered, the
> registered buffer state is an empty set, Z. That's different from the
> NULL state in current implementation.
>
> - Ring B is setup, attaching to Ring A. It's also attaching to it's
> buffer registrations, now we have two references to the same empty
> set, Z.
>
> - Ring A registers buffers into set Z, which is no longer empty.
>
> - Ring B sees this immediately, since it's already sharing that set.
>
> TBD
>
> - I think I have to add IORING_UNREGISTER_BUFFERS to
> io_register_op_must_quiesce() but wanted to confirm.
->fixed_file_refs trades away fast unsynchronised access (by adding a
percpu_ref_get/put pair) so that a full quiesce isn't needed. So, yes,
after it's hooked up right.
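Concretely, once the ref_node path is wired up, the buffer ops can join
the no-quiesce list, e.g. (sketch; IORING_REGISTER_BUFFERS_UPDATE is
introduced by this series):

	static bool io_register_op_must_quiesce(int op)
	{
		switch (op) {
		case IORING_UNREGISTER_FILES:
		case IORING_REGISTER_FILES_UPDATE:
		case IORING_UNREGISTER_BUFFERS:		/* added */
		case IORING_REGISTER_BUFFERS_UPDATE:	/* added */
		case IORING_REGISTER_PROBE:
		case IORING_REGISTER_PERSONALITY:
		case IORING_UNREGISTER_PERSONALITY:
			return false;
		default:
			return true;
		}
	}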
>
> I have used liburing file-{register,update} tests as models for
> buffer-{register,update,share}, tests and they run ok.
>
> The liburing test suite fails for "self" with/without this patchset.
>
> Bijan Mottahedeh (8):
> io_uring: modularize io_sqe_buffer_register
> io_uring: modularize io_sqe_buffers_register
> io_uring: generalize fixed file functionality
> io_uring: implement fixed buffers registration similar to fixed files
> io_uring: generalize files_update functionality to rsrc_update
> io_uring: support buffer registration updates
> io_uring: support readv/writev with fixed buffers
> io_uring: support buffer registration sharing
>
> fs/io_uring.c | 1021 ++++++++++++++++++++++++++++++++---------
> include/uapi/linux/io_uring.h | 15 +-
> 2 files changed, 807 insertions(+), 229 deletions(-)
>
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 0/8] io_uring: buffer registration enhancements
2020-11-16 23:28 ` [PATCH 0/8] io_uring: buffer registration enhancements Pavel Begunkov
@ 2020-11-17 0:21 ` Bijan Mottahedeh
0 siblings, 0 replies; 22+ messages in thread
From: Bijan Mottahedeh @ 2020-11-17 0:21 UTC (permalink / raw)
To: Pavel Begunkov, axboe; +Cc: io-uring
>> This patchset is the follow-on to my previous RFC which implements a
>> set of enhancements to buffer registration consistent with existing file
>> registration functionality:
>
> I like the idea of generic resource handling
>
>>
>> - buffer registration updates IORING_REGISTER_BUFFERS_UPDATE
>> IORING_OP_BUFFERS_UPDATE
>
> Do you need it for something specific?
Incremental buffer registration, see below.
>
>>
>> - readv/writev with fixed buffers IOSQE_FIXED_BUFFER
>
> Why do we need it?
It makes the fixed files/buffers APIs more consistent, and once the
initial work of generic resource handling is done, the additional work
for this support is not much, I think.
>
>>
>> - buffer registration sharing IORING_SETUP_SHARE_BUF
>> IORING_SETUP_ATTACH_BUF
>
> I haven't looked it up. What's the overhead on that?
> And again, do you really need it?
For our use case, a DB instance may have a shared memory size on the
order of terabytes and a very large number of processes (1000+) using
that memory. The cost of each process registering that memory could
become prohibitive.
We need to allow for incremental buffer registration given the
potentially large size of the shared memory. It also makes the API
between files/buffers more consistent.
I had a chat with Jens a while back and he also felt that the static
nature of buffer registrations and the requirement to reload the full
set in case of changes was problematic.
>
> The set is +600 lines, so just want to know that there is
> a real benefit from having it.
Sure, understood.
^ permalink raw reply [flat|nested] 22+ messages in thread