* [PATCH liburing 2/3] src/include/liburing/barrier.h: Use C11 atomics
2020-06-21 20:36 [PATCH liburing 0/3] Three small liburing patches Bart Van Assche
2020-06-21 20:36 ` [PATCH liburing 1/3] Makefiles: Enable -Wextra Bart Van Assche
@ 2020-06-21 20:36 ` Bart Van Assche
2020-06-21 20:36 ` [PATCH liburing 3/3] Convert __io_uring_get_sqe() from a macro into an inline function Bart Van Assche
2020-06-22 2:50 ` [PATCH liburing 0/3] Three small liburing patches Jens Axboe
3 siblings, 0 replies; 5+ messages in thread
From: Bart Van Assche @ 2020-06-21 20:36 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring, Bart Van Assche
Instead of using a combination of open-coded atomic primitives and gcc
builtins, use C11 atomics for all CPU architectures. Note: despite their
name, the atomic_*() operations do not necessarily translate into atomic
instructions. This patch changes the order of the instructions in e.g.
io_uring_get_sqe(), but not the number of instructions generated by gcc 10
on x86_64:
Without this patch:
0x0000000000000360 <+0>: mov 0x44(%rdi),%eax
0x0000000000000363 <+3>: lea 0x1(%rax),%edx
0x0000000000000366 <+6>: mov (%rdi),%rax
0x0000000000000369 <+9>: mov (%rax),%eax
0x000000000000036b <+11>: mov 0x18(%rdi),%rcx
0x000000000000036f <+15>: mov %edx,%esi
0x0000000000000371 <+17>: sub %eax,%esi
0x0000000000000373 <+19>: xor %eax,%eax
0x0000000000000375 <+21>: cmp (%rcx),%esi
0x0000000000000377 <+23>: ja 0x38d <io_uring_get_sqe+45>
0x0000000000000379 <+25>: mov 0x10(%rdi),%rax
0x000000000000037d <+29>: mov (%rax),%eax
0x000000000000037f <+31>: and 0x44(%rdi),%eax
0x0000000000000382 <+34>: mov %edx,0x44(%rdi)
0x0000000000000385 <+37>: shl $0x6,%rax
0x0000000000000389 <+41>: add 0x38(%rdi),%rax
0x000000000000038d <+45>: retq
With this patch applied:
0x0000000000000360 <+0>: mov 0x44(%rdi),%eax
0x0000000000000363 <+3>: lea 0x1(%rax),%edx
0x0000000000000366 <+6>: mov (%rdi),%rax
0x0000000000000369 <+9>: mov %edx,%esi
0x000000000000036b <+11>: mov (%rax),%eax
0x000000000000036d <+13>: sub %eax,%esi
0x000000000000036f <+15>: xor %eax,%eax
0x0000000000000371 <+17>: mov 0x18(%rdi),%rcx
0x0000000000000375 <+21>: cmp (%rcx),%esi
0x0000000000000377 <+23>: ja 0x38d <io_uring_get_sqe+45>
0x0000000000000379 <+25>: mov 0x10(%rdi),%rax
0x000000000000037d <+29>: mov (%rax),%eax
0x000000000000037f <+31>: and 0x44(%rdi),%eax
0x0000000000000382 <+34>: mov %edx,0x44(%rdi)
0x0000000000000385 <+37>: shl $0x6,%rax
0x0000000000000389 <+41>: add 0x38(%rdi),%rax
0x000000000000038d <+45>: retq
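As a side note (not part of this patch; the structure and function names
below are made up for illustration), the acquire/release pairing that the
new io_uring_smp_store_release()/io_uring_smp_load_acquire() macros expand
to can be sketched with plain <stdatomic.h> code, e.g. for a hypothetical
single-producer/single-consumer ring index:

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical SPSC ring; only the tail index is shared between threads. */
struct ring {
	_Atomic unsigned tail;
	unsigned entries[16];
};

/* Producer: fill the slot first, then publish the new tail with release
 * semantics so a consumer that observes the new tail also observes the
 * slot contents. */
static void publish(struct ring *r, unsigned tail, unsigned value)
{
	r->entries[tail & 15] = value;
	atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
}

/* Consumer: the acquire load of the tail orders the later slot reads
 * after it. */
static unsigned consume(struct ring *r, unsigned head)
{
	unsigned tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	return head == tail ? 0 : r->entries[head & 15];
}

int main(void)
{
	struct ring r = { .tail = 0 };

	publish(&r, 0, 42);
	printf("%u\n", consume(&r, 0));	/* prints 42 */
	return 0;
}

On x86_64, release stores and acquire loads compile to plain mov
instructions, which matches the observation above that only the order of
the generated instructions changes.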
Signed-off-by: Bart Van Assche <[email protected]>
---
src/include/liburing/barrier.h | 44 ++++++++--------------------------
1 file changed, 10 insertions(+), 34 deletions(-)
diff --git a/src/include/liburing/barrier.h b/src/include/liburing/barrier.h
index ad69506bb248..c8aa4210371c 100644
--- a/src/include/liburing/barrier.h
+++ b/src/include/liburing/barrier.h
@@ -2,6 +2,8 @@
#ifndef LIBURING_BARRIER_H
#define LIBURING_BARRIER_H
+#include <stdatomic.h>
+
/*
From the kernel documentation file refcount-vs-atomic.rst:
@@ -21,40 +23,14 @@ after the acquire operation executes. This is implemented using
:c:func:`smp_acquire__after_ctrl_dep`.
*/
-/* From tools/include/linux/compiler.h */
-/* Optimization barrier */
-/* The "volatile" is due to gcc bugs */
-#define io_uring_barrier() __asm__ __volatile__("": : :"memory")
-
-/* From tools/virtio/linux/compiler.h */
-#define IO_URING_WRITE_ONCE(var, val) \
- (*((volatile __typeof(val) *)(&(var))) = (val))
-#define IO_URING_READ_ONCE(var) (*((volatile __typeof(var) *)(&(var))))
-
+#define IO_URING_WRITE_ONCE(var, val) \
+ atomic_store_explicit(&(var), (val), memory_order_relaxed)
+#define IO_URING_READ_ONCE(var) \
+ atomic_load_explicit(&(var), memory_order_relaxed)
-#if defined(__x86_64__) || defined(__i386__)
-/* Adapted from arch/x86/include/asm/barrier.h */
-#define io_uring_smp_store_release(p, v) \
-do { \
- io_uring_barrier(); \
- IO_URING_WRITE_ONCE(*(p), (v)); \
-} while (0)
-
-#define io_uring_smp_load_acquire(p) \
-({ \
- __typeof(*p) ___p1 = IO_URING_READ_ONCE(*(p)); \
- io_uring_barrier(); \
- ___p1; \
-})
-
-#else /* defined(__x86_64__) || defined(__i386__) */
-/*
- * Add arch appropriate definitions. Use built-in atomic operations for
- * archs we don't have support for.
- */
-#define io_uring_smp_store_release(p, v) \
- __atomic_store_n(p, v, __ATOMIC_RELEASE)
-#define io_uring_smp_load_acquire(p) __atomic_load_n(p, __ATOMIC_ACQUIRE)
-#endif /* defined(__x86_64__) || defined(__i386__) */
+#define io_uring_smp_store_release(p, v) \
+ atomic_store_explicit((p), (v), memory_order_release)
+#define io_uring_smp_load_acquire(p) \
+ atomic_load_explicit((p), memory_order_acquire)
#endif /* defined(LIBURING_BARRIER_H) */
* [PATCH liburing 3/3] Convert __io_uring_get_sqe() from a macro into an inline function
2020-06-21 20:36 [PATCH liburing 0/3] Three small liburing patches Bart Van Assche
2020-06-21 20:36 ` [PATCH liburing 1/3] Makefiles: Enable -Wextra Bart Van Assche
2020-06-21 20:36 ` [PATCH liburing 2/3] src/include/liburing/barrier.h: Use C11 atomics Bart Van Assche
@ 2020-06-21 20:36 ` Bart Van Assche
2020-06-22 2:50 ` [PATCH liburing 0/3] Three small liburing patches Jens Axboe
3 siblings, 0 replies; 5+ messages in thread
From: Bart Van Assche @ 2020-06-21 20:36 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring, Bart Van Assche
This patch makes gcc 10 emit one additional assembly instruction in
io_uring_get_sqe(), namely 'xor %r8d,%r8d'. That should not cause any
slowdown since that instruction does not access memory:
0x0000000000000360 <+0>: mov (%rdi),%rax
0x0000000000000363 <+3>: xor %r8d,%r8d
0x0000000000000366 <+6>: mov (%rax),%ecx
0x0000000000000368 <+8>: mov 0x44(%rdi),%eax
0x000000000000036b <+11>: lea 0x1(%rax),%edx
0x000000000000036e <+14>: mov %edx,%esi
0x0000000000000370 <+16>: sub %ecx,%esi
0x0000000000000372 <+18>: mov 0x18(%rdi),%rcx
0x0000000000000376 <+22>: cmp (%rcx),%esi
0x0000000000000378 <+24>: ja 0x38e <io_uring_get_sqe+46>
0x000000000000037a <+26>: mov 0x10(%rdi),%rcx
0x000000000000037e <+30>: and (%rcx),%eax
0x0000000000000380 <+32>: mov %edx,0x44(%rdi)
0x0000000000000383 <+35>: shl $0x6,%rax
0x0000000000000387 <+39>: add 0x38(%rdi),%rax
0x000000000000038b <+43>: mov %rax,%r8
0x000000000000038e <+46>: mov %r8,%rax
0x0000000000000391 <+49>: retq
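As a side note (the names below are made up and unrelated to liburing's
data structures), the usual argument for converting a statement-expression
macro like the old __io_uring_get_sqe() into a static inline function is
that the function gets real parameter types, evaluates each argument
exactly once, and no longer relies on the GNU statement-expression
extension, while gcc still inlines the call. A minimal sketch:

#include <stdio.h>

/* Macro form: arguments are substituted textually, so an argument with
 * side effects may be evaluated twice and there is no type checking. */
#define MAX_MACRO(a, b) ((a) > (b) ? (a) : (b))

/* Inline function form: each argument is evaluated exactly once, the
 * parameter types are checked, and the compiler still inlines the call. */
static inline unsigned max_u(unsigned a, unsigned b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned i = 3;

	/* MAX_MACRO(i++, 2) would increment i twice; max_u(i++, 2) cannot. */
	printf("%u\n", max_u(i++, 2));		/* prints 3, i is now 4 */
	printf("%u\n", MAX_MACRO(5U, 2U));	/* prints 5 */
	return 0;
}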
Signed-off-by: Bart Van Assche <[email protected]>
---
src/queue.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/src/queue.c b/src/queue.c
index 3db52bd17b86..88e0294c19fb 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -273,16 +273,18 @@ int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
return __io_uring_submit_and_wait(ring, wait_nr);
}
-#define __io_uring_get_sqe(sq, __head) ({ \
- unsigned __next = (sq)->sqe_tail + 1; \
- struct io_uring_sqe *__sqe = NULL; \
- \
- if (__next - __head <= *(sq)->kring_entries) { \
- __sqe = &(sq)->sqes[(sq)->sqe_tail & *(sq)->kring_mask];\
- (sq)->sqe_tail = __next; \
- } \
- __sqe; \
-})
+static inline struct io_uring_sqe *
+__io_uring_get_sqe(struct io_uring_sq *sq, unsigned int __head)
+{
+ unsigned int __next = (sq)->sqe_tail + 1;
+ struct io_uring_sqe *__sqe = NULL;
+
+ if (__next - __head <= *(sq)->kring_entries) {
+ __sqe = &(sq)->sqes[(sq)->sqe_tail & *(sq)->kring_mask];
+ (sq)->sqe_tail = __next;
+ }
+ return __sqe;
+}
/*
* Return an sqe to fill. Application must later call io_uring_submit()