From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id F2C49C4708D for ; Fri, 6 Jan 2023 15:43:17 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231195AbjAFPnR (ORCPT ); Fri, 6 Jan 2023 10:43:17 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54030 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229824AbjAFPnQ (ORCPT ); Fri, 6 Jan 2023 10:43:16 -0500 Received: from gnuweeb.org (gnuweeb.org [51.81.211.47]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D890141648 for ; Fri, 6 Jan 2023 07:43:15 -0800 (PST) Received: from localhost.localdomain (unknown [182.253.183.184]) by gnuweeb.org (Postfix) with ESMTPSA id 26D277E538; Fri, 6 Jan 2023 15:43:12 +0000 (UTC) X-GW-Data: lPqxHiMPbJw1wb7CM9QUryAGzr0yq5atzVDdxTR0iA== DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=gnuweeb.org; s=default; t=1673019795; bh=0YJU3DPAiJGEi47sssnmOVdPYpXrO6fGKHqHoucqDns=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=So+yWudFUZYF9jZzKScAPrTgW+qLF67Ngrjf7LPGdiQo/9NgxUHirqLMkuRgysZK4 ZOtqR7Ec0FC+UZIWIM4SJe2VpIJkv8/gtS1jGT17cO8Izra+n/+T7t5GsJDIyxa1rG 4i0ghykwuv/yojTx1PYfQTtlFtFDJhD8R9CkcaC4Dxu6q61pCynL19ClF6O44iJub8 Krw74MdKj6uTBqDh6leDsaykrVoSMAPhxoUvB+mCHF/tkCuCBdN5fFROIQwpPc2djy 5WVsqQQCeRlCJTa7Do9mp1l5cZfkx6SWu9zg3e3/blhHSfR3WDRlYwrattCKVc/Y3G TkIePQFe9N1ew== From: Ammar Faizi To: Jens Axboe Cc: Ammar Faizi , Pavel Begunkov , Gilang Fachrezy , VNLX Kernel Department , Alviro Iskandar Setiawan , GNU/Weeb Mailing List , io-uring Mailing List Subject: [PATCH liburing v1 1/2] nolibc: Fix bloated memset due to unexpected vectorization Date: Fri, 6 Jan 2023 22:42:58 +0700 Message-Id: <20230106154259.556542-2-ammar.faizi@intel.com> X-Mailer: git-send-email 2.34.1 
In-Reply-To: <20230106154259.556542-1-ammar.faizi@intel.com> References: <20230106154259.556542-1-ammar.faizi@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk List-ID: X-Mailing-List: io-uring@vger.kernel.org From: Ammar Faizi Clang and GCC auto-vectorize the simple byte loop of memset() in nolibc.c, which bloats the generated code. liburing doesn't need such a powerful memset(). Add an empty inline asm statement inside the loop to prevent the compilers from vectorizing it. For comparison, see the following assembly code (generated by Clang). Before this patch: ``` 0000000000003a00 <__uring_memset>: 3a00: mov %rdi,%rax 3a03: test %rdx,%rdx 3a06: je 3b2c <__uring_memset+0x12c> 3a0c: cmp $0x8,%rdx 3a10: jae 3a19 <__uring_memset+0x19> 3a12: xor %ecx,%ecx 3a14: jmp 3b20 <__uring_memset+0x120> 3a19: movzbl %sil,%r8d 3a1d: cmp $0x20,%rdx 3a21: jae 3a2a <__uring_memset+0x2a> 3a23: xor %ecx,%ecx 3a25: jmp 3ae0 <__uring_memset+0xe0> 3a2a: mov %rdx,%rcx 3a2d: and $0xffffffffffffffe0,%rcx 3a31: movd %r8d,%xmm0 3a36: punpcklbw %xmm0,%xmm0 3a3a: pshuflw $0x0,%xmm0,%xmm0 3a3f: pshufd $0x0,%xmm0,%xmm0 3a44: lea -0x20(%rcx),%rdi 3a48: mov %rdi,%r10 3a4b: shr $0x5,%r10 3a4f: inc %r10 3a52: mov %r10d,%r9d 3a55: and $0x3,%r9d 3a59: cmp $0x60,%rdi 3a5d: jae 3a63 <__uring_memset+0x63> 3a5f: xor %edi,%edi 3a61: jmp 3aa9 <__uring_memset+0xa9> 3a63: and $0xfffffffffffffffc,%r10 3a67: xor %edi,%edi 3a69: nopl 0x0(%rax) 3a70: movdqu %xmm0,(%rax,%rdi,1) 3a75: movdqu %xmm0,0x10(%rax,%rdi,1) 3a7b: movdqu %xmm0,0x20(%rax,%rdi,1) 3a81: movdqu %xmm0,0x30(%rax,%rdi,1) 3a87: movdqu %xmm0,0x40(%rax,%rdi,1) 3a8d: movdqu %xmm0,0x50(%rax,%rdi,1) 3a93: movdqu %xmm0,0x60(%rax,%rdi,1) 3a99: movdqu %xmm0,0x70(%rax,%rdi,1) 3a9f: sub $0xffffffffffffff80,%rdi 3aa3: add $0xfffffffffffffffc,%r10 3aa7: jne 3a70 <__uring_memset+0x70> 3aa9: test %r9,%r9 3aac: je 3ad6 <__uring_memset+0xd6> 3aae: lea (%rdi,%rax,1),%r10 3ab2: add $0x10,%r10 3ab6: shl $0x5,%r9 3aba: xor %edi,%edi 3abc: nopl 0x0(%rax) 3ac0: movdqu %xmm0,-0x10(%r10,%rdi,1) 
3ac7: movdqu %xmm0,(%r10,%rdi,1) 3acd: add $0x20,%rdi 3ad1: cmp %rdi,%r9 3ad4: jne 3ac0 <__uring_memset+0xc0> 3ad6: cmp %rdx,%rcx 3ad9: je 3b2c <__uring_memset+0x12c> 3adb: test $0x18,%dl 3ade: je 3b20 <__uring_memset+0x120> 3ae0: mov %rcx,%rdi 3ae3: mov %rdx,%rcx 3ae6: and $0xfffffffffffffff8,%rcx 3aea: movd %r8d,%xmm0 3aef: punpcklbw %xmm0,%xmm0 3af3: pshuflw $0x0,%xmm0,%xmm0 3af8: nopl 0x0(%rax,%rax,1) 3b00: movq %xmm0,(%rax,%rdi,1) 3b05: add $0x8,%rdi 3b09: cmp %rdi,%rcx 3b0c: jne 3b00 <__uring_memset+0x100> 3b0e: cmp %rdx,%rcx 3b11: je 3b2c <__uring_memset+0x12c> 3b13: data16 data16 data16 cs nopw 0x0(%rax,%rax,1) 3b20: mov %sil,(%rax,%rcx,1) 3b24: inc %rcx 3b27: cmp %rcx,%rdx 3b2a: jne 3b20 <__uring_memset+0x120> 3b2c: ret 3b2d: nopl (%rax) ``` After this patch: ``` 0000000000003424 <__uring_memset>: 3424: mov %rdi,%rax 3427: test %rdx,%rdx 342a: je 343a <__uring_memset+0x16> 342c: xor %ecx,%ecx 342e: mov %sil,(%rax,%rcx,1) 3432: inc %rcx 3435: cmp %rcx,%rdx 3438: jne 342e <__uring_memset+0xa> 343a: ret ``` Signed-off-by: Ammar Faizi --- src/nolibc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/nolibc.c b/src/nolibc.c index 3207e33..ac81575 100644 --- a/src/nolibc.c +++ b/src/nolibc.c @@ -12,9 +12,16 @@ void *__uring_memset(void *s, int c, size_t n) size_t i; unsigned char *p = s; - for (i = 0; i < n; i++) + for (i = 0; i < n; i++) { p[i] = (unsigned char) c; + /* + * An empty inline ASM to avoid auto-vectorization + * because it's too bloated for liburing. + */ + __asm__ volatile (""); + } + return s; } -- Ammar Faizi