From: Dmitry Kadashev <[email protected]>
To: [email protected]
Subject: io_uring's openat doesn't work with large (2G+) files
Date: Wed, 8 Apr 2020 21:51:23 +0700 [thread overview]
Message-ID: <CAOKbgA4K4FzxTEoHHYcoOAe6oNwFvGbzcfch2sDmicJvf3Ydwg@mail.gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 1226 bytes --]
Hi,
io_uring's openat seems to produce FDs that are incompatible with
large files (>2GB). If a file (smaller than 2GB) is opened using
io_uring's openat then writes -- both using io_uring and just sync
pwrite() -- past that threshold fail with EFBIG. If such a file is
opened with sync openat, then both io_uring's writes and sync writes
succeed. And if the file is larger than 2GB then io_uring's openat
fails right away, while the sync one works.
Kernel versions: 5.6.0-rc2, 5.6.0.
A couple of reproducers are attached: one demonstrates a successful open
followed by failing writes, and the other a failing open (each in
comparison with the equivalent sync calls).
The output of the former one for example:
*** sync openat
openat succeeded
sync write at offset 0
write succeeded
sync write at offset 4294967296
write succeeded
*** sync openat
openat succeeded
io_uring write at offset 0
write succeeded
io_uring write at offset 4294967296
write succeeded
*** io_uring openat
openat succeeded
sync write at offset 0
write succeeded
sync write at offset 4294967296
write failed: File too large
*** io_uring openat
openat succeeded
io_uring write at offset 0
write succeeded
io_uring write at offset 4294967296
write failed: File too large
--
Dmitry
[-- Attachment #2: test-io_uring-write-large-offset.c --]
[-- Type: text/x-csrc, Size: 3061 bytes --]
#include <liburing.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/resource.h>
#include <unistd.h>
/* Number of entries requested from io_uring_queue_init(). */
static const int RSIZE = 2;
/* Flags and mode used for both the sync openat() and the io_uring openat. */
static const int OPEN_FLAGS = O_RDWR | O_CREAT;
static const mode_t OPEN_MODE = S_IRUSR | S_IWUSR;

/*
 * Print a printf-style message to stderr and abort the process.
 *
 * There is deliberately no trailing semicolon after while (0): the caller
 * supplies it, so DIE(...) behaves as a single statement and stays safe in
 * an unbraced if/else.
 */
#define DIE(...) do {\
	fprintf(stderr, __VA_ARGS__);\
	abort();\
} while (0)
/*
 * Write a small test buffer to fd at the given offset and report the
 * outcome on stderr.
 *
 * ring   - io_uring instance; only used when sync is zero
 * sync   - non-zero: write with pwrite(); zero: submit the write via io_uring
 * fd     - file descriptor to write to
 * offset - absolute file offset of the write
 */
void do_write(struct io_uring *ring, int sync, int fd, off_t offset)
{
	char buf[] = "some test write buf";
	int res;

	/* off_t has no printf conversion of its own; widen it to match %lld. */
	fprintf(stderr, "%s write at offset %lld\n", sync ? "sync": "io_uring", (long long)offset);

	if (sync) {
		res = pwrite(fd, buf, sizeof(buf), offset);
		if (res < 0) {
			res = -errno;
		}
	}
	else {
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe = NULL;
		int ret;

		sqe = io_uring_get_sqe(ring);
		if (!sqe) {
			fprintf(stderr, "failed to get sqe\n");
			return;
		}
		io_uring_prep_write(sqe, fd, buf, sizeof(buf), offset);

		ret = io_uring_submit(ring);
		if (ret < 0) {
			fprintf(stderr, "failed to submit write: %s\n", strerror(-ret));
			return;
		}

		/*
		 * Check the wait result before touching cqe: on failure there is
		 * no completion to read or to mark as seen.
		 */
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0) {
			fprintf(stderr, "wait_cqe failed: %s\n", strerror(-ret));
			return;
		}
		res = cqe->res;
		io_uring_cqe_seen(ring, cqe);
	}

	if (res < 0) {
		fprintf(stderr, "write failed: %s\n", strerror(-res));
	}
	else {
		fprintf(stderr, "write succeeded\n");
	}
}
/*
 * Open (creating if needed) the file fn relative to dfd, then write to it at
 * offset 0 and at offset 1 << 32, reporting each step on stderr.
 *
 * ring       - io_uring instance used for any io_uring open/write
 * sync_open  - non-zero: open with openat(); zero: open through io_uring
 * sync_write - passed to do_write() to select sync or io_uring writes
 * dfd        - directory file descriptor the path is resolved against
 * fn         - file name relative to dfd
 */
void test_open_write(struct io_uring *ring, int sync_open, int sync_write, int dfd, const char* fn)
{
	int fd = -1;

	fprintf(stderr, "\n*** %s openat\n", sync_open ? "sync" : "io_uring");

	if (sync_open) {
		fd = openat(dfd, fn, OPEN_FLAGS, OPEN_MODE);
		if (fd < 0) {
			fd = -errno;
		}
	}
	else {
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe = NULL;
		int ret;

		sqe = io_uring_get_sqe(ring);
		if (!sqe) {
			fprintf(stderr, "failed to get sqe\n");
			return;
		}
		io_uring_prep_openat(sqe, dfd, fn, OPEN_FLAGS, OPEN_MODE);

		ret = io_uring_submit(ring);
		if (ret < 0) {
			fprintf(stderr, "failed to submit openat: %s\n", strerror(-ret));
			return;
		}

		/*
		 * Only read the completion once the wait has succeeded; on
		 * failure cqe does not point at a valid entry.
		 */
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0) {
			fprintf(stderr, "wait_cqe failed: %s\n", strerror(-ret));
			return;
		}
		/* The completion result is the new fd, or a negative errno. */
		fd = cqe->res;
		io_uring_cqe_seen(ring, cqe);
	}

	if (fd < 0) {
		fprintf(stderr, "openat failed: %s\n", strerror(-fd));
	}
	else {
		fprintf(stderr, "openat succeeded\n");
		do_write(ring, sync_write, fd, 0);
		do_write(ring, sync_write, fd, 1ull << 32);
		close(fd);
	}
}
/*
 * Run the open/write test for every combination of sync and io_uring open
 * and sync and io_uring write, each against its own file under /tmp.
 */
int main()
{
	/* One row per run: how to open, how to write, and which file to use. */
	static const struct {
		int sync_open;
		int sync_write;
		const char *name;
	} cases[] = {
		{ 1, 1, "io_uring_openat_write_test1" },
		{ 1, 0, "io_uring_openat_write_test2" },
		{ 0, 1, "io_uring_openat_write_test3" },
		{ 0, 0, "io_uring_openat_write_test4" },
	};
	struct io_uring ring;
	int rc;
	size_t i;

	int tmp_dir = open("/tmp", O_RDONLY | O_DIRECTORY);
	if (tmp_dir < 0) {
		DIE("open /tmp: %s\n", strerror(errno));
	}

	rc = io_uring_queue_init(RSIZE, &ring, 0);
	if (rc < 0) {
		DIE("failed to init io_uring: %s\n", strerror(-rc));
	}

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		test_open_write(&ring, cases[i].sync_open, cases[i].sync_write,
				tmp_dir, cases[i].name);
	}

	io_uring_queue_exit(&ring);
	close(tmp_dir);
	return 0;
}
[-- Attachment #3: test-io_uring-openat-large-file.c --]
[-- Type: text/x-csrc, Size: 2255 bytes --]
#include <liburing.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/resource.h>
#include <unistd.h>
/*
 * Print a printf-style message to stderr and abort the process.
 *
 * There is deliberately no trailing semicolon after while (0): the caller
 * supplies it, so DIE(...) behaves as a single statement and stays safe in
 * an unbraced if/else.
 */
#define DIE(...) do {\
	fprintf(stderr, __VA_ARGS__);\
	abort();\
} while (0)

/* Number of entries requested from io_uring_queue_init(). */
static const int RSIZE = 2;
/* Flags and mode used for both the sync openat() and the io_uring openat. */
static const int OPEN_FLAGS = O_RDWR | O_CREAT;
static const mode_t OPEN_MODE = S_IRUSR | S_IWUSR;
/*
 * Open fn relative to dfd with a plain openat() call, report the result on
 * stderr, and close the descriptor again if the open succeeded.
 */
void open_sync(int dfd, const char* fn)
{
	const int handle = openat(dfd, fn, OPEN_FLAGS, OPEN_MODE);

	if (handle >= 0) {
		fprintf(stderr, "sync open succeeded\n");
		close(handle);
		return;
	}

	fprintf(stderr, "sync open failed: %s\n", strerror(errno));
}
/*
 * Open fn relative to dfd by submitting an openat request through io_uring,
 * report the result on stderr, and close the descriptor again if the open
 * succeeded.
 *
 * ring - io_uring instance used to submit the request
 * dfd  - directory file descriptor the path is resolved against
 * fn   - file name relative to dfd
 */
void open_io_uring(struct io_uring *ring, int dfd, const char* fn)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe = NULL;
	int ret;
	int fd;

	sqe = io_uring_get_sqe(ring);
	if (!sqe) {
		fprintf(stderr, "failed to get sqe\n");
		return;
	}
	io_uring_prep_openat(sqe, dfd, fn, OPEN_FLAGS, OPEN_MODE);

	ret = io_uring_submit(ring);
	if (ret < 0) {
		fprintf(stderr, "failed to submit openat: %s\n", strerror(-ret));
		return;
	}

	/*
	 * Check the wait result before touching cqe: on failure there is no
	 * completion to read or to mark as seen.
	 */
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0) {
		fprintf(stderr, "wait_cqe failed: %s\n", strerror(-ret));
		return;
	}
	/* The completion result is the new fd, or a negative errno. */
	fd = cqe->res;
	io_uring_cqe_seen(ring, cqe);

	if (fd < 0) {
		fprintf(stderr, "io_uring openat failed: %s\n", strerror(-fd));
	}
	else {
		fprintf(stderr, "io_uring openat succeeded\n");
		close(fd);
	}
}
/*
 * Create the test file fn relative to dfd (if needed) and write a few bytes
 * at offset 1 << 32 so that the file extends past that offset.
 *
 * Returns 0 on success, -1 if the open fails, or the negative pwrite()
 * result if the write fails. Failures are reported on stderr.
 */
int prepare_file(int dfd, const char* fn)
{
	const char payload[] = "foo";
	int written;
	int handle = openat(dfd, fn, OPEN_FLAGS, OPEN_MODE);

	if (handle < 0) {
		fprintf(stderr, "prepare/open: %s\n", strerror(errno));
		return -1;
	}

	written = pwrite(handle, payload, sizeof(payload), 1ull << 32);
	if (written < 0) {
		fprintf(stderr, "prepare/pwrite: %s\n", strerror(errno));
	}
	close(handle);

	if (written < 0) {
		return written;
	}
	return 0;
}
/*
 * Prepare a file under /tmp that extends past offset 1 << 32, then try to
 * open it first with a sync openat() and then through io_uring.
 */
int main()
{
	const char *target = "io_uring_openat_test";
	struct io_uring ring;
	int rc;

	int tmp_dir = open("/tmp", O_RDONLY | O_DIRECTORY);
	if (tmp_dir < 0) {
		DIE("open /tmp: %s\n", strerror(errno));
	}

	rc = io_uring_queue_init(RSIZE, &ring, 0);
	if (rc < 0) {
		DIE("failed to init io_uring: %s\n", strerror(-rc));
	}

	/* Only attempt the opens when the file was prepared successfully. */
	if (prepare_file(tmp_dir, target) == 0) {
		open_sync(tmp_dir, target);
		open_io_uring(&ring, tmp_dir, target);
	}

	io_uring_queue_exit(&ring);
	close(tmp_dir);
	return 0;
}
next reply other threads:[~2020-04-08 14:51 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-04-08 14:51 Dmitry Kadashev [this message]
2020-04-08 15:19 ` io_uring's openat doesn't work with large (2G+) files Jens Axboe
2020-04-08 15:30 ` Dmitry Kadashev
2020-04-08 15:36 ` Jens Axboe
2020-04-08 15:41 ` Dmitry Kadashev
2020-04-08 15:49 ` Jens Axboe
2020-04-08 16:12 ` Dmitry Kadashev
2020-04-08 16:26 ` Jens Axboe
2020-04-09 3:50 ` Dmitry Kadashev
2020-04-09 15:29 ` Jens Axboe
2020-04-13 9:20 ` Dmitry Kadashev
2020-04-13 10:09 ` Pavel Begunkov
2020-04-13 10:19 ` Dmitry Kadashev
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAOKbgA4K4FzxTEoHHYcoOAe6oNwFvGbzcfch2sDmicJvf3Ydwg@mail.gmail.com \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox