Skip to content

Commit

Permalink
Merge branch 'io_uring_man' of https://github.com/CPestka/liburing
Browse files Browse the repository at this point in the history
* 'io_uring_man' of https://github.com/CPestka/liburing:
  man/io_uring: Add remark about where to find info about memory ordering
  man/io_uring: Update SQE struct
  man/io_uring: Add remark that operational behavior changes with flags
  man/io_uring: Moved SQE example of a read request to SQE description
  man/io_uring: Formatting fixes
  • Loading branch information
axboe committed Sep 28, 2024
2 parents 206650f + 8a7810d commit 4a581ec
Showing 1 changed file with 107 additions and 62 deletions.
169 changes: 107 additions & 62 deletions man/io_uring.7
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ in essence,
the equivalent of a system call you would have made otherwise,
if you were not using
.BR io_uring .
For instance,
a SQE with the
.I opcode
set to
.B IORING_OP_READ
will request a read operation to be issued that is similar to the
.BR read (2)
system call. Refer to the opcode documentation in
.BR io_uring_enter (2)
for all supported opcodes and their properties.
You can add more than one SQE to the queue depending on the number of
operations you want to request.
.IP \(bu
Expand All @@ -79,37 +89,25 @@ which corresponds to the return value of the system
call's equivalent,
had you used it directly without using
.BR io_uring .
For instance,
a read operation under
.BR io_uring ,
started with the
.BR IORING_OP_READ
operation, issues the equivalent of the
.BR read (2)
system call. In practice, it mixes the semantics of
.BR pread (2)
and
.BR preadv2 (2)
in that it takes an explicit offset, and supports using -1 for the offset to
indicate that the current file position should be used instead of passing in
an explicit offset. See the opcode documentation for more details. Given that
io_uring is an async interface,
Given that
.B io_uring
is an async interface,
.I errno
is never used for passing back error information. Instead,
.I res
will contain what the equivalent system call would have returned in case
of success, and in case of error
.I res
will contain
.I -errno .
.IR -errno .
For example, if the normal read system call would have returned -1 and set
.I errno
to
.B EINVAL ,
.BR EINVAL ,
then
.I res
would contain
.B -EINVAL .
.BR -EINVAL .
If the normal system call would have returned a read size of 1024, then
.I res
would contain 1024.
Expand Down Expand Up @@ -146,6 +144,17 @@ The kernel reads SQEs off the head of the queue.
.IP \(bu
The kernel adds CQEs to the tail of the CQ.
You read CQEs off the head of the queue.
.PP
It should be noted that, depending on the configuration,
.BR io_uring 's
behavior can deviate from the behavior outlined above.
Examples include not posting a CQE for every SQE when
.B IOSQE_CQE_SKIP_SUCCESS
is set in the SQE,
posting multiple CQEs for a single SQE for multishot operations,
and whether an
.BR io_uring_enter (2)
system call is required to make the kernel begin processing newly added SQEs,
which is normally not the case when using submission queue polling.

.SS Submission queue polling
One of the goals of
.B io_uring
Expand Down Expand Up @@ -223,50 +232,83 @@ in full:
.in +4n
.EX
struct io_uring_sqe {
__u8 opcode; /* type of operation for this sqe */
__u8 flags; /* IOSQE_ flags */
__u16 ioprio; /* ioprio for the request */
__s32 fd; /* file descriptor to do IO on */
union {
__u64 off; /* offset into file */
__u64 addr2;
};
union {
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
};
__u32 len; /* buffer size or number of iovecs */
union {
__kernel_rwf_t rw_flags;
__u32 fsync_flags;
__u16 poll_events; /* compatibility */
__u32 poll32_events; /* word-reversed for BE */
__u32 sync_range_flags;
__u32 msg_flags;
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
};
__u64 user_data; /* data to be passed back at completion time */
union {
struct {
/* pack this to avoid bogus arm OABI complaints */
union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
__s32 splice_fd_in;
};
__u64 __pad2[3];
};
__u8 opcode; /* type of operation for this sqe */
__u8 flags; /* IOSQE_ flags */
__u16 ioprio; /* ioprio for the request */
__s32 fd; /* file descriptor to do IO on */
union {
__u64 off; /* offset into file */
__u64 addr2;
struct {
__u32 cmd_op;
__u32 __pad1;
};
};
union {
__u64 addr; /* pointer to buffer or iovecs */
__u64 splice_off_in;
struct {
__u32 level;
__u32 optname;
};
};
__u32 len; /* buffer size or number of iovecs */
union {
__kernel_rwf_t rw_flags;
__u32 fsync_flags;
__u16 poll_events; /* compatibility */
__u32 poll32_events; /* word-reversed for BE */
__u32 sync_range_flags;
__u32 msg_flags;
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
__u32 rename_flags;
__u32 unlink_flags;
__u32 hardlink_flags;
__u32 xattr_flags;
__u32 msg_ring_flags;
__u32 uring_cmd_flags;
__u32 waitid_flags;
__u32 futex_flags;
__u32 install_fd_flags;
__u32 nop_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
union {
__s32 splice_fd_in;
__u32 file_index;
__u32 optlen;
struct {
__u16 addr_len;
__u16 __pad3[1];
};
};
union {
struct {
__u64 addr3;
__u64 __pad2[1];
};
__u64 optval;
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
* this field is used for 80 bytes of arbitrary command data
*/
__u8 cmd[0];
};
};
.EE
.in
Expand Down Expand Up @@ -349,6 +391,9 @@ switch happened.
We use memory barriers to enforce this coherency.
Being significantly large subjects on their own,
memory barriers are out of scope for further discussion on this man page.
For more information on modern memory models,
the reader may refer to Documentation/memory-barriers.txt in the kernel tree,
or to the documentation of the formal C11 or kernel memory models.
.TP
.B Letting the kernel know about I/O submissions
Once you place one or more SQEs on to the SQ,
Expand Down

0 comments on commit 4a581ec

Please sign in to comment.