Skip to content

Commit

Permalink
netfs: Speed up buffered reading
Browse files Browse the repository at this point in the history
Improve the efficiency of buffered reads in a number of ways:

 (1) Overhaul the algorithm in general so that it's a lot more compact and
     split the read submission code between buffered and unbuffered
     versions.  The unbuffered version can be vastly simplified.

 (2) Read-result collection is handed off to a work queue rather than being
     done in the I/O thread.  Multiple subrequests can be processes
     simultaneously.

 (3) When a subrequest is collected, any folios it fully spans are
     collected and "spare" data on either side is donated to either the
     previous or the next subrequest in the sequence.

Notes:

 (*) Readahead expansion is massively slows down fio, presumably because it
     causes a load of extra allocations, both folio and xarray, up front
     before RPC requests can be transmitted.

 (*) RDMA with cifs does appear to work, both with SIW and RXE.

 (*) PG_private_2-based reading and copy-to-cache is split out into its own
     file and altered to use folio_queue.  Note that the copy to the cache
     now creates a new write transaction against the cache and adds the
     folios to be copied into it.  This allows it to use part of the
     writeback I/O code.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-20-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
  • Loading branch information
dhowells authored and brauner committed Sep 12, 2024
1 parent 2e45b92 commit ee4cdf7
Show file tree
Hide file tree
Showing 28 changed files with 2,058 additions and 470 deletions.
11 changes: 8 additions & 3 deletions fs/9p/vfs_addr.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,22 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
unsigned long long pos = subreq->start + subreq->transferred;
int total, err;

total = p9_client_read(fid, subreq->start + subreq->transferred,
&subreq->io_iter, &err);
total = p9_client_read(fid, pos, &subreq->io_iter, &err);

/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
if (subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (pos + total >= i_size_read(rreq->inode))
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);

netfs_subreq_terminated(subreq, err ?: total, false);
if (!err)
subreq->transferred += total;

netfs_read_subreq_terminated(subreq, err, false);
}

/**
Expand Down
21 changes: 15 additions & 6 deletions fs/afs/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/netfs.h>
#include <trace/events/netfs.h>
#include "internal.h"

static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
Expand Down Expand Up @@ -242,9 +243,10 @@ static void afs_fetch_data_notify(struct afs_operation *op)

req->error = error;
if (subreq) {
if (subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
subreq->rreq->i_size = req->file_size;
if (req->pos + req->actual_len >= req->file_size)
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
netfs_read_subreq_terminated(subreq, error, false);
req->subreq = NULL;
} else if (req->done) {
req->done(req);
Expand All @@ -262,6 +264,12 @@ static void afs_fetch_data_success(struct afs_operation *op)
afs_fetch_data_notify(op);
}

static void afs_fetch_data_aborted(struct afs_operation *op)
{
afs_check_for_remote_deletion(op);
afs_fetch_data_notify(op);
}

static void afs_fetch_data_put(struct afs_operation *op)
{
op->fetch.req->error = afs_op_error(op);
Expand All @@ -272,7 +280,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
.issue_afs_rpc = afs_fs_fetch_data,
.issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success,
.aborted = afs_check_for_remote_deletion,
.aborted = afs_fetch_data_aborted,
.failed = afs_fetch_data_notify,
.put = afs_fetch_data_put,
};
Expand All @@ -294,7 +302,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
op = afs_alloc_operation(req->key, vnode->volume);
if (IS_ERR(op)) {
if (req->subreq)
netfs_subreq_terminated(req->subreq, PTR_ERR(op), false);
netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false);
return PTR_ERR(op);
}

Expand All @@ -313,7 +321,7 @@ static void afs_read_worker(struct work_struct *work)

fsreq = afs_alloc_read(GFP_NOFS);
if (!fsreq)
return netfs_subreq_terminated(subreq, -ENOMEM, false);
return netfs_read_subreq_terminated(subreq, -ENOMEM, false);

fsreq->subreq = subreq;
fsreq->pos = subreq->start + subreq->transferred;
Expand All @@ -322,6 +330,7 @@ static void afs_read_worker(struct work_struct *work)
fsreq->vnode = vnode;
fsreq->iter = &subreq->io_iter;

trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
}
Expand Down
9 changes: 7 additions & 2 deletions fs/afs/fsclient.c
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
size_t count_before;
int ret;

_enter("{%u,%zu,%zu/%llu}",
Expand Down Expand Up @@ -345,10 +346,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)

/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
iov_iter_count(call->iter), req->actual_len);
count_before = call->iov_len;
_debug("extract data %zu/%llu", count_before, req->actual_len);

ret = afs_extract_data(call, true);
if (req->subreq) {
req->subreq->transferred += count_before - call->iov_len;
netfs_read_subreq_progress(req->subreq, false);
}
if (ret < 0)
return ret;

Expand Down
9 changes: 7 additions & 2 deletions fs/afs/yfsclient.c
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
size_t count_before;
int ret;

_enter("{%u,%zu, %zu/%llu}",
Expand Down Expand Up @@ -391,10 +392,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)

/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
iov_iter_count(call->iter), req->actual_len);
count_before = call->iov_len;
_debug("extract data %zu/%llu", count_before, req->actual_len);

ret = afs_extract_data(call, true);
if (req->subreq) {
req->subreq->transferred += count_before - call->iov_len;
netfs_read_subreq_progress(req->subreq, false);
}
if (ret < 0)
return ret;

Expand Down
76 changes: 46 additions & 30 deletions fs/ceph/addr.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/netfs.h>
#include <trace/events/netfs.h>

#include "super.h"
#include "mds_client.h"
Expand Down Expand Up @@ -205,21 +206,6 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
}
}

static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
u32 xlen;

/* Truncate the extent at the end of the current block */
ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
&objno, &objoff, &xlen);
subreq->len = min(xlen, fsc->mount_options->rsize);
return true;
}

static void finish_netfs_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
Expand Down Expand Up @@ -264,7 +250,12 @@ static void finish_netfs_read(struct ceph_osd_request *req)
calc_pages_for(osd_data->alignment,
osd_data->length), false);
}
netfs_subreq_terminated(subreq, err, false);
if (err > 0) {
subreq->transferred = err;
err = 0;
}
trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
netfs_read_subreq_terminated(subreq, err, false);
iput(req->r_inode);
ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
Expand All @@ -278,7 +269,6 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
struct iov_iter iter;
ssize_t err = 0;
size_t len;
int mode;
Expand All @@ -301,6 +291,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
req->r_num_caps = 2;

trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0)
goto out;
Expand All @@ -314,17 +305,36 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
}

len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
if (err == 0)
err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter);
if (err == 0) {
err = -EFAULT;
} else {
subreq->transferred += err;
err = 0;
}

ceph_mdsc_put_request(req);
out:
netfs_subreq_terminated(subreq, err, false);
netfs_read_subreq_terminated(subreq, err, false);
return true;
}

static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
u64 objno, objoff;
u32 xlen;

/* Truncate the extent at the end of the current block */
ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
&objno, &objoff, &xlen);
rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize);
return 0;
}

static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
Expand All @@ -334,9 +344,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
int err = 0;
u64 len = subreq->len;
int err;
u64 len;
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 off = subreq->start;
int extent_cnt;
Expand All @@ -349,6 +358,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;

// TODO: This rounding here is slightly dodgy. It *should* work, for
// now, as the cache only deals in blocks that are a multiple of
// PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to
// happen is for the fscrypt driving to be moved into netfslib and the
// data in the cache also to be stored encrypted.
len = subreq->len;
ceph_fscrypt_adjust_off_and_len(inode, &off, &len);

req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
Expand All @@ -371,8 +386,6 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
ceph_vinop(inode), subreq->start, subreq->len, len);

iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);

/*
* FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
* encrypted inodes. We'd need infrastructure that handles an iov_iter
Expand All @@ -384,7 +397,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct page **pages;
size_t page_off;

err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
if (err < 0) {
doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
ceph_vinop(inode), err);
Expand All @@ -399,7 +412,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
false);
} else {
osd_req_op_extent_osd_iter(req, 0, &iter);
osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter);
}
if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
err = -EIO;
Expand All @@ -410,17 +423,19 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req->r_inode = inode;
ihold(inode);

trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
ceph_osdc_start_request(req->r_osdc, req);
out:
ceph_osdc_put_request(req);
if (err)
netfs_subreq_terminated(subreq, err, false);
netfs_read_subreq_terminated(subreq, err, false);
doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}

static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct inode *inode = rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = ceph_inode_to_client(inode);
int got = 0, want = CEPH_CAP_FILE_CACHE;
struct ceph_netfs_request_data *priv;
Expand Down Expand Up @@ -472,6 +487,7 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)

priv->caps = got;
rreq->netfs_priv = priv;
rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize;

out:
if (ret < 0)
Expand All @@ -496,9 +512,9 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
.free_request = ceph_netfs_free_request,
.prepare_read = ceph_netfs_prepare_read,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
};

Expand Down
4 changes: 3 additions & 1 deletion fs/netfs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ netfs-y := \
buffered_write.o \
direct_read.o \
direct_write.o \
io.o \
iterator.o \
locking.o \
main.o \
misc.o \
objects.o \
read_collect.o \
read_pgpriv2.o \
read_retry.o \
write_collect.o \
write_issue.o

Expand Down
Loading

0 comments on commit ee4cdf7

Please sign in to comment.