Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xsk: i40e: Tx performance improvements #320

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
sudo: required
language: bash
dist: bionic
services:
- docker

env:
global:
- PROJECT_NAME='libbpf'
- AUTHOR_EMAIL="$(git log -1 --pretty=\"%aE\")"
- REPO_ROOT="$TRAVIS_BUILD_DIR"
- CI_ROOT="$REPO_ROOT/travis-ci"
- VMTEST_ROOT="$CI_ROOT/vmtest"

addons:
apt:
packages:
- qemu-kvm
- zstd
- binutils-dev
- elfutils
- libcap-dev
- libelf-dev
- libdw-dev
- python3-docutils

jobs:
include:
- stage: Builds & Tests
name: Kernel LATEST + selftests
language: bash
env: KERNEL=LATEST
script: $CI_ROOT/vmtest/run_vmtest.sh || travis_terminate 1
11 changes: 11 additions & 0 deletions drivers/net/ethernet/intel/i40e/i40e_txrx.c
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,8 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring)
i40e_clean_tx_ring(tx_ring);
kfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
kfree(tx_ring->xsk_descs);
tx_ring->xsk_descs = NULL;

if (tx_ring->desc) {
dma_free_coherent(tx_ring->dev, tx_ring->size,
Expand Down Expand Up @@ -1277,6 +1279,13 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
if (!tx_ring->tx_bi)
goto err;

if (ring_is_xdp(tx_ring)) {
tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs),
GFP_KERNEL);
if (!tx_ring->xsk_descs)
goto err;
}

u64_stats_init(&tx_ring->syncp);

/* round up to nearest 4K */
Expand All @@ -1300,6 +1309,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
return 0;

err:
kfree(tx_ring->xsk_descs);
tx_ring->xsk_descs = NULL;
kfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
return -ENOMEM;
Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/intel/i40e/i40e_txrx.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ struct i40e_ring {
struct i40e_channel *ch;
struct xdp_rxq_info xdp_rxq;
struct xsk_buff_pool *xsk_pool;
struct xdp_desc *xsk_descs; /* For storing descriptors in the AF_XDP ZC path */
} ____cacheline_internodealigned_in_smp;

static inline bool ring_uses_build_skb(struct i40e_ring *ring)
Expand Down
131 changes: 92 additions & 39 deletions drivers/net/ethernet/intel/i40e/i40e_xsk.c
Original file line number Diff line number Diff line change
Expand Up @@ -381,58 +381,111 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
return failure ? budget : (int)total_rx_packets;
}

/**
* i40e_xmit_zc - Performs zero-copy Tx AF_XDP
* @xdp_ring: XDP Tx ring
* @budget: NAPI budget
*
* Returns true if the work is finished.
**/
static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
unsigned int *total_bytes)
{
unsigned int sent_frames = 0, total_bytes = 0;
struct i40e_tx_desc *tx_desc = NULL;
struct i40e_tx_buffer *tx_bi;
struct xdp_desc desc;
struct i40e_tx_desc *tx_desc;
dma_addr_t dma;

while (budget-- > 0) {
if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
break;
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);

dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
desc.len);
tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
0, desc->len, 0);

tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
*total_bytes += desc->len;
}

tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz =
build_ctob(I40E_TX_DESC_CMD_ICRC
| I40E_TX_DESC_CMD_EOP,
0, desc.len, 0);
/* This value should match the pragma below. Why 4? It is strictly
* empirical. It seems to be a good compromise between the advantage
* of having simultaneous outstanding reads to the DMA array that can
* hide each others latency and the disadvantage of having a larger
* code path.
*/
#define PKTS_PER_BATCH 4

static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
unsigned int *total_bytes)
{
u16 ntu = xdp_ring->next_to_use;
struct i40e_tx_desc *tx_desc;
dma_addr_t dma;
u32 i;

sent_frames++;
total_bytes += tx_bi->bytecount;
#pragma GCC unroll 4
for (i = 0; i < PKTS_PER_BATCH; i++) {
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);

xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
xdp_ring->next_to_use = 0;
tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
I40E_TX_DESC_CMD_EOP,
0, desc[i].len, 0);

*total_bytes += desc[i].len;
}

if (tx_desc) {
/* Request an interrupt for the last frame and bump tail ptr. */
tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
I40E_TXD_QW1_CMD_SHIFT);
i40e_xdp_ring_update_tail(xdp_ring);
xdp_ring->next_to_use = ntu;
}

xsk_tx_release(xdp_ring->xsk_pool);
i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
unsigned int *total_bytes)
{
u32 batched, leftover, i;

batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
leftover = nb_pkts & (PKTS_PER_BATCH - 1);
for (i = 0; i < batched; i += PKTS_PER_BATCH)
i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
for (i = batched; i < batched + leftover; i++)
i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
}

static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
{
u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1;
struct i40e_tx_desc *tx_desc;

tx_desc = I40E_TX_DESC(xdp_ring, ntu);
tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
}

/**
* i40e_xmit_zc - Performs zero-copy Tx AF_XDP
* @xdp_ring: XDP Tx ring
* @budget: NAPI budget
*
* Returns true if the work is finished.
**/
static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
{
struct xdp_desc *descs = xdp_ring->xsk_descs;
u32 nb_pkts, nb_processed = 0;
unsigned int total_bytes = 0;

nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
if (!nb_pkts)
return false;

if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
nb_processed = xdp_ring->count - xdp_ring->next_to_use;
i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
xdp_ring->next_to_use = 0;
}

return !!budget;
i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
&total_bytes);

/* Request an interrupt for the last frame and bump tail ptr. */
i40e_set_rs_bit(xdp_ring);
i40e_xdp_ring_update_tail(xdp_ring);

i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes);

return true;
}

/**
Expand Down
7 changes: 7 additions & 0 deletions include/net/xdp_sock_drv.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max);
void xsk_tx_release(struct xsk_buff_pool *pool);
struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
u16 queue_id);
Expand Down Expand Up @@ -128,6 +129,12 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool,
return false;
}

static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc,
u32 max)
{
return 0;
}

static inline void xsk_tx_release(struct xsk_buff_pool *pool)
{
}
Expand Down
57 changes: 57 additions & 0 deletions net/xdp/xsk.c
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,63 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
u32 max_entries)
{
u32 nb_pkts = 0;

while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
nb_pkts++;

xsk_tx_release(pool);
return nb_pkts;
}

u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
u32 max_entries)
{
struct xdp_sock *xs;
u32 nb_pkts;

rcu_read_lock();
if (!list_is_singular(&pool->xsk_tx_list)) {
/* Fallback to the non-batched version */
rcu_read_unlock();
return xsk_tx_peek_release_fallback(pool, descs, max_entries);
}

xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
if (!xs) {
nb_pkts = 0;
goto out;
}

nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
if (!nb_pkts) {
xs->tx->queue_empty_descs++;
goto out;
}

/* This is the backpressure mechanism for the Tx path. Try to
* reserve space in the completion queue for all packets, but
* if there are fewer slots available, just process that many
* packets. This avoids having to implement any buffering in
* the Tx path.
*/
nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
if (!nb_pkts)
goto out;

xskq_cons_release_n(xs->tx, nb_pkts);
__xskq_cons_release(xs->tx);
xs->sk.sk_write_space(&xs->sk);

out:
rcu_read_unlock();
return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
struct net_device *dev = xs->dev;
Expand Down
Loading