
Per cpu sk #1

Open: wants to merge 10 commits into base fastly310-stable
16 changes: 16 additions & 0 deletions Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
- dirty_writeback_centisecs
- drop_caches
- extfrag_threshold
+- extra_free_kbytes
- hugepages_treat_as_movable
- hugetlb_shm_group
- laptop_mode
@@ -198,6 +199,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.

==============================================================

+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low-latency memory allocations
+and have a bounded burstiness in memory allocations; for example, a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct
+reclaim-related latencies.
+
+==============================================================

hugepages_treat_as_movable

This parameter is only useful when kernelcore= is specified at boot time to
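The burst-sizing rule in the new documentation above maps directly onto a sysctl value. A minimal userspace sketch, assuming this patch is applied so that /proc/sys/vm/extra_free_kbytes exists; the 200MB burst is the figure from the example text, not a recommendation:

#include <stdio.h>

/* Size vm.extra_free_kbytes from a worst-case allocation burst. */
int main(void)
{
	long burst_bytes = 200L * 1024 * 1024;	/* 200MB burst from the doc example */
	long extra_kbytes = burst_bytes / 1024;	/* the sysctl is in kilobytes */
	FILE *f = fopen("/proc/sys/vm/extra_free_kbytes", "w");

	if (!f) {
		perror("extra_free_kbytes");
		return 1;
	}
	fprintf(f, "%ld\n", extra_kbytes);	/* writes 204800 */
	fclose(f);
	return 0;
}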
2 changes: 1 addition & 1 deletion include/linux/mmzone.h
@@ -891,7 +891,7 @@ static inline int is_dma(struct zone *zone)

/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
+int free_kbytes_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
2 changes: 2 additions & 0 deletions include/linux/swap.h
@@ -224,6 +224,8 @@ struct swap_list_t {
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalreserve_pages;
+extern int min_free_kbytes;
+extern int extra_free_kbytes;
extern unsigned long dirty_balance_reserve;
extern unsigned long nr_free_buffer_pages(void);
extern unsigned long nr_free_pagecache_pages(void);
1 change: 1 addition & 0 deletions include/net/tcp.h
@@ -287,6 +287,7 @@ extern int sysctl_tcp_thin_dupack;
extern int sysctl_tcp_early_retrans;
extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
+extern int sysctl_tcp_user_cwnd_max;

extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
5 changes: 3 additions & 2 deletions include/uapi/linux/tcp.h
@@ -105,12 +105,13 @@ enum {
#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/
#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */
#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */
-#define TCP_REPAIR 19 /* TCP sock is under repair right now */
+#define TCP_REPAIR 24 /* TCP sock is under repair right now */
#define TCP_REPAIR_QUEUE 20
#define TCP_QUEUE_SEQ 21
#define TCP_REPAIR_OPTIONS 22
#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
-#define TCP_TIMESTAMP 24
+#define TCP_TIMESTAMP 25
+#define TCP_CWND 19 /* Set congestion window */

struct tcp_repair_opt {
__u32 opt_code;
10 changes: 9 additions & 1 deletion kernel/sysctl.c
@@ -1262,9 +1262,17 @@ static struct ctl_table vm_table[] = {
.data = &min_free_kbytes,
.maxlen = sizeof(min_free_kbytes),
.mode = 0644,
-.proc_handler = min_free_kbytes_sysctl_handler,
+.proc_handler = free_kbytes_sysctl_handler,
.extra1 = &zero,
},
+{
+.procname = "extra_free_kbytes",
+.data = &extra_free_kbytes,
+.maxlen = sizeof(extra_free_kbytes),
+.mode = 0644,
+.proc_handler = free_kbytes_sysctl_handler,
+.extra1 = &zero,
+},
{
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
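Both vm_table entries now point at one shared handler. A sketch of that pattern, assuming the same shape as the handler shown later in this diff: proc_dointvec() parses the write into the bound integer, and a write then rebuilds the per-zone watermarks so the new value takes effect immediately.

/* Sketch: shared handler for min_free_kbytes and extra_free_kbytes.
 * proc_dointvec() stores the parsed value through .data; on a write,
 * the zone watermarks are recomputed from the updated globals. */
int free_kbytes_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, buffer, length, ppos);
	if (write)
		setup_per_zone_wmarks();
	return 0;
}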
39 changes: 29 additions & 10 deletions mm/page_alloc.c
@@ -196,8 +196,21 @@ static char * const zone_names[MAX_NR_ZONES] = {
"Movable",
};

+/*
+ * Try to keep at least this much lowmem free. Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
int min_free_kbytes = 1024;
+
+/*
+ * Extra memory for the system to try freeing between the min and
+ * low watermarks. Useful for workloads that require low-latency
+ * memory allocations in bursts larger than the normal gap between
+ * low and min.
+ */
+int extra_free_kbytes;

static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
@@ -5320,6 +5333,7 @@ static void setup_per_zone_lowmem_reserve(void)
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
@@ -5331,11 +5345,14 @@
}

for_each_zone(zone) {
-u64 tmp;
+u64 min, low;

spin_lock_irqsave(&zone->lock, flags);
-tmp = (u64)pages_min * zone->managed_pages;
-do_div(tmp, lowmem_pages);
+min = (u64)pages_min * zone->managed_pages;
+do_div(min, lowmem_pages);
+low = (u64)pages_low * zone->managed_pages;
+do_div(low, vm_total_pages);

if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5356,11 +5373,13 @@
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
-zone->watermark[WMARK_MIN] = tmp;
+zone->watermark[WMARK_MIN] = min;
}

-zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
-zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
+	low + (min >> 2);
+zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+	low + (min >> 1);

setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -5471,11 +5490,11 @@ int __meminit init_per_zone_wmark_min(void)
module_init(init_per_zone_wmark_min)

/*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
- * that we can call two helper functions whenever min_free_kbytes
- * changes.
+ * free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
+ * that we can call two helper functions whenever min_free_kbytes
+ * or extra_free_kbytes changes.
*/
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+int free_kbytes_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, buffer, length, ppos);
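The effect of the new pages_low term is easiest to see with numbers. A standalone sketch of the watermark arithmetic from __setup_per_zone_wmarks() above, simplified to a single zone that owns all of lowmem so the per-zone do_div() scaling drops out; the values are illustrative:

#include <stdio.h>

int main(void)
{
	unsigned long page_kb = 4;			/* 4KB pages */
	unsigned long min_free_kbytes = 1024;		/* default from this patch */
	unsigned long extra_free_kbytes = 200 * 1024;	/* 200MB, the doc example */

	unsigned long min = min_free_kbytes / page_kb;	/* pages_min */
	unsigned long low = extra_free_kbytes / page_kb;	/* pages_low */

	unsigned long wmark_min = min;
	unsigned long wmark_low = wmark_min + low + (min >> 2);
	unsigned long wmark_high = wmark_min + low + (min >> 1);

	/* prints: min=256 low=51520 high=51584 (in pages) */
	printf("min=%lu low=%lu high=%lu\n", wmark_min, wmark_low, wmark_high);
	return 0;
}

Direct reclaim still starts below the min watermark, but kswapd now wakes at min + extra and keeps reclaiming until min + extra + min/2, giving bursty allocators the extra headroom.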
21 changes: 17 additions & 4 deletions net/ipv4/inet_hashtables.c
@@ -185,7 +185,8 @@ struct sock *__inet_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore, matches = 0, reuseport = 0;
-u32 phash = 0;
+// u32 phash = 0;
+int curr_cpu = smp_processor_id();

rcu_read_lock();
begin:
@@ -198,15 +199,27 @@ struct sock *__inet_lookup_listener(struct net *net,
hiscore = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
-phash = inet_ehashfn(net, daddr, hnum,
+// matches++;
+
+/* phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
-matches = 1;
+matches = 1; */
}
} else if (score == hiscore && reuseport) {
-matches++;
+
+/* walks the sks and finds the one corresponding to our cpu;
+   it is critical that each RSS queue is bound to a specific cpu
+ */
+// pr_info("Matching sk %p match %d to cpu %d\n", sk, matches, curr_cpu);
+if (matches++ == curr_cpu) {
+	result = sk;
+}
+
+/*
+if (((u64)phash * matches) >> 32 == 0)
+	result = sk;
+phash = next_pseudo_random32(phash);
+*/
}
}
/*
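The new lookup returns the listener whose position in the hash chain matches the current CPU, so userspace is expected to create one SO_REUSEPORT listener per CPU and pin each RSS queue (and the accepting thread) to its CPU. A minimal sketch of that userspace side; the assumption that creation order lines up with chain position is exactly the coupling this patch relies on, and CPU pinning is omitted here:

#include <netinet/in.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Create one SO_REUSEPORT listener; call once per CPU, then accept()
 * on each fd from a thread pinned to the matching CPU. */
static int make_listener(uint16_t port)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}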
7 changes: 7 additions & 0 deletions net/ipv4/sysctl_net_ipv4.c
@@ -722,6 +722,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_allowed_congestion_control,
},
+{
+.procname = "tcp_user_cwnd_max",
+.data = &sysctl_tcp_user_cwnd_max,
+.maxlen = sizeof(int),
+.mode = 0644,
+.proc_handler = proc_dointvec,
+},
{
.procname = "tcp_max_ssthresh",
.data = &sysctl_tcp_max_ssthresh,
18 changes: 18 additions & 0 deletions net/ipv4/tcp.c
@@ -2602,6 +2602,24 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
}
break;

+case TCP_CWND:
+	if (sysctl_tcp_user_cwnd_max <= 0)
+		err = -EPERM;
+	else if (val > 0 && sk->sk_state == TCP_ESTABLISHED &&
+		 icsk->icsk_ca_state == TCP_CA_Open) {
+		u32 cwnd = val;
+		cwnd = min(cwnd, (u32)sysctl_tcp_user_cwnd_max);
+		cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+		if (tp->snd_cwnd != cwnd) {
+			tp->snd_cwnd = cwnd;
+			tp->snd_cwnd_stamp = tcp_time_stamp;
+			tp->snd_cwnd_cnt = 0;
+		}
+	} else
+		err = -EINVAL;
+	break;
+
#ifdef CONFIG_TCP_MD5SIG
case TCP_MD5SIG:
/* Read the IP->Key mappings from userspace */
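On the userspace side, the new option only takes effect once net.ipv4.tcp_user_cwnd_max is raised from its default of 0; otherwise the kernel returns EPERM, per the hunk above. A minimal sketch; TCP_CWND is the value this patch claims in uapi/linux/tcp.h, guarded here in case the installed headers predate it:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef TCP_CWND
#define TCP_CWND 19	/* from this patch's uapi/linux/tcp.h */
#endif

/* Pin the congestion window of an ESTABLISHED socket in TCP_CA_Open;
 * the kernel clamps the value to min(tcp_user_cwnd_max, snd_cwnd_clamp). */
static int set_cwnd(int fd, int cwnd_packets)
{
	if (setsockopt(fd, IPPROTO_TCP, TCP_CWND,
		       &cwnd_packets, sizeof(cwnd_packets)) < 0) {
		perror("TCP_CWND");
		return -1;
	}
	return 0;
}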
17 changes: 15 additions & 2 deletions net/ipv4/tcp_input.c
@@ -2773,7 +2773,20 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
/* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
-WARN_ON(tp->retrans_out != 0);
+if (WARN_ON(tp->retrans_out != 0)) {
+	printk(KERN_DEBUG "%pI4:%u F0x%x S%u s%d IF%u+%u-%u-%u "
+	       "f%u ur%u rr%u rt%u um%u hs%u nxt%u\n",
+	       &inet_sk(sk)->inet_daddr,
+	       ntohs(inet_sk(sk)->inet_dport),
+	       flag, sk->sk_state, tp->rx_opt.sack_ok,
+	       tp->packets_out, tp->retrans_out,
+	       tp->sacked_out, tp->lost_out,
+	       tp->frto, tp->undo_retrans,
+	       tp->reordering, icsk->icsk_retransmits,
+	       tp->undo_marker ? tp->undo_marker - tp->snd_una : 0,
+	       tp->high_seq - tp->snd_una,
+	       tp->snd_nxt - tp->snd_una);
+}
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
@@ -3314,7 +3327,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
tcp_init_cwnd_reduction(sk, true);
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
-tcp_set_ca_state(sk, TCP_CA_Open);
+tcp_try_keep_open(sk);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPLOSSPROBERECOVERY);
}
2 changes: 2 additions & 0 deletions net/ipv4/tcp_output.c
@@ -65,6 +65,8 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;

+int sysctl_tcp_user_cwnd_max __read_mostly;
+
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
