From 5577d2a2feef63163d43e53c922da595a0d1fef2 Mon Sep 17 00:00:00 2001
From: Alexander Ostapenko
Date: Thu, 27 Sep 2018 13:18:59 +0700
Subject: [PATCH 1/5] Fix #806: Reduce the number of IPIs raised during
 work_queue processing.

---
 tempesta_fw/cache.c      | 17 ++++++++++++++---
 tempesta_fw/sock.c       | 35 ++++++++++++++++++++++++++---------
 tempesta_fw/work_queue.c |  1 +
 tempesta_fw/work_queue.h | 27 +++++++++++++++++----------
 4 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/tempesta_fw/cache.c b/tempesta_fw/cache.c
index 5efb103ce7..307dc51acf 100644
--- a/tempesta_fw/cache.c
+++ b/tempesta_fw/cache.c
@@ -1673,7 +1673,7 @@ static void
 tfw_cache_ipi(struct irq_work *work)
 {
 	TfwWorkTasklet *ct = container_of(work, TfwWorkTasklet, ipi_work);
-
+	clear_bit(TFW_QUEUE_B_IPI, &ct->wq.flags);
 	tasklet_schedule(&ct->tasklet);
 }
 
@@ -1726,7 +1726,6 @@ tfw_cache_process(TfwHttpMsg *msg, tfw_http_cache_cb_t action)
 
 	TFW_DBG2("Cache: schedule tasklet w/ work: to_cpu=%d from_cpu=%d"
 		 " msg=%p key=%lx\n", cpu, smp_processor_id(), cw.msg, key);
-
 	if (tfw_wq_push(&ct->wq, &cw, cpu, &ct->ipi_work, tfw_cache_ipi)) {
 		TFW_WARN("Cache work queue overrun: [%s]\n",
 			 resp ? "response" : "request");
@@ -1743,10 +1742,22 @@ static void
 tfw_wq_tasklet(unsigned long data)
 {
 	TfwWorkTasklet *ct = (TfwWorkTasklet *)data;
+	TfwRBQueue *wq = &ct->wq;
 	TfwCWork cw;
 
-	while (!tfw_wq_pop(&ct->wq, &cw))
+	while (!tfw_wq_pop(wq, &cw))
 		tfw_cache_do_action(cw.msg, cw.action);
+
+	set_bit(TFW_QUEUE_B_IPI, &wq->flags);
+
+	smp_mb__after_atomic();
+
+	if (!tfw_wq_size(wq))
+		return;
+
+	clear_bit(TFW_QUEUE_B_IPI, &wq->flags);
+
+	tasklet_schedule(&ct->tasklet);
 }
 
 /**
diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c
index 8e89c84ee7..284814cae0 100644
--- a/tempesta_fw/sock.c
+++ b/tempesta_fw/sock.c
@@ -187,6 +187,8 @@ do {						\
 static void
 ss_ipi(struct irq_work *work)
 {
+	TfwRBQueue *wq = &per_cpu(si_wq, smp_processor_id());
+	clear_bit(TFW_QUEUE_B_IPI, &wq->flags);
 	raise_softirq(NET_TX_SOFTIRQ);
 }
 
@@ -200,6 +202,7 @@ ss_turnstile_push(long ticket, SsWork *sw, int cpu)
 {
 	struct irq_work *iw = &per_cpu(ipi_work, cpu);
 	SsCloseBacklog *cb = &per_cpu(close_backlog, cpu);
+	TfwRBQueue *wq = &per_cpu(si_wq, cpu);
 	SsCblNode *cn;
 
 	cn = kmem_cache_alloc(ss_cbacklog_cache, GFP_ATOMIC);
@@ -214,7 +217,8 @@ ss_turnstile_push(long ticket, SsWork *sw, int cpu)
 		cb->turn = ticket;
 	spin_unlock_bh(&cb->lock);
 
-	tfw_raise_softirq(cpu, iw, ss_ipi);
+	if (test_bit(TFW_QUEUE_B_IPI, &wq->flags))
+		tfw_raise_softirq(cpu, iw, ss_ipi);
 
 	return 0;
 }
@@ -254,9 +258,8 @@ ss_wq_push(SsWork *sw, int cpu)
 }
 
 static int
-ss_wq_pop(SsWork *sw, long *ticket)
+ss_wq_pop(TfwRBQueue *wq, SsWork *sw, long *ticket)
 {
-	TfwRBQueue *wq = this_cpu_ptr(&si_wq);
 	SsCloseBacklog *cb = this_cpu_ptr(&close_backlog);
 
 	/*
@@ -1308,9 +1311,10 @@ static void
 ss_tx_action(void)
 {
 	SsWork sw;
+	int budget;
 	struct sk_buff *skb;
+	TfwRBQueue *wq = this_cpu_ptr(&si_wq);
 	long ticket = 0;
-	int budget;
 
 	/*
	 * @budget limits the loop to prevent live lock on constantly arriving
 	 * new items. We use some small integer as a lower bound to catch just
 	 * arriving items.
 	 */
 	budget = max(10UL, ss_close_q_sz());
-	while ((!ss_active() || budget--) && !ss_wq_pop(&sw, &ticket)) {
+	while ((!ss_active() || budget--) && !ss_wq_pop(wq, &sw, &ticket)) {
 		struct sock *sk = sw.sk;
 
 		bh_lock_sock(sk);
@@ -1358,11 +1362,24 @@ ss_tx_action(void)
 
 	/*
 	 * Rearm softirq for local CPU if there are more jobs to do.
-	 * ss_synchronize() is responsible for raising the softirq if there are
-	 * more jobs in the work queue or the backlog.
+	 * If all jobs are finished and the work queue and backlog
+	 * are empty, then re-enable IPI generation by producers
+	 * (it is disabled in the 'ss_ipi()' handler).
+	 * ss_synchronize() is responsible for raising the softirq
+	 * if there are more jobs in the work queue or the backlog.
 	 */
-	if (!budget)
-		raise_softirq(NET_TX_SOFTIRQ);
+	if (budget) {
+		set_bit(TFW_QUEUE_B_IPI, &wq->flags);
+
+		smp_mb__after_atomic();
+
+		if (!ss_close_q_sz())
+			return;
+
+		clear_bit(TFW_QUEUE_B_IPI, &wq->flags);
+	}
+
+	raise_softirq(NET_TX_SOFTIRQ);
 }
 
 /*
diff --git a/tempesta_fw/work_queue.c b/tempesta_fw/work_queue.c
index ba40344d18..f9e857d412 100644
--- a/tempesta_fw/work_queue.c
+++ b/tempesta_fw/work_queue.c
@@ -49,6 +49,7 @@ tfw_wq_init(TfwRBQueue *q, int node)
 	q->last_head = 0;
 	atomic64_set(&q->head, 0);
 	atomic64_set(&q->tail, 0);
+	set_bit(TFW_QUEUE_B_IPI, &q->flags);
 
 	q->array = kmalloc_node(QSZ * WQ_ITEM_SZ, GFP_KERNEL, node);
 	if (!q->array) {
diff --git a/tempesta_fw/work_queue.h b/tempesta_fw/work_queue.h
index f5a073af8f..4f07ad8377 100644
--- a/tempesta_fw/work_queue.h
+++ b/tempesta_fw/work_queue.h
@@ -38,13 +38,28 @@ typedef struct {
 	long		last_head;
 	atomic64_t	head ____cacheline_aligned;
 	atomic64_t	tail ____cacheline_aligned;
+	unsigned long	flags;
 } TfwRBQueue;
 
+enum {
+	/* Enable IPI generation. */
+	TFW_QUEUE_B_IPI = 0
+};
+
 int tfw_wq_init(TfwRBQueue *wq, int node);
 void tfw_wq_destroy(TfwRBQueue *wq);
 long __tfw_wq_push(TfwRBQueue *wq, void *ptr);
 int tfw_wq_pop_ticket(TfwRBQueue *wq, void *buf, long *ticket);
 
+static inline int
+tfw_wq_size(TfwRBQueue *q)
+{
+	long t = atomic64_read(&q->tail);
+	long h = atomic64_read(&q->head);
+
+	return t > h ? 0 : h - t;
+}
+
 static inline void
 tfw_raise_softirq(int cpu, struct irq_work *work,
 		  void (*local_cpu_cb)(struct irq_work *))
@@ -63,7 +78,8 @@ tfw_wq_push(TfwRBQueue *q, void *ptr, int cpu, struct irq_work *work,
 	if (unlikely(ticket))
 		return ticket;
 
-	tfw_raise_softirq(cpu, work, local_cpu_cb);
+	if (test_bit(TFW_QUEUE_B_IPI, &q->flags))
+		tfw_raise_softirq(cpu, work, local_cpu_cb);
 
 	return 0;
 }
@@ -74,13 +90,4 @@ tfw_wq_pop(TfwRBQueue *wq, void *buf)
 	return tfw_wq_pop_ticket(wq, buf, NULL);
 }
 
-static inline int
-tfw_wq_size(TfwRBQueue *q)
-{
-	long t = atomic64_read(&q->tail);
-	long h = atomic64_read(&q->head);
-
-	return t > h ? 0 : h - t;
-}
-
 #endif /* __TFW_WORK_QUEUE_H__ */

From c39e8d7762a3840c886e398ff4837cc8457fc1a4 Mon Sep 17 00:00:00 2001
From: Alexander Ostapenko
Date: Fri, 28 Sep 2018 20:37:02 +0700
Subject: [PATCH 2/5] Correction for work_queue backlog (#1066).

---
 tempesta_fw/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c
index 284814cae0..fea0fdcd0c 100644
--- a/tempesta_fw/sock.c
+++ b/tempesta_fw/sock.c
@@ -211,7 +211,7 @@ ss_turnstile_push(long ticket, SsWork *sw, int cpu)
 	cn->ticket = ticket;
 	memcpy(&cn->sw, sw, sizeof(*sw));
 	spin_lock_bh(&cb->lock);
-	list_add(&cn->list, &cb->head);
+	list_add_tail(&cn->list, &cb->head);
 	cb->size++;
 	if (cb->turn > ticket)
 		cb->turn = ticket;

From a82e10e919460fbd8c30ac689af151796166707d Mon Sep 17 00:00:00 2001
From: Alexander Ostapenko
Date: Wed, 10 Oct 2018 20:32:05 +0700
Subject: [PATCH 3/5] Additional changes according to review comments (#806).
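
The handshake is subtle, so here is a sketch of the interleaving that
the size re-check in the new TFW_WQ_IPI_SYNC() macro closes (CPU0 runs
a producer in tfw_wq_push(), CPU1 runs the consumer tasklet; an
illustrative trace only, not an exhaustive proof):

	CPU0 (producer)			CPU1 (consumer)
	__tfw_wq_push(q, ptr)		tfw_wq_pop() -> queue empty
	test_bit() -> IPI disabled,
	so no softirq is raised
					set_bit(TFW_QUEUE_B_IPI)
					smp_mb__after_atomic()
					size_cb(wq) -> non-zero:
					clear the bit and reschedule

Without the re-check, the consumer could go to sleep with one item
stranded in the queue and no IPI ever coming for it; with the paired
barriers, either the producer sees the bit set and raises the softirq,
or the consumer sees the non-empty queue and reschedules itself.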

---
 tempesta_fw/cache.c      |  9 +--------
 tempesta_fw/sock.c       | 28 +++++++++++++---------------
 tempesta_fw/work_queue.h | 14 ++++++++++++++
 3 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/tempesta_fw/cache.c b/tempesta_fw/cache.c
index 307dc51acf..8c1cd410fb 100644
--- a/tempesta_fw/cache.c
+++ b/tempesta_fw/cache.c
@@ -1748,14 +1748,7 @@ tfw_wq_tasklet(unsigned long data)
 	while (!tfw_wq_pop(wq, &cw))
 		tfw_cache_do_action(cw.msg, cw.action);
 
-	set_bit(TFW_QUEUE_B_IPI, &wq->flags);
-
-	smp_mb__after_atomic();
-
-	if (!tfw_wq_size(wq))
-		return;
-
-	clear_bit(TFW_QUEUE_B_IPI, &wq->flags);
+	TFW_WQ_IPI_SYNC(tfw_wq_size, wq);
 
 	tasklet_schedule(&ct->tasklet);
 }
diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c
index fea0fdcd0c..3f10f7e401 100644
--- a/tempesta_fw/sock.c
+++ b/tempesta_fw/sock.c
@@ -217,6 +217,10 @@ ss_turnstile_push(long ticket, SsWork *sw, int cpu)
 		cb->turn = ticket;
 	spin_unlock_bh(&cb->lock);
 
+	/*
+	 * We do not need explicit memory barriers after
+	 * the spinlock operations.
+	 */
 	if (test_bit(TFW_QUEUE_B_IPI, &wq->flags))
 		tfw_raise_softirq(cpu, iw, ss_ipi);
 
@@ -299,7 +303,7 @@ ss_wq_pop(TfwRBQueue *wq, SsWork *sw, long *ticket)
 }
 
 static size_t
-__ss_close_q_sz(int cpu)
+ss_wq_size(int cpu)
 {
 	TfwRBQueue *wq = &per_cpu(si_wq, cpu);
 	SsCloseBacklog *cb = &per_cpu(close_backlog, cpu);
@@ -308,9 +312,11 @@ static size_t
-ss_close_q_sz(void)
+ss_wq_local_size(TfwRBQueue *wq)
 {
-	return __ss_close_q_sz(smp_processor_id());
+	SsCloseBacklog *cb = this_cpu_ptr(&close_backlog);
+
+	return tfw_wq_size(wq) + cb->size;
 }
 
 /*
@@ -1321,7 +1327,7 @@ ss_tx_action(void)
 	 * new items. We use some small integer as a lower bound to catch just
 	 * arriving items.
 	 */
-	budget = max(10UL, ss_close_q_sz());
+	budget = max(10UL, ss_wq_local_size(wq));
 	while ((!ss_active() || budget--) && !ss_wq_pop(wq, &sw, &ticket)) {
 		struct sock *sk = sw.sk;
 
@@ -1368,16 +1374,8 @@ ss_tx_action(void)
 	 * ss_synchronize() is responsible for raising the softirq
 	 * if there are more jobs in the work queue or the backlog.
 	 */
-	if (budget) {
-		set_bit(TFW_QUEUE_B_IPI, &wq->flags);
-
-		smp_mb__after_atomic();
-
-		if (!ss_close_q_sz())
-			return;
-
-		clear_bit(TFW_QUEUE_B_IPI, &wq->flags);
-	}
+	if (budget)
+		TFW_WQ_IPI_SYNC(ss_wq_local_size, wq);
 
 	raise_softirq(NET_TX_SOFTIRQ);
 }
@@ -1456,7 +1454,7 @@ ss_synchronize(void)
 	while (1) {
 		for_each_online_cpu(cpu) {
 			int n_conn = atomic64_read(&per_cpu(__ss_act_cnt, cpu));
-			int n_q = __ss_close_q_sz(cpu);
+			int n_q = ss_wq_size(cpu);
 			if (n_conn + n_q) {
 				irq_work_sync(&per_cpu(ipi_work, cpu));
 				schedule(); /* let softirq finish works */
diff --git a/tempesta_fw/work_queue.h b/tempesta_fw/work_queue.h
index 4f07ad8377..e4ce9d2970 100644
--- a/tempesta_fw/work_queue.h
+++ b/tempesta_fw/work_queue.h
@@ -46,6 +46,15 @@ enum {
 	TFW_QUEUE_B_IPI = 0
 };
 
+#define TFW_WQ_IPI_SYNC(size_cb, wq)				\
+do {								\
+	set_bit(TFW_QUEUE_B_IPI, &(wq)->flags);			\
+	smp_mb__after_atomic();					\
+	if (!size_cb(wq))					\
+		return;						\
+	clear_bit(TFW_QUEUE_B_IPI, &(wq)->flags);		\
+} while (0)
+
 int tfw_wq_init(TfwRBQueue *wq, int node);
 void tfw_wq_destroy(TfwRBQueue *wq);
 long __tfw_wq_push(TfwRBQueue *wq, void *ptr);
@@ -77,6 +86,11 @@ tfw_wq_push(TfwRBQueue *q, void *ptr, int cpu, struct irq_work *work,
 	long ticket = __tfw_wq_push(q, ptr);
 	if (unlikely(ticket))
 		return ticket;
+	/*
+	 * The atomic operation is 'atomic64_cmpxchg()' in
+	 * '__tfw_wq_push()' above.
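+	 * It also pairs with TFW_WQ_IPI_SYNC() on the consumer side:
+	 * either this producer observes TFW_QUEUE_B_IPI set and
+	 * raises the softirq, or the consumer observes the non-empty
+	 * queue and reschedules itself, so the wakeup cannot be lost.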
+	 */
+	smp_mb__after_atomic();
 
 	if (test_bit(TFW_QUEUE_B_IPI, &q->flags))
 		tfw_raise_softirq(cpu, work, local_cpu_cb);
 
 	return 0;

From aac45fc234d47b7534cd0214b56b8b49d88dabe0 Mon Sep 17 00:00:00 2001
From: Alexander Ostapenko
Date: Thu, 11 Oct 2018 20:50:48 +0700
Subject: [PATCH 4/5] Add TODO regarding timer callback deactivation and
 memory barriers.

---
 tempesta_fw/http_sched_ratio.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tempesta_fw/http_sched_ratio.c b/tempesta_fw/http_sched_ratio.c
index b5228407e1..8991d36efe 100644
--- a/tempesta_fw/http_sched_ratio.c
+++ b/tempesta_fw/http_sched_ratio.c
@@ -1011,8 +1011,14 @@ tfw_sched_ratio_del_grp(TfwSrvGroup *sg)
 	if (!ratio)
 		return;
 	/*
-	 * Make sure the timer doesn't re-arms itself. This
-	 * also ensures that no more RCU callbacks are created.
+	 * Make sure the timer doesn't re-arm itself. This also ensures
+	 * that no more RCU callbacks are created.
+	 *
+	 * TODO: check whether the memory barriers are redundant here
+	 * (and in several similar places, as well as in the
+	 * corresponding timer callbacks); also, 'del_timer_sync()'
+	 * seems to handle re-arming of the timer from its callback
+	 * correctly, so perhaps we don't need the 'rearm' flag at all.
 	 */
 	if (sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC
 			 | TFW_SG_F_SCHED_RATIO_PREDICT))

From 914bcca13f4b77c391d2f0fbcc31182cb32a0322 Mon Sep 17 00:00:00 2001
From: Alexander Ostapenko
Date: Mon, 15 Oct 2018 17:56:57 +0700
Subject: [PATCH 5/5] Changes according to comments (#806).

---
 tempesta_fw/cache.c      | 2 +-
 tempesta_fw/sock.c       | 4 ++--
 tempesta_fw/work_queue.c | 2 +-
 tempesta_fw/work_queue.h | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tempesta_fw/cache.c b/tempesta_fw/cache.c
index 8c1cd410fb..9fb9cfdfcb 100644
--- a/tempesta_fw/cache.c
+++ b/tempesta_fw/cache.c
@@ -1673,7 +1673,7 @@ static void
 tfw_cache_ipi(struct irq_work *work)
 {
 	TfwWorkTasklet *ct = container_of(work, TfwWorkTasklet, ipi_work);
-	clear_bit(TFW_QUEUE_B_IPI, &ct->wq.flags);
+	clear_bit(TFW_QUEUE_IPI, &ct->wq.flags);
 	tasklet_schedule(&ct->tasklet);
 }
 
diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c
index 3f10f7e401..41b6b50b9f 100644
--- a/tempesta_fw/sock.c
+++ b/tempesta_fw/sock.c
@@ -188,7 +188,7 @@ static void
 ss_ipi(struct irq_work *work)
 {
 	TfwRBQueue *wq = &per_cpu(si_wq, smp_processor_id());
-	clear_bit(TFW_QUEUE_B_IPI, &wq->flags);
+	clear_bit(TFW_QUEUE_IPI, &wq->flags);
 	raise_softirq(NET_TX_SOFTIRQ);
 }
 
@@ -221,7 +221,7 @@ ss_turnstile_push(long ticket, SsWork *sw, int cpu)
 	 * We do not need explicit memory barriers after
 	 * the spinlock operations.
 	 */
-	if (test_bit(TFW_QUEUE_B_IPI, &wq->flags))
+	if (test_bit(TFW_QUEUE_IPI, &wq->flags))
 		tfw_raise_softirq(cpu, iw, ss_ipi);
 
 	return 0;
diff --git a/tempesta_fw/work_queue.c b/tempesta_fw/work_queue.c
index f9e857d412..67b5752f7b 100644
--- a/tempesta_fw/work_queue.c
+++ b/tempesta_fw/work_queue.c
@@ -49,7 +49,7 @@ tfw_wq_init(TfwRBQueue *q, int node)
 	q->last_head = 0;
 	atomic64_set(&q->head, 0);
 	atomic64_set(&q->tail, 0);
-	set_bit(TFW_QUEUE_B_IPI, &q->flags);
+	set_bit(TFW_QUEUE_IPI, &q->flags);
 
 	q->array = kmalloc_node(QSZ * WQ_ITEM_SZ, GFP_KERNEL, node);
 	if (!q->array) {
diff --git a/tempesta_fw/work_queue.h b/tempesta_fw/work_queue.h
index e4ce9d2970..df072c9a71 100644
--- a/tempesta_fw/work_queue.h
+++ b/tempesta_fw/work_queue.h
@@ -43,16 +43,16 @@ typedef struct {
 
 enum {
 	/* Enable IPI generation. */
-	TFW_QUEUE_B_IPI = 0
+	TFW_QUEUE_IPI = 0
 };
 
 #define TFW_WQ_IPI_SYNC(size_cb, wq)				\
 do {								\
-	set_bit(TFW_QUEUE_B_IPI, &(wq)->flags);			\
+	set_bit(TFW_QUEUE_IPI, &(wq)->flags);			\
 	smp_mb__after_atomic();					\
 	if (!size_cb(wq))					\
 		return;						\
-	clear_bit(TFW_QUEUE_B_IPI, &(wq)->flags);		\
+	clear_bit(TFW_QUEUE_IPI, &(wq)->flags);			\
 } while (0)
 
 int tfw_wq_init(TfwRBQueue *wq, int node);
 void tfw_wq_destroy(TfwRBQueue *wq);
 long __tfw_wq_push(TfwRBQueue *wq, void *ptr);
@@ -92,7 +92,7 @@ tfw_wq_push(TfwRBQueue *q, void *ptr, int cpu, struct irq_work *work,
 	 */
 	smp_mb__after_atomic();
 
-	if (test_bit(TFW_QUEUE_B_IPI, &q->flags))
+	if (test_bit(TFW_QUEUE_IPI, &q->flags))
 		tfw_raise_softirq(cpu, work, local_cpu_cb);
 
 	return 0;
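
For review purposes, the producer/consumer handshake used throughout
this series can be modelled in userspace. Below is a minimal compilable
sketch (an illustration, not Tempesta code: C11 seq_cst atomics stand
in for the kernel's set_bit()/test_bit() and smp_mb__after_atomic(), a
plain counter stands in for the ring buffer, and wake_consumer() stands
in for tfw_raise_softirq()):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static _Atomic long pending;		/* work not yet processed */
	static atomic_bool ipi_enabled = true;	/* TFW_QUEUE_IPI analogue */

	static void
	wake_consumer(void)
	{
		printf("IPI raised\n");	/* tfw_raise_softirq() analogue */
	}

	/* Producer side, cf. tfw_wq_push(). */
	static void
	produce(void)
	{
		/* seq_cst: the push is ordered before the flag load below */
		atomic_fetch_add(&pending, 1);
		if (atomic_load(&ipi_enabled))
			wake_consumer();
	}

	/* Consumer side, cf. tfw_wq_tasklet() and ss_tx_action(). */
	static void
	consume(void)
	{
		while (atomic_load(&pending))
			atomic_fetch_sub(&pending, 1);	/* "do" one item */

		/* TFW_WQ_IPI_SYNC(): set the flag, then re-check size. */
		atomic_store(&ipi_enabled, true);
		if (atomic_load(&pending)) {
			/*
			 * An item slipped in after the drain; its producer
			 * may have seen ipi_enabled == false and skipped
			 * the wakeup, so keep polling instead of sleeping.
			 */
			atomic_store(&ipi_enabled, false);
			consume();
		}
	}

	int
	main(void)
	{
		produce();
		produce();
		consume();
		return 0;
	}

With sequentially consistent operations on both sides it is impossible
for the producer to miss the flag and the consumer to miss the item at
the same time, which mirrors the guarantee the kernel code gets from
its atomic bitops combined with smp_mb__after_atomic().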