diff mbox

[13/13] mm: memcontrol: hook up vmpressure to socket pressure

Message ID 20151124215940.GB1373@cmpxchg.org
State Not Applicable, archived
Delegated to: David Miller
Headers show

Commit Message

Johannes Weiner Nov. 24, 2015, 9:59 p.m. UTC
Let the networking stack know when a memcg is under reclaim pressure
so that it can clamp its transmit windows accordingly.

Whenever the reclaim efficiency of a cgroup's LRU lists drops low
enough for a MEDIUM or HIGH vmpressure event to occur, assert a
pressure state in the socket and tcp memory code that tells it to curb
consumption growth from sockets associated with said control group.

Traditionally, vmpressure reports for the entire subtree of a memcg
under pressure, which drops useful information on the individual
groups reclaimed. However, it's too late to change the userinterface,
so add a second reporting mode that reports on the level of reclaim
instead of at the level of pressure, and use that report for sockets.

vmpressure events are naturally edge triggered, so for hysteresis
assert socket pressure for a second to allow for subsequent vmpressure
events to occur before letting the socket code return to normal.

This will likely need finetuning for a wider variety of workloads, but
for now stick to the vmpressure presets and keep hysteresis simple.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/memcontrol.h | 32 ++++++++++++++++---
 include/linux/vmpressure.h |  5 ++-
 mm/memcontrol.c            | 17 ++--------
 mm/vmpressure.c            | 78 +++++++++++++++++++++++++++++++++++-----------
 mm/vmscan.c                | 10 +++++-
 5 files changed, 103 insertions(+), 39 deletions(-)

Comments

David Miller Nov. 25, 2015, 4:30 p.m. UTC | #1
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 24 Nov 2015 16:59:40 -0500

> Let the networking stack know when a memcg is under reclaim pressure
> so that it can clamp its transmit windows accordingly.
> 
> Whenever the reclaim efficiency of a cgroup's LRU lists drops low
> enough for a MEDIUM or HIGH vmpressure event to occur, assert a
> pressure state in the socket and tcp memory code that tells it to curb
> consumption growth from sockets associated with said control group.
> 
> Traditionally, vmpressure reports for the entire subtree of a memcg
> under pressure, which drops useful information on the individual
> groups reclaimed. However, it's too late to change the userinterface,
> so add a second reporting mode that reports on the level of reclaim
> instead of at the level of pressure, and use that report for sockets.
> 
> vmpressure events are naturally edge triggered, so for hysteresis
> assert socket pressure for a second to allow for subsequent vmpressure
> events to occur before letting the socket code return to normal.
> 
> This will likely need finetuning for a wider variety of workloads, but
> for now stick to the vmpressure presets and keep hysteresis simple.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: David S. Miller <davem@davemloft.net>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladimir Davydov Nov. 30, 2015, 11:36 a.m. UTC | #2
On Tue, Nov 24, 2015 at 04:59:40PM -0500, Johannes Weiner wrote:
...
> @@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
>  		memcg = mem_cgroup_iter(root, NULL, &reclaim);
>  		do {
>  			unsigned long lru_pages;
> +			unsigned long reclaimed;
>  			unsigned long scanned;
>  			struct lruvec *lruvec;
>  			int swappiness;
> @@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
>  
>  			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
>  			swappiness = mem_cgroup_swappiness(memcg);
> +			reclaimed = sc->nr_reclaimed;
>  			scanned = sc->nr_scanned;
>  
>  			shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
> @@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
>  					    memcg, sc->nr_scanned - scanned,
>  					    lru_pages);
>  
> +			/* Record the group's reclaim efficiency */
> +			vmpressure(sc->gfp_mask, memcg, false,
> +				   sc->nr_scanned - scanned,
> +				   sc->nr_reclaimed - reclaimed);
> +

Suppose we have the following cgroup configuration.

A __ B
  \_ C

A is empty (which is natural for the unified hierarchy AFAIU). B has
some workload running in it, and C generates socket pressure. Due to the
socket pressure coming from C we start reclaim in A, which results in
thrashing of B, but we might not put sockets under pressure in A or C,
because vmpressure does not account pages scanned/reclaimed in B when
generating a vmpressure event for A or C. This might result in
aggressive reclaim and thrashing in B w/o generating a signal for C to
stop growing socket buffers.

Do you think such a situation is possible? If so, would it make sense to
switch to post-order walk in shrink_zone and pass sub-tree
scanned/reclaimed stats to vmpressure for each scanned memcg?

Thanks,
Vladimir

>  			/*
>  			 * Direct reclaim and kswapd have to scan all memory
>  			 * cgroups to fulfill the overall scan target for the
> @@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
>  			reclaim_state->reclaimed_slab = 0;
>  		}
>  
> -		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
> +		/* Record the subtree's reclaim efficiency */
> +		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
>  			   sc->nr_scanned - nr_scanned,
>  			   sc->nr_reclaimed - nr_reclaimed);
>  
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Johannes Weiner Nov. 30, 2015, 3:58 p.m. UTC | #3
On Mon, Nov 30, 2015 at 02:36:28PM +0300, Vladimir Davydov wrote:
> Suppose we have the following cgroup configuration.
> 
> A __ B
>   \_ C
> 
> A is empty (which is natural for the unified hierarchy AFAIU). B has
> some workload running in it, and C generates socket pressure. Due to the
> socket pressure coming from C we start reclaim in A, which results in
> thrashing of B, but we might not put sockets under pressure in A or C,
> because vmpressure does not account pages scanned/reclaimed in B when
> generating a vmpressure event for A or C. This might result in
> aggressive reclaim and thrashing in B w/o generating a signal for C to
> stop growing socket buffers.
> 
> Do you think such a situation is possible? If so, would it make sense to
> switch to post-order walk in shrink_zone and pass sub-tree
> scanned/reclaimed stats to vmpressure for each scanned memcg?

In that case the LRU pages in C would experience pressure as well,
which would then reign in the sockets in C. There must be some LRU
pages in there, otherwise who is creating socket pressure?

The same applies to shrinkers. All secondary reclaim is driven by LRU
reclaim results.

I can see that there is some unfairness in distributing memcg reclaim
pressure purely based on LRU size, because there are scenarios where
the auxiliary objects (incl. sockets, but mostly shrinker pools)
amount to a significant portion of the group's memory footprint. But
substitute group for NUMA node and we've had this behavior for
years. I'm not sure it's actually a problem in practice.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladimir Davydov Nov. 30, 2015, 4:13 p.m. UTC | #4
On Mon, Nov 30, 2015 at 10:58:38AM -0500, Johannes Weiner wrote:
> On Mon, Nov 30, 2015 at 02:36:28PM +0300, Vladimir Davydov wrote:
> > Suppose we have the following cgroup configuration.
> > 
> > A __ B
> >   \_ C
> > 
> > A is empty (which is natural for the unified hierarchy AFAIU). B has
> > some workload running in it, and C generates socket pressure. Due to the
> > socket pressure coming from C we start reclaim in A, which results in
> > thrashing of B, but we might not put sockets under pressure in A or C,
> > because vmpressure does not account pages scanned/reclaimed in B when
> > generating a vmpressure event for A or C. This might result in
> > aggressive reclaim and thrashing in B w/o generating a signal for C to
> > stop growing socket buffers.
> > 
> > Do you think such a situation is possible? If so, would it make sense to
> > switch to post-order walk in shrink_zone and pass sub-tree
> > scanned/reclaimed stats to vmpressure for each scanned memcg?
> 
> In that case the LRU pages in C would experience pressure as well,
> which would then reign in the sockets in C. There must be some LRU
> pages in there, otherwise who is creating socket pressure?
> 
> The same applies to shrinkers. All secondary reclaim is driven by LRU
> reclaim results.
> 
> I can see that there is some unfairness in distributing memcg reclaim
> pressure purely based on LRU size, because there are scenarios where
> the auxiliary objects (incl. sockets, but mostly shrinker pools)
> amount to a significant portion of the group's memory footprint. But
> substitute group for NUMA node and we've had this behavior for
> years. I'm not sure it's actually a problem in practice.
> 

Fiar enough. Let's wait until we hit this problem in real world then.

The patch looks good to me.

Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>

Thanks,
Vladimir
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fae0aaf..a8df46c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -249,6 +249,10 @@  struct mem_cgroup {
 	struct wb_domain cgwb_domain;
 #endif
 
+#ifdef CONFIG_INET
+	unsigned long		socket_pressure;
+#endif
+
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
 	spinlock_t event_list_lock;
@@ -292,18 +296,34 @@  struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
 }
 
+#define mem_cgroup_from_counter(counter, member)	\
+	container_of(counter, struct mem_cgroup, member)
+
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
 				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
+/**
+ * parent_mem_cgroup - find the accounting parent of a memcg
+ * @memcg: memcg whose parent to find
+ *
+ * Returns the parent memcg, or NULL if this is the root or the memory
+ * controller is in legacy no-hierarchy mode.
+ */
+static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
+{
+	if (!memcg->memory.parent)
+		return NULL;
+	return mem_cgroup_from_counter(memcg->memory.parent, memory);
+}
+
 static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
 			      struct mem_cgroup *root)
 {
@@ -693,10 +713,14 @@  void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 #ifdef CONFIG_MEMCG_KMEM
-	return memcg->tcp_mem.memory_pressure;
-#else
-	return false;
+	if (memcg->tcp_mem.memory_pressure)
+		return true;
 #endif
+	do {
+		if (time_before(jiffies, memcg->socket_pressure))
+			return true;
+	} while ((memcg = parent_mem_cgroup(memcg)));
+	return false;
 }
 #else
 #define mem_cgroup_sockets_enabled 0
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3e45358..a77b142 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -12,6 +12,9 @@ 
 struct vmpressure {
 	unsigned long scanned;
 	unsigned long reclaimed;
+
+	unsigned long tree_scanned;
+	unsigned long tree_reclaimed;
 	/* The lock is used to keep the scanned/reclaimed above in sync. */
 	struct spinlock sr_lock;
 
@@ -26,7 +29,7 @@  struct vmpressure {
 struct mem_cgroup;
 
 #ifdef CONFIG_MEMCG
-extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 		       unsigned long scanned, unsigned long reclaimed);
 extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 59555b0..a0da91f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1091,9 +1091,6 @@  bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 	return ret;
 }
 
-#define mem_cgroup_from_counter(counter, member)	\
-	container_of(counter, struct mem_cgroup, member)
-
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
@@ -4159,17 +4156,6 @@  static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	kfree(memcg);
 }
 
-/*
- * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
- */
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
-{
-	if (!memcg->memory.parent)
-		return NULL;
-	return mem_cgroup_from_counter(memcg->memory.parent, memory);
-}
-EXPORT_SYMBOL(parent_mem_cgroup);
-
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -4210,6 +4196,9 @@  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&memcg->cgwb_list);
 #endif
+#ifdef CONFIG_INET
+	memcg->socket_pressure = jiffies;
+#endif
 	return &memcg->css;
 
 free_out:
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 4c25e62..af262bb 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@  struct vmpressure_event {
 };
 
 static bool vmpressure_event(struct vmpressure *vmpr,
-			     unsigned long scanned, unsigned long reclaimed)
+			     enum vmpressure_levels level)
 {
 	struct vmpressure_event *ev;
-	enum vmpressure_levels level;
 	bool signalled = false;
 
-	level = vmpressure_calc_level(scanned, reclaimed);
-
 	mutex_lock(&vmpr->events_lock);
 
 	list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@  static void vmpressure_work_fn(struct work_struct *work)
 	struct vmpressure *vmpr = work_to_vmpressure(work);
 	unsigned long scanned;
 	unsigned long reclaimed;
+	enum vmpressure_levels level;
 
 	spin_lock(&vmpr->sr_lock);
 	/*
@@ -174,19 +172,21 @@  static void vmpressure_work_fn(struct work_struct *work)
 	 * here. No need for any locks here since we don't care if
 	 * vmpr->reclaimed is in sync.
 	 */
-	scanned = vmpr->scanned;
+	scanned = vmpr->tree_scanned;
 	if (!scanned) {
 		spin_unlock(&vmpr->sr_lock);
 		return;
 	}
 
-	reclaimed = vmpr->reclaimed;
-	vmpr->scanned = 0;
-	vmpr->reclaimed = 0;
+	reclaimed = vmpr->tree_reclaimed;
+	vmpr->tree_scanned = 0;
+	vmpr->tree_reclaimed = 0;
 	spin_unlock(&vmpr->sr_lock);
 
+	level = vmpressure_calc_level(scanned, reclaimed);
+
 	do {
-		if (vmpressure_event(vmpr, scanned, reclaimed))
+		if (vmpressure_event(vmpr, level))
 			break;
 		/*
 		 * If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@  static void vmpressure_work_fn(struct work_struct *work)
  * vmpressure() - Account memory pressure through scanned/reclaimed ratio
  * @gfp:	reclaimer's gfp mask
  * @memcg:	cgroup memory controller handle
+ * @tree:	legacy subtree mode
  * @scanned:	number of pages scanned
  * @reclaimed:	number of pages reclaimed
  *
@@ -206,9 +207,16 @@  static void vmpressure_work_fn(struct work_struct *work)
  * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
  * pressure index is then further refined and averaged over time.
  *
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
  * This function does not return any value.
  */
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 		unsigned long scanned, unsigned long reclaimed)
 {
 	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@  void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
 	if (!scanned)
 		return;
 
-	spin_lock(&vmpr->sr_lock);
-	vmpr->scanned += scanned;
-	vmpr->reclaimed += reclaimed;
-	scanned = vmpr->scanned;
-	spin_unlock(&vmpr->sr_lock);
+	if (tree) {
+		spin_lock(&vmpr->sr_lock);
+		vmpr->tree_scanned += scanned;
+		vmpr->tree_reclaimed += reclaimed;
+		scanned = vmpr->scanned;
+		spin_unlock(&vmpr->sr_lock);
 
-	if (scanned < vmpressure_win)
-		return;
-	schedule_work(&vmpr->work);
+		if (scanned < vmpressure_win)
+			return;
+		schedule_work(&vmpr->work);
+	} else {
+		enum vmpressure_levels level;
+
+		/* For now, no users for root-level efficiency */
+		if (memcg == root_mem_cgroup)
+			return;
+
+		spin_lock(&vmpr->sr_lock);
+		scanned = vmpr->scanned += scanned;
+		reclaimed = vmpr->reclaimed += reclaimed;
+		if (scanned < vmpressure_win) {
+			spin_unlock(&vmpr->sr_lock);
+			return;
+		}
+		vmpr->scanned = vmpr->reclaimed = 0;
+		spin_unlock(&vmpr->sr_lock);
+
+		level = vmpressure_calc_level(scanned, reclaimed);
+
+		if (level > VMPRESSURE_LOW) {
+			/*
+			 * Let the socket buffer allocator know that
+			 * we are having trouble reclaiming LRU pages.
+			 *
+			 * For hysteresis keep the pressure state
+			 * asserted for a second in which subsequent
+			 * pressure events can occur.
+			 */
+			memcg->socket_pressure = jiffies + HZ;
+		}
+	}
 }
 
 /**
@@ -276,7 +316,7 @@  void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
 	 * to the vmpressure() basically means that we signal 'critical'
 	 * level.
 	 */
-	vmpressure(gfp, memcg, vmpressure_win, 0);
+	vmpressure(gfp, memcg, true, vmpressure_win, 0);
 }
 
 /**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 97ba9e1..50e54c0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2396,6 +2396,7 @@  static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 		memcg = mem_cgroup_iter(root, NULL, &reclaim);
 		do {
 			unsigned long lru_pages;
+			unsigned long reclaimed;
 			unsigned long scanned;
 			struct lruvec *lruvec;
 			int swappiness;
@@ -2408,6 +2409,7 @@  static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 			swappiness = mem_cgroup_swappiness(memcg);
+			reclaimed = sc->nr_reclaimed;
 			scanned = sc->nr_scanned;
 
 			shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
@@ -2418,6 +2420,11 @@  static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 					    memcg, sc->nr_scanned - scanned,
 					    lru_pages);
 
+			/* Record the group's reclaim efficiency */
+			vmpressure(sc->gfp_mask, memcg, false,
+				   sc->nr_scanned - scanned,
+				   sc->nr_reclaimed - reclaimed);
+
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
 			 * cgroups to fulfill the overall scan target for the
@@ -2449,7 +2456,8 @@  static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 			reclaim_state->reclaimed_slab = 0;
 		}
 
-		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+		/* Record the subtree's reclaim efficiency */
+		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
 			   sc->nr_scanned - nr_scanned,
 			   sc->nr_reclaimed - nr_reclaimed);