mm: multigenerational lru: groundwork
For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in
lrugen->max_seq for both anon and file types as they are aged on an
equal footing. The oldest generation numbers are stored in
lrugen->min_seq[] separately for anon and file types as clean file
pages can be evicted regardless of swap constraints. These three
variables are monotonically increasing. Generation numbers are
truncated to order_base_2(MAX_NR_GENS+1) bits in order to fit into
page->flags. The sliding window technique is used to prevent truncated
generation numbers from overlapping. Each truncated generation number
is an index into
lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
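
To make the encoding concrete, the following stand-alone sketch (not
part of this patch) shows how a monotonically increasing sequence
number is folded into a truncated generation index and stored as gen+1
in a flags word. The constant values, the bit offset and the main()
harness are illustrative assumptions; the real definitions are in the
diffs below.

/*
 * Illustrative sketch only: mirrors lru_gen_from_seq() and the gen+1
 * encoding in page->flags introduced by this patch. The constants below
 * are assumptions chosen for demonstration.
 */
#include <assert.h>
#include <stdio.h>

#define MAX_NR_GENS	4UL	/* assumes CONFIG_NR_LRU_GENS == 4 */
#define LRU_GEN_WIDTH	3	/* order_base_2(MAX_NR_GENS + 1) */
#define LRU_GEN_PGOFF	20	/* actual placement in page->flags is arch-dependent */
#define LRU_GEN_MASK	(((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

static unsigned long lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;	/* index within the sliding window */
}

int main(void)
{
	unsigned long max_seq = 7, min_seq = 4;	/* arbitrary example values */
	unsigned long gen, flags = 0;

	/* the window spans at most MAX_NR_GENS generations, so indexes never overlap */
	assert(max_seq - min_seq + 1 <= MAX_NR_GENS);

	/* a page on a generation list stores gen+1; 0 means "not on any" */
	gen = lru_gen_from_seq(max_seq);
	flags |= (gen + 1) << LRU_GEN_PGOFF;

	printf("max_seq %lu -> gen %lu, counter in flags %lu\n",
	       max_seq, gen, (flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF);
	return 0;
}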

The framework comprises two conceptually independent components: the
aging, which produces young generations, and the eviction, which
consumes old generations. Both can be invoked independently from user
space for the purpose of working set estimation and proactive reclaim.

The protection of hot pages and the selection of cold pages are based
on page access types and patterns. There are two access types: one via
page tables and the other via file descriptors. The protection of the
former type is by design stronger because:
  1) The uncertainty in determining the access patterns of the former
  type is higher due to the coalesced nature of the accessed bit.
  2) The cost of evicting the former type is higher due to the TLB
  flushes required and the likelihood of involving I/O.
  3) The penalty of under-protecting the former type is higher because
  applications usually do not prepare themselves for major faults like
  they do for blocked I/O. For example, client applications commonly
  dedicate blocked I/O to separate threads to avoid UI janks that
  negatively affect user experience.

There are also two access patterns: one with temporal locality and the
other without. The latter pattern, e.g., random and sequential, needs
to be explicitly excluded to avoid weakening the protection of the
former pattern. Generally, the former type follows the former pattern
unless MADV_SEQUENTIAL is specified, and the latter type follows the
latter pattern unless outlying refaults have been observed.
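
For reference, here is a minimal userspace example of the
MADV_SEQUENTIAL case mentioned above: an application that streams
through a file mapping once can advertise the lack of temporal locality
up front. The file path and the bare-bones error handling are
illustrative only.

/* Illustrative example: declaring a one-pass, sequential access pattern. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	unsigned char *p;
	unsigned long sum = 0;
	struct stat st;
	off_t i;
	int fd = open("/var/tmp/stream.dat", O_RDONLY);	/* hypothetical file */

	if (fd < 0 || fstat(fd, &st) < 0) {
		perror("open/fstat");
		return EXIT_FAILURE;
	}

	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	/* hint that the mapping will be read once, front to back */
	if (madvise(p, st.st_size, MADV_SEQUENTIAL) < 0)
		perror("madvise");

	for (i = 0; i < st.st_size; i++)
		sum += p[i];	/* single sequential pass */

	printf("checksum: %lu\n", sum);
	munmap(p, st.st_size);
	close(fd);
	return 0;
}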

Upon faulting, a page is added to the youngest generation, which
provides the strongest protection, as the eviction will not consider
this page before the aging has scanned it at least twice. The first
scan clears the accessed bit set during the initial fault, and the
second scan makes sure this page has not been used since the first
scan. A page from any other generation is brought back to the
youngest generation whenever the aging finds the accessed bit set on
any of the PTEs mapping this page.
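
The rule above can be pictured with a toy model in plain userspace C
(not kernel code): a freshly faulted page becomes an eviction candidate
only after the aging has scanned it twice, where the first scan clears
the fault-time accessed bit and the second confirms the page stayed
idle, and any scan that finds the bit set returns the page to the
youngest generation. The struct and helpers below are assumptions made
purely for illustration.

/* Toy model of the two-scan rule; not the kernel implementation. */
#include <stdbool.h>
#include <stdio.h>

static unsigned long max_seq = 10;	/* youngest generation number */

struct toy_page {
	unsigned long gen;	/* generation this page currently belongs to */
	unsigned int scans;	/* aging scans seen since the last access */
	bool accessed;		/* models the PTE accessed bit */
};

static void fault_in(struct toy_page *page)
{
	page->gen = max_seq;	/* added to the youngest generation on fault */
	page->scans = 0;
	page->accessed = true;	/* the faulting access sets the bit */
}

static void aging_scan(struct toy_page *page)
{
	if (page->accessed) {
		page->accessed = false;	/* first scan merely clears the bit */
		page->gen = max_seq;	/* a set bit sends the page back to the youngest gen */
		page->scans = 1;
	} else {
		page->scans++;		/* another scan with the bit still clear */
	}
}

static bool eviction_may_consider(const struct toy_page *page)
{
	/* one clearing scan plus at least one confirming scan */
	return page->scans >= 2;
}

int main(void)
{
	struct toy_page page;

	fault_in(&page);
	aging_scan(&page);	/* clears the fault-time accessed bit */
	printf("after 1 scan: candidate? %d\n", eviction_may_consider(&page));
	aging_scan(&page);	/* confirms the page stayed idle */
	printf("after 2 scans: candidate? %d\n", eviction_may_consider(&page));
	return 0;
}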

Unmapped pages are initially added to the oldest generation and then
conditionally protected by tiers. This is done in a later patch
[PATCH 07/10].

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
yuzhaogoogle authored and damentz committed Nov 11, 2021
1 parent db71f76 commit c91daf9
Showing 18 changed files with 612 additions and 15 deletions.
3 changes: 2 additions & 1 deletion fs/fuse/dev.c
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page)
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
1 << PG_waiters))) {
1 << PG_waiters |
LRU_GEN_MASK | LRU_REFS_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
15 changes: 14 additions & 1 deletion include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
}

extern struct mutex cgroup_mutex;

static inline void cgroup_lock(void)
{
mutex_lock(&cgroup_mutex);
}

static inline void cgroup_unlock(void)
{
mutex_unlock(&cgroup_mutex);
}

/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -707,6 +718,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
36 changes: 36 additions & 0 deletions include/linux/mm.h
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)

/*
* Define the bit shifts to access each section. For non-existent
@@ -1800,6 +1802,40 @@ static inline void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif

#ifdef CONFIG_LRU_GEN
static inline void task_enter_nonseq_fault(void)
{
WARN_ON(current->in_nonseq_fault);

current->in_nonseq_fault = 1;
}

static inline void task_exit_nonseq_fault(void)
{
WARN_ON(!current->in_nonseq_fault);

current->in_nonseq_fault = 0;
}

static inline bool task_in_nonseq_fault(void)
{
return current->in_nonseq_fault;
}
#else
static inline void task_enter_nonseq_fault(void)
{
}

static inline void task_exit_nonseq_fault(void)
{
}

static inline bool task_in_nonseq_fault(void)
{
return false;
}
#endif /* CONFIG_LRU_GEN */

static inline void unmap_shared_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen)
{
182 changes: 182 additions & 0 deletions include/linux/mm_inline.h
@@ -79,11 +79,187 @@ static __always_inline enum lru_list page_lru(struct page *page)
return lru;
}

#ifdef CONFIG_LRU_GEN

static inline bool lru_gen_enabled(void)
{
#ifdef CONFIG_LRU_GEN_ENABLED
DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);

return static_branch_likely(&lru_gen_static_key);
#else
DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);

return static_branch_unlikely(&lru_gen_static_key);
#endif
}

/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
}

/* The youngest and the second youngest generations are counted as active. */
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
unsigned long max_seq = lruvec->evictable.max_seq;

VM_BUG_ON(gen >= MAX_NR_GENS);

return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

/* Update the sizes of the multigenerational lru lists. */
static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
int old_gen, int new_gen)
{
int type = page_is_file_lru(page);
int zone = page_zonenum(page);
int delta = thp_nr_pages(page);
enum lru_list lru = type * LRU_FILE;
struct lrugen *lrugen = &lruvec->evictable;

lockdep_assert_held(&lruvec->lru_lock);
VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
VM_BUG_ON(old_gen == -1 && new_gen == -1);

if (old_gen >= 0)
WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
lrugen->sizes[old_gen][type][zone] - delta);
if (new_gen >= 0)
WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
lrugen->sizes[new_gen][type][zone] + delta);

if (old_gen < 0) {
if (lru_gen_is_active(lruvec, new_gen))
lru += LRU_ACTIVE;
update_lru_size(lruvec, lru, zone, delta);
return;
}

if (new_gen < 0) {
if (lru_gen_is_active(lruvec, old_gen))
lru += LRU_ACTIVE;
update_lru_size(lruvec, lru, zone, -delta);
return;
}

if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
update_lru_size(lruvec, lru, zone, -delta);
update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
}

VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

/* Add a page to one of the multigenerational lru lists. Return true on success. */
static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
{
int gen;
unsigned long old_flags, new_flags;
int type = page_is_file_lru(page);
int zone = page_zonenum(page);
struct lrugen *lrugen = &lruvec->evictable;

if (PageUnevictable(page) || !lrugen->enabled[type])
return false;
/*
* If a page shouldn't be considered for eviction, i.e., a page mapped
* upon fault during which the accessed bit is set, add it to the
* youngest generation.
*
* If a page can't be evicted immediately, i.e., an anon page not in
* swap cache or a dirty page pending writeback, add it to the second
* oldest generation.
*
* If a page could be evicted immediately, e.g., a clean page, add it to
* the oldest generation.
*/
if (PageActive(page))
gen = lru_gen_from_seq(lrugen->max_seq);
else if ((!type && !PageSwapCache(page)) ||
(PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
else
gen = lru_gen_from_seq(lrugen->min_seq[type]);

do {
new_flags = old_flags = READ_ONCE(page->flags);
VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);

new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);

lru_gen_update_size(page, lruvec, -1, gen);
/* for rotate_reclaimable_page() */
if (reclaiming)
list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
else
list_add(&page->lru, &lrugen->lists[gen][type][zone]);

return true;
}

/* Delete a page from one of the multigenerational lru lists. Return true on success. */
static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
{
int gen;
unsigned long old_flags, new_flags;

do {
new_flags = old_flags = READ_ONCE(page->flags);
if (!(new_flags & LRU_GEN_MASK))
return false;

VM_BUG_ON_PAGE(PageActive(page), page);
VM_BUG_ON_PAGE(PageUnevictable(page), page);

gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

new_flags &= ~LRU_GEN_MASK;
/* for shrink_page_list() */
if (reclaiming)
new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
else if (lru_gen_is_active(lruvec, gen))
new_flags |= BIT(PG_active);
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);

lru_gen_update_size(page, lruvec, gen, -1);
list_del(&page->lru);

return true;
}

#else

static inline bool lru_gen_enabled(void)
{
return false;
}

static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
{
return false;
}

static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
{
return false;
}

#endif /* CONFIG_LRU_GEN */

static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec)
{
enum lru_list lru = page_lru(page);

if (lru_gen_add_page(page, lruvec, false))
return;

update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
}
@@ -93,13 +269,19 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
{
enum lru_list lru = page_lru(page);

if (lru_gen_add_page(page, lruvec, true))
return;

update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add_tail(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec)
{
if (lru_gen_del_page(page, lruvec, false))
return;

list_del(&page->lru);
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
70 changes: 70 additions & 0 deletions include/linux/mmzone.h
@@ -294,6 +294,72 @@ enum lruvec_flags {
*/
};

struct lruvec;

#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

#ifdef CONFIG_LRU_GEN

/*
* For each lruvec, evictable pages are divided into multiple generations. The
* youngest and the oldest generation numbers, AKA max_seq and min_seq, are
* monotonically increasing. The sliding window technique is used to track at
* least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
* window, AKA gen, indexes an array of per-type and per-zone lists for the
* corresponding generation. The counter in page->flags stores gen+1 while a
* page is on one of the multigenerational lru lists. Otherwise, it stores 0.
*
* After a page is faulted in, the aging must check the accessed bit at least
* twice before the eviction would consider it. The first check clears the
* accessed bit set during the initial fault. The second check makes sure this
* page hasn't been used since then.
*/
#define MIN_NR_GENS 2
#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)

struct lrugen {
/* the aging increments the max generation number */
unsigned long max_seq;
/* the eviction increments the min generation numbers */
unsigned long min_seq[ANON_AND_FILE];
/* the birth time of each generation in jiffies */
unsigned long timestamps[MAX_NR_GENS];
/* the multigenerational lru lists */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the sizes of the multigenerational lru lists in pages */
unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* whether the multigenerational lru is enabled */
bool enabled[ANON_AND_FILE];
};

#define MAX_BATCH_SIZE 8192

void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
void lru_gen_change_state(bool enable, bool main, bool swap);

#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
#endif

#else /* !CONFIG_LRU_GEN */

static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
{
}

static inline void lru_gen_change_state(bool enable, bool main, bool swap)
{
}

#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
#endif

#endif /* CONFIG_LRU_GEN */

struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -311,6 +377,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
#ifdef CONFIG_LRU_GEN
/* unevictable pages are on LRU_UNEVICTABLE */
struct lrugen evictable;
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif