Programming Guide: Activate swap more earlier in linux 6 and above

in linux 5
initial script
# Use half physical memory
memkb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)

# must set minimum free memory watermark to __setup_per_zone_wmarks
minwater=$((memkb / 8))
if [ "$minwater" -gt 262144 ]; then
minwater=262144
fi
sysctl -w vm.min_free_kbytes=$minwater
# low watermark = min watermark + distance, distance calculate from scale factor
sysctl -w vm.watermark_scale_factor=100
# setting 0 to 200 allows the system to more actively utilize compressed space
sysctl -w vm.swappiness=100
# read ahead 2^n page, n=0 page=1
sysctl -w vm.page-cluster=0

in linux 6
1.
initial script as linux 5

2.
gedit ~/linux-6.x.x/mm/page_alloc.c
struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
#if defined(EARLIER_SWAP_PATCH)
int i;
unsigned long free_pages, total_pages;
pg_data_t *pgdat;
struct zone *zoneshift;
#endif

if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
return NULL;

gfp &= gfp_allowed_mask;

gfp = current_gfp_context(gfp);
alloc_gfp = gfp;
if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
&alloc_gfp, &alloc_flags))
return NULL;

alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);

#if defined(EARLIER_SWAP_PATCH)
/* (alloc_gfp & __GFP_DIRECT_RECLAIM):
* only allow can wait general memory allocation,
* and cut off atomic allocation, (e.g., sk->sk_allocation = GFP_ATOMIC in net paths)
* that cannot wait for kswapd and must fast-path allocate from the freelist.
* !current_is_kswapd():
* Prevent recursively call its own memory allocation path,
* because tsk->flags |= PF_MEMALLOC | PF_KSWAPD in static int kswapd(void *p)
*/
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && !current_is_kswapd() && ac.preferred_zoneref->zone) {
free_pages = 0;
total_pages = 0;
pgdat = ac.preferred_zoneref->zone->zone_pgdat;
for (i = 0; i <= ac.highest_zoneidx; i++) {
zoneshift = pgdat->node_zones + i;
if (managed_zone(zoneshift)) {
free_pages += zone_page_state(zoneshift, NR_FREE_PAGES);
total_pages += zone_managed_pages(zoneshift);
}
}
//(free_pages*100 < total_pages*20)
if(free_pages*5 < total_pages) {
wake_all_kswapds(order, alloc_gfp, &ac);
}
}
#endif

page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
if (likely(page))
goto out;

alloc_gfp = gfp;
ac.spread_dirty_pages = false;

ac.nodemask = nodemask;

page = __alloc_pages_slowpath(alloc_gfp, order, &ac);

out:
if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
__free_pages(page, order);
page = NULL;
}

trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
kmsan_alloc_page(page, order, alloc_gfp);

return page;
}

3.
gedit ~/linux-6.x.x/mm/vmscan.c
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long pflags;
unsigned long nr_boost_reclaim;
unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
bool boosted;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.may_unmap = 1,
};

set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire(_THIS_IP_);

count_vm_event(PAGEOUTRUN);

nr_boost_reclaim = 0;
for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;

nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
boosted = nr_boost_reclaim;

restart:
set_reclaim_active(pgdat, highest_zoneidx);
sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
bool balanced;
bool ret;
#if defined(EARLIER_SWAP_PATCH)
int i;
unsigned long free_pages, total_pages;
struct zone* tmpzone;
#endif
sc.reclaim_idx = highest_zoneidx;

if (buffer_heads_over_limit) {
for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;

sc.reclaim_idx = i;
break;
}
}

balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
if (!balanced && nr_boost_reclaim) {
nr_boost_reclaim = 0;
goto restart;
}

#if defined(EARLIER_SWAP_PATCH)
free_pages = 0;
total_pages = 0;
for (i = 0; i <= highest_zoneidx; i++) {
tmpzone = pgdat->node_zones + i;
if (managed_zone(tmpzone)) {
free_pages += zone_page_state(tmpzone, NR_FREE_PAGES);
total_pages += zone_managed_pages(tmpzone);
}
}
balanced = balanced && !(free_pages * 100 < total_pages * 30);
#endif

if (!nr_boost_reclaim && balanced)
goto out;

if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
raise_priority = false;

sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
sc.may_swap = !nr_boost_reclaim;

kswapd_age_node(pgdat, &sc);

if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;

sc.nr_scanned = 0;
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
sc.gfp_mask, &nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;

if (kswapd_shrink_node(pgdat, &sc))
raise_priority = false;

if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
allow_direct_reclaim(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);

__fs_reclaim_release(_THIS_IP_);
ret = try_to_freeze();
__fs_reclaim_acquire(_THIS_IP_);
if (ret || kthread_should_stop())
break;

nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);

if (nr_boost_reclaim && !nr_reclaimed)
break;

if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);

if (!sc.nr_reclaimed)
pgdat->kswapd_failures++;

out:
clear_reclaim_active(pgdat, highest_zoneidx);

/* If reclaim was boosted, account for the reclaim done in this pass */
if (boosted) {
unsigned long flags;

for (i = 0; i <= highest_zoneidx; i++) {
if (!zone_boosts[i])
continue;

/* Increments are under the zone lock */
zone = pgdat->node_zones + i;
spin_lock_irqsave(&zone->lock, flags);
zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
spin_unlock_irqrestore(&zone->lock, flags);
}

wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
}

snapshot_refaults(NULL, pgdat);
__fs_reclaim_release(_THIS_IP_);
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);

return sc.order;
}

void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type highest_zoneidx)
{
pg_data_t *pgdat;
enum zone_type curr_idx;
#if defined(EARLIER_SWAP_PATCH)
int i;
unsigned long free_pages, total_pages;
struct zone* tmpzone;
#endif

if (!managed_zone(zone))
return;

if (!cpuset_zone_allowed(zone, gfp_flags))
return;

pgdat = zone->zone_pgdat;
curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

if (READ_ONCE(pgdat->kswapd_order) < order)
WRITE_ONCE(pgdat->kswapd_order, order);

if (!waitqueue_active(&pgdat->kswapd_wait))
return;

#if defined(EARLIER_SWAP_PATCH)
free_pages = 0;
total_pages = 0;
for (i = 0; i <= highest_zoneidx; i++) {
tmpzone = pgdat->node_zones + i;
if (managed_zone(tmpzone)) {
free_pages += zone_page_state(tmpzone, NR_FREE_PAGES);
total_pages += zone_managed_pages(tmpzone);
}
}
if ( pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
(!(free_pages*100 < total_pages*30) && !pgdat_watermark_boosted(pgdat, highest_zoneidx)) )
#else
if ( pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
(pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx)) )
#endif
{
if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
wakeup_kcompactd(pgdat, order, highest_zoneidx);
return;
}

trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
gfp_flags);
wake_up_interruptible(&pgdat->kswapd_wait);
}

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
note:
1. (alloc_gfp & __GFP_DIRECT_RECLAIM)
This flag answers the question: "Is the caller allowed to block (sleep) and perform synchronous memory reclamation right now?"
DIRECT_RECLAIM means the current process context is willing to dive in and do the heavy lifting of clearing memory
(like flushing dirty pages to disk via I/O or compressing anonymous memory into zram/swap).
Because storage I/O takes time, this path inherently requires the process to enter a sleeping state (TASK_UNINTERRUPTIBLE)
while waiting for the hardware interrupt.

example:
include/linux/gfp_types.h
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)

struct my_buf *buffer = kmalloc(sizeof(*buffer), GFP_KERNEL);

2. !current_is_kswapd()
~/include/linux/sched/mm.h
static inline gfp_t current_gfp_context(gfp_t flags)
{
unsigned int pflags = READ_ONCE(current->flags);

if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precedence
*/
if (pflags & PF_MEMALLOC_NOIO)
flags &= ~(__GFP_IO | __GFP_FS);
else if (pflags & PF_MEMALLOC_NOFS)
flags &= ~__GFP_FS;
}
return flags;
}

struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask)
{
...
gfp = current_gfp_context(gfp);
...
}

!(alloc_gfp & __GFP_MEMALLOC) can not stop kswapd call itself, we use !current_is_kswapd()

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(option)
defconfig recommend enable
CONFIG_LRU_GEN=y
CONFIG_LRU_GEN_ENABLED=y

Programming Guide

2019年12月31日星期二

Activate swap more earlier in linux 6 and above

沒有留言:

張貼留言

2019年12月31日 星期二

Activate swap more earlier in linux 6 and above

沒有留言:

張貼留言

2019年12月31日星期二