in linux 5
initial script
# Total RAM in kB. Honour a value the caller already set in $memkb; otherwise
# read MemTotal from /proc/meminfo so the arithmetic below never sees an
# unset variable (which the shell would silently evaluate as 0).
memkb=${memkb:-$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)}
# low watermark = min watermark + distance; the distance is derived from the
# scale factor (unit: 1/10000 of memory, so 100 => 1%).
sysctl -w vm.watermark_scale_factor=100
# Set the minimum free memory watermark (1/8 of RAM here); this value is the
# input from which __setup_per_zone_wmarks computes the per-zone watermarks.
sysctl -w vm.min_free_kbytes=$((memkb / 8))
# Setting swappiness to 100 lets the system use compressed swap space
# (e.g. zram) more actively.
sysctl -w vm.swappiness=100
# Swap read-ahead is 2^n pages; n=0 means a single page per swap-in.
sysctl -w vm.page-cluster=0
in linux 6
1.
Use the same initial script as for Linux 5 (above).
2.
gedit ~/linux-6.x.x/mm/page_alloc.c
/*
 * __alloc_pages - allocate 2^order pages, preferring node @preferred_nid.
 * @gfp:           allocation flags requested by the caller
 * @order:         allocation order; orders above MAX_ORDER warn once and fail
 * @preferred_nid: node to try first
 * @nodemask:      optional set of allowed nodes (may be NULL)
 *
 * First tries get_page_from_freelist() against the low watermark, then falls
 * back to __alloc_pages_slowpath().  With EARLIER_SWAP_PATCH defined, kswapd
 * is additionally woken before the fast-path attempt whenever the preferred
 * zone is already below its high watermark.
 *
 * Returns the allocated page, or NULL on failure (including a failed memcg
 * kmem charge, in which case the page is freed again).
 */
struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
			   nodemask_t *nodemask)
{
	struct page *page;
	unsigned int alloc_flags = ALLOC_WMARK_LOW;
	gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = { };

	if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
		return NULL;

	gfp &= gfp_allowed_mask;
	gfp = current_gfp_context(gfp);
	alloc_gfp = gfp;
	if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
				 &alloc_gfp, &alloc_flags))
		return NULL;

	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);

#if defined(EARLIER_SWAP_PATCH)
	/*
	 * Early kswapd wakeup: start background reclaim as soon as the
	 * preferred zone drops below its HIGH watermark, instead of waiting
	 * for the fast path to fail at the LOW watermark.
	 *
	 * (alloc_gfp & __GFP_DIRECT_RECLAIM):
	 *   Only do this for ordinary allocations that are allowed to wait.
	 *   Atomic allocations (e.g. sk->sk_allocation = GFP_ATOMIC in the
	 *   network paths) cannot wait for kswapd and must go straight to the
	 *   freelist fast path.
	 * !current_is_kswapd():
	 *   Keep kswapd from recursively waking itself through its own
	 *   allocations; kswapd() sets PF_MEMALLOC | PF_KSWAPD on its task.
	 * ac.preferred_zoneref->zone:
	 *   The zoneref pointer itself was already dereferenced above; it is
	 *   its ->zone member that is NULL when no eligible zone exists, and
	 *   high_wmark_pages()/zone_watermark_ok() would dereference it.
	 *
	 * NOTE(review): this wakes kswapd even if the caller did not pass
	 * __GFP_KSWAPD_RECLAIM - confirm that is intended.
	 */
	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) &&
	    !current_is_kswapd() &&
	    ac.preferred_zoneref->zone &&
	    !zone_watermark_ok(ac.preferred_zoneref->zone,
			       order,
			       high_wmark_pages(ac.preferred_zoneref->zone),
			       ac.highest_zoneidx,
			       alloc_flags)) {
		wake_all_kswapds(order, alloc_gfp, &ac);
	}
#endif

	/* Fast path: first allocation attempt */
	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
	if (likely(page))
		goto out;

	/* Slow path: restart from the caller's gfp and original nodemask */
	alloc_gfp = gfp;
	ac.spread_dirty_pages = false;
	ac.nodemask = nodemask;

	page = __alloc_pages_slowpath(alloc_gfp, order, &ac);

out:
	/* Undo the allocation if the memcg kmem charge is refused */
	if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
	    unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
		__free_pages(page, order);
		page = NULL;
	}

	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
	kmsan_alloc_page(page, order, alloc_gfp);

	return page;
}
3.
gedit ~/linux-6.x.x/mm/vmscan.c
/*
 * balance_pgdat - kswapd's reclaim loop for one node.
 * @pgdat:           node to reclaim from
 * @order:           allocation order used for the balance check (sc.order)
 * @highest_zoneidx: highest zone index considered for balancing
 *
 * Runs reclaim passes starting at sc.priority = DEF_PRIORITY and lowering it
 * until the node is considered balanced or the priority drops below 1.
 * With EARLIER_SWAP_PATCH defined, "balanced" additionally requires that the
 * free pages of the managed zones 0..highest_zoneidx are at least 30% of
 * their managed pages.
 *
 * Returns sc.order.
 */
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long pflags;
unsigned long nr_boost_reclaim;
unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
bool boosted;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.may_unmap = 1,
};
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire(_THIS_IP_);
count_vm_event(PAGEOUTRUN);
/*
 * Snapshot each managed zone's watermark_boost; the per-zone values are
 * subtracted again at "out:" and the sum drives boost-mode reclaim.
 */
nr_boost_reclaim = 0;
for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
boosted = nr_boost_reclaim;
restart:
set_reclaim_active(pgdat, highest_zoneidx);
sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
bool balanced;
bool ret;
#if defined(EARLIER_SWAP_PATCH)
/*
 * NOTE(review): this 'i' shadows the function-scope 'i'. Inside the loop
 * body every use of 'i' (including the buffer_heads_over_limit scan below)
 * refers to this inner variable; the outer one is only used again after
 * "out:", where it is re-initialised.
 */
int i;
unsigned long free_pages, total_pages;
struct zone* tmpzone;
#endif
sc.reclaim_idx = highest_zoneidx;
/* Under buffer_heads pressure, reclaim up to the highest managed zone. */
if (buffer_heads_over_limit) {
for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
sc.reclaim_idx = i;
break;
}
}
balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
/* Unbalanced while boosting: drop the boost and restart as a normal pass. */
if (!balanced && nr_boost_reclaim) {
nr_boost_reclaim = 0;
goto restart;
}
#if defined(EARLIER_SWAP_PATCH)
/*
 * Patch: sum free and managed pages over managed zones 0..highest_zoneidx
 * and treat the node as unbalanced while free pages are below 30% of the
 * managed pages, so reclaim continues past the regular watermark check.
 * This runs after the boost restart check above, so the 30% rule by
 * itself never cancels a boost pass.
 */
free_pages = 0;
total_pages = 0;
for (i = 0; i <= highest_zoneidx; i++) {
tmpzone = pgdat->node_zones + i;
if (managed_zone(tmpzone)) {
free_pages += zone_page_state(tmpzone, NR_FREE_PAGES);
total_pages += zone_managed_pages(tmpzone);
}
}
balanced = balanced && !(free_pages * 100 < total_pages * 30);
#endif
/* Nothing left to do: balanced and no outstanding boost. */
if (!nr_boost_reclaim && balanced)
goto out;
/* In boost mode, stop lowering the priority at DEF_PRIORITY - 2. */
if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
raise_priority = false;
/* Boost-mode passes neither write pages back nor swap. */
sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
sc.may_swap = !nr_boost_reclaim;
kswapd_age_node(pgdat, &sc);
/* Below DEF_PRIORITY - 2, writeback is always allowed. */
if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
sc.nr_scanned = 0;
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
sc.gfp_mask, &nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
if (kswapd_shrink_node(pgdat, &sc))
raise_priority = false;
/* Wake tasks waiting on pfmemalloc_wait once direct reclaim is allowed. */
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
allow_direct_reclaim(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/* The fs_reclaim annotation is dropped around try_to_freeze(). */
__fs_reclaim_release(_THIS_IP_);
ret = try_to_freeze();
__fs_reclaim_acquire(_THIS_IP_);
if (ret || kthread_should_stop())
break;
/* Progress made by this pass only. */
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
/* Boost reclaim that makes no progress gives up. */
if (nr_boost_reclaim && !nr_reclaimed)
break;
if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
/* The loop ended without reaching "out" and nothing was reclaimed at all. */
if (!sc.nr_reclaimed)
pgdat->kswapd_failures++;
out:
clear_reclaim_active(pgdat, highest_zoneidx);
/* If reclaim was boosted, account for the reclaim done in this pass */
if (boosted) {
unsigned long flags;
for (i = 0; i <= highest_zoneidx; i++) {
if (!zone_boosts[i])
continue;
/* Increments are under the zone lock */
zone = pgdat->node_zones + i;
spin_lock_irqsave(&zone->lock, flags);
zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
spin_unlock_irqrestore(&zone->lock, flags);
}
wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
}
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release(_THIS_IP_);
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
return sc.order;
}
/*
 * wakeup_kswapd - request background reclaim on @zone's node.
 * @zone:            zone whose node's kswapd should be woken
 * @gfp_flags:       gfp flags of the allocation that triggered the wakeup
 * @order:           order of that allocation
 * @highest_zoneidx: highest zone index kswapd should balance
 *
 * Records @highest_zoneidx and @order in the node (only ever raising them)
 * and wakes the task sleeping on pgdat->kswapd_wait, unless kswapd has
 * already failed MAX_RECLAIM_RETRIES times or the node needs no reclaim.
 * In that case kcompactd is woken instead when the caller cannot direct
 * reclaim.
 *
 * With EARLIER_SWAP_PATCH defined, "needs no reclaim" additionally requires
 * that free pages in the managed zones 0..@highest_zoneidx are at least 30%
 * of the managed pages.  This is the same rule balance_pgdat() applies: it
 * tightens the pgdat_balanced() check rather than replacing it, so an
 * unbalanced node still wakes kswapd even when plenty of pages are free.
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
		   enum zone_type highest_zoneidx)
{
	pg_data_t *pgdat;
	enum zone_type curr_idx;
	bool below_free_target = false;
#if defined(EARLIER_SWAP_PATCH)
	int i;
	unsigned long free_pages = 0;
	unsigned long total_pages = 0;
	struct zone *tmpzone;
#endif

	if (!managed_zone(zone))
		return;

	if (!cpuset_zone_allowed(zone, gfp_flags))
		return;

	pgdat = zone->zone_pgdat;
	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

	/* MAX_NR_ZONES is treated as "no request recorded yet". */
	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

	if (READ_ONCE(pgdat->kswapd_order) < order)
		WRITE_ONCE(pgdat->kswapd_order, order);

	/* Nobody is sleeping on the wait queue, so there is nothing to wake. */
	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;

#if defined(EARLIER_SWAP_PATCH)
	for (i = 0; i <= highest_zoneidx; i++) {
		tmpzone = pgdat->node_zones + i;
		if (!managed_zone(tmpzone))
			continue;
		free_pages += zone_page_state(tmpzone, NR_FREE_PAGES);
		total_pages += zone_managed_pages(tmpzone);
	}
	/* Below 30% free: keep kswapd running even above the watermarks. */
	below_free_target = free_pages * 100 < total_pages * 30;
#endif

	/* Hopeless node or no reclaim needed: leave kswapd asleep. */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
	    (!below_free_target &&
	     pgdat_balanced(pgdat, order, highest_zoneidx) &&
	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
		/* Callers that cannot direct reclaim get kcompactd instead. */
		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
			wakeup_kcompactd(pgdat, order, highest_zoneidx);
		return;
	}

	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
				      gfp_flags);
	wake_up_interruptible(&pgdat->kswapd_wait);
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
note:
1. (alloc_gfp & __GFP_DIRECT_RECLAIM)
This flag answers the question: "Is the caller allowed to block (sleep) and perform synchronous memory reclamation right now?"
DIRECT_RECLAIM means the current process context is willing to dive in and do the heavy lifting of clearing memory
(like flushing dirty pages to disk via I/O or compressing anonymous memory into zram/swap).
Because storage I/O takes time, this path inherently requires the process to enter a sleeping state (TASK_UNINTERRUPTIBLE)
while waiting for the hardware interrupt.
example:
include/linux/gfp_types.h
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
struct my_buf *buffer = kmalloc(sizeof(*buffer), GFP_KERNEL);
2. !(alloc_gfp & __GFP_MEMALLOC) (background for the !current_is_kswapd() check used in the patch above)
This flag answers the question: "Is this caller a frontline emergency responder trying to save the system from an
Out-of-Memory (OOM) disaster?"
__GFP_MEMALLOC is a "Get Out of Jail Free card".
When the system is critically starved of memory, background rescue threads like kswapd or processes chosen by the
OOM-Killer need to allocate a tiny bit of memory (e.g., for stack variables or structures) to actually free up larger
chunks of memory.
To prevent a deadlock (a snake eating its own tail), the kernel marks such tasks with the PF_MEMALLOC task flag (an individual
allocation can request the same treatment with __GFP_MEMALLOC). This tells the memory allocator: "Ignore all watermarks, do not
intercept me, and let me dip into the system's emergency reserve funds."
example:
include/linux/gfp_types.h
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
current->flags |= PF_MEMALLOC; // Gives this task's allocations the same watermark bypass as __GFP_MEMALLOC (unless __GFP_NOMEMALLOC is passed)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(option)
Recommended kernel config (defconfig) options to enable:
CONFIG_LRU_GEN=y
CONFIG_LRU_GEN_ENABLED=y
沒有留言:
張貼留言