2019年12月31日 星期二

Activate swap earlier in Linux 6 and above

in linux 5
initial script
# total RAM in kB; taken from /proc/meminfo unless the caller has already set memkb
memkb=${memkb:-$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)}
# low watermark = min watermark + distance; the distance is calculated from the scale factor
sysctl -w vm.watermark_scale_factor=100
# minimum free memory watermark (the value __setup_per_zone_wmarks works from): 1/8 of RAM
sysctl -w vm.min_free_kbytes=$((memkb / 8))
# setting it to 100 allows the system to more actively utilize compressed space
sysctl -w vm.swappiness=100
# swap read-ahead is 2^n pages; n=0 means 1 page
sysctl -w vm.page-cluster=0

in linux 6
1. 
Use the same initial script as for Linux 5.

2. 
gedit ~/linux-6.x.x/mm/page_alloc.c
/*
 * Page allocator entry point, shown with the optional EARLIER_SWAP_PATCH hook.
 *
 * @gfp:           allocation flags requested by the caller
 * @order:         order of the allocation; rejected when above MAX_ORDER
 * @preferred_nid: preferred NUMA node, passed on to prepare_alloc_pages()
 * @nodemask:      allowed nodes, passed on to prepare_alloc_pages()
 *
 * Tries get_page_from_freelist() first and falls back to
 * __alloc_pages_slowpath(). Returns the allocated page, or NULL on failure
 * (including a failed memcg kmem charge, in which case the page is freed).
 */
struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW;
    gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };
#if defined(EARLIER_SWAP_PATCH)
    struct zone *preferred_zone;
#endif

    if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
        return NULL;

    gfp &= gfp_allowed_mask;

    gfp = current_gfp_context(gfp);
    alloc_gfp = gfp;
    if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
            &alloc_gfp, &alloc_flags))
        return NULL;

    alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);

#if defined(EARLIER_SWAP_PATCH)
    /*
     * Wake kswapd before the fast path whenever the preferred zone is
     * already below its high watermark, so that background reclaim
     * (and therefore swap-out) starts earlier.
     *
     * preferred_zone:
     *   ac.preferred_zoneref itself is dereferenced unconditionally just
     *   above, so it is the zone pointer that has to be validated before
     *   it is handed to high_wmark_pages()/zone_watermark_ok().
     *   NOTE(review): upstream treats a NULL preferred zone as possible
     *   (no eligible zone for this request) - confirm against your tree.
     * (alloc_gfp & __GFP_DIRECT_RECLAIM):
     *   Only apply this to ordinary allocations that are allowed to wait.
     *   Atomic allocations (e.g. sk->sk_allocation = GFP_ATOMIC in the
     *   network paths) cannot wait for kswapd and must go straight to the
     *   freelist fast path.
     * !current_is_kswapd():
     *   Keep kswapd from re-triggering itself from its own allocation
     *   path; kswapd() sets tsk->flags |= PF_MEMALLOC | PF_KSWAPD.
     */
    preferred_zone = ac.preferred_zoneref->zone;
    if (preferred_zone &&
        (alloc_gfp & __GFP_DIRECT_RECLAIM) &&
        !current_is_kswapd() &&
        !zone_watermark_ok(preferred_zone,
                           order,
                           high_wmark_pages(preferred_zone),
                           ac.highest_zoneidx,
                           alloc_flags)) {
        wake_all_kswapds(order, alloc_gfp, &ac);
    }
#endif

    /* Fast path: first allocation attempt with the prepared flags. */
    page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
    if (likely(page))
        goto out;

    /* Slow path: start again from the caller's gfp and nodemask. */
    alloc_gfp = gfp;
    ac.spread_dirty_pages = false;

    ac.nodemask = nodemask;

    page = __alloc_pages_slowpath(alloc_gfp, order, &ac);

out:
    /* Undo the allocation if charging it to the memcg fails. */
    if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
        unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
        __free_pages(page, order);
        page = NULL;
    }

    trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
    kmsan_alloc_page(page, order, alloc_gfp);

    return page;
}

3.
gedit ~/linux-6.x.x/mm/vmscan.c
#if defined(EARLIER_SWAP_PATCH) && !defined(EARLIER_SWAP_FREE_PCT)
/*
 * EARLIER_SWAP_PATCH: percentage of the node's managed pages that must be
 * free before balance_pgdat() treats the node as balanced. wakeup_kswapd()
 * in this file applies the same 30% threshold.
 */
#define EARLIER_SWAP_FREE_PCT 30
#endif

/*
 * kswapd reclaim pass over one node, for zones 0..highest_zoneidx.
 *
 * Repeats reclaim at increasing pressure (sc.priority counts down from
 * DEF_PRIORITY) until the node is considered balanced, the task must
 * freeze/stop, or priority reaches 0. With EARLIER_SWAP_PATCH the node is
 * additionally required to have at least EARLIER_SWAP_FREE_PCT percent of
 * its managed pages free before it counts as balanced.
 *
 * Returns sc.order.
 */
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
    int i;
    unsigned long nr_soft_reclaimed;
    unsigned long nr_soft_scanned;
    unsigned long pflags;
    unsigned long nr_boost_reclaim;
    unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
    bool boosted;
    struct zone *zone;
    struct scan_control sc = {
        .gfp_mask = GFP_KERNEL,
        .order = order,
        .may_unmap = 1,
    };

    set_task_reclaim_state(current, &sc.reclaim_state);
    psi_memstall_enter(&pflags);
    __fs_reclaim_acquire(_THIS_IP_);

    count_vm_event(PAGEOUTRUN);

    /* Snapshot the per-zone watermark boosts; they are subtracted at "out". */
    nr_boost_reclaim = 0;
    for (i = 0; i <= highest_zoneidx; i++) {
        zone = pgdat->node_zones + i;
        if (!managed_zone(zone))
            continue;

        nr_boost_reclaim += zone->watermark_boost;
        zone_boosts[i] = zone->watermark_boost;
    }
    boosted = nr_boost_reclaim;

restart:
    set_reclaim_active(pgdat, highest_zoneidx);
    sc.priority = DEF_PRIORITY;
    do {
        unsigned long nr_reclaimed = sc.nr_reclaimed;
        bool raise_priority = true;
        bool balanced;
        bool ret;
#if defined(EARLIER_SWAP_PATCH)
        /*
         * The function-scope index "i" is reused below; redeclaring it
         * here would only shadow it.
         */
        unsigned long   free_pages, total_pages;
        struct zone     *tmpzone;
#endif
        sc.reclaim_idx = highest_zoneidx;

        /* With too many buffer heads, reclaim up to the highest managed zone. */
        if (buffer_heads_over_limit) {
            for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
                zone = pgdat->node_zones + i;
                if (!managed_zone(zone))
                    continue;

                sc.reclaim_idx = i;
                break;
            }
        }

        /* Unbalanced while boosting: drop the boost and restart normally. */
        balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
        if (!balanced && nr_boost_reclaim) {
            nr_boost_reclaim = 0;
            goto restart;
        }

#if defined(EARLIER_SWAP_PATCH)
        /*
         * Besides the watermark check above, keep reclaiming until at
         * least EARLIER_SWAP_FREE_PCT percent of the managed pages in
         * zones 0..highest_zoneidx are free.
         */
        free_pages  = 0;
        total_pages = 0;
        for (i = 0; i <= highest_zoneidx; i++) {
            tmpzone = pgdat->node_zones + i;
            if (managed_zone(tmpzone)) {
                free_pages  += zone_page_state(tmpzone, NR_FREE_PAGES);
                total_pages += zone_managed_pages(tmpzone);
            }
        }
        balanced = balanced &&
                   !(free_pages * 100 < total_pages * EARLIER_SWAP_FREE_PCT);
#endif

        if (!nr_boost_reclaim && balanced)
            goto out;

        if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
            raise_priority = false;

        /* Boost-driven reclaim neither writes pages back nor swaps. */
        sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
        sc.may_swap = !nr_boost_reclaim;

        kswapd_age_node(pgdat, &sc);

        if (sc.priority < DEF_PRIORITY - 2)
            sc.may_writepage = 1;

        sc.nr_scanned = 0;
        nr_soft_scanned = 0;
        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
                        sc.gfp_mask, &nr_soft_scanned);
        sc.nr_reclaimed += nr_soft_reclaimed;

        if (kswapd_shrink_node(pgdat, &sc))
            raise_priority = false;

        if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
                allow_direct_reclaim(pgdat))
            wake_up_all(&pgdat->pfmemalloc_wait);

        /* The fs-reclaim annotation is dropped around try_to_freeze(). */
        __fs_reclaim_release(_THIS_IP_);
        ret = try_to_freeze();
        __fs_reclaim_acquire(_THIS_IP_);
        if (ret || kthread_should_stop())
            break;

        /* From here on nr_reclaimed is the progress made in this iteration. */
        nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
        nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);

        if (nr_boost_reclaim && !nr_reclaimed)
            break;

        if (raise_priority || !nr_reclaimed)
            sc.priority--;
    } while (sc.priority >= 1);

    if (!sc.nr_reclaimed)
        pgdat->kswapd_failures++;

out:
    clear_reclaim_active(pgdat, highest_zoneidx);

    /* If reclaim was boosted, account for the reclaim done in this pass */
    if (boosted) {
        unsigned long flags;

        for (i = 0; i <= highest_zoneidx; i++) {
            if (!zone_boosts[i])
                continue;

            /* Increments are under the zone lock */
            zone = pgdat->node_zones + i;
            spin_lock_irqsave(&zone->lock, flags);
            zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
            spin_unlock_irqrestore(&zone->lock, flags);
        }

        wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
    }

    snapshot_refaults(NULL, pgdat);
    __fs_reclaim_release(_THIS_IP_);
    psi_memstall_leave(&pflags);
    set_task_reclaim_state(current, NULL);

    return sc.order;
}

#if defined(EARLIER_SWAP_PATCH) && !defined(EARLIER_SWAP_FREE_PCT)
/*
 * EARLIER_SWAP_PATCH: percentage of the node's managed pages that must be
 * free for wakeup_kswapd() to skip waking kswapd. balance_pgdat() in this
 * file applies the same 30% threshold.
 */
#define EARLIER_SWAP_FREE_PCT 30
#endif

/*
 * Record the requested order/highest_zoneidx on the zone's node and wake its
 * kswapd if it is sleeping and reclaim looks necessary.
 *
 * @zone:            zone whose node's kswapd should be considered
 * @gfp_flags:       flags of the allocation that triggered the wakeup
 * @order:           order of that allocation
 * @highest_zoneidx: highest zone index usable by that allocation
 *
 * No wakeup is issued when kswapd has failed MAX_RECLAIM_RETRIES times, or
 * when the node is balanced and not watermark-boosted; in that case only
 * kcompactd is woken, and only for callers without __GFP_DIRECT_RECLAIM.
 * With EARLIER_SWAP_PATCH, "balanced" additionally requires at least
 * EARLIER_SWAP_FREE_PCT percent of the managed pages to be free, so the patch
 * can only add wakeups compared to the unpatched check, never remove them.
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type highest_zoneidx)
{
    pg_data_t *pgdat;
    enum zone_type curr_idx;
    /* Stays true without the patch, leaving the stock condition unchanged. */
    bool enough_free = true;
#if defined(EARLIER_SWAP_PATCH)
    int             i;
    unsigned long   free_pages, total_pages;
    struct zone     *tmpzone;
#endif

    if (!managed_zone(zone))
        return;

    if (!cpuset_zone_allowed(zone, gfp_flags))
        return;

    pgdat = zone->zone_pgdat;
    curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

    /* Only ever raise the recorded zone index and order. */
    if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

    if (READ_ONCE(pgdat->kswapd_order) < order)
        WRITE_ONCE(pgdat->kswapd_order, order);

    /* Nobody is waiting on kswapd_wait, so there is nothing to wake. */
    if (!waitqueue_active(&pgdat->kswapd_wait))
        return;

#if defined(EARLIER_SWAP_PATCH)
    /* Free vs. managed pages over zones 0..highest_zoneidx of this node. */
    free_pages  = 0;
    total_pages = 0;
    for (i = 0; i <= highest_zoneidx; i++) {
        tmpzone = pgdat->node_zones + i;
        if (managed_zone(tmpzone)) {
            free_pages  += zone_page_state(tmpzone, NR_FREE_PAGES);
            total_pages += zone_managed_pages(tmpzone);
        }
    }
    enough_free = !(free_pages * 100 < total_pages * EARLIER_SWAP_FREE_PCT);
#endif

    /*
     * The watermark check for this order is kept alongside the free-ratio
     * check: a node can have plenty of free pages overall and still fail
     * pgdat_balanced() for the requested order.
     */
    if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
        (pgdat_balanced(pgdat, order, highest_zoneidx) &&
         enough_free &&
         !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
        if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
            wakeup_kcompactd(pgdat, order, highest_zoneidx);
        return;
    }

    trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
                      gfp_flags);
    wake_up_interruptible(&pgdat->kswapd_wait);
}

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
note:
1. (alloc_gfp & __GFP_DIRECT_RECLAIM)
This flag answers the question: "Is the caller allowed to block (sleep) and perform synchronous memory reclamation right now?"
DIRECT_RECLAIM means the current process context is willing to dive in and do the heavy lifting of clearing memory 
(like flushing dirty pages to disk via I/O or compressing anonymous memory into zram/swap).
Because storage I/O takes time, this path inherently requires the process to enter a sleeping state (TASK_UNINTERRUPTIBLE) 
while waiting for the hardware interrupt.

example:
include/linux/gfp_types.h
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
#define GFP_KERNEL    (__GFP_RECLAIM | __GFP_IO | __GFP_FS)

struct my_buf *buffer = kmalloc(sizeof(*buffer), GFP_KERNEL);

2. !(alloc_gfp & __GFP_MEMALLOC)
This flag answers the question: "Is this caller a frontline emergency responder trying to save the system from an 
Out-of-Memory (OOM) disaster?"
__GFP_MEMALLOC is a "Get Out of Jail Free card".
When the system is critically starved of memory, background rescue threads like kswapd or processes chosen by the 
OOM-Killer need to allocate a tiny bit of memory (e.g., for stack variables or structures) to actually free up larger 
chunks of memory.
To prevent a deadlock (a snake eating its own tail), the kernel assigns them the __GFP_MEMALLOC flag. This flag tells 
the memory allocator: "Ignore all watermarks, do not intercept me, and let me dip into the system's emergency reserve funds."

example:
include/linux/gfp_types.h
#define __GFP_MEMALLOC    ((__force gfp_t)___GFP_MEMALLOC)
current->flags |= PF_MEMALLOC; // While set, the allocator treats this task's allocations as if __GFP_MEMALLOC were passed (unless __GFP_NOMEMALLOC is given)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(optional)
Recommended defconfig options to enable:
CONFIG_LRU_GEN=y
CONFIG_LRU_GEN_ENABLED=y


沒有留言:

張貼留言