本文章涵蓋基礎 Slub Allocator 的運作流程。
分配 分配主要由 slab_alloc_node
負責,經過一些檢查,呼叫 __slab_alloc_node
。 使用 GFP_ZERO
清空時,僅會清空 orig_size
大小的內容,而非整個 object。
/*
 * Common allocation entry point. Runs the pre-alloc hook, gives KFENCE a
 * chance to service the request, then falls through to the per-CPU
 * fast/slow path in __slab_alloc_node().
 */
static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	void *object;
	bool init = false;

	/* Hook may substitute a different cache or veto the allocation. */
	s = slab_pre_alloc_hook(s, gfpflags);
	if (unlikely(!s))
		return NULL;

	/* KFENCE occasionally takes over an allocation for error detection. */
	object = kfence_alloc(s, orig_size, gfpflags);
	if (unlikely(object))
		goto out;

	object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);

	/* Scrub the free-pointer metadata left inside the object. */
	maybe_wipe_obj_freeptr(s, object);
	/* __GFP_ZERO / init_on_alloc decide whether the object gets zeroed. */
	init = slab_want_init_on_alloc(gfpflags, s);

out:
	/*
	 * Post hook performs the zeroing (per the surrounding article, only
	 * orig_size bytes are cleared, not the whole object) plus KASAN etc.
	 */
	slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size);

	return object;
}
__slab_alloc_node __slab_alloc_node
主要判斷 cpu_slab
上有無空的 object,若沒有則呼叫 __slab_alloc
。tid
是一個單調遞增的值,每次 CPU 的 freelist
變更時,tid
都會更新。
/*
 * Lockless fast path: pop the first object off the current CPU's freelist
 * with a single cmpxchg_double. Falls back to the __slab_alloc() slow path
 * when the CPU has no usable slab/freelist or the NUMA node mismatches.
 */
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	struct kmem_cache_cpu *c;
	struct slab *slab;
	unsigned long tid;
	void *object;

redo:
	/*
	 * Snapshot the per-CPU state. tid is bumped on every freelist change,
	 * so the cmpxchg below detects preemption / migration that happened
	 * between these reads and the update.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/* Compiler barrier: keep the freelist/slab reads after the tid read. */
	barrier();

	object = c->freelist;
	slab = c->slab;

	if (!USE_LOCKLESS_FAST_PATH() ||
	    unlikely(!object || !slab || !node_match(slab, node))) {
		/* No cached object, wrong node, or lockless path disabled. */
		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
	} else {
		void *next_object = get_freepointer_safe(s, object);

		/*
		 * Atomically publish (freelist, tid) -> (next_object,
		 * next_tid). Failure means this CPU's freelist changed
		 * underneath us; retry from the top.
		 */
		if (unlikely(!this_cpu_cmpxchg_double(
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
				next_object, next_tid(tid)))) {

			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}

	return object;
}
__slab_alloc
是 ___slab_alloc
的 wrapper,用於重新讀取 cpu_slab。
/*
 * Slow-path wrapper around ___slab_alloc(). With preemption enabled the
 * task may have migrated since `c` was sampled, so re-fetch the per-CPU
 * pointer with preemption disabled for the duration of the allocation.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *p;

#ifdef CONFIG_PREEMPT_COUNT
	/* Re-read cpu_slab; the cached `c` may belong to another CPU by now. */
	c = slub_get_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);

#ifdef CONFIG_PREEMPT_COUNT
	slub_put_cpu_ptr(s->cpu_slab);
#endif

	return p;
}
___slab_alloc 如果當前 kmem_cache_cpu
沒有 slab,則跳到 new_slab
要一個 slab。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *freelist; struct slab *slab ; unsigned long flags; struct partial_context pc ; stat(s, ALLOC_SLOWPATH); reread_slab: slab = READ_ONCE(c->slab); if (!slab) { if (unlikely(node != NUMA_NO_NODE && !node_isset(node, slab_nodes))) node = NUMA_NO_NODE; goto new_slab; }
redo
如果 slab 不屬於當前 node,則跳到 deactivate_slab
檢查 slab == c->slab
,如果中間有被 preempt 造成不一致,回到 reread_slab
如果 c->freelist
有東西,表示可以直接從 freelist 拿 object,跳至 load_freelist
若 freelist 仍為空,從 slab 拿 freelist,裡面仍是空的話,new_slab
索取新的 slab
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 redo: if (unlikely(!node_match(slab, node))) { if (!node_isset(node, slab_nodes)) { node = NUMA_NO_NODE; } else { stat(s, ALLOC_NODE_MISMATCH); goto deactivate_slab; } } if (unlikely(!pfmemalloc_match(slab, gfpflags))) goto deactivate_slab; local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(slab != c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_slab; } freelist = c->freelist; if (freelist) goto load_freelist; freelist = get_freelist(s, slab); if (!freelist) { c->slab = NULL ; c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } stat(s, ALLOC_REFILL);
總結一下,若 slab 出問題,就會進入 new_slab 或 deactivate_slab 以修正 slab。 若 c->freelist
存在,則拿 c->freelist
,若沒有則拿 slab 的。 連 slab 都沒有 freelist 的話,同樣 new_slab。
走 load_freelist
條件是 freelist, slab 都存在且 node 正確,這應該在 __slab_alloc_node
就被過濾掉了,為何會走到這裡? 原因可能是 USE_LOCKLESS_FAST_PATH() == false
,或是中途被 preempt 導致經過 redo 進來這裡。
get_freelist()
函數主要獲取 slab->freelist
,將 slab->freelist
設為 NULL 並回傳原來的 freelist。如同前面講的,當 slab 被一個 cpu 使用時,其 freelist 會被清空。
/*
 * Detach the slab's own freelist for the current CPU: atomically replace
 * slab->freelist with NULL and return the old list. While a slab is in
 * per-CPU use its free objects live on the CPU freelist instead.
 * Must be called with the local cpu_slab lock held.
 */
static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
	struct slab new;
	unsigned long counters;
	void *freelist;

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	do {
		freelist = slab->freelist;
		counters = slab->counters;

		new.counters = counters;
		VM_BUG_ON(!new.frozen);

		/* Every object now counts as in use by the owning CPU. */
		new.inuse = slab->objects;
		/* Stay frozen only if we actually took a non-empty list. */
		new.frozen = freelist != NULL;

	/* Retry if freelist/counters changed between the reads and here. */
	} while (!__cmpxchg_double_slab(s, slab,
		freelist, counters,
		NULL, new.counters,
		"get_freelist"));

	return freelist;
}
load_freelist load_freelist
假設 freelist 已指向要回傳的 object,接著更新 c->freelist
成下一個 object。注意不是拿一個 freelist 回傳,因為我們在 redo 已經拿到了。
1 2 3 4 5 6 7 8 9 load_freelist: lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); return freelist;
其實 load_freelist
做的事幾乎跟 fast path 一樣,只是變成用 lock 確保同步。
get_freepointer
負責從一個 object 找到下一個 object 的位址,底下是 freelist_ptr
,若有開 harden freelist 則會與一些值 xor。
/*
 * Encode/decode the free pointer stored inside a free object. With
 * CONFIG_SLAB_FREELIST_HARDENED the stored value is XORed with a
 * per-cache random key and the byte-swapped storage address, so a heap
 * leak does not directly reveal real kernel pointers.
 */
static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
				 unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	/* kasan_reset_tag strips tag bits before the address is mixed in. */
	return (void *)((unsigned long)ptr ^ s->random ^
			swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
#else
	return ptr;
#endif
}
注意 get_freelist
是從 slab 獲取 freelist,會清空 slab->freelist
。而 get_freepointer
從一個 object 獲取下一個 object 的位址,兩者應加以區分。
deactivate_slab 將 cpu cache 資料清空,調用 deactivate_slab
函數。
1 2 3 4 5 6 7 8 9 10 11 12 13 deactivate_slab: local_lock_irqsave(&s->cpu_slab->lock, flags); if (slab != c->slab) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL ; c->freelist = NULL ; c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, slab, freelist);
/*
 * Detach a slab from a CPU: splice the CPU-held freelist back into the
 * slab, unfreeze it, then dispose of it — discard it, put it on the node
 * partial list, or (fully used) track it nowhere.
 */
static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist)
{
	enum slab_modes {
		M_NONE,		/* fate not decided yet */
		M_PARTIAL,	/* goes onto the node partial list */
		M_FREE,		/* empty: return to the page allocator */
		M_FULL_NOLIST	/* fully in use: not on any list */
	};
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	int free_delta = 0;
	enum slab_modes mode = M_NONE;
	void *nextfree, *freelist_iter, *freelist_tail;
	int tail = DEACTIVATE_TO_HEAD;
	unsigned long flags = 0;
	struct slab new;
	struct slab old;

	/* Remote CPUs freed into slab->freelist meanwhile: queue at tail. */
	if (slab->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Walk the CPU freelist to find its tail and count its objects, so
	 * it can be spliced in front of slab->freelist below.
	 */
	freelist_tail = NULL;
	freelist_iter = freelist;
	while (freelist_iter) {
		nextfree = get_freepointer(s, freelist_iter);

		/* Bail out of the walk if the chain looks corrupted. */
		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
			break;

		freelist_tail = freelist_iter;
		free_delta++;

		freelist_iter = nextfree;
	}

redo:
	old.freelist = READ_ONCE(slab->freelist);
	old.counters = READ_ONCE(slab->counters);
	VM_BUG_ON(!old.frozen);

	new.counters = old.counters;
	if (freelist_tail) {
		/* Splice: CPU list head becomes the new slab freelist. */
		new.inuse -= free_delta;
		set_freepointer(s, freelist_tail, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

	/* Pick the slab's fate from usage and node partial-list pressure. */
	if (!new.inuse && n->nr_partial >= s->min_partial) {
		mode = M_FREE;
	} else if (new.freelist) {
		mode = M_PARTIAL;
		/* Lock taken before the cmpxchg so unfreeze + list add pair up. */
		spin_lock_irqsave(&n->list_lock, flags);
	} else {
		mode = M_FULL_NOLIST;
	}

	if (!cmpxchg_double_slab(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab")) {
		/* Lost a race against a concurrent free; recompute everything. */
		if (mode == M_PARTIAL)
			spin_unlock_irqrestore(&n->list_lock, flags);
		goto redo;
	}

	if (mode == M_PARTIAL) {
		add_partial(n, slab, tail);
		spin_unlock_irqrestore(&n->list_lock, flags);
		stat(s, tail);
	} else if (mode == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	} else if (mode == M_FULL_NOLIST) {
		stat(s, DEACTIVATE_FULL);
	}
}
new_slab 從 partial list 拿一個 slab。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 new_slab: if (slub_percpu_partial(c)) { local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto new_objects; } slab = c->slab = slub_percpu_partial(c); slub_set_percpu_partial(c, slab); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; }
new_objects 獲取 node partial slab 的 freelist,或跟 buddy system 要一塊。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 new_objects: pc.flags = gfpflags; pc.slab = &slab; pc.orig_size = orig_size; freelist = get_partial(s, node, &pc); if (freelist) goto check_new_slab; slub_put_cpu_ptr(s->cpu_slab); slab = new_slab(s, gfpflags, node); c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { slab_out_of_memory(s, gfpflags, node); return NULL ; } stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { freelist = alloc_single_from_new_slab(s, slab, orig_size); if (unlikely(!freelist)) goto new_objects; if (s->flags & SLAB_STORE_USER) set_track(s, freelist, TRACK_ALLOC, addr); return freelist; } freelist = slab->freelist; slab->freelist = NULL ; slab->inuse = slab->objects; slab->frozen = 1 ; inc_slabs_node(s, slab_nid(slab), slab->objects);
若此 cache 有開啟 debug flag,追蹤資訊。
pfmemalloc 是 linux 確保最低記憶體大小的機制,假設現在記憶體快滿了,需要 kswapd 來做 swapping,但 kswapd 本身也需要記憶體。 因此 linux 將一部分記憶體標記為 PF_MEMALLOC
保留區,只有 PF_MEMALLOC
標誌的 process 能使用它。 若 slab 來自 PF_MEMALLOC
且當前 process 無 PF_MEMALLOC
標誌,則將 slab 返回給 kmem_cache_node
,確保不會從 fast path 用到它。最後, free object 仍要丟給使用者。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 check_new_slab: if (kmem_cache_debug(s)) { if (s->flags & SLAB_STORE_USER) set_track(s, freelist, TRACK_ALLOC, addr); return freelist; } if (unlikely(!pfmemalloc_match(slab, gfpflags))) { deactivate_slab(s, slab, get_freepointer(s, freelist)); return freelist; }
將新的 slab 給 cpu_slab,跳到 load_freelist。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 retry_load_slab: local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; c->slab = NULL ; c->freelist = NULL ; c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, flush_slab, flush_freelist); stat(s, CPUSLAB_FLUSH); goto retry_load_slab; } c->slab = slab; goto load_freelist; }
總結 object 分配共有 fast path 和 slow path,起始點在 slab_alloc_node
。
如果 cpu 上有 slab, freelist,則走 fast path,也就是取用 cpu 上的 freelist,並更新 freelist。整個過程用 cmpxchg
指令取代 lock,避免同步問題。 若條件不符合,則進入 ___slab_alloc
走 slow path,過程中可能發生 context switch 導致使用的 cpu 不一致或 freelist 被更改,裡面用 cpu lock 來解決此問題。
如果發現 cpu slab, freelist 又出現了,則直接拿裡面的 object,並 load_freelist 更新 freelist。 如果 node 不 match、佔用到保留記憶體,或是要新的 slab 後發現 cpu 又有 slab 可用,則 deactivate_slab 歸還給 kmem_cache_node
或 buddy system。 如果 slab 有 freelist 而 cpu 沒有,使用 slab 的 freelist。
如果 freelist 無 object 可用,new_slab 從 cpu partial 拿。 連 partial 都沒有的話,new_object 從 node partial 拿或跟 buddy system 要一塊。
所以取用 slab 的優先級是這樣的:
cpu freelist (slab freelist)
cpu partial
node partial
buddy system
這個網站 有清楚的示意圖,可以去看看。
釋放 do_slab_free do_slab_free
允許釋放多個 object 連成的 list。 若 slab != c->slab
,則呼叫 __slab_free
走 slow path。 fast path 將 object 放回當前 cpu 用的 slab,USE_LOCKLESS_FAST_PATH
啟用時,透過 cmpxchg
確保同步,否則用 lock。兩者皆使用 set_freepointer
連接 list 後,放回 cpu freelist 上。
/*
 * Free fast path: push a pre-linked list [head..tail] of cnt objects back
 * onto the current CPU's freelist. Objects belonging to a slab that is
 * not this CPU's active slab go to the __slab_free() slow path instead.
 */
static __always_inline void do_slab_free(struct kmem_cache *s,
				struct slab *slab, void *head, void *tail,
				int cnt, unsigned long addr)
{
	/* A single object is passed with tail == NULL. */
	void *tail_obj = tail ? : head;
	struct kmem_cache_cpu *c;
	unsigned long tid;
	void **freelist;

redo:
	/* Snapshot (cpu, tid); the cmpxchg/lock below revalidates them. */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/* Compiler barrier: order the c->slab read after the tid read. */
	barrier();

	if (unlikely(slab != c->slab)) {
		/* Not this CPU's active slab: take the slow path. */
		__slab_free(s, slab, head, tail_obj, cnt, addr);
		return;
	}

	if (USE_LOCKLESS_FAST_PATH()) {
		freelist = READ_ONCE(c->freelist);

		/* Link our list in front of the current CPU freelist... */
		set_freepointer(s, tail_obj, freelist);

		/* ...and publish it atomically; retry if the state moved. */
		if (unlikely(!this_cpu_cmpxchg_double(
				s->cpu_slab->freelist, s->cpu_slab->tid,
				freelist, tid,
				head, next_tid(tid)))) {

			note_cmpxchg_failure("slab_free", s, tid);
			goto redo;
		}
	} else {
		/* Lock-based variant of the same freelist update. */
		local_lock(&s->cpu_slab->lock);
		c = this_cpu_ptr(s->cpu_slab);
		if (unlikely(slab != c->slab)) {
			local_unlock(&s->cpu_slab->lock);
			goto redo;
		}
		tid = c->tid;
		freelist = c->freelist;

		set_freepointer(s, tail_obj, freelist);
		c->freelist = head;
		c->tid = next_tid(tid);

		local_unlock(&s->cpu_slab->lock);
	}
	stat(s, FREE_FASTPATH);
}
__slab_free 將要 free 的 list 接在 slab 的 freelist 前面,判斷此 slab 應丟給 node 或 cpu。
/*
 * Free slow path: splice [head..tail] (cnt objects) in front of
 * slab->freelist via cmpxchg_double, then move the slab between the
 * full / partial / empty states as needed.
 */
static void __slab_free(struct kmem_cache *s, struct slab *slab,
			void *head, void *tail, int cnt,
			unsigned long addr)
{
	void *prior;
	int was_frozen;
	struct slab new;
	unsigned long counters;
	struct kmem_cache_node *n = NULL;
	unsigned long flags;

	stat(s, FREE_SLOWPATH);

	/* KFENCE-managed objects are returned to KFENCE instead. */
	if (kfence_free(head))
		return;

	/* Debug/tiny configurations bypass the lockless machinery entirely. */
	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
		free_to_partial_list(s, slab, head, tail, cnt, addr);
		return;
	}

	do {
		/* Retry path: drop the node lock taken on the last iteration. */
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
		prior = slab->freelist;
		counters = slab->counters;
		/* Hook our tail onto the slab's existing freelist. */
		set_freepointer(s, tail, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse -= cnt;
		/* Slab becomes empty, or was full — and nobody owns it. */
		if ((!new.inuse || !prior) && !was_frozen) {

			if (kmem_cache_has_cpu_partial(s) && !prior) {
				/*
				 * Was full: freeze it so it can go onto this
				 * CPU's partial list without the node lock.
				 */
				new.frozen = 1;
			} else {
				/* Need the node list lock to move the slab. */
				n = get_node(s, slab_nid(slab));
				spin_lock_irqsave(&n->list_lock, flags);
			}
		}

	} while (!cmpxchg_double_slab(s, slab,
		prior, counters,
		head, new.counters,
		"__slab_free"));

	if (likely(!n)) {

		if (likely(was_frozen)) {
			/* Slab is owned by some CPU; nothing more to do. */
			stat(s, FREE_FROZEN);
		} else if (new.frozen) {
			/* We froze it above: park it on the CPU partial list. */
			put_cpu_partial(s, slab, 1);
			stat(s, CPU_PARTIAL_FREE);
		}

		return;
	}

	/* Empty slab and the node has enough partials: free it entirely. */
	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;

	/* Was full (no prior freelist): move it onto the partial list. */
	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
		remove_full(s, n, slab);
		add_partial(n, slab, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return;

slab_empty:
	if (prior) {
		/* Slab was on the node partial list. */
		remove_partial(n, slab);
		stat(s, FREE_REMOVE_PARTIAL);
	} else {
		/* Slab was full. */
		remove_full(s, n, slab);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, slab);
}
參考資料