1 /*
2  * linux/mm/slab.c
3  * Written by Mark Hemment, 1996/97.
4  * (markhe@nextd.demon.co.uk)
5  *
6  * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7  *
8  * Major cleanup, different bufctl logic, per-cpu arrays
9  *	(c) 2000 Manfred Spraul
10  *
11  * Cleanup, make the head arrays unconditional, preparation for NUMA
12  * 	(c) 2002 Manfred Spraul
13  *
14  * An implementation of the Slab Allocator as described in outline in;
15  *	UNIX Internals: The New Frontiers by Uresh Vahalia
16  *	Pub: Prentice Hall	ISBN 0-13-101908-2
17  * or with a little more detail in;
18  *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
19  *	Jeff Bonwick (Sun Microsystems).
20  *	Presented at: USENIX Summer 1994 Technical Conference
21  *
22  * The memory is organized in caches, one cache for each object type.
23  * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24  * Each cache consists of many slabs (they are small (usually one
25  * page long) and always contiguous), and each slab contains multiple
26  * initialized objects.
27  *
28  * This means, that your constructor is used only for newly allocated
29  * slabs and you must pass objects with the same initializations to
30  * kmem_cache_free.
31  *
32  * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33  * normal). If you need a special memory type, then you must create a new
34  * cache for that memory type.
35  *
36  * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37  *   full slabs with 0 free objects
38  *   partial slabs
39  *   empty slabs with no allocated objects
40  *
41  * If partial slabs exist, then new allocations come from these slabs,
42  * otherwise from empty slabs or new slabs are allocated.
43  *
44  * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45  * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46  *
47  * Each cache has a short per-cpu head array, most allocs
48  * and frees go into that array, and if that array overflows, then 1/2
49  * of the entries in the array are given back into the global cache.
50  * The head array is strictly LIFO and should improve the cache hit rates.
51  * On SMP, it additionally reduces the spinlock operations.
52  *
53  * The c_cpuarray may not be read with enabled local interrupts -
54  * it's changed with a smp_call_function().
55  *
56  * SMP synchronization:
57  *  constructors and destructors are called without any locking.
58  *  Several members in struct kmem_cache and struct slab never change, they
59  *	are accessed without any locking.
60  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
61  *  	and local interrupts are disabled so slab code is preempt-safe.
62  *  The non-constant members are protected with a per-cache irq spinlock.
63  *
64  * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65  * in 2000 - many ideas in the current implementation are derived from
66  * his patch.
67  *
68  * Further notes from the original documentation:
69  *
70  * 11 April '97.  Started multi-threading - markhe
71  *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72  *	The sem is only needed when accessing/extending the cache-chain, which
73  *	can never happen inside an interrupt (kmem_cache_create(),
74  *	kmem_cache_shrink() and kmem_cache_reap()).
75  *
76  *	At present, each engine can be growing a cache.  This should be blocked.
77  *
78  * 15 March 2005. NUMA slab allocator.
79  *	Shai Fultheim <shai@scalex86.org>.
80  *	Shobhit Dayal <shobhit@calsoftinc.com>
81  *	Alok N Kataria <alokk@calsoftinc.com>
82  *	Christoph Lameter <christoph@lameter.com>
83  *
84  *	Modified the slab allocator to be node aware on NUMA systems.
85  *	Each node has its own list of partial, free and full slabs.
86  *	All object allocations for a node occur from node specific slab lists.
87  */
88 
89 #include	<linux/slab.h>
90 #include	<linux/mm.h>
91 #include	<linux/poison.h>
92 #include	<linux/swap.h>
93 #include	<linux/cache.h>
94 #include	<linux/interrupt.h>
95 #include	<linux/init.h>
96 #include	<linux/compiler.h>
97 #include	<linux/cpuset.h>
98 #include	<linux/proc_fs.h>
99 #include	<linux/seq_file.h>
100 #include	<linux/notifier.h>
101 #include	<linux/kallsyms.h>
102 #include	<linux/cpu.h>
103 #include	<linux/sysctl.h>
104 #include	<linux/module.h>
105 #include	<linux/rcupdate.h>
106 #include	<linux/string.h>
107 #include	<linux/uaccess.h>
108 #include	<linux/nodemask.h>
109 #include	<linux/kmemleak.h>
110 #include	<linux/mempolicy.h>
111 #include	<linux/mutex.h>
112 #include	<linux/fault-inject.h>
113 #include	<linux/rtmutex.h>
114 #include	<linux/reciprocal_div.h>
115 #include	<linux/debugobjects.h>
116 #include	<linux/kmemcheck.h>
117 #include	<linux/memory.h>
118 
119 #include	<asm/cacheflush.h>
120 #include	<asm/tlbflush.h>
121 #include	<asm/page.h>
122 
123 /*
124  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
125  *		  0 for faster, smaller code (especially in the critical paths).
126  *
127  * STATS	- 1 to collect stats for /proc/slabinfo.
128  *		  0 for faster, smaller code (especially in the critical paths).
129  *
130  * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
131  */
132 
/* All three switches follow CONFIG_DEBUG_SLAB: 1 when it is set, 0 otherwise. */
133 #ifdef CONFIG_DEBUG_SLAB
134 #define	DEBUG		1
135 #define	STATS		1
136 #define	FORCED_DEBUG	1
137 #else
138 #define	DEBUG		0
139 #define	STATS		0
140 #define	FORCED_DEBUG	0
141 #endif
142 
143 /* Shouldn't this be in a header file somewhere? */
144 #define	BYTES_PER_WORD		sizeof(void *)
/* Red-zone alignment: the larger of a pointer and the alignment of u64. */
145 #define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
146 
/* Fallback used only when ARCH_KMALLOC_FLAGS has not been defined earlier. */
147 #ifndef ARCH_KMALLOC_FLAGS
148 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
149 #endif
150 
151 /* Legal flag mask for kmem_cache_create(). */
/*
 * The DEBUG variant additionally admits SLAB_RED_ZONE, SLAB_POISON and
 * SLAB_STORE_USER; every other flag is accepted in both variants.
 */
152 #if DEBUG
153 # define CREATE_MASK	(SLAB_RED_ZONE | \
154 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
155 			 SLAB_CACHE_DMA | \
156 			 SLAB_STORE_USER | \
157 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
158 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
159 			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
160 #else
161 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
162 			 SLAB_CACHE_DMA | \
163 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
164 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
165 			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
166 #endif
167 
168 /*
169  * kmem_bufctl_t:
170  *
171  * Bufctl's are used for linking objs within a slab by means of
172  * linked offsets.
173  *
174  * This implementation relies on "struct page" for locating the cache &
175  * slab an object belongs to.
176  * This allows the bufctl structure to be small (one int), but limits
177  * the number of objects a slab (not a cache) can contain when off-slab
178  * bufctls are used. The limit is the size of the largest general cache
179  * that does not use off-slab slabs.
180  * For 32bit archs with 4 kB pages, this is 56.
181  * This is not serious, as it is only for large objects, when it is unwise
182  * to have too many per slab.
183  * Note: This limit can be raised by introducing a general cache whose size
184  * is less than 512 (PAGE_SIZE>>3), but greater than 256.
185  */
186 
/*
 * The three highest kmem_bufctl_t values are reserved as markers, so the
 * largest usable object count per slab is SLAB_LIMIT (cache_estimate()
 * clamps to it).
 */
187 typedef unsigned int kmem_bufctl_t;
188 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
189 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
190 #define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
191 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
192 
193 /*
194  * struct slab_rcu
195  *
196  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
197  * arrange for kmem_freepages to be called via RCU.  This is useful if
198  * we need to approach a kernel structure obliquely, from its address
199  * obtained without the usual locking.  We can lock the structure to
200  * stabilize it and check it's still at the given address, only if we
201  * can be sure that the memory has not been meanwhile reused for some
202  * other kind of object (which our subsystem's lock might corrupt).
203  *
204  * rcu_read_lock before reading the address, then rcu_read_unlock after
205  * taking the spinlock within the structure expected at that address.
206  */
207 struct slab_rcu {
208 	struct rcu_head head;		/* RCU callback linkage */
209 	struct kmem_cache *cachep;	/* presumably the cache owning the pages - confirm in slab_destroy() */
210 	void *addr;			/* presumably the page address to free - confirm in slab_destroy() */
211 };
212 
213 /*
214  * struct slab
215  *
216  * Manages the objs in a slab. Placed either at the beginning of mem allocated
217  * for a slab, or allocated from a general cache.
218  * Slabs are chained into three lists: fully used, partial, fully free slabs.
219  */
220 struct slab {
	/* The management fields and struct slab_rcu share the same storage. */
221 	union {
222 		struct {
223 			struct list_head list;	/* linkage into one of the slab lists */
224 			unsigned long colouroff;	/* NOTE(review): presumably the colour offset of this slab - confirm */
225 			void *s_mem;		/* including colour offset */
226 			unsigned int inuse;	/* num of objs active in slab */
227 			kmem_bufctl_t free;	/* NOTE(review): presumably index of the first free obj - confirm */
228 			unsigned short nodeid;	/* NOTE(review): presumably the NUMA node of the slab - confirm */
229 		};
230 		struct slab_rcu __slab_cover_slab_rcu;
231 	};
232 };
233 
234 /*
235  * struct array_cache
236  *
237  * Purpose:
238  * - LIFO ordering, to hand out cache-warm objects from _alloc
239  * - reduce the number of linked list operations
240  * - reduce spinlock operations
241  *
242  * The limit is stored in the per-cpu structure to reduce the data cache
243  * footprint.
244  *
245  */
246 struct array_cache {
247 	unsigned int avail;		/* number of pointers currently stored in entry[] */
248 	unsigned int limit;		/* capacity of entry[] (see transfer_objects()) */
249 	unsigned int batchcount;	/* NOTE(review): presumably objs moved per refill/flush - confirm */
250 	unsigned int touched;		/* NOTE(review): presumably a recent-use marker for reaping - confirm */
251 	spinlock_t lock;		/* given a lockdep class for alien caches in init_node_lock_keys() */
252 	void *entry[];	/*
253 			 * Must have this definition in here for the proper
254 			 * alignment of array_cache. Also simplifies accessing
255 			 * the entries.
256 			 */
257 };
258 
259 /*
260  * bootstrap: The caches do not work without cpuarrays anymore, but the
261  * cpuarrays are allocated from the generic caches...
262  */
/*
 * Statically sized array cache for bootstrap: entries[] follows cache and
 * presumably acts as the storage behind cache.entry[] - TODO confirm.
 */
263 #define BOOT_CPUCACHE_ENTRIES	1
264 struct arraycache_init {
265 	struct array_cache cache;
266 	void *entries[BOOT_CPUCACHE_ENTRIES];
267 };
268 
269 /*
270  * The slab lists for all objects.
271  */
/* Reached per node through cachep->nodelists[nodeid] (see MAKE_LIST). */
272 struct kmem_list3 {
273 	struct list_head slabs_partial;	/* partial list first, better asm code */
274 	struct list_head slabs_full;
275 	struct list_head slabs_free;
276 	unsigned long free_objects;
277 	unsigned int free_limit;	/* not set by kmem_list3_init() */
278 	unsigned int colour_next;	/* Per-node cache coloring */
279 	spinlock_t list_lock;
280 	struct array_cache *shared;	/* shared per node */
281 	struct array_cache **alien;	/* on other nodes */
282 	unsigned long next_reap;	/* updated without locking */
283 	int free_touched;		/* updated without locking */
284 };
285 
286 /*
287  * Need this for bootstrapping a per node allocator.
288  */
/*
 * Boot-time list storage: three groups of MAX_NUMNODES entries.
 * NOTE(review): CACHE_CACHE, SIZE_AC and SIZE_L3 look like the base index of
 * each group inside initkmem_list3[] - confirm against the init code.
 */
289 #define NUM_INIT_LISTS (3 * MAX_NUMNODES)
290 static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
291 #define	CACHE_CACHE 0
292 #define	SIZE_AC MAX_NUMNODES
293 #define	SIZE_L3 (2 * MAX_NUMNODES)
294 
/* Forward declarations for functions defined later in the file. */
295 static int drain_freelist(struct kmem_cache *cache,
296 			struct kmem_list3 *l3, int tofree);
297 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
298 			int node);
299 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
300 static void cache_reap(struct work_struct *unused);
301 
302 /*
303  * This function must be completely optimized away if a constant is passed to
304  * it.  Mostly the same as what is in linux/slab.h except it returns an index.
305  */
index_of(const size_t size)306 static __always_inline int index_of(const size_t size)
307 {
308 	extern void __bad_size(void);
309 
310 	if (__builtin_constant_p(size)) {
311 		int i = 0;
312 
313 #define CACHE(x) \
314 	if (size <=x) \
315 		return i; \
316 	else \
317 		i++;
318 #include <linux/kmalloc_sizes.h>
319 #undef CACHE
320 		__bad_size();
321 	} else
322 		__bad_size();
323 	return 0;
324 }
325 
326 static int slab_early_init = 1;	/* NOTE(review): not read in this part of the file */
327 
/* Size-class positions (via index_of) for the two bootstrap structures. */
328 #define INDEX_AC index_of(sizeof(struct arraycache_init))
329 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
330 
kmem_list3_init(struct kmem_list3 * parent)331 static void kmem_list3_init(struct kmem_list3 *parent)
332 {
333 	INIT_LIST_HEAD(&parent->slabs_full);
334 	INIT_LIST_HEAD(&parent->slabs_partial);
335 	INIT_LIST_HEAD(&parent->slabs_free);
336 	parent->shared = NULL;
337 	parent->alien = NULL;
338 	parent->colour_next = 0;
339 	spin_lock_init(&parent->list_lock);
340 	parent->free_objects = 0;
341 	parent->free_touched = 0;
342 }
343 
/*
 * MAKE_LIST: initialise @listp as an empty list, then splice the list named
 * @slab of cachep->nodelists[nodeid] into it.
 */
344 #define MAKE_LIST(cachep, listp, slab, nodeid)				\
345 	do {								\
346 		INIT_LIST_HEAD(listp);					\
347 		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
348 	} while (0)
349 
/* MAKE_ALL_LISTS: apply MAKE_LIST to the full, partial and free lists of @ptr. */
350 #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
351 	do {								\
352 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
353 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
354 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
355 	} while (0)
356 
/* Flag bit tested by cache_estimate(): slab management is kept off the slab. */
357 #define CFLGS_OFF_SLAB		(0x80000000UL)
358 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
359 
360 #define BATCHREFILL_LIMIT	16
361 /*
362  * Optimization question: fewer reaps means less probability for unnecessary
363  * cpucache drain/refill cycles.
364  *
365  * OTOH the cpuarrays can contain lots of objects,
366  * which could lock up otherwise freeable slabs.
367  */
368 #define REAPTIMEOUT_CPUC	(2*HZ)
369 #define REAPTIMEOUT_LIST3	(4*HZ)
370 
/*
 * Statistics hooks.  With STATS enabled they update counters stored in the
 * cache (x); otherwise they expand to empty statements.  STATS_ADD_REAPED
 * still evaluates its second argument when disabled.
 */
#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
/* Remember the largest num_active seen so far. */
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
/*
 * Raise max_freeable to i when i is larger.  The argument is parenthesised
 * so that an expression such as "a ? b : c" or "a | b" expands correctly.
 */
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < (i))				\
			(x)->max_freeable = (i);			\
	} while (0)
#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_ADD_REAPED(x,y)	do { (void)(y); } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
#define	STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif
412 
413 #if DEBUG
414 
415 /*
416  * memory layout of objects:
417  * 0		: objp
418  * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
419  * 		the end of an object is aligned with the end of the real
420  * 		allocation. Catches writes behind the end of the allocation.
421  * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
422  * 		redzone word.
423  * cachep->obj_offset: The real object.
424  * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
425  * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
426  *					[BYTES_PER_WORD long]
427  */
obj_offset(struct kmem_cache * cachep)428 static int obj_offset(struct kmem_cache *cachep)
429 {
430 	return cachep->obj_offset;
431 }
432 
obj_size(struct kmem_cache * cachep)433 static int obj_size(struct kmem_cache *cachep)
434 {
435 	return cachep->obj_size;
436 }
437 
dbg_redzone1(struct kmem_cache * cachep,void * objp)438 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
439 {
440 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
441 	return (unsigned long long*) (objp + obj_offset(cachep) -
442 				      sizeof(unsigned long long));
443 }
444 
dbg_redzone2(struct kmem_cache * cachep,void * objp)445 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
446 {
447 	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
448 	if (cachep->flags & SLAB_STORE_USER)
449 		return (unsigned long long *)(objp + cachep->buffer_size -
450 					      sizeof(unsigned long long) -
451 					      REDZONE_ALIGN);
452 	return (unsigned long long *) (objp + cachep->buffer_size -
453 				       sizeof(unsigned long long));
454 }
455 
dbg_userword(struct kmem_cache * cachep,void * objp)456 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
457 {
458 	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
459 	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
460 }
461 
462 #else
463 
/*
 * Non-DEBUG fallbacks: objects start at offset 0, the object size equals
 * buffer_size, and the red-zone/user-word accessors BUG() if reached.
 */
464 #define obj_offset(x)			0
465 #define obj_size(cachep)		(cachep->buffer_size)
466 #define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
467 #define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
468 #define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
469 
470 #endif
471 
472 #ifdef CONFIG_TRACING
slab_buffer_size(struct kmem_cache * cachep)473 size_t slab_buffer_size(struct kmem_cache *cachep)
474 {
475 	return cachep->buffer_size;
476 }
477 EXPORT_SYMBOL(slab_buffer_size);
478 #endif
479 
480 /*
481  * Do not go above this order unless 0 objects fit into the slab.
482  */
483 #define	BREAK_GFP_ORDER_HI	1	/* NOTE(review): not used in this part of the file */
484 #define	BREAK_GFP_ORDER_LO	0
485 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;	/* starts at the LO value */
486 
487 /*
488  * Functions for storing/retrieving the cachep and or slab from the page
489  * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
490  * these are used to find the cache which an obj belongs to.
491  */
page_set_cache(struct page * page,struct kmem_cache * cache)492 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
493 {
494 	page->lru.next = (struct list_head *)cache;
495 }
496 
page_get_cache(struct page * page)497 static inline struct kmem_cache *page_get_cache(struct page *page)
498 {
499 	page = compound_head(page);
500 	BUG_ON(!PageSlab(page));
501 	return (struct kmem_cache *)page->lru.next;
502 }
503 
page_set_slab(struct page * page,struct slab * slab)504 static inline void page_set_slab(struct page *page, struct slab *slab)
505 {
506 	page->lru.prev = (struct list_head *)slab;
507 }
508 
page_get_slab(struct page * page)509 static inline struct slab *page_get_slab(struct page *page)
510 {
511 	BUG_ON(!PageSlab(page));
512 	return (struct slab *)page->lru.prev;
513 }
514 
/* Cache owning the object at @obj, looked up through its head page. */
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
	return page_get_cache(virt_to_head_page(obj));
}
520 
/* Slab descriptor for the object at @obj, looked up through its head page. */
static inline struct slab *virt_to_slab(const void *obj)
{
	return page_get_slab(virt_to_head_page(obj));
}
526 
index_to_obj(struct kmem_cache * cache,struct slab * slab,unsigned int idx)527 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
528 				 unsigned int idx)
529 {
530 	return slab->s_mem + cache->buffer_size * idx;
531 }
532 
533 /*
534  * We want to avoid an expensive divide : (offset / cache->buffer_size)
535  *   Using the fact that buffer_size is a constant for a particular cache,
536  *   we can replace (offset / cache->buffer_size) by
537  *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
538  */
obj_to_index(const struct kmem_cache * cache,const struct slab * slab,void * obj)539 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
540 					const struct slab *slab, void *obj)
541 {
542 	u32 offset = (obj - slab->s_mem);
543 	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
544 }
545 
546 /*
547  * These are the default caches for kmalloc. Custom caches can have other sizes.
548  */
/*
 * One entry per CACHE(x) line of <linux/kmalloc_sizes.h>, followed by a
 * ULONG_MAX terminator that the table walks in this file stop at.
 */
549 struct cache_sizes malloc_sizes[] = {
550 #define CACHE(x) { .cs_size = (x) },
551 #include <linux/kmalloc_sizes.h>
552 	CACHE(ULONG_MAX)
553 #undef CACHE
554 };
555 EXPORT_SYMBOL(malloc_sizes);
556 
557 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
558 struct cache_names {
559 	char *name;		/* "size-<x>" */
560 	char *name_dma;		/* "size-<x>(DMA)" */
561 };
562 
/* Built from the same size list as malloc_sizes; ends with a NULL entry. */
563 static struct cache_names __initdata cache_names[] = {
564 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
565 #include <linux/kmalloc_sizes.h>
566 	{NULL,}
567 #undef CACHE
568 };
569 
/*
 * Bootstrap array caches.  The positional initialiser fills, in order,
 * avail = 0, limit = BOOT_CPUCACHE_ENTRIES, batchcount = 1, touched = 0.
 */
570 static struct arraycache_init initarray_cache __initdata =
571     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
572 static struct arraycache_init initarray_generic =
573     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
574 
575 /* internal cache of cache description objs */
576 static struct kmem_cache cache_cache = {
577 	.batchcount = 1,
578 	.limit = BOOT_CPUCACHE_ENTRIES,
579 	.shared = 1,
580 	.buffer_size = sizeof(struct kmem_cache),
581 	.name = "kmem_cache",
582 };
583 
/* Dummy non-NULL value returned by alloc_alien_cache() on non-NUMA builds. */
584 #define BAD_ALIEN_MAGIC 0x01020304ul
585 
586 /*
587  * chicken and egg problem: delay the per-cpu array allocation
588  * until the general caches are up.
589  */
590 static enum {
591 	NONE,
592 	PARTIAL_AC,
593 	PARTIAL_L3,
594 	EARLY,		/* from here on slab_is_available() reports true */
595 	FULL		/* required by init_node_lock_keys() */
596 } g_cpucache_up;
597 
598 /*
599  * used by boot code to determine if it can use slab based allocator
600  */
slab_is_available(void)601 int slab_is_available(void)
602 {
603 	return g_cpucache_up >= EARLY;
604 }
605 
606 #ifdef CONFIG_LOCKDEP
607 
608 /*
609  * Slab sometimes uses the kmalloc slabs to store the slab headers
610  * for other slabs "off slab".
611  * The locking for this is tricky in that it nests within the locks
612  * of all other slabs in a few places; to deal with this special
613  * locking we put on-slab caches into a separate lock-class.
614  *
615  * We set lock class for alien array caches which are up during init.
616  * The lock annotation will be lost if all cpus of a node go down and
617  * then come back up during hotplug.
618  */
619 static struct lock_class_key on_slab_l3_key;
620 static struct lock_class_key on_slab_alc_key;
621 
init_node_lock_keys(int q)622 static void init_node_lock_keys(int q)
623 {
624 	struct cache_sizes *s = malloc_sizes;
625 
626 	if (g_cpucache_up != FULL)
627 		return;
628 
629 	for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
630 		struct array_cache **alc;
631 		struct kmem_list3 *l3;
632 		int r;
633 
634 		l3 = s->cs_cachep->nodelists[q];
635 		if (!l3 || OFF_SLAB(s->cs_cachep))
636 			continue;
637 		lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
638 		alc = l3->alien;
639 		/*
640 		 * FIXME: This check for BAD_ALIEN_MAGIC
641 		 * should go away when common slab code is taught to
642 		 * work even without alien caches.
643 		 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
644 		 * for alloc_alien_cache,
645 		 */
646 		if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
647 			continue;
648 		for_each_node(r) {
649 			if (alc[r])
650 				lockdep_set_class(&alc[r]->lock,
651 					&on_slab_alc_key);
652 		}
653 	}
654 }
655 
init_lock_keys(void)656 static inline void init_lock_keys(void)
657 {
658 	int node;
659 
660 	for_each_node(node)
661 		init_node_lock_keys(node);
662 }
663 #else
/* CONFIG_LOCKDEP is off: there are no lock classes to assign for node @q. */
static void init_node_lock_keys(int q)
{
	/* intentionally empty */
}
667 
/* CONFIG_LOCKDEP is off: nothing to do. */
static inline void init_lock_keys(void)
{
	/* intentionally empty */
}
671 #endif
672 
673 /*
674  * Guard access to the cache-chain.
675  */
676 static DEFINE_MUTEX(cache_chain_mutex);
677 static struct list_head cache_chain;
678 
/* Per-cpu delayed work that start_cpu_timer() arms with cache_reap(). */
679 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
680 
cpu_cache_get(struct kmem_cache * cachep)681 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
682 {
683 	return cachep->array[smp_processor_id()];
684 }
685 
/*
 * Pick the general (kmalloc) cache for a request of @size bytes.
 *
 * Returns ZERO_SIZE_PTR for a zero size.  Otherwise the first malloc_sizes[]
 * entry whose cs_size is at least @size is used: its DMA cache when
 * CONFIG_ZONE_DMA is set and @gfpflags contains GFP_DMA, else its normal
 * cache.
 */
static inline struct kmem_cache *__find_general_cachep(size_t size,
							gfp_t gfpflags)
{
	struct cache_sizes *bucket;

#if DEBUG
	/*
	 * Trips when kmem_cache_create() or __kmalloc() is called before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	if (size == 0)
		return ZERO_SIZE_PTR;

	/*
	 * Really subtle: the table ends with cs_size == ULONG_MAX, whose
	 * cs_{dma,}cachep are NULL, so oversized requests need no special
	 * case and the walk always stops.
	 */
	for (bucket = malloc_sizes; bucket->cs_size < size; bucket++)
		;

#ifdef CONFIG_ZONE_DMA
	if (unlikely(gfpflags & GFP_DMA))
		return bucket->cs_dmacachep;
#endif
	return bucket->cs_cachep;
}
715 
/* Out-of-line wrapper around __find_general_cachep(). */
static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	struct kmem_cache *cachep = __find_general_cachep(size, gfpflags);

	return cachep;
}
720 
slab_mgmt_size(size_t nr_objs,size_t align)721 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
722 {
723 	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
724 }
725 
726 /*
727  * Calculate the number of objects and left-over bytes for a given buffer size.
728  */
cache_estimate(unsigned long gfporder,size_t buffer_size,size_t align,int flags,size_t * left_over,unsigned int * num)729 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
730 			   size_t align, int flags, size_t *left_over,
731 			   unsigned int *num)
732 {
733 	int nr_objs;
734 	size_t mgmt_size;
735 	size_t slab_size = PAGE_SIZE << gfporder;
736 
737 	/*
738 	 * The slab management structure can be either off the slab or
739 	 * on it. For the latter case, the memory allocated for a
740 	 * slab is used for:
741 	 *
742 	 * - The struct slab
743 	 * - One kmem_bufctl_t for each object
744 	 * - Padding to respect alignment of @align
745 	 * - @buffer_size bytes for each object
746 	 *
747 	 * If the slab management structure is off the slab, then the
748 	 * alignment will already be calculated into the size. Because
749 	 * the slabs are all pages aligned, the objects will be at the
750 	 * correct alignment when allocated.
751 	 */
752 	if (flags & CFLGS_OFF_SLAB) {
753 		mgmt_size = 0;
754 		nr_objs = slab_size / buffer_size;
755 
756 		if (nr_objs > SLAB_LIMIT)
757 			nr_objs = SLAB_LIMIT;
758 	} else {
759 		/*
760 		 * Ignore padding for the initial guess. The padding
761 		 * is at most @align-1 bytes, and @buffer_size is at
762 		 * least @align. In the worst case, this result will
763 		 * be one greater than the number of objects that fit
764 		 * into the memory allocation when taking the padding
765 		 * into account.
766 		 */
767 		nr_objs = (slab_size - sizeof(struct slab)) /
768 			  (buffer_size + sizeof(kmem_bufctl_t));
769 
770 		/*
771 		 * This calculated number will be either the right
772 		 * amount, or one greater than what we want.
773 		 */
774 		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
775 		       > slab_size)
776 			nr_objs--;
777 
778 		if (nr_objs > SLAB_LIMIT)
779 			nr_objs = SLAB_LIMIT;
780 
781 		mgmt_size = slab_mgmt_size(nr_objs, align);
782 	}
783 	*num = nr_objs;
784 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
785 }
786 
787 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
788 
__slab_error(const char * function,struct kmem_cache * cachep,char * msg)789 static void __slab_error(const char *function, struct kmem_cache *cachep,
790 			char *msg)
791 {
792 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
793 	       function, cachep->name, msg);
794 	dump_stack();
795 }
796 
797 /*
798  * By default on NUMA we use alien caches to stage the freeing of
799  * objects allocated from other nodes. This causes massive memory
800  * inefficiencies when using fake NUMA setup to split memory into a
801  * large number of small nodes, so it can be disabled on the command
802  * line
803   */
804 
805 static int use_alien_caches __read_mostly = 1;
noaliencache_setup(char * s)806 static int __init noaliencache_setup(char *s)
807 {
808 	use_alien_caches = 0;
809 	return 1;
810 }
811 __setup("noaliencache", noaliencache_setup);
812 
813 #ifdef CONFIG_NUMA
814 /*
815  * Special reaping functions for NUMA systems called from cache_reap().
816  * These take care of doing round robin flushing of alien caches (containing
817  * objects freed on different nodes from which they were allocated) and the
818  * flushing of remote pcps by calling drain_node_pages.
819  */
820 static DEFINE_PER_CPU(unsigned long, slab_reap_node);
821 
init_reap_node(int cpu)822 static void init_reap_node(int cpu)
823 {
824 	int node;
825 
826 	node = next_node(cpu_to_mem(cpu), node_online_map);
827 	if (node == MAX_NUMNODES)
828 		node = first_node(node_online_map);
829 
830 	per_cpu(slab_reap_node, cpu) = node;
831 }
832 
next_reap_node(void)833 static void next_reap_node(void)
834 {
835 	int node = __this_cpu_read(slab_reap_node);
836 
837 	node = next_node(node, node_online_map);
838 	if (unlikely(node >= MAX_NUMNODES))
839 		node = first_node(node_online_map);
840 	__this_cpu_write(slab_reap_node, node);
841 }
842 
843 #else
/* Non-NUMA builds: the reap-node helpers expand to empty statements. */
844 #define init_reap_node(cpu) do { } while (0)
845 #define next_reap_node(void) do { } while (0)
846 #endif
847 
848 /*
849  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
850  * via the workqueue/eventd.
851  * Add the CPU number into the expiration time to minimize the possibility of
852  * the CPUs getting into lockstep and contending for the global cache chain
853  * lock.
854  */
start_cpu_timer(int cpu)855 static void __cpuinit start_cpu_timer(int cpu)
856 {
857 	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
858 
859 	/*
860 	 * When this gets called from do_initcalls via cpucache_init(),
861 	 * init_workqueues() has already run, so keventd will be setup
862 	 * at that time.
863 	 */
864 	if (keventd_up() && reap_work->work.func == NULL) {
865 		init_reap_node(cpu);
866 		INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
867 		schedule_delayed_work_on(cpu, reap_work,
868 					__round_jiffies_relative(HZ, cpu));
869 	}
870 }
871 
/*
 * Allocate an array cache with room for @entries object pointers on @node.
 *
 * @node:       node passed to kmalloc_node()
 * @entries:    number of pointer slots; stored as the cache's limit
 * @batchcount: stored as the cache's batchcount
 * @gfp:        allocation flags
 *
 * Returns the new, empty array cache (avail = 0, touched = 0, lock
 * initialised), or NULL if the allocation failed.
 */
static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount, gfp_t gfp)
{
	/*
	 * Keep the byte count in size_t: the sizeof arithmetic is already
	 * size_t, and narrowing it to int could truncate the request before
	 * it reaches kmalloc_node().
	 */
	size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *nc;

	nc = kmalloc_node(memsize, gfp, node);
	/*
	 * The array_cache structures contain pointers to free objects.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(nc);
	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;
		nc->touched = 0;
		spin_lock_init(&nc->lock);
	}
	return nc;
}
896 
897 /*
898  * Transfer objects in one arraycache to another.
899  * Locking must be handled by the caller.
900  *
901  * Return the number of entries transferred.
902  */
transfer_objects(struct array_cache * to,struct array_cache * from,unsigned int max)903 static int transfer_objects(struct array_cache *to,
904 		struct array_cache *from, unsigned int max)
905 {
906 	/* Figure out how many entries to transfer */
907 	int nr = min3(from->avail, max, to->limit - to->avail);
908 
909 	if (!nr)
910 		return 0;
911 
912 	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
913 			sizeof(void *) *nr);
914 
915 	from->avail -= nr;
916 	to->avail += nr;
917 	return nr;
918 }
919 
920 #ifndef CONFIG_NUMA
921 
/*
 * !CONFIG_NUMA: draining and reaping alien caches expand to empty
 * statements, so callers can invoke them without their own #ifdef.
 */
#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, l3) do { } while (0)
924 
/*
 * !CONFIG_NUMA stub: nothing is allocated.  All arguments are ignored and
 * the constant BAD_ALIEN_MAGIC, cast to the pointer type, is returned.
 */
static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
	struct array_cache **magic = (struct array_cache **)BAD_ALIEN_MAGIC;

	return magic;
}
929 
/*
 * !CONFIG_NUMA stub: there is nothing to release, @ac_ptr is ignored.
 */
static inline void free_alien_cache(struct array_cache **ac_ptr)
{
	/* intentionally empty */
}
933 
/*
 * !CONFIG_NUMA stub: never handles the object itself; always returns 0.
 */
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	return 0;
}
938 
/*
 * !CONFIG_NUMA stub: no alternate node exists, so no object is ever
 * returned (always NULL).
 */
static inline void *alternate_node_alloc(struct kmem_cache *cachep,
					 gfp_t flags)
{
	return NULL;
}
944 
/*
 * !CONFIG_NUMA stub: node-specific allocation is not available, always
 * returns NULL.
 */
static inline void *____cache_alloc_node(struct kmem_cache *cachep,
					 gfp_t flags, int nodeid)
{
	return NULL;
}
950 
951 #else	/* CONFIG_NUMA */
952 
/*
 * Prototypes of the CONFIG_NUMA variants of the two allocation helpers;
 * the !CONFIG_NUMA branch provides inline stubs that return NULL instead.
 */
static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
955 
/*
 * Build the table of alien array caches for @node: one slot per node id,
 * with an array_cache allocated for every online node other than @node
 * itself.  All memory is requested from @node.
 *
 * Any @limit greater than 1 is replaced by 12.
 *
 * Returns the table, or NULL if any allocation failed (everything that
 * was allocated up to that point is freed again).
 */
static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
	struct array_cache **table;
	int memsize = sizeof(void *) * nr_node_ids;
	int nid;

	if (limit > 1)
		limit = 12;

	table = kzalloc_node(memsize, gfp, node);
	if (!table)
		return NULL;

	for_each_node(nid) {
		if (nid == node || !node_online(nid))
			continue;
		table[nid] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
		if (!table[nid])
			goto unwind;
	}
	return table;

unwind:
	/* The table was zeroed, so untouched slots are NULL and safe to kfree */
	while (--nid >= 0)
		kfree(table[nid]);
	kfree(table);
	return NULL;
}
980 
free_alien_cache(struct array_cache ** ac_ptr)981 static void free_alien_cache(struct array_cache **ac_ptr)
982 {
983 	int i;
984 
985 	if (!ac_ptr)
986 		return;
987 	for_each_node(i)
988 	    kfree(ac_ptr[i]);
989 	kfree(ac_ptr);
990 }
991 
__drain_alien_cache(struct kmem_cache * cachep,struct array_cache * ac,int node)992 static void __drain_alien_cache(struct kmem_cache *cachep,
993 				struct array_cache *ac, int node)
994 {
995 	struct kmem_list3 *rl3 = cachep->nodelists[node];
996 
997 	if (ac->avail) {
998 		spin_lock(&rl3->list_lock);
999 		/*
1000 		 * Stuff objects into the remote nodes shared array first.
1001 		 * That way we could avoid the overhead of putting the objects
1002 		 * into the free lists and getting them back later.
1003 		 */
1004 		if (rl3->shared)
1005 			transfer_objects(rl3->shared, ac, ac->limit);
1006 
1007 		free_block(cachep, ac->entry, ac->avail, node);
1008 		ac->avail = 0;
1009 		spin_unlock(&rl3->list_lock);
1010 	}
1011 }
1012 
1013 /*
1014  * Called from cache_reap() to regularly drain alien caches round robin.
1015  */
reap_alien(struct kmem_cache * cachep,struct kmem_list3 * l3)1016 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1017 {
1018 	int node = __this_cpu_read(slab_reap_node);
1019 
1020 	if (l3->alien) {
1021 		struct array_cache *ac = l3->alien[node];
1022 
1023 		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1024 			__drain_alien_cache(cachep, ac, node);
1025 			spin_unlock_irq(&ac->lock);
1026 		}
1027 	}
1028 }
1029 
drain_alien_cache(struct kmem_cache * cachep,struct array_cache ** alien)1030 static void drain_alien_cache(struct kmem_cache *cachep,
1031 				struct array_cache **alien)
1032 {
1033 	int i = 0;
1034 	struct array_cache *ac;
1035 	unsigned long flags;
1036 
1037 	for_each_online_node(i) {
1038 		ac = alien[i];
1039 		if (ac) {
1040 			spin_lock_irqsave(&ac->lock, flags);
1041 			__drain_alien_cache(cachep, ac, i);
1042 			spin_unlock_irqrestore(&ac->lock, flags);
1043 		}
1044 	}
1045 }
1046 
cache_free_alien(struct kmem_cache * cachep,void * objp)1047 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1048 {
1049 	struct slab *slabp = virt_to_slab(objp);
1050 	int nodeid = slabp->nodeid;
1051 	struct kmem_list3 *l3;
1052 	struct array_cache *alien = NULL;
1053 	int node;
1054 
1055 	node = numa_mem_id();
1056 
1057 	/*
1058 	 * Make sure we are not freeing a object from another node to the array
1059 	 * cache on this cpu.
1060 	 */
1061 	if (likely(slabp->nodeid == node))
1062 		return 0;
1063 
1064 	l3 = cachep->nodelists[node];
1065 	STATS_INC_NODEFREES(cachep);
1066 	if (l3->alien && l3->alien[nodeid]) {
1067 		alien = l3->alien[nodeid];
1068 		spin_lock(&alien->lock);
1069 		if (unlikely(alien->avail == alien->limit)) {
1070 			STATS_INC_ACOVERFLOW(cachep);
1071 			__drain_alien_cache(cachep, alien, nodeid);
1072 		}
1073 		alien->entry[alien->avail++] = objp;
1074 		spin_unlock(&alien->lock);
1075 	} else {
1076 		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1077 		free_block(cachep, &objp, 1, nodeid);
1078 		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1079 	}
1080 	return 1;
1081 }
1082 #endif
1083 
1084 /*
1085  * Allocates and initializes nodelists for a node on each slab cache, used for
1086  * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
1087  * will be allocated off-node since memory is not yet online for the new node.
1088  * When hotplugging memory or a cpu, existing nodelists are not replaced if
1089  * already in use.
1090  *
1091  * Must hold cache_chain_mutex.
1092  */
init_cache_nodelists_node(int node)1093 static int init_cache_nodelists_node(int node)
1094 {
1095 	struct kmem_cache *cachep;
1096 	struct kmem_list3 *l3;
1097 	const int memsize = sizeof(struct kmem_list3);
1098 
1099 	list_for_each_entry(cachep, &cache_chain, next) {
1100 		/*
1101 		 * Set up the size64 kmemlist for cpu before we can
1102 		 * begin anything. Make sure some other cpu on this
1103 		 * node has not already allocated this
1104 		 */
1105 		if (!cachep->nodelists[node]) {
1106 			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1107 			if (!l3)
1108 				return -ENOMEM;
1109 			kmem_list3_init(l3);
1110 			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1111 			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1112 
1113 			/*
1114 			 * The l3s don't come and go as CPUs come and
1115 			 * go.  cache_chain_mutex is sufficient
1116 			 * protection here.
1117 			 */
1118 			cachep->nodelists[node] = l3;
1119 		}
1120 
1121 		spin_lock_irq(&cachep->nodelists[node]->list_lock);
1122 		cachep->nodelists[node]->free_limit =
1123 			(1 + nr_cpus_node(node)) *
1124 			cachep->batchcount + cachep->num;
1125 		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1126 	}
1127 	return 0;
1128 }
1129 
cpuup_canceled(long cpu)1130 static void __cpuinit cpuup_canceled(long cpu)
1131 {
1132 	struct kmem_cache *cachep;
1133 	struct kmem_list3 *l3 = NULL;
1134 	int node = cpu_to_mem(cpu);
1135 	const struct cpumask *mask = cpumask_of_node(node);
1136 
1137 	list_for_each_entry(cachep, &cache_chain, next) {
1138 		struct array_cache *nc;
1139 		struct array_cache *shared;
1140 		struct array_cache **alien;
1141 
1142 		/* cpu is dead; no one can alloc from it. */
1143 		nc = cachep->array[cpu];
1144 		cachep->array[cpu] = NULL;
1145 		l3 = cachep->nodelists[node];
1146 
1147 		if (!l3)
1148 			goto free_array_cache;
1149 
1150 		spin_lock_irq(&l3->list_lock);
1151 
1152 		/* Free limit for this kmem_list3 */
1153 		l3->free_limit -= cachep->batchcount;
1154 		if (nc)
1155 			free_block(cachep, nc->entry, nc->avail, node);
1156 
1157 		if (!cpumask_empty(mask)) {
1158 			spin_unlock_irq(&l3->list_lock);
1159 			goto free_array_cache;
1160 		}
1161 
1162 		shared = l3->shared;
1163 		if (shared) {
1164 			free_block(cachep, shared->entry,
1165 				   shared->avail, node);
1166 			l3->shared = NULL;
1167 		}
1168 
1169 		alien = l3->alien;
1170 		l3->alien = NULL;
1171 
1172 		spin_unlock_irq(&l3->list_lock);
1173 
1174 		kfree(shared);
1175 		if (alien) {
1176 			drain_alien_cache(cachep, alien);
1177 			free_alien_cache(alien);
1178 		}
1179 free_array_cache:
1180 		kfree(nc);
1181 	}
1182 	/*
1183 	 * In the previous loop, all the objects were freed to
1184 	 * the respective cache's slabs,  now we can go ahead and
1185 	 * shrink each nodelist to its limit.
1186 	 */
1187 	list_for_each_entry(cachep, &cache_chain, next) {
1188 		l3 = cachep->nodelists[node];
1189 		if (!l3)
1190 			continue;
1191 		drain_freelist(cachep, l3, l3->free_objects);
1192 	}
1193 }
1194 
cpuup_prepare(long cpu)1195 static int __cpuinit cpuup_prepare(long cpu)
1196 {
1197 	struct kmem_cache *cachep;
1198 	struct kmem_list3 *l3 = NULL;
1199 	int node = cpu_to_mem(cpu);
1200 	int err;
1201 
1202 	/*
1203 	 * We need to do this right in the beginning since
1204 	 * alloc_arraycache's are going to use this list.
1205 	 * kmalloc_node allows us to add the slab to the right
1206 	 * kmem_list3 and not this cpu's kmem_list3
1207 	 */
1208 	err = init_cache_nodelists_node(node);
1209 	if (err < 0)
1210 		goto bad;
1211 
1212 	/*
1213 	 * Now we can go ahead with allocating the shared arrays and
1214 	 * array caches
1215 	 */
1216 	list_for_each_entry(cachep, &cache_chain, next) {
1217 		struct array_cache *nc;
1218 		struct array_cache *shared = NULL;
1219 		struct array_cache **alien = NULL;
1220 
1221 		nc = alloc_arraycache(node, cachep->limit,
1222 					cachep->batchcount, GFP_KERNEL);
1223 		if (!nc)
1224 			goto bad;
1225 		if (cachep->shared) {
1226 			shared = alloc_arraycache(node,
1227 				cachep->shared * cachep->batchcount,
1228 				0xbaadf00d, GFP_KERNEL);
1229 			if (!shared) {
1230 				kfree(nc);
1231 				goto bad;
1232 			}
1233 		}
1234 		if (use_alien_caches) {
1235 			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1236 			if (!alien) {
1237 				kfree(shared);
1238 				kfree(nc);
1239 				goto bad;
1240 			}
1241 		}
1242 		cachep->array[cpu] = nc;
1243 		l3 = cachep->nodelists[node];
1244 		BUG_ON(!l3);
1245 
1246 		spin_lock_irq(&l3->list_lock);
1247 		if (!l3->shared) {
1248 			/*
1249 			 * We are serialised from CPU_DEAD or
1250 			 * CPU_UP_CANCELLED by the cpucontrol lock
1251 			 */
1252 			l3->shared = shared;
1253 			shared = NULL;
1254 		}
1255 #ifdef CONFIG_NUMA
1256 		if (!l3->alien) {
1257 			l3->alien = alien;
1258 			alien = NULL;
1259 		}
1260 #endif
1261 		spin_unlock_irq(&l3->list_lock);
1262 		kfree(shared);
1263 		free_alien_cache(alien);
1264 	}
1265 	init_node_lock_keys(node);
1266 
1267 	return 0;
1268 bad:
1269 	cpuup_canceled(cpu);
1270 	return -ENOMEM;
1271 }
1272 
cpuup_callback(struct notifier_block * nfb,unsigned long action,void * hcpu)1273 static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1274 				    unsigned long action, void *hcpu)
1275 {
1276 	long cpu = (long)hcpu;
1277 	int err = 0;
1278 
1279 	switch (action) {
1280 	case CPU_UP_PREPARE:
1281 	case CPU_UP_PREPARE_FROZEN:
1282 		mutex_lock(&cache_chain_mutex);
1283 		err = cpuup_prepare(cpu);
1284 		mutex_unlock(&cache_chain_mutex);
1285 		break;
1286 	case CPU_ONLINE:
1287 	case CPU_ONLINE_FROZEN:
1288 		start_cpu_timer(cpu);
1289 		break;
1290 #ifdef CONFIG_HOTPLUG_CPU
1291   	case CPU_DOWN_PREPARE:
1292   	case CPU_DOWN_PREPARE_FROZEN:
1293 		/*
1294 		 * Shutdown cache reaper. Note that the cache_chain_mutex is
1295 		 * held so that if cache_reap() is invoked it cannot do
1296 		 * anything expensive but will only modify reap_work
1297 		 * and reschedule the timer.
1298 		*/
1299 		cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1300 		/* Now the cache_reaper is guaranteed to be not running. */
1301 		per_cpu(slab_reap_work, cpu).work.func = NULL;
1302   		break;
1303   	case CPU_DOWN_FAILED:
1304   	case CPU_DOWN_FAILED_FROZEN:
1305 		start_cpu_timer(cpu);
1306   		break;
1307 	case CPU_DEAD:
1308 	case CPU_DEAD_FROZEN:
1309 		/*
1310 		 * Even if all the cpus of a node are down, we don't free the
1311 		 * kmem_list3 of any cache. This to avoid a race between
1312 		 * cpu_down, and a kmalloc allocation from another cpu for
1313 		 * memory from the node of the cpu going down.  The list3
1314 		 * structure is usually allocated from kmem_cache_create() and
1315 		 * gets destroyed at kmem_cache_destroy().
1316 		 */
1317 		/* fall through */
1318 #endif
1319 	case CPU_UP_CANCELED:
1320 	case CPU_UP_CANCELED_FROZEN:
1321 		mutex_lock(&cache_chain_mutex);
1322 		cpuup_canceled(cpu);
1323 		mutex_unlock(&cache_chain_mutex);
1324 		break;
1325 	}
1326 	return notifier_from_errno(err);
1327 }
1328 
1329 static struct notifier_block __cpuinitdata cpucache_notifier = {
1330 	&cpuup_callback, NULL, 0
1331 };
1332 
1333 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1334 /*
1335  * Drains freelist for a node on each slab cache, used for memory hot-remove.
1336  * Returns -EBUSY if all objects cannot be drained so that the node is not
1337  * removed.
1338  *
1339  * Must hold cache_chain_mutex.
1340  */
drain_cache_nodelists_node(int node)1341 static int __meminit drain_cache_nodelists_node(int node)
1342 {
1343 	struct kmem_cache *cachep;
1344 	int ret = 0;
1345 
1346 	list_for_each_entry(cachep, &cache_chain, next) {
1347 		struct kmem_list3 *l3;
1348 
1349 		l3 = cachep->nodelists[node];
1350 		if (!l3)
1351 			continue;
1352 
1353 		drain_freelist(cachep, l3, l3->free_objects);
1354 
1355 		if (!list_empty(&l3->slabs_full) ||
1356 		    !list_empty(&l3->slabs_partial)) {
1357 			ret = -EBUSY;
1358 			break;
1359 		}
1360 	}
1361 	return ret;
1362 }
1363 
slab_memory_callback(struct notifier_block * self,unsigned long action,void * arg)1364 static int __meminit slab_memory_callback(struct notifier_block *self,
1365 					unsigned long action, void *arg)
1366 {
1367 	struct memory_notify *mnb = arg;
1368 	int ret = 0;
1369 	int nid;
1370 
1371 	nid = mnb->status_change_nid;
1372 	if (nid < 0)
1373 		goto out;
1374 
1375 	switch (action) {
1376 	case MEM_GOING_ONLINE:
1377 		mutex_lock(&cache_chain_mutex);
1378 		ret = init_cache_nodelists_node(nid);
1379 		mutex_unlock(&cache_chain_mutex);
1380 		break;
1381 	case MEM_GOING_OFFLINE:
1382 		mutex_lock(&cache_chain_mutex);
1383 		ret = drain_cache_nodelists_node(nid);
1384 		mutex_unlock(&cache_chain_mutex);
1385 		break;
1386 	case MEM_ONLINE:
1387 	case MEM_OFFLINE:
1388 	case MEM_CANCEL_ONLINE:
1389 	case MEM_CANCEL_OFFLINE:
1390 		break;
1391 	}
1392 out:
1393 	return notifier_from_errno(ret);
1394 }
1395 #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1396 
1397 /*
1398  * swap the static kmem_list3 with kmalloced memory
1399  */
init_list(struct kmem_cache * cachep,struct kmem_list3 * list,int nodeid)1400 static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1401 				int nodeid)
1402 {
1403 	struct kmem_list3 *ptr;
1404 
1405 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1406 	BUG_ON(!ptr);
1407 
1408 	memcpy(ptr, list, sizeof(struct kmem_list3));
1409 	/*
1410 	 * Do not assume that spinlocks can be initialized via memcpy:
1411 	 */
1412 	spin_lock_init(&ptr->list_lock);
1413 
1414 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1415 	cachep->nodelists[nodeid] = ptr;
1416 }
1417 
1418 /*
1419  * For setting up all the kmem_list3s for cache whose buffer_size is same as
1420  * size of kmem_list3.
1421  */
set_up_list3s(struct kmem_cache * cachep,int index)1422 static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1423 {
1424 	int node;
1425 
1426 	for_each_online_node(node) {
1427 		cachep->nodelists[node] = &initkmem_list3[index + node];
1428 		cachep->nodelists[node]->next_reap = jiffies +
1429 		    REAPTIMEOUT_LIST3 +
1430 		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1431 	}
1432 }
1433 
1434 /*
1435  * Initialisation.  Called after the page allocator have been initialised and
1436  * before smp_init().
1437  */
kmem_cache_init(void)1438 void __init kmem_cache_init(void)
1439 {
1440 	size_t left_over;
1441 	struct cache_sizes *sizes;
1442 	struct cache_names *names;
1443 	int i;
1444 	int order;
1445 	int node;
1446 
1447 	if (num_possible_nodes() == 1)
1448 		use_alien_caches = 0;
1449 
1450 	for (i = 0; i < NUM_INIT_LISTS; i++) {
1451 		kmem_list3_init(&initkmem_list3[i]);
1452 		if (i < MAX_NUMNODES)
1453 			cache_cache.nodelists[i] = NULL;
1454 	}
1455 	set_up_list3s(&cache_cache, CACHE_CACHE);
1456 
1457 	/*
1458 	 * Fragmentation resistance on low memory - only use bigger
1459 	 * page orders on machines with more than 32MB of memory.
1460 	 */
1461 	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1462 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1463 
1464 	/* Bootstrap is tricky, because several objects are allocated
1465 	 * from caches that do not exist yet:
1466 	 * 1) initialize the cache_cache cache: it contains the struct
1467 	 *    kmem_cache structures of all caches, except cache_cache itself:
1468 	 *    cache_cache is statically allocated.
1469 	 *    Initially an __init data area is used for the head array and the
1470 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
1471 	 *    array at the end of the bootstrap.
1472 	 * 2) Create the first kmalloc cache.
1473 	 *    The struct kmem_cache for the new cache is allocated normally.
1474 	 *    An __init data area is used for the head array.
1475 	 * 3) Create the remaining kmalloc caches, with minimally sized
1476 	 *    head arrays.
1477 	 * 4) Replace the __init data head arrays for cache_cache and the first
1478 	 *    kmalloc cache with kmalloc allocated arrays.
1479 	 * 5) Replace the __init data for kmem_list3 for cache_cache and
1480 	 *    the other cache's with kmalloc allocated memory.
1481 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1482 	 */
1483 
1484 	node = numa_mem_id();
1485 
1486 	/* 1) create the cache_cache */
1487 	INIT_LIST_HEAD(&cache_chain);
1488 	list_add(&cache_cache.next, &cache_chain);
1489 	cache_cache.colour_off = cache_line_size();
1490 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1491 	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1492 
1493 	/*
1494 	 * struct kmem_cache size depends on nr_node_ids, which
1495 	 * can be less than MAX_NUMNODES.
1496 	 */
1497 	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
1498 				 nr_node_ids * sizeof(struct kmem_list3 *);
1499 #if DEBUG
1500 	cache_cache.obj_size = cache_cache.buffer_size;
1501 #endif
1502 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1503 					cache_line_size());
1504 	cache_cache.reciprocal_buffer_size =
1505 		reciprocal_value(cache_cache.buffer_size);
1506 
1507 	for (order = 0; order < MAX_ORDER; order++) {
1508 		cache_estimate(order, cache_cache.buffer_size,
1509 			cache_line_size(), 0, &left_over, &cache_cache.num);
1510 		if (cache_cache.num)
1511 			break;
1512 	}
1513 	BUG_ON(!cache_cache.num);
1514 	cache_cache.gfporder = order;
1515 	cache_cache.colour = left_over / cache_cache.colour_off;
1516 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1517 				      sizeof(struct slab), cache_line_size());
1518 
1519 	/* 2+3) create the kmalloc caches */
1520 	sizes = malloc_sizes;
1521 	names = cache_names;
1522 
1523 	/*
1524 	 * Initialize the caches that provide memory for the array cache and the
1525 	 * kmem_list3 structures first.  Without this, further allocations will
1526 	 * bug.
1527 	 */
1528 
1529 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1530 					sizes[INDEX_AC].cs_size,
1531 					ARCH_KMALLOC_MINALIGN,
1532 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1533 					NULL);
1534 
1535 	if (INDEX_AC != INDEX_L3) {
1536 		sizes[INDEX_L3].cs_cachep =
1537 			kmem_cache_create(names[INDEX_L3].name,
1538 				sizes[INDEX_L3].cs_size,
1539 				ARCH_KMALLOC_MINALIGN,
1540 				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1541 				NULL);
1542 	}
1543 
1544 	slab_early_init = 0;
1545 
1546 	while (sizes->cs_size != ULONG_MAX) {
1547 		/*
1548 		 * For performance, all the general caches are L1 aligned.
1549 		 * This should be particularly beneficial on SMP boxes, as it
1550 		 * eliminates "false sharing".
1551 		 * Note for systems short on memory removing the alignment will
1552 		 * allow tighter packing of the smaller caches.
1553 		 */
1554 		if (!sizes->cs_cachep) {
1555 			sizes->cs_cachep = kmem_cache_create(names->name,
1556 					sizes->cs_size,
1557 					ARCH_KMALLOC_MINALIGN,
1558 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1559 					NULL);
1560 		}
1561 #ifdef CONFIG_ZONE_DMA
1562 		sizes->cs_dmacachep = kmem_cache_create(
1563 					names->name_dma,
1564 					sizes->cs_size,
1565 					ARCH_KMALLOC_MINALIGN,
1566 					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1567 						SLAB_PANIC,
1568 					NULL);
1569 #endif
1570 		sizes++;
1571 		names++;
1572 	}
1573 	/* 4) Replace the bootstrap head arrays */
1574 	{
1575 		struct array_cache *ptr;
1576 
1577 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1578 
1579 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1580 		memcpy(ptr, cpu_cache_get(&cache_cache),
1581 		       sizeof(struct arraycache_init));
1582 		/*
1583 		 * Do not assume that spinlocks can be initialized via memcpy:
1584 		 */
1585 		spin_lock_init(&ptr->lock);
1586 
1587 		cache_cache.array[smp_processor_id()] = ptr;
1588 
1589 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1590 
1591 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1592 		       != &initarray_generic.cache);
1593 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1594 		       sizeof(struct arraycache_init));
1595 		/*
1596 		 * Do not assume that spinlocks can be initialized via memcpy:
1597 		 */
1598 		spin_lock_init(&ptr->lock);
1599 
1600 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1601 		    ptr;
1602 	}
1603 	/* 5) Replace the bootstrap kmem_list3's */
1604 	{
1605 		int nid;
1606 
1607 		for_each_online_node(nid) {
1608 			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
1609 
1610 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
1611 				  &initkmem_list3[SIZE_AC + nid], nid);
1612 
1613 			if (INDEX_AC != INDEX_L3) {
1614 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
1615 					  &initkmem_list3[SIZE_L3 + nid], nid);
1616 			}
1617 		}
1618 	}
1619 
1620 	g_cpucache_up = EARLY;
1621 }
1622 
kmem_cache_init_late(void)1623 void __init kmem_cache_init_late(void)
1624 {
1625 	struct kmem_cache *cachep;
1626 
1627 	/* 6) resize the head arrays to their final sizes */
1628 	mutex_lock(&cache_chain_mutex);
1629 	list_for_each_entry(cachep, &cache_chain, next)
1630 		if (enable_cpucache(cachep, GFP_NOWAIT))
1631 			BUG();
1632 	mutex_unlock(&cache_chain_mutex);
1633 
1634 	/* Done! */
1635 	g_cpucache_up = FULL;
1636 
1637 	/* Annotate slab for lockdep -- annotate the malloc caches */
1638 	init_lock_keys();
1639 
1640 	/*
1641 	 * Register a cpu startup notifier callback that initializes
1642 	 * cpu_cache_get for all new cpus
1643 	 */
1644 	register_cpu_notifier(&cpucache_notifier);
1645 
1646 #ifdef CONFIG_NUMA
1647 	/*
1648 	 * Register a memory hotplug callback that initializes and frees
1649 	 * nodelists.
1650 	 */
1651 	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1652 #endif
1653 
1654 	/*
1655 	 * The reap timers are started later, with a module init call: That part
1656 	 * of the kernel is not yet operational.
1657 	 */
1658 }
1659 
cpucache_init(void)1660 static int __init cpucache_init(void)
1661 {
1662 	int cpu;
1663 
1664 	/*
1665 	 * Register the timers that return unneeded pages to the page allocator
1666 	 */
1667 	for_each_online_cpu(cpu)
1668 		start_cpu_timer(cpu);
1669 	return 0;
1670 }
1671 __initcall(cpucache_init);
1672 
1673 /*
1674  * Interface to system's page allocator. No need to hold the cache-lock.
1675  *
1676  * If we requested dmaable memory, we will get it. Even if we
1677  * did not request dmaable memory, we might get it, but that
1678  * would be relatively rare and ignorable.
1679  */
kmem_getpages(struct kmem_cache * cachep,gfp_t flags,int nodeid)1680 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1681 {
1682 	struct page *page;
1683 	int nr_pages;
1684 	int i;
1685 
1686 #ifndef CONFIG_MMU
1687 	/*
1688 	 * Nommu uses slab's for process anonymous memory allocations, and thus
1689 	 * requires __GFP_COMP to properly refcount higher order allocations
1690 	 */
1691 	flags |= __GFP_COMP;
1692 #endif
1693 
1694 	flags |= cachep->gfpflags;
1695 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1696 		flags |= __GFP_RECLAIMABLE;
1697 
1698 	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1699 	if (!page)
1700 		return NULL;
1701 
1702 	nr_pages = (1 << cachep->gfporder);
1703 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1704 		add_zone_page_state(page_zone(page),
1705 			NR_SLAB_RECLAIMABLE, nr_pages);
1706 	else
1707 		add_zone_page_state(page_zone(page),
1708 			NR_SLAB_UNRECLAIMABLE, nr_pages);
1709 	for (i = 0; i < nr_pages; i++)
1710 		__SetPageSlab(page + i);
1711 
1712 	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1713 		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1714 
1715 		if (cachep->ctor)
1716 			kmemcheck_mark_uninitialized_pages(page, nr_pages);
1717 		else
1718 			kmemcheck_mark_unallocated_pages(page, nr_pages);
1719 	}
1720 
1721 	return page_address(page);
1722 }
1723 
1724 /*
1725  * Interface to system's page release.
1726  */
kmem_freepages(struct kmem_cache * cachep,void * addr)1727 static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1728 {
1729 	unsigned long i = (1 << cachep->gfporder);
1730 	struct page *page = virt_to_page(addr);
1731 	const unsigned long nr_freed = i;
1732 
1733 	kmemcheck_free_shadow(page, cachep->gfporder);
1734 
1735 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1736 		sub_zone_page_state(page_zone(page),
1737 				NR_SLAB_RECLAIMABLE, nr_freed);
1738 	else
1739 		sub_zone_page_state(page_zone(page),
1740 				NR_SLAB_UNRECLAIMABLE, nr_freed);
1741 	while (i--) {
1742 		BUG_ON(!PageSlab(page));
1743 		__ClearPageSlab(page);
1744 		page++;
1745 	}
1746 	if (current->reclaim_state)
1747 		current->reclaim_state->reclaimed_slab += nr_freed;
1748 	free_pages((unsigned long)addr, cachep->gfporder);
1749 }
1750 
kmem_rcu_free(struct rcu_head * head)1751 static void kmem_rcu_free(struct rcu_head *head)
1752 {
1753 	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1754 	struct kmem_cache *cachep = slab_rcu->cachep;
1755 
1756 	kmem_freepages(cachep, slab_rcu->addr);
1757 	if (OFF_SLAB(cachep))
1758 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
1759 }
1760 
1761 #if DEBUG
1762 
1763 #ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Record debugging information inside the object at @addr.
 *
 * Starting at the object's payload (obj_offset() into @addr) this writes
 * a start marker, @caller and the current cpu id, followed by every word
 * found on the stack from &caller onwards that kernel_text_address()
 * accepts, and finally an end marker.  Objects smaller than five longs
 * are left untouched.
 */
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
			    unsigned long caller)
{
	int size = obj_size(cachep);
	unsigned long *sptr;

	addr = (unsigned long *)((char *)addr + obj_offset(cachep));

	if (size < 5 * sizeof(unsigned long))
		return;

	*addr++ = 0x12345678;
	*addr++ = caller;
	*addr++ = smp_processor_id();
	size -= 3 * sizeof(unsigned long);

	/* Walk the stack until its end or until the object is nearly full */
	for (sptr = &caller; !kstack_end(sptr); sptr++) {
		unsigned long svalue = *sptr;

		if (!kernel_text_address(svalue))
			continue;

		*addr++ = svalue;
		size -= sizeof(unsigned long);
		/* Keep room for the end marker */
		if (size <= sizeof(unsigned long))
			break;
	}

	*addr = 0x87654321;
}
1795 #endif
1796 
poison_obj(struct kmem_cache * cachep,void * addr,unsigned char val)1797 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1798 {
1799 	int size = obj_size(cachep);
1800 	addr = &((char *)addr)[obj_offset(cachep)];
1801 
1802 	memset(addr, val, size);
1803 	*(unsigned char *)(addr + size - 1) = POISON_END;
1804 }
1805 
dump_line(char * data,int offset,int limit)1806 static void dump_line(char *data, int offset, int limit)
1807 {
1808 	int i;
1809 	unsigned char error = 0;
1810 	int bad_count = 0;
1811 
1812 	printk(KERN_ERR "%03x:", offset);
1813 	for (i = 0; i < limit; i++) {
1814 		if (data[offset + i] != POISON_FREE) {
1815 			error = data[offset + i];
1816 			bad_count++;
1817 		}
1818 		printk(" %02x", (unsigned char)data[offset + i]);
1819 	}
1820 	printk("\n");
1821 
1822 	if (bad_count == 1) {
1823 		error ^= POISON_FREE;
1824 		if (!(error & (error - 1))) {
1825 			printk(KERN_ERR "Single bit error detected. Probably "
1826 					"bad RAM.\n");
1827 #ifdef CONFIG_X86
1828 			printk(KERN_ERR "Run memtest86+ or a similar memory "
1829 					"test tool.\n");
1830 #else
1831 			printk(KERN_ERR "Run a memory test tool.\n");
1832 #endif
1833 		}
1834 	}
1835 }
1836 #endif
1837 
1838 #if DEBUG
1839 
print_objinfo(struct kmem_cache * cachep,void * objp,int lines)1840 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1841 {
1842 	int i, size;
1843 	char *realobj;
1844 
1845 	if (cachep->flags & SLAB_RED_ZONE) {
1846 		printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1847 			*dbg_redzone1(cachep, objp),
1848 			*dbg_redzone2(cachep, objp));
1849 	}
1850 
1851 	if (cachep->flags & SLAB_STORE_USER) {
1852 		printk(KERN_ERR "Last user: [<%p>]",
1853 			*dbg_userword(cachep, objp));
1854 		print_symbol("(%s)",
1855 				(unsigned long)*dbg_userword(cachep, objp));
1856 		printk("\n");
1857 	}
1858 	realobj = (char *)objp + obj_offset(cachep);
1859 	size = obj_size(cachep);
1860 	for (i = 0; i < size && lines; i += 16, lines--) {
1861 		int limit;
1862 		limit = 16;
1863 		if (i + limit > size)
1864 			limit = size - i;
1865 		dump_line(realobj, i, limit);
1866 	}
1867 }
1868 
check_poison_obj(struct kmem_cache * cachep,void * objp)1869 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1870 {
1871 	char *realobj;
1872 	int size, i;
1873 	int lines = 0;
1874 
1875 	realobj = (char *)objp + obj_offset(cachep);
1876 	size = obj_size(cachep);
1877 
1878 	for (i = 0; i < size; i++) {
1879 		char exp = POISON_FREE;
1880 		if (i == size - 1)
1881 			exp = POISON_END;
1882 		if (realobj[i] != exp) {
1883 			int limit;
1884 			/* Mismatch ! */
1885 			/* Print header */
1886 			if (lines == 0) {
1887 				printk(KERN_ERR
1888 					"Slab corruption: %s start=%p, len=%d\n",
1889 					cachep->name, realobj, size);
1890 				print_objinfo(cachep, objp, 0);
1891 			}
1892 			/* Hexdump the affected line */
1893 			i = (i / 16) * 16;
1894 			limit = 16;
1895 			if (i + limit > size)
1896 				limit = size - i;
1897 			dump_line(realobj, i, limit);
1898 			i += 16;
1899 			lines++;
1900 			/* Limit to 5 lines */
1901 			if (lines > 5)
1902 				break;
1903 		}
1904 	}
1905 	if (lines != 0) {
1906 		/* Print some data about the neighboring objects, if they
1907 		 * exist:
1908 		 */
1909 		struct slab *slabp = virt_to_slab(objp);
1910 		unsigned int objnr;
1911 
1912 		objnr = obj_to_index(cachep, slabp, objp);
1913 		if (objnr) {
1914 			objp = index_to_obj(cachep, slabp, objnr - 1);
1915 			realobj = (char *)objp + obj_offset(cachep);
1916 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1917 			       realobj, size);
1918 			print_objinfo(cachep, objp, 2);
1919 		}
1920 		if (objnr + 1 < cachep->num) {
1921 			objp = index_to_obj(cachep, slabp, objnr + 1);
1922 			realobj = (char *)objp + obj_offset(cachep);
1923 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1924 			       realobj, size);
1925 			print_objinfo(cachep, objp, 2);
1926 		}
1927 	}
1928 }
1929 #endif
1930 
#if DEBUG
/*
 * slab_destroy_debugcheck - debug checks run over every object of a slab
 * that is about to be destroyed.
 * @cachep: cache the slab belongs to
 * @slabp: slab whose objects are inspected
 *
 * For SLAB_POISON caches each object is passed to check_poison_obj();
 * with CONFIG_DEBUG_PAGEALLOC, off-slab caches whose buffer_size is a
 * multiple of PAGE_SIZE get their pages mapped again through
 * kernel_map_pages() instead.  For SLAB_RED_ZONE caches both red zone
 * words must still read RED_INACTIVE, otherwise slab_error() is called.
 */
static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
	int idx;

	for (idx = 0; idx < cachep->num; idx++) {
		void *obj = index_to_obj(cachep, slabp, idx);

		if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			if (!OFF_SLAB(cachep) ||
			    cachep->buffer_size % PAGE_SIZE != 0)
				check_poison_obj(cachep, obj);
			else
				kernel_map_pages(virt_to_page(obj),
					cachep->buffer_size / PAGE_SIZE, 1);
#else
			check_poison_obj(cachep, obj);
#endif
		}

		if (!(cachep->flags & SLAB_RED_ZONE))
			continue;

		if (*dbg_redzone1(cachep, obj) != RED_INACTIVE)
			slab_error(cachep, "start of a freed object "
				   "was overwritten");
		if (*dbg_redzone2(cachep, obj) != RED_INACTIVE)
			slab_error(cachep, "end of a freed object "
				   "was overwritten");
	}
}
#else
/* Without DEBUG there is nothing to check. */
static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
}
#endif
1965 
1966 /**
1967  * slab_destroy - destroy and release all objects in a slab
1968  * @cachep: cache pointer being destroyed
1969  * @slabp: slab pointer being destroyed
1970  *
1971  * Destroy all the objs in a slab, and release the mem back to the system.
1972  * Before calling the slab must have been unlinked from the cache.  The
1973  * cache-lock is not held/needed.
1974  */
slab_destroy(struct kmem_cache * cachep,struct slab * slabp)1975 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1976 {
1977 	void *addr = slabp->s_mem - slabp->colouroff;
1978 
1979 	slab_destroy_debugcheck(cachep, slabp);
1980 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1981 		struct slab_rcu *slab_rcu;
1982 
1983 		slab_rcu = (struct slab_rcu *)slabp;
1984 		slab_rcu->cachep = cachep;
1985 		slab_rcu->addr = addr;
1986 		call_rcu(&slab_rcu->head, kmem_rcu_free);
1987 	} else {
1988 		kmem_freepages(cachep, addr);
1989 		if (OFF_SLAB(cachep))
1990 			kmem_cache_free(cachep->slabp_cache, slabp);
1991 	}
1992 }
1993 
__kmem_cache_destroy(struct kmem_cache * cachep)1994 static void __kmem_cache_destroy(struct kmem_cache *cachep)
1995 {
1996 	int i;
1997 	struct kmem_list3 *l3;
1998 
1999 	for_each_online_cpu(i)
2000 	    kfree(cachep->array[i]);
2001 
2002 	/* NUMA: free the list3 structures */
2003 	for_each_online_node(i) {
2004 		l3 = cachep->nodelists[i];
2005 		if (l3) {
2006 			kfree(l3->shared);
2007 			free_alien_cache(l3->alien);
2008 			kfree(l3);
2009 		}
2010 	}
2011 	kmem_cache_free(&cache_cache, cachep);
2012 }
2013 
2014 
2015 /**
2016  * calculate_slab_order - calculate size (page order) of slabs
2017  * @cachep: pointer to the cache that is being created
2018  * @size: size of objects to be created in this cache.
2019  * @align: required alignment for the objects.
2020  * @flags: slab allocation flags
2021  *
2022  * Also calculates the number of objects per slab.
2023  *
2024  * This could be made much more intelligent.  For now, try to avoid using
2025  * high order pages for slabs.  When the gfp() functions are more friendly
2026  * towards high-order requests, this should be changed.
2027  */
calculate_slab_order(struct kmem_cache * cachep,size_t size,size_t align,unsigned long flags)2028 static size_t calculate_slab_order(struct kmem_cache *cachep,
2029 			size_t size, size_t align, unsigned long flags)
2030 {
2031 	unsigned long offslab_limit;
2032 	size_t left_over = 0;
2033 	int gfporder;
2034 
2035 	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
2036 		unsigned int num;
2037 		size_t remainder;
2038 
2039 		cache_estimate(gfporder, size, align, flags, &remainder, &num);
2040 		if (!num)
2041 			continue;
2042 
2043 		if (flags & CFLGS_OFF_SLAB) {
2044 			/*
2045 			 * Max number of objs-per-slab for caches which
2046 			 * use off-slab slabs. Needed to avoid a possible
2047 			 * looping condition in cache_grow().
2048 			 */
2049 			offslab_limit = size - sizeof(struct slab);
2050 			offslab_limit /= sizeof(kmem_bufctl_t);
2051 
2052  			if (num > offslab_limit)
2053 				break;
2054 		}
2055 
2056 		/* Found something acceptable - save it away */
2057 		cachep->num = num;
2058 		cachep->gfporder = gfporder;
2059 		left_over = remainder;
2060 
2061 		/*
2062 		 * A VFS-reclaimable slab tends to have most allocations
2063 		 * as GFP_NOFS and we really don't want to have to be allocating
2064 		 * higher-order pages when we are unable to shrink dcache.
2065 		 */
2066 		if (flags & SLAB_RECLAIM_ACCOUNT)
2067 			break;
2068 
2069 		/*
2070 		 * Large number of objects is good, but very large slabs are
2071 		 * currently bad for the gfp()s.
2072 		 */
2073 		if (gfporder >= slab_break_gfp_order)
2074 			break;
2075 
2076 		/*
2077 		 * Acceptable internal fragmentation?
2078 		 */
2079 		if (left_over * 8 <= (PAGE_SIZE << gfporder))
2080 			break;
2081 	}
2082 	return left_over;
2083 }
2084 
setup_cpu_cache(struct kmem_cache * cachep,gfp_t gfp)2085 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2086 {
2087 	if (g_cpucache_up == FULL)
2088 		return enable_cpucache(cachep, gfp);
2089 
2090 	if (g_cpucache_up == NONE) {
2091 		/*
2092 		 * Note: the first kmem_cache_create must create the cache
2093 		 * that's used by kmalloc(24), otherwise the creation of
2094 		 * further caches will BUG().
2095 		 */
2096 		cachep->array[smp_processor_id()] = &initarray_generic.cache;
2097 
2098 		/*
2099 		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2100 		 * the first cache, then we need to set up all its list3s,
2101 		 * otherwise the creation of further caches will BUG().
2102 		 */
2103 		set_up_list3s(cachep, SIZE_AC);
2104 		if (INDEX_AC == INDEX_L3)
2105 			g_cpucache_up = PARTIAL_L3;
2106 		else
2107 			g_cpucache_up = PARTIAL_AC;
2108 	} else {
2109 		cachep->array[smp_processor_id()] =
2110 			kmalloc(sizeof(struct arraycache_init), gfp);
2111 
2112 		if (g_cpucache_up == PARTIAL_AC) {
2113 			set_up_list3s(cachep, SIZE_L3);
2114 			g_cpucache_up = PARTIAL_L3;
2115 		} else {
2116 			int node;
2117 			for_each_online_node(node) {
2118 				cachep->nodelists[node] =
2119 				    kmalloc_node(sizeof(struct kmem_list3),
2120 						gfp, node);
2121 				BUG_ON(!cachep->nodelists[node]);
2122 				kmem_list3_init(cachep->nodelists[node]);
2123 			}
2124 		}
2125 	}
2126 	cachep->nodelists[numa_mem_id()]->next_reap =
2127 			jiffies + REAPTIMEOUT_LIST3 +
2128 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2129 
2130 	cpu_cache_get(cachep)->avail = 0;
2131 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2132 	cpu_cache_get(cachep)->batchcount = 1;
2133 	cpu_cache_get(cachep)->touched = 0;
2134 	cachep->batchcount = 1;
2135 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
2136 	return 0;
2137 }
2138 
2139 /**
2140  * kmem_cache_create - Create a cache.
2141  * @name: A string which is used in /proc/slabinfo to identify this cache.
2142  * @size: The size of objects to be created in this cache.
2143  * @align: The required alignment for the objects.
2144  * @flags: SLAB flags
2145  * @ctor: A constructor for the objects.
2146  *
2147  * Returns a ptr to the cache on success, NULL on failure.
2148  * Cannot be called within an interrupt, but can be interrupted.
2149  * The @ctor is run when new pages are allocated by the cache.
2150  *
2151  * @name must be valid until the cache is destroyed. This implies that
2152  * the module calling this has to destroy the cache before getting unloaded.
 *
 * If creation fails and %SLAB_PANIC is set in @flags, panic() is called
 * instead of returning NULL.  A NULL @name, a call from interrupt context,
 * or a @size below BYTES_PER_WORD or above KMALLOC_MAX_SIZE hits BUG().
 * Creating a cache whose name is already in use fails with NULL.
2153  *
2154  * The flags are
2155  *
2156  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2157  * to catch references to uninitialised memory.
2158  *
2159  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2160  * for buffer overruns.
2161  *
2162  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2163  * cacheline.  This can be beneficial if you're counting cycles as closely
2164  * as davem.
2165  */
2166 struct kmem_cache *
kmem_cache_create(const char * name,size_t size,size_t align,unsigned long flags,void (* ctor)(void *))2167 kmem_cache_create (const char *name, size_t size, size_t align,
2168 	unsigned long flags, void (*ctor)(void *))
2169 {
2170 	size_t left_over, slab_size, ralign;
2171 	struct kmem_cache *cachep = NULL, *pc;
2172 	gfp_t gfp;
2173 
2174 	/*
2175 	 * Sanity checks... these are all serious usage bugs.
2176 	 */
2177 	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2178 	    size > KMALLOC_MAX_SIZE) {
2179 		printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2180 				name);
2181 		BUG();
2182 	}
2183 
2184 	/*
2185 	 * We use cache_chain_mutex to ensure a consistent view of
2186 	 * cpu_online_mask as well.  Please see cpuup_callback
2187 	 */
2188 	if (slab_is_available()) {
2189 		get_online_cpus();
2190 		mutex_lock(&cache_chain_mutex);
2191 	}
2192 
	/* Reject a name that is already used by a cache on cache_chain. */
2193 	list_for_each_entry(pc, &cache_chain, next) {
2194 		char tmp;
2195 		int res;
2196 
2197 		/*
2198 		 * This happens when the module gets unloaded and doesn't
2199 		 * destroy its slab cache and no-one else reuses the vmalloc
2200 		 * area of the module.  Print a warning.
2201 		 */
2202 		res = probe_kernel_address(pc->name, tmp);
2203 		if (res) {
2204 			printk(KERN_ERR
2205 			       "SLAB: cache with size %d has lost its name\n",
2206 			       pc->buffer_size);
2207 			continue;
2208 		}
2209 
2210 		if (!strcmp(pc->name, name)) {
2211 			printk(KERN_ERR
2212 			       "kmem_cache_create: duplicate cache %s\n", name);
2213 			dump_stack();
2214 			goto oops;
2215 		}
2216 	}
2217 
2218 #if DEBUG
2219 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
2220 #if FORCED_DEBUG
2221 	/*
2222 	 * Enable redzoning and last user accounting, except for caches with
2223 	 * large objects, if the increased size would increase the object size
2224 	 * above the next power of two: caches with object sizes just above a
2225 	 * power of two have a significant amount of internal fragmentation.
2226 	 */
2227 	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2228 						2 * sizeof(unsigned long long)))
2229 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2230 	if (!(flags & SLAB_DESTROY_BY_RCU))
2231 		flags |= SLAB_POISON;
2232 #endif
2233 	if (flags & SLAB_DESTROY_BY_RCU)
2234 		BUG_ON(flags & SLAB_POISON);
2235 #endif
2236 	/*
2237 	 * Always check flags, a caller might be expecting debug support which
2238 	 * isn't available.
2239 	 */
2240 	BUG_ON(flags & ~CREATE_MASK);
2241 
2242 	/*
2243 	 * Check that size is in terms of words.  This is needed to avoid
2244 	 * unaligned accesses for some archs when redzoning is used, and makes
2245 	 * sure any on-slab bufctl's are also correctly aligned.
2246 	 */
2247 	if (size & (BYTES_PER_WORD - 1)) {
2248 		size += (BYTES_PER_WORD - 1);
2249 		size &= ~(BYTES_PER_WORD - 1);
2250 	}
2251 
2252 	/* calculate the final buffer alignment: */
2253 
2254 	/* 1) arch recommendation: can be overridden for debug */
2255 	if (flags & SLAB_HWCACHE_ALIGN) {
2256 		/*
2257 		 * Default alignment: as specified by the arch code.  Except if
2258 		 * an object is really small, then squeeze multiple objects into
2259 		 * one cacheline.
2260 		 */
2261 		ralign = cache_line_size();
2262 		while (size <= ralign / 2)
2263 			ralign /= 2;
2264 	} else {
2265 		ralign = BYTES_PER_WORD;
2266 	}
2267 
2268 	/*
2269 	 * Redzoning and user store require word alignment or possibly larger.
2270 	 * Note this will be overridden by architecture or caller mandated
2271 	 * alignment if either is greater than BYTES_PER_WORD.
2272 	 */
2273 	if (flags & SLAB_STORE_USER)
2274 		ralign = BYTES_PER_WORD;
2275 
2276 	if (flags & SLAB_RED_ZONE) {
2277 		ralign = REDZONE_ALIGN;
2278 		/* If redzoning, ensure that the second redzone is suitably
2279 		 * aligned, by adjusting the object size accordingly. */
2280 		size += REDZONE_ALIGN - 1;
2281 		size &= ~(REDZONE_ALIGN - 1);
2282 	}
2283 
2284 	/* 2) arch mandated alignment */
2285 	if (ralign < ARCH_SLAB_MINALIGN) {
2286 		ralign = ARCH_SLAB_MINALIGN;
2287 	}
2288 	/* 3) caller mandated alignment */
2289 	if (ralign < align) {
2290 		ralign = align;
2291 	}
2292 	/* disable debug if necessary */
2293 	if (ralign > __alignof__(unsigned long long))
2294 		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2295 	/*
2296 	 * 4) Store it.
2297 	 */
2298 	align = ralign;
2299 
	/* GFP_KERNEL once slab_is_available(), GFP_NOWAIT before that. */
2300 	if (slab_is_available())
2301 		gfp = GFP_KERNEL;
2302 	else
2303 		gfp = GFP_NOWAIT;
2304 
2305 	/* Get cache's description obj. */
2306 	cachep = kmem_cache_zalloc(&cache_cache, gfp);
2307 	if (!cachep)
2308 		goto oops;
2309 
2310 #if DEBUG
2311 	cachep->obj_size = size;
2312 
2313 	/*
2314 	 * Both debugging options require word-alignment which is calculated
2315 	 * into align above.
2316 	 */
2317 	if (flags & SLAB_RED_ZONE) {
2318 		/* add space for red zone words */
2319 		cachep->obj_offset += sizeof(unsigned long long);
2320 		size += 2 * sizeof(unsigned long long);
2321 	}
2322 	if (flags & SLAB_STORE_USER) {
2323 		/* user store requires one word storage behind the end of
2324 		 * the real object. But if the second red zone needs to be
2325 		 * aligned to 64 bits, we must allow that much space.
2326 		 */
2327 		if (flags & SLAB_RED_ZONE)
2328 			size += REDZONE_ALIGN;
2329 		else
2330 			size += BYTES_PER_WORD;
2331 	}
2332 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2333 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2334 	    && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2335 		cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2336 		size = PAGE_SIZE;
2337 	}
2338 #endif
2339 #endif
2340 
2341 	/*
2342 	 * Determine if the slab management is 'on' or 'off' slab.
2343 	 * (bootstrapping cannot cope with offslab caches so don't do
2344 	 * it too early on. Always use on-slab management when
2345 	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2346 	 */
2347 	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2348 	    !(flags & SLAB_NOLEAKTRACE))
2349 		/*
2350 		 * Size is large, assume best to place the slab management obj
2351 		 * off-slab (should allow better packing of objs).
2352 		 */
2353 		flags |= CFLGS_OFF_SLAB;
2354 
2355 	size = ALIGN(size, align);
2356 
2357 	left_over = calculate_slab_order(cachep, size, align, flags);
2358 
2359 	if (!cachep->num) {
2360 		printk(KERN_ERR
2361 		       "kmem_cache_create: couldn't create cache %s.\n", name);
2362 		kmem_cache_free(&cache_cache, cachep);
2363 		cachep = NULL;
2364 		goto oops;
2365 	}
2366 	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2367 			  + sizeof(struct slab), align);
2368 
2369 	/*
2370 	 * If the slab has been placed off-slab, and we have enough space then
2371 	 * move it on-slab. This is at the expense of any extra colouring.
2372 	 */
2373 	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2374 		flags &= ~CFLGS_OFF_SLAB;
2375 		left_over -= slab_size;
2376 	}
2377 
2378 	if (flags & CFLGS_OFF_SLAB) {
2379 		/* really off slab. No need for manual alignment */
2380 		slab_size =
2381 		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2382 
2383 #ifdef CONFIG_PAGE_POISONING
2384 		/* If we're going to use the generic kernel_map_pages()
2385 		 * poisoning, then it's going to smash the contents of
2386 		 * the redzone and userword anyhow, so switch them off.
2387 		 */
2388 		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2389 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2390 #endif
2391 	}
2392 
2393 	cachep->colour_off = cache_line_size();
2394 	/* Offset must be a multiple of the alignment. */
2395 	if (cachep->colour_off < align)
2396 		cachep->colour_off = align;
2397 	cachep->colour = left_over / cachep->colour_off;
2398 	cachep->slab_size = slab_size;
2399 	cachep->flags = flags;
2400 	cachep->gfpflags = 0;
2401 	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2402 		cachep->gfpflags |= GFP_DMA;
2403 	cachep->buffer_size = size;
2404 	cachep->reciprocal_buffer_size = reciprocal_value(size);
2405 
2406 	if (flags & CFLGS_OFF_SLAB) {
2407 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2408 		/*
2409 		 * This is a possibility for one of the malloc_sizes caches.
2410 		 * But since we go off slab only for object size greater than
2411 		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2412 		 * this should not happen at all.
2413 		 * But leave a BUG_ON for some lucky dude.
2414 		 */
2415 		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2416 	}
2417 	cachep->ctor = ctor;
2418 	cachep->name = name;
2419 
2420 	if (setup_cpu_cache(cachep, gfp)) {
2421 		__kmem_cache_destroy(cachep);
2422 		cachep = NULL;
2423 		goto oops;
2424 	}
2425 
2426 	/* cache setup completed, link it into the list */
2427 	list_add(&cachep->next, &cache_chain);
2428 oops:
	/* Reached on success and on failure; cachep is NULL on failure. */
2429 	if (!cachep && (flags & SLAB_PANIC))
2430 		panic("kmem_cache_create(): failed to create slab `%s'\n",
2431 		      name);
2432 	if (slab_is_available()) {
2433 		mutex_unlock(&cache_chain_mutex);
2434 		put_online_cpus();
2435 	}
2436 	return cachep;
2437 }
2438 EXPORT_SYMBOL(kmem_cache_create);
2439 
#if DEBUG
/* Debug assertion: BUG unless local interrupts are disabled. */
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}

/* Debug assertion: BUG if local interrupts are disabled. */
static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}

/*
 * Debug assertion (SMP only): interrupts are off and the list_lock of
 * the list3 selected by numa_mem_id() is held.
 */
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
	struct kmem_list3 *l3;

	check_irq_off();
	l3 = cachep->nodelists[numa_mem_id()];
	assert_spin_locked(&l3->list_lock);
#endif
}

/*
 * Debug assertion (SMP only): interrupts are off and the list_lock of
 * the list3 of @node is held.
 */
static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
	struct kmem_list3 *l3;

	check_irq_off();
	l3 = cachep->nodelists[node];
	assert_spin_locked(&l3->list_lock);
#endif
}

#else
/* Without DEBUG the assertions expand to empty statements. */
#define check_irq_off()	do { } while (0)
#define check_irq_on()	do { } while (0)
#define check_spinlock_acquired(x) do { } while (0)
#define check_spinlock_acquired_node(x, y) do { } while (0)
#endif
2473 
2474 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2475 			struct array_cache *ac,
2476 			int force, int node);
2477 
do_drain(void * arg)2478 static void do_drain(void *arg)
2479 {
2480 	struct kmem_cache *cachep = arg;
2481 	struct array_cache *ac;
2482 	int node = numa_mem_id();
2483 
2484 	check_irq_off();
2485 	ac = cpu_cache_get(cachep);
2486 	spin_lock(&cachep->nodelists[node]->list_lock);
2487 	free_block(cachep, ac->entry, ac->avail, node);
2488 	spin_unlock(&cachep->nodelists[node]->list_lock);
2489 	ac->avail = 0;
2490 }
2491 
drain_cpu_caches(struct kmem_cache * cachep)2492 static void drain_cpu_caches(struct kmem_cache *cachep)
2493 {
2494 	struct kmem_list3 *l3;
2495 	int node;
2496 
2497 	on_each_cpu(do_drain, cachep, 1);
2498 	check_irq_on();
2499 	for_each_online_node(node) {
2500 		l3 = cachep->nodelists[node];
2501 		if (l3 && l3->alien)
2502 			drain_alien_cache(cachep, l3->alien);
2503 	}
2504 
2505 	for_each_online_node(node) {
2506 		l3 = cachep->nodelists[node];
2507 		if (l3)
2508 			drain_array(cachep, l3, l3->shared, 1, node);
2509 	}
2510 }
2511 
2512 /*
2513  * Remove slabs from the list of free slabs.
2514  * Specify the number of slabs to drain in tofree.
2515  *
2516  * Returns the actual number of slabs released.
2517  */
drain_freelist(struct kmem_cache * cache,struct kmem_list3 * l3,int tofree)2518 static int drain_freelist(struct kmem_cache *cache,
2519 			struct kmem_list3 *l3, int tofree)
2520 {
2521 	struct list_head *p;
2522 	int nr_freed;
2523 	struct slab *slabp;
2524 
2525 	nr_freed = 0;
2526 	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2527 
2528 		spin_lock_irq(&l3->list_lock);
2529 		p = l3->slabs_free.prev;
2530 		if (p == &l3->slabs_free) {
2531 			spin_unlock_irq(&l3->list_lock);
2532 			goto out;
2533 		}
2534 
2535 		slabp = list_entry(p, struct slab, list);
2536 #if DEBUG
2537 		BUG_ON(slabp->inuse);
2538 #endif
2539 		list_del(&slabp->list);
2540 		/*
2541 		 * Safe to drop the lock. The slab is no longer linked
2542 		 * to the cache.
2543 		 */
2544 		l3->free_objects -= cache->num;
2545 		spin_unlock_irq(&l3->list_lock);
2546 		slab_destroy(cache, slabp);
2547 		nr_freed++;
2548 	}
2549 out:
2550 	return nr_freed;
2551 }
2552 
2553 /* Called with cache_chain_mutex held to protect against cpu hotplug */
__cache_shrink(struct kmem_cache * cachep)2554 static int __cache_shrink(struct kmem_cache *cachep)
2555 {
2556 	int ret = 0, i = 0;
2557 	struct kmem_list3 *l3;
2558 
2559 	drain_cpu_caches(cachep);
2560 
2561 	check_irq_on();
2562 	for_each_online_node(i) {
2563 		l3 = cachep->nodelists[i];
2564 		if (!l3)
2565 			continue;
2566 
2567 		drain_freelist(cachep, l3, l3->free_objects);
2568 
2569 		ret += !list_empty(&l3->slabs_full) ||
2570 			!list_empty(&l3->slabs_partial);
2571 	}
2572 	return (ret ? 1 : 0);
2573 }
2574 
2575 /**
2576  * kmem_cache_shrink - Shrink a cache.
2577  * @cachep: The cache to shrink.
2578  *
2579  * Releases as many slabs as possible for a cache.
2580  * To help debugging, a zero exit status indicates all slabs were released.
2581  */
kmem_cache_shrink(struct kmem_cache * cachep)2582 int kmem_cache_shrink(struct kmem_cache *cachep)
2583 {
2584 	int ret;
2585 	BUG_ON(!cachep || in_interrupt());
2586 
2587 	get_online_cpus();
2588 	mutex_lock(&cache_chain_mutex);
2589 	ret = __cache_shrink(cachep);
2590 	mutex_unlock(&cache_chain_mutex);
2591 	put_online_cpus();
2592 	return ret;
2593 }
2594 EXPORT_SYMBOL(kmem_cache_shrink);
2595 
2596 /**
2597  * kmem_cache_destroy - delete a cache
2598  * @cachep: the cache to destroy
2599  *
2600  * Remove a &struct kmem_cache object from the slab cache.
2601  *
2602  * It is expected this function will be called by a module when it is
2603  * unloaded.  This will remove the cache completely, and avoid a duplicate
2604  * cache being allocated each time a module is loaded and unloaded, if the
2605  * module doesn't have persistent in-kernel storage across loads and unloads.
2606  *
2607  * The cache must be empty before calling this function.
2608  *
2609  * The caller must guarantee that no one will allocate memory from the cache
2610  * during the kmem_cache_destroy().
2611  */
kmem_cache_destroy(struct kmem_cache * cachep)2612 void kmem_cache_destroy(struct kmem_cache *cachep)
2613 {
2614 	BUG_ON(!cachep || in_interrupt());
2615 
2616 	/* Find the cache in the chain of caches. */
2617 	get_online_cpus();
2618 	mutex_lock(&cache_chain_mutex);
2619 	/*
2620 	 * the chain is never empty, cache_cache is never destroyed
2621 	 */
2622 	list_del(&cachep->next);
2623 	if (__cache_shrink(cachep)) {
2624 		slab_error(cachep, "Can't free all objects");
2625 		list_add(&cachep->next, &cache_chain);
2626 		mutex_unlock(&cache_chain_mutex);
2627 		put_online_cpus();
2628 		return;
2629 	}
2630 
2631 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2632 		rcu_barrier();
2633 
2634 	__kmem_cache_destroy(cachep);
2635 	mutex_unlock(&cache_chain_mutex);
2636 	put_online_cpus();
2637 }
2638 EXPORT_SYMBOL(kmem_cache_destroy);
2639 
2640 /*
2641  * Get the memory for a slab management obj.
2642  * For a slab cache when the slab descriptor is off-slab, slab descriptors
2643  * always come from malloc_sizes caches.  The slab descriptor cannot
2644  * come from the same cache which is getting created because,
2645  * when we are searching for an appropriate cache for these
2646  * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2647  * If we are creating a malloc_sizes cache here it would not be visible to
2648  * kmem_find_general_cachep till the initialization is complete.
2649  * Hence we cannot have slabp_cache same as the original cache.
2650  */
alloc_slabmgmt(struct kmem_cache * cachep,void * objp,int colour_off,gfp_t local_flags,int nodeid)2651 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2652 				   int colour_off, gfp_t local_flags,
2653 				   int nodeid)
2654 {
2655 	struct slab *slabp;
2656 
2657 	if (OFF_SLAB(cachep)) {
2658 		/* Slab management obj is off-slab. */
2659 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2660 					      local_flags, nodeid);
2661 		/*
2662 		 * If the first object in the slab is leaked (it's allocated
2663 		 * but no one has a reference to it), we want to make sure
2664 		 * kmemleak does not treat the ->s_mem pointer as a reference
2665 		 * to the object. Otherwise we will not report the leak.
2666 		 */
2667 		kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2668 				   local_flags);
2669 		if (!slabp)
2670 			return NULL;
2671 	} else {
2672 		slabp = objp + colour_off;
2673 		colour_off += cachep->slab_size;
2674 	}
2675 	slabp->inuse = 0;
2676 	slabp->colouroff = colour_off;
2677 	slabp->s_mem = objp + colour_off;
2678 	slabp->nodeid = nodeid;
2679 	slabp->free = 0;
2680 	return slabp;
2681 }
2682 
slab_bufctl(struct slab * slabp)2683 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2684 {
2685 	return (kmem_bufctl_t *) (slabp + 1);
2686 }
2687 
cache_init_objs(struct kmem_cache * cachep,struct slab * slabp)2688 static void cache_init_objs(struct kmem_cache *cachep,
2689 			    struct slab *slabp)
2690 {
2691 	int i;
2692 
2693 	for (i = 0; i < cachep->num; i++) {
2694 		void *objp = index_to_obj(cachep, slabp, i);
2695 #if DEBUG
2696 		/* need to poison the objs? */
2697 		if (cachep->flags & SLAB_POISON)
2698 			poison_obj(cachep, objp, POISON_FREE);
2699 		if (cachep->flags & SLAB_STORE_USER)
2700 			*dbg_userword(cachep, objp) = NULL;
2701 
2702 		if (cachep->flags & SLAB_RED_ZONE) {
2703 			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2704 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2705 		}
2706 		/*
2707 		 * Constructors are not allowed to allocate memory from the same
2708 		 * cache which they are a constructor for.  Otherwise, deadlock.
2709 		 * They must also be threaded.
2710 		 */
2711 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2712 			cachep->ctor(objp + obj_offset(cachep));
2713 
2714 		if (cachep->flags & SLAB_RED_ZONE) {
2715 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2716 				slab_error(cachep, "constructor overwrote the"
2717 					   " end of an object");
2718 			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2719 				slab_error(cachep, "constructor overwrote the"
2720 					   " start of an object");
2721 		}
2722 		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2723 			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2724 			kernel_map_pages(virt_to_page(objp),
2725 					 cachep->buffer_size / PAGE_SIZE, 0);
2726 #else
2727 		if (cachep->ctor)
2728 			cachep->ctor(objp);
2729 #endif
2730 		slab_bufctl(slabp)[i] = i + 1;
2731 	}
2732 	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2733 }
2734 
/*
 * kmem_flagcheck - check that the GFP_DMA bit of an allocation request
 * matches the cache.
 * @cachep: cache that is allocated from
 * @flags: gfp flags of the request
 *
 * Only active when CONFIG_ZONE_DMA_FLAG is non-zero.  BUGs if GFP_DMA is
 * set in @flags but not in cachep->gfpflags, or the other way round.
 */
static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
{
	if (!CONFIG_ZONE_DMA_FLAG)
		return;

	if (flags & GFP_DMA)
		BUG_ON(!(cachep->gfpflags & GFP_DMA));
	else
		BUG_ON(cachep->gfpflags & GFP_DMA);
}
2744 
slab_get_obj(struct kmem_cache * cachep,struct slab * slabp,int nodeid)2745 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2746 				int nodeid)
2747 {
2748 	void *objp = index_to_obj(cachep, slabp, slabp->free);
2749 	kmem_bufctl_t next;
2750 
2751 	slabp->inuse++;
2752 	next = slab_bufctl(slabp)[slabp->free];
2753 #if DEBUG
2754 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2755 	WARN_ON(slabp->nodeid != nodeid);
2756 #endif
2757 	slabp->free = next;
2758 
2759 	return objp;
2760 }
2761 
slab_put_obj(struct kmem_cache * cachep,struct slab * slabp,void * objp,int nodeid)2762 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2763 				void *objp, int nodeid)
2764 {
2765 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
2766 
2767 #if DEBUG
2768 	/* Verify that the slab belongs to the intended node */
2769 	WARN_ON(slabp->nodeid != nodeid);
2770 
2771 	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2772 		printk(KERN_ERR "slab: double free detected in cache "
2773 				"'%s', objp %p\n", cachep->name, objp);
2774 		BUG();
2775 	}
2776 #endif
2777 	slab_bufctl(slabp)[objnr] = slabp->free;
2778 	slabp->free = objnr;
2779 	slabp->inuse--;
2780 }
2781 
2782 /*
2783  * Map pages beginning at addr to the given cache and slab. This is required
2784  * for the slab allocator to be able to lookup the cache and slab of a
2785  * virtual address for kfree, ksize, and slab debugging.
2786  */
slab_map_pages(struct kmem_cache * cache,struct slab * slab,void * addr)2787 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2788 			   void *addr)
2789 {
2790 	int nr_pages;
2791 	struct page *page;
2792 
2793 	page = virt_to_page(addr);
2794 
2795 	nr_pages = 1;
2796 	if (likely(!PageCompound(page)))
2797 		nr_pages <<= cache->gfporder;
2798 
2799 	do {
2800 		page_set_cache(page, cache);
2801 		page_set_slab(page, slab);
2802 		page++;
2803 	} while (--nr_pages);
2804 }
2805 
2806 /*
2807  * Grow (by 1) the number of slabs within a cache.  This is called by
2808  * kmem_cache_alloc() when there are no active objs left in a cache.
2809  */
cache_grow(struct kmem_cache * cachep,gfp_t flags,int nodeid,void * objp)2810 static int cache_grow(struct kmem_cache *cachep,
2811 		gfp_t flags, int nodeid, void *objp)
2812 {
2813 	struct slab *slabp;
2814 	size_t offset;
2815 	gfp_t local_flags;
2816 	struct kmem_list3 *l3;
2817 
2818 	/*
2819 	 * Be lazy and only check for valid flags here,  keeping it out of the
2820 	 * critical path in kmem_cache_alloc().
2821 	 */
2822 	BUG_ON(flags & GFP_SLAB_BUG_MASK);
2823 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2824 
2825 	/* Take the l3 list lock to change the colour_next on this node */
2826 	check_irq_off();
2827 	l3 = cachep->nodelists[nodeid];
2828 	spin_lock(&l3->list_lock);
2829 
2830 	/* Get colour for the slab, and cal the next value. */
2831 	offset = l3->colour_next;
2832 	l3->colour_next++;
2833 	if (l3->colour_next >= cachep->colour)
2834 		l3->colour_next = 0;
2835 	spin_unlock(&l3->list_lock);
2836 
2837 	offset *= cachep->colour_off;
2838 
2839 	if (local_flags & __GFP_WAIT)
2840 		local_irq_enable();
2841 
2842 	/*
2843 	 * The test for missing atomic flag is performed here, rather than
2844 	 * the more obvious place, simply to reduce the critical path length
2845 	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2846 	 * will eventually be caught here (where it matters).
2847 	 */
2848 	kmem_flagcheck(cachep, flags);
2849 
2850 	/*
2851 	 * Get mem for the objs.  Attempt to allocate a physical page from
2852 	 * 'nodeid'.
2853 	 */
2854 	if (!objp)
2855 		objp = kmem_getpages(cachep, local_flags, nodeid);
2856 	if (!objp)
2857 		goto failed;
2858 
2859 	/* Get slab management. */
2860 	slabp = alloc_slabmgmt(cachep, objp, offset,
2861 			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2862 	if (!slabp)
2863 		goto opps1;
2864 
2865 	slab_map_pages(cachep, slabp, objp);
2866 
2867 	cache_init_objs(cachep, slabp);
2868 
2869 	if (local_flags & __GFP_WAIT)
2870 		local_irq_disable();
2871 	check_irq_off();
2872 	spin_lock(&l3->list_lock);
2873 
2874 	/* Make slab active. */
2875 	list_add_tail(&slabp->list, &(l3->slabs_free));
2876 	STATS_INC_GROWN(cachep);
2877 	l3->free_objects += cachep->num;
2878 	spin_unlock(&l3->list_lock);
2879 	return 1;
2880 opps1:
2881 	kmem_freepages(cachep, objp);
2882 failed:
2883 	if (local_flags & __GFP_WAIT)
2884 		local_irq_disable();
2885 	return 0;
2886 }
2887 
2888 #if DEBUG
2889 
2890 /*
2891  * Perform extra freeing checks:
2892  * - detect bad pointers.
2893  * - POISON/RED_ZONE checking
2894  */
kfree_debugcheck(const void * objp)2895 static void kfree_debugcheck(const void *objp)
2896 {
2897 	if (!virt_addr_valid(objp)) {
2898 		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2899 		       (unsigned long)objp);
2900 		BUG();
2901 	}
2902 }
2903 
verify_redzone_free(struct kmem_cache * cache,void * obj)2904 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2905 {
2906 	unsigned long long redzone1, redzone2;
2907 
2908 	redzone1 = *dbg_redzone1(cache, obj);
2909 	redzone2 = *dbg_redzone2(cache, obj);
2910 
2911 	/*
2912 	 * Redzone is ok.
2913 	 */
2914 	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2915 		return;
2916 
2917 	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2918 		slab_error(cache, "double free detected");
2919 	else
2920 		slab_error(cache, "memory outside object was overwritten");
2921 
2922 	printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2923 			obj, redzone1, redzone2);
2924 }
2925 
cache_free_debugcheck(struct kmem_cache * cachep,void * objp,void * caller)2926 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2927 				   void *caller)
2928 {
2929 	struct page *page;
2930 	unsigned int objnr;
2931 	struct slab *slabp;
2932 
2933 	BUG_ON(virt_to_cache(objp) != cachep);
2934 
2935 	objp -= obj_offset(cachep);
2936 	kfree_debugcheck(objp);
2937 	page = virt_to_head_page(objp);
2938 
2939 	slabp = page_get_slab(page);
2940 
2941 	if (cachep->flags & SLAB_RED_ZONE) {
2942 		verify_redzone_free(cachep, objp);
2943 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
2944 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
2945 	}
2946 	if (cachep->flags & SLAB_STORE_USER)
2947 		*dbg_userword(cachep, objp) = caller;
2948 
2949 	objnr = obj_to_index(cachep, slabp, objp);
2950 
2951 	BUG_ON(objnr >= cachep->num);
2952 	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2953 
2954 #ifdef CONFIG_DEBUG_SLAB_LEAK
2955 	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2956 #endif
2957 	if (cachep->flags & SLAB_POISON) {
2958 #ifdef CONFIG_DEBUG_PAGEALLOC
2959 		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2960 			store_stackinfo(cachep, objp, (unsigned long)caller);
2961 			kernel_map_pages(virt_to_page(objp),
2962 					 cachep->buffer_size / PAGE_SIZE, 0);
2963 		} else {
2964 			poison_obj(cachep, objp, POISON_FREE);
2965 		}
2966 #else
2967 		poison_obj(cachep, objp, POISON_FREE);
2968 #endif
2969 	}
2970 	return objp;
2971 }
2972 
check_slabp(struct kmem_cache * cachep,struct slab * slabp)2973 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2974 {
2975 	kmem_bufctl_t i;
2976 	int entries = 0;
2977 
2978 	/* Check slab's freelist to see if this obj is there. */
2979 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2980 		entries++;
2981 		if (entries > cachep->num || i >= cachep->num)
2982 			goto bad;
2983 	}
2984 	if (entries != cachep->num - slabp->inuse) {
2985 bad:
2986 		printk(KERN_ERR "slab: Internal list corruption detected in "
2987 				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2988 			cachep->name, cachep->num, slabp, slabp->inuse);
2989 		for (i = 0;
2990 		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2991 		     i++) {
2992 			if (i % 16 == 0)
2993 				printk("\n%03x:", i);
2994 			printk(" %02x", ((unsigned char *)slabp)[i]);
2995 		}
2996 		printk("\n");
2997 		BUG();
2998 	}
2999 }
3000 #else
/*
 * DEBUG is off: the free-side checks expand to nothing, and
 * cache_free_debugcheck() simply yields the object pointer it was given.
 */
#define kfree_debugcheck(x)			do { } while (0)
#define cache_free_debugcheck(x, objp, z)	(objp)
#define check_slabp(x, y)			do { } while (0)
3004 #endif
3005 
/*
 * Refill this CPU's (empty) array cache and hand out one object.
 *
 * Objects are taken, under the node's list_lock, first from the node's
 * shared array and otherwise from the partial and then the free slabs.
 * If nothing could be obtained the cache is grown on the local node and
 * the refill is retried.  Returns NULL when growing failed and the array
 * cache is still empty.
 */
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
	struct array_cache *ac;
	struct kmem_list3 *l3;
	int batchcount;
	int node;

retry:
	check_irq_off();
	node = numa_mem_id();
	ac = cpu_cache_get(cachep);

	/*
	 * With little recent activity on this array cache only a partial
	 * refill is done, to avoid bouncing objects back and forth.
	 */
	batchcount = ac->batchcount;
	if (batchcount > BATCHREFILL_LIMIT && !ac->touched)
		batchcount = BATCHREFILL_LIMIT;

	l3 = cachep->nodelists[node];

	BUG_ON(ac->avail > 0 || !l3);
	spin_lock(&l3->list_lock);

	/* Cheapest source first: the node's shared array. */
	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
		l3->shared->touched = 1;
		goto alloc_done;
	}

	while (batchcount > 0) {
		struct list_head *entry = l3->slabs_partial.next;
		struct slab *slabp;

		/* Prefer a partial slab; fall back to a completely free one. */
		if (entry == &l3->slabs_partial) {
			l3->free_touched = 1;
			entry = l3->slabs_free.next;
			if (entry == &l3->slabs_free)
				goto must_grow;
		}

		slabp = list_entry(entry, struct slab, list);
		check_slabp(cachep, slabp);
		check_spinlock_acquired(cachep);

		/*
		 * A slab taken from the partial or free list has to offer
		 * at least one object.
		 */
		BUG_ON(slabp->inuse >= cachep->num);

		while (slabp->inuse < cachep->num && batchcount--) {
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);

			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
							    node);
		}
		check_slabp(cachep, slabp);

		/* Re-file the slab according to what is left on it. */
		list_del(&slabp->list);
		if (slabp->free == BUFCTL_END)
			list_add(&slabp->list, &l3->slabs_full);
		else
			list_add(&slabp->list, &l3->slabs_partial);
	}

must_grow:
	l3->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&l3->list_lock);

	if (unlikely(!ac->avail)) {
		int grown = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

		/* cache_grow can reenable interrupts, then ac could change. */
		ac = cpu_cache_get(cachep);
		if (!ac->avail) {
			/* Still empty: give up if growing failed, else retry. */
			if (!grown)
				return NULL;
			goto retry;
		}
	}
	ac->touched = 1;
	return ac->entry[--ac->avail];
}
3098 
/*
 * Checks run before an allocation is attempted: annotate that the caller
 * may sleep when @flags contains __GFP_WAIT and, in DEBUG builds only, run
 * kmem_flagcheck() on the cache/flags combination.
 */
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
						gfp_t flags)
{
	might_sleep_if(flags & __GFP_WAIT);
#if DEBUG
	kmem_flagcheck(cachep, flags);
#endif
}
3107 
3108 #if DEBUG
/*
 * Debug-side processing of a freshly allocated object.
 *
 * @objp is the raw object as stored in the cache (debug fields included);
 * NULL is passed straight through.  Depending on the cache flags the
 * poison pattern is checked and replaced by POISON_INUSE, @caller is stored
 * in the user word, and the red zones are checked for RED_INACTIVE and then
 * set to RED_ACTIVE.  The pointer is finally advanced by obj_offset() to the
 * caller-visible object, the constructor is re-run for poisoned caches, and
 * the alignment against ARCH_SLAB_MINALIGN is reported if violated.
 *
 * Returns the caller-visible object pointer.
 */
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
				gfp_t flags, void *objp, void *caller)
{
	if (!objp)
		return objp;
	if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
			kernel_map_pages(virt_to_page(objp),
					 cachep->buffer_size / PAGE_SIZE, 1);
		else
			check_poison_obj(cachep, objp);
#else
		check_poison_obj(cachep, objp);
#endif
		poison_obj(cachep, objp, POISON_INUSE);
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		/* A free object must carry inactive red zones on both sides. */
		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
			slab_error(cachep, "double free, or memory outside"
						" object was overwritten");
			printk(KERN_ERR
				"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
				objp, *dbg_redzone1(cachep, objp),
				*dbg_redzone2(cachep, objp));
		}
		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
	}
#ifdef CONFIG_DEBUG_SLAB_LEAK
	{
		struct slab *slabp;
		unsigned objnr;

		slabp = page_get_slab(virt_to_head_page(objp));
		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
	}
#endif
	objp += obj_offset(cachep);
	/* Poisoning destroyed the constructed state, so rebuild it. */
	if (cachep->ctor && cachep->flags & SLAB_POISON)
		cachep->ctor(objp);
	/*
	 * Alignment sanity check.  This is a run-time test rather than an
	 * #if so that ARCH_SLAB_MINALIGN may be an expression the
	 * preprocessor cannot evaluate (e.g. one built on __alignof__).
	 * The pointer is widened to unsigned long: a u32 cast would
	 * truncate it on 64-bit builds.  The constant is cast to int to
	 * match the %d conversion whatever its underlying type is.
	 */
	if (ARCH_SLAB_MINALIGN &&
	    ((unsigned long)objp & (ARCH_SLAB_MINALIGN - 1))) {
		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
		       objp, (int)ARCH_SLAB_MINALIGN);
	}
	return objp;
}
3163 #else
/* DEBUG is off: the post-allocation check just yields the object pointer. */
#define cache_alloc_debugcheck_after(a, b, objp, d)	(objp)
3165 #endif
3166 
slab_should_failslab(struct kmem_cache * cachep,gfp_t flags)3167 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3168 {
3169 	if (cachep == &cache_cache)
3170 		return false;
3171 
3172 	return should_failslab(obj_size(cachep), flags, cachep->flags);
3173 }
3174 
/*
 * Allocate one object via this CPU's array cache.
 *
 * A non-empty array cache serves the request directly; an empty one is
 * refilled through cache_alloc_refill(), which may return NULL.  Must be
 * entered with interrupts off (see check_irq_off()).
 */
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	struct array_cache *ac;
	void *objp;

	check_irq_off();

	ac = cpu_cache_get(cachep);
	if (unlikely(!ac->avail)) {
		STATS_INC_ALLOCMISS(cachep);
		objp = cache_alloc_refill(cachep, flags);
		/*
		 * the 'ac' may be updated by cache_alloc_refill(),
		 * and kmemleak_erase() requires its correct value.
		 */
		ac = cpu_cache_get(cachep);
	} else {
		STATS_INC_ALLOCHIT(cachep);
		ac->touched = 1;
		objp = ac->entry[--ac->avail];
	}

	if (!objp)
		return objp;

	/*
	 * The slot the object was just taken from still holds its address.
	 * Erase it so that kmemleak does not mistake the array entry for a
	 * live reference, which would hide a leak of this object.
	 */
	kmemleak_erase(&ac->entry[ac->avail]);
	return objp;
}
3205 
3206 #ifdef CONFIG_NUMA
3207 /*
3208  * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3209  *
3210  * If we are in_interrupt, then process context, including cpusets and
3211  * mempolicy, may not apply and should not be used for allocation policy.
3212  */
alternate_node_alloc(struct kmem_cache * cachep,gfp_t flags)3213 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3214 {
3215 	int nid_alloc, nid_here;
3216 
3217 	if (in_interrupt() || (flags & __GFP_THISNODE))
3218 		return NULL;
3219 	nid_alloc = nid_here = numa_mem_id();
3220 	get_mems_allowed();
3221 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3222 		nid_alloc = cpuset_slab_spread_node();
3223 	else if (current->mempolicy)
3224 		nid_alloc = slab_node(current->mempolicy);
3225 	put_mems_allowed();
3226 	if (nid_alloc != nid_here)
3227 		return ____cache_alloc_node(cachep, flags, nid_alloc);
3228 	return NULL;
3229 }
3230 
3231 /*
3232  * Fallback function if there was no memory available and no objects on a
3233  * certain node and fall back is permitted. First we scan all the
3234  * available nodelists for available objects. If that fails then we
3235  * perform an allocation without specifying a node. This allows the page
3236  * allocator to do its reclaim / fallback magic. We then insert the
3237  * slab into the proper nodelist and then allocate from it.
3238  */
fallback_alloc(struct kmem_cache * cache,gfp_t flags)3239 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3240 {
3241 	struct zonelist *zonelist;
3242 	gfp_t local_flags;
3243 	struct zoneref *z;
3244 	struct zone *zone;
3245 	enum zone_type high_zoneidx = gfp_zone(flags);
3246 	void *obj = NULL;
3247 	int nid;
3248 
3249 	if (flags & __GFP_THISNODE)
3250 		return NULL;
3251 
3252 	get_mems_allowed();
3253 	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3254 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3255 
3256 retry:
3257 	/*
3258 	 * Look through allowed nodes for objects available
3259 	 * from existing per node queues.
3260 	 */
3261 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3262 		nid = zone_to_nid(zone);
3263 
3264 		if (cpuset_zone_allowed_hardwall(zone, flags) &&
3265 			cache->nodelists[nid] &&
3266 			cache->nodelists[nid]->free_objects) {
3267 				obj = ____cache_alloc_node(cache,
3268 					flags | GFP_THISNODE, nid);
3269 				if (obj)
3270 					break;
3271 		}
3272 	}
3273 
3274 	if (!obj) {
3275 		/*
3276 		 * This allocation will be performed within the constraints
3277 		 * of the current cpuset / memory policy requirements.
3278 		 * We may trigger various forms of reclaim on the allowed
3279 		 * set and go into memory reserves if necessary.
3280 		 */
3281 		if (local_flags & __GFP_WAIT)
3282 			local_irq_enable();
3283 		kmem_flagcheck(cache, flags);
3284 		obj = kmem_getpages(cache, local_flags, numa_mem_id());
3285 		if (local_flags & __GFP_WAIT)
3286 			local_irq_disable();
3287 		if (obj) {
3288 			/*
3289 			 * Insert into the appropriate per node queues
3290 			 */
3291 			nid = page_to_nid(virt_to_page(obj));
3292 			if (cache_grow(cache, flags, nid, obj)) {
3293 				obj = ____cache_alloc_node(cache,
3294 					flags | GFP_THISNODE, nid);
3295 				if (!obj)
3296 					/*
3297 					 * Another processor may allocate the
3298 					 * objects in the slab since we are
3299 					 * not holding any locks.
3300 					 */
3301 					goto retry;
3302 			} else {
3303 				/* cache_grow already freed obj */
3304 				obj = NULL;
3305 			}
3306 		}
3307 	}
3308 	put_mems_allowed();
3309 	return obj;
3310 }
3311 
3312 /*
3313  * A interface to enable slab creation on nodeid
3314  */
____cache_alloc_node(struct kmem_cache * cachep,gfp_t flags,int nodeid)3315 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3316 				int nodeid)
3317 {
3318 	struct list_head *entry;
3319 	struct slab *slabp;
3320 	struct kmem_list3 *l3;
3321 	void *obj;
3322 	int x;
3323 
3324 	l3 = cachep->nodelists[nodeid];
3325 	BUG_ON(!l3);
3326 
3327 retry:
3328 	check_irq_off();
3329 	spin_lock(&l3->list_lock);
3330 	entry = l3->slabs_partial.next;
3331 	if (entry == &l3->slabs_partial) {
3332 		l3->free_touched = 1;
3333 		entry = l3->slabs_free.next;
3334 		if (entry == &l3->slabs_free)
3335 			goto must_grow;
3336 	}
3337 
3338 	slabp = list_entry(entry, struct slab, list);
3339 	check_spinlock_acquired_node(cachep, nodeid);
3340 	check_slabp(cachep, slabp);
3341 
3342 	STATS_INC_NODEALLOCS(cachep);
3343 	STATS_INC_ACTIVE(cachep);
3344 	STATS_SET_HIGH(cachep);
3345 
3346 	BUG_ON(slabp->inuse == cachep->num);
3347 
3348 	obj = slab_get_obj(cachep, slabp, nodeid);
3349 	check_slabp(cachep, slabp);
3350 	l3->free_objects--;
3351 	/* move slabp to correct slabp list: */
3352 	list_del(&slabp->list);
3353 
3354 	if (slabp->free == BUFCTL_END)
3355 		list_add(&slabp->list, &l3->slabs_full);
3356 	else
3357 		list_add(&slabp->list, &l3->slabs_partial);
3358 
3359 	spin_unlock(&l3->list_lock);
3360 	goto done;
3361 
3362 must_grow:
3363 	spin_unlock(&l3->list_lock);
3364 	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3365 	if (x)
3366 		goto retry;
3367 
3368 	return fallback_alloc(cachep, flags);
3369 
3370 done:
3371 	return obj;
3372 }
3373 
3374 /**
3375  * kmem_cache_alloc_node - Allocate an object on the specified node
3376  * @cachep: The cache to allocate from.
3377  * @flags: See kmalloc().
3378  * @nodeid: node number of the target node.
3379  * @caller: return address of caller, used for debug information
3380  *
3381  * Identical to kmem_cache_alloc but it will allocate memory on the given
3382  * node, which can improve the performance for cpu bound structures.
3383  *
3384  * Fallback to other node is possible if __GFP_THISNODE is not set.
3385  */
3386 static __always_inline void *
__cache_alloc_node(struct kmem_cache * cachep,gfp_t flags,int nodeid,void * caller)3387 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3388 		   void *caller)
3389 {
3390 	unsigned long save_flags;
3391 	void *ptr;
3392 	int slab_node = numa_mem_id();
3393 
3394 	flags &= gfp_allowed_mask;
3395 
3396 	lockdep_trace_alloc(flags);
3397 
3398 	if (slab_should_failslab(cachep, flags))
3399 		return NULL;
3400 
3401 	cache_alloc_debugcheck_before(cachep, flags);
3402 	local_irq_save(save_flags);
3403 
3404 	if (nodeid == -1)
3405 		nodeid = slab_node;
3406 
3407 	if (unlikely(!cachep->nodelists[nodeid])) {
3408 		/* Node not bootstrapped yet */
3409 		ptr = fallback_alloc(cachep, flags);
3410 		goto out;
3411 	}
3412 
3413 	if (nodeid == slab_node) {
3414 		/*
3415 		 * Use the locally cached objects if possible.
3416 		 * However ____cache_alloc does not allow fallback
3417 		 * to other nodes. It may fail while we still have
3418 		 * objects on other nodes available.
3419 		 */
3420 		ptr = ____cache_alloc(cachep, flags);
3421 		if (ptr)
3422 			goto out;
3423 	}
3424 	/* ___cache_alloc_node can fall back to other nodes */
3425 	ptr = ____cache_alloc_node(cachep, flags, nodeid);
3426   out:
3427 	local_irq_restore(save_flags);
3428 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3429 	kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3430 				 flags);
3431 
3432 	if (likely(ptr))
3433 		kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3434 
3435 	if (unlikely((flags & __GFP_ZERO) && ptr))
3436 		memset(ptr, 0, obj_size(cachep));
3437 
3438 	return ptr;
3439 }
3440 
3441 static __always_inline void *
__do_cache_alloc(struct kmem_cache * cache,gfp_t flags)3442 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3443 {
3444 	void *objp;
3445 
3446 	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3447 		objp = alternate_node_alloc(cache, flags);
3448 		if (objp)
3449 			goto out;
3450 	}
3451 	objp = ____cache_alloc(cache, flags);
3452 
3453 	/*
3454 	 * We may just have run out of memory on the local node.
3455 	 * ____cache_alloc_node() knows how to locate memory on other nodes
3456 	 */
3457 	if (!objp)
3458 		objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3459 
3460   out:
3461 	return objp;
3462 }
3463 #else
3464 
3465 static __always_inline void *
__do_cache_alloc(struct kmem_cache * cachep,gfp_t flags)3466 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3467 {
3468 	return ____cache_alloc(cachep, flags);
3469 }
3470 
3471 #endif /* CONFIG_NUMA */
3472 
3473 static __always_inline void *
__cache_alloc(struct kmem_cache * cachep,gfp_t flags,void * caller)3474 __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3475 {
3476 	unsigned long save_flags;
3477 	void *objp;
3478 
3479 	flags &= gfp_allowed_mask;
3480 
3481 	lockdep_trace_alloc(flags);
3482 
3483 	if (slab_should_failslab(cachep, flags))
3484 		return NULL;
3485 
3486 	cache_alloc_debugcheck_before(cachep, flags);
3487 	local_irq_save(save_flags);
3488 	objp = __do_cache_alloc(cachep, flags);
3489 	local_irq_restore(save_flags);
3490 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3491 	kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3492 				 flags);
3493 	prefetchw(objp);
3494 
3495 	if (likely(objp))
3496 		kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3497 
3498 	if (unlikely((flags & __GFP_ZERO) && objp))
3499 		memset(objp, 0, obj_size(cachep));
3500 
3501 	return objp;
3502 }
3503 
3504 /*
3505  * Caller needs to acquire correct kmem_list's list_lock
3506  */
free_block(struct kmem_cache * cachep,void ** objpp,int nr_objects,int node)3507 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3508 		       int node)
3509 {
3510 	int i;
3511 	struct kmem_list3 *l3;
3512 
3513 	for (i = 0; i < nr_objects; i++) {
3514 		void *objp = objpp[i];
3515 		struct slab *slabp;
3516 
3517 		slabp = virt_to_slab(objp);
3518 		l3 = cachep->nodelists[node];
3519 		list_del(&slabp->list);
3520 		check_spinlock_acquired_node(cachep, node);
3521 		check_slabp(cachep, slabp);
3522 		slab_put_obj(cachep, slabp, objp, node);
3523 		STATS_DEC_ACTIVE(cachep);
3524 		l3->free_objects++;
3525 		check_slabp(cachep, slabp);
3526 
3527 		/* fixup slab chains */
3528 		if (slabp->inuse == 0) {
3529 			if (l3->free_objects > l3->free_limit) {
3530 				l3->free_objects -= cachep->num;
3531 				/* No need to drop any previously held
3532 				 * lock here, even if we have a off-slab slab
3533 				 * descriptor it is guaranteed to come from
3534 				 * a different cache, refer to comments before
3535 				 * alloc_slabmgmt.
3536 				 */
3537 				slab_destroy(cachep, slabp);
3538 			} else {
3539 				list_add(&slabp->list, &l3->slabs_free);
3540 			}
3541 		} else {
3542 			/* Unconditionally move a slab to the end of the
3543 			 * partial list on free - maximum time for the
3544 			 * other objects to be freed, too.
3545 			 */
3546 			list_add_tail(&slabp->list, &l3->slabs_partial);
3547 		}
3548 	}
3549 }
3550 
cache_flusharray(struct kmem_cache * cachep,struct array_cache * ac)3551 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3552 {
3553 	int batchcount;
3554 	struct kmem_list3 *l3;
3555 	int node = numa_mem_id();
3556 
3557 	batchcount = ac->batchcount;
3558 #if DEBUG
3559 	BUG_ON(!batchcount || batchcount > ac->avail);
3560 #endif
3561 	check_irq_off();
3562 	l3 = cachep->nodelists[node];
3563 	spin_lock(&l3->list_lock);
3564 	if (l3->shared) {
3565 		struct array_cache *shared_array = l3->shared;
3566 		int max = shared_array->limit - shared_array->avail;
3567 		if (max) {
3568 			if (batchcount > max)
3569 				batchcount = max;
3570 			memcpy(&(shared_array->entry[shared_array->avail]),
3571 			       ac->entry, sizeof(void *) * batchcount);
3572 			shared_array->avail += batchcount;
3573 			goto free_done;
3574 		}
3575 	}
3576 
3577 	free_block(cachep, ac->entry, batchcount, node);
3578 free_done:
3579 #if STATS
3580 	{
3581 		int i = 0;
3582 		struct list_head *p;
3583 
3584 		p = l3->slabs_free.next;
3585 		while (p != &(l3->slabs_free)) {
3586 			struct slab *slabp;
3587 
3588 			slabp = list_entry(p, struct slab, list);
3589 			BUG_ON(slabp->inuse);
3590 
3591 			i++;
3592 			p = p->next;
3593 		}
3594 		STATS_SET_FREEABLE(cachep, i);
3595 	}
3596 #endif
3597 	spin_unlock(&l3->list_lock);
3598 	ac->avail -= batchcount;
3599 	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3600 }
3601 
3602 /*
3603  * Release an obj back to its cache. If the obj has a constructed state, it must
3604  * be in this state _before_ it is released.  Called with disabled ints.
3605  */
__cache_free(struct kmem_cache * cachep,void * objp)3606 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3607 {
3608 	struct array_cache *ac = cpu_cache_get(cachep);
3609 
3610 	check_irq_off();
3611 	kmemleak_free_recursive(objp, cachep->flags);
3612 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3613 
3614 	kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3615 
3616 	/*
3617 	 * Skip calling cache_free_alien() when the platform is not numa.
3618 	 * This will avoid cache misses that happen while accessing slabp (which
3619 	 * is per page memory  reference) to get nodeid. Instead use a global
3620 	 * variable to skip the call, which is mostly likely to be present in
3621 	 * the cache.
3622 	 */
3623 	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3624 		return;
3625 
3626 	if (likely(ac->avail < ac->limit)) {
3627 		STATS_INC_FREEHIT(cachep);
3628 		ac->entry[ac->avail++] = objp;
3629 		return;
3630 	} else {
3631 		STATS_INC_FREEMISS(cachep);
3632 		cache_flusharray(cachep, ac);
3633 		ac->entry[ac->avail++] = objp;
3634 	}
3635 }
3636 
3637 /**
3638  * kmem_cache_alloc - Allocate an object
3639  * @cachep: The cache to allocate from.
3640  * @flags: See kmalloc().
3641  *
3642  * Allocate an object from this cache.  The flags are only relevant
3643  * if the cache has no available objects.
3644  */
kmem_cache_alloc(struct kmem_cache * cachep,gfp_t flags)3645 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3646 {
3647 	void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3648 
3649 	trace_kmem_cache_alloc(_RET_IP_, ret,
3650 			       obj_size(cachep), cachep->buffer_size, flags);
3651 
3652 	return ret;
3653 }
3654 EXPORT_SYMBOL(kmem_cache_alloc);
3655 
3656 #ifdef CONFIG_TRACING
3657 void *
kmem_cache_alloc_trace(size_t size,struct kmem_cache * cachep,gfp_t flags)3658 kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
3659 {
3660 	void *ret;
3661 
3662 	ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3663 
3664 	trace_kmalloc(_RET_IP_, ret,
3665 		      size, slab_buffer_size(cachep), flags);
3666 	return ret;
3667 }
3668 EXPORT_SYMBOL(kmem_cache_alloc_trace);
3669 #endif
3670 
3671 #ifdef CONFIG_NUMA
/*
 * Exported node-aware allocation: allocate via __cache_alloc_node() with
 * this function's return address as the caller, then emit the
 * kmem_cache_alloc_node trace event.
 */
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
	void *objp;

	objp = __cache_alloc_node(cachep, flags, nodeid,
				  __builtin_return_address(0));

	trace_kmem_cache_alloc_node(_RET_IP_, objp,
				    obj_size(cachep), cachep->buffer_size,
				    flags, nodeid);

	return objp;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
3684 
3685 #ifdef CONFIG_TRACING
/*
 * Same allocation as kmem_cache_alloc_node(), but reported through the
 * kmalloc_node trace event with the caller-supplied request @size.
 */
void *kmem_cache_alloc_node_trace(size_t size,
				  struct kmem_cache *cachep,
				  gfp_t flags,
				  int nodeid)
{
	void *objp = __cache_alloc_node(cachep, flags, nodeid,
					__builtin_return_address(0));

	trace_kmalloc_node(_RET_IP_, objp,
			   size, slab_buffer_size(cachep),
			   flags, nodeid);

	return objp;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3701 #endif
3702 
3703 static __always_inline void *
__do_kmalloc_node(size_t size,gfp_t flags,int node,void * caller)3704 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3705 {
3706 	struct kmem_cache *cachep;
3707 
3708 	cachep = kmem_find_general_cachep(size, flags);
3709 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3710 		return cachep;
3711 	return kmem_cache_alloc_node_trace(size, cachep, flags, node);
3712 }
3713 
3714 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
__kmalloc_node(size_t size,gfp_t flags,int node)3715 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3716 {
3717 	return __do_kmalloc_node(size, flags, node,
3718 			__builtin_return_address(0));
3719 }
3720 EXPORT_SYMBOL(__kmalloc_node);
3721 
__kmalloc_node_track_caller(size_t size,gfp_t flags,int node,unsigned long caller)3722 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3723 		int node, unsigned long caller)
3724 {
3725 	return __do_kmalloc_node(size, flags, node, (void *)caller);
3726 }
3727 EXPORT_SYMBOL(__kmalloc_node_track_caller);
3728 #else
__kmalloc_node(size_t size,gfp_t flags,int node)3729 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3730 {
3731 	return __do_kmalloc_node(size, flags, node, NULL);
3732 }
3733 EXPORT_SYMBOL(__kmalloc_node);
3734 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3735 #endif /* CONFIG_NUMA */
3736 
3737 /**
3738  * __do_kmalloc - allocate memory
3739  * @size: how many bytes of memory are required.
3740  * @flags: the type of memory to allocate (see kmalloc).
3741  * @caller: function caller for debug tracking of the caller
3742  */
__do_kmalloc(size_t size,gfp_t flags,void * caller)3743 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3744 					  void *caller)
3745 {
3746 	struct kmem_cache *cachep;
3747 	void *ret;
3748 
3749 	/* If you want to save a few bytes .text space: replace
3750 	 * __ with kmem_.
3751 	 * Then kmalloc uses the uninlined functions instead of the inline
3752 	 * functions.
3753 	 */
3754 	cachep = __find_general_cachep(size, flags);
3755 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3756 		return cachep;
3757 	ret = __cache_alloc(cachep, flags, caller);
3758 
3759 	trace_kmalloc((unsigned long) caller, ret,
3760 		      size, cachep->buffer_size, flags);
3761 
3762 	return ret;
3763 }
3764 
3765 
3766 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
__kmalloc(size_t size,gfp_t flags)3767 void *__kmalloc(size_t size, gfp_t flags)
3768 {
3769 	return __do_kmalloc(size, flags, __builtin_return_address(0));
3770 }
3771 EXPORT_SYMBOL(__kmalloc);
3772 
__kmalloc_track_caller(size_t size,gfp_t flags,unsigned long caller)3773 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3774 {
3775 	return __do_kmalloc(size, flags, (void *)caller);
3776 }
3777 EXPORT_SYMBOL(__kmalloc_track_caller);
3778 
3779 #else
__kmalloc(size_t size,gfp_t flags)3780 void *__kmalloc(size_t size, gfp_t flags)
3781 {
3782 	return __do_kmalloc(size, flags, NULL);
3783 }
3784 EXPORT_SYMBOL(__kmalloc);
3785 #endif
3786 
3787 /**
3788  * kmem_cache_free - Deallocate an object
3789  * @cachep: The cache the allocation was from.
3790  * @objp: The previously allocated object.
3791  *
3792  * Free an object which was previously allocated from this
3793  * cache.
3794  */
kmem_cache_free(struct kmem_cache * cachep,void * objp)3795 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3796 {
3797 	unsigned long flags;
3798 
3799 	local_irq_save(flags);
3800 	debug_check_no_locks_freed(objp, obj_size(cachep));
3801 	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3802 		debug_check_no_obj_freed(objp, obj_size(cachep));
3803 	__cache_free(cachep, objp);
3804 	local_irq_restore(flags);
3805 
3806 	trace_kmem_cache_free(_RET_IP_, objp);
3807 }
3808 EXPORT_SYMBOL(kmem_cache_free);
3809 
3810 /**
3811  * kfree - free previously allocated memory
3812  * @objp: pointer returned by kmalloc.
3813  *
3814  * If @objp is NULL, no operation is performed.
3815  *
3816  * Don't free memory not originally allocated by kmalloc()
3817  * or you will run into trouble.
3818  */
kfree(const void * objp)3819 void kfree(const void *objp)
3820 {
3821 	struct kmem_cache *c;
3822 	unsigned long flags;
3823 
3824 	trace_kfree(_RET_IP_, objp);
3825 
3826 	if (unlikely(ZERO_OR_NULL_PTR(objp)))
3827 		return;
3828 	local_irq_save(flags);
3829 	kfree_debugcheck(objp);
3830 	c = virt_to_cache(objp);
3831 	debug_check_no_locks_freed(objp, obj_size(c));
3832 	debug_check_no_obj_freed(objp, obj_size(c));
3833 	__cache_free(c, (void *)objp);
3834 	local_irq_restore(flags);
3835 }
3836 EXPORT_SYMBOL(kfree);
3837 
/* Report the object size of @cachep, as given by obj_size(). */
unsigned int kmem_cache_size(struct kmem_cache *cachep)
{
	unsigned int size = obj_size(cachep);

	return size;
}
EXPORT_SYMBOL(kmem_cache_size);
3843 
3844 /*
3845  * This initializes kmem_list3 or resizes various caches for all nodes.
3846  */
alloc_kmemlist(struct kmem_cache * cachep,gfp_t gfp)3847 static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3848 {
3849 	int node;
3850 	struct kmem_list3 *l3;
3851 	struct array_cache *new_shared;
3852 	struct array_cache **new_alien = NULL;
3853 
3854 	for_each_online_node(node) {
3855 
3856                 if (use_alien_caches) {
3857                         new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3858                         if (!new_alien)
3859                                 goto fail;
3860                 }
3861 
3862 		new_shared = NULL;
3863 		if (cachep->shared) {
3864 			new_shared = alloc_arraycache(node,
3865 				cachep->shared*cachep->batchcount,
3866 					0xbaadf00d, gfp);
3867 			if (!new_shared) {
3868 				free_alien_cache(new_alien);
3869 				goto fail;
3870 			}
3871 		}
3872 
3873 		l3 = cachep->nodelists[node];
3874 		if (l3) {
3875 			struct array_cache *shared = l3->shared;
3876 
3877 			spin_lock_irq(&l3->list_lock);
3878 
3879 			if (shared)
3880 				free_block(cachep, shared->entry,
3881 						shared->avail, node);
3882 
3883 			l3->shared = new_shared;
3884 			if (!l3->alien) {
3885 				l3->alien = new_alien;
3886 				new_alien = NULL;
3887 			}
3888 			l3->free_limit = (1 + nr_cpus_node(node)) *
3889 					cachep->batchcount + cachep->num;
3890 			spin_unlock_irq(&l3->list_lock);
3891 			kfree(shared);
3892 			free_alien_cache(new_alien);
3893 			continue;
3894 		}
3895 		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3896 		if (!l3) {
3897 			free_alien_cache(new_alien);
3898 			kfree(new_shared);
3899 			goto fail;
3900 		}
3901 
3902 		kmem_list3_init(l3);
3903 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3904 				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3905 		l3->shared = new_shared;
3906 		l3->alien = new_alien;
3907 		l3->free_limit = (1 + nr_cpus_node(node)) *
3908 					cachep->batchcount + cachep->num;
3909 		cachep->nodelists[node] = l3;
3910 	}
3911 	return 0;
3912 
3913 fail:
3914 	if (!cachep->next.next) {
3915 		/* Cache is not active yet. Roll back what we did */
3916 		node--;
3917 		while (node >= 0) {
3918 			if (cachep->nodelists[node]) {
3919 				l3 = cachep->nodelists[node];
3920 
3921 				kfree(l3->shared);
3922 				free_alien_cache(l3->alien);
3923 				kfree(l3);
3924 				cachep->nodelists[node] = NULL;
3925 			}
3926 			node--;
3927 		}
3928 	}
3929 	return -ENOMEM;
3930 }
3931 
3932 struct ccupdate_struct {
3933 	struct kmem_cache *cachep;
3934 	struct array_cache *new[NR_CPUS];
3935 };
3936 
do_ccupdate_local(void * info)3937 static void do_ccupdate_local(void *info)
3938 {
3939 	struct ccupdate_struct *new = info;
3940 	struct array_cache *old;
3941 
3942 	check_irq_off();
3943 	old = cpu_cache_get(new->cachep);
3944 
3945 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3946 	new->new[smp_processor_id()] = old;
3947 }
3948 
3949 /* Always called with the cache_chain_mutex held */
do_tune_cpucache(struct kmem_cache * cachep,int limit,int batchcount,int shared,gfp_t gfp)3950 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3951 				int batchcount, int shared, gfp_t gfp)
3952 {
3953 	struct ccupdate_struct *new;
3954 	int i;
3955 
3956 	new = kzalloc(sizeof(*new), gfp);
3957 	if (!new)
3958 		return -ENOMEM;
3959 
3960 	for_each_online_cpu(i) {
3961 		new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
3962 						batchcount, gfp);
3963 		if (!new->new[i]) {
3964 			for (i--; i >= 0; i--)
3965 				kfree(new->new[i]);
3966 			kfree(new);
3967 			return -ENOMEM;
3968 		}
3969 	}
3970 	new->cachep = cachep;
3971 
3972 	on_each_cpu(do_ccupdate_local, (void *)new, 1);
3973 
3974 	check_irq_on();
3975 	cachep->batchcount = batchcount;
3976 	cachep->limit = limit;
3977 	cachep->shared = shared;
3978 
3979 	for_each_online_cpu(i) {
3980 		struct array_cache *ccold = new->new[i];
3981 		if (!ccold)
3982 			continue;
3983 		spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
3984 		free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
3985 		spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
3986 		kfree(ccold);
3987 	}
3988 	kfree(new);
3989 	return alloc_kmemlist(cachep, gfp);
3990 }
3991 
3992 /* Called with cache_chain_mutex held always */
enable_cpucache(struct kmem_cache * cachep,gfp_t gfp)3993 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3994 {
3995 	int err;
3996 	int limit, shared;
3997 
3998 	/*
3999 	 * The head array serves three purposes:
4000 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
4001 	 * - reduce the number of spinlock operations.
4002 	 * - reduce the number of linked list operations on the slab and
4003 	 *   bufctl chains: array operations are cheaper.
4004 	 * The numbers are guessed, we should auto-tune as described by
4005 	 * Bonwick.
4006 	 */
4007 	if (cachep->buffer_size > 131072)
4008 		limit = 1;
4009 	else if (cachep->buffer_size > PAGE_SIZE)
4010 		limit = 8;
4011 	else if (cachep->buffer_size > 1024)
4012 		limit = 24;
4013 	else if (cachep->buffer_size > 256)
4014 		limit = 54;
4015 	else
4016 		limit = 120;
4017 
4018 	/*
4019 	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
4020 	 * allocation behaviour: Most allocs on one cpu, most free operations
4021 	 * on another cpu. For these cases, an efficient object passing between
4022 	 * cpus is necessary. This is provided by a shared array. The array
4023 	 * replaces Bonwick's magazine layer.
4024 	 * On uniprocessor, it's functionally equivalent (but less efficient)
4025 	 * to a larger limit. Thus disabled by default.
4026 	 */
4027 	shared = 0;
4028 	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
4029 		shared = 8;
4030 
4031 #if DEBUG
4032 	/*
4033 	 * With debugging enabled, large batchcount lead to excessively long
4034 	 * periods with disabled local interrupts. Limit the batchcount
4035 	 */
4036 	if (limit > 32)
4037 		limit = 32;
4038 #endif
4039 	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
4040 	if (err)
4041 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4042 		       cachep->name, -err);
4043 	return err;
4044 }
4045 
4046 /*
4047  * Drain an array if it contains any elements taking the l3 lock only if
4048  * necessary. Note that the l3 listlock also protects the array_cache
4049  * if drain_array() is used on the shared array.
4050  */
drain_array(struct kmem_cache * cachep,struct kmem_list3 * l3,struct array_cache * ac,int force,int node)4051 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4052 			 struct array_cache *ac, int force, int node)
4053 {
4054 	int tofree;
4055 
4056 	if (!ac || !ac->avail)
4057 		return;
4058 	if (ac->touched && !force) {
4059 		ac->touched = 0;
4060 	} else {
4061 		spin_lock_irq(&l3->list_lock);
4062 		if (ac->avail) {
4063 			tofree = force ? ac->avail : (ac->limit + 4) / 5;
4064 			if (tofree > ac->avail)
4065 				tofree = (ac->avail + 1) / 2;
4066 			free_block(cachep, ac->entry, tofree, node);
4067 			ac->avail -= tofree;
4068 			memmove(ac->entry, &(ac->entry[tofree]),
4069 				sizeof(void *) * ac->avail);
4070 		}
4071 		spin_unlock_irq(&l3->list_lock);
4072 	}
4073 }
4074 
4075 /**
4076  * cache_reap - Reclaim memory from caches.
4077  * @w: work descriptor
4078  *
4079  * Called from workqueue/eventd every few seconds.
4080  * Purpose:
4081  * - clear the per-cpu caches for this CPU.
4082  * - return freeable pages to the main free memory pool.
4083  *
4084  * If we cannot acquire the cache chain mutex then just give up - we'll try
4085  * again on the next iteration.
4086  */
cache_reap(struct work_struct * w)4087 static void cache_reap(struct work_struct *w)
4088 {
4089 	struct kmem_cache *searchp;
4090 	struct kmem_list3 *l3;
4091 	int node = numa_mem_id();
4092 	struct delayed_work *work = to_delayed_work(w);
4093 
4094 	if (!mutex_trylock(&cache_chain_mutex))
4095 		/* Give up. Setup the next iteration. */
4096 		goto out;
4097 
4098 	list_for_each_entry(searchp, &cache_chain, next) {
4099 		check_irq_on();
4100 
4101 		/*
4102 		 * We only take the l3 lock if absolutely necessary and we
4103 		 * have established with reasonable certainty that
4104 		 * we can do some work if the lock was obtained.
4105 		 */
4106 		l3 = searchp->nodelists[node];
4107 
4108 		reap_alien(searchp, l3);
4109 
4110 		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4111 
4112 		/*
4113 		 * These are racy checks but it does not matter
4114 		 * if we skip one check or scan twice.
4115 		 */
4116 		if (time_after(l3->next_reap, jiffies))
4117 			goto next;
4118 
4119 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4120 
4121 		drain_array(searchp, l3, l3->shared, 0, node);
4122 
4123 		if (l3->free_touched)
4124 			l3->free_touched = 0;
4125 		else {
4126 			int freed;
4127 
4128 			freed = drain_freelist(searchp, l3, (l3->free_limit +
4129 				5 * searchp->num - 1) / (5 * searchp->num));
4130 			STATS_ADD_REAPED(searchp, freed);
4131 		}
4132 next:
4133 		cond_resched();
4134 	}
4135 	check_irq_on();
4136 	mutex_unlock(&cache_chain_mutex);
4137 	next_reap_node();
4138 out:
4139 	/* Set up the next iteration */
4140 	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4141 }
4142 
4143 #ifdef CONFIG_SLABINFO
4144 
/*
 * Emit the header of the slabinfo listing: a version line followed by one
 * line naming every column that s_show() prints.
 */
static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * The version string lets the output format change later without
	 * _too_ many complaints.
	 */
#if STATS
	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
	seq_puts(m, "slabinfo - version: 2.1\n");
#endif

	/* Columns that are always present. */
	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
		 "<objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");

#if STATS
	/* Extra columns when statistics are compiled in. */
	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
	seq_putc(m, '\n');
}
4167 
/*
 * seq_file ->start: take cache_chain_mutex (dropped again in s_stop()),
 * print the header when reading from position 0, and return the cache_chain
 * entry at *pos.
 */
static void *s_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&cache_chain_mutex);
	if (*pos == 0)
		print_slabinfo_header(m);

	return seq_list_start(&cache_chain, *pos);
}
4178 
/* seq_file ->next: step to the following entry of cache_chain. */
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	void *following = seq_list_next(p, &cache_chain, pos);

	return following;
}
4183 
s_stop(struct seq_file * m,void * p)4184 static void s_stop(struct seq_file *m, void *p)
4185 {
4186 	mutex_unlock(&cache_chain_mutex);
4187 }
4188 
s_show(struct seq_file * m,void * p)4189 static int s_show(struct seq_file *m, void *p)
4190 {
4191 	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4192 	struct slab *slabp;
4193 	unsigned long active_objs;
4194 	unsigned long num_objs;
4195 	unsigned long active_slabs = 0;
4196 	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4197 	const char *name;
4198 	char *error = NULL;
4199 	int node;
4200 	struct kmem_list3 *l3;
4201 
4202 	active_objs = 0;
4203 	num_slabs = 0;
4204 	for_each_online_node(node) {
4205 		l3 = cachep->nodelists[node];
4206 		if (!l3)
4207 			continue;
4208 
4209 		check_irq_on();
4210 		spin_lock_irq(&l3->list_lock);
4211 
4212 		list_for_each_entry(slabp, &l3->slabs_full, list) {
4213 			if (slabp->inuse != cachep->num && !error)
4214 				error = "slabs_full accounting error";
4215 			active_objs += cachep->num;
4216 			active_slabs++;
4217 		}
4218 		list_for_each_entry(slabp, &l3->slabs_partial, list) {
4219 			if (slabp->inuse == cachep->num && !error)
4220 				error = "slabs_partial inuse accounting error";
4221 			if (!slabp->inuse && !error)
4222 				error = "slabs_partial/inuse accounting error";
4223 			active_objs += slabp->inuse;
4224 			active_slabs++;
4225 		}
4226 		list_for_each_entry(slabp, &l3->slabs_free, list) {
4227 			if (slabp->inuse && !error)
4228 				error = "slabs_free/inuse accounting error";
4229 			num_slabs++;
4230 		}
4231 		free_objects += l3->free_objects;
4232 		if (l3->shared)
4233 			shared_avail += l3->shared->avail;
4234 
4235 		spin_unlock_irq(&l3->list_lock);
4236 	}
4237 	num_slabs += active_slabs;
4238 	num_objs = num_slabs * cachep->num;
4239 	if (num_objs - active_objs != free_objects && !error)
4240 		error = "free_objects accounting error";
4241 
4242 	name = cachep->name;
4243 	if (error)
4244 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4245 
4246 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4247 		   name, active_objs, num_objs, cachep->buffer_size,
4248 		   cachep->num, (1 << cachep->gfporder));
4249 	seq_printf(m, " : tunables %4u %4u %4u",
4250 		   cachep->limit, cachep->batchcount, cachep->shared);
4251 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
4252 		   active_slabs, num_slabs, shared_avail);
4253 #if STATS
4254 	{			/* list3 stats */
4255 		unsigned long high = cachep->high_mark;
4256 		unsigned long allocs = cachep->num_allocations;
4257 		unsigned long grown = cachep->grown;
4258 		unsigned long reaped = cachep->reaped;
4259 		unsigned long errors = cachep->errors;
4260 		unsigned long max_freeable = cachep->max_freeable;
4261 		unsigned long node_allocs = cachep->node_allocs;
4262 		unsigned long node_frees = cachep->node_frees;
4263 		unsigned long overflows = cachep->node_overflow;
4264 
4265 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4266 			   "%4lu %4lu %4lu %4lu %4lu",
4267 			   allocs, high, grown,
4268 			   reaped, errors, max_freeable, node_allocs,
4269 			   node_frees, overflows);
4270 	}
4271 	/* cpu stats */
4272 	{
4273 		unsigned long allochit = atomic_read(&cachep->allochit);
4274 		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4275 		unsigned long freehit = atomic_read(&cachep->freehit);
4276 		unsigned long freemiss = atomic_read(&cachep->freemiss);
4277 
4278 		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4279 			   allochit, allocmiss, freehit, freemiss);
4280 	}
4281 #endif
4282 	seq_putc(m, '\n');
4283 	return 0;
4284 }
4285 
4286 /*
4287  * slabinfo_op - iterator that generates /proc/slabinfo
4288  *
4289  * Output layout:
4290  * cache-name
4291  * num-active-objs
4292  * total-objs
4293  * object size
4294  * num-active-slabs
4295  * total-slabs
4296  * num-pages-per-slab
4297  * + further values on SMP and with statistics enabled
4298  */
4299 
4300 static const struct seq_operations slabinfo_op = {
4301 	.start = s_start,
4302 	.next = s_next,
4303 	.stop = s_stop,
4304 	.show = s_show,
4305 };
4306 
4307 #define MAX_SLABINFO_WRITE 128
4308 /**
4309  * slabinfo_write - Tuning for the slab allocator
4310  * @file: unused
4311  * @buffer: user buffer
4312  * @count: data length
4313  * @ppos: unused
4314  */
slabinfo_write(struct file * file,const char __user * buffer,size_t count,loff_t * ppos)4315 static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4316 		       size_t count, loff_t *ppos)
4317 {
4318 	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4319 	int limit, batchcount, shared, res;
4320 	struct kmem_cache *cachep;
4321 
4322 	if (count > MAX_SLABINFO_WRITE)
4323 		return -EINVAL;
4324 	if (copy_from_user(&kbuf, buffer, count))
4325 		return -EFAULT;
4326 	kbuf[MAX_SLABINFO_WRITE] = '\0';
4327 
4328 	tmp = strchr(kbuf, ' ');
4329 	if (!tmp)
4330 		return -EINVAL;
4331 	*tmp = '\0';
4332 	tmp++;
4333 	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4334 		return -EINVAL;
4335 
4336 	/* Find the cache in the chain of caches. */
4337 	mutex_lock(&cache_chain_mutex);
4338 	res = -EINVAL;
4339 	list_for_each_entry(cachep, &cache_chain, next) {
4340 		if (!strcmp(cachep->name, kbuf)) {
4341 			if (limit < 1 || batchcount < 1 ||
4342 					batchcount > limit || shared < 0) {
4343 				res = 0;
4344 			} else {
4345 				res = do_tune_cpucache(cachep, limit,
4346 						       batchcount, shared,
4347 						       GFP_KERNEL);
4348 			}
4349 			break;
4350 		}
4351 	}
4352 	mutex_unlock(&cache_chain_mutex);
4353 	if (res >= 0)
4354 		res = count;
4355 	return res;
4356 }
4357 
slabinfo_open(struct inode * inode,struct file * file)4358 static int slabinfo_open(struct inode *inode, struct file *file)
4359 {
4360 	return seq_open(file, &slabinfo_op);
4361 }
4362 
4363 static const struct file_operations proc_slabinfo_operations = {
4364 	.open		= slabinfo_open,
4365 	.read		= seq_read,
4366 	.write		= slabinfo_write,
4367 	.llseek		= seq_lseek,
4368 	.release	= seq_release,
4369 };
4370 
4371 #ifdef CONFIG_DEBUG_SLAB_LEAK
4372 
/*
 * seq_file ->start for the leak listing: take cache_chain_mutex and return
 * the cache_chain entry at *pos. No header is printed here.
 */
static void *leaks_start(struct seq_file *m, loff_t *pos)
{
	void *entry;

	mutex_lock(&cache_chain_mutex);
	entry = seq_list_start(&cache_chain, *pos);
	return entry;
}
4378 
/*
 * Record one occurrence of caller address @v in the table @n.
 *
 * Table layout: n[0] is the capacity in entries, n[1] the number of entries
 * in use, followed by (address, count) pairs kept sorted by address.
 *
 * Returns 1 if @v was counted (or is 0 and therefore ignored), and 0 if the
 * table is full. In the full case n[1] has been bumped to equal n[0] and
 * nothing was inserted.
 */
static inline int add_caller(unsigned long *n, unsigned long v)
{
	unsigned long *lo;
	int span;

	if (!v)
		return 1;

	/* Binary search over the sorted pairs; lo ends at the insert position. */
	lo = n + 2;
	span = n[1];
	while (span) {
		int half = span / 2;
		unsigned long *mid = lo + 2 * half;

		if (*mid == v) {
			mid[1]++;
			return 1;
		}
		if (*mid > v) {
			span = half;
		} else {
			lo = mid + 2;
			span -= half + 1;
		}
	}

	if (++n[1] == n[0])
		return 0;

	/*
	 * Open a gap at lo. n[1] was just incremented, so the entries that
	 * existed before this call end at n + 2 * n[1].
	 */
	memmove(lo + 2, lo,
		(size_t)((n + 2 * n[1]) - lo) * sizeof(unsigned long));
	lo[0] = v;
	lo[1] = 1;
	return 1;
}
4408 
handle_slab(unsigned long * n,struct kmem_cache * c,struct slab * s)4409 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4410 {
4411 	void *p;
4412 	int i;
4413 	if (n[0] == n[1])
4414 		return;
4415 	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4416 		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4417 			continue;
4418 		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4419 			return;
4420 	}
4421 }
4422 
/*
 * Print @address symbolically as "name+offset/size", followed by
 * " [module]" when a module name is reported. If CONFIG_KALLSYMS is off or
 * the lookup fails, print the raw pointer instead.
 */
static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	unsigned long sym_off, sym_size;
	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];

	if (lookup_symbol_attrs(address, &sym_size, &sym_off, modname, name) == 0) {
		seq_printf(m, "%s+%#lx/%#lx", name, sym_off, sym_size);
		if (modname[0])
			seq_printf(m, " [%s]", modname);
		return;
	}
#endif
	seq_printf(m, "%p", (void *)address);
}
4438 
leaks_show(struct seq_file * m,void * p)4439 static int leaks_show(struct seq_file *m, void *p)
4440 {
4441 	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4442 	struct slab *slabp;
4443 	struct kmem_list3 *l3;
4444 	const char *name;
4445 	unsigned long *n = m->private;
4446 	int node;
4447 	int i;
4448 
4449 	if (!(cachep->flags & SLAB_STORE_USER))
4450 		return 0;
4451 	if (!(cachep->flags & SLAB_RED_ZONE))
4452 		return 0;
4453 
4454 	/* OK, we can do it */
4455 
4456 	n[1] = 0;
4457 
4458 	for_each_online_node(node) {
4459 		l3 = cachep->nodelists[node];
4460 		if (!l3)
4461 			continue;
4462 
4463 		check_irq_on();
4464 		spin_lock_irq(&l3->list_lock);
4465 
4466 		list_for_each_entry(slabp, &l3->slabs_full, list)
4467 			handle_slab(n, cachep, slabp);
4468 		list_for_each_entry(slabp, &l3->slabs_partial, list)
4469 			handle_slab(n, cachep, slabp);
4470 		spin_unlock_irq(&l3->list_lock);
4471 	}
4472 	name = cachep->name;
4473 	if (n[0] == n[1]) {
4474 		/* Increase the buffer size */
4475 		mutex_unlock(&cache_chain_mutex);
4476 		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4477 		if (!m->private) {
4478 			/* Too bad, we are really out */
4479 			m->private = n;
4480 			mutex_lock(&cache_chain_mutex);
4481 			return -ENOMEM;
4482 		}
4483 		*(unsigned long *)m->private = n[0] * 2;
4484 		kfree(n);
4485 		mutex_lock(&cache_chain_mutex);
4486 		/* Now make sure this entry will be retried */
4487 		m->count = m->size;
4488 		return 0;
4489 	}
4490 	for (i = 0; i < n[1]; i++) {
4491 		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4492 		show_symbol(m, n[2*i+2]);
4493 		seq_putc(m, '\n');
4494 	}
4495 
4496 	return 0;
4497 }
4498 
4499 static const struct seq_operations slabstats_op = {
4500 	.start = leaks_start,
4501 	.next = s_next,
4502 	.stop = s_stop,
4503 	.show = leaks_show,
4504 };
4505 
slabstats_open(struct inode * inode,struct file * file)4506 static int slabstats_open(struct inode *inode, struct file *file)
4507 {
4508 	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4509 	int ret = -ENOMEM;
4510 	if (n) {
4511 		ret = seq_open(file, &slabstats_op);
4512 		if (!ret) {
4513 			struct seq_file *m = file->private_data;
4514 			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
4515 			m->private = n;
4516 			n = NULL;
4517 		}
4518 		kfree(n);
4519 	}
4520 	return ret;
4521 }
4522 
4523 static const struct file_operations proc_slabstats_operations = {
4524 	.open		= slabstats_open,
4525 	.read		= seq_read,
4526 	.llseek		= seq_lseek,
4527 	.release	= seq_release_private,
4528 };
4529 #endif
4530 
slab_proc_init(void)4531 static int __init slab_proc_init(void)
4532 {
4533 	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4534 #ifdef CONFIG_DEBUG_SLAB_LEAK
4535 	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4536 #endif
4537 	return 0;
4538 }
4539 module_init(slab_proc_init);
4540 #endif
4541 
4542 /**
4543  * ksize - get the actual amount of memory allocated for a given object
4544  * @objp: Pointer to the object
4545  *
4546  * kmalloc may internally round up allocations and return more memory
4547  * than requested. ksize() can be used to determine the actual amount of
4548  * memory allocated. The caller may use this additional memory, even though
4549  * a smaller amount of memory was initially specified with the kmalloc call.
4550  * The caller must guarantee that objp points to a valid object previously
4551  * allocated with either kmalloc() or kmem_cache_alloc(). The object
4552  * must not be freed during the duration of the call.
4553  */
ksize(const void * objp)4554 size_t ksize(const void *objp)
4555 {
4556 	BUG_ON(!objp);
4557 	if (unlikely(objp == ZERO_SIZE_PTR))
4558 		return 0;
4559 
4560 	return obj_size(virt_to_cache(objp));
4561 }
4562 EXPORT_SYMBOL(ksize);
4563