/*	$NetBSD: uvm_pglist.c,v 1.67 2014/10/26 01:42:07 christos Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uvm_pglist.c: pglist functions
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.67 2014/10/26 01:42:07 christos Exp $");

#include <sys/param.h>
#include <sys/systm.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>

#ifdef VM_PAGE_ALLOC_MEMORY_STATS
#define	STAT_INCR(v)	(v)++
#define	STAT_DECR(v)	do { \
		if ((v) == 0) \
			printf("%s:%d -- Already 0!\n", __FILE__, __LINE__); \
		else \
			(v)--; \
	} while (/*CONSTCOND*/ 0)
u_long	uvm_pglistalloc_npages;
#else
#define	STAT_INCR(v)
#define	STAT_DECR(v)
#endif

/*
 * uvm_pglistalloc: allocate a list of pages
 *
 * => allocated pages are placed onto an rlist.  rlist is
 *    initialized by uvm_pglistalloc.
 * => returns 0 on success or errno on failure
 * => implementation allocates a single segment if any constraints are
 *    imposed by call arguments.
 * => doesn't take into account clean non-busy pages on inactive list
 *    that could be used(?)
 * => params:
 *	size	the size of the allocation, rounded to page size.
 *	low	the low address of the allowed allocation range.
 *	high	the high address of the allowed allocation range.
 *	alignment	memory must be aligned to this power-of-two boundary.
 *	boundary	no segment in the allocation may cross this
 *			power-of-two boundary (relative to zero).
 */

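/*
 * Example (a hypothetical caller, not part of this file): allocate four
 * physically contiguous pages below 16MB with no waiting allowed:
 *
 *	struct pglist mlist;
 *	int error;
 *
 *	error = uvm_pglistalloc(4 * PAGE_SIZE, 0, 0x1000000,
 *	    0, 0, &mlist, 1, 0);
 *	if (error == 0) {
 *		... map and use the pages ...
 *		uvm_pglistfree(&mlist);
 *	}
 */
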
static void
uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
{
	int free_list __unused, color __unused, pgflidx;

	KASSERT(mutex_owned(&uvm_fpageqlock));

#if PGFL_NQUEUES != 2
#error uvm_pglistalloc needs to be updated
#endif

	free_list = uvm_page_lookup_freelist(pg);
	color = VM_PGCOLOR_BUCKET(pg);
	pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
#ifdef UVMDEBUG
	struct vm_page *tp;
	LIST_FOREACH(tp,
	    &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
	    pageq.list) {
		if (tp == pg)
			break;
	}
	if (tp == NULL)
		panic("uvm_pglistalloc: page not on freelist");
#endif
	LIST_REMOVE(pg, pageq.list);	/* global */
	LIST_REMOVE(pg, listq.list);	/* cpu */
	uvmexp.free--;
	if (pg->flags & PG_ZERO)
		uvmexp.zeropages--;
	VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
	pg->flags = PG_CLEAN;
	pg->pqflags = 0;
	pg->uobject = NULL;
	pg->uanon = NULL;
	TAILQ_INSERT_TAIL(rlist, pg, pageq.queue);
	STAT_INCR(uvm_pglistalloc_npages);
}

static int
uvm_pglistalloc_c_ps(struct vm_physseg *ps, int num, paddr_t low, paddr_t high,
    paddr_t alignment, paddr_t boundary, struct pglist *rlist)
{
	signed int candidate, limit, candidateidx, end, idx, skip;
	struct vm_page *pgs;
	int pagemask;
	bool second_pass;
#ifdef DEBUG
	paddr_t idxpa, lastidxpa;
	int cidx = 0;	/* XXX: GCC */
#endif
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: contig %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));

	low = atop(low);
	high = atop(high);
	alignment = atop(alignment);

	/*
	 * Make sure that physseg falls within the range to be allocated from.
	 */
	if (high <= ps->avail_start || low >= ps->avail_end)
		return 0;

	/*
	 * We start our search just after where the last allocation
	 * succeeded.
	 */
	candidate = roundup2(max(low, ps->avail_start + ps->start_hint), alignment);
	limit = min(high, ps->avail_end);
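	/*
	 * boundary is a power of two (or zero).  pagemask keeps only the
	 * page frame number bits at or above log2(boundary / PAGE_SIZE),
	 * so two frame numbers with equal masked values lie within the
	 * same boundary-sized window.
	 */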
	pagemask = ~((boundary >> PAGE_SHIFT) - 1);
	skip = 0;
	second_pass = false;
	pgs = ps->pgs;

	for (;;) {
		bool ok = true;
		signed int cnt;

		if (candidate + num > limit) {
			if (ps->start_hint == 0 || second_pass) {
				/*
				 * We've run past the allowable range.
				 */
				return 0; /* FAIL = 0 pages */
			}
			/*
			 * We've wrapped around the end of this segment
			 * so restart at the beginning, but now our limit
			 * is where we started.
			 */
			second_pass = true;
			candidate = roundup2(max(low, ps->avail_start), alignment);
			limit = min(limit, ps->avail_start + ps->start_hint);
			skip = 0;
			continue;
		}
		if (boundary != 0 &&
		    ((candidate ^ (candidate + num - 1)) & pagemask) != 0) {
			/*
			 * Region crosses boundary.  Jump to the boundary
			 * just crossed and ensure alignment.
			 */
			candidate = (candidate + num - 1) & pagemask;
			candidate = roundup2(candidate, alignment);
			skip = 0;
			continue;
		}
#ifdef DEBUG
		/*
		 * Make sure this is a managed physical page.
		 */

		if (vm_physseg_find(candidate, &cidx) != ps - vm_physmem)
			panic("pgalloc contig: botch1");
		if (cidx != candidate - ps->start)
			panic("pgalloc contig: botch2");
		if (vm_physseg_find(candidate + num - 1, &cidx) != ps - vm_physmem)
			panic("pgalloc contig: botch3");
		if (cidx != candidate - ps->start + num - 1)
			panic("pgalloc contig: botch4");
#endif
		candidateidx = candidate - ps->start;
		end = candidateidx + num;

		/*
		 * Found a suitable starting page.  See if the range is free.
		 */
#ifdef PGALLOC_VERBOSE
		printf("%s: ps=%p candidate=%#x end=%#x skip=%#x, align=%#"PRIxPADDR,
		    __func__, ps, candidateidx, end, skip, alignment);
#endif
		/*
		 * We start at the end and work backwards since if we find a
		 * non-free page, it makes no sense to continue.
		 *
		 * But on the plus side we have "vetted" some number of free
		 * pages.  If this iteration fails, we may be able to skip
		 * testing most of those pages again in the next pass.
		 */
		for (idx = end - 1; idx >= candidateidx + skip; idx--) {
			if (VM_PAGE_IS_FREE(&pgs[idx]) == 0) {
				ok = false;
				break;
			}

#ifdef DEBUG
			if (idx > candidateidx) {
				idxpa = VM_PAGE_TO_PHYS(&pgs[idx]);
				lastidxpa = VM_PAGE_TO_PHYS(&pgs[idx - 1]);
				if ((lastidxpa + PAGE_SIZE) != idxpa) {
					/*
					 * Region not contiguous.
					 */
					panic("pgalloc contig: botch5");
				}
				if (boundary != 0 &&
				    ((lastidxpa ^ idxpa) & ~(boundary - 1))
				    != 0) {
					/*
					 * Region crosses boundary.
					 */
					panic("pgalloc contig: botch6");
				}
			}
#endif
		}

		if (ok) {
			while (skip-- > 0) {
				KDASSERT(VM_PAGE_IS_FREE(&pgs[candidateidx + skip]));
			}
#ifdef PGALLOC_VERBOSE
			printf(": ok\n");
#endif
			break;
		}

#ifdef PGALLOC_VERBOSE
		printf(": non-free at %#x\n", idx - candidateidx);
#endif
		/*
		 * Count the number of pages we can advance: idx is the
		 * index of the last non-free page we found, so the next
		 * candidate window must start beyond it.
		 */
		cnt = idx + 1 - candidateidx;
		/*
		 * Now round that up to the needed alignment.
		 */
		cnt = roundup2(cnt, alignment);
		/*
		 * The number of pages at the start of the next window that
		 * we already know are free and can skip re-checking
		 * (might be 0 if cnt > num).
		 */
		skip = max(num - cnt, 0);
		candidate += cnt;
	}

	/*
	 * We have a chunk of memory that conforms to the requested constraints.
	 */
	for (idx = candidateidx, pgs += idx; idx < end; idx++, pgs++)
		uvm_pglist_add(pgs, rlist);

	/*
	 * The next time we need to search this segment, start after this
	 * chunk of pages we just allocated.
	 */
	ps->start_hint = candidate + num - ps->avail_start;
	KASSERTMSG(ps->start_hint <= ps->avail_end - ps->avail_start,
	    "%x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	    candidate + num,
	    ps->start_hint, ps->start_hint, ps->avail_end, ps->avail_start,
	    ps->avail_end - ps->avail_start);

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num);
#endif
	return num; /* number of pages allocated */
}

static int
uvm_pglistalloc_contig(int num, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist)
{
	int fl, psi;
	struct vm_physseg *ps;
	int error;

	/* Default to "lose". */
	error = ENOMEM;

	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = vm_nphysseg - 1 ; psi >= 0 ; psi--)
#else
		for (psi = 0 ; psi < vm_nphysseg ; psi++)
#endif
		{
			ps = &vm_physmem[psi];

			if (ps->free_list != fl)
				continue;

			num -= uvm_pglistalloc_c_ps(ps, num, low, high,
			    alignment, boundary, rlist);
			if (num == 0) {
#ifdef PGALLOC_VERBOSE
				printf("pgalloc: %"PRIxMAX"-%"PRIxMAX"\n",
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
				error = 0;
				goto out;
			}
		}
	}

out:
	/*
	 * Check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);
	return (error);
}

static int
uvm_pglistalloc_s_ps(struct vm_physseg *ps, int num, paddr_t low, paddr_t high,
    struct pglist *rlist)
{
	int todo, limit, candidate;
	struct vm_page *pg;
	bool second_pass;
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: simple %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));
	KASSERT(ps->start <= ps->avail_start);
	KASSERT(ps->start <= ps->avail_end);
	KASSERT(ps->avail_start <= ps->end);
	KASSERT(ps->avail_end <= ps->end);

	low = atop(low);
	high = atop(high);

	/*
	 * Make sure that physseg falls within the range to be allocated
	 * from, before computing a (possibly out-of-segment) starting index.
	 */
	if (high <= ps->avail_start || low >= ps->avail_end)
		return 0;

	todo = num;
	candidate = max(low, ps->avail_start + ps->start_hint);
	limit = min(high, ps->avail_end);
	pg = &ps->pgs[candidate - ps->start];
	second_pass = false;

again:
	for (;; candidate++, pg++) {
		if (candidate >= limit) {
			if (ps->start_hint == 0 || second_pass) {
				candidate = limit - 1;
				break;
			}
			second_pass = true;
			candidate = max(low, ps->avail_start);
			limit = min(limit, ps->avail_start + ps->start_hint);
			pg = &ps->pgs[candidate - ps->start];
			goto again;
		}
#if defined(DEBUG)
		{
			int cidx = 0;
			const int bank = vm_physseg_find(candidate, &cidx);
			KDASSERTMSG(bank == ps - vm_physmem,
			    "vm_physseg_find(%#x) (%d) != ps %zd",
			    candidate, bank, ps - vm_physmem);
			KDASSERTMSG(cidx == candidate - ps->start,
			    "vm_physseg_find(%#x): %#x != off %"PRIxPADDR,
			    candidate, cidx, candidate - ps->start);
		}
#endif
		if (VM_PAGE_IS_FREE(pg) == 0)
			continue;

		uvm_pglist_add(pg, rlist);
		if (--todo == 0) {
			break;
		}
	}

	/*
	 * The next time we need to search this segment,
	 * start just after the pages we just allocated.
	 */
	ps->start_hint = candidate + 1 - ps->avail_start;
	KASSERTMSG(ps->start_hint <= ps->avail_end - ps->avail_start,
	    "%#x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	    candidate + 1,
	    ps->start_hint, ps->start_hint, ps->avail_end, ps->avail_start,
	    ps->avail_end - ps->avail_start);

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num - todo);
#endif
	return (num - todo); /* number of pages allocated */
}

static int
uvm_pglistalloc_simple(int num, paddr_t low, paddr_t high,
    struct pglist *rlist, int waitok)
{
	int fl, psi, error;
	struct vm_physseg *ps;

	/* Default to "lose". */
	error = ENOMEM;

again:
	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = vm_nphysseg - 1 ; psi >= 0 ; psi--)
#else
		for (psi = 0 ; psi < vm_nphysseg ; psi++)
#endif
		{
			ps = &vm_physmem[psi];

			if (ps->free_list != fl)
				continue;

			num -= uvm_pglistalloc_s_ps(ps, num, low, high, rlist);
			if (num == 0) {
				error = 0;
				goto out;
			}
		}

	}

out:
	/*
	 * Check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);

	if (error) {
		if (waitok) {
			/* XXX perhaps some time limitation? */
#ifdef DEBUG
			printf("pglistalloc waiting\n");
#endif
			uvm_wait("pglalloc");
			goto again;
		} else
			uvm_pglistfree(rlist);
	}
#ifdef PGALLOC_VERBOSE
	if (!error)
		printf("pgalloc: %"PRIxMAX"..%"PRIxMAX"\n",
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
	return (error);
}

int
uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist, int nsegs, int waitok)
{
	int num, res;

	KASSERT((alignment & (alignment - 1)) == 0);
	KASSERT((boundary & (boundary - 1)) == 0);

	/*
	 * Our allocations are always page granularity, so our alignment
	 * must be, too.
	 */
	if (alignment < PAGE_SIZE)
		alignment = PAGE_SIZE;
	if (boundary != 0 && boundary < size)
		return (EINVAL);
	num = atop(round_page(size));
	low = roundup2(low, alignment);

	TAILQ_INIT(rlist);

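	/*
	 * If the caller can take the pages as individual segments (nsegs
	 * covers every page) and imposes no alignment or boundary
	 * constraint, any free pages will do; otherwise we must find a
	 * physically contiguous chunk.
	 */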
	if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) ||
	    (boundary != 0))
		res = uvm_pglistalloc_contig(num, low, high, alignment,
		    boundary, rlist);
	else
		res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);

	return (res);
}

/*
 * uvm_pglistfree: free a list of pages
 *
 * => pages should already be unmapped
 */

void
uvm_pglistfree(struct pglist *list)
{
	struct uvm_cpu *ucpu;
	struct vm_page *pg;
	int index, color, queue;
	bool iszero;

	/*
	 * Lock the free list and free each page.
	 */

	mutex_spin_enter(&uvm_fpageqlock);
	ucpu = curcpu()->ci_data.cpu_uvm;
	while ((pg = TAILQ_FIRST(list)) != NULL) {
		KASSERT(!uvmpdpol_pageisqueued_p(pg));
		TAILQ_REMOVE(list, pg, pageq.queue);
		iszero = (pg->flags & PG_ZERO);
		pg->pqflags = PQ_FREE;
#ifdef DEBUG
		pg->uobject = (void *)0xdeadbeef;
		pg->uanon = (void *)0xdeadbeef;
		if (iszero)
			uvm_pagezerocheck(pg);
#endif /* DEBUG */
		index = uvm_page_lookup_freelist(pg);
		color = VM_PGCOLOR_BUCKET(pg);
		queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN;
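		/*
		 * Reuse the offset field (meaningless for a free page) to
		 * record the CPU whose per-CPU free list the page is about
		 * to join.
		 */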
		pg->offset = (uintptr_t)ucpu;
		LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color].
		    pgfl_queues[queue], pg, pageq.list);
		LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color].
		    pgfl_queues[queue], pg, listq.list);
		uvmexp.free++;
		if (iszero)
			uvmexp.zeropages++;
		ucpu->pages[queue]++;
		STAT_DECR(uvm_pglistalloc_npages);
	}
	if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN])
		ucpu->page_idle_zero = vm_page_zero_enable;
	mutex_spin_exit(&uvm_fpageqlock);
}