1 | /* $NetBSD: uvm_pdaemon.c,v 1.108 2013/10/25 20:28:33 martin Exp $ */ |
2 | |
3 | /* |
4 | * Copyright (c) 1997 Charles D. Cranor and Washington University. |
5 | * Copyright (c) 1991, 1993, The Regents of the University of California. |
6 | * |
7 | * All rights reserved. |
8 | * |
9 | * This code is derived from software contributed to Berkeley by |
10 | * The Mach Operating System project at Carnegie-Mellon University. |
11 | * |
12 | * Redistribution and use in source and binary forms, with or without |
13 | * modification, are permitted provided that the following conditions |
14 | * are met: |
15 | * 1. Redistributions of source code must retain the above copyright |
16 | * notice, this list of conditions and the following disclaimer. |
17 | * 2. Redistributions in binary form must reproduce the above copyright |
18 | * notice, this list of conditions and the following disclaimer in the |
19 | * documentation and/or other materials provided with the distribution. |
20 | * 3. Neither the name of the University nor the names of its contributors |
21 | * may be used to endorse or promote products derived from this software |
22 | * without specific prior written permission. |
23 | * |
24 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
25 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
26 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
28 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
30 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
31 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
32 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
33 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
34 | * SUCH DAMAGE. |
35 | * |
36 | * @(#)vm_pageout.c 8.5 (Berkeley) 2/14/94 |
37 | * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp |
38 | * |
39 | * |
40 | * Copyright (c) 1987, 1990 Carnegie-Mellon University. |
41 | * All rights reserved. |
42 | * |
43 | * Permission to use, copy, modify and distribute this software and |
44 | * its documentation is hereby granted, provided that both the copyright |
45 | * notice and this permission notice appear in all copies of the |
46 | * software, derivative works or modified versions, and any portions |
47 | * thereof, and that both notices appear in supporting documentation. |
48 | * |
49 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
50 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
51 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
52 | * |
53 | * Carnegie Mellon requests users of this software to return to |
54 | * |
55 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
56 | * School of Computer Science |
57 | * Carnegie Mellon University |
58 | * Pittsburgh PA 15213-3890 |
59 | * |
60 | * any improvements or extensions that they make and grant Carnegie the |
61 | * rights to redistribute these changes. |
62 | */ |
63 | |
64 | /* |
65 | * uvm_pdaemon.c: the page daemon |
66 | */ |
67 | |
68 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.108 2013/10/25 20:28:33 martin Exp $");
70 | |
71 | #include "opt_uvmhist.h" |
72 | #include "opt_readahead.h" |
73 | |
74 | #include <sys/param.h> |
75 | #include <sys/proc.h> |
76 | #include <sys/systm.h> |
77 | #include <sys/kernel.h> |
78 | #include <sys/pool.h> |
79 | #include <sys/buf.h> |
80 | #include <sys/module.h> |
81 | #include <sys/atomic.h> |
82 | |
83 | #include <uvm/uvm.h> |
84 | #include <uvm/uvm_pdpolicy.h> |
85 | |
86 | #ifdef UVMHIST |
87 | UVMHIST_DEFINE(pdhist); |
88 | #endif |
89 | |
90 | /* |
91 | * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate |
92 | * in a pass thru the inactive list when swap is full. the value should be |
93 | * "small"... if it's too large we'll cycle the active pages thru the inactive |
 * queue too quickly for them to be referenced and thus avoid being freed.
95 | */ |
96 | |
97 | #define UVMPD_NUMDIRTYREACTS 16 |
98 | |
99 | #define UVMPD_NUMTRYLOCKOWNER 16 |
100 | |
101 | /* |
102 | * local prototypes |
103 | */ |
104 | |
105 | static void uvmpd_scan(void); |
106 | static void uvmpd_scan_queue(void); |
107 | static void uvmpd_tune(void); |
108 | |
109 | static unsigned int uvm_pagedaemon_waiters; |
110 | |
111 | /* |
112 | * XXX hack to avoid hangs when large processes fork. |
113 | */ |
u_int uvm_extrapages;
115 | |
116 | /* |
117 | * uvm_wait: wait (sleep) for the page daemon to free some pages |
118 | * |
119 | * => should be called with all locks released |
120 | * => should _not_ be called by the page daemon (to avoid deadlock) |
121 | */ |
122 | |
123 | void |
124 | uvm_wait(const char *wmsg) |
125 | { |
126 | int timo = 0; |
127 | |
128 | mutex_spin_enter(&uvm_fpageqlock); |
129 | |
130 | /* |
131 | * check for page daemon going to sleep (waiting for itself) |
132 | */ |
133 | |
134 | if (curlwp == uvm.pagedaemon_lwp && uvmexp.paging == 0) { |
135 | /* |
136 | * now we have a problem: the pagedaemon wants to go to |
137 | * sleep until it frees more memory. but how can it |
138 | * free more memory if it is asleep? that is a deadlock. |
139 | * we have two options: |
140 | * [1] panic now |
141 | * [2] put a timeout on the sleep, thus causing the |
142 | * pagedaemon to only pause (rather than sleep forever) |
143 | * |
144 | * note that option [2] will only help us if we get lucky |
145 | * and some other process on the system breaks the deadlock |
146 | * by exiting or freeing memory (thus allowing the pagedaemon |
147 | * to continue). for now we panic if DEBUG is defined, |
148 | * otherwise we hope for the best with option [2] (better |
149 | * yet, this should never happen in the first place!). |
150 | */ |
151 | |
		printf("pagedaemon: deadlock detected!\n");
153 | timo = hz >> 3; /* set timeout */ |
154 | #if defined(DEBUG) |
155 | /* DEBUG: panic so we can debug it */ |
		panic("pagedaemon deadlock");
157 | #endif |
158 | } |
159 | |
160 | uvm_pagedaemon_waiters++; |
161 | wakeup(&uvm.pagedaemon); /* wake the daemon! */ |
162 | UVM_UNLOCK_AND_WAIT(&uvmexp.free, &uvm_fpageqlock, false, wmsg, timo); |
163 | } |
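
/*
 * Example (illustrative sketch, not part of the build): a typical
 * caller allocates a page and, on failure, drops its locks and
 * sleeps in uvm_wait() until the pagedaemon frees memory.  the
 * function and its "uobj"/"off" arguments are hypothetical; error
 * handling is omitted.
 */
#if 0
static struct vm_page *
example_alloc_page(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;

	for (;;) {
		mutex_enter(uobj->vmobjlock);
		pg = uvm_pagealloc(uobj, off, NULL, 0);
		mutex_exit(uobj->vmobjlock);
		if (pg != NULL)
			return pg;
		/* all locks are released, as uvm_wait() requires */
		uvm_wait("examplewt");
	}
}
#endif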
164 | |
165 | /* |
166 | * uvm_kick_pdaemon: perform checks to determine if we need to |
167 | * give the pagedaemon a nudge, and do so if necessary. |
168 | * |
169 | * => called with uvm_fpageqlock held. |
170 | */ |
171 | |
172 | void |
173 | uvm_kick_pdaemon(void) |
174 | { |
175 | |
176 | KASSERT(mutex_owned(&uvm_fpageqlock)); |
177 | |
178 | if (uvmexp.free + uvmexp.paging < uvmexp.freemin || |
179 | (uvmexp.free + uvmexp.paging < uvmexp.freetarg && |
180 | uvmpdpol_needsscan_p()) || |
181 | uvm_km_va_starved_p()) { |
182 | wakeup(&uvm.pagedaemon); |
183 | } |
184 | } |
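
/*
 * Example (sketch): how a hypothetical allocation path might nudge
 * the pagedaemon after taking pages off the free list.  the kick is
 * cheap: it only wakes the daemon if the free/scan targets are unmet.
 */
#if 0
	mutex_spin_enter(&uvm_fpageqlock);
	/* ... bookkeeping that may lower uvmexp.free ... */
	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);
#endif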
185 | |
186 | /* |
187 | * uvmpd_tune: tune paging parameters |
188 | * |
 * => called whenever memory is added to (or removed from?) the system
 * => must be called with the page queues locked
191 | */ |
192 | |
193 | static void |
194 | uvmpd_tune(void) |
195 | { |
196 | int val; |
197 | |
	UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist);
199 | |
200 | /* |
201 | * try to keep 0.5% of available RAM free, but limit to between |
202 | * 128k and 1024k per-CPU. XXX: what are these values good for? |
203 | */ |
204 | val = uvmexp.npages / 200; |
205 | val = MAX(val, (128*1024) >> PAGE_SHIFT); |
206 | val = MIN(val, (1024*1024) >> PAGE_SHIFT); |
207 | val *= ncpu; |
208 | |
209 | /* Make sure there's always a user page free. */ |
210 | if (val < uvmexp.reserve_kernel + 1) |
211 | val = uvmexp.reserve_kernel + 1; |
212 | uvmexp.freemin = val; |
213 | |
214 | /* Calculate free target. */ |
215 | val = (uvmexp.freemin * 4) / 3; |
216 | if (val <= uvmexp.freemin) |
217 | val = uvmexp.freemin + 1; |
218 | uvmexp.freetarg = val + atomic_swap_uint(&uvm_extrapages, 0); |
219 | |
220 | uvmexp.wiredmax = uvmexp.npages / 3; |
	UVMHIST_LOG(pdhist, "<- done, freemin=%d, freetarg=%d, wiredmax=%d",
	    uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0);
223 | } |
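
/*
 * Worked example of the tuning above, assuming 4 KiB pages: on a
 * 1 GiB, 2-CPU machine, npages = 262144, so npages / 200 = 1310;
 * this is clamped to the 32..256 page range (128 KiB .. 1 MiB),
 * giving 256, and scaled by ncpu to freemin = 512 pages (2 MiB).
 * freetarg is then 512 * 4 / 3 = 682 pages, plus any uvm_extrapages.
 */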
224 | |
225 | /* |
226 | * uvm_pageout: the main loop for the pagedaemon |
227 | */ |
228 | |
229 | void |
230 | uvm_pageout(void *arg) |
231 | { |
232 | int bufcnt, npages = 0; |
	int extrapages = 0;
234 | struct pool *pp; |
235 | |
	UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist);
237 | |
	UVMHIST_LOG(pdhist, "<starting uvm pagedaemon>", 0, 0, 0, 0);
239 | |
240 | /* |
241 | * ensure correct priority and set paging parameters... |
242 | */ |
243 | |
244 | uvm.pagedaemon_lwp = curlwp; |
245 | mutex_enter(&uvm_pageqlock); |
246 | npages = uvmexp.npages; |
247 | uvmpd_tune(); |
248 | mutex_exit(&uvm_pageqlock); |
249 | |
250 | /* |
251 | * main loop |
252 | */ |
253 | |
254 | for (;;) { |
255 | bool needsscan, needsfree, kmem_va_starved; |
256 | |
257 | kmem_va_starved = uvm_km_va_starved_p(); |
258 | |
259 | mutex_spin_enter(&uvm_fpageqlock); |
260 | if ((uvm_pagedaemon_waiters == 0 || uvmexp.paging > 0) && |
261 | !kmem_va_starved) { |
			UVMHIST_LOG(pdhist, "  <<SLEEPING>>", 0, 0, 0, 0);
			UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
			    &uvm_fpageqlock, false, "pgdaemon", 0);
			uvmexp.pdwoke++;
			UVMHIST_LOG(pdhist, "  <<WOKE UP>>", 0, 0, 0, 0);
267 | } else { |
268 | mutex_spin_exit(&uvm_fpageqlock); |
269 | } |
270 | |
271 | /* |
272 | * now lock page queues and recompute inactive count |
273 | */ |
274 | |
275 | mutex_enter(&uvm_pageqlock); |
276 | if (npages != uvmexp.npages || extrapages != uvm_extrapages) { |
277 | npages = uvmexp.npages; |
278 | extrapages = uvm_extrapages; |
279 | mutex_spin_enter(&uvm_fpageqlock); |
280 | uvmpd_tune(); |
281 | mutex_spin_exit(&uvm_fpageqlock); |
282 | } |
283 | |
284 | uvmpdpol_tune(); |
285 | |
286 | /* |
		 * Estimate a hint.  Note that bufmem is returned to the
		 * system only when an entire pool page is empty.
289 | */ |
290 | mutex_spin_enter(&uvm_fpageqlock); |
291 | bufcnt = uvmexp.freetarg - uvmexp.free; |
292 | if (bufcnt < 0) |
293 | bufcnt = 0; |
294 | |
		UVMHIST_LOG(pdhist, "  free/ftarg=%d/%d",
		    uvmexp.free, uvmexp.freetarg, 0, 0);
297 | |
298 | needsfree = uvmexp.free + uvmexp.paging < uvmexp.freetarg; |
299 | needsscan = needsfree || uvmpdpol_needsscan_p(); |
300 | |
301 | /* |
302 | * scan if needed |
303 | */ |
304 | if (needsscan) { |
305 | mutex_spin_exit(&uvm_fpageqlock); |
306 | uvmpd_scan(); |
307 | mutex_spin_enter(&uvm_fpageqlock); |
308 | } |
309 | |
310 | /* |
311 | * if there's any free memory to be had, |
312 | * wake up any waiters. |
313 | */ |
314 | if (uvmexp.free > uvmexp.reserve_kernel || |
315 | uvmexp.paging == 0) { |
316 | wakeup(&uvmexp.free); |
317 | uvm_pagedaemon_waiters = 0; |
318 | } |
319 | mutex_spin_exit(&uvm_fpageqlock); |
320 | |
321 | /* |
322 | * scan done. unlock page queues (the only lock we are holding) |
323 | */ |
324 | mutex_exit(&uvm_pageqlock); |
325 | |
326 | /* |
327 | * if we don't need free memory, we're done. |
328 | */ |
329 | |
330 | if (!needsfree && !kmem_va_starved) |
331 | continue; |
332 | |
333 | /* |
334 | * kill unused metadata buffers. |
335 | */ |
336 | mutex_enter(&bufcache_lock); |
337 | buf_drain(bufcnt << PAGE_SHIFT); |
338 | mutex_exit(&bufcache_lock); |
339 | |
340 | /* |
341 | * drain the pools. |
342 | */ |
343 | pool_drain(&pp); |
344 | } |
345 | /*NOTREACHED*/ |
346 | } |
347 | |
349 | /* |
350 | * uvm_aiodone_worker: a workqueue callback for the aiodone daemon. |
351 | */ |
352 | |
353 | void |
354 | uvm_aiodone_worker(struct work *wk, void *dummy) |
355 | { |
356 | struct buf *bp = (void *)wk; |
357 | |
358 | KASSERT(&bp->b_work == wk); |
359 | |
360 | /* |
361 | * process an i/o that's done. |
362 | */ |
363 | |
364 | (*bp->b_iodone)(bp); |
365 | } |
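
/*
 * Example (sketch): the completion side that feeds this worker.
 * the workqueue name below is an assumption; see where
 * uvm_aiodone_worker is registered with workqueue_create().
 */
#if 0
	workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL);
#endif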
366 | |
367 | void |
368 | uvm_pageout_start(int npages) |
369 | { |
370 | |
371 | mutex_spin_enter(&uvm_fpageqlock); |
372 | uvmexp.paging += npages; |
373 | mutex_spin_exit(&uvm_fpageqlock); |
374 | } |
375 | |
376 | void |
377 | uvm_pageout_done(int npages) |
378 | { |
379 | |
380 | mutex_spin_enter(&uvm_fpageqlock); |
381 | KASSERT(uvmexp.paging >= npages); |
382 | uvmexp.paging -= npages; |
383 | |
384 | /* |
	 * wake up either the pagedaemon or the LWPs waiting for it.
386 | */ |
387 | |
388 | if (uvmexp.free <= uvmexp.reserve_kernel) { |
389 | wakeup(&uvm.pagedaemon); |
390 | } else { |
391 | wakeup(&uvmexp.free); |
392 | uvm_pagedaemon_waiters = 0; |
393 | } |
394 | mutex_spin_exit(&uvm_fpageqlock); |
395 | } |
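
/*
 * Example (sketch): uvm_pageout_start()/uvm_pageout_done() bracket
 * an asynchronous pageout, as swapcluster_flush() does below for
 * swap-backed pages.  "slot", "pages" and "npages" are hypothetical.
 */
#if 0
	uvm_pageout_start(npages);	/* before issuing the async write */
	error = uvm_swap_put(slot, pages, npages, 0);
	/* ... the i/o completion path later calls uvm_pageout_done() ... */
#endif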
396 | |
397 | /* |
398 | * uvmpd_trylockowner: trylock the page's owner. |
399 | * |
400 | * => called with pageq locked. |
 * => resolve an orphaned O->A loaned page.
402 | * => return the locked mutex on success. otherwise, return NULL. |
403 | */ |
404 | |
405 | kmutex_t * |
406 | uvmpd_trylockowner(struct vm_page *pg) |
407 | { |
408 | struct uvm_object *uobj = pg->uobject; |
409 | kmutex_t *slock; |
410 | |
411 | KASSERT(mutex_owned(&uvm_pageqlock)); |
412 | |
413 | if (uobj != NULL) { |
414 | slock = uobj->vmobjlock; |
415 | } else { |
416 | struct vm_anon *anon = pg->uanon; |
417 | |
418 | KASSERT(anon != NULL); |
419 | slock = anon->an_lock; |
420 | } |
421 | |
422 | if (!mutex_tryenter(slock)) { |
423 | return NULL; |
424 | } |
425 | |
426 | if (uobj == NULL) { |
427 | |
428 | /* |
429 | * set PQ_ANON if it isn't set already. |
430 | */ |
431 | |
432 | if ((pg->pqflags & PQ_ANON) == 0) { |
433 | KASSERT(pg->loan_count > 0); |
434 | pg->loan_count--; |
435 | pg->pqflags |= PQ_ANON; |
436 | /* anon now owns it */ |
437 | } |
438 | } |
439 | |
440 | return slock; |
441 | } |
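
/*
 * Example (sketch): the canonical calling pattern for
 * uvmpd_trylockowner() from a page queue scan, as used by
 * uvmpd_trydropswap() and uvmpd_scan_queue() below.
 */
#if 0
	kmutex_t *slock;

	slock = uvmpd_trylockowner(pg);
	if (slock == NULL)
		continue;		/* lock is busy; skip this page */
	if ((pg->flags & PG_BUSY) == 0) {
		/* ... operate on the page with its owner locked ... */
	}
	mutex_exit(slock);
#endif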
442 | |
443 | #if defined(VMSWAP) |
444 | struct swapcluster { |
445 | int swc_slot; |
446 | int swc_nallocated; |
447 | int swc_nused; |
448 | struct vm_page *swc_pages[howmany(MAXPHYS, MIN_PAGE_SIZE)]; |
449 | }; |
450 | |
451 | static void |
452 | swapcluster_init(struct swapcluster *swc) |
453 | { |
454 | |
455 | swc->swc_slot = 0; |
456 | swc->swc_nused = 0; |
457 | } |
458 | |
459 | static int |
460 | swapcluster_allocslots(struct swapcluster *swc) |
461 | { |
462 | int slot; |
463 | int npages; |
464 | |
465 | if (swc->swc_slot != 0) { |
466 | return 0; |
467 | } |
468 | |
	/*
	 * Even with strange MAXPHYS, the shift
	 * implicitly rounds down to a page.
	 */
471 | npages = MAXPHYS >> PAGE_SHIFT; |
472 | slot = uvm_swap_alloc(&npages, true); |
473 | if (slot == 0) { |
474 | return ENOMEM; |
475 | } |
476 | swc->swc_slot = slot; |
477 | swc->swc_nallocated = npages; |
478 | swc->swc_nused = 0; |
479 | |
480 | return 0; |
481 | } |
482 | |
483 | static int |
484 | swapcluster_add(struct swapcluster *swc, struct vm_page *pg) |
485 | { |
486 | int slot; |
487 | struct uvm_object *uobj; |
488 | |
489 | KASSERT(swc->swc_slot != 0); |
490 | KASSERT(swc->swc_nused < swc->swc_nallocated); |
491 | KASSERT((pg->pqflags & PQ_SWAPBACKED) != 0); |
492 | |
493 | slot = swc->swc_slot + swc->swc_nused; |
494 | uobj = pg->uobject; |
495 | if (uobj == NULL) { |
496 | KASSERT(mutex_owned(pg->uanon->an_lock)); |
497 | pg->uanon->an_swslot = slot; |
498 | } else { |
499 | int result; |
500 | |
501 | KASSERT(mutex_owned(uobj->vmobjlock)); |
502 | result = uao_set_swslot(uobj, pg->offset >> PAGE_SHIFT, slot); |
503 | if (result == -1) { |
504 | return ENOMEM; |
505 | } |
506 | } |
507 | swc->swc_pages[swc->swc_nused] = pg; |
508 | swc->swc_nused++; |
509 | |
510 | return 0; |
511 | } |
512 | |
513 | static void |
514 | swapcluster_flush(struct swapcluster *swc, bool now) |
515 | { |
516 | int slot; |
517 | int nused; |
518 | int nallocated; |
519 | int error __diagused; |
520 | |
521 | if (swc->swc_slot == 0) { |
522 | return; |
523 | } |
524 | KASSERT(swc->swc_nused <= swc->swc_nallocated); |
525 | |
526 | slot = swc->swc_slot; |
527 | nused = swc->swc_nused; |
528 | nallocated = swc->swc_nallocated; |
529 | |
530 | /* |
531 | * if this is the final pageout we could have a few |
532 | * unused swap blocks. if so, free them now. |
533 | */ |
534 | |
535 | if (nused < nallocated) { |
536 | if (!now) { |
537 | return; |
538 | } |
539 | uvm_swap_free(slot + nused, nallocated - nused); |
540 | } |
541 | |
542 | /* |
543 | * now start the pageout. |
544 | */ |
545 | |
546 | if (nused > 0) { |
547 | uvmexp.pdpageouts++; |
548 | uvm_pageout_start(nused); |
549 | error = uvm_swap_put(slot, swc->swc_pages, nused, 0); |
550 | KASSERT(error == 0 || error == ENOMEM); |
551 | } |
552 | |
553 | /* |
	 * zero swc_slot to indicate that we are
	 * no longer building a swap-backed cluster.
556 | */ |
557 | |
558 | swc->swc_slot = 0; |
559 | swc->swc_nused = 0; |
560 | } |
561 | |
562 | static int |
563 | swapcluster_nused(struct swapcluster *swc) |
564 | { |
565 | |
566 | return swc->swc_nused; |
567 | } |
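
/*
 * Example (sketch): the swapcluster life cycle as driven by
 * uvmpd_scan_queue() below -- slots are allocated lazily, pages are
 * added with their owners locked, and the cluster is written out
 * when full (or unconditionally at the end of the scan).
 */
#if 0
	struct swapcluster swc;

	swapcluster_init(&swc);
	if (swapcluster_allocslots(&swc) == 0 &&
	    swapcluster_add(&swc, pg) == 0)
		swapcluster_flush(&swc, false);	/* writes only if full */
	swapcluster_flush(&swc, true);		/* final, unconditional */
#endif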
568 | |
569 | /* |
570 | * uvmpd_dropswap: free any swap allocated to this page. |
571 | * |
572 | * => called with owner locked. |
573 | * => return true if a page had an associated slot. |
574 | */ |
575 | |
576 | static bool |
577 | uvmpd_dropswap(struct vm_page *pg) |
578 | { |
579 | bool result = false; |
580 | struct vm_anon *anon = pg->uanon; |
581 | |
582 | if ((pg->pqflags & PQ_ANON) && anon->an_swslot) { |
583 | uvm_swap_free(anon->an_swslot, 1); |
584 | anon->an_swslot = 0; |
585 | pg->flags &= ~PG_CLEAN; |
586 | result = true; |
587 | } else if (pg->pqflags & PQ_AOBJ) { |
588 | int slot = uao_set_swslot(pg->uobject, |
589 | pg->offset >> PAGE_SHIFT, 0); |
590 | if (slot) { |
591 | uvm_swap_free(slot, 1); |
592 | pg->flags &= ~PG_CLEAN; |
593 | result = true; |
594 | } |
595 | } |
596 | |
597 | return result; |
598 | } |
599 | |
600 | /* |
601 | * uvmpd_trydropswap: try to free any swap allocated to this page. |
602 | * |
603 | * => return true if a slot is successfully freed. |
604 | */ |
605 | |
606 | bool |
607 | uvmpd_trydropswap(struct vm_page *pg) |
608 | { |
609 | kmutex_t *slock; |
610 | bool result; |
611 | |
612 | if ((pg->flags & PG_BUSY) != 0) { |
613 | return false; |
614 | } |
615 | |
616 | /* |
617 | * lock the page's owner. |
618 | */ |
619 | |
620 | slock = uvmpd_trylockowner(pg); |
621 | if (slock == NULL) { |
622 | return false; |
623 | } |
624 | |
625 | /* |
626 | * skip this page if it's busy. |
627 | */ |
628 | |
629 | if ((pg->flags & PG_BUSY) != 0) { |
630 | mutex_exit(slock); |
631 | return false; |
632 | } |
633 | |
634 | result = uvmpd_dropswap(pg); |
635 | |
636 | mutex_exit(slock); |
637 | |
638 | return result; |
639 | } |
640 | |
641 | #endif /* defined(VMSWAP) */ |
642 | |
643 | /* |
 * uvmpd_scan_queue: scan the replace-candidate list for pages
 * to clean or free.
646 | * |
647 | * => called with page queues locked |
648 | * => we work on meeting our free target by converting inactive pages |
649 | * into free pages. |
650 | * => we handle the building of swap-backed clusters |
651 | */ |
652 | |
653 | static void |
654 | uvmpd_scan_queue(void) |
655 | { |
656 | struct vm_page *p; |
657 | struct uvm_object *uobj; |
658 | struct vm_anon *anon; |
659 | #if defined(VMSWAP) |
660 | struct swapcluster swc; |
661 | #endif /* defined(VMSWAP) */ |
662 | int dirtyreacts; |
663 | int lockownerfail; |
664 | kmutex_t *slock; |
	UVMHIST_FUNC("uvmpd_scan_queue"); UVMHIST_CALLED(pdhist);
666 | |
667 | /* |
	 * swc_slot is non-zero if we are building a swap cluster.  we want
669 | * to stay in the loop while we have a page to scan or we have |
670 | * a swap-cluster to build. |
671 | */ |
672 | |
673 | #if defined(VMSWAP) |
674 | swapcluster_init(&swc); |
675 | #endif /* defined(VMSWAP) */ |
676 | |
677 | dirtyreacts = 0; |
678 | lockownerfail = 0; |
679 | uvmpdpol_scaninit(); |
680 | |
681 | while (/* CONSTCOND */ 1) { |
682 | |
683 | /* |
684 | * see if we've met the free target. |
685 | */ |
686 | |
687 | if (uvmexp.free + uvmexp.paging |
688 | #if defined(VMSWAP) |
689 | + swapcluster_nused(&swc) |
690 | #endif /* defined(VMSWAP) */ |
691 | >= uvmexp.freetarg << 2 || |
692 | dirtyreacts == UVMPD_NUMDIRTYREACTS) { |
			UVMHIST_LOG(pdhist, "  met free target: "
			    "exit loop", 0, 0, 0, 0);
695 | break; |
696 | } |
697 | |
698 | p = uvmpdpol_selectvictim(); |
699 | if (p == NULL) { |
700 | break; |
701 | } |
702 | KASSERT(uvmpdpol_pageisqueued_p(p)); |
703 | KASSERT(p->wire_count == 0); |
704 | |
705 | /* |
706 | * we are below target and have a new page to consider. |
707 | */ |
708 | |
709 | anon = p->uanon; |
710 | uobj = p->uobject; |
711 | |
712 | /* |
713 | * first we attempt to lock the object that this page |
714 | * belongs to. if our attempt fails we skip on to |
715 | * the next page (no harm done). it is important to |
716 | * "try" locking the object as we are locking in the |
717 | * wrong order (pageq -> object) and we don't want to |
718 | * deadlock. |
719 | * |
720 | * the only time we expect to see an ownerless page |
721 | * (i.e. a page with no uobject and !PQ_ANON) is if an |
722 | * anon has loaned a page from a uvm_object and the |
723 | * uvm_object has dropped the ownership. in that |
724 | * case, the anon can "take over" the loaned page |
725 | * and make it its own. |
726 | */ |
727 | |
728 | slock = uvmpd_trylockowner(p); |
729 | if (slock == NULL) { |
730 | /* |
			 * yield the cpu so the LWP holding the lock gets a
			 * chance to run.  otherwise we can busy-loop for too
			 * long if the page queue is filled with many pages
			 * from only a few objects.
735 | */ |
736 | lockownerfail++; |
737 | if (lockownerfail > UVMPD_NUMTRYLOCKOWNER) { |
738 | mutex_exit(&uvm_pageqlock); |
739 | /* XXX Better than yielding but inadequate. */ |
				kpause("livelock", false, 1, NULL);
741 | mutex_enter(&uvm_pageqlock); |
742 | lockownerfail = 0; |
743 | } |
744 | continue; |
745 | } |
746 | if (p->flags & PG_BUSY) { |
747 | mutex_exit(slock); |
748 | uvmexp.pdbusy++; |
749 | continue; |
750 | } |
751 | |
752 | /* does the page belong to an object? */ |
753 | if (uobj != NULL) { |
754 | uvmexp.pdobscan++; |
755 | } else { |
756 | #if defined(VMSWAP) |
757 | KASSERT(anon != NULL); |
758 | uvmexp.pdanscan++; |
759 | #else /* defined(VMSWAP) */ |
			panic("%s: anon", __func__);
761 | #endif /* defined(VMSWAP) */ |
762 | } |
763 | |
765 | /* |
766 | * we now have the object and the page queues locked. |
767 | * if the page is not swap-backed, call the object's |
768 | * pager to flush and free the page. |
769 | */ |
770 | |
771 | #if defined(READAHEAD_STATS) |
772 | if ((p->pqflags & PQ_READAHEAD) != 0) { |
773 | p->pqflags &= ~PQ_READAHEAD; |
774 | uvm_ra_miss.ev_count++; |
775 | } |
776 | #endif /* defined(READAHEAD_STATS) */ |
777 | |
778 | if ((p->pqflags & PQ_SWAPBACKED) == 0) { |
779 | KASSERT(uobj != NULL); |
780 | mutex_exit(&uvm_pageqlock); |
781 | (void) (uobj->pgops->pgo_put)(uobj, p->offset, |
782 | p->offset + PAGE_SIZE, PGO_CLEANIT|PGO_FREE); |
783 | mutex_enter(&uvm_pageqlock); |
784 | continue; |
785 | } |
786 | |
787 | /* |
788 | * the page is swap-backed. remove all the permissions |
789 | * from the page so we can sync the modified info |
790 | * without any race conditions. if the page is clean |
791 | * we can free it now and continue. |
792 | */ |
793 | |
794 | pmap_page_protect(p, VM_PROT_NONE); |
795 | if ((p->flags & PG_CLEAN) && pmap_clear_modify(p)) { |
796 | p->flags &= ~(PG_CLEAN); |
797 | } |
798 | if (p->flags & PG_CLEAN) { |
799 | int slot; |
800 | int pageidx; |
801 | |
802 | pageidx = p->offset >> PAGE_SHIFT; |
803 | uvm_pagefree(p); |
804 | uvmexp.pdfreed++; |
805 | |
806 | /* |
807 | * for anons, we need to remove the page |
808 | * from the anon ourselves. for aobjs, |
809 | * pagefree did that for us. |
810 | */ |
811 | |
812 | if (anon) { |
813 | KASSERT(anon->an_swslot != 0); |
814 | anon->an_page = NULL; |
815 | slot = anon->an_swslot; |
816 | } else { |
817 | slot = uao_find_swslot(uobj, pageidx); |
818 | } |
819 | mutex_exit(slock); |
820 | |
821 | if (slot > 0) { |
822 | /* this page is now only in swap. */ |
823 | mutex_enter(&uvm_swap_data_lock); |
824 | KASSERT(uvmexp.swpgonly < uvmexp.swpginuse); |
825 | uvmexp.swpgonly++; |
826 | mutex_exit(&uvm_swap_data_lock); |
827 | } |
828 | continue; |
829 | } |
830 | |
831 | #if defined(VMSWAP) |
832 | /* |
833 | * this page is dirty, skip it if we'll have met our |
834 | * free target when all the current pageouts complete. |
835 | */ |
836 | |
837 | if (uvmexp.free + uvmexp.paging > uvmexp.freetarg << 2) { |
838 | mutex_exit(slock); |
839 | continue; |
840 | } |
841 | |
842 | /* |
843 | * free any swap space allocated to the page since |
844 | * we'll have to write it again with its new data. |
845 | */ |
846 | |
847 | uvmpd_dropswap(p); |
848 | |
849 | /* |
850 | * start new swap pageout cluster (if necessary). |
851 | * |
852 | * if swap is full reactivate this page so that |
853 | * we eventually cycle all pages through the |
854 | * inactive queue. |
855 | */ |
856 | |
857 | if (swapcluster_allocslots(&swc)) { |
858 | dirtyreacts++; |
859 | uvm_pageactivate(p); |
860 | mutex_exit(slock); |
861 | continue; |
862 | } |
863 | |
864 | /* |
		 * at this point, we're definitely going to reuse this
866 | * page. mark the page busy and delayed-free. |
867 | * we should remove the page from the page queues |
868 | * so we don't ever look at it again. |
869 | * adjust counters and such. |
870 | */ |
871 | |
872 | p->flags |= PG_BUSY; |
		UVM_PAGE_OWN(p, "scan_queue");
874 | |
875 | p->flags |= PG_PAGEOUT; |
876 | uvm_pagedequeue(p); |
877 | |
878 | uvmexp.pgswapout++; |
879 | mutex_exit(&uvm_pageqlock); |
880 | |
881 | /* |
882 | * add the new page to the cluster. |
883 | */ |
884 | |
885 | if (swapcluster_add(&swc, p)) { |
886 | p->flags &= ~(PG_BUSY|PG_PAGEOUT); |
887 | UVM_PAGE_OWN(p, NULL); |
888 | mutex_enter(&uvm_pageqlock); |
889 | dirtyreacts++; |
890 | uvm_pageactivate(p); |
891 | mutex_exit(slock); |
892 | continue; |
893 | } |
894 | mutex_exit(slock); |
895 | |
896 | swapcluster_flush(&swc, false); |
897 | mutex_enter(&uvm_pageqlock); |
898 | |
899 | /* |
900 | * the pageout is in progress. bump counters and set up |
901 | * for the next loop. |
902 | */ |
903 | |
904 | uvmexp.pdpending++; |
905 | |
906 | #else /* defined(VMSWAP) */ |
907 | uvm_pageactivate(p); |
908 | mutex_exit(slock); |
909 | #endif /* defined(VMSWAP) */ |
910 | } |
911 | |
912 | #if defined(VMSWAP) |
913 | mutex_exit(&uvm_pageqlock); |
914 | swapcluster_flush(&swc, true); |
915 | mutex_enter(&uvm_pageqlock); |
916 | #endif /* defined(VMSWAP) */ |
917 | } |
918 | |
919 | /* |
920 | * uvmpd_scan: scan the page queues and attempt to meet our targets. |
921 | * |
922 | * => called with pageq's locked |
923 | */ |
924 | |
925 | static void |
926 | uvmpd_scan(void) |
927 | { |
928 | int swap_shortage, pages_freed; |
	UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist);
930 | |
931 | uvmexp.pdrevs++; |
932 | |
933 | /* |
934 | * work on meeting our targets. first we work on our free target |
935 | * by converting inactive pages into free pages. then we work on |
936 | * meeting our inactive target by converting active pages to |
937 | * inactive ones. |
938 | */ |
939 | |
	UVMHIST_LOG(pdhist, "  starting 'free' loop", 0, 0, 0, 0);
941 | |
942 | pages_freed = uvmexp.pdfreed; |
943 | uvmpd_scan_queue(); |
944 | pages_freed = uvmexp.pdfreed - pages_freed; |
945 | |
946 | /* |
947 | * detect if we're not going to be able to page anything out |
948 | * until we free some swap resources from active pages. |
949 | */ |
950 | |
951 | swap_shortage = 0; |
952 | if (uvmexp.free < uvmexp.freetarg && |
953 | uvmexp.swpginuse >= uvmexp.swpgavail && |
954 | !uvm_swapisfull() && |
955 | pages_freed == 0) { |
956 | swap_shortage = uvmexp.freetarg - uvmexp.free; |
957 | } |
958 | |
959 | uvmpdpol_balancequeue(swap_shortage); |
960 | |
961 | /* |
962 | * if still below the minimum target, try unloading kernel |
963 | * modules. |
964 | */ |
965 | |
966 | if (uvmexp.free < uvmexp.freemin) { |
967 | module_thread_kick(); |
968 | } |
969 | } |
970 | |
971 | /* |
972 | * uvm_reclaimable: decide whether to wait for pagedaemon. |
973 | * |
974 | * => return true if it seems to be worth to do uvm_wait. |
975 | * |
976 | * XXX should be tunable. |
977 | * XXX should consider pools, etc? |
978 | */ |
979 | |
980 | bool |
981 | uvm_reclaimable(void) |
982 | { |
983 | int filepages; |
984 | int active, inactive; |
985 | |
986 | /* |
987 | * if swap is not full, no problem. |
988 | */ |
989 | |
990 | if (!uvm_swapisfull()) { |
991 | return true; |
992 | } |
993 | |
994 | /* |
995 | * file-backed pages can be reclaimed even when swap is full. |
 * if we have more than 1/16 of pageable memory or 5MB, whichever is
 * smaller, try to reclaim.
997 | * |
998 | * XXX assume the worst case, ie. all wired pages are file-backed. |
999 | * |
 * XXX should consider other reclaimable memory.
1001 | * XXX ie. pools, traditional buffer cache. |
1002 | */ |
1003 | |
1004 | filepages = uvmexp.filepages + uvmexp.execpages - uvmexp.wired; |
1005 | uvm_estimatepageable(&active, &inactive); |
1006 | if (filepages >= MIN((active + inactive) >> 4, |
1007 | 5 * 1024 * 1024 >> PAGE_SHIFT)) { |
1008 | return true; |
1009 | } |
1010 | |
1011 | /* |
	 * kill the process, fail allocation, etc.
1013 | */ |
1014 | |
1015 | return false; |
1016 | } |
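
/*
 * Worked example of the threshold above, assuming 4 KiB pages: with
 * swap full and 100000 pageable pages, the cutoff is
 * MIN(100000 >> 4, 5 MiB in pages) = MIN(6250, 1280) = 1280, so
 * uvm_reclaimable() returns true while at least 1280 file-backed
 * (non-wired) pages remain.
 */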
1017 | |
1018 | void |
1019 | uvm_estimatepageable(int *active, int *inactive) |
1020 | { |
1021 | |
1022 | uvmpdpol_estimatepageable(active, inactive); |
1023 | } |
1024 | |
1025 | |