1 | /* $NetBSD: uvm_swap.c,v 1.174 2016/07/08 06:45:34 skrll Exp $ */ |
2 | |
3 | /* |
4 | * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green |
5 | * All rights reserved. |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions |
9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
17 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
18 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
19 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
20 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
21 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
23 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
26 | * SUCH DAMAGE. |
27 | * |
28 | * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp |
29 | * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp |
30 | */ |
31 | |
32 | #include <sys/cdefs.h> |
33 | __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.174 2016/07/08 06:45:34 skrll Exp $" ); |
34 | |
35 | #include "opt_uvmhist.h" |
36 | #include "opt_compat_netbsd.h" |
37 | #include "opt_ddb.h" |
38 | |
39 | #include <sys/param.h> |
40 | #include <sys/systm.h> |
41 | #include <sys/buf.h> |
42 | #include <sys/bufq.h> |
43 | #include <sys/conf.h> |
44 | #include <sys/proc.h> |
45 | #include <sys/namei.h> |
46 | #include <sys/disklabel.h> |
47 | #include <sys/errno.h> |
48 | #include <sys/kernel.h> |
49 | #include <sys/vnode.h> |
50 | #include <sys/file.h> |
51 | #include <sys/vmem.h> |
52 | #include <sys/blist.h> |
53 | #include <sys/mount.h> |
54 | #include <sys/pool.h> |
55 | #include <sys/kmem.h> |
56 | #include <sys/syscallargs.h> |
57 | #include <sys/swap.h> |
58 | #include <sys/kauth.h> |
59 | #include <sys/sysctl.h> |
60 | #include <sys/workqueue.h> |
61 | |
62 | #include <uvm/uvm.h> |
63 | |
64 | #include <miscfs/specfs/specdev.h> |
65 | |
66 | /* |
67 | * uvm_swap.c: manage configuration and i/o to swap space. |
68 | */ |
69 | |
70 | /* |
71 | * swap space is managed in the following way: |
72 | * |
73 | * each swap partition or file is described by a "swapdev" structure. |
74 | * each "swapdev" structure contains a "swapent" structure which contains |
75 | * information that is passed up to the user (via system calls). |
76 | * |
77 | * each swap partition is assigned a "priority" (int) which controls |
78 | * swap partition usage. |
79 | * |
80 | * the system maintains a global data structure describing all swap |
81 | * partitions/files. there is a sorted LIST of "swappri" structures |
82 | * which describe "swapdev"'s at that priority. this LIST is headed |
83 | * by the "swap_priority" global var. each "swappri" contains a |
84 | * TAILQ of "swapdev" structures at that priority. |
85 | * |
86 | * locking: |
87 | * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl |
88 | * system call and prevents the swap priority list from changing |
89 | * while we are in the middle of a system call (e.g. SWAP_STATS). |
90 | * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data |
91 | * structures including the priority list, the swapdev structures, |
92 | * and the swapmap arena. |
93 | * |
94 | * each swap device has the following info: |
95 | * - swap device in use (could be disabled, preventing future use) |
96 | * - swap enabled (allows new allocations on swap) |
97 | * - map info in /dev/drum |
98 | * - vnode pointer |
99 | * for swap files only: |
100 | * - block size |
101 | * - max byte count in buffer |
102 | * - buffer |
103 | * |
104 | * userland controls and configures swap with the swapctl(2) system call. |
105 | * the sys_swapctl performs the following operations: |
106 | * [1] SWAP_NSWAP: returns the number of swap devices currently configured |
107 | * [2] SWAP_STATS: given a pointer to an array of swapent structures |
108 | * (passed in via "arg") of a size passed in via "misc" ... we load |
109 | * the current swap config into the array. The actual work is done |
110 | * in the uvm_swap_stats() function. |
111 | * [3] SWAP_ON: given a pathname in arg (could be device or file) and a |
112 | * priority in "misc", start swapping on it. |
113 | * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device |
114 | * [5] SWAP_CTL: changes the priority of a swap device (new priority in |
115 | * "misc") |
116 | */ |
117 | |
118 | /* |
119 | * swapdev: describes a single swap partition/file |
120 | * |
121 | * note the following should be true: |
122 | * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] |
123 | * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel] |
124 | */ |
125 | struct swapdev { |
126 | dev_t swd_dev; /* device id */ |
127 | int swd_flags; /* flags:inuse/enable/fake */ |
128 | int swd_priority; /* our priority */ |
129 | int swd_nblks; /* blocks in this device */ |
130 | char *swd_path; /* saved pathname of device */ |
131 | int swd_pathlen; /* length of pathname */ |
132 | int swd_npages; /* #pages we can use */ |
133 | int swd_npginuse; /* #pages in use */ |
134 | int swd_npgbad; /* #pages bad */ |
135 | int swd_drumoffset; /* page0 offset in drum */ |
136 | int swd_drumsize; /* #pages in drum */ |
137 | blist_t swd_blist; /* blist for this swapdev */ |
138 | struct vnode *swd_vp; /* backing vnode */ |
139 | TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */ |
140 | |
141 | int swd_bsize; /* blocksize (bytes) */ |
142 | int swd_maxactive; /* max active i/o reqs */ |
143 | struct bufq_state *swd_tab; /* buffer list */ |
144 | int swd_active; /* number of active buffers */ |
145 | }; |
146 | |
147 | /* |
148 | * swap device priority entry; the list is kept sorted on `spi_priority'. |
149 | */ |
150 | struct swappri { |
151 | int spi_priority; /* priority */ |
152 | TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev; |
153 | /* tailq of swapdevs at this priority */ |
154 | LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ |
155 | }; |
156 | |
157 | /* |
158 | * The following two structures are used to keep track of data transfers |
159 | * on swap devices associated with regular files. |
160 | * NOTE: this code is more or less a copy of vnd.c; we use the same |
161 | * structure names here to ease porting.. |
162 | */ |
struct vndxfer {
	struct buf	*vx_bp;		/* the parent buffer of this transfer */
	struct swapdev	*vx_sdp;	/* swap device the transfer targets */
	int		vx_error;	/* error code recorded for the transfer */
	int		vx_pending;	/* count of aux buffers still pending */
	int		vx_flags;	/* VX_* bits below */
#define VX_BUSY 1
#define VX_DEAD 2
};
172 | |
173 | struct vndbuf { |
174 | struct buf vb_buf; |
175 | struct vndxfer *vb_xfer; |
176 | }; |
177 | |
178 | /* |
179 | * NetBSD 1.3 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit |
180 | * dev_t and has no se_path[] member. |
181 | */ |
struct swapent13 {
	int32_t	se13_dev;	/* device number, as a 32 bit dev_t */
	int	se13_flags;	/* device flags */
	int	se13_nblks;	/* size of the device in blocks */
	int	se13_inuse;	/* blocks currently in use */
	int	se13_priority;	/* swap priority of the device */
};
189 | |
190 | /* |
191 | * NetBSD 5.0 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit |
192 | * dev_t. |
193 | */ |
struct swapent50 {
	int32_t	se50_dev;		/* device number, as a 32 bit dev_t */
	int	se50_flags;		/* device flags */
	int	se50_nblks;		/* size of the device in blocks */
	int	se50_inuse;		/* blocks currently in use */
	int	se50_priority;		/* swap priority of the device */
	char	se50_path[PATH_MAX+1];	/* NUL-terminated path of the device */
};
202 | |
203 | /* |
204 | * We keep a pool of vndbuf and vndxfer structures. |
205 | */ |
206 | static struct pool vndxfer_pool, vndbuf_pool; |
207 | |
208 | /* |
209 | * local variables |
210 | */ |
211 | static vmem_t *swapmap; /* controls the mapping of /dev/drum */ |
212 | |
213 | /* list of all active swap devices [by priority] */ |
214 | LIST_HEAD(swap_priority, swappri); |
215 | static struct swap_priority swap_priority; |
216 | |
217 | /* locks */ |
218 | static krwlock_t swap_syscall_lock; |
219 | |
220 | /* workqueue and use counter for swap to regular files */ |
221 | static int sw_reg_count = 0; |
222 | static struct workqueue *sw_reg_workqueue; |
223 | |
224 | /* tuneables */ |
225 | u_int uvm_swapisfull_factor = 99; |
226 | |
227 | /* |
228 | * prototypes |
229 | */ |
230 | static struct swapdev *swapdrum_getsdp(int); |
231 | |
232 | static struct swapdev *swaplist_find(struct vnode *, bool); |
233 | static void swaplist_insert(struct swapdev *, |
234 | struct swappri *, int); |
235 | static void swaplist_trim(void); |
236 | |
237 | static int swap_on(struct lwp *, struct swapdev *); |
238 | static int swap_off(struct lwp *, struct swapdev *); |
239 | |
240 | static void sw_reg_strategy(struct swapdev *, struct buf *, int); |
241 | static void sw_reg_biodone(struct buf *); |
242 | static void sw_reg_iodone(struct work *wk, void *dummy); |
243 | static void sw_reg_start(struct swapdev *); |
244 | |
245 | static int uvm_swap_io(struct vm_page **, int, int, int); |
246 | |
247 | /* |
248 | * uvm_swap_init: init the swap system data structures and locks |
249 | * |
250 | * => called at boot time from init_main.c after the filesystems |
251 | * are brought up (which happens after uvm_init()) |
252 | */ |
253 | void |
254 | uvm_swap_init(void) |
255 | { |
256 | UVMHIST_FUNC("uvm_swap_init" ); |
257 | |
258 | UVMHIST_CALLED(pdhist); |
259 | /* |
260 | * first, init the swap list, its counter, and its lock. |
261 | * then get a handle on the vnode for /dev/drum by using |
262 | * the its dev_t number ("swapdev", from MD conf.c). |
263 | */ |
264 | |
265 | LIST_INIT(&swap_priority); |
266 | uvmexp.nswapdev = 0; |
267 | rw_init(&swap_syscall_lock); |
268 | mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE); |
269 | |
270 | if (bdevvp(swapdev, &swapdev_vp)) |
271 | panic("%s: can't get vnode for swap device" , __func__); |
272 | if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY)) |
273 | panic("%s: can't lock swap device" , __func__); |
274 | if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED)) |
275 | panic("%s: can't open swap device" , __func__); |
276 | VOP_UNLOCK(swapdev_vp); |
277 | |
278 | /* |
279 | * create swap block resource map to map /dev/drum. the range |
280 | * from 1 to INT_MAX allows 2 gigablocks of swap space. note |
281 | * that block 0 is reserved (used to indicate an allocation |
282 | * failure, or no allocation). |
283 | */ |
284 | swapmap = vmem_create("swapmap" , 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0, |
285 | VM_NOSLEEP, IPL_NONE); |
286 | if (swapmap == 0) { |
287 | panic("%s: vmem_create failed" , __func__); |
288 | } |
289 | |
290 | pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx" , |
291 | NULL, IPL_BIO); |
292 | pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd" , |
293 | NULL, IPL_BIO); |
294 | |
295 | UVMHIST_LOG(pdhist, "<- done" , 0, 0, 0, 0); |
296 | } |
297 | |
298 | /* |
299 | * swaplist functions: functions that operate on the list of swap |
300 | * devices on the system. |
301 | */ |
302 | |
303 | /* |
304 | * swaplist_insert: insert swap device "sdp" into the global list |
305 | * |
306 | * => caller must hold both swap_syscall_lock and uvm_swap_data_lock |
307 | * => caller must provide a newly allocated swappri structure (we will |
308 | * FREE it if we don't need it... this it to prevent allocation |
309 | * blocking here while adding swap) |
310 | */ |
311 | static void |
312 | swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) |
313 | { |
314 | struct swappri *spp, *pspp; |
315 | UVMHIST_FUNC("swaplist_insert" ); UVMHIST_CALLED(pdhist); |
316 | |
317 | /* |
318 | * find entry at or after which to insert the new device. |
319 | */ |
320 | pspp = NULL; |
321 | LIST_FOREACH(spp, &swap_priority, spi_swappri) { |
322 | if (priority <= spp->spi_priority) |
323 | break; |
324 | pspp = spp; |
325 | } |
326 | |
327 | /* |
328 | * new priority? |
329 | */ |
330 | if (spp == NULL || spp->spi_priority != priority) { |
331 | spp = newspp; /* use newspp! */ |
332 | UVMHIST_LOG(pdhist, "created new swappri = %d" , |
333 | priority, 0, 0, 0); |
334 | |
335 | spp->spi_priority = priority; |
336 | TAILQ_INIT(&spp->spi_swapdev); |
337 | |
338 | if (pspp) |
339 | LIST_INSERT_AFTER(pspp, spp, spi_swappri); |
340 | else |
341 | LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); |
342 | } else { |
343 | /* we don't need a new priority structure, free it */ |
344 | kmem_free(newspp, sizeof(*newspp)); |
345 | } |
346 | |
347 | /* |
348 | * priority found (or created). now insert on the priority's |
349 | * tailq list and bump the total number of swapdevs. |
350 | */ |
351 | sdp->swd_priority = priority; |
352 | TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); |
353 | uvmexp.nswapdev++; |
354 | } |
355 | |
356 | /* |
357 | * swaplist_find: find and optionally remove a swap device from the |
358 | * global list. |
359 | * |
360 | * => caller must hold both swap_syscall_lock and uvm_swap_data_lock |
361 | * => we return the swapdev we found (and removed) |
362 | */ |
363 | static struct swapdev * |
364 | swaplist_find(struct vnode *vp, bool remove) |
365 | { |
366 | struct swapdev *sdp; |
367 | struct swappri *spp; |
368 | |
369 | /* |
370 | * search the lists for the requested vp |
371 | */ |
372 | |
373 | LIST_FOREACH(spp, &swap_priority, spi_swappri) { |
374 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { |
375 | if (sdp->swd_vp == vp) { |
376 | if (remove) { |
377 | TAILQ_REMOVE(&spp->spi_swapdev, |
378 | sdp, swd_next); |
379 | uvmexp.nswapdev--; |
380 | } |
381 | return(sdp); |
382 | } |
383 | } |
384 | } |
385 | return (NULL); |
386 | } |
387 | |
388 | /* |
389 | * swaplist_trim: scan priority list for empty priority entries and kill |
390 | * them. |
391 | * |
392 | * => caller must hold both swap_syscall_lock and uvm_swap_data_lock |
393 | */ |
394 | static void |
395 | swaplist_trim(void) |
396 | { |
397 | struct swappri *spp, *nextspp; |
398 | |
399 | LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { |
400 | if (!TAILQ_EMPTY(&spp->spi_swapdev)) |
401 | continue; |
402 | LIST_REMOVE(spp, spi_swappri); |
403 | kmem_free(spp, sizeof(*spp)); |
404 | } |
405 | } |
406 | |
407 | /* |
408 | * swapdrum_getsdp: given a page offset in /dev/drum, convert it back |
409 | * to the "swapdev" that maps that section of the drum. |
410 | * |
411 | * => each swapdev takes one big contig chunk of the drum |
412 | * => caller must hold uvm_swap_data_lock |
413 | */ |
414 | static struct swapdev * |
415 | swapdrum_getsdp(int pgno) |
416 | { |
417 | struct swapdev *sdp; |
418 | struct swappri *spp; |
419 | |
420 | LIST_FOREACH(spp, &swap_priority, spi_swappri) { |
421 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { |
422 | if (sdp->swd_flags & SWF_FAKE) |
423 | continue; |
424 | if (pgno >= sdp->swd_drumoffset && |
425 | pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { |
426 | return sdp; |
427 | } |
428 | } |
429 | } |
430 | return NULL; |
431 | } |
432 | |
433 | void swapsys_lock(krw_t op) |
434 | { |
435 | rw_enter(&swap_syscall_lock, op); |
436 | } |
437 | |
438 | void swapsys_unlock(void) |
439 | { |
440 | rw_exit(&swap_syscall_lock); |
441 | } |
442 | |
443 | /* |
444 | * sys_swapctl: main entry point for swapctl(2) system call |
445 | * [with two helper functions: swap_on and swap_off] |
446 | */ |
447 | int |
448 | sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval) |
449 | { |
450 | /* { |
451 | syscallarg(int) cmd; |
452 | syscallarg(void *) arg; |
453 | syscallarg(int) misc; |
454 | } */ |
455 | struct vnode *vp; |
456 | struct nameidata nd; |
457 | struct swappri *spp; |
458 | struct swapdev *sdp; |
459 | struct swapent *sep; |
460 | #define SWAP_PATH_MAX (PATH_MAX + 1) |
461 | char *userpath; |
462 | size_t len = 0; |
463 | int error, misc; |
464 | int priority; |
465 | UVMHIST_FUNC("sys_swapctl" ); UVMHIST_CALLED(pdhist); |
466 | |
467 | /* |
468 | * we handle the non-priv NSWAP and STATS request first. |
469 | * |
470 | * SWAP_NSWAP: return number of config'd swap devices |
471 | * [can also be obtained with uvmexp sysctl] |
472 | */ |
473 | if (SCARG(uap, cmd) == SWAP_NSWAP) { |
474 | const int nswapdev = uvmexp.nswapdev; |
475 | UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d" , nswapdev, 0, 0, 0); |
476 | *retval = nswapdev; |
477 | return 0; |
478 | } |
479 | |
480 | misc = SCARG(uap, misc); |
481 | userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP); |
482 | |
483 | /* |
484 | * ensure serialized syscall access by grabbing the swap_syscall_lock |
485 | */ |
486 | rw_enter(&swap_syscall_lock, RW_WRITER); |
487 | |
488 | /* |
489 | * SWAP_STATS: get stats on current # of configured swap devs |
490 | * |
491 | * note that the swap_priority list can't change as long |
492 | * as we are holding the swap_syscall_lock. we don't want |
493 | * to grab the uvm_swap_data_lock because we may fault&sleep during |
494 | * copyout() and we don't want to be holding that lock then! |
495 | */ |
496 | if (SCARG(uap, cmd) == SWAP_STATS |
497 | #if defined(COMPAT_50) |
498 | || SCARG(uap, cmd) == SWAP_STATS50 |
499 | #endif |
500 | #if defined(COMPAT_13) |
501 | || SCARG(uap, cmd) == SWAP_STATS13 |
502 | #endif |
503 | ) { |
504 | if (misc < 0) { |
505 | error = EINVAL; |
506 | goto out; |
507 | } |
508 | if (misc == 0 || uvmexp.nswapdev == 0) { |
509 | error = 0; |
510 | goto out; |
511 | } |
512 | /* Make sure userland cannot exhaust kernel memory */ |
513 | if ((size_t)misc > (size_t)uvmexp.nswapdev) |
514 | misc = uvmexp.nswapdev; |
515 | KASSERT(misc > 0); |
516 | #if defined(COMPAT_13) |
517 | if (SCARG(uap, cmd) == SWAP_STATS13) |
518 | len = sizeof(struct swapent13) * misc; |
519 | else |
520 | #endif |
521 | #if defined(COMPAT_50) |
522 | if (SCARG(uap, cmd) == SWAP_STATS50) |
523 | len = sizeof(struct swapent50) * misc; |
524 | else |
525 | #endif |
526 | len = sizeof(struct swapent) * misc; |
527 | sep = (struct swapent *)kmem_alloc(len, KM_SLEEP); |
528 | |
529 | uvm_swap_stats(SCARG(uap, cmd), sep, misc, retval); |
530 | error = copyout(sep, SCARG(uap, arg), len); |
531 | |
532 | kmem_free(sep, len); |
533 | UVMHIST_LOG(pdhist, "<- done SWAP_STATS" , 0, 0, 0, 0); |
534 | goto out; |
535 | } |
536 | if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) { |
537 | dev_t *devp = (dev_t *)SCARG(uap, arg); |
538 | |
539 | error = copyout(&dumpdev, devp, sizeof(dumpdev)); |
540 | goto out; |
541 | } |
542 | |
543 | /* |
544 | * all other requests require superuser privs. verify. |
545 | */ |
546 | if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, |
547 | 0, NULL, NULL, NULL))) |
548 | goto out; |
549 | |
550 | if (SCARG(uap, cmd) == SWAP_DUMPOFF) { |
551 | /* drop the current dump device */ |
552 | dumpdev = NODEV; |
553 | dumpcdev = NODEV; |
554 | cpu_dumpconf(); |
555 | goto out; |
556 | } |
557 | |
558 | /* |
559 | * at this point we expect a path name in arg. we will |
560 | * use namei() to gain a vnode reference (vref), and lock |
561 | * the vnode (VOP_LOCK). |
562 | * |
563 | * XXX: a NULL arg means use the root vnode pointer (e.g. for |
564 | * miniroot) |
565 | */ |
566 | if (SCARG(uap, arg) == NULL) { |
567 | vp = rootvp; /* miniroot */ |
568 | vref(vp); |
569 | if (vn_lock(vp, LK_EXCLUSIVE)) { |
570 | vrele(vp); |
571 | error = EBUSY; |
572 | goto out; |
573 | } |
574 | if (SCARG(uap, cmd) == SWAP_ON && |
575 | copystr("miniroot" , userpath, SWAP_PATH_MAX, &len)) |
576 | panic("swapctl: miniroot copy failed" ); |
577 | } else { |
578 | struct pathbuf *pb; |
579 | |
580 | /* |
581 | * This used to allow copying in one extra byte |
582 | * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON. |
583 | * This was completely pointless because if anyone |
584 | * used that extra byte namei would fail with |
585 | * ENAMETOOLONG anyway, so I've removed the excess |
586 | * logic. - dholland 20100215 |
587 | */ |
588 | |
589 | error = pathbuf_copyin(SCARG(uap, arg), &pb); |
590 | if (error) { |
591 | goto out; |
592 | } |
593 | if (SCARG(uap, cmd) == SWAP_ON) { |
594 | /* get a copy of the string */ |
595 | pathbuf_copystring(pb, userpath, SWAP_PATH_MAX); |
596 | len = strlen(userpath) + 1; |
597 | } |
598 | NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); |
599 | if ((error = namei(&nd))) { |
600 | pathbuf_destroy(pb); |
601 | goto out; |
602 | } |
603 | vp = nd.ni_vp; |
604 | pathbuf_destroy(pb); |
605 | } |
606 | /* note: "vp" is referenced and locked */ |
607 | |
608 | error = 0; /* assume no error */ |
609 | switch(SCARG(uap, cmd)) { |
610 | |
611 | case SWAP_DUMPDEV: |
612 | if (vp->v_type != VBLK) { |
613 | error = ENOTBLK; |
614 | break; |
615 | } |
616 | if (bdevsw_lookup(vp->v_rdev)) { |
617 | dumpdev = vp->v_rdev; |
618 | dumpcdev = devsw_blk2chr(dumpdev); |
619 | } else |
620 | dumpdev = NODEV; |
621 | cpu_dumpconf(); |
622 | break; |
623 | |
624 | case SWAP_CTL: |
625 | /* |
626 | * get new priority, remove old entry (if any) and then |
627 | * reinsert it in the correct place. finally, prune out |
628 | * any empty priority structures. |
629 | */ |
630 | priority = SCARG(uap, misc); |
631 | spp = kmem_alloc(sizeof(*spp), KM_SLEEP); |
632 | mutex_enter(&uvm_swap_data_lock); |
633 | if ((sdp = swaplist_find(vp, true)) == NULL) { |
634 | error = ENOENT; |
635 | } else { |
636 | swaplist_insert(sdp, spp, priority); |
637 | swaplist_trim(); |
638 | } |
639 | mutex_exit(&uvm_swap_data_lock); |
640 | if (error) |
641 | kmem_free(spp, sizeof(*spp)); |
642 | break; |
643 | |
644 | case SWAP_ON: |
645 | |
646 | /* |
647 | * check for duplicates. if none found, then insert a |
648 | * dummy entry on the list to prevent someone else from |
649 | * trying to enable this device while we are working on |
650 | * it. |
651 | */ |
652 | |
653 | priority = SCARG(uap, misc); |
654 | sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP); |
655 | spp = kmem_alloc(sizeof(*spp), KM_SLEEP); |
656 | sdp->swd_flags = SWF_FAKE; |
657 | sdp->swd_vp = vp; |
658 | sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; |
659 | bufq_alloc(&sdp->swd_tab, "disksort" , BUFQ_SORT_RAWBLOCK); |
660 | mutex_enter(&uvm_swap_data_lock); |
661 | if (swaplist_find(vp, false) != NULL) { |
662 | error = EBUSY; |
663 | mutex_exit(&uvm_swap_data_lock); |
664 | bufq_free(sdp->swd_tab); |
665 | kmem_free(sdp, sizeof(*sdp)); |
666 | kmem_free(spp, sizeof(*spp)); |
667 | break; |
668 | } |
669 | swaplist_insert(sdp, spp, priority); |
670 | mutex_exit(&uvm_swap_data_lock); |
671 | |
672 | KASSERT(len > 0); |
673 | sdp->swd_pathlen = len; |
674 | sdp->swd_path = kmem_alloc(len, KM_SLEEP); |
675 | if (copystr(userpath, sdp->swd_path, len, 0) != 0) |
676 | panic("swapctl: copystr" ); |
677 | |
678 | /* |
679 | * we've now got a FAKE placeholder in the swap list. |
680 | * now attempt to enable swap on it. if we fail, undo |
681 | * what we've done and kill the fake entry we just inserted. |
682 | * if swap_on is a success, it will clear the SWF_FAKE flag |
683 | */ |
684 | |
685 | if ((error = swap_on(l, sdp)) != 0) { |
686 | mutex_enter(&uvm_swap_data_lock); |
687 | (void) swaplist_find(vp, true); /* kill fake entry */ |
688 | swaplist_trim(); |
689 | mutex_exit(&uvm_swap_data_lock); |
690 | bufq_free(sdp->swd_tab); |
691 | kmem_free(sdp->swd_path, sdp->swd_pathlen); |
692 | kmem_free(sdp, sizeof(*sdp)); |
693 | break; |
694 | } |
695 | break; |
696 | |
697 | case SWAP_OFF: |
698 | mutex_enter(&uvm_swap_data_lock); |
699 | if ((sdp = swaplist_find(vp, false)) == NULL) { |
700 | mutex_exit(&uvm_swap_data_lock); |
701 | error = ENXIO; |
702 | break; |
703 | } |
704 | |
705 | /* |
706 | * If a device isn't in use or enabled, we |
707 | * can't stop swapping from it (again). |
708 | */ |
709 | if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { |
710 | mutex_exit(&uvm_swap_data_lock); |
711 | error = EBUSY; |
712 | break; |
713 | } |
714 | |
715 | /* |
716 | * do the real work. |
717 | */ |
718 | error = swap_off(l, sdp); |
719 | break; |
720 | |
721 | default: |
722 | error = EINVAL; |
723 | } |
724 | |
725 | /* |
726 | * done! release the ref gained by namei() and unlock. |
727 | */ |
728 | vput(vp); |
729 | out: |
730 | rw_exit(&swap_syscall_lock); |
731 | kmem_free(userpath, SWAP_PATH_MAX); |
732 | |
733 | UVMHIST_LOG(pdhist, "<- done! error=%d" , error, 0, 0, 0); |
734 | return (error); |
735 | } |
736 | |
737 | /* |
738 | * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept |
739 | * away from sys_swapctl() in order to allow COMPAT_* swapctl() |
740 | * emulation to use it directly without going through sys_swapctl(). |
741 | * The problem with using sys_swapctl() there is that it involves |
742 | * copying the swapent array to the stackgap, and this array's size |
743 | * is not known at build time. Hence it would not be possible to |
744 | * ensure it would fit in the stackgap in any case. |
745 | */ |
746 | void |
747 | uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval) |
748 | { |
749 | struct swappri *spp; |
750 | struct swapdev *sdp; |
751 | int count = 0; |
752 | |
753 | KASSERT(rw_lock_held(&swap_syscall_lock)); |
754 | |
755 | LIST_FOREACH(spp, &swap_priority, spi_swappri) { |
756 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { |
757 | int inuse; |
758 | |
759 | if (sec-- <= 0) |
760 | break; |
761 | |
762 | /* |
763 | * backwards compatibility for system call. |
764 | * For NetBSD 1.3 and 5.0, we have to use |
765 | * the 32 bit dev_t. For 5.0 and -current |
766 | * we have to add the path. |
767 | */ |
768 | inuse = btodb((uint64_t)sdp->swd_npginuse << |
769 | PAGE_SHIFT); |
770 | |
771 | #if defined(COMPAT_13) || defined(COMPAT_50) |
772 | if (cmd == SWAP_STATS) { |
773 | #endif |
774 | sep->se_dev = sdp->swd_dev; |
775 | sep->se_flags = sdp->swd_flags; |
776 | sep->se_nblks = sdp->swd_nblks; |
777 | sep->se_inuse = inuse; |
778 | sep->se_priority = sdp->swd_priority; |
779 | KASSERT(sdp->swd_pathlen < |
780 | sizeof(sep->se_path)); |
781 | strcpy(sep->se_path, sdp->swd_path); |
782 | sep++; |
783 | #if defined(COMPAT_13) |
784 | } else if (cmd == SWAP_STATS13) { |
785 | struct swapent13 *sep13 = |
786 | (struct swapent13 *)sep; |
787 | |
788 | sep13->se13_dev = sdp->swd_dev; |
789 | sep13->se13_flags = sdp->swd_flags; |
790 | sep13->se13_nblks = sdp->swd_nblks; |
791 | sep13->se13_inuse = inuse; |
792 | sep13->se13_priority = sdp->swd_priority; |
793 | sep = (struct swapent *)(sep13 + 1); |
794 | #endif |
795 | #if defined(COMPAT_50) |
796 | } else if (cmd == SWAP_STATS50) { |
797 | struct swapent50 *sep50 = |
798 | (struct swapent50 *)sep; |
799 | |
800 | sep50->se50_dev = sdp->swd_dev; |
801 | sep50->se50_flags = sdp->swd_flags; |
802 | sep50->se50_nblks = sdp->swd_nblks; |
803 | sep50->se50_inuse = inuse; |
804 | sep50->se50_priority = sdp->swd_priority; |
805 | KASSERT(sdp->swd_pathlen < |
806 | sizeof(sep50->se50_path)); |
807 | strcpy(sep50->se50_path, sdp->swd_path); |
808 | sep = (struct swapent *)(sep50 + 1); |
809 | #endif |
810 | #if defined(COMPAT_13) || defined(COMPAT_50) |
811 | } |
812 | #endif |
813 | count++; |
814 | } |
815 | } |
816 | *retval = count; |
817 | } |
818 | |
819 | /* |
820 | * swap_on: attempt to enable a swapdev for swapping. note that the |
821 | * swapdev is already on the global list, but disabled (marked |
822 | * SWF_FAKE). |
823 | * |
824 | * => we avoid the start of the disk (to protect disk labels) |
825 | * => we also avoid the miniroot, if we are swapping to root. |
826 | * => caller should leave uvm_swap_data_lock unlocked, we may lock it |
827 | * if needed. |
828 | */ |
829 | static int |
830 | swap_on(struct lwp *l, struct swapdev *sdp) |
831 | { |
832 | struct vnode *vp; |
833 | int error, npages, nblocks, size; |
834 | long addr; |
835 | vmem_addr_t result; |
836 | struct vattr va; |
837 | dev_t dev; |
838 | UVMHIST_FUNC("swap_on" ); UVMHIST_CALLED(pdhist); |
839 | |
840 | /* |
841 | * we want to enable swapping on sdp. the swd_vp contains |
842 | * the vnode we want (locked and ref'd), and the swd_dev |
843 | * contains the dev_t of the file, if it a block device. |
844 | */ |
845 | |
846 | vp = sdp->swd_vp; |
847 | dev = sdp->swd_dev; |
848 | |
849 | /* |
850 | * open the swap file (mostly useful for block device files to |
851 | * let device driver know what is up). |
852 | * |
853 | * we skip the open/close for root on swap because the root |
854 | * has already been opened when root was mounted (mountroot). |
855 | */ |
856 | if (vp != rootvp) { |
857 | if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) |
858 | return (error); |
859 | } |
860 | |
861 | /* XXX this only works for block devices */ |
862 | UVMHIST_LOG(pdhist, " dev=%d, major(dev)=%d" , dev, major(dev), 0,0); |
863 | |
864 | /* |
865 | * we now need to determine the size of the swap area. for |
866 | * block specials we can call the d_psize function. |
867 | * for normal files, we must stat [get attrs]. |
868 | * |
869 | * we put the result in nblks. |
870 | * for normal files, we also want the filesystem block size |
871 | * (which we get with statfs). |
872 | */ |
873 | switch (vp->v_type) { |
874 | case VBLK: |
875 | if ((nblocks = bdev_size(dev)) == -1) { |
876 | error = ENXIO; |
877 | goto bad; |
878 | } |
879 | break; |
880 | |
881 | case VREG: |
882 | if ((error = VOP_GETATTR(vp, &va, l->l_cred))) |
883 | goto bad; |
884 | nblocks = (int)btodb(va.va_size); |
885 | sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift; |
886 | /* |
887 | * limit the max # of outstanding I/O requests we issue |
888 | * at any one time. take it easy on NFS servers. |
889 | */ |
890 | if (vp->v_tag == VT_NFS) |
891 | sdp->swd_maxactive = 2; /* XXX */ |
892 | else |
893 | sdp->swd_maxactive = 8; /* XXX */ |
894 | break; |
895 | |
896 | default: |
897 | error = ENXIO; |
898 | goto bad; |
899 | } |
900 | |
901 | /* |
902 | * save nblocks in a safe place and convert to pages. |
903 | */ |
904 | |
905 | sdp->swd_nblks = nblocks; |
906 | npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT; |
907 | |
908 | /* |
909 | * for block special files, we want to make sure that leave |
910 | * the disklabel and bootblocks alone, so we arrange to skip |
911 | * over them (arbitrarily choosing to skip PAGE_SIZE bytes). |
912 | * note that because of this the "size" can be less than the |
913 | * actual number of blocks on the device. |
914 | */ |
915 | if (vp->v_type == VBLK) { |
916 | /* we use pages 1 to (size - 1) [inclusive] */ |
917 | size = npages - 1; |
918 | addr = 1; |
919 | } else { |
920 | /* we use pages 0 to (size - 1) [inclusive] */ |
921 | size = npages; |
922 | addr = 0; |
923 | } |
924 | |
925 | /* |
926 | * make sure we have enough blocks for a reasonable sized swap |
927 | * area. we want at least one page. |
928 | */ |
929 | |
930 | if (size < 1) { |
931 | UVMHIST_LOG(pdhist, " size <= 1!!" , 0, 0, 0, 0); |
932 | error = EINVAL; |
933 | goto bad; |
934 | } |
935 | |
936 | UVMHIST_LOG(pdhist, " dev=%x: size=%d addr=%ld" , dev, size, addr, 0); |
937 | |
938 | /* |
939 | * now we need to allocate an extent to manage this swap device |
940 | */ |
941 | |
942 | sdp->swd_blist = blist_create(npages); |
943 | /* mark all expect the `saved' region free. */ |
944 | blist_free(sdp->swd_blist, addr, size); |
945 | |
946 | /* |
947 | * if the vnode we are swapping to is the root vnode |
948 | * (i.e. we are swapping to the miniroot) then we want |
949 | * to make sure we don't overwrite it. do a statfs to |
950 | * find its size and skip over it. |
951 | */ |
952 | if (vp == rootvp) { |
953 | struct mount *mp; |
954 | struct statvfs *sp; |
955 | int rootblocks, rootpages; |
956 | |
957 | mp = rootvnode->v_mount; |
958 | sp = &mp->mnt_stat; |
959 | rootblocks = sp->f_blocks * btodb(sp->f_frsize); |
960 | /* |
961 | * XXX: sp->f_blocks isn't the total number of |
962 | * blocks in the filesystem, it's the number of |
963 | * data blocks. so, our rootblocks almost |
964 | * definitely underestimates the total size |
965 | * of the filesystem - how badly depends on the |
966 | * details of the filesystem type. there isn't |
967 | * an obvious way to deal with this cleanly |
968 | * and perfectly, so for now we just pad our |
969 | * rootblocks estimate with an extra 5 percent. |
970 | */ |
971 | rootblocks += (rootblocks >> 5) + |
972 | (rootblocks >> 6) + |
973 | (rootblocks >> 7); |
974 | rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; |
975 | if (rootpages > size) |
976 | panic("swap_on: miniroot larger than swap?" ); |
977 | |
978 | if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { |
979 | panic("swap_on: unable to preserve miniroot" ); |
980 | } |
981 | |
982 | size -= rootpages; |
983 | printf("Preserved %d pages of miniroot " , rootpages); |
984 | printf("leaving %d pages of swap\n" , size); |
985 | } |
986 | |
987 | /* |
988 | * add a ref to vp to reflect usage as a swap device. |
989 | */ |
990 | vref(vp); |
991 | |
992 | /* |
993 | * now add the new swapdev to the drum and enable. |
994 | */ |
995 | error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result); |
996 | if (error != 0) |
997 | panic("swapdrum_add" ); |
998 | /* |
999 | * If this is the first regular swap create the workqueue. |
1000 | * => Protected by swap_syscall_lock. |
1001 | */ |
1002 | if (vp->v_type != VBLK) { |
1003 | if (sw_reg_count++ == 0) { |
1004 | KASSERT(sw_reg_workqueue == NULL); |
1005 | if (workqueue_create(&sw_reg_workqueue, "swapiod" , |
1006 | sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) |
1007 | panic("%s: workqueue_create failed" , __func__); |
1008 | } |
1009 | } |
1010 | |
1011 | sdp->swd_drumoffset = (int)result; |
1012 | sdp->swd_drumsize = npages; |
1013 | sdp->swd_npages = size; |
1014 | mutex_enter(&uvm_swap_data_lock); |
1015 | sdp->swd_flags &= ~SWF_FAKE; /* going live */ |
1016 | sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); |
1017 | uvmexp.swpages += size; |
1018 | uvmexp.swpgavail += size; |
1019 | mutex_exit(&uvm_swap_data_lock); |
1020 | return (0); |
1021 | |
1022 | /* |
1023 | * failure: clean up and return error. |
1024 | */ |
1025 | |
1026 | bad: |
1027 | if (sdp->swd_blist) { |
1028 | blist_destroy(sdp->swd_blist); |
1029 | } |
1030 | if (vp != rootvp) { |
1031 | (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); |
1032 | } |
1033 | return (error); |
1034 | } |
1035 | |
1036 | /* |
1037 | * swap_off: stop swapping on swapdev |
1038 | * |
1039 | * => swap data should be locked, we will unlock. |
1040 | */ |
1041 | static int |
1042 | swap_off(struct lwp *l, struct swapdev *sdp) |
1043 | { |
1044 | int npages = sdp->swd_npages; |
1045 | int error = 0; |
1046 | |
1047 | UVMHIST_FUNC("swap_off" ); UVMHIST_CALLED(pdhist); |
1048 | UVMHIST_LOG(pdhist, " dev=%x, npages=%d" , sdp->swd_dev,npages,0,0); |
1049 | |
1050 | /* disable the swap area being removed */ |
1051 | sdp->swd_flags &= ~SWF_ENABLE; |
1052 | uvmexp.swpgavail -= npages; |
1053 | mutex_exit(&uvm_swap_data_lock); |
1054 | |
1055 | /* |
1056 | * the idea is to find all the pages that are paged out to this |
1057 | * device, and page them all in. in uvm, swap-backed pageable |
1058 | * memory can take two forms: aobjs and anons. call the |
1059 | * swapoff hook for each subsystem to bring in pages. |
1060 | */ |
1061 | |
1062 | if (uao_swap_off(sdp->swd_drumoffset, |
1063 | sdp->swd_drumoffset + sdp->swd_drumsize) || |
1064 | amap_swap_off(sdp->swd_drumoffset, |
1065 | sdp->swd_drumoffset + sdp->swd_drumsize)) { |
1066 | error = ENOMEM; |
1067 | } else if (sdp->swd_npginuse > sdp->swd_npgbad) { |
1068 | error = EBUSY; |
1069 | } |
1070 | |
1071 | if (error) { |
1072 | mutex_enter(&uvm_swap_data_lock); |
1073 | sdp->swd_flags |= SWF_ENABLE; |
1074 | uvmexp.swpgavail += npages; |
1075 | mutex_exit(&uvm_swap_data_lock); |
1076 | |
1077 | return error; |
1078 | } |
1079 | |
1080 | /* |
1081 | * If this is the last regular swap destroy the workqueue. |
1082 | * => Protected by swap_syscall_lock. |
1083 | */ |
1084 | if (sdp->swd_vp->v_type != VBLK) { |
1085 | KASSERT(sw_reg_count > 0); |
1086 | KASSERT(sw_reg_workqueue != NULL); |
1087 | if (--sw_reg_count == 0) { |
1088 | workqueue_destroy(sw_reg_workqueue); |
1089 | sw_reg_workqueue = NULL; |
1090 | } |
1091 | } |
1092 | |
1093 | /* |
1094 | * done with the vnode. |
1095 | * drop our ref on the vnode before calling VOP_CLOSE() |
1096 | * so that spec_close() can tell if this is the last close. |
1097 | */ |
1098 | vrele(sdp->swd_vp); |
1099 | if (sdp->swd_vp != rootvp) { |
1100 | (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); |
1101 | } |
1102 | |
1103 | mutex_enter(&uvm_swap_data_lock); |
1104 | uvmexp.swpages -= npages; |
1105 | uvmexp.swpginuse -= sdp->swd_npgbad; |
1106 | |
1107 | if (swaplist_find(sdp->swd_vp, true) == NULL) |
1108 | panic("%s: swapdev not in list" , __func__); |
1109 | swaplist_trim(); |
1110 | mutex_exit(&uvm_swap_data_lock); |
1111 | |
1112 | /* |
1113 | * free all resources! |
1114 | */ |
1115 | vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); |
1116 | blist_destroy(sdp->swd_blist); |
1117 | bufq_free(sdp->swd_tab); |
1118 | kmem_free(sdp, sizeof(*sdp)); |
1119 | return (0); |
1120 | } |
1121 | |
1122 | void |
1123 | uvm_swap_shutdown(struct lwp *l) |
1124 | { |
1125 | struct swapdev *sdp; |
1126 | struct swappri *spp; |
1127 | struct vnode *vp; |
1128 | int error; |
1129 | |
1130 | printf("turning of swap..." ); |
1131 | rw_enter(&swap_syscall_lock, RW_WRITER); |
1132 | mutex_enter(&uvm_swap_data_lock); |
1133 | again: |
1134 | LIST_FOREACH(spp, &swap_priority, spi_swappri) |
1135 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { |
1136 | if (sdp->swd_flags & SWF_FAKE) |
1137 | continue; |
1138 | if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) |
1139 | continue; |
1140 | #ifdef DEBUG |
1141 | printf("\nturning off swap on %s..." , |
1142 | sdp->swd_path); |
1143 | #endif |
1144 | if (vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE)) { |
1145 | error = EBUSY; |
1146 | vp = NULL; |
1147 | } else |
1148 | error = 0; |
1149 | if (!error) { |
1150 | error = swap_off(l, sdp); |
1151 | mutex_enter(&uvm_swap_data_lock); |
1152 | } |
1153 | if (error) { |
1154 | printf("stopping swap on %s failed " |
1155 | "with error %d\n" , sdp->swd_path, error); |
1156 | TAILQ_REMOVE(&spp->spi_swapdev, sdp, |
1157 | swd_next); |
1158 | uvmexp.nswapdev--; |
1159 | swaplist_trim(); |
1160 | if (vp) |
1161 | vput(vp); |
1162 | } |
1163 | goto again; |
1164 | } |
1165 | printf(" done\n" ); |
1166 | mutex_exit(&uvm_swap_data_lock); |
1167 | rw_exit(&swap_syscall_lock); |
1168 | } |
1169 | |
1170 | |
1171 | /* |
1172 | * /dev/drum interface and i/o functions |
1173 | */ |
1174 | |
1175 | /* |
1176 | * swstrategy: perform I/O on the drum |
1177 | * |
1178 | * => we must map the i/o request from the drum to the correct swapdev. |
1179 | */ |
1180 | static void |
1181 | swstrategy(struct buf *bp) |
1182 | { |
1183 | struct swapdev *sdp; |
1184 | struct vnode *vp; |
1185 | int pageno, bn; |
1186 | UVMHIST_FUNC("swstrategy" ); UVMHIST_CALLED(pdhist); |
1187 | |
1188 | /* |
1189 | * convert block number to swapdev. note that swapdev can't |
1190 | * be yanked out from under us because we are holding resources |
1191 | * in it (i.e. the blocks we are doing I/O on). |
1192 | */ |
1193 | pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; |
1194 | mutex_enter(&uvm_swap_data_lock); |
1195 | sdp = swapdrum_getsdp(pageno); |
1196 | mutex_exit(&uvm_swap_data_lock); |
1197 | if (sdp == NULL) { |
1198 | bp->b_error = EINVAL; |
1199 | bp->b_resid = bp->b_bcount; |
1200 | biodone(bp); |
1201 | UVMHIST_LOG(pdhist, " failed to get swap device" , 0, 0, 0, 0); |
1202 | return; |
1203 | } |
1204 | |
1205 | /* |
1206 | * convert drum page number to block number on this swapdev. |
1207 | */ |
1208 | |
1209 | pageno -= sdp->swd_drumoffset; /* page # on swapdev */ |
1210 | bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ |
1211 | |
1212 | UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld" , |
1213 | ((bp->b_flags & B_READ) == 0) ? "write" : "read" , |
1214 | sdp->swd_drumoffset, bn, bp->b_bcount); |
1215 | |
1216 | /* |
1217 | * for block devices we finish up here. |
1218 | * for regular files we have to do more work which we delegate |
1219 | * to sw_reg_strategy(). |
1220 | */ |
1221 | |
1222 | vp = sdp->swd_vp; /* swapdev vnode pointer */ |
1223 | switch (vp->v_type) { |
1224 | default: |
1225 | panic("%s: vnode type 0x%x" , __func__, vp->v_type); |
1226 | |
1227 | case VBLK: |
1228 | |
1229 | /* |
1230 | * must convert "bp" from an I/O on /dev/drum to an I/O |
1231 | * on the swapdev (sdp). |
1232 | */ |
1233 | bp->b_blkno = bn; /* swapdev block number */ |
1234 | bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ |
1235 | |
1236 | /* |
1237 | * if we are doing a write, we have to redirect the i/o on |
1238 | * drum's v_numoutput counter to the swapdevs. |
1239 | */ |
1240 | if ((bp->b_flags & B_READ) == 0) { |
1241 | mutex_enter(bp->b_objlock); |
1242 | vwakeup(bp); /* kills one 'v_numoutput' on drum */ |
1243 | mutex_exit(bp->b_objlock); |
1244 | mutex_enter(vp->v_interlock); |
1245 | vp->v_numoutput++; /* put it on swapdev */ |
1246 | mutex_exit(vp->v_interlock); |
1247 | } |
1248 | |
1249 | /* |
1250 | * finally plug in swapdev vnode and start I/O |
1251 | */ |
1252 | bp->b_vp = vp; |
1253 | bp->b_objlock = vp->v_interlock; |
1254 | VOP_STRATEGY(vp, bp); |
1255 | return; |
1256 | |
1257 | case VREG: |
1258 | /* |
1259 | * delegate to sw_reg_strategy function. |
1260 | */ |
1261 | sw_reg_strategy(sdp, bp, bn); |
1262 | return; |
1263 | } |
1264 | /* NOTREACHED */ |
1265 | } |
1266 | |
1267 | /* |
1268 | * swread: the read function for the drum (just a call to physio) |
1269 | */ |
1270 | /*ARGSUSED*/ |
1271 | static int |
1272 | swread(dev_t dev, struct uio *uio, int ioflag) |
1273 | { |
1274 | UVMHIST_FUNC("swread" ); UVMHIST_CALLED(pdhist); |
1275 | |
1276 | UVMHIST_LOG(pdhist, " dev=%x offset=%qx" , dev, uio->uio_offset, 0, 0); |
1277 | return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); |
1278 | } |
1279 | |
1280 | /* |
1281 | * swwrite: the write function for the drum (just a call to physio) |
1282 | */ |
1283 | /*ARGSUSED*/ |
1284 | static int |
1285 | swwrite(dev_t dev, struct uio *uio, int ioflag) |
1286 | { |
1287 | UVMHIST_FUNC("swwrite" ); UVMHIST_CALLED(pdhist); |
1288 | |
1289 | UVMHIST_LOG(pdhist, " dev=%x offset=%qx" , dev, uio->uio_offset, 0, 0); |
1290 | return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); |
1291 | } |
1292 | |
1293 | const struct bdevsw swap_bdevsw = { |
1294 | .d_open = nullopen, |
1295 | .d_close = nullclose, |
1296 | .d_strategy = swstrategy, |
1297 | .d_ioctl = noioctl, |
1298 | .d_dump = nodump, |
1299 | .d_psize = nosize, |
1300 | .d_discard = nodiscard, |
1301 | .d_flag = D_OTHER |
1302 | }; |
1303 | |
1304 | const struct cdevsw swap_cdevsw = { |
1305 | .d_open = nullopen, |
1306 | .d_close = nullclose, |
1307 | .d_read = swread, |
1308 | .d_write = swwrite, |
1309 | .d_ioctl = noioctl, |
1310 | .d_stop = nostop, |
1311 | .d_tty = notty, |
1312 | .d_poll = nopoll, |
1313 | .d_mmap = nommap, |
1314 | .d_kqfilter = nokqfilter, |
1315 | .d_discard = nodiscard, |
1316 | .d_flag = D_OTHER, |
1317 | }; |
1318 | |
1319 | /* |
1320 | * sw_reg_strategy: handle swap i/o to regular files |
1321 | */ |
1322 | static void |
1323 | sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) |
1324 | { |
1325 | struct vnode *vp; |
1326 | struct vndxfer *vnx; |
1327 | daddr_t nbn; |
1328 | char *addr; |
1329 | off_t byteoff; |
1330 | int s, off, nra, error, sz, resid; |
1331 | UVMHIST_FUNC("sw_reg_strategy" ); UVMHIST_CALLED(pdhist); |
1332 | |
1333 | /* |
1334 | * allocate a vndxfer head for this transfer and point it to |
1335 | * our buffer. |
1336 | */ |
1337 | vnx = pool_get(&vndxfer_pool, PR_WAITOK); |
1338 | vnx->vx_flags = VX_BUSY; |
1339 | vnx->vx_error = 0; |
1340 | vnx->vx_pending = 0; |
1341 | vnx->vx_bp = bp; |
1342 | vnx->vx_sdp = sdp; |
1343 | |
1344 | /* |
1345 | * setup for main loop where we read filesystem blocks into |
1346 | * our buffer. |
1347 | */ |
1348 | error = 0; |
1349 | bp->b_resid = bp->b_bcount; /* nothing transfered yet! */ |
1350 | addr = bp->b_data; /* current position in buffer */ |
1351 | byteoff = dbtob((uint64_t)bn); |
1352 | |
1353 | for (resid = bp->b_resid; resid; resid -= sz) { |
1354 | struct vndbuf *nbp; |
1355 | |
1356 | /* |
1357 | * translate byteoffset into block number. return values: |
1358 | * vp = vnode of underlying device |
1359 | * nbn = new block number (on underlying vnode dev) |
1360 | * nra = num blocks we can read-ahead (excludes requested |
1361 | * block) |
1362 | */ |
1363 | nra = 0; |
1364 | error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, |
1365 | &vp, &nbn, &nra); |
1366 | |
1367 | if (error == 0 && nbn == (daddr_t)-1) { |
1368 | /* |
1369 | * this used to just set error, but that doesn't |
1370 | * do the right thing. Instead, it causes random |
1371 | * memory errors. The panic() should remain until |
1372 | * this condition doesn't destabilize the system. |
1373 | */ |
1374 | #if 1 |
1375 | panic("%s: swap to sparse file" , __func__); |
1376 | #else |
1377 | error = EIO; /* failure */ |
1378 | #endif |
1379 | } |
1380 | |
1381 | /* |
1382 | * punt if there was an error or a hole in the file. |
1383 | * we must wait for any i/o ops we have already started |
1384 | * to finish before returning. |
1385 | * |
1386 | * XXX we could deal with holes here but it would be |
1387 | * a hassle (in the write case). |
1388 | */ |
1389 | if (error) { |
1390 | s = splbio(); |
1391 | vnx->vx_error = error; /* pass error up */ |
1392 | goto out; |
1393 | } |
1394 | |
1395 | /* |
1396 | * compute the size ("sz") of this transfer (in bytes). |
1397 | */ |
1398 | off = byteoff % sdp->swd_bsize; |
1399 | sz = (1 + nra) * sdp->swd_bsize - off; |
1400 | if (sz > resid) |
1401 | sz = resid; |
1402 | |
1403 | UVMHIST_LOG(pdhist, "sw_reg_strategy: " |
1404 | "vp %p/%p offset 0x%x/0x%x" , |
1405 | sdp->swd_vp, vp, byteoff, nbn); |
1406 | |
1407 | /* |
1408 | * now get a buf structure. note that the vb_buf is |
1409 | * at the front of the nbp structure so that you can |
1410 | * cast pointers between the two structure easily. |
1411 | */ |
1412 | nbp = pool_get(&vndbuf_pool, PR_WAITOK); |
1413 | buf_init(&nbp->vb_buf); |
1414 | nbp->vb_buf.b_flags = bp->b_flags; |
1415 | nbp->vb_buf.b_cflags = bp->b_cflags; |
1416 | nbp->vb_buf.b_oflags = bp->b_oflags; |
1417 | nbp->vb_buf.b_bcount = sz; |
1418 | nbp->vb_buf.b_bufsize = sz; |
1419 | nbp->vb_buf.b_error = 0; |
1420 | nbp->vb_buf.b_data = addr; |
1421 | nbp->vb_buf.b_lblkno = 0; |
1422 | nbp->vb_buf.b_blkno = nbn + btodb(off); |
1423 | nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; |
1424 | nbp->vb_buf.b_iodone = sw_reg_biodone; |
1425 | nbp->vb_buf.b_vp = vp; |
1426 | nbp->vb_buf.b_objlock = vp->v_interlock; |
1427 | if (vp->v_type == VBLK) { |
1428 | nbp->vb_buf.b_dev = vp->v_rdev; |
1429 | } |
1430 | |
1431 | nbp->vb_xfer = vnx; /* patch it back in to vnx */ |
1432 | |
1433 | /* |
1434 | * Just sort by block number |
1435 | */ |
1436 | s = splbio(); |
1437 | if (vnx->vx_error != 0) { |
1438 | buf_destroy(&nbp->vb_buf); |
1439 | pool_put(&vndbuf_pool, nbp); |
1440 | goto out; |
1441 | } |
1442 | vnx->vx_pending++; |
1443 | |
1444 | /* sort it in and start I/O if we are not over our limit */ |
1445 | /* XXXAD locking */ |
1446 | bufq_put(sdp->swd_tab, &nbp->vb_buf); |
1447 | sw_reg_start(sdp); |
1448 | splx(s); |
1449 | |
1450 | /* |
1451 | * advance to the next I/O |
1452 | */ |
1453 | byteoff += sz; |
1454 | addr += sz; |
1455 | } |
1456 | |
1457 | s = splbio(); |
1458 | |
1459 | out: /* Arrive here at splbio */ |
1460 | vnx->vx_flags &= ~VX_BUSY; |
1461 | if (vnx->vx_pending == 0) { |
1462 | error = vnx->vx_error; |
1463 | pool_put(&vndxfer_pool, vnx); |
1464 | bp->b_error = error; |
1465 | biodone(bp); |
1466 | } |
1467 | splx(s); |
1468 | } |
1469 | |
1470 | /* |
1471 | * sw_reg_start: start an I/O request on the requested swapdev |
1472 | * |
1473 | * => reqs are sorted by b_rawblkno (above) |
1474 | */ |
1475 | static void |
1476 | sw_reg_start(struct swapdev *sdp) |
1477 | { |
1478 | struct buf *bp; |
1479 | struct vnode *vp; |
1480 | UVMHIST_FUNC("sw_reg_start" ); UVMHIST_CALLED(pdhist); |
1481 | |
1482 | /* recursion control */ |
1483 | if ((sdp->swd_flags & SWF_BUSY) != 0) |
1484 | return; |
1485 | |
1486 | sdp->swd_flags |= SWF_BUSY; |
1487 | |
1488 | while (sdp->swd_active < sdp->swd_maxactive) { |
1489 | bp = bufq_get(sdp->swd_tab); |
1490 | if (bp == NULL) |
1491 | break; |
1492 | sdp->swd_active++; |
1493 | |
1494 | UVMHIST_LOG(pdhist, |
1495 | "sw_reg_start: bp %p vp %p blkno %p cnt %lx" , |
1496 | bp, bp->b_vp, bp->b_blkno, bp->b_bcount); |
1497 | vp = bp->b_vp; |
1498 | KASSERT(bp->b_objlock == vp->v_interlock); |
1499 | if ((bp->b_flags & B_READ) == 0) { |
1500 | mutex_enter(vp->v_interlock); |
1501 | vp->v_numoutput++; |
1502 | mutex_exit(vp->v_interlock); |
1503 | } |
1504 | VOP_STRATEGY(vp, bp); |
1505 | } |
1506 | sdp->swd_flags &= ~SWF_BUSY; |
1507 | } |
1508 | |
1509 | /* |
1510 | * sw_reg_biodone: one of our i/o's has completed |
1511 | */ |
1512 | static void |
1513 | sw_reg_biodone(struct buf *bp) |
1514 | { |
1515 | workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); |
1516 | } |
1517 | |
1518 | /* |
1519 | * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup |
1520 | * |
1521 | * => note that we can recover the vndbuf struct by casting the buf ptr |
1522 | */ |
1523 | static void |
1524 | sw_reg_iodone(struct work *wk, void *dummy) |
1525 | { |
1526 | struct vndbuf *vbp = (void *)wk; |
1527 | struct vndxfer *vnx = vbp->vb_xfer; |
1528 | struct buf *pbp = vnx->vx_bp; /* parent buffer */ |
1529 | struct swapdev *sdp = vnx->vx_sdp; |
1530 | int s, resid, error; |
1531 | KASSERT(&vbp->vb_buf.b_work == wk); |
1532 | UVMHIST_FUNC("sw_reg_iodone" ); UVMHIST_CALLED(pdhist); |
1533 | |
1534 | UVMHIST_LOG(pdhist, " vbp=%p vp=%p blkno=%x addr=%p" , |
1535 | vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data); |
1536 | UVMHIST_LOG(pdhist, " cnt=%lx resid=%lx" , |
1537 | vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); |
1538 | |
1539 | /* |
1540 | * protect vbp at splbio and update. |
1541 | */ |
1542 | |
1543 | s = splbio(); |
1544 | resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; |
1545 | pbp->b_resid -= resid; |
1546 | vnx->vx_pending--; |
1547 | |
1548 | if (vbp->vb_buf.b_error != 0) { |
1549 | /* pass error upward */ |
1550 | error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO; |
1551 | UVMHIST_LOG(pdhist, " got error=%d !" , error, 0, 0, 0); |
1552 | vnx->vx_error = error; |
1553 | } |
1554 | |
1555 | /* |
1556 | * kill vbp structure |
1557 | */ |
1558 | buf_destroy(&vbp->vb_buf); |
1559 | pool_put(&vndbuf_pool, vbp); |
1560 | |
1561 | /* |
1562 | * wrap up this transaction if it has run to completion or, in |
1563 | * case of an error, when all auxiliary buffers have returned. |
1564 | */ |
1565 | if (vnx->vx_error != 0) { |
1566 | /* pass error upward */ |
1567 | error = vnx->vx_error; |
1568 | if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { |
1569 | pbp->b_error = error; |
1570 | biodone(pbp); |
1571 | pool_put(&vndxfer_pool, vnx); |
1572 | } |
1573 | } else if (pbp->b_resid == 0) { |
1574 | KASSERT(vnx->vx_pending == 0); |
1575 | if ((vnx->vx_flags & VX_BUSY) == 0) { |
1576 | UVMHIST_LOG(pdhist, " iodone error=%d !" , |
1577 | pbp, vnx->vx_error, 0, 0); |
1578 | biodone(pbp); |
1579 | pool_put(&vndxfer_pool, vnx); |
1580 | } |
1581 | } |
1582 | |
1583 | /* |
1584 | * done! start next swapdev I/O if one is pending |
1585 | */ |
1586 | sdp->swd_active--; |
1587 | sw_reg_start(sdp); |
1588 | splx(s); |
1589 | } |
1590 | |
1591 | |
1592 | /* |
1593 | * uvm_swap_alloc: allocate space on swap |
1594 | * |
1595 | * => allocation is done "round robin" down the priority list, as we |
1596 | * allocate in a priority we "rotate" the circle queue. |
1597 | * => space can be freed with uvm_swap_free |
1598 | * => we return the page slot number in /dev/drum (0 == invalid slot) |
1599 | * => we lock uvm_swap_data_lock |
1600 | * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM |
1601 | */ |
1602 | int |
1603 | uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) |
1604 | { |
1605 | struct swapdev *sdp; |
1606 | struct swappri *spp; |
1607 | UVMHIST_FUNC("uvm_swap_alloc" ); UVMHIST_CALLED(pdhist); |
1608 | |
1609 | /* |
1610 | * no swap devices configured yet? definite failure. |
1611 | */ |
1612 | if (uvmexp.nswapdev < 1) |
1613 | return 0; |
1614 | |
1615 | /* |
1616 | * XXXJAK: BEGIN HACK |
1617 | * |
1618 | * blist_alloc() in subr_blist.c will panic if we try to allocate |
1619 | * too many slots. |
1620 | */ |
1621 | if (*nslots > BLIST_MAX_ALLOC) { |
1622 | if (__predict_false(lessok == false)) |
1623 | return 0; |
1624 | *nslots = BLIST_MAX_ALLOC; |
1625 | } |
1626 | /* XXXJAK: END HACK */ |
1627 | |
1628 | /* |
1629 | * lock data lock, convert slots into blocks, and enter loop |
1630 | */ |
1631 | mutex_enter(&uvm_swap_data_lock); |
1632 | |
1633 | ReTry: /* XXXMRG */ |
1634 | LIST_FOREACH(spp, &swap_priority, spi_swappri) { |
1635 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { |
1636 | uint64_t result; |
1637 | |
1638 | /* if it's not enabled, then we can't swap from it */ |
1639 | if ((sdp->swd_flags & SWF_ENABLE) == 0) |
1640 | continue; |
1641 | if (sdp->swd_npginuse + *nslots > sdp->swd_npages) |
1642 | continue; |
1643 | result = blist_alloc(sdp->swd_blist, *nslots); |
1644 | if (result == BLIST_NONE) { |
1645 | continue; |
1646 | } |
1647 | KASSERT(result < sdp->swd_drumsize); |
1648 | |
1649 | /* |
1650 | * successful allocation! now rotate the tailq. |
1651 | */ |
1652 | TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); |
1653 | TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); |
1654 | sdp->swd_npginuse += *nslots; |
1655 | uvmexp.swpginuse += *nslots; |
1656 | mutex_exit(&uvm_swap_data_lock); |
1657 | /* done! return drum slot number */ |
1658 | UVMHIST_LOG(pdhist, |
1659 | "success! returning %d slots starting at %d" , |
1660 | *nslots, result + sdp->swd_drumoffset, 0, 0); |
1661 | return (result + sdp->swd_drumoffset); |
1662 | } |
1663 | } |
1664 | |
1665 | /* XXXMRG: BEGIN HACK */ |
1666 | if (*nslots > 1 && lessok) { |
1667 | *nslots = 1; |
1668 | /* XXXMRG: ugh! blist should support this for us */ |
1669 | goto ReTry; |
1670 | } |
1671 | /* XXXMRG: END HACK */ |
1672 | |
1673 | mutex_exit(&uvm_swap_data_lock); |
1674 | return 0; |
1675 | } |
1676 | |
1677 | /* |
1678 | * uvm_swapisfull: return true if most of available swap is allocated |
1679 | * and in use. we don't count some small portion as it may be inaccessible |
1680 | * to us at any given moment, for example if there is lock contention or if |
1681 | * pages are busy. |
1682 | */ |
1683 | bool |
1684 | uvm_swapisfull(void) |
1685 | { |
1686 | int swpgonly; |
1687 | bool rv; |
1688 | |
1689 | mutex_enter(&uvm_swap_data_lock); |
1690 | KASSERT(uvmexp.swpgonly <= uvmexp.swpages); |
1691 | swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / |
1692 | uvm_swapisfull_factor); |
1693 | rv = (swpgonly >= uvmexp.swpgavail); |
1694 | mutex_exit(&uvm_swap_data_lock); |
1695 | |
1696 | return (rv); |
1697 | } |
1698 | |
1699 | /* |
1700 | * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors |
1701 | * |
1702 | * => we lock uvm_swap_data_lock |
1703 | */ |
1704 | void |
1705 | uvm_swap_markbad(int startslot, int nslots) |
1706 | { |
1707 | struct swapdev *sdp; |
1708 | UVMHIST_FUNC("uvm_swap_markbad" ); UVMHIST_CALLED(pdhist); |
1709 | |
1710 | mutex_enter(&uvm_swap_data_lock); |
1711 | sdp = swapdrum_getsdp(startslot); |
1712 | KASSERT(sdp != NULL); |
1713 | |
1714 | /* |
1715 | * we just keep track of how many pages have been marked bad |
1716 | * in this device, to make everything add up in swap_off(). |
1717 | * we assume here that the range of slots will all be within |
1718 | * one swap device. |
1719 | */ |
1720 | |
1721 | KASSERT(uvmexp.swpgonly >= nslots); |
1722 | uvmexp.swpgonly -= nslots; |
1723 | sdp->swd_npgbad += nslots; |
1724 | UVMHIST_LOG(pdhist, "now %d bad" , sdp->swd_npgbad, 0,0,0); |
1725 | mutex_exit(&uvm_swap_data_lock); |
1726 | } |
1727 | |
1728 | /* |
1729 | * uvm_swap_free: free swap slots |
1730 | * |
1731 | * => this can be all or part of an allocation made by uvm_swap_alloc |
1732 | * => we lock uvm_swap_data_lock |
1733 | */ |
1734 | void |
1735 | uvm_swap_free(int startslot, int nslots) |
1736 | { |
1737 | struct swapdev *sdp; |
1738 | UVMHIST_FUNC("uvm_swap_free" ); UVMHIST_CALLED(pdhist); |
1739 | |
1740 | UVMHIST_LOG(pdhist, "freeing %d slots starting at %d" , nslots, |
1741 | startslot, 0, 0); |
1742 | |
1743 | /* |
1744 | * ignore attempts to free the "bad" slot. |
1745 | */ |
1746 | |
1747 | if (startslot == SWSLOT_BAD) { |
1748 | return; |
1749 | } |
1750 | |
1751 | /* |
1752 | * convert drum slot offset back to sdp, free the blocks |
1753 | * in the extent, and return. must hold pri lock to do |
1754 | * lookup and access the extent. |
1755 | */ |
1756 | |
1757 | mutex_enter(&uvm_swap_data_lock); |
1758 | sdp = swapdrum_getsdp(startslot); |
1759 | KASSERT(uvmexp.nswapdev >= 1); |
1760 | KASSERT(sdp != NULL); |
1761 | KASSERT(sdp->swd_npginuse >= nslots); |
1762 | blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); |
1763 | sdp->swd_npginuse -= nslots; |
1764 | uvmexp.swpginuse -= nslots; |
1765 | mutex_exit(&uvm_swap_data_lock); |
1766 | } |
1767 | |
1768 | /* |
1769 | * uvm_swap_put: put any number of pages into a contig place on swap |
1770 | * |
1771 | * => can be sync or async |
1772 | */ |
1773 | |
1774 | int |
1775 | uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) |
1776 | { |
1777 | int error; |
1778 | |
1779 | error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | |
1780 | ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); |
1781 | return error; |
1782 | } |
1783 | |
1784 | /* |
1785 | * uvm_swap_get: get a single page from swap |
1786 | * |
1787 | * => usually a sync op (from fault) |
1788 | */ |
1789 | |
1790 | int |
1791 | uvm_swap_get(struct vm_page *page, int swslot, int flags) |
1792 | { |
1793 | int error; |
1794 | |
1795 | uvmexp.nswget++; |
1796 | KASSERT(flags & PGO_SYNCIO); |
1797 | if (swslot == SWSLOT_BAD) { |
1798 | return EIO; |
1799 | } |
1800 | |
1801 | error = uvm_swap_io(&page, swslot, 1, B_READ | |
1802 | ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); |
1803 | if (error == 0) { |
1804 | |
1805 | /* |
1806 | * this page is no longer only in swap. |
1807 | */ |
1808 | |
1809 | mutex_enter(&uvm_swap_data_lock); |
1810 | KASSERT(uvmexp.swpgonly > 0); |
1811 | uvmexp.swpgonly--; |
1812 | mutex_exit(&uvm_swap_data_lock); |
1813 | } |
1814 | return error; |
1815 | } |
1816 | |
1817 | /* |
1818 | * uvm_swap_io: do an i/o operation to swap |
1819 | */ |
1820 | |
1821 | static int |
1822 | uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) |
1823 | { |
1824 | daddr_t startblk; |
1825 | struct buf *bp; |
1826 | vaddr_t kva; |
1827 | int error, mapinflags; |
1828 | bool write, async; |
1829 | UVMHIST_FUNC("uvm_swap_io" ); UVMHIST_CALLED(pdhist); |
1830 | |
1831 | UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d" , |
1832 | startslot, npages, flags, 0); |
1833 | |
1834 | write = (flags & B_READ) == 0; |
1835 | async = (flags & B_ASYNC) != 0; |
1836 | |
1837 | /* |
1838 | * allocate a buf for the i/o. |
1839 | */ |
1840 | |
1841 | KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async)); |
1842 | bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp); |
1843 | if (bp == NULL) { |
1844 | uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); |
1845 | return ENOMEM; |
1846 | } |
1847 | |
1848 | /* |
1849 | * convert starting drum slot to block number |
1850 | */ |
1851 | |
1852 | startblk = btodb((uint64_t)startslot << PAGE_SHIFT); |
1853 | |
1854 | /* |
1855 | * first, map the pages into the kernel. |
1856 | */ |
1857 | |
1858 | mapinflags = !write ? |
1859 | UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : |
1860 | UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; |
1861 | kva = uvm_pagermapin(pps, npages, mapinflags); |
1862 | |
1863 | /* |
1864 | * fill in the bp/sbp. we currently route our i/o through |
1865 | * /dev/drum's vnode [swapdev_vp]. |
1866 | */ |
1867 | |
1868 | bp->b_cflags = BC_BUSY | BC_NOCACHE; |
1869 | bp->b_flags = (flags & (B_READ|B_ASYNC)); |
1870 | bp->b_proc = &proc0; /* XXX */ |
1871 | bp->b_vnbufs.le_next = NOLIST; |
1872 | bp->b_data = (void *)kva; |
1873 | bp->b_blkno = startblk; |
1874 | bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; |
1875 | |
1876 | /* |
1877 | * bump v_numoutput (counter of number of active outputs). |
1878 | */ |
1879 | |
1880 | if (write) { |
1881 | mutex_enter(swapdev_vp->v_interlock); |
1882 | swapdev_vp->v_numoutput++; |
1883 | mutex_exit(swapdev_vp->v_interlock); |
1884 | } |
1885 | |
1886 | /* |
1887 | * for async ops we must set up the iodone handler. |
1888 | */ |
1889 | |
1890 | if (async) { |
1891 | bp->b_iodone = uvm_aio_biodone; |
1892 | UVMHIST_LOG(pdhist, "doing async!" , 0, 0, 0, 0); |
1893 | if (curlwp == uvm.pagedaemon_lwp) |
1894 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
1895 | else |
1896 | BIO_SETPRIO(bp, BPRIO_TIMELIMITED); |
1897 | } else { |
1898 | bp->b_iodone = NULL; |
1899 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
1900 | } |
1901 | UVMHIST_LOG(pdhist, |
1902 | "about to start io: data = %p blkno = 0x%x, bcount = %ld" , |
1903 | bp->b_data, bp->b_blkno, bp->b_bcount, 0); |
1904 | |
1905 | /* |
1906 | * now we start the I/O, and if async, return. |
1907 | */ |
1908 | |
1909 | VOP_STRATEGY(swapdev_vp, bp); |
1910 | if (async) |
1911 | return 0; |
1912 | |
1913 | /* |
1914 | * must be sync i/o. wait for it to finish |
1915 | */ |
1916 | |
1917 | error = biowait(bp); |
1918 | |
1919 | /* |
1920 | * kill the pager mapping |
1921 | */ |
1922 | |
1923 | uvm_pagermapout(kva, npages); |
1924 | |
1925 | /* |
1926 | * now dispose of the buf and we're done. |
1927 | */ |
1928 | |
1929 | if (write) { |
1930 | mutex_enter(swapdev_vp->v_interlock); |
1931 | vwakeup(bp); |
1932 | mutex_exit(swapdev_vp->v_interlock); |
1933 | } |
1934 | putiobuf(bp); |
1935 | UVMHIST_LOG(pdhist, "<- done (sync) error=%d" , error, 0, 0, 0); |
1936 | |
1937 | return (error); |
1938 | } |
1939 | |