uvm_swap.c source code [src/src/sys/uvm/uvm_swap.c]

/*	$NetBSD: uvm_swap.c,v 1.174 2016/07/08 06:45:34 skrll Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

32	#include <sys/cdefs.h>
33	__KERNEL_RCSID(`0`, "$NetBSD: uvm_swap.c,v 1.174 2016/07/08 06:45:34 skrll Exp $");
34
35	#include "opt_uvmhist.h"
36	#include "opt_compat_netbsd.h"
37	#include "opt_ddb.h"
38
39	#include <sys/param.h>
40	#include <sys/systm.h>
41	#include <sys/buf.h>
42	#include <sys/bufq.h>
43	#include <sys/conf.h>
44	#include <sys/proc.h>
45	#include <sys/namei.h>
46	#include <sys/disklabel.h>
47	#include <sys/errno.h>
48	#include <sys/kernel.h>
49	#include <sys/vnode.h>
50	#include <sys/file.h>
51	#include <sys/vmem.h>
52	#include <sys/blist.h>
53	#include <sys/mount.h>
54	#include <sys/pool.h>
55	#include <sys/kmem.h>
56	#include <sys/syscallargs.h>
57	#include <sys/swap.h>
58	#include <sys/kauth.h>
59	#include <sys/sysctl.h>
60	#include <sys/workqueue.h>
61
62	#include <uvm/uvm.h>
63
64	#include <miscfs/specfs/specdev.h>
65
/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array. The actual work is done
 *	in the uvm_swap_stats() function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

118	/*
119	* swapdev: describes a single swap partition/file
120	*
121	* note the following should be true:
122	* swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
123	* swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
124	*/
125	struct swapdev {
126	dev_t swd_dev; / device id /
127	int swd_flags; / flags:inuse/enable/fake /
128	int swd_priority; / our priority /
129	int swd_nblks; / blocks in this device /
130	char swd_path; /* saved pathname of device /
131	int swd_pathlen; / length of pathname /
132	int swd_npages; / #pages we can use /
133	int swd_npginuse; / #pages in use /
134	int swd_npgbad; / #pages bad /
135	int swd_drumoffset; / page0 offset in drum /
136	int swd_drumsize; / #pages in drum /
137	blist_t swd_blist; / blist for this swapdev /
138	struct vnode swd_vp; /* backing vnode /
139	TAILQ_ENTRY(swapdev) swd_next; / priority tailq /
140
141	int swd_bsize; / blocksize (bytes) /
142	int swd_maxactive; / max active i/o reqs /
143	struct bufq_state swd_tab; /* buffer list /
144	int swd_active; / number of active buffers /
145	};
146
147	/*
148	* swap device priority entry; the list is kept sorted on `spi_priority'.
149	*/
150	struct swappri {
151	int spi_priority; / priority /
152	TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
153	/ tailq of swapdevs at this priority /
154	LIST_ENTRY(swappri) spi_swappri; / global list of pri's /
155	};
156
157	/*
158	* The following two structures are used to keep track of data transfers
159	* on swap devices associated with regular files.
160	* NOTE: this code is more or less a copy of vnd.c; we use the same
161	* structure names here to ease porting..
162	*/
163	struct vndxfer {
164	struct buf vx_bp; /* Pointer to parent buffer /
165	struct swapdev *vx_sdp;
166	int vx_error;
167	int vx_pending; / # of pending aux buffers /
168	int vx_flags;
169	#define VX_BUSY 1
170	#define VX_DEAD 2
171	};
172
173	struct vndbuf {
174	struct buf vb_buf;
175	struct vndxfer *vb_xfer;
176	};
177
/*
 * NetBSD 1.3 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 * dev_t and has no se_path[] member.
 */
struct swapent13 {
	int32_t	se13_dev;		/* device id */
	int	se13_flags;		/* flags */
	int	se13_nblks;		/* total blocks */
	int	se13_inuse;		/* blocks in use */
	int	se13_priority;		/* priority of this device */
};

190	/*
191	* NetBSD 5.0 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
192	* dev_t.
193	*/
194	struct swapent50 {
195	int32_t se50_dev; / device id /
196	int se50_flags; / flags /
197	int se50_nblks; / total blocks /
198	int se50_inuse; / blocks in use /
199	int se50_priority; / priority of this device /
200	char se50_path[PATH_MAX+`1`]; / path name /
201	};
202
203	/*
204	* We keep a of pool vndbuf's and vndxfer structures.
205	*/
206	static struct pool vndxfer_pool, vndbuf_pool;
207
208	/*
209	* local variables
210	*/
211	static vmem_t swapmap; /* controls the mapping of /dev/drum /
212
213	/ list of all active swap devices [by priority] /
214	LIST_HEAD(swap_priority, swappri);
215	static struct swap_priority swap_priority;
216
217	/ locks /
218	static krwlock_t swap_syscall_lock;
219
220	/ workqueue and use counter for swap to regular files /
221	static int sw_reg_count = `0`;
222	static struct workqueue *sw_reg_workqueue;
223
224	/ tuneables /
225	u_int uvm_swapisfull_factor = `99`;
226
/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

/* NOTE(review): the sw_reg_* and uvm_swap_io definitions should be
 * checked against these prototypes; pointer levels were reconstructed. */
static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

247	/*
248	* uvm_swap_init: init the swap system data structures and locks
249	*
250	* => called at boot time from init_main.c after the filesystems
251	* are brought up (which happens after uvm_init())
252	*/
253	void
254	uvm_swap_init(void)
255	{
256	UVMHIST_FUNC("uvm_swap_init");
257
258	UVMHIST_CALLED(pdhist);
259	/*
260	* first, init the swap list, its counter, and its lock.
261	* then get a handle on the vnode for /dev/drum by using
262	* the its dev_t number ("swapdev", from MD conf.c).
263	*/
264
265	LIST_INIT(&swap_priority);
266	uvmexp.nswapdev = `0`;
267	rw_init(&swap_syscall_lock);
268	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
269
270	if (bdevvp(swapdev, &swapdev_vp))
271	panic("%s: can't get vnode for swap device", __func__);
272	if (vn_lock(swapdev_vp, LK_EXCLUSIVE \| LK_RETRY))
273	panic("%s: can't lock swap device", __func__);
274	if (VOP_OPEN(swapdev_vp, FREAD \| FWRITE, NOCRED))
275	panic("%s: can't open swap device", __func__);
276	VOP_UNLOCK(swapdev_vp);
277
278	/*
279	* create swap block resource map to map /dev/drum. the range
280	* from 1 to INT_MAX allows 2 gigablocks of swap space. note
281	* that block 0 is reserved (used to indicate an allocation
282	* failure, or no allocation).
283	*/
284	swapmap = vmem_create("swapmap", `1`, INT_MAX - `1`, `1`, NULL, NULL, NULL, `0`,
285	VM_NOSLEEP, IPL_NONE);
286	if (swapmap == `0`) {
287	panic("%s: vmem_create failed", __func__);
288	}
289
290	pool_init(&vndxfer_pool, sizeof(struct vndxfer), `0`, `0`, `0`, "swp vnx",
291	NULL, IPL_BIO);
292	pool_init(&vndbuf_pool, sizeof(struct vndbuf), `0`, `0`, `0`, "swp vnd",
293	NULL, IPL_BIO);
294
295	UVMHIST_LOG(pdhist, "<- done", `0`, `0`, `0`, `0`);
296	}
297
298	/*
299	* swaplist functions: functions that operate on the list of swap
300	* devices on the system.
301	*/
302
303	/*
304	* swaplist_insert: insert swap device "sdp" into the global list
305	*
306	* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
307	* => caller must provide a newly allocated swappri structure (we will
308	* FREE it if we don't need it... this it to prevent allocation
309	* blocking here while adding swap)
310	*/
311	static void
312	swaplist_insert(struct swapdev sdp, struct* swappri newspp, int* priority)
313	{
314	struct swappri spp, pspp;
315	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
316
317	/*
318	* find entry at or after which to insert the new device.
319	*/
320	pspp = NULL;
321	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
322	if (priority <= spp->spi_priority)
323	break;
324	pspp = spp;
325	}
326
327	/*
328	* new priority?
329	*/
330	if (spp == NULL \|\| spp->spi_priority != priority) {
331	spp = newspp; / use newspp! /
332	UVMHIST_LOG(pdhist, "created new swappri = %d",
333	priority, `0`, `0`, `0`);
334
335	spp->spi_priority = priority;
336	TAILQ_INIT(&spp->spi_swapdev);
337
338	if (pspp)
339	LIST_INSERT_AFTER(pspp, spp, spi_swappri);
340	else
341	LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
342	} else {
343	/ we don't need a new priority structure, free it /
344	kmem_free(newspp, sizeof(*newspp));
345	}
346
347	/*
348	* priority found (or created). now insert on the priority's
349	* tailq list and bump the total number of swapdevs.
350	*/
351	sdp->swd_priority = priority;
352	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
353	uvmexp.nswapdev++;
354	}
355
356	/*
357	* swaplist_find: find and optionally remove a swap device from the
358	* global list.
359	*
360	* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
361	* => we return the swapdev we found (and removed)
362	*/
363	static struct swapdev *
364	swaplist_find(struct vnode *vp, bool remove)
365	{
366	struct swapdev *sdp;
367	struct swappri *spp;
368
369	/*
370	* search the lists for the requested vp
371	*/
372
373	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
374	TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
375	if (sdp->swd_vp == vp) {
376	if (remove) {
377	TAILQ_REMOVE(&spp->spi_swapdev,
378	sdp, swd_next);
379	uvmexp.nswapdev--;
380	}
381	return(sdp);
382	}
383	}
384	}
385	return (NULL);
386	}
387
388	/*
389	* swaplist_trim: scan priority list for empty priority entries and kill
390	* them.
391	*
392	* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
393	*/
394	static void
395	swaplist_trim(void)
396	{
397	struct swappri spp, nextspp;
398
399	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
400	if (!TAILQ_EMPTY(&spp->spi_swapdev))
401	continue;
402	LIST_REMOVE(spp, spi_swappri);
403	kmem_free(spp, sizeof(*spp));
404	}
405	}
406
407	/*
408	* swapdrum_getsdp: given a page offset in /dev/drum, convert it back
409	* to the "swapdev" that maps that section of the drum.
410	*
411	* => each swapdev takes one big contig chunk of the drum
412	* => caller must hold uvm_swap_data_lock
413	*/
414	static struct swapdev *
415	swapdrum_getsdp(int pgno)
416	{
417	struct swapdev *sdp;
418	struct swappri *spp;
419
420	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
421	TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
422	if (sdp->swd_flags & SWF_FAKE)
423	continue;
424	if (pgno >= sdp->swd_drumoffset &&
425	pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
426	return sdp;
427	}
428	}
429	}
430	return NULL;
431	}
432
433	void swapsys_lock(krw_t op)
434	{
435	rw_enter(&swap_syscall_lock, op);
436	}
437
438	void swapsys_unlock(void)
439	{
440	rw_exit(&swap_syscall_lock);
441	}
442
443	/*
444	* sys_swapctl: main entry point for swapctl(2) system call
445	* [with two helper functions: swap_on and swap_off]
446	*/
447	int
448	sys_swapctl(struct lwp l, const* struct sys_swapctl_args uap, register_t retval)
449	{
450	/ {*
451	syscallarg(int) cmd;
452	syscallarg(void ) arg;*
453	syscallarg(int) misc;
454	} /*
455	struct vnode *vp;
456	struct nameidata nd;
457	struct swappri *spp;
458	struct swapdev *sdp;
459	struct swapent *sep;
460	#define SWAP_PATH_MAX (PATH_MAX + 1)
461	char *userpath;
462	size_t len = `0`;
463	int error, misc;
464	int priority;
465	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
466
467	/*
468	* we handle the non-priv NSWAP and STATS request first.
469	*
470	* SWAP_NSWAP: return number of config'd swap devices
471	* [can also be obtained with uvmexp sysctl]
472	*/
473	if (SCARG(uap, cmd) == SWAP_NSWAP) {
474	const int nswapdev = uvmexp.nswapdev;
475	UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", nswapdev, `0`, `0`, `0`);
476	*retval = nswapdev;
477	return `0`;
478	}
479
480	misc = SCARG(uap, misc);
481	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
482
483	/*
484	* ensure serialized syscall access by grabbing the swap_syscall_lock
485	*/
486	rw_enter(&swap_syscall_lock, RW_WRITER);
487
488	/*
489	* SWAP_STATS: get stats on current # of configured swap devs
490	*
491	* note that the swap_priority list can't change as long
492	* as we are holding the swap_syscall_lock. we don't want
493	* to grab the uvm_swap_data_lock because we may fault&sleep during
494	* copyout() and we don't want to be holding that lock then!
495	*/
496	if (SCARG(uap, cmd) == SWAP_STATS
497	#if defined(COMPAT_50)
498	\|\| SCARG(uap, cmd) == SWAP_STATS50
499	#endif
500	#if defined(COMPAT_13)
501	\|\| SCARG(uap, cmd) == SWAP_STATS13
502	#endif
503	) {
504	if (misc < `0`) {
505	error = EINVAL;
506	goto out;
507	}
508	if (misc == `0` \|\| uvmexp.nswapdev == `0`) {
509	error = `0`;
510	goto out;
511	}
512	/ Make sure userland cannot exhaust kernel memory /
513	if ((size_t)misc > (size_t)uvmexp.nswapdev)
514	misc = uvmexp.nswapdev;
515	KASSERT(misc > `0`);
516	#if defined(COMPAT_13)
517	if (SCARG(uap, cmd) == SWAP_STATS13)
518	len = sizeof(struct swapent13) * misc;
519	else
520	#endif
521	#if defined(COMPAT_50)
522	if (SCARG(uap, cmd) == SWAP_STATS50)
523	len = sizeof(struct swapent50) * misc;
524	else
525	#endif
526	len = sizeof(struct swapent) * misc;
527	sep = (struct swapent *)kmem_alloc(len, KM_SLEEP);
528
529	uvm_swap_stats(SCARG(uap, cmd), sep, misc, retval);
530	error = copyout(sep, SCARG(uap, arg), len);
531
532	kmem_free(sep, len);
533	UVMHIST_LOG(pdhist, "<- done SWAP_STATS", `0`, `0`, `0`, `0`);
534	goto out;
535	}
536	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
537	dev_t devp = (dev_t )SCARG(uap, arg);
538
539	error = copyout(&dumpdev, devp, sizeof(dumpdev));
540	goto out;
541	}
542
543	/*
544	* all other requests require superuser privs. verify.
545	*/
546	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
547	`0`, NULL, NULL, NULL)))
548	goto out;
549
550	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
551	/ drop the current dump device /
552	dumpdev = NODEV;
553	dumpcdev = NODEV;
554	cpu_dumpconf();
555	goto out;
556	}
557
558	/*
559	* at this point we expect a path name in arg. we will
560	* use namei() to gain a vnode reference (vref), and lock
561	* the vnode (VOP_LOCK).
562	*
563	* XXX: a NULL arg means use the root vnode pointer (e.g. for
564	* miniroot)
565	*/
566	if (SCARG(uap, arg) == NULL) {
567	vp = rootvp; / miniroot /
568	vref(vp);
569	if (vn_lock(vp, LK_EXCLUSIVE)) {
570	vrele(vp);
571	error = EBUSY;
572	goto out;
573	}
574	if (SCARG(uap, cmd) == SWAP_ON &&
575	copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
576	panic("swapctl: miniroot copy failed");
577	} else {
578	struct pathbuf *pb;
579
580	/*
581	* This used to allow copying in one extra byte
582	* (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
583	* This was completely pointless because if anyone
584	* used that extra byte namei would fail with
585	* ENAMETOOLONG anyway, so I've removed the excess
586	* logic. - dholland 20100215
587	*/
588
589	error = pathbuf_copyin(SCARG(uap, arg), &pb);
590	if (error) {
591	goto out;
592	}
593	if (SCARG(uap, cmd) == SWAP_ON) {
594	/ get a copy of the string /
595	pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
596	len = strlen(userpath) + `1`;
597	}
598	NDINIT(&nd, LOOKUP, FOLLOW \| LOCKLEAF \| TRYEMULROOT, pb);
599	if ((error = namei(&nd))) {
600	pathbuf_destroy(pb);
601	goto out;
602	}
603	vp = nd.ni_vp;
604	pathbuf_destroy(pb);
605	}
606	/ note: "vp" is referenced and locked /
607
608	error = `0`; / assume no error /
609	switch(SCARG(uap, cmd)) {
610
611	case SWAP_DUMPDEV:
612	if (vp->v_type != VBLK) {
613	error = ENOTBLK;
614	break;
615	}
616	if (bdevsw_lookup(vp->v_rdev)) {
617	dumpdev = vp->v_rdev;
618	dumpcdev = devsw_blk2chr(dumpdev);
619	} else
620	dumpdev = NODEV;
621	cpu_dumpconf();
622	break;
623
624	case SWAP_CTL:
625	/*
626	* get new priority, remove old entry (if any) and then
627	* reinsert it in the correct place. finally, prune out
628	* any empty priority structures.
629	*/
630	priority = SCARG(uap, misc);
631	spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
632	mutex_enter(&uvm_swap_data_lock);
633	if ((sdp = swaplist_find(vp, true)) == NULL) {
634	error = ENOENT;
635	} else {
636	swaplist_insert(sdp, spp, priority);
637	swaplist_trim();
638	}
639	mutex_exit(&uvm_swap_data_lock);
640	if (error)
641	kmem_free(spp, sizeof(*spp));
642	break;
643
644	case SWAP_ON:
645
646	/*
647	* check for duplicates. if none found, then insert a
648	* dummy entry on the list to prevent someone else from
649	* trying to enable this device while we are working on
650	* it.
651	*/
652
653	priority = SCARG(uap, misc);
654	sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
655	spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
656	sdp->swd_flags = SWF_FAKE;
657	sdp->swd_vp = vp;
658	sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
659	bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
660	mutex_enter(&uvm_swap_data_lock);
661	if (swaplist_find(vp, false) != NULL) {
662	error = EBUSY;
663	mutex_exit(&uvm_swap_data_lock);
664	bufq_free(sdp->swd_tab);
665	kmem_free(sdp, sizeof(*sdp));
666	kmem_free(spp, sizeof(*spp));
667	break;
668	}
669	swaplist_insert(sdp, spp, priority);
670	mutex_exit(&uvm_swap_data_lock);
671
672	KASSERT(len > `0`);
673	sdp->swd_pathlen = len;
674	sdp->swd_path = kmem_alloc(len, KM_SLEEP);
675	if (copystr(userpath, sdp->swd_path, len, `0`) != `0`)
676	panic("swapctl: copystr");
677
678	/*
679	* we've now got a FAKE placeholder in the swap list.
680	* now attempt to enable swap on it. if we fail, undo
681	* what we've done and kill the fake entry we just inserted.
682	* if swap_on is a success, it will clear the SWF_FAKE flag
683	*/
684
685	if ((error = swap_on(l, sdp)) != `0`) {
686	mutex_enter(&uvm_swap_data_lock);
687	(void) swaplist_find(vp, true); / kill fake entry /
688	swaplist_trim();
689	mutex_exit(&uvm_swap_data_lock);
690	bufq_free(sdp->swd_tab);
691	kmem_free(sdp->swd_path, sdp->swd_pathlen);
692	kmem_free(sdp, sizeof(*sdp));
693	break;
694	}
695	break;
696
697	case SWAP_OFF:
698	mutex_enter(&uvm_swap_data_lock);
699	if ((sdp = swaplist_find(vp, false)) == NULL) {
700	mutex_exit(&uvm_swap_data_lock);
701	error = ENXIO;
702	break;
703	}
704
705	/*
706	* If a device isn't in use or enabled, we
707	* can't stop swapping from it (again).
708	*/
709	if ((sdp->swd_flags & (SWF_INUSE\|SWF_ENABLE)) == `0`) {
710	mutex_exit(&uvm_swap_data_lock);
711	error = EBUSY;
712	break;
713	}
714
715	/*
716	* do the real work.
717	*/
718	error = swap_off(l, sdp);
719	break;
720
721	default:
722	error = EINVAL;
723	}
724
725	/*
726	* done! release the ref gained by namei() and unlock.
727	*/
728	vput(vp);
729	out:
730	rw_exit(&swap_syscall_lock);
731	kmem_free(userpath, SWAP_PATH_MAX);
732
733	UVMHIST_LOG(pdhist, "<- done! error=%d", error, `0`, `0`, `0`);
734	return (error);
735	}
736
737	/*
738	* uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
739	* away from sys_swapctl() in order to allow COMPAT_* swapctl()
740	* emulation to use it directly without going through sys_swapctl().
741	* The problem with using sys_swapctl() there is that it involves
742	* copying the swapent array to the stackgap, and this array's size
743	* is not known at build time. Hence it would not be possible to
744	* ensure it would fit in the stackgap in any case.
745	*/
746	void
747	uvm_swap_stats(int cmd, struct swapent sep, int* sec, register_t *retval)
748	{
749	struct swappri *spp;
750	struct swapdev *sdp;
751	int count = `0`;
752
753	KASSERT(rw_lock_held(&swap_syscall_lock));
754
755	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
756	TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
757	int inuse;
758
759	if (sec-- <= `0`)
760	break;
761
762	/*
763	* backwards compatibility for system call.
764	* For NetBSD 1.3 and 5.0, we have to use
765	* the 32 bit dev_t. For 5.0 and -current
766	* we have to add the path.
767	*/
768	inuse = btodb((uint64_t)sdp->swd_npginuse <<
769	PAGE_SHIFT);
770
771	#if defined(COMPAT_13) \|\| defined(COMPAT_50)
772	if (cmd == SWAP_STATS) {
773	#endif
774	sep->se_dev = sdp->swd_dev;
775	sep->se_flags = sdp->swd_flags;
776	sep->se_nblks = sdp->swd_nblks;
777	sep->se_inuse = inuse;
778	sep->se_priority = sdp->swd_priority;
779	KASSERT(sdp->swd_pathlen <
780	sizeof(sep->se_path));
781	strcpy(sep->se_path, sdp->swd_path);
782	sep++;
783	#if defined(COMPAT_13)
784	} else if (cmd == SWAP_STATS13) {
785	struct swapent13 *sep13 =
786	(struct swapent13 *)sep;
787
788	sep13->se13_dev = sdp->swd_dev;
789	sep13->se13_flags = sdp->swd_flags;
790	sep13->se13_nblks = sdp->swd_nblks;
791	sep13->se13_inuse = inuse;
792	sep13->se13_priority = sdp->swd_priority;
793	sep = (struct swapent *)(sep13 + `1`);
794	#endif
795	#if defined(COMPAT_50)
796	} else if (cmd == SWAP_STATS50) {
797	struct swapent50 *sep50 =
798	(struct swapent50 *)sep;
799
800	sep50->se50_dev = sdp->swd_dev;
801	sep50->se50_flags = sdp->swd_flags;
802	sep50->se50_nblks = sdp->swd_nblks;
803	sep50->se50_inuse = inuse;
804	sep50->se50_priority = sdp->swd_priority;
805	KASSERT(sdp->swd_pathlen <
806	sizeof(sep50->se50_path));
807	strcpy(sep50->se50_path, sdp->swd_path);
808	sep = (struct swapent *)(sep50 + `1`);
809	#endif
810	#if defined(COMPAT_13) \|\| defined(COMPAT_50)
811	}
812	#endif
813	count++;
814	}
815	}
816	*retval = count;
817	}
818
819	/*
820	* swap_on: attempt to enable a swapdev for swapping. note that the
821	* swapdev is already on the global list, but disabled (marked
822	* SWF_FAKE).
823	*
824	* => we avoid the start of the disk (to protect disk labels)
825	* => we also avoid the miniroot, if we are swapping to root.
826	* => caller should leave uvm_swap_data_lock unlocked, we may lock it
827	* if needed.
828	*/
829	static int
830	swap_on(struct lwp l, struct* swapdev *sdp)
831	{
832	struct vnode *vp;
833	int error, npages, nblocks, size;
834	long addr;
835	vmem_addr_t result;
836	struct vattr va;
837	dev_t dev;
838	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
839
840	/*
841	* we want to enable swapping on sdp. the swd_vp contains
842	* the vnode we want (locked and ref'd), and the swd_dev
843	* contains the dev_t of the file, if it a block device.
844	*/
845
846	vp = sdp->swd_vp;
847	dev = sdp->swd_dev;
848
849	/*
850	* open the swap file (mostly useful for block device files to
851	* let device driver know what is up).
852	*
853	* we skip the open/close for root on swap because the root
854	* has already been opened when root was mounted (mountroot).
855	*/
856	if (vp != rootvp) {
857	if ((error = VOP_OPEN(vp, FREAD\|FWRITE, l->l_cred)))
858	return (error);
859	}
860
861	/ XXX this only works for block devices /
862	UVMHIST_LOG(pdhist, " dev=%d, major(dev)=%d", dev, major(dev), `0`,`0`);
863
864	/*
865	* we now need to determine the size of the swap area. for
866	* block specials we can call the d_psize function.
867	* for normal files, we must stat [get attrs].
868	*
869	* we put the result in nblks.
870	* for normal files, we also want the filesystem block size
871	* (which we get with statfs).
872	*/
873	switch (vp->v_type) {
874	case VBLK:
875	if ((nblocks = bdev_size(dev)) == -`1`) {
876	error = ENXIO;
877	goto bad;
878	}
879	break;
880
881	case VREG:
882	if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
883	goto bad;
884	nblocks = (int)btodb(va.va_size);
885	sdp->swd_bsize = `1` << vp->v_mount->mnt_fs_bshift;
886	/*
887	* limit the max # of outstanding I/O requests we issue
888	* at any one time. take it easy on NFS servers.
889	*/
890	if (vp->v_tag == VT_NFS)
891	sdp->swd_maxactive = `2`; / XXX /
892	else
893	sdp->swd_maxactive = `8`; / XXX /
894	break;
895
896	default:
897	error = ENXIO;
898	goto bad;
899	}
900
901	/*
902	* save nblocks in a safe place and convert to pages.
903	*/
904
905	sdp->swd_nblks = nblocks;
906	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
907
908	/*
909	* for block special files, we want to make sure that leave
910	* the disklabel and bootblocks alone, so we arrange to skip
911	* over them (arbitrarily choosing to skip PAGE_SIZE bytes).
912	* note that because of this the "size" can be less than the
913	* actual number of blocks on the device.
914	*/
915	if (vp->v_type == VBLK) {
916	/ we use pages 1 to (size - 1) [inclusive] /
917	size = npages - `1`;
918	addr = `1`;
919	} else {
920	/ we use pages 0 to (size - 1) [inclusive] /
921	size = npages;
922	addr = `0`;
923	}
924
925	/*
926	* make sure we have enough blocks for a reasonable sized swap
927	* area. we want at least one page.
928	*/
929
930	if (size < `1`) {
931	UVMHIST_LOG(pdhist, " size <= 1!!", `0`, `0`, `0`, `0`);
932	error = EINVAL;
933	goto bad;
934	}
935
936	UVMHIST_LOG(pdhist, " dev=%x: size=%d addr=%ld", dev, size, addr, `0`);
937
938	/*
939	* now we need to allocate an extent to manage this swap device
940	*/
941
942	sdp->swd_blist = blist_create(npages);
943	/ mark all expect the `saved' region free. /
944	blist_free(sdp->swd_blist, addr, size);
945
946	/*
947	* if the vnode we are swapping to is the root vnode
948	* (i.e. we are swapping to the miniroot) then we want
949	* to make sure we don't overwrite it. do a statfs to
950	* find its size and skip over it.
951	*/
952	if (vp == rootvp) {
953	struct mount *mp;
954	struct statvfs *sp;
955	int rootblocks, rootpages;
956
957	mp = rootvnode->v_mount;
958	sp = &mp->mnt_stat;
959	rootblocks = sp->f_blocks * btodb(sp->f_frsize);
960	/*
961	* XXX: sp->f_blocks isn't the total number of
962	* blocks in the filesystem, it's the number of
963	* data blocks. so, our rootblocks almost
964	* definitely underestimates the total size
965	* of the filesystem - how badly depends on the
966	* details of the filesystem type. there isn't
967	* an obvious way to deal with this cleanly
968	* and perfectly, so for now we just pad our
969	* rootblocks estimate with an extra 5 percent.
970	*/
971	rootblocks += (rootblocks >> `5`) +
972	(rootblocks >> `6`) +
973	(rootblocks >> `7`);
974	rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
975	if (rootpages > size)
976	panic("swap_on: miniroot larger than swap?");
977
978	if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
979	panic("swap_on: unable to preserve miniroot");
980	}
981
982	size -= rootpages;
983	printf("Preserved %d pages of miniroot ", rootpages);
984	printf("leaving %d pages of swap\n", size);
985	}
986
987	/*
988	* add a ref to vp to reflect usage as a swap device.
989	*/
990	vref(vp);
991
992	/*
993	* now add the new swapdev to the drum and enable.
994	*/
995	error = vmem_alloc(swapmap, npages, VM_BESTFIT \| VM_SLEEP, &result);
996	if (error != `0`)
997	panic("swapdrum_add");
998	/*
999	* If this is the first regular swap create the workqueue.
1000	* => Protected by swap_syscall_lock.
1001	*/
1002	if (vp->v_type != VBLK) {
1003	if (sw_reg_count++ == `0`) {
1004	KASSERT(sw_reg_workqueue == NULL);
1005	if (workqueue_create(&sw_reg_workqueue, "swapiod",
1006	sw_reg_iodone, NULL, PRIBIO, IPL_BIO, `0`) != `0`)
1007	panic("%s: workqueue_create failed", __func__);
1008	}
1009	}
1010
1011	sdp->swd_drumoffset = (int)result;
1012	sdp->swd_drumsize = npages;
1013	sdp->swd_npages = size;
1014	mutex_enter(&uvm_swap_data_lock);
1015	sdp->swd_flags &= ~SWF_FAKE; / going live /
1016	sdp->swd_flags \|= (SWF_INUSE\|SWF_ENABLE);
1017	uvmexp.swpages += size;
1018	uvmexp.swpgavail += size;
1019	mutex_exit(&uvm_swap_data_lock);
1020	return (`0`);
1021
1022	/*
1023	* failure: clean up and return error.
1024	*/
1025
1026	bad:
1027	if (sdp->swd_blist) {
1028	blist_destroy(sdp->swd_blist);
1029	}
1030	if (vp != rootvp) {
1031	(void)VOP_CLOSE(vp, FREAD\|FWRITE, l->l_cred);
1032	}
1033	return (error);
1034	}
1035
1036	/*
1037	* swap_off: stop swapping on swapdev
1038	*
1039	* => swap data should be locked, we will unlock.
1040	*/
1041	static int
1042	swap_off(struct lwp l, struct* swapdev *sdp)
1043	{
1044	int npages = sdp->swd_npages;
1045	int error = `0`;
1046
1047	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
1048	UVMHIST_LOG(pdhist, " dev=%x, npages=%d", sdp->swd_dev,npages,`0`,`0`);
1049
1050	/ disable the swap area being removed /
1051	sdp->swd_flags &= ~SWF_ENABLE;
1052	uvmexp.swpgavail -= npages;
1053	mutex_exit(&uvm_swap_data_lock);
1054
1055	/*
1056	* the idea is to find all the pages that are paged out to this
1057	* device, and page them all in. in uvm, swap-backed pageable
1058	* memory can take two forms: aobjs and anons. call the
1059	* swapoff hook for each subsystem to bring in pages.
1060	*/
1061
1062	if (uao_swap_off(sdp->swd_drumoffset,
1063	sdp->swd_drumoffset + sdp->swd_drumsize) \|\|
1064	amap_swap_off(sdp->swd_drumoffset,
1065	sdp->swd_drumoffset + sdp->swd_drumsize)) {
1066	error = ENOMEM;
1067	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1068	error = EBUSY;
1069	}
1070
1071	if (error) {
1072	mutex_enter(&uvm_swap_data_lock);
1073	sdp->swd_flags \|= SWF_ENABLE;
1074	uvmexp.swpgavail += npages;
1075	mutex_exit(&uvm_swap_data_lock);
1076
1077	return error;
1078	}
1079
1080	/*
1081	* If this is the last regular swap destroy the workqueue.
1082	* => Protected by swap_syscall_lock.
1083	*/
1084	if (sdp->swd_vp->v_type != VBLK) {
1085	KASSERT(sw_reg_count > `0`);
1086	KASSERT(sw_reg_workqueue != NULL);
1087	if (--sw_reg_count == `0`) {
1088	workqueue_destroy(sw_reg_workqueue);
1089	sw_reg_workqueue = NULL;
1090	}
1091	}
1092
1093	/*
1094	* done with the vnode.
1095	* drop our ref on the vnode before calling VOP_CLOSE()
1096	* so that spec_close() can tell if this is the last close.
1097	*/
1098	vrele(sdp->swd_vp);
1099	if (sdp->swd_vp != rootvp) {
1100	(void) VOP_CLOSE(sdp->swd_vp, FREAD\|FWRITE, l->l_cred);
1101	}
1102
1103	mutex_enter(&uvm_swap_data_lock);
1104	uvmexp.swpages -= npages;
1105	uvmexp.swpginuse -= sdp->swd_npgbad;
1106
1107	if (swaplist_find(sdp->swd_vp, true) == NULL)
1108	panic("%s: swapdev not in list", __func__);
1109	swaplist_trim();
1110	mutex_exit(&uvm_swap_data_lock);
1111
1112	/*
1113	* free all resources!
1114	*/
1115	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1116	blist_destroy(sdp->swd_blist);
1117	bufq_free(sdp->swd_tab);
1118	kmem_free(sdp, sizeof(*sdp));
1119	return (`0`);
1120	}
1121
1122	void
1123	uvm_swap_shutdown(struct lwp *l)
1124	{
1125	struct swapdev *sdp;
1126	struct swappri *spp;
1127	struct vnode *vp;
1128	int error;
1129
1130	printf("turning of swap...");
1131	rw_enter(&swap_syscall_lock, RW_WRITER);
1132	mutex_enter(&uvm_swap_data_lock);
1133	again:
1134	LIST_FOREACH(spp, &swap_priority, spi_swappri)
1135	TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1136	if (sdp->swd_flags & SWF_FAKE)
1137	continue;
1138	if ((sdp->swd_flags & (SWF_INUSE\|SWF_ENABLE)) == `0`)
1139	continue;
1140	#ifdef DEBUG
1141	printf("\nturning off swap on %s...",
1142	sdp->swd_path);
1143	#endif
1144	if (vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE)) {
1145	error = EBUSY;
1146	vp = NULL;
1147	} else
1148	error = `0`;
1149	if (!error) {
1150	error = swap_off(l, sdp);
1151	mutex_enter(&uvm_swap_data_lock);
1152	}
1153	if (error) {
1154	printf("stopping swap on %s failed "
1155	"with error %d\n", sdp->swd_path, error);
1156	TAILQ_REMOVE(&spp->spi_swapdev, sdp,
1157	swd_next);
1158	uvmexp.nswapdev--;
1159	swaplist_trim();
1160	if (vp)
1161	vput(vp);
1162	}
1163	goto again;
1164	}
1165	printf(" done\n");
1166	mutex_exit(&uvm_swap_data_lock);
1167	rw_exit(&swap_syscall_lock);
1168	}
1169
1170
/*
 * /dev/drum interface and i/o functions
 */
1174
1175	/*
1176	* swstrategy: perform I/O on the drum
1177	*
1178	* => we must map the i/o request from the drum to the correct swapdev.
1179	*/
1180	static void
1181	swstrategy(struct buf *bp)
1182	{
1183	struct swapdev *sdp;
1184	struct vnode *vp;
1185	int pageno, bn;
1186	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
1187
1188	/*
1189	* convert block number to swapdev. note that swapdev can't
1190	* be yanked out from under us because we are holding resources
1191	* in it (i.e. the blocks we are doing I/O on).
1192	*/
1193	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1194	mutex_enter(&uvm_swap_data_lock);
1195	sdp = swapdrum_getsdp(pageno);
1196	mutex_exit(&uvm_swap_data_lock);
1197	if (sdp == NULL) {
1198	bp->b_error = EINVAL;
1199	bp->b_resid = bp->b_bcount;
1200	biodone(bp);
1201	UVMHIST_LOG(pdhist, " failed to get swap device", `0`, `0`, `0`, `0`);
1202	return;
1203	}
1204
1205	/*
1206	* convert drum page number to block number on this swapdev.
1207	*/
1208
1209	pageno -= sdp->swd_drumoffset; / page # on swapdev /
1210	bn = btodb((uint64_t)pageno << PAGE_SHIFT); / convert to diskblock /
1211
1212	UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld",
1213	((bp->b_flags & B_READ) == `0`) ? "write" : "read",
1214	sdp->swd_drumoffset, bn, bp->b_bcount);
1215
1216	/*
1217	* for block devices we finish up here.
1218	* for regular files we have to do more work which we delegate
1219	* to sw_reg_strategy().
1220	*/
1221
1222	vp = sdp->swd_vp; / swapdev vnode pointer /
1223	switch (vp->v_type) {
1224	default:
1225	panic("%s: vnode type 0x%x", __func__, vp->v_type);
1226
1227	case VBLK:
1228
1229	/*
1230	* must convert "bp" from an I/O on /dev/drum to an I/O
1231	* on the swapdev (sdp).
1232	*/
1233	bp->b_blkno = bn; / swapdev block number /
1234	bp->b_dev = sdp->swd_dev; / swapdev dev_t /
1235
1236	/*
1237	* if we are doing a write, we have to redirect the i/o on
1238	* drum's v_numoutput counter to the swapdevs.
1239	*/
1240	if ((bp->b_flags & B_READ) == `0`) {
1241	mutex_enter(bp->b_objlock);
1242	vwakeup(bp); / kills one 'v_numoutput' on drum /
1243	mutex_exit(bp->b_objlock);
1244	mutex_enter(vp->v_interlock);
1245	vp->v_numoutput++; / put it on swapdev /
1246	mutex_exit(vp->v_interlock);
1247	}
1248
1249	/*
1250	* finally plug in swapdev vnode and start I/O
1251	*/
1252	bp->b_vp = vp;
1253	bp->b_objlock = vp->v_interlock;
1254	VOP_STRATEGY(vp, bp);
1255	return;
1256
1257	case VREG:
1258	/*
1259	* delegate to sw_reg_strategy function.
1260	*/
1261	sw_reg_strategy(sdp, bp, bn);
1262	return;
1263	}
1264	/ NOTREACHED /
1265	}
1266
1267	/*
1268	* swread: the read function for the drum (just a call to physio)
1269	*/
1270	/ARGSUSED/
1271	static int
1272	swread(dev_t dev, struct uio uio, int* ioflag)
1273	{
1274	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
1275
1276	UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, `0`, `0`);
1277	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1278	}
1279
1280	/*
1281	* swwrite: the write function for the drum (just a call to physio)
1282	*/
1283	/ARGSUSED/
1284	static int
1285	swwrite(dev_t dev, struct uio uio, int* ioflag)
1286	{
1287	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
1288
1289	UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, `0`, `0`);
1290	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1291	}
1292
1293	const struct bdevsw swap_bdevsw = {
1294	.d_open = nullopen,
1295	.d_close = nullclose,
1296	.d_strategy = swstrategy,
1297	.d_ioctl = noioctl,
1298	.d_dump = nodump,
1299	.d_psize = nosize,
1300	.d_discard = nodiscard,
1301	.d_flag = D_OTHER
1302	};
1303
1304	const struct cdevsw swap_cdevsw = {
1305	.d_open = nullopen,
1306	.d_close = nullclose,
1307	.d_read = swread,
1308	.d_write = swwrite,
1309	.d_ioctl = noioctl,
1310	.d_stop = nostop,
1311	.d_tty = notty,
1312	.d_poll = nopoll,
1313	.d_mmap = nommap,
1314	.d_kqfilter = nokqfilter,
1315	.d_discard = nodiscard,
1316	.d_flag = D_OTHER,
1317	};
1318
1319	/*
1320	* sw_reg_strategy: handle swap i/o to regular files
1321	*/
1322	static void
1323	sw_reg_strategy(struct swapdev sdp, struct* buf bp, int* bn)
1324	{
1325	struct vnode *vp;
1326	struct vndxfer *vnx;
1327	daddr_t nbn;
1328	char *addr;
1329	off_t byteoff;
1330	int s, off, nra, error, sz, resid;
1331	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
1332
1333	/*
1334	* allocate a vndxfer head for this transfer and point it to
1335	* our buffer.
1336	*/
1337	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1338	vnx->vx_flags = VX_BUSY;
1339	vnx->vx_error = `0`;
1340	vnx->vx_pending = `0`;
1341	vnx->vx_bp = bp;
1342	vnx->vx_sdp = sdp;
1343
1344	/*
1345	* setup for main loop where we read filesystem blocks into
1346	* our buffer.
1347	*/
1348	error = `0`;
1349	bp->b_resid = bp->b_bcount; / nothing transfered yet! /
1350	addr = bp->b_data; / current position in buffer /
1351	byteoff = dbtob((uint64_t)bn);
1352
1353	for (resid = bp->b_resid; resid; resid -= sz) {
1354	struct vndbuf *nbp;
1355
1356	/*
1357	* translate byteoffset into block number. return values:
1358	* vp = vnode of underlying device
1359	* nbn = new block number (on underlying vnode dev)
1360	* nra = num blocks we can read-ahead (excludes requested
1361	* block)
1362	*/
1363	nra = `0`;
1364	error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1365	&vp, &nbn, &nra);
1366
1367	if (error == `0` && nbn == (daddr_t)-`1`) {
1368	/*
1369	* this used to just set error, but that doesn't
1370	* do the right thing. Instead, it causes random
1371	* memory errors. The panic() should remain until
1372	* this condition doesn't destabilize the system.
1373	*/
1374	#if 1
1375	panic("%s: swap to sparse file", __func__);
1376	#else
1377	error = EIO; / failure /
1378	#endif
1379	}
1380
1381	/*
1382	* punt if there was an error or a hole in the file.
1383	* we must wait for any i/o ops we have already started
1384	* to finish before returning.
1385	*
1386	* XXX we could deal with holes here but it would be
1387	* a hassle (in the write case).
1388	*/
1389	if (error) {
1390	s = splbio();
1391	vnx->vx_error = error; / pass error up /
1392	goto out;
1393	}
1394
1395	/*
1396	* compute the size ("sz") of this transfer (in bytes).
1397	*/
1398	off = byteoff % sdp->swd_bsize;
1399	sz = (`1` + nra) * sdp->swd_bsize - off;
1400	if (sz > resid)
1401	sz = resid;
1402
1403	UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1404	"vp %p/%p offset 0x%x/0x%x",
1405	sdp->swd_vp, vp, byteoff, nbn);
1406
1407	/*
1408	* now get a buf structure. note that the vb_buf is
1409	* at the front of the nbp structure so that you can
1410	* cast pointers between the two structure easily.
1411	*/
1412	nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1413	buf_init(&nbp->vb_buf);
1414	nbp->vb_buf.b_flags = bp->b_flags;
1415	nbp->vb_buf.b_cflags = bp->b_cflags;
1416	nbp->vb_buf.b_oflags = bp->b_oflags;
1417	nbp->vb_buf.b_bcount = sz;
1418	nbp->vb_buf.b_bufsize = sz;
1419	nbp->vb_buf.b_error = `0`;
1420	nbp->vb_buf.b_data = addr;
1421	nbp->vb_buf.b_lblkno = `0`;
1422	nbp->vb_buf.b_blkno = nbn + btodb(off);
1423	nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1424	nbp->vb_buf.b_iodone = sw_reg_biodone;
1425	nbp->vb_buf.b_vp = vp;
1426	nbp->vb_buf.b_objlock = vp->v_interlock;
1427	if (vp->v_type == VBLK) {
1428	nbp->vb_buf.b_dev = vp->v_rdev;
1429	}
1430
1431	nbp->vb_xfer = vnx; / patch it back in to vnx /
1432
1433	/*
1434	* Just sort by block number
1435	*/
1436	s = splbio();
1437	if (vnx->vx_error != `0`) {
1438	buf_destroy(&nbp->vb_buf);
1439	pool_put(&vndbuf_pool, nbp);
1440	goto out;
1441	}
1442	vnx->vx_pending++;
1443
1444	/ sort it in and start I/O if we are not over our limit /
1445	/ XXXAD locking /
1446	bufq_put(sdp->swd_tab, &nbp->vb_buf);
1447	sw_reg_start(sdp);
1448	splx(s);
1449
1450	/*
1451	* advance to the next I/O
1452	*/
1453	byteoff += sz;
1454	addr += sz;
1455	}
1456
1457	s = splbio();
1458
1459	out: / Arrive here at splbio /
1460	vnx->vx_flags &= ~VX_BUSY;
1461	if (vnx->vx_pending == `0`) {
1462	error = vnx->vx_error;
1463	pool_put(&vndxfer_pool, vnx);
1464	bp->b_error = error;
1465	biodone(bp);
1466	}
1467	splx(s);
1468	}
1469
1470	/*
1471	* sw_reg_start: start an I/O request on the requested swapdev
1472	*
1473	* => reqs are sorted by b_rawblkno (above)
1474	*/
1475	static void
1476	sw_reg_start(struct swapdev *sdp)
1477	{
1478	struct buf *bp;
1479	struct vnode *vp;
1480	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
1481
1482	/ recursion control /
1483	if ((sdp->swd_flags & SWF_BUSY) != `0`)
1484	return;
1485
1486	sdp->swd_flags \|= SWF_BUSY;
1487
1488	while (sdp->swd_active < sdp->swd_maxactive) {
1489	bp = bufq_get(sdp->swd_tab);
1490	if (bp == NULL)
1491	break;
1492	sdp->swd_active++;
1493
1494	UVMHIST_LOG(pdhist,
1495	"sw_reg_start: bp %p vp %p blkno %p cnt %lx",
1496	bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
1497	vp = bp->b_vp;
1498	KASSERT(bp->b_objlock == vp->v_interlock);
1499	if ((bp->b_flags & B_READ) == `0`) {
1500	mutex_enter(vp->v_interlock);
1501	vp->v_numoutput++;
1502	mutex_exit(vp->v_interlock);
1503	}
1504	VOP_STRATEGY(vp, bp);
1505	}
1506	sdp->swd_flags &= ~SWF_BUSY;
1507	}
1508
1509	/*
1510	* sw_reg_biodone: one of our i/o's has completed
1511	*/
1512	static void
1513	sw_reg_biodone(struct buf *bp)
1514	{
1515	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1516	}
1517
1518	/*
1519	* sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1520	*
1521	* => note that we can recover the vndbuf struct by casting the buf ptr
1522	*/
1523	static void
1524	sw_reg_iodone(struct work wk, void* *dummy)
1525	{
1526	struct vndbuf vbp = (void* *)wk;
1527	struct vndxfer *vnx = vbp->vb_xfer;
1528	struct buf pbp = vnx->vx_bp; /* parent buffer /
1529	struct swapdev *sdp = vnx->vx_sdp;
1530	int s, resid, error;
1531	KASSERT(&vbp->vb_buf.b_work == wk);
1532	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
1533
1534	UVMHIST_LOG(pdhist, " vbp=%p vp=%p blkno=%x addr=%p",
1535	vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
1536	UVMHIST_LOG(pdhist, " cnt=%lx resid=%lx",
1537	vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, `0`, `0`);
1538
1539	/*
1540	* protect vbp at splbio and update.
1541	*/
1542
1543	s = splbio();
1544	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1545	pbp->b_resid -= resid;
1546	vnx->vx_pending--;
1547
1548	if (vbp->vb_buf.b_error != `0`) {
1549	/ pass error upward /
1550	error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1551	UVMHIST_LOG(pdhist, " got error=%d !", error, `0`, `0`, `0`);
1552	vnx->vx_error = error;
1553	}
1554
1555	/*
1556	* kill vbp structure
1557	*/
1558	buf_destroy(&vbp->vb_buf);
1559	pool_put(&vndbuf_pool, vbp);
1560
1561	/*
1562	* wrap up this transaction if it has run to completion or, in
1563	* case of an error, when all auxiliary buffers have returned.
1564	*/
1565	if (vnx->vx_error != `0`) {
1566	/ pass error upward /
1567	error = vnx->vx_error;
1568	if ((vnx->vx_flags & VX_BUSY) == `0` && vnx->vx_pending == `0`) {
1569	pbp->b_error = error;
1570	biodone(pbp);
1571	pool_put(&vndxfer_pool, vnx);
1572	}
1573	} else if (pbp->b_resid == `0`) {
1574	KASSERT(vnx->vx_pending == `0`);
1575	if ((vnx->vx_flags & VX_BUSY) == `0`) {
1576	UVMHIST_LOG(pdhist, " iodone error=%d !",
1577	pbp, vnx->vx_error, `0`, `0`);
1578	biodone(pbp);
1579	pool_put(&vndxfer_pool, vnx);
1580	}
1581	}
1582
1583	/*
1584	* done! start next swapdev I/O if one is pending
1585	*/
1586	sdp->swd_active--;
1587	sw_reg_start(sdp);
1588	splx(s);
1589	}
1590
1591
1592	/*
1593	* uvm_swap_alloc: allocate space on swap
1594	*
1595	* => allocation is done "round robin" down the priority list, as we
1596	* allocate in a priority we "rotate" the circle queue.
1597	* => space can be freed with uvm_swap_free
1598	* => we return the page slot number in /dev/drum (0 == invalid slot)
1599	* => we lock uvm_swap_data_lock
1600	* => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1601	*/
1602	int
1603	uvm_swap_alloc(int nslots /* IN/OUT /, bool lessok)
1604	{
1605	struct swapdev *sdp;
1606	struct swappri *spp;
1607	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
1608
1609	/*
1610	* no swap devices configured yet? definite failure.
1611	*/
1612	if (uvmexp.nswapdev < `1`)
1613	return `0`;
1614
1615	/*
1616	* XXXJAK: BEGIN HACK
1617	*
1618	* blist_alloc() in subr_blist.c will panic if we try to allocate
1619	* too many slots.
1620	*/
1621	if (*nslots > BLIST_MAX_ALLOC) {
1622	if (__predict_false(lessok == false))
1623	return `0`;
1624	*nslots = BLIST_MAX_ALLOC;
1625	}
1626	/ XXXJAK: END HACK /
1627
1628	/*
1629	* lock data lock, convert slots into blocks, and enter loop
1630	*/
1631	mutex_enter(&uvm_swap_data_lock);
1632
1633	ReTry: / XXXMRG /
1634	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1635	TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1636	uint64_t result;
1637
1638	/ if it's not enabled, then we can't swap from it /
1639	if ((sdp->swd_flags & SWF_ENABLE) == `0`)
1640	continue;
1641	if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1642	continue;
1643	result = blist_alloc(sdp->swd_blist, *nslots);
1644	if (result == BLIST_NONE) {
1645	continue;
1646	}
1647	KASSERT(result < sdp->swd_drumsize);
1648
1649	/*
1650	* successful allocation! now rotate the tailq.
1651	*/
1652	TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1653	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1654	sdp->swd_npginuse += *nslots;
1655	uvmexp.swpginuse += *nslots;
1656	mutex_exit(&uvm_swap_data_lock);
1657	/ done! return drum slot number /
1658	UVMHIST_LOG(pdhist,
1659	"success! returning %d slots starting at %d",
1660	*nslots, result + sdp->swd_drumoffset, `0`, `0`);
1661	return (result + sdp->swd_drumoffset);
1662	}
1663	}
1664
1665	/ XXXMRG: BEGIN HACK /
1666	if (*nslots > `1` && lessok) {
1667	*nslots = `1`;
1668	/ XXXMRG: ugh! blist should support this for us /
1669	goto ReTry;
1670	}
1671	/ XXXMRG: END HACK /
1672
1673	mutex_exit(&uvm_swap_data_lock);
1674	return `0`;
1675	}
1676
1677	/*
1678	* uvm_swapisfull: return true if most of available swap is allocated
1679	* and in use. we don't count some small portion as it may be inaccessible
1680	* to us at any given moment, for example if there is lock contention or if
1681	* pages are busy.
1682	*/
1683	bool
1684	uvm_swapisfull(void)
1685	{
1686	int swpgonly;
1687	bool rv;
1688
1689	mutex_enter(&uvm_swap_data_lock);
1690	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1691	swpgonly = (int)((uint64_t)uvmexp.swpgonly * `100` /
1692	uvm_swapisfull_factor);
1693	rv = (swpgonly >= uvmexp.swpgavail);
1694	mutex_exit(&uvm_swap_data_lock);
1695
1696	return (rv);
1697	}
1698
1699	/*
1700	* uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1701	*
1702	* => we lock uvm_swap_data_lock
1703	*/
1704	void
1705	uvm_swap_markbad(int startslot, int nslots)
1706	{
1707	struct swapdev *sdp;
1708	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
1709
1710	mutex_enter(&uvm_swap_data_lock);
1711	sdp = swapdrum_getsdp(startslot);
1712	KASSERT(sdp != NULL);
1713
1714	/*
1715	* we just keep track of how many pages have been marked bad
1716	* in this device, to make everything add up in swap_off().
1717	* we assume here that the range of slots will all be within
1718	* one swap device.
1719	*/
1720
1721	KASSERT(uvmexp.swpgonly >= nslots);
1722	uvmexp.swpgonly -= nslots;
1723	sdp->swd_npgbad += nslots;
1724	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, `0`,`0`,`0`);
1725	mutex_exit(&uvm_swap_data_lock);
1726	}
1727
1728	/*
1729	* uvm_swap_free: free swap slots
1730	*
1731	* => this can be all or part of an allocation made by uvm_swap_alloc
1732	* => we lock uvm_swap_data_lock
1733	*/
1734	void
1735	uvm_swap_free(int startslot, int nslots)
1736	{
1737	struct swapdev *sdp;
1738	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
1739
1740	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
1741	startslot, `0`, `0`);
1742
1743	/*
1744	* ignore attempts to free the "bad" slot.
1745	*/
1746
1747	if (startslot == SWSLOT_BAD) {
1748	return;
1749	}
1750
1751	/*
1752	* convert drum slot offset back to sdp, free the blocks
1753	* in the extent, and return. must hold pri lock to do
1754	* lookup and access the extent.
1755	*/
1756
1757	mutex_enter(&uvm_swap_data_lock);
1758	sdp = swapdrum_getsdp(startslot);
1759	KASSERT(uvmexp.nswapdev >= `1`);
1760	KASSERT(sdp != NULL);
1761	KASSERT(sdp->swd_npginuse >= nslots);
1762	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1763	sdp->swd_npginuse -= nslots;
1764	uvmexp.swpginuse -= nslots;
1765	mutex_exit(&uvm_swap_data_lock);
1766	}
1767
1768	/*
1769	* uvm_swap_put: put any number of pages into a contig place on swap
1770	*
1771	* => can be sync or async
1772	*/
1773
1774	int
1775	uvm_swap_put(int swslot, struct vm_page *ppsp, int* npages, int flags)
1776	{
1777	int error;
1778
1779	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE \|
1780	((flags & PGO_SYNCIO) ? `0` : B_ASYNC));
1781	return error;
1782	}
1783
1784	/*
1785	* uvm_swap_get: get a single page from swap
1786	*
1787	* => usually a sync op (from fault)
1788	*/
1789
1790	int
1791	uvm_swap_get(struct vm_page page, int* swslot, int flags)
1792	{
1793	int error;
1794
1795	uvmexp.nswget++;
1796	KASSERT(flags & PGO_SYNCIO);
1797	if (swslot == SWSLOT_BAD) {
1798	return EIO;
1799	}
1800
1801	error = uvm_swap_io(&page, swslot, `1`, B_READ \|
1802	((flags & PGO_SYNCIO) ? `0` : B_ASYNC));
1803	if (error == `0`) {
1804
1805	/*
1806	* this page is no longer only in swap.
1807	*/
1808
1809	mutex_enter(&uvm_swap_data_lock);
1810	KASSERT(uvmexp.swpgonly > `0`);
1811	uvmexp.swpgonly--;
1812	mutex_exit(&uvm_swap_data_lock);
1813	}
1814	return error;
1815	}
1816
1817	/*
1818	* uvm_swap_io: do an i/o operation to swap
1819	*/
1820
1821	static int
1822	uvm_swap_io(struct vm_page *pps, int* startslot, int npages, int flags)
1823	{
1824	daddr_t startblk;
1825	struct buf *bp;
1826	vaddr_t kva;
1827	int error, mapinflags;
1828	bool write, async;
1829	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
1830
1831	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
1832	startslot, npages, flags, `0`);
1833
1834	write = (flags & B_READ) == `0`;
1835	async = (flags & B_ASYNC) != `0`;
1836
1837	/*
1838	* allocate a buf for the i/o.
1839	*/
1840
1841	KASSERT(curlwp != uvm.pagedaemon_lwp \|\| (write && async));
1842	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1843	if (bp == NULL) {
1844	uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1845	return ENOMEM;
1846	}
1847
1848	/*
1849	* convert starting drum slot to block number
1850	*/
1851
1852	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1853
1854	/*
1855	* first, map the pages into the kernel.
1856	*/
1857
1858	mapinflags = !write ?
1859	UVMPAGER_MAPIN_WAITOK\|UVMPAGER_MAPIN_READ :
1860	UVMPAGER_MAPIN_WAITOK\|UVMPAGER_MAPIN_WRITE;
1861	kva = uvm_pagermapin(pps, npages, mapinflags);
1862
1863	/*
1864	* fill in the bp/sbp. we currently route our i/o through
1865	* /dev/drum's vnode [swapdev_vp].
1866	*/
1867
1868	bp->b_cflags = BC_BUSY \| BC_NOCACHE;
1869	bp->b_flags = (flags & (B_READ\|B_ASYNC));
1870	bp->b_proc = &proc0; / XXX /
1871	bp->b_vnbufs.le_next = NOLIST;
1872	bp->b_data = (void *)kva;
1873	bp->b_blkno = startblk;
1874	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1875
1876	/*
1877	* bump v_numoutput (counter of number of active outputs).
1878	*/
1879
1880	if (write) {
1881	mutex_enter(swapdev_vp->v_interlock);
1882	swapdev_vp->v_numoutput++;
1883	mutex_exit(swapdev_vp->v_interlock);
1884	}
1885
1886	/*
1887	* for async ops we must set up the iodone handler.
1888	*/
1889
1890	if (async) {
1891	bp->b_iodone = uvm_aio_biodone;
1892	UVMHIST_LOG(pdhist, "doing async!", `0`, `0`, `0`, `0`);
1893	if (curlwp == uvm.pagedaemon_lwp)
1894	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1895	else
1896	BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1897	} else {
1898	bp->b_iodone = NULL;
1899	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1900	}
1901	UVMHIST_LOG(pdhist,
1902	"about to start io: data = %p blkno = 0x%x, bcount = %ld",
1903	bp->b_data, bp->b_blkno, bp->b_bcount, `0`);
1904
1905	/*
1906	* now we start the I/O, and if async, return.
1907	*/
1908
1909	VOP_STRATEGY(swapdev_vp, bp);
1910	if (async)
1911	return `0`;
1912
1913	/*
1914	* must be sync i/o. wait for it to finish
1915	*/
1916
1917	error = biowait(bp);
1918
1919	/*
1920	* kill the pager mapping
1921	*/
1922
1923	uvm_pagermapout(kva, npages);
1924
1925	/*
1926	* now dispose of the buf and we're done.
1927	*/
1928
1929	if (write) {
1930	mutex_enter(swapdev_vp->v_interlock);
1931	vwakeup(bp);
1932	mutex_exit(swapdev_vp->v_interlock);
1933	}
1934	putiobuf(bp);
1935	UVMHIST_LOG(pdhist, "<- done (sync) error=%d", error, `0`, `0`, `0`);
1936
1937	return (error);
1938	}
1939

Browse the source code of src/src/sys/uvm/uvm_swap.c