1 | /* $NetBSD: vfs_subr.c,v 1.451 2016/11/03 11:04:21 hannken Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, |
9 | * NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran, |
10 | * by Marshall Kirk McKusick and Greg Ganger at the University of Michigan. |
11 | * |
12 | * Redistribution and use in source and binary forms, with or without |
13 | * modification, are permitted provided that the following conditions |
14 | * are met: |
15 | * 1. Redistributions of source code must retain the above copyright |
16 | * notice, this list of conditions and the following disclaimer. |
17 | * 2. Redistributions in binary form must reproduce the above copyright |
18 | * notice, this list of conditions and the following disclaimer in the |
19 | * documentation and/or other materials provided with the distribution. |
20 | * |
21 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
23 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
24 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
25 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
26 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
27 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
28 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
29 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
30 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
31 | * POSSIBILITY OF SUCH DAMAGE. |
32 | */ |
33 | |
34 | /* |
35 | * Copyright (c) 1989, 1993 |
36 | * The Regents of the University of California. All rights reserved. |
37 | * (c) UNIX System Laboratories, Inc. |
38 | * All or some portions of this file are derived from material licensed |
39 | * to the University of California by American Telephone and Telegraph |
40 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
41 | * the permission of UNIX System Laboratories, Inc. |
42 | * |
43 | * Redistribution and use in source and binary forms, with or without |
44 | * modification, are permitted provided that the following conditions |
45 | * are met: |
46 | * 1. Redistributions of source code must retain the above copyright |
47 | * notice, this list of conditions and the following disclaimer. |
48 | * 2. Redistributions in binary form must reproduce the above copyright |
49 | * notice, this list of conditions and the following disclaimer in the |
50 | * documentation and/or other materials provided with the distribution. |
51 | * 3. Neither the name of the University nor the names of its contributors |
52 | * may be used to endorse or promote products derived from this software |
53 | * without specific prior written permission. |
54 | * |
55 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
56 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
57 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
58 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
59 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
60 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
61 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
62 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
63 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
64 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
65 | * SUCH DAMAGE. |
66 | * |
67 | * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 |
68 | */ |
69 | |
70 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.451 2016/11/03 11:04:21 hannken Exp $");
72 | |
73 | #ifdef _KERNEL_OPT |
74 | #include "opt_ddb.h" |
75 | #include "opt_compat_netbsd.h" |
76 | #include "opt_compat_43.h" |
77 | #endif |
78 | |
79 | #include <sys/param.h> |
80 | #include <sys/systm.h> |
81 | #include <sys/conf.h> |
82 | #include <sys/dirent.h> |
83 | #include <sys/filedesc.h> |
84 | #include <sys/kernel.h> |
85 | #include <sys/mount.h> |
86 | #include <sys/vnode_impl.h> |
87 | #include <sys/stat.h> |
88 | #include <sys/sysctl.h> |
89 | #include <sys/namei.h> |
90 | #include <sys/buf.h> |
91 | #include <sys/errno.h> |
92 | #include <sys/kmem.h> |
93 | #include <sys/syscallargs.h> |
94 | #include <sys/kauth.h> |
95 | #include <sys/module.h> |
96 | |
97 | #include <miscfs/genfs/genfs.h> |
98 | #include <miscfs/specfs/specdev.h> |
99 | #include <uvm/uvm_ddb.h> |
100 | |
101 | const enum vtype iftovt_tab[16] = { |
102 | VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, |
103 | VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, |
104 | }; |
105 | const int vttoif_tab[9] = { |
106 | 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, |
107 | S_IFSOCK, S_IFIFO, S_IFMT, |
108 | }; |
109 | |
110 | /* |
111 | * Insq/Remq for the vnode usage lists. |
112 | */ |
113 | #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) |
114 | #define bufremvn(bp) { \ |
115 | LIST_REMOVE(bp, b_vnbufs); \ |
116 | (bp)->b_vnbufs.le_next = NOLIST; \ |
117 | } |
118 | |
119 | int doforce = 1; /* 1 => permit forcible unmounting */ |
120 | int prtactive = 0; /* 1 => print out reclaim of active vnodes */ |
121 | |
122 | extern struct mount *dead_rootmount; |
123 | |
124 | /* |
125 | * Local declarations. |
126 | */ |
127 | |
128 | static void vn_initialize_syncerd(void); |
129 | |
130 | /* |
131 | * Initialize the vnode management data structures. |
132 | */ |
133 | void |
134 | vntblinit(void) |
135 | { |
136 | |
137 | vn_initialize_syncerd(); |
138 | vfs_mount_sysinit(); |
139 | vfs_vnode_sysinit(); |
140 | } |
141 | |
142 | /* |
143 | * Flush out and invalidate all buffers associated with a vnode. |
144 | * Called with the underlying vnode locked, which should prevent new dirty |
145 | * buffers from being queued. |
146 | */ |
147 | int |
148 | vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l, |
149 | bool catch_p, int slptimeo) |
150 | { |
151 | struct buf *bp, *nbp; |
152 | int error; |
153 | int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | |
154 | (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0); |
155 | |
156 | /* XXXUBC this doesn't look at flags or slp* */ |
157 | mutex_enter(vp->v_interlock); |
158 | error = VOP_PUTPAGES(vp, 0, 0, flushflags); |
159 | if (error) { |
160 | return error; |
161 | } |
162 | |
163 | if (flags & V_SAVE) { |
164 | error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0); |
165 | if (error) |
166 | return (error); |
167 | KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd)); |
168 | } |
169 | |
170 | mutex_enter(&bufcache_lock); |
171 | restart: |
172 | for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { |
173 | KASSERT(bp->b_vp == vp); |
174 | nbp = LIST_NEXT(bp, b_vnbufs); |
175 | error = bbusy(bp, catch_p, slptimeo, NULL); |
176 | if (error != 0) { |
177 | if (error == EPASSTHROUGH) |
178 | goto restart; |
179 | mutex_exit(&bufcache_lock); |
180 | return (error); |
181 | } |
182 | brelsel(bp, BC_INVAL | BC_VFLUSH); |
183 | } |
184 | |
185 | for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { |
186 | KASSERT(bp->b_vp == vp); |
187 | nbp = LIST_NEXT(bp, b_vnbufs); |
188 | error = bbusy(bp, catch_p, slptimeo, NULL); |
189 | if (error != 0) { |
190 | if (error == EPASSTHROUGH) |
191 | goto restart; |
192 | mutex_exit(&bufcache_lock); |
193 | return (error); |
194 | } |
195 | /* |
196 | * XXX Since there are no node locks for NFS, I believe |
197 | * there is a slight chance that a delayed write will |
198 | * occur while sleeping just above, so check for it. |
199 | */ |
200 | if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) { |
201 | #ifdef DEBUG |
202 | printf("buffer still DELWRI\n" ); |
203 | #endif |
204 | bp->b_cflags |= BC_BUSY | BC_VFLUSH; |
205 | mutex_exit(&bufcache_lock); |
206 | VOP_BWRITE(bp->b_vp, bp); |
207 | mutex_enter(&bufcache_lock); |
208 | goto restart; |
209 | } |
210 | brelsel(bp, BC_INVAL | BC_VFLUSH); |
211 | } |
212 | |
213 | #ifdef DIAGNOSTIC |
214 | if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) |
215 | panic("vinvalbuf: flush failed, vp %p" , vp); |
216 | #endif |
217 | |
218 | mutex_exit(&bufcache_lock); |
219 | |
220 | return (0); |
221 | } |
222 | |
223 | /* |
224 | * Destroy any in core blocks past the truncation length. |
225 | * Called with the underlying vnode locked, which should prevent new dirty |
226 | * buffers from being queued. |
227 | */ |
228 | int |
229 | vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo) |
230 | { |
231 | struct buf *bp, *nbp; |
232 | int error; |
233 | voff_t off; |
234 | |
235 | off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); |
236 | mutex_enter(vp->v_interlock); |
237 | error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); |
238 | if (error) { |
239 | return error; |
240 | } |
241 | |
242 | mutex_enter(&bufcache_lock); |
243 | restart: |
244 | for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { |
245 | KASSERT(bp->b_vp == vp); |
246 | nbp = LIST_NEXT(bp, b_vnbufs); |
247 | if (bp->b_lblkno < lbn) |
248 | continue; |
249 | error = bbusy(bp, catch_p, slptimeo, NULL); |
250 | if (error != 0) { |
251 | if (error == EPASSTHROUGH) |
252 | goto restart; |
253 | mutex_exit(&bufcache_lock); |
254 | return (error); |
255 | } |
256 | brelsel(bp, BC_INVAL | BC_VFLUSH); |
257 | } |
258 | |
259 | for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { |
260 | KASSERT(bp->b_vp == vp); |
261 | nbp = LIST_NEXT(bp, b_vnbufs); |
262 | if (bp->b_lblkno < lbn) |
263 | continue; |
264 | error = bbusy(bp, catch_p, slptimeo, NULL); |
265 | if (error != 0) { |
266 | if (error == EPASSTHROUGH) |
267 | goto restart; |
268 | mutex_exit(&bufcache_lock); |
269 | return (error); |
270 | } |
271 | brelsel(bp, BC_INVAL | BC_VFLUSH); |
272 | } |
273 | mutex_exit(&bufcache_lock); |
274 | |
275 | return (0); |
276 | } |
277 | |
278 | /* |
279 | * Flush all dirty buffers from a vnode. |
280 | * Called with the underlying vnode locked, which should prevent new dirty |
281 | * buffers from being queued. |
282 | */ |
283 | int |
284 | vflushbuf(struct vnode *vp, int flags) |
285 | { |
286 | struct buf *bp, *nbp; |
287 | int error, pflags; |
288 | bool dirty, sync; |
289 | |
290 | sync = (flags & FSYNC_WAIT) != 0; |
291 | pflags = PGO_CLEANIT | PGO_ALLPAGES | |
292 | (sync ? PGO_SYNCIO : 0) | |
293 | ((flags & FSYNC_LAZY) ? PGO_LAZY : 0); |
294 | mutex_enter(vp->v_interlock); |
295 | (void) VOP_PUTPAGES(vp, 0, 0, pflags); |
296 | |
297 | loop: |
298 | mutex_enter(&bufcache_lock); |
299 | for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { |
300 | KASSERT(bp->b_vp == vp); |
301 | nbp = LIST_NEXT(bp, b_vnbufs); |
302 | if ((bp->b_cflags & BC_BUSY)) |
303 | continue; |
304 | if ((bp->b_oflags & BO_DELWRI) == 0) |
305 | panic("vflushbuf: not dirty, bp %p" , bp); |
306 | bp->b_cflags |= BC_BUSY | BC_VFLUSH; |
307 | mutex_exit(&bufcache_lock); |
308 | /* |
309 | * Wait for I/O associated with indirect blocks to complete, |
310 | * since there is no way to quickly wait for them below. |
311 | */ |
312 | if (bp->b_vp == vp || !sync) |
313 | (void) bawrite(bp); |
314 | else { |
315 | error = bwrite(bp); |
316 | if (error) |
317 | return error; |
318 | } |
319 | goto loop; |
320 | } |
321 | mutex_exit(&bufcache_lock); |
322 | |
323 | if (!sync) |
324 | return 0; |
325 | |
326 | mutex_enter(vp->v_interlock); |
327 | while (vp->v_numoutput != 0) |
328 | cv_wait(&vp->v_cv, vp->v_interlock); |
329 | dirty = !LIST_EMPTY(&vp->v_dirtyblkhd); |
330 | mutex_exit(vp->v_interlock); |
331 | |
332 | if (dirty) { |
333 | vprint("vflushbuf: dirty" , vp); |
334 | goto loop; |
335 | } |
336 | |
337 | return 0; |
338 | } |
339 | |
340 | /* |
341 | * Create a vnode for a block device. |
342 | * Used for root filesystem and swap areas. |
343 | * Also used for memory file system special devices. |
344 | */ |
345 | int |
346 | bdevvp(dev_t dev, vnode_t **vpp) |
347 | { |
348 | struct vattr va; |
349 | |
350 | vattr_null(&va); |
351 | va.va_type = VBLK; |
352 | va.va_rdev = dev; |
353 | |
354 | return vcache_new(dead_rootmount, NULL, &va, NOCRED, vpp); |
355 | } |
356 | |
357 | /* |
358 | * Create a vnode for a character device. |
359 | * Used for kernfs and some console handling. |
360 | */ |
361 | int |
362 | cdevvp(dev_t dev, vnode_t **vpp) |
363 | { |
364 | struct vattr va; |
365 | |
366 | vattr_null(&va); |
367 | va.va_type = VCHR; |
368 | va.va_rdev = dev; |
369 | |
370 | return vcache_new(dead_rootmount, NULL, &va, NOCRED, vpp); |
371 | } |
372 | |
373 | /* |
374 | * Associate a buffer with a vnode. There must already be a hold on |
375 | * the vnode. |
376 | */ |
377 | void |
378 | bgetvp(struct vnode *vp, struct buf *bp) |
379 | { |
380 | |
381 | KASSERT(bp->b_vp == NULL); |
382 | KASSERT(bp->b_objlock == &buffer_lock); |
383 | KASSERT(mutex_owned(vp->v_interlock)); |
384 | KASSERT(mutex_owned(&bufcache_lock)); |
385 | KASSERT((bp->b_cflags & BC_BUSY) != 0); |
386 | KASSERT(!cv_has_waiters(&bp->b_done)); |
387 | |
388 | vholdl(vp); |
389 | bp->b_vp = vp; |
390 | if (vp->v_type == VBLK || vp->v_type == VCHR) |
391 | bp->b_dev = vp->v_rdev; |
392 | else |
393 | bp->b_dev = NODEV; |
394 | |
395 | /* |
396 | * Insert onto list for new vnode. |
397 | */ |
398 | bufinsvn(bp, &vp->v_cleanblkhd); |
399 | bp->b_objlock = vp->v_interlock; |
400 | } |
401 | |
402 | /* |
403 | * Disassociate a buffer from a vnode. |
404 | */ |
405 | void |
406 | brelvp(struct buf *bp) |
407 | { |
408 | struct vnode *vp = bp->b_vp; |
409 | |
410 | KASSERT(vp != NULL); |
411 | KASSERT(bp->b_objlock == vp->v_interlock); |
412 | KASSERT(mutex_owned(vp->v_interlock)); |
413 | KASSERT(mutex_owned(&bufcache_lock)); |
414 | KASSERT((bp->b_cflags & BC_BUSY) != 0); |
415 | KASSERT(!cv_has_waiters(&bp->b_done)); |
416 | |
417 | /* |
418 | * Delete from old vnode list, if on one. |
419 | */ |
420 | if (LIST_NEXT(bp, b_vnbufs) != NOLIST) |
421 | bufremvn(bp); |
422 | |
423 | if (vp->v_uobj.uo_npages == 0 && (vp->v_iflag & VI_ONWORKLST) && |
424 | LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { |
425 | vp->v_iflag &= ~VI_WRMAPDIRTY; |
426 | vn_syncer_remove_from_worklist(vp); |
427 | } |
428 | |
429 | bp->b_objlock = &buffer_lock; |
430 | bp->b_vp = NULL; |
431 | holdrelel(vp); |
432 | } |
433 | |
434 | /* |
435 | * Reassign a buffer from one vnode list to another. |
436 | * The list reassignment must be within the same vnode. |
437 | * Used to assign file specific control information |
438 | * (indirect blocks) to the list to which they belong. |
439 | */ |
440 | void |
441 | reassignbuf(struct buf *bp, struct vnode *vp) |
442 | { |
443 | struct buflists *listheadp; |
444 | int delayx; |
445 | |
446 | KASSERT(mutex_owned(&bufcache_lock)); |
447 | KASSERT(bp->b_objlock == vp->v_interlock); |
448 | KASSERT(mutex_owned(vp->v_interlock)); |
449 | KASSERT((bp->b_cflags & BC_BUSY) != 0); |
450 | |
451 | /* |
452 | * Delete from old vnode list, if on one. |
453 | */ |
454 | if (LIST_NEXT(bp, b_vnbufs) != NOLIST) |
455 | bufremvn(bp); |
456 | |
457 | /* |
458 | * If dirty, put on list of dirty buffers; |
459 | * otherwise insert onto list of clean buffers. |
460 | */ |
461 | if ((bp->b_oflags & BO_DELWRI) == 0) { |
462 | listheadp = &vp->v_cleanblkhd; |
463 | if (vp->v_uobj.uo_npages == 0 && |
464 | (vp->v_iflag & VI_ONWORKLST) && |
465 | LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { |
466 | vp->v_iflag &= ~VI_WRMAPDIRTY; |
467 | vn_syncer_remove_from_worklist(vp); |
468 | } |
469 | } else { |
470 | listheadp = &vp->v_dirtyblkhd; |
471 | if ((vp->v_iflag & VI_ONWORKLST) == 0) { |
472 | switch (vp->v_type) { |
473 | case VDIR: |
474 | delayx = dirdelay; |
475 | break; |
476 | case VBLK: |
477 | if (spec_node_getmountedfs(vp) != NULL) { |
478 | delayx = metadelay; |
479 | break; |
480 | } |
481 | /* fall through */ |
482 | default: |
483 | delayx = filedelay; |
484 | break; |
485 | } |
486 | if (!vp->v_mount || |
487 | (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) |
488 | vn_syncer_add_to_worklist(vp, delayx); |
489 | } |
490 | } |
491 | bufinsvn(bp, listheadp); |
492 | } |
493 | |
494 | /* |
495 | * Lookup a vnode by device number and return it referenced. |
496 | */ |
497 | int |
498 | vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) |
499 | { |
500 | |
501 | return (spec_node_lookup_by_dev(type, dev, vpp) == 0); |
502 | } |
503 | |
504 | /* |
505 | * Revoke all the vnodes corresponding to the specified minor number |
506 | * range (endpoints inclusive) of the specified major. |
507 | */ |
508 | void |
509 | vdevgone(int maj, int minl, int minh, enum vtype type) |
510 | { |
511 | vnode_t *vp; |
512 | dev_t dev; |
513 | int mn; |
514 | |
515 | for (mn = minl; mn <= minh; mn++) { |
516 | dev = makedev(maj, mn); |
517 | while (spec_node_lookup_by_dev(type, dev, &vp) == 0) { |
518 | VOP_REVOKE(vp, REVOKEALL); |
519 | vrele(vp); |
520 | } |
521 | } |
522 | } |
523 | |
524 | /* |
525 | * The filesystem synchronizer mechanism - syncer. |
526 | * |
527 | * It is useful to delay writes of file data and filesystem metadata for |
528 | * a certain amount of time so that quickly created and deleted files need |
529 | * not waste disk bandwidth being created and removed. To implement this, |
530 | * vnodes are appended to a "workitem" queue. |
531 | * |
 * Most pending metadata should not wait for more than ten seconds. Thus,
 * buffers for mounted block devices are delayed only about half the time
 * that file data is delayed. Similarly, directory updates are more
 * critical, so they are delayed only about a third of that time.
536 | * |
537 | * There are SYNCER_MAXDELAY queues that are processed in a round-robin |
 * manner at a rate of one each second (driven off the filesystem syncer
539 | * thread). The syncer_delayno variable indicates the next queue that is |
540 | * to be processed. Items that need to be processed soon are placed in |
541 | * this queue: |
542 | * |
543 | * syncer_workitem_pending[syncer_delayno] |
544 | * |
545 | * A delay of e.g. fifteen seconds is done by placing the request fifteen |
546 | * entries later in the queue: |
547 | * |
548 | * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] |
549 | * |
 * The VI_ONWORKLST flag indicates that the vnode is on one of these queues.
551 | */ |
552 | |
553 | #define SYNCER_MAXDELAY 32 |
554 | |
555 | typedef TAILQ_HEAD(synclist, vnode) synclist_t; |
556 | |
557 | static void vn_syncer_add1(struct vnode *, int); |
558 | static void sysctl_vfs_syncfs_setup(struct sysctllog **); |
559 | |
560 | /* |
561 | * Defines and variables for the syncer process. |
562 | */ |
563 | int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ |
564 | time_t syncdelay = 30; /* max time to delay syncing data */ |
565 | time_t filedelay = 30; /* time to delay syncing files */ |
566 | time_t dirdelay = 15; /* time to delay syncing directories */ |
567 | time_t metadelay = 10; /* time to delay syncing metadata */ |
568 | time_t lockdelay = 1; /* time to delay if locking fails */ |
569 | |
570 | kmutex_t syncer_mutex; /* used to freeze syncer, long term */ |
571 | static kmutex_t syncer_data_lock; /* short term lock on data structs */ |
572 | |
573 | static int syncer_delayno = 0; |
574 | static long syncer_last; |
575 | static synclist_t * syncer_workitem_pending; |
576 | |
577 | static void |
578 | vn_initialize_syncerd(void) |
579 | { |
580 | int i; |
581 | |
582 | syncer_last = SYNCER_MAXDELAY + 2; |
583 | |
584 | sysctl_vfs_syncfs_setup(NULL); |
585 | |
586 | syncer_workitem_pending = |
587 | kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP); |
588 | |
589 | for (i = 0; i < syncer_last; i++) |
590 | TAILQ_INIT(&syncer_workitem_pending[i]); |
591 | |
592 | mutex_init(&syncer_mutex, MUTEX_DEFAULT, IPL_NONE); |
593 | mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE); |
594 | } |
595 | |
596 | /* |
597 | * Return delay factor appropriate for the given file system. For |
598 | * WAPBL we use the sync vnode to burst out metadata updates: sync |
599 | * those file systems more frequently. |
600 | */ |
601 | static inline int |
602 | sync_delay(struct mount *mp) |
603 | { |
604 | |
605 | return mp->mnt_wapbl != NULL ? metadelay : syncdelay; |
606 | } |
607 | |
608 | /* |
609 | * Compute the next slot index from delay. |
610 | */ |
611 | static inline int |
612 | sync_delay_slot(int delayx) |
613 | { |
614 | |
615 | if (delayx > syncer_maxdelay - 2) |
616 | delayx = syncer_maxdelay - 2; |
617 | return (syncer_delayno + delayx) % syncer_last; |
618 | } |
619 | |
620 | /* |
621 | * Add an item to the syncer work queue. |
622 | */ |
623 | static void |
624 | vn_syncer_add1(struct vnode *vp, int delayx) |
625 | { |
626 | synclist_t *slp; |
627 | |
628 | KASSERT(mutex_owned(&syncer_data_lock)); |
629 | |
630 | if (vp->v_iflag & VI_ONWORKLST) { |
631 | /* |
632 | * Remove in order to adjust the position of the vnode. |
633 | * Note: called from sched_sync(), which will not hold |
634 | * interlock, therefore we cannot modify v_iflag here. |
635 | */ |
636 | slp = &syncer_workitem_pending[vp->v_synclist_slot]; |
637 | TAILQ_REMOVE(slp, vp, v_synclist); |
638 | } else { |
639 | KASSERT(mutex_owned(vp->v_interlock)); |
640 | vp->v_iflag |= VI_ONWORKLST; |
641 | } |
642 | |
643 | vp->v_synclist_slot = sync_delay_slot(delayx); |
644 | |
645 | slp = &syncer_workitem_pending[vp->v_synclist_slot]; |
646 | TAILQ_INSERT_TAIL(slp, vp, v_synclist); |
647 | } |
648 | |
649 | void |
650 | vn_syncer_add_to_worklist(struct vnode *vp, int delayx) |
651 | { |
652 | |
653 | KASSERT(mutex_owned(vp->v_interlock)); |
654 | |
655 | mutex_enter(&syncer_data_lock); |
656 | vn_syncer_add1(vp, delayx); |
657 | mutex_exit(&syncer_data_lock); |
658 | } |
659 | |
660 | /* |
661 | * Remove an item from the syncer work queue. |
662 | */ |
663 | void |
664 | vn_syncer_remove_from_worklist(struct vnode *vp) |
665 | { |
666 | synclist_t *slp; |
667 | |
668 | KASSERT(mutex_owned(vp->v_interlock)); |
669 | |
670 | mutex_enter(&syncer_data_lock); |
671 | if (vp->v_iflag & VI_ONWORKLST) { |
672 | vp->v_iflag &= ~VI_ONWORKLST; |
673 | slp = &syncer_workitem_pending[vp->v_synclist_slot]; |
674 | TAILQ_REMOVE(slp, vp, v_synclist); |
675 | } |
676 | mutex_exit(&syncer_data_lock); |
677 | } |
678 | |
679 | /* |
680 | * Add this mount point to the syncer. |
681 | */ |
682 | void |
683 | vfs_syncer_add_to_worklist(struct mount *mp) |
684 | { |
685 | static int start, incr, next; |
686 | int vdelay; |
687 | |
688 | KASSERT(mutex_owned(&mp->mnt_updating)); |
689 | KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0); |
690 | |
691 | /* |
692 | * We attempt to scatter the mount points on the list |
693 | * so that they will go off at evenly distributed times |
694 | * even if all the filesystems are mounted at once. |
695 | */ |
696 | |
697 | next += incr; |
698 | if (next == 0 || next > syncer_maxdelay) { |
699 | start /= 2; |
700 | incr /= 2; |
701 | if (start == 0) { |
702 | start = syncer_maxdelay / 2; |
703 | incr = syncer_maxdelay; |
704 | } |
705 | next = start; |
706 | } |
707 | mp->mnt_iflag |= IMNT_ONWORKLIST; |
708 | vdelay = sync_delay(mp); |
709 | mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0; |
710 | } |
711 | |
712 | /* |
713 | * Remove the mount point from the syncer. |
714 | */ |
715 | void |
716 | vfs_syncer_remove_from_worklist(struct mount *mp) |
717 | { |
718 | |
719 | KASSERT(mutex_owned(&mp->mnt_updating)); |
720 | KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0); |
721 | |
722 | mp->mnt_iflag &= ~IMNT_ONWORKLIST; |
723 | } |
724 | |
725 | /* |
726 | * Try lazy sync, return true on success. |
727 | */ |
728 | static bool |
729 | lazy_sync_vnode(struct vnode *vp) |
730 | { |
731 | bool synced; |
732 | |
733 | KASSERT(mutex_owned(&syncer_data_lock)); |
734 | |
735 | synced = false; |
736 | /* We are locking in the wrong direction. */ |
737 | if (mutex_tryenter(vp->v_interlock)) { |
738 | mutex_exit(&syncer_data_lock); |
739 | if (vget(vp, LK_NOWAIT, false /* !wait */) == 0) { |
740 | if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { |
741 | synced = true; |
742 | (void) VOP_FSYNC(vp, curlwp->l_cred, |
743 | FSYNC_LAZY, 0, 0); |
744 | vput(vp); |
745 | } else |
746 | vrele(vp); |
747 | } |
748 | mutex_enter(&syncer_data_lock); |
749 | } |
750 | return synced; |
751 | } |
752 | |
753 | /* |
754 | * System filesystem synchronizer daemon. |
755 | */ |
756 | void |
757 | sched_sync(void *arg) |
758 | { |
759 | synclist_t *slp; |
760 | struct vnode *vp; |
761 | struct mount *mp, *nmp; |
762 | time_t starttime; |
763 | bool synced; |
764 | |
765 | for (;;) { |
766 | mutex_enter(&syncer_mutex); |
767 | |
768 | starttime = time_second; |
769 | |
770 | /* |
771 | * Sync mounts whose dirty time has expired. |
772 | */ |
773 | mutex_enter(&mountlist_lock); |
774 | for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { |
775 | if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 || |
776 | mp->mnt_synclist_slot != syncer_delayno) { |
777 | nmp = TAILQ_NEXT(mp, mnt_list); |
778 | continue; |
779 | } |
780 | mp->mnt_synclist_slot = sync_delay_slot(sync_delay(mp)); |
781 | if (vfs_busy(mp, &nmp)) |
782 | continue; |
783 | VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred); |
784 | vfs_unbusy(mp, false, &nmp); |
785 | } |
786 | mutex_exit(&mountlist_lock); |
787 | |
788 | mutex_enter(&syncer_data_lock); |
789 | |
790 | /* |
791 | * Push files whose dirty time has expired. |
792 | */ |
793 | slp = &syncer_workitem_pending[syncer_delayno]; |
794 | syncer_delayno += 1; |
795 | if (syncer_delayno >= syncer_last) |
796 | syncer_delayno = 0; |
797 | |
798 | while ((vp = TAILQ_FIRST(slp)) != NULL) { |
799 | synced = lazy_sync_vnode(vp); |
800 | |
801 | /* |
802 | * XXX The vnode may have been recycled, in which |
803 | * case it may have a new identity. |
804 | */ |
805 | if (TAILQ_FIRST(slp) == vp) { |
806 | /* |
807 | * Put us back on the worklist. The worklist |
808 | * routine will remove us from our current |
809 | * position and then add us back in at a later |
810 | * position. |
811 | * |
812 | * Try again sooner rather than later if |
813 | * we were unable to lock the vnode. Lock |
814 | * failure should not prevent us from doing |
815 | * the sync "soon". |
816 | * |
				 * If we did lock it yet still arrive
				 * here, it's likely that lazy sync is
				 * in progress and so the vnode still
				 * has dirty metadata.
820 | * syncdelay is mainly to get this vnode out |
821 | * of the way so we do not consider it again |
822 | * "soon" in this loop, so the delay time is |
823 | * not critical as long as it is not "soon". |
824 | * While write-back strategy is the file |
825 | * system's domain, we expect write-back to |
826 | * occur no later than syncdelay seconds |
827 | * into the future. |
828 | */ |
829 | vn_syncer_add1(vp, |
830 | synced ? syncdelay : lockdelay); |
831 | } |
832 | } |
833 | mutex_exit(&syncer_mutex); |
834 | |
835 | /* |
836 | * If it has taken us less than a second to process the |
837 | * current work, then wait. Otherwise start right over |
838 | * again. We can still lose time if any single round |
839 | * takes more than two seconds, but it does not really |
840 | * matter as we are just trying to generally pace the |
841 | * filesystem activity. |
842 | */ |
843 | if (time_second == starttime) { |
844 | kpause("syncer" , false, hz, &syncer_data_lock); |
845 | } |
846 | mutex_exit(&syncer_data_lock); |
847 | } |
848 | } |
849 | |
850 | static void |
851 | sysctl_vfs_syncfs_setup(struct sysctllog **clog) |
852 | { |
853 | const struct sysctlnode *rnode, *cnode; |
854 | |
	sysctl_createv(clog, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "sync",
	    SYSCTL_DESCR("syncer options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "delay",
	    SYSCTL_DESCR("max time to delay syncing data"),
	    NULL, 0, &syncdelay, 0,
	    CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "filedelay",
	    SYSCTL_DESCR("time to delay syncing files"),
	    NULL, 0, &filedelay, 0,
	    CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "dirdelay",
	    SYSCTL_DESCR("time to delay syncing directories"),
	    NULL, 0, &dirdelay, 0,
	    CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "metadelay",
	    SYSCTL_DESCR("time to delay syncing metadata"),
	    NULL, 0, &metadelay, 0,
	    CTL_CREATE, CTL_EOL);
889 | } |
890 | |
891 | /* |
892 | * sysctl helper routine to return list of supported fstypes |
893 | */ |
894 | int |
895 | sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS) |
896 | { |
897 | char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)]; |
898 | char *where = oldp; |
899 | struct vfsops *v; |
900 | size_t needed, left, slen; |
901 | int error, first; |
902 | |
903 | if (newp != NULL) |
904 | return (EPERM); |
905 | if (namelen != 0) |
906 | return (EINVAL); |
907 | |
908 | first = 1; |
909 | error = 0; |
910 | needed = 0; |
911 | left = *oldlenp; |
912 | |
913 | sysctl_unlock(); |
914 | mutex_enter(&vfs_list_lock); |
915 | LIST_FOREACH(v, &vfs_list, vfs_list) { |
916 | if (where == NULL) |
917 | needed += strlen(v->vfs_name) + 1; |
918 | else { |
919 | memset(bf, 0, sizeof(bf)); |
920 | if (first) { |
921 | strncpy(bf, v->vfs_name, sizeof(bf)); |
922 | first = 0; |
923 | } else { |
924 | bf[0] = ' '; |
925 | strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); |
926 | } |
927 | bf[sizeof(bf)-1] = '\0'; |
928 | slen = strlen(bf); |
929 | if (left < slen + 1) |
930 | break; |
931 | v->vfs_refcount++; |
932 | mutex_exit(&vfs_list_lock); |
933 | /* +1 to copy out the trailing NUL byte */ |
934 | error = copyout(bf, where, slen + 1); |
935 | mutex_enter(&vfs_list_lock); |
936 | v->vfs_refcount--; |
937 | if (error) |
938 | break; |
939 | where += slen; |
940 | needed += slen; |
941 | left -= slen; |
942 | } |
943 | } |
944 | mutex_exit(&vfs_list_lock); |
945 | sysctl_relock(); |
946 | *oldlenp = needed; |
947 | return (error); |
948 | } |
949 | |
950 | int kinfo_vdebug = 1; |
951 | int kinfo_vgetfailed; |
952 | |
953 | #define KINFO_VNODESLOP 10 |
954 | |
955 | /* |
956 | * Dump vnode list (via sysctl). |
957 | * Copyout address of vnode followed by vnode. |
958 | */ |
959 | int |
960 | sysctl_kern_vnode(SYSCTLFN_ARGS) |
961 | { |
962 | char *where = oldp; |
963 | size_t *sizep = oldlenp; |
964 | struct mount *mp, *nmp; |
965 | vnode_t *vp, vbuf; |
966 | struct vnode_iterator *marker; |
967 | char *bp = where; |
968 | char *ewhere; |
969 | int error; |
970 | |
971 | if (namelen != 0) |
972 | return (EOPNOTSUPP); |
973 | if (newp != NULL) |
974 | return (EPERM); |
975 | |
976 | #define VPTRSZ sizeof(vnode_t *) |
977 | #define VNODESZ sizeof(vnode_t) |
978 | if (where == NULL) { |
979 | *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); |
980 | return (0); |
981 | } |
982 | ewhere = where + *sizep; |
983 | |
984 | sysctl_unlock(); |
985 | mutex_enter(&mountlist_lock); |
986 | for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { |
987 | if (vfs_busy(mp, &nmp)) { |
988 | continue; |
989 | } |
990 | vfs_vnode_iterator_init(mp, &marker); |
991 | while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { |
992 | if (bp + VPTRSZ + VNODESZ > ewhere) { |
993 | vrele(vp); |
994 | vfs_vnode_iterator_destroy(marker); |
995 | vfs_unbusy(mp, false, NULL); |
996 | sysctl_relock(); |
997 | *sizep = bp - where; |
998 | return (ENOMEM); |
999 | } |
1000 | memcpy(&vbuf, vp, VNODESZ); |
1001 | if ((error = copyout(&vp, bp, VPTRSZ)) || |
1002 | (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) { |
1003 | vrele(vp); |
1004 | vfs_vnode_iterator_destroy(marker); |
1005 | vfs_unbusy(mp, false, NULL); |
1006 | sysctl_relock(); |
1007 | return (error); |
1008 | } |
1009 | vrele(vp); |
1010 | bp += VPTRSZ + VNODESZ; |
1011 | } |
1012 | vfs_vnode_iterator_destroy(marker); |
1013 | vfs_unbusy(mp, false, &nmp); |
1014 | } |
1015 | mutex_exit(&mountlist_lock); |
1016 | sysctl_relock(); |
1017 | |
1018 | *sizep = bp - where; |
1019 | return (0); |
1020 | } |
1021 | |
1022 | /* |
1023 | * Set vnode attributes to VNOVAL |
1024 | */ |
1025 | void |
1026 | vattr_null(struct vattr *vap) |
1027 | { |
1028 | |
1029 | memset(vap, 0, sizeof(*vap)); |
1030 | |
1031 | vap->va_type = VNON; |
1032 | |
1033 | /* |
	 * Assign individually so that it is safe even if the size and
	 * sign of each member vary.
1036 | */ |
1037 | vap->va_mode = VNOVAL; |
1038 | vap->va_nlink = VNOVAL; |
1039 | vap->va_uid = VNOVAL; |
1040 | vap->va_gid = VNOVAL; |
1041 | vap->va_fsid = VNOVAL; |
1042 | vap->va_fileid = VNOVAL; |
1043 | vap->va_size = VNOVAL; |
1044 | vap->va_blocksize = VNOVAL; |
1045 | vap->va_atime.tv_sec = |
1046 | vap->va_mtime.tv_sec = |
1047 | vap->va_ctime.tv_sec = |
1048 | vap->va_birthtime.tv_sec = VNOVAL; |
1049 | vap->va_atime.tv_nsec = |
1050 | vap->va_mtime.tv_nsec = |
1051 | vap->va_ctime.tv_nsec = |
1052 | vap->va_birthtime.tv_nsec = VNOVAL; |
1053 | vap->va_gen = VNOVAL; |
1054 | vap->va_flags = VNOVAL; |
1055 | vap->va_rdev = VNOVAL; |
1056 | vap->va_bytes = VNOVAL; |
1057 | } |
1058 | |
1059 | /* |
1060 | * Vnode state to string. |
1061 | */ |
1062 | const char * |
1063 | vstate_name(enum vnode_state state) |
1064 | { |
1065 | |
1066 | switch (state) { |
	case VS_MARKER:
		return "MARKER";
	case VS_LOADING:
		return "LOADING";
	case VS_ACTIVE:
		return "ACTIVE";
	case VS_BLOCKED:
		return "BLOCKED";
	case VS_RECLAIMING:
		return "RECLAIMING";
	case VS_RECLAIMED:
		return "RECLAIMED";
	default:
		return "ILLEGAL";
1081 | } |
1082 | } |
1083 | |
1084 | /* |
1085 | * Print a description of a vnode (common part). |
1086 | */ |
1087 | static void |
1088 | vprint_common(struct vnode *vp, const char *prefix, |
1089 | void (*pr)(const char *, ...) __printflike(1, 2)) |
1090 | { |
1091 | int n; |
1092 | char bf[96]; |
1093 | const uint8_t *cp; |
1094 | vnode_impl_t *node; |
1095 | const char * const vnode_tags[] = { VNODE_TAGS }; |
1096 | const char * const vnode_types[] = { VNODE_TYPES }; |
1097 | const char vnode_flagbits[] = VNODE_FLAGBITS; |
1098 | |
1099 | #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) |
1100 | #define ARRAY_PRINT(idx, arr) \ |
1101 | ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN") |
1102 | |
1103 | node = VNODE_TO_VIMPL(vp); |
1104 | |
1105 | snprintb(bf, sizeof(bf), |
1106 | vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag); |
1107 | |
1108 | (*pr)("vnode %p flags %s\n" , vp, bf); |
1109 | (*pr)("%stag %s(%d) type %s(%d) mount %p typedata %p\n" , prefix, |
1110 | ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, |
1111 | ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, |
1112 | vp->v_mount, vp->v_mountedhere); |
1113 | (*pr)("%susecount %d writecount %d holdcount %d\n" , prefix, |
1114 | vp->v_usecount, vp->v_writecount, vp->v_holdcnt); |
1115 | (*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n" , |
1116 | prefix, vp->v_size, vp->v_writesize, vp->v_numoutput); |
1117 | (*pr)("%sfreelisthd %p data %p lock %p\n" , prefix, |
1118 | vp->v_freelisthd, vp->v_data, &vp->v_lock); |
1119 | |
1120 | (*pr)("%sstate %s key(%p %zd)" , prefix, vstate_name(node->vi_state), |
1121 | node->vi_key.vk_mount, node->vi_key.vk_key_len); |
1122 | n = node->vi_key.vk_key_len; |
1123 | cp = node->vi_key.vk_key; |
1124 | while (n-- > 0) |
1125 | (*pr)(" %02x" , *cp++); |
1126 | (*pr)("\n" ); |
1127 | |
1128 | #undef ARRAY_PRINT |
1129 | #undef ARRAY_SIZE |
1130 | } |
1131 | |
1132 | /* |
1133 | * Print out a description of a vnode. |
1134 | */ |
1135 | void |
1136 | vprint(const char *label, struct vnode *vp) |
1137 | { |
1138 | |
1139 | if (label != NULL) |
1140 | printf("%s: " , label); |
1141 | vprint_common(vp, "\t" , printf); |
1142 | if (vp->v_data != NULL) { |
1143 | printf("\t" ); |
1144 | VOP_PRINT(vp); |
1145 | } |
1146 | } |
1147 | |
1148 | /* Deprecated. Kept for KPI compatibility. */ |
1149 | int |
1150 | vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, |
1151 | mode_t acc_mode, kauth_cred_t cred) |
1152 | { |
1153 | |
1154 | #ifdef DIAGNOSTIC |
1155 | printf("vaccess: deprecated interface used.\n" ); |
1156 | #endif /* DIAGNOSTIC */ |
1157 | |
1158 | return kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(acc_mode, |
1159 | type, file_mode), NULL /* This may panic. */, NULL, |
1160 | genfs_can_access(type, file_mode, uid, gid, acc_mode, cred)); |
1161 | } |
1162 | |
1163 | /* |
1164 | * Given a file system name, look up the vfsops for that |
 * file system, or return NULL if the file system isn't present
1166 | * in the kernel. |
1167 | */ |
1168 | struct vfsops * |
1169 | vfs_getopsbyname(const char *name) |
1170 | { |
1171 | struct vfsops *v; |
1172 | |
1173 | mutex_enter(&vfs_list_lock); |
1174 | LIST_FOREACH(v, &vfs_list, vfs_list) { |
1175 | if (strcmp(v->vfs_name, name) == 0) |
1176 | break; |
1177 | } |
1178 | if (v != NULL) |
1179 | v->vfs_refcount++; |
1180 | mutex_exit(&vfs_list_lock); |
1181 | |
1182 | return (v); |
1183 | } |
1184 | |
1185 | void |
1186 | copy_statvfs_info(struct statvfs *sbp, const struct mount *mp) |
1187 | { |
1188 | const struct statvfs *mbp; |
1189 | |
1190 | if (sbp == (mbp = &mp->mnt_stat)) |
1191 | return; |
1192 | |
1193 | (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx)); |
1194 | sbp->f_fsid = mbp->f_fsid; |
1195 | sbp->f_owner = mbp->f_owner; |
1196 | sbp->f_flag = mbp->f_flag; |
1197 | sbp->f_syncwrites = mbp->f_syncwrites; |
1198 | sbp->f_asyncwrites = mbp->f_asyncwrites; |
1199 | sbp->f_syncreads = mbp->f_syncreads; |
1200 | sbp->f_asyncreads = mbp->f_asyncreads; |
1201 | (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare)); |
1202 | (void)memcpy(sbp->f_fstypename, mbp->f_fstypename, |
1203 | sizeof(sbp->f_fstypename)); |
1204 | (void)memcpy(sbp->f_mntonname, mbp->f_mntonname, |
1205 | sizeof(sbp->f_mntonname)); |
1206 | (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, |
1207 | sizeof(sbp->f_mntfromname)); |
1208 | sbp->f_namemax = mbp->f_namemax; |
1209 | } |
1210 | |
1211 | int |
1212 | set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom, |
1213 | const char *vfsname, struct mount *mp, struct lwp *l) |
1214 | { |
1215 | int error; |
1216 | size_t size; |
1217 | struct statvfs *sfs = &mp->mnt_stat; |
1218 | int (*fun)(const void *, void *, size_t, size_t *); |
1219 | |
1220 | (void)strlcpy(mp->mnt_stat.f_fstypename, vfsname, |
1221 | sizeof(mp->mnt_stat.f_fstypename)); |
1222 | |
1223 | if (onp) { |
1224 | struct cwdinfo *cwdi = l->l_proc->p_cwdi; |
1225 | fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr; |
1226 | if (cwdi->cwdi_rdir != NULL) { |
1227 | size_t len; |
1228 | char *bp; |
1229 | char *path = PNBUF_GET(); |
1230 | |
1231 | bp = path + MAXPATHLEN; |
1232 | *--bp = '\0'; |
1233 | rw_enter(&cwdi->cwdi_lock, RW_READER); |
1234 | error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, |
1235 | path, MAXPATHLEN / 2, 0, l); |
1236 | rw_exit(&cwdi->cwdi_lock); |
1237 | if (error) { |
1238 | PNBUF_PUT(path); |
1239 | return error; |
1240 | } |
1241 | |
1242 | len = strlen(bp); |
1243 | if (len > sizeof(sfs->f_mntonname) - 1) |
1244 | len = sizeof(sfs->f_mntonname) - 1; |
1245 | (void)strncpy(sfs->f_mntonname, bp, len); |
1246 | PNBUF_PUT(path); |
1247 | |
1248 | if (len < sizeof(sfs->f_mntonname) - 1) { |
1249 | error = (*fun)(onp, &sfs->f_mntonname[len], |
1250 | sizeof(sfs->f_mntonname) - len - 1, &size); |
1251 | if (error) |
1252 | return error; |
1253 | size += len; |
1254 | } else { |
1255 | size = len; |
1256 | } |
1257 | } else { |
1258 | error = (*fun)(onp, &sfs->f_mntonname, |
1259 | sizeof(sfs->f_mntonname) - 1, &size); |
1260 | if (error) |
1261 | return error; |
1262 | } |
1263 | (void)memset(sfs->f_mntonname + size, 0, |
1264 | sizeof(sfs->f_mntonname) - size); |
1265 | } |
1266 | |
1267 | if (fromp) { |
1268 | fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr; |
1269 | error = (*fun)(fromp, sfs->f_mntfromname, |
1270 | sizeof(sfs->f_mntfromname) - 1, &size); |
1271 | if (error) |
1272 | return error; |
1273 | (void)memset(sfs->f_mntfromname + size, 0, |
1274 | sizeof(sfs->f_mntfromname) - size); |
1275 | } |
1276 | return 0; |
1277 | } |
1278 | |
1279 | void |
1280 | vfs_timestamp(struct timespec *ts) |
1281 | { |
1282 | |
1283 | nanotime(ts); |
1284 | } |
1285 | |
1286 | time_t rootfstime; /* recorded root fs time, if known */ |
1287 | void |
1288 | setrootfstime(time_t t) |
1289 | { |
1290 | rootfstime = t; |
1291 | } |
1292 | |
static const uint8_t vttodt_tab[] = {
1294 | [VNON] = DT_UNKNOWN, |
1295 | [VREG] = DT_REG, |
1296 | [VDIR] = DT_DIR, |
1297 | [VBLK] = DT_BLK, |
1298 | [VCHR] = DT_CHR, |
1299 | [VLNK] = DT_LNK, |
1300 | [VSOCK] = DT_SOCK, |
1301 | [VFIFO] = DT_FIFO, |
1302 | [VBAD] = DT_UNKNOWN |
1303 | }; |
1304 | |
1305 | uint8_t |
1306 | vtype2dt(enum vtype vt) |
1307 | { |
1308 | |
1309 | CTASSERT(VBAD == __arraycount(vttodt_tab) - 1); |
1310 | return vttodt_tab[vt]; |
1311 | } |
1312 | |
1313 | int |
1314 | VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c) |
1315 | { |
1316 | int error; |
1317 | |
1318 | KERNEL_LOCK(1, NULL); |
1319 | error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c); |
1320 | KERNEL_UNLOCK_ONE(NULL); |
1321 | |
1322 | return error; |
1323 | } |
1324 | |
1325 | int |
1326 | VFS_START(struct mount *mp, int a) |
1327 | { |
1328 | int error; |
1329 | |
1330 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1331 | KERNEL_LOCK(1, NULL); |
1332 | } |
1333 | error = (*(mp->mnt_op->vfs_start))(mp, a); |
1334 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1335 | KERNEL_UNLOCK_ONE(NULL); |
1336 | } |
1337 | |
1338 | return error; |
1339 | } |
1340 | |
1341 | int |
1342 | VFS_UNMOUNT(struct mount *mp, int a) |
1343 | { |
1344 | int error; |
1345 | |
1346 | KERNEL_LOCK(1, NULL); |
1347 | error = (*(mp->mnt_op->vfs_unmount))(mp, a); |
1348 | KERNEL_UNLOCK_ONE(NULL); |
1349 | |
1350 | return error; |
1351 | } |
1352 | |
1353 | int |
1354 | VFS_ROOT(struct mount *mp, struct vnode **a) |
1355 | { |
1356 | int error; |
1357 | |
1358 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1359 | KERNEL_LOCK(1, NULL); |
1360 | } |
1361 | error = (*(mp->mnt_op->vfs_root))(mp, a); |
1362 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1363 | KERNEL_UNLOCK_ONE(NULL); |
1364 | } |
1365 | |
1366 | return error; |
1367 | } |
1368 | |
1369 | int |
1370 | VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args) |
1371 | { |
1372 | int error; |
1373 | |
1374 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1375 | KERNEL_LOCK(1, NULL); |
1376 | } |
1377 | error = (*(mp->mnt_op->vfs_quotactl))(mp, args); |
1378 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1379 | KERNEL_UNLOCK_ONE(NULL); |
1380 | } |
1381 | |
1382 | return error; |
1383 | } |
1384 | |
1385 | int |
1386 | VFS_STATVFS(struct mount *mp, struct statvfs *a) |
1387 | { |
1388 | int error; |
1389 | |
1390 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1391 | KERNEL_LOCK(1, NULL); |
1392 | } |
1393 | error = (*(mp->mnt_op->vfs_statvfs))(mp, a); |
1394 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1395 | KERNEL_UNLOCK_ONE(NULL); |
1396 | } |
1397 | |
1398 | return error; |
1399 | } |
1400 | |
1401 | int |
1402 | VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b) |
1403 | { |
1404 | int error; |
1405 | |
1406 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1407 | KERNEL_LOCK(1, NULL); |
1408 | } |
1409 | error = (*(mp->mnt_op->vfs_sync))(mp, a, b); |
1410 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1411 | KERNEL_UNLOCK_ONE(NULL); |
1412 | } |
1413 | |
1414 | return error; |
1415 | } |
1416 | |
1417 | int |
1418 | VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b) |
1419 | { |
1420 | int error; |
1421 | |
1422 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1423 | KERNEL_LOCK(1, NULL); |
1424 | } |
1425 | error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b); |
1426 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1427 | KERNEL_UNLOCK_ONE(NULL); |
1428 | } |
1429 | |
1430 | return error; |
1431 | } |
1432 | |
1433 | int |
1434 | VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b) |
1435 | { |
1436 | int error; |
1437 | |
1438 | if ((vp->v_vflag & VV_MPSAFE) == 0) { |
1439 | KERNEL_LOCK(1, NULL); |
1440 | } |
1441 | error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b); |
1442 | if ((vp->v_vflag & VV_MPSAFE) == 0) { |
1443 | KERNEL_UNLOCK_ONE(NULL); |
1444 | } |
1445 | |
1446 | return error; |
1447 | } |
1448 | |
1449 | int |
1450 | VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b) |
1451 | { |
1452 | int error; |
1453 | |
1454 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1455 | KERNEL_LOCK(1, NULL); |
1456 | } |
1457 | error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b); |
1458 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1459 | KERNEL_UNLOCK_ONE(NULL); |
1460 | } |
1461 | |
1462 | return error; |
1463 | } |
1464 | |
1465 | int |
1466 | VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d) |
1467 | { |
1468 | int error; |
1469 | |
1470 | KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */ |
1471 | error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d); |
1472 | KERNEL_UNLOCK_ONE(NULL); /* XXX */ |
1473 | |
1474 | return error; |
1475 | } |
1476 | |
1477 | int |
1478 | VFS_SUSPENDCTL(struct mount *mp, int a) |
1479 | { |
1480 | int error; |
1481 | |
1482 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1483 | KERNEL_LOCK(1, NULL); |
1484 | } |
1485 | error = (*(mp->mnt_op->vfs_suspendctl))(mp, a); |
1486 | if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { |
1487 | KERNEL_UNLOCK_ONE(NULL); |
1488 | } |
1489 | |
1490 | return error; |
1491 | } |
1492 | |
1493 | #if defined(DDB) || defined(DEBUGPRINT) |
1494 | static const char buf_flagbits[] = BUF_FLAGBITS; |
1495 | |
1496 | void |
1497 | vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...)) |
1498 | { |
1499 | char bf[1024]; |
1500 | |
1501 | (*pr)(" vp %p lblkno 0x%" PRIx64" blkno 0x%" PRIx64" rawblkno 0x%" |
1502 | PRIx64 " dev 0x%x\n" , |
1503 | bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev); |
1504 | |
1505 | snprintb(bf, sizeof(bf), |
1506 | buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags); |
1507 | (*pr)(" error %d flags 0x%s\n" , bp->b_error, bf); |
1508 | |
1509 | (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n" , |
1510 | bp->b_bufsize, bp->b_bcount, bp->b_resid); |
1511 | (*pr)(" data %p saveaddr %p\n" , |
1512 | bp->b_data, bp->b_saveaddr); |
1513 | (*pr)(" iodone %p objlock %p\n" , bp->b_iodone, bp->b_objlock); |
1514 | } |
1515 | |
1516 | void |
1517 | vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...)) |
1518 | { |
1519 | |
1520 | uvm_object_printit(&vp->v_uobj, full, pr); |
1521 | (*pr)("\n" ); |
1522 | vprint_common(vp, "" , printf); |
1523 | if (full) { |
1524 | struct buf *bp; |
1525 | |
1526 | (*pr)("clean bufs:\n" ); |
1527 | LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { |
1528 | (*pr)(" bp %p\n" , bp); |
1529 | vfs_buf_print(bp, full, pr); |
1530 | } |
1531 | |
1532 | (*pr)("dirty bufs:\n" ); |
1533 | LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { |
1534 | (*pr)(" bp %p\n" , bp); |
1535 | vfs_buf_print(bp, full, pr); |
1536 | } |
1537 | } |
1538 | } |
1539 | |
1540 | void |
1541 | vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...)) |
1542 | { |
1543 | char sbuf[256]; |
1544 | |
1545 | (*pr)("vnodecovered = %p data = %p\n" , |
1546 | mp->mnt_vnodecovered,mp->mnt_data); |
1547 | |
1548 | (*pr)("fs_bshift %d dev_bshift = %d\n" , |
1549 | mp->mnt_fs_bshift,mp->mnt_dev_bshift); |
1550 | |
1551 | snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag); |
1552 | (*pr)("flag = %s\n" , sbuf); |
1553 | |
1554 | snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag); |
1555 | (*pr)("iflag = %s\n" , sbuf); |
1556 | |
1557 | (*pr)("refcnt = %d unmounting @ %p updating @ %p\n" , mp->mnt_refcnt, |
1558 | &mp->mnt_unmounting, &mp->mnt_updating); |
1559 | |
1560 | (*pr)("statvfs cache:\n" ); |
1561 | (*pr)("\tbsize = %lu\n" ,mp->mnt_stat.f_bsize); |
1562 | (*pr)("\tfrsize = %lu\n" ,mp->mnt_stat.f_frsize); |
1563 | (*pr)("\tiosize = %lu\n" ,mp->mnt_stat.f_iosize); |
1564 | |
1565 | (*pr)("\tblocks = %" PRIu64"\n" ,mp->mnt_stat.f_blocks); |
1566 | (*pr)("\tbfree = %" PRIu64"\n" ,mp->mnt_stat.f_bfree); |
1567 | (*pr)("\tbavail = %" PRIu64"\n" ,mp->mnt_stat.f_bavail); |
1568 | (*pr)("\tbresvd = %" PRIu64"\n" ,mp->mnt_stat.f_bresvd); |
1569 | |
1570 | (*pr)("\tfiles = %" PRIu64"\n" ,mp->mnt_stat.f_files); |
1571 | (*pr)("\tffree = %" PRIu64"\n" ,mp->mnt_stat.f_ffree); |
1572 | (*pr)("\tfavail = %" PRIu64"\n" ,mp->mnt_stat.f_favail); |
1573 | (*pr)("\tfresvd = %" PRIu64"\n" ,mp->mnt_stat.f_fresvd); |
1574 | |
1575 | (*pr)("\tf_fsidx = { 0x%" PRIx32", 0x%" PRIx32" }\n" , |
1576 | mp->mnt_stat.f_fsidx.__fsid_val[0], |
1577 | mp->mnt_stat.f_fsidx.__fsid_val[1]); |
1578 | |
1579 | (*pr)("\towner = %" PRIu32"\n" ,mp->mnt_stat.f_owner); |
1580 | (*pr)("\tnamemax = %lu\n" ,mp->mnt_stat.f_namemax); |
1581 | |
1582 | snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag); |
1583 | |
1584 | (*pr)("\tflag = %s\n" ,sbuf); |
1585 | (*pr)("\tsyncwrites = %" PRIu64 "\n" ,mp->mnt_stat.f_syncwrites); |
1586 | (*pr)("\tasyncwrites = %" PRIu64 "\n" ,mp->mnt_stat.f_asyncwrites); |
1587 | (*pr)("\tsyncreads = %" PRIu64 "\n" ,mp->mnt_stat.f_syncreads); |
1588 | (*pr)("\tasyncreads = %" PRIu64 "\n" ,mp->mnt_stat.f_asyncreads); |
1589 | (*pr)("\tfstypename = %s\n" ,mp->mnt_stat.f_fstypename); |
1590 | (*pr)("\tmntonname = %s\n" ,mp->mnt_stat.f_mntonname); |
1591 | (*pr)("\tmntfromname = %s\n" ,mp->mnt_stat.f_mntfromname); |
1592 | |
1593 | { |
1594 | int cnt = 0; |
1595 | struct vnode *vp; |
1596 | (*pr)("locked vnodes =" ); |
1597 | TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { |
1598 | if (VOP_ISLOCKED(vp)) { |
1599 | if ((++cnt % 6) == 0) { |
1600 | (*pr)(" %p,\n\t" , vp); |
1601 | } else { |
1602 | (*pr)(" %p," , vp); |
1603 | } |
1604 | } |
1605 | } |
1606 | (*pr)("\n" ); |
1607 | } |
1608 | |
1609 | if (full) { |
1610 | int cnt = 0; |
1611 | struct vnode *vp; |
1612 | (*pr)("all vnodes =" ); |
1613 | TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { |
1614 | if (!TAILQ_NEXT(vp, v_mntvnodes)) { |
1615 | (*pr)(" %p" , vp); |
1616 | } else if ((++cnt % 6) == 0) { |
1617 | (*pr)(" %p,\n\t" , vp); |
1618 | } else { |
1619 | (*pr)(" %p," , vp); |
1620 | } |
1621 | } |
1622 | (*pr)("\n" , vp); |
1623 | } |
1624 | } |
1625 | |
1626 | /* |
1627 | * List all of the locked vnodes in the system. |
1628 | */ |
1629 | void printlockedvnodes(void); |
1630 | |
1631 | void |
1632 | printlockedvnodes(void) |
1633 | { |
1634 | struct mount *mp, *nmp; |
1635 | struct vnode *vp; |
1636 | |
1637 | printf("Locked vnodes\n" ); |
1638 | mutex_enter(&mountlist_lock); |
1639 | for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { |
1640 | if (vfs_busy(mp, &nmp)) { |
1641 | continue; |
1642 | } |
1643 | TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { |
1644 | if (VOP_ISLOCKED(vp)) |
1645 | vprint(NULL, vp); |
1646 | } |
1647 | mutex_enter(&mountlist_lock); |
1648 | vfs_unbusy(mp, false, &nmp); |
1649 | } |
1650 | mutex_exit(&mountlist_lock); |
1651 | } |
1652 | |
1653 | #endif /* DDB || DEBUGPRINT */ |
1654 | |