1 | /* $NetBSD: lfs_syscalls.c,v 1.172 2015/10/15 06:15:48 dholland Exp $ */ |
2 | |
3 | /*- |
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2008
5 | * The NetBSD Foundation, Inc. |
6 | * All rights reserved. |
7 | * |
8 | * This code is derived from software contributed to The NetBSD Foundation |
9 | * by Konrad E. Schroder <perseant@hhhh.org>. |
10 | * |
11 | * Redistribution and use in source and binary forms, with or without |
12 | * modification, are permitted provided that the following conditions |
13 | * are met: |
14 | * 1. Redistributions of source code must retain the above copyright |
15 | * notice, this list of conditions and the following disclaimer. |
16 | * 2. Redistributions in binary form must reproduce the above copyright |
17 | * notice, this list of conditions and the following disclaimer in the |
18 | * documentation and/or other materials provided with the distribution. |
19 | * |
20 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
21 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
22 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
23 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
24 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
25 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
26 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
27 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
28 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
29 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
30 | * POSSIBILITY OF SUCH DAMAGE. |
31 | */ |
32 | /*- |
33 | * Copyright (c) 1991, 1993, 1994 |
34 | * The Regents of the University of California. All rights reserved. |
35 | * |
36 | * Redistribution and use in source and binary forms, with or without |
37 | * modification, are permitted provided that the following conditions |
38 | * are met: |
39 | * 1. Redistributions of source code must retain the above copyright |
40 | * notice, this list of conditions and the following disclaimer. |
41 | * 2. Redistributions in binary form must reproduce the above copyright |
42 | * notice, this list of conditions and the following disclaimer in the |
43 | * documentation and/or other materials provided with the distribution. |
44 | * 3. Neither the name of the University nor the names of its contributors |
45 | * may be used to endorse or promote products derived from this software |
46 | * without specific prior written permission. |
47 | * |
48 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
49 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
50 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
51 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
52 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
53 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
54 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
55 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
56 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
57 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
58 | * SUCH DAMAGE. |
59 | * |
60 | * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95 |
61 | */ |
62 | |
63 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.172 2015/10/15 06:15:48 dholland Exp $");
65 | |
66 | #ifndef LFS |
67 | # define LFS /* for prototypes in syscallargs.h */ |
68 | #endif |
69 | |
70 | #include <sys/param.h> |
71 | #include <sys/systm.h> |
72 | #include <sys/proc.h> |
73 | #include <sys/buf.h> |
74 | #include <sys/mount.h> |
75 | #include <sys/vnode.h> |
76 | #include <sys/kernel.h> |
77 | #include <sys/kauth.h> |
78 | #include <sys/syscallargs.h> |
79 | |
80 | #include <ufs/lfs/ulfs_inode.h> |
81 | #include <ufs/lfs/ulfsmount.h> |
82 | #include <ufs/lfs/ulfs_extern.h> |
83 | |
84 | #include <ufs/lfs/lfs.h> |
85 | #include <ufs/lfs/lfs_accessors.h> |
86 | #include <ufs/lfs/lfs_kernel.h> |
87 | #include <ufs/lfs/lfs_extern.h> |
88 | |
89 | static int lfs_fastvget(struct mount *, ino_t, BLOCK_INFO *, int, |
90 | struct vnode **); |
91 | static struct buf *lfs_fakebuf(struct lfs *, struct vnode *, daddr_t, |
92 | size_t, void *); |
93 | |
94 | /* |
95 | * sys_lfs_markv: |
96 | * |
97 | * This will mark inodes and blocks dirty, so they are written into the log. |
98 | * It will block until all the blocks have been written. The segment create |
99 | * time passed in the block_info and inode_info structures is used to decide |
100 | * if the data is valid for each block (in case some process dirtied a block |
101 | * or inode that is being cleaned between the determination that a block is |
102 | * live and the lfs_markv call). |
103 | * |
104 | * 0 on success |
 * -1/errno is returned on error.
106 | */ |
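/*
 * Illustrative sketch only (not taken from lfs_cleanerd): a userland
 * cleaner drives this interface roughly as follows, using the same
 * argument layout as the syscall stubs below.
 *
 *	BLOCK_INFO bi[n];
 *	... fill in bi_inode, bi_lbn, bi_daddr, bi_segcreate, bi_version,
 *	    bi_bp (pointing at a copy of the block contents) and bi_size
 *	    for each block believed live in the segment being cleaned ...
 *	if (lfs_markv(&fsid, bi, n) < 0 && errno == EAGAIN)
 *		... some blocks were skipped; rescan and retry later ...
 */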
107 | #ifdef USE_64BIT_SYSCALLS |
108 | int |
109 | sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) |
110 | { |
111 | /* { |
112 | syscallarg(fsid_t *) fsidp; |
113 | syscallarg(struct block_info *) blkiov; |
114 | syscallarg(int) blkcnt; |
115 | } */ |
116 | BLOCK_INFO *blkiov; |
117 | int blkcnt, error; |
118 | fsid_t fsid; |
119 | struct lfs *fs; |
120 | struct mount *mntp; |
121 | |
122 | if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) |
123 | return (error); |
124 | |
125 | if ((mntp = vfs_getvfs(&fsid)) == NULL) |
126 | return (ENOENT); |
127 | fs = VFSTOULFS(mntp)->um_lfs; |
128 | |
129 | blkcnt = SCARG(uap, blkcnt); |
130 | if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) |
131 | return (EINVAL); |
132 | |
133 | KERNEL_LOCK(1, NULL); |
134 | blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); |
135 | if ((error = copyin(SCARG(uap, blkiov), blkiov, |
136 | blkcnt * sizeof(BLOCK_INFO))) != 0) |
137 | goto out; |
138 | |
139 | if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0) |
140 | copyout(blkiov, SCARG(uap, blkiov), |
141 | blkcnt * sizeof(BLOCK_INFO)); |
142 | out: |
143 | lfs_free(fs, blkiov, LFS_NB_BLKIOV); |
144 | KERNEL_UNLOCK_ONE(NULL); |
145 | return error; |
146 | } |
147 | #else |
148 | int |
149 | sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) |
150 | { |
151 | /* { |
152 | syscallarg(fsid_t *) fsidp; |
153 | syscallarg(struct block_info *) blkiov; |
154 | syscallarg(int) blkcnt; |
155 | } */ |
156 | BLOCK_INFO *blkiov; |
157 | BLOCK_INFO_15 *blkiov15; |
158 | int i, blkcnt, error; |
159 | fsid_t fsid; |
160 | struct lfs *fs; |
161 | struct mount *mntp; |
162 | |
163 | if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) |
164 | return (error); |
165 | |
166 | if ((mntp = vfs_getvfs(&fsid)) == NULL) |
167 | return (ENOENT); |
168 | fs = VFSTOULFS(mntp)->um_lfs; |
169 | |
170 | blkcnt = SCARG(uap, blkcnt); |
171 | if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) |
172 | return (EINVAL); |
173 | |
174 | KERNEL_LOCK(1, NULL); |
175 | blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); |
176 | blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); |
177 | if ((error = copyin(SCARG(uap, blkiov), blkiov15, |
178 | blkcnt * sizeof(BLOCK_INFO_15))) != 0) |
179 | goto out; |
180 | |
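	/*
	 * Convert the userland block_info records (BLOCK_INFO_15 layout)
	 * into the in-kernel BLOCK_INFO format, field by field.
	 */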
181 | for (i = 0; i < blkcnt; i++) { |
182 | blkiov[i].bi_inode = blkiov15[i].bi_inode; |
183 | blkiov[i].bi_lbn = blkiov15[i].bi_lbn; |
184 | blkiov[i].bi_daddr = blkiov15[i].bi_daddr; |
185 | blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; |
186 | blkiov[i].bi_version = blkiov15[i].bi_version; |
187 | blkiov[i].bi_bp = blkiov15[i].bi_bp; |
188 | blkiov[i].bi_size = blkiov15[i].bi_size; |
189 | } |
190 | |
191 | if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0) { |
192 | for (i = 0; i < blkcnt; i++) { |
193 | blkiov15[i].bi_inode = blkiov[i].bi_inode; |
194 | blkiov15[i].bi_lbn = blkiov[i].bi_lbn; |
195 | blkiov15[i].bi_daddr = blkiov[i].bi_daddr; |
196 | blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; |
197 | blkiov15[i].bi_version = blkiov[i].bi_version; |
198 | blkiov15[i].bi_bp = blkiov[i].bi_bp; |
199 | blkiov15[i].bi_size = blkiov[i].bi_size; |
200 | } |
201 | copyout(blkiov15, SCARG(uap, blkiov), |
202 | blkcnt * sizeof(BLOCK_INFO_15)); |
203 | } |
204 | out: |
205 | lfs_free(fs, blkiov, LFS_NB_BLKIOV); |
206 | lfs_free(fs, blkiov15, LFS_NB_BLKIOV); |
207 | KERNEL_UNLOCK_ONE(NULL); |
208 | return error; |
209 | } |
210 | #endif |
211 | |
212 | #define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS) |
213 | |
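/*
 * Do the work of lfs_markv(2): mark the given blocks and their inodes
 * dirty and push them to disk through the segment writer.  Called from
 * the sys_lfs_markv() stubs above with an in-kernel copy of the
 * BLOCK_INFO array.
 */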
214 | int |
215 | lfs_markv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov, |
216 | int blkcnt) |
217 | { |
218 | BLOCK_INFO *blkp; |
219 | IFILE *ifp; |
220 | struct buf *bp; |
221 | struct inode *ip = NULL; |
222 | struct lfs *fs; |
223 | struct mount *mntp; |
224 | struct ulfsmount *ump; |
225 | struct vnode *vp; |
226 | ino_t lastino; |
227 | daddr_t b_daddr; |
228 | int cnt, error; |
229 | int do_again = 0; |
230 | int numrefed = 0; |
231 | ino_t maxino; |
232 | size_t obsize; |
233 | |
234 | /* number of blocks/inodes that we have already bwrite'ed */ |
235 | int nblkwritten, ninowritten; |
236 | |
237 | error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, |
238 | KAUTH_REQ_SYSTEM_LFS_MARKV, NULL, NULL, NULL); |
239 | if (error) |
240 | return (error); |
241 | |
242 | if ((mntp = vfs_getvfs(fsidp)) == NULL) |
243 | return (ENOENT); |
244 | |
245 | ump = VFSTOULFS(mntp); |
246 | fs = ump->um_lfs; |
247 | |
248 | if (fs->lfs_ronly) |
249 | return EROFS; |
250 | |
251 | maxino = (lfs_fragstoblks(fs, lfs_dino_getblocks(fs, VTOI(fs->lfs_ivnode)->i_din)) - |
252 | lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs); |
253 | |
254 | cnt = blkcnt; |
255 | |
256 | if ((error = vfs_busy(mntp, NULL)) != 0) |
257 | return (error); |
258 | |
259 | /* |
	 * Take the seglock here so that, even if we have to sleep, the
	 * blocks we were handed cannot become invalid in the meantime.
263 | * |
264 | * It is also important to note here that unless we specify SEGM_CKP, |
265 | * any Ifile blocks that we might be asked to clean will never get |
266 | * to the disk. |
267 | */ |
268 | lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); |
269 | |
270 | /* Mark blocks/inodes dirty. */ |
271 | error = 0; |
272 | |
273 | /* these were inside the initialization for the for loop */ |
274 | vp = NULL; |
275 | lastino = LFS_UNUSED_INUM; |
276 | nblkwritten = ninowritten = 0; |
277 | for (blkp = blkiov; cnt--; ++blkp) |
278 | { |
279 | /* Bounds-check incoming data, avoid panic for failed VGET */ |
280 | if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) { |
281 | error = EINVAL; |
282 | goto err3; |
283 | } |
284 | /* |
285 | * Get the IFILE entry (only once) and see if the file still |
286 | * exists. |
287 | */ |
288 | if (lastino != blkp->bi_inode) { |
289 | /* |
290 | * Finish the old file, if there was one. |
291 | */ |
292 | if (vp != NULL) { |
293 | vput(vp); |
294 | vp = NULL; |
295 | numrefed--; |
296 | } |
297 | |
298 | /* |
299 | * Start a new file |
300 | */ |
301 | lastino = blkp->bi_inode; |
302 | |
303 | /* Get the vnode/inode. */ |
304 | error = lfs_fastvget(mntp, blkp->bi_inode, blkp, |
305 | LK_EXCLUSIVE | LK_NOWAIT, &vp); |
306 | if (error) { |
307 | DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget" |
				    " failed with %d (ino %d, segment %d)\n",
309 | error, blkp->bi_inode, |
310 | lfs_dtosn(fs, blkp->bi_daddr))); |
311 | /* |
312 | * If we got EAGAIN, that means that the |
313 | * Inode was locked. This is |
314 | * recoverable: just clean the rest of |
315 | * this segment, and let the cleaner try |
316 | * again with another. (When the |
317 | * cleaner runs again, this segment will |
318 | * sort high on the list, since it is |
319 | * now almost entirely empty.) |
320 | */ |
321 | if (error == EAGAIN) { |
322 | error = 0; |
323 | do_again++; |
324 | } else |
325 | KASSERT(error == ENOENT); |
326 | KASSERT(vp == NULL); |
327 | ip = NULL; |
328 | continue; |
329 | } |
330 | |
331 | ip = VTOI(vp); |
332 | numrefed++; |
333 | ninowritten++; |
334 | } else if (vp == NULL) { |
335 | /* |
336 | * This can only happen if the vnode is dead (or |
337 | * in any case we can't get it...e.g., it is |
			 * locked).  Keep going.
339 | */ |
340 | continue; |
341 | } |
342 | |
343 | /* Past this point we are guaranteed that vp, ip are valid. */ |
344 | |
345 | /* Can't clean VU_DIROP directories in case of truncation */ |
346 | /* XXX - maybe we should mark removed dirs specially? */ |
347 | if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) { |
348 | do_again++; |
349 | continue; |
350 | } |
351 | |
352 | /* If this BLOCK_INFO didn't contain a block, keep going. */ |
353 | if (blkp->bi_lbn == LFS_UNUSED_LBN) { |
354 | /* XXX need to make sure that the inode gets written in this case */ |
355 | /* XXX but only write the inode if it's the right one */ |
356 | if (blkp->bi_inode != LFS_IFILE_INUM) { |
357 | LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); |
358 | if (lfs_if_getdaddr(fs, ifp) == blkp->bi_daddr) { |
359 | mutex_enter(&lfs_lock); |
360 | LFS_SET_UINO(ip, IN_CLEANING); |
361 | mutex_exit(&lfs_lock); |
362 | } |
363 | brelse(bp, 0); |
364 | } |
365 | continue; |
366 | } |
367 | |
368 | b_daddr = 0; |
369 | if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) || |
370 | LFS_DBTOFSB(fs, b_daddr) != blkp->bi_daddr) |
371 | { |
372 | if (lfs_dtosn(fs, LFS_DBTOFSB(fs, b_daddr)) == |
373 | lfs_dtosn(fs, blkp->bi_daddr)) |
374 | { |
				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %jx vs %jx\n",
376 | (intmax_t)blkp->bi_daddr, (intmax_t)LFS_DBTOFSB(fs, b_daddr))); |
377 | } |
378 | do_again++; |
379 | continue; |
380 | } |
381 | |
382 | /* |
383 | * Check block sizes. The blocks being cleaned come from |
384 | * disk, so they should have the same size as their on-disk |
385 | * counterparts. |
386 | */ |
387 | if (blkp->bi_lbn >= 0) |
388 | obsize = lfs_blksize(fs, ip, blkp->bi_lbn); |
389 | else |
390 | obsize = lfs_sb_getbsize(fs); |
391 | /* Check for fragment size change */ |
392 | if (blkp->bi_lbn >= 0 && blkp->bi_lbn < ULFS_NDADDR) { |
393 | obsize = ip->i_lfs_fragsize[blkp->bi_lbn]; |
394 | } |
395 | if (obsize != blkp->bi_size) { |
396 | DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %jd wrong" |
			    " size (%ld != %d), try again\n",
398 | blkp->bi_inode, (intmax_t)blkp->bi_lbn, |
399 | (long) obsize, blkp->bi_size)); |
400 | do_again++; |
401 | continue; |
402 | } |
403 | |
404 | /* |
405 | * If we get to here, then we are keeping the block. If |
406 | * it is an indirect block, we want to actually put it |
407 | * in the buffer cache so that it can be updated in the |
408 | * finish_meta section. If it's not, we need to |
409 | * allocate a fake buffer so that writeseg can perform |
410 | * the copyin and write the buffer. |
411 | */ |
412 | if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) { |
413 | /* Data Block */ |
414 | bp = lfs_fakebuf(fs, vp, blkp->bi_lbn, |
415 | blkp->bi_size, blkp->bi_bp); |
416 | /* Pretend we used bread() to get it */ |
417 | bp->b_blkno = LFS_FSBTODB(fs, blkp->bi_daddr); |
418 | } else { |
419 | /* Indirect block or ifile */ |
420 | if (blkp->bi_size != lfs_sb_getbsize(fs) && |
421 | ip->i_number != LFS_IFILE_INUM) |
422 | panic("lfs_markv: partial indirect block?" |
				      " size=%d\n", blkp->bi_size);
424 | bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0); |
425 | if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) { |
426 | /* |
427 | * The block in question was not found |
428 | * in the cache; i.e., the block that |
429 | * getblk() returned is empty. So, we |
430 | * can (and should) copy in the |
431 | * contents, because we've already |
432 | * determined that this was the right |
433 | * version of this block on disk. |
434 | * |
435 | * And, it can't have changed underneath |
436 | * us, because we have the segment lock. |
437 | */ |
438 | error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size); |
439 | if (error) |
440 | goto err2; |
441 | } |
442 | } |
443 | if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0) |
444 | goto err2; |
445 | |
446 | nblkwritten++; |
447 | /* |
		 * XXX should account for indirect blocks and ifile pages as well
449 | */ |
450 | if (nblkwritten + lfs_lblkno(fs, ninowritten * DINOSIZE(fs)) |
451 | > LFS_MARKV_MAX_BLOCKS) { |
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
453 | nblkwritten, ninowritten)); |
454 | lfs_segwrite(mntp, SEGM_CLEAN); |
455 | nblkwritten = ninowritten = 0; |
456 | } |
457 | } |
458 | |
459 | /* |
460 | * Finish the old file, if there was one |
461 | */ |
462 | if (vp != NULL) { |
463 | vput(vp); |
464 | vp = NULL; |
465 | numrefed--; |
466 | } |
467 | |
468 | #ifdef DIAGNOSTIC |
469 | if (numrefed != 0) |
		panic("lfs_markv: numrefed=%d", numrefed);
471 | #endif |
	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
473 | nblkwritten, ninowritten)); |
474 | |
475 | /* |
476 | * The last write has to be SEGM_SYNC, because of calling semantics. |
477 | * It also has to be SEGM_CKP, because otherwise we could write |
478 | * over the newly cleaned data contained in a checkpoint, and then |
479 | * we'd be unhappy at recovery time. |
480 | */ |
481 | lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); |
482 | |
483 | lfs_segunlock(fs); |
484 | |
485 | vfs_unbusy(mntp, false, NULL); |
486 | if (error) |
487 | return (error); |
488 | else if (do_again) |
489 | return EAGAIN; |
490 | |
491 | return 0; |
492 | |
493 | err2: |
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));
495 | |
496 | /* |
497 | * XXX we're here because copyin() failed. |
498 | * XXX it means that we can't trust the cleanerd. too bad. |
499 | * XXX how can we recover from this? |
500 | */ |
501 | |
502 | err3: |
503 | /* |
504 | * XXX should do segwrite here anyway? |
505 | */ |
506 | |
507 | if (vp != NULL) { |
508 | vput(vp); |
509 | vp = NULL; |
510 | --numrefed; |
511 | } |
512 | |
513 | lfs_segunlock(fs); |
514 | vfs_unbusy(mntp, false, NULL); |
515 | #ifdef DIAGNOSTIC |
516 | if (numrefed != 0) |
		panic("lfs_markv: numrefed=%d", numrefed);
518 | #endif |
519 | |
520 | return (error); |
521 | } |
522 | |
523 | /* |
524 | * sys_lfs_bmapv: |
525 | * |
526 | * This will fill in the current disk address for arrays of blocks. |
527 | * |
528 | * 0 on success |
 * -1/errno is returned on error.
530 | */ |
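/*
 * Illustrative note (not taken from lfs_cleanerd): a cleaner typically
 * calls lfs_bmapv(2) first to learn which candidate blocks are still
 * live (bi_daddr still points into the segment being cleaned), and then
 * hands only those blocks to lfs_markv(2) to be rewritten.
 */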
531 | #ifdef USE_64BIT_SYSCALLS |
532 | int |
533 | sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) |
534 | { |
535 | /* { |
536 | syscallarg(fsid_t *) fsidp; |
537 | syscallarg(struct block_info *) blkiov; |
538 | syscallarg(int) blkcnt; |
539 | } */ |
540 | BLOCK_INFO *blkiov; |
541 | int blkcnt, error; |
542 | fsid_t fsid; |
543 | struct lfs *fs; |
544 | struct mount *mntp; |
545 | |
546 | if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) |
547 | return (error); |
548 | |
549 | if ((mntp = vfs_getvfs(&fsid)) == NULL) |
550 | return (ENOENT); |
551 | fs = VFSTOULFS(mntp)->um_lfs; |
552 | |
553 | blkcnt = SCARG(uap, blkcnt); |
554 | #if SIZE_T_MAX <= UINT_MAX |
555 | if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) |
556 | return (EINVAL); |
557 | #endif |
558 | KERNEL_LOCK(1, NULL); |
559 | blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); |
560 | if ((error = copyin(SCARG(uap, blkiov), blkiov, |
561 | blkcnt * sizeof(BLOCK_INFO))) != 0) |
562 | goto out; |
563 | |
564 | if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0) |
565 | copyout(blkiov, SCARG(uap, blkiov), |
566 | blkcnt * sizeof(BLOCK_INFO)); |
567 | out: |
568 | lfs_free(fs, blkiov, LFS_NB_BLKIOV); |
569 | KERNEL_UNLOCK_ONE(NULL); |
570 | return error; |
571 | } |
572 | #else |
573 | int |
574 | sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) |
575 | { |
576 | /* { |
577 | syscallarg(fsid_t *) fsidp; |
578 | syscallarg(struct block_info *) blkiov; |
579 | syscallarg(int) blkcnt; |
580 | } */ |
581 | BLOCK_INFO *blkiov; |
582 | BLOCK_INFO_15 *blkiov15; |
583 | int i, blkcnt, error; |
584 | fsid_t fsid; |
585 | struct lfs *fs; |
586 | struct mount *mntp; |
587 | |
588 | if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) |
589 | return (error); |
590 | |
591 | if ((mntp = vfs_getvfs(&fsid)) == NULL) |
592 | return (ENOENT); |
593 | fs = VFSTOULFS(mntp)->um_lfs; |
594 | |
595 | blkcnt = SCARG(uap, blkcnt); |
596 | if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) |
597 | return (EINVAL); |
598 | KERNEL_LOCK(1, NULL); |
599 | blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); |
600 | blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); |
601 | if ((error = copyin(SCARG(uap, blkiov), blkiov15, |
602 | blkcnt * sizeof(BLOCK_INFO_15))) != 0) |
603 | goto out; |
604 | |
605 | for (i = 0; i < blkcnt; i++) { |
606 | blkiov[i].bi_inode = blkiov15[i].bi_inode; |
607 | blkiov[i].bi_lbn = blkiov15[i].bi_lbn; |
608 | blkiov[i].bi_daddr = blkiov15[i].bi_daddr; |
609 | blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; |
610 | blkiov[i].bi_version = blkiov15[i].bi_version; |
611 | blkiov[i].bi_bp = blkiov15[i].bi_bp; |
612 | blkiov[i].bi_size = blkiov15[i].bi_size; |
613 | } |
614 | |
615 | if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0) { |
616 | for (i = 0; i < blkcnt; i++) { |
617 | blkiov15[i].bi_inode = blkiov[i].bi_inode; |
618 | blkiov15[i].bi_lbn = blkiov[i].bi_lbn; |
619 | blkiov15[i].bi_daddr = blkiov[i].bi_daddr; |
620 | blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; |
621 | blkiov15[i].bi_version = blkiov[i].bi_version; |
622 | blkiov15[i].bi_bp = blkiov[i].bi_bp; |
623 | blkiov15[i].bi_size = blkiov[i].bi_size; |
624 | } |
625 | copyout(blkiov15, SCARG(uap, blkiov), |
626 | blkcnt * sizeof(BLOCK_INFO_15)); |
627 | } |
628 | out: |
629 | lfs_free(fs, blkiov, LFS_NB_BLKIOV); |
630 | lfs_free(fs, blkiov15, LFS_NB_BLKIOV); |
631 | KERNEL_UNLOCK_ONE(NULL); |
632 | return error; |
633 | } |
634 | #endif |
635 | |
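/*
 * Do the work of lfs_bmapv(2): for each BLOCK_INFO record, look up the
 * block's current disk address (and size), so the caller can tell which
 * of the blocks it found in a segment are still live.  Called from the
 * sys_lfs_bmapv() stubs above.
 */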
636 | int |
637 | lfs_bmapv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) |
638 | { |
639 | BLOCK_INFO *blkp; |
640 | IFILE *ifp; |
641 | struct buf *bp; |
642 | struct inode *ip = NULL; |
643 | struct lfs *fs; |
644 | struct mount *mntp; |
645 | struct ulfsmount *ump; |
646 | struct vnode *vp; |
647 | ino_t lastino; |
648 | daddr_t v_daddr; |
649 | int cnt, error; |
650 | int numrefed = 0; |
651 | |
652 | error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, |
653 | KAUTH_REQ_SYSTEM_LFS_BMAPV, NULL, NULL, NULL); |
654 | if (error) |
655 | return (error); |
656 | |
657 | if ((mntp = vfs_getvfs(fsidp)) == NULL) |
658 | return (ENOENT); |
659 | |
660 | if ((error = vfs_busy(mntp, NULL)) != 0) |
661 | return (error); |
662 | |
663 | ump = VFSTOULFS(mntp); |
664 | fs = ump->um_lfs; |
665 | |
666 | if (fs->lfs_cleaner_thread == NULL) |
667 | fs->lfs_cleaner_thread = curlwp; |
668 | KASSERT(fs->lfs_cleaner_thread == curlwp); |
669 | |
670 | cnt = blkcnt; |
671 | |
672 | error = 0; |
673 | |
674 | /* these were inside the initialization for the for loop */ |
675 | vp = NULL; |
676 | v_daddr = LFS_UNUSED_DADDR; |
677 | lastino = LFS_UNUSED_INUM; |
678 | for (blkp = blkiov; cnt--; ++blkp) |
679 | { |
680 | /* |
681 | * Get the IFILE entry (only once) and see if the file still |
682 | * exists. |
683 | */ |
684 | if (lastino != blkp->bi_inode) { |
685 | /* |
686 | * Finish the old file, if there was one. |
687 | */ |
688 | if (vp != NULL) { |
689 | vput(vp); |
690 | vp = NULL; |
691 | numrefed--; |
692 | } |
693 | |
694 | /* |
695 | * Start a new file |
696 | */ |
697 | lastino = blkp->bi_inode; |
698 | if (blkp->bi_inode == LFS_IFILE_INUM) |
699 | v_daddr = lfs_sb_getidaddr(fs); |
700 | else { |
701 | LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); |
702 | v_daddr = lfs_if_getdaddr(fs, ifp); |
703 | brelse(bp, 0); |
704 | } |
705 | if (v_daddr == LFS_UNUSED_DADDR) { |
706 | blkp->bi_daddr = LFS_UNUSED_DADDR; |
707 | continue; |
708 | } |
709 | error = lfs_fastvget(mntp, blkp->bi_inode, NULL, |
710 | LK_SHARED, &vp); |
711 | if (error) { |
				DLOG((DLOG_CLEAN, "lfs_bmapv: lfs_fastvget"
				    " ino %d failed with %d",
				    blkp->bi_inode, error));
715 | KASSERT(vp == NULL); |
716 | continue; |
717 | } else { |
718 | KASSERT(VOP_ISLOCKED(vp)); |
719 | numrefed++; |
720 | } |
721 | ip = VTOI(vp); |
722 | } else if (vp == NULL) { |
723 | /* |
724 | * This can only happen if the vnode is dead. |
725 | * Keep going. Note that we DO NOT set the |
			 * bi_daddr to anything -- if we failed to get
727 | * the vnode, for example, we want to assume |
728 | * conservatively that all of its blocks *are* |
729 | * located in the segment in question. |
730 | * lfs_markv will throw them out if we are |
731 | * wrong. |
732 | */ |
733 | continue; |
734 | } |
735 | |
736 | /* Past this point we are guaranteed that vp, ip are valid. */ |
737 | |
738 | if (blkp->bi_lbn == LFS_UNUSED_LBN) { |
739 | /* |
740 | * We just want the inode address, which is |
741 | * conveniently in v_daddr. |
742 | */ |
743 | blkp->bi_daddr = v_daddr; |
744 | } else { |
745 | daddr_t bi_daddr; |
746 | |
747 | error = VOP_BMAP(vp, blkp->bi_lbn, NULL, |
748 | &bi_daddr, NULL); |
749 | if (error) |
750 | { |
751 | blkp->bi_daddr = LFS_UNUSED_DADDR; |
752 | continue; |
753 | } |
754 | blkp->bi_daddr = LFS_DBTOFSB(fs, bi_daddr); |
755 | /* Fill in the block size, too */ |
756 | if (blkp->bi_lbn >= 0) |
757 | blkp->bi_size = lfs_blksize(fs, ip, blkp->bi_lbn); |
758 | else |
759 | blkp->bi_size = lfs_sb_getbsize(fs); |
760 | } |
761 | } |
762 | |
763 | /* |
764 | * Finish the old file, if there was one. |
765 | */ |
766 | if (vp != NULL) { |
767 | vput(vp); |
768 | vp = NULL; |
769 | numrefed--; |
770 | } |
771 | |
772 | #ifdef DIAGNOSTIC |
773 | if (numrefed != 0) |
		panic("lfs_bmapv: numrefed=%d", numrefed);
775 | #endif |
776 | |
777 | vfs_unbusy(mntp, false, NULL); |
778 | |
779 | return 0; |
780 | } |
781 | |
782 | /* |
783 | * sys_lfs_segclean: |
784 | * |
785 | * Mark the segment clean. |
786 | * |
787 | * 0 on success |
 * -1/errno is returned on error.
789 | */ |
790 | int |
791 | sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval) |
792 | { |
793 | /* { |
794 | syscallarg(fsid_t *) fsidp; |
795 | syscallarg(u_long) segment; |
796 | } */ |
797 | struct lfs *fs; |
798 | struct mount *mntp; |
799 | fsid_t fsid; |
800 | int error; |
801 | unsigned long segnum; |
802 | |
803 | error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, |
804 | KAUTH_REQ_SYSTEM_LFS_SEGCLEAN, NULL, NULL, NULL); |
805 | if (error) |
806 | return (error); |
807 | |
808 | if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) |
809 | return (error); |
810 | if ((mntp = vfs_getvfs(&fsid)) == NULL) |
811 | return (ENOENT); |
812 | |
813 | fs = VFSTOULFS(mntp)->um_lfs; |
814 | segnum = SCARG(uap, segment); |
815 | |
816 | if ((error = vfs_busy(mntp, NULL)) != 0) |
817 | return (error); |
818 | |
819 | KERNEL_LOCK(1, NULL); |
820 | lfs_seglock(fs, SEGM_PROT); |
821 | error = lfs_do_segclean(fs, segnum); |
822 | lfs_segunlock(fs); |
823 | KERNEL_UNLOCK_ONE(NULL); |
824 | vfs_unbusy(mntp, false, NULL); |
825 | return error; |
826 | } |
827 | |
828 | /* |
829 | * Actually mark the segment clean. |
830 | * Must be called with the segment lock held. |
831 | */ |
832 | int |
833 | lfs_do_segclean(struct lfs *fs, unsigned long segnum) |
834 | { |
835 | extern int lfs_dostats; |
836 | struct buf *bp; |
837 | CLEANERINFO *cip; |
838 | SEGUSE *sup; |
839 | |
840 | if (lfs_dtosn(fs, lfs_sb_getcurseg(fs)) == segnum) { |
841 | return (EBUSY); |
842 | } |
843 | |
844 | LFS_SEGENTRY(sup, fs, segnum, bp); |
845 | if (sup->su_nbytes) { |
846 | DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" |
		      " %d live bytes\n", segnum, sup->su_nbytes));
848 | brelse(bp, 0); |
849 | return (EBUSY); |
850 | } |
851 | if (sup->su_flags & SEGUSE_ACTIVE) { |
852 | DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" |
		      " segment is active\n", segnum));
854 | brelse(bp, 0); |
855 | return (EBUSY); |
856 | } |
857 | if (!(sup->su_flags & SEGUSE_DIRTY)) { |
858 | DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" |
		      " segment is already clean\n", segnum));
860 | brelse(bp, 0); |
861 | return (EALREADY); |
862 | } |
863 | |
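	/*
	 * Return the segment's space to the available count, less any
	 * superblock or disklabel padding that still occupies part of it.
	 */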
864 | lfs_sb_addavail(fs, lfs_segtod(fs, 1)); |
865 | if (sup->su_flags & SEGUSE_SUPERBLOCK) |
866 | lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_SBPAD)); |
867 | if (lfs_sb_getversion(fs) > 1 && segnum == 0 && |
868 | lfs_sb_gets0addr(fs) < lfs_btofsb(fs, LFS_LABELPAD)) |
869 | lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_LABELPAD) - lfs_sb_gets0addr(fs)); |
870 | mutex_enter(&lfs_lock); |
871 | lfs_sb_addbfree(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) + |
872 | lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs))); |
873 | lfs_sb_subdmeta(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) + |
874 | lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs))); |
875 | if (lfs_sb_getdmeta(fs) < 0) |
876 | lfs_sb_setdmeta(fs, 0); |
877 | mutex_exit(&lfs_lock); |
878 | sup->su_flags &= ~SEGUSE_DIRTY; |
879 | LFS_WRITESEGENTRY(sup, fs, segnum, bp); |
880 | |
881 | LFS_CLEANERINFO(cip, fs, bp); |
882 | lfs_ci_shiftdirtytoclean(fs, cip, 1); |
883 | lfs_sb_setnclean(fs, lfs_ci_getclean(fs, cip)); |
884 | mutex_enter(&lfs_lock); |
885 | lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs)); |
886 | lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs) |
887 | - fs->lfs_ravail - fs->lfs_favail); |
888 | wakeup(&fs->lfs_availsleep); |
889 | mutex_exit(&lfs_lock); |
890 | (void) LFS_BWRITE_LOG(bp); |
891 | |
892 | if (lfs_dostats) |
893 | ++lfs_stats.segs_reclaimed; |
894 | |
895 | return (0); |
896 | } |
897 | |
898 | /* |
 * This will block until a segment in file system fsid is written.  A timeout
 * may be specified, after which the caller is woken up automatically.  An
 * fsid of -1 means any file system, and a timeout of 0 means forever.
902 | */ |
903 | int |
904 | lfs_segwait(fsid_t *fsidp, struct timeval *tv) |
905 | { |
906 | struct mount *mntp; |
907 | void *addr; |
908 | u_long timeout; |
909 | int error; |
910 | |
911 | KERNEL_LOCK(1, NULL); |
912 | if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL) |
913 | addr = &lfs_allclean_wakeup; |
914 | else |
915 | addr = &VFSTOULFS(mntp)->um_lfs->lfs_nextsegsleep; |
916 | /* |
917 | * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}! |
918 | * XXX IS THAT WHAT IS INTENDED? |
919 | */ |
920 | timeout = tvtohz(tv); |
	error = tsleep(addr, PCATCH | PVFS, "segment", timeout);
922 | KERNEL_UNLOCK_ONE(NULL); |
923 | return (error == ERESTART ? EINTR : 0); |
924 | } |
925 | |
926 | /* |
927 | * sys_lfs_segwait: |
928 | * |
929 | * System call wrapper around lfs_segwait(). |
930 | * |
931 | * 0 on success |
932 | * 1 on timeout |
 * -1/errno is returned on error.
934 | */ |
935 | int |
936 | sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap, |
937 | register_t *retval) |
938 | { |
939 | /* { |
940 | syscallarg(fsid_t *) fsidp; |
941 | syscallarg(struct timeval *) tv; |
942 | } */ |
943 | struct timeval atv; |
944 | fsid_t fsid; |
945 | int error; |
946 | |
947 | /* XXX need we be su to segwait? */ |
948 | error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, |
949 | KAUTH_REQ_SYSTEM_LFS_SEGWAIT, NULL, NULL, NULL); |
950 | if (error) |
951 | return (error); |
952 | if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) |
953 | return (error); |
954 | |
955 | if (SCARG(uap, tv)) { |
956 | error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval)); |
957 | if (error) |
958 | return (error); |
959 | if (itimerfix(&atv)) |
960 | return (EINVAL); |
961 | } else /* NULL or invalid */ |
962 | atv.tv_sec = atv.tv_usec = 0; |
963 | return lfs_segwait(&fsid, &atv); |
964 | } |
965 | |
966 | /* |
967 | * VFS_VGET call specialized for the cleaner. If the cleaner is |
968 | * processing IINFO structures, it may have the ondisk inode already, so |
969 | * don't go retrieving it again. |
970 | * |
971 | * Return the vnode referenced and locked. |
972 | */ |
973 | |
974 | static int |
975 | lfs_fastvget(struct mount *mp, ino_t ino, BLOCK_INFO *blkp, int lk_flags, |
976 | struct vnode **vpp) |
977 | { |
978 | struct ulfsmount *ump; |
979 | struct lfs *fs; |
980 | int error; |
981 | |
982 | ump = VFSTOULFS(mp); |
983 | fs = ump->um_lfs; |
984 | fs->lfs_cleaner_hint = blkp; |
985 | error = vcache_get(mp, &ino, sizeof(ino), vpp); |
986 | fs->lfs_cleaner_hint = NULL; |
987 | if (error) |
988 | return error; |
989 | error = vn_lock(*vpp, lk_flags); |
990 | if (error) { |
991 | if (error == EBUSY) |
992 | error = EAGAIN; |
993 | vrele(*vpp); |
994 | *vpp = NULL; |
995 | return error; |
996 | } |
997 | |
998 | return 0; |
999 | } |
1000 | |
1001 | /* |
 * Make up a "fake" cleaner buffer and copy the data from userland into it.
1003 | */ |
1004 | static struct buf * |
1005 | lfs_fakebuf(struct lfs *fs, struct vnode *vp, daddr_t lbn, size_t size, void *uaddr) |
1006 | { |
1007 | struct buf *bp; |
1008 | int error; |
1009 | |
1010 | KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM); |
1011 | |
1012 | bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN); |
1013 | error = copyin(uaddr, bp->b_data, size); |
1014 | if (error) { |
1015 | lfs_freebuf(fs, bp); |
1016 | return NULL; |
1017 | } |
1018 | KDASSERT(bp->b_iodone == lfs_callback); |
1019 | |
1020 | #if 0 |
1021 | mutex_enter(&lfs_lock); |
1022 | ++fs->lfs_iocount; |
1023 | mutex_exit(&lfs_lock); |
1024 | #endif |
1025 | bp->b_bufsize = size; |
1026 | bp->b_bcount = size; |
1027 | return (bp); |
1028 | } |
1029 | |