/*	$NetBSD: vfs_lockf.c,v 1.73 2011/01/31 08:25:32 dholland Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.4 (Berkeley) 10/26/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.73 2011/01/31 08:25:32 dholland Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/pool.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>

/*
 * The lockf structure is a kernel structure which contains the information
 * associated with a byte range lock.  The lockf structures are linked into
 * the vnode structure.  Locks are sorted by the starting byte of the lock
 * for efficiency.
 *
 * lf_next is used for two purposes, depending on whether the lock is
 * held or is pending on a conflicting lock.  If the lock is held, it
 * points to the next lock on the same vnode.  For pending locks, if
 * lock->lf_next is non-NULL, then lock->lf_block must be queued on
 * the lf_blkhd TAILQ of lock->lf_next.
 */
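
/*
 * For example, with one exclusive lock held and two pending requests:
 *
 *	*head -> lockA (held; lf_blkhd = { lockB, lockC })
 *	lockB->lf_next == lockA		(pending, queued on lockA's lf_blkhd)
 *	lockC->lf_next == lockA		(pending, queued on lockA's lf_blkhd)
 */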

TAILQ_HEAD(locklist, lockf);

struct lockf {
	kcondvar_t lf_cv;	 /* Signalling */
	short	lf_flags;	 /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
	short	lf_type;	 /* Lock type: F_RDLCK, F_WRLCK */
	off_t	lf_start;	 /* The byte # of the start of the lock */
	off_t	lf_end;		 /* The byte # of the end of the lock (-1=EOF) */
	void	*lf_id;		 /* process or file description holding lock */
	struct	lockf **lf_head; /* Back pointer to the head of lockf list */
	struct	lockf *lf_next;	 /* Next lock on this vnode, or blocking lock */
	struct	locklist lf_blkhd; /* List of requests blocked on this lock */
	TAILQ_ENTRY(lockf) lf_block; /* A request waiting for a lock */
	uid_t	lf_uid;		 /* User ID responsible */
};

/* Maximum length of sleep chains to traverse to try and detect deadlock. */
#define MAXDEPTH 50

static pool_cache_t lockf_cache;
static kmutex_t *lockf_lock;
static char lockstr[] = "lockf";

/*
 * This variable controls the maximum number of processes that will
 * be checked in doing deadlock detection.
 */
int maxlockdepth = MAXDEPTH;

#ifdef LOCKF_DEBUG
int lockf_debug = 0;
#endif

#define SELF	0x1
#define OTHERS	0x2

/*
 * XXX TODO
 * Misc cleanups: "void *id" should be visible in the API as a
 * "struct proc *".
 * (This requires rototilling all VFS's which support advisory locking).
 */

/*
 * If there's a lot of lock contention on a single vnode, locking
 * schemes which allow for more parallelism would be needed.  Given how
 * infrequently byte-range locks are actually used in typical BSD
 * code, a more complex approach probably isn't worth it.
 */

/*
 * We enforce a limit on locks by uid, so that a single user cannot
 * run the kernel out of memory.  For now, the limit is pretty coarse.
 * There is no limit on root.
 *
 * Splitting a lock will always succeed, regardless of current allocations.
 * If you're slightly above the limit, we still have to permit an allocation
 * so that the unlock can succeed.  If the unlocking causes too many splits,
 * however, you're cut off entirely.
 */
int maxlocksperuid = 1024;

#ifdef LOCKF_DEBUG
/*
 * Print out a lock.
 */
static void
lf_print(const char *tag, struct lockf *lock)
{

	printf("%s: lock %p for ", tag, lock);
	if (lock->lf_flags & F_POSIX)
		printf("proc %d", ((struct proc *)lock->lf_id)->p_pid);
	else
		printf("file %p", (struct file *)lock->lf_id);
	printf(" %s, start %jd, end %jd",
	    lock->lf_type == F_RDLCK ? "shared" :
	    lock->lf_type == F_WRLCK ? "exclusive" :
	    lock->lf_type == F_UNLCK ? "unlock" :
	    "unknown", (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
	if (TAILQ_FIRST(&lock->lf_blkhd))
		printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd));
	else
		printf("\n");
}

static void
lf_printlist(const char *tag, struct lockf *lock)
{
	struct lockf *lf, *blk;

	printf("%s: Lock list:\n", tag);
	for (lf = *lock->lf_head; lf; lf = lf->lf_next) {
		printf("\tlock %p for ", lf);
		if (lf->lf_flags & F_POSIX)
			printf("proc %d", ((struct proc *)lf->lf_id)->p_pid);
		else
			printf("file %p", (struct file *)lf->lf_id);
		printf(", %s, start %jd, end %jd",
		    lf->lf_type == F_RDLCK ? "shared" :
		    lf->lf_type == F_WRLCK ? "exclusive" :
		    lf->lf_type == F_UNLCK ? "unlock" :
		    "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
			if (blk->lf_flags & F_POSIX)
				printf("; proc %d",
				    ((struct proc *)blk->lf_id)->p_pid);
			else
				printf("; file %p", (struct file *)blk->lf_id);
			printf(", %s, start %jd, end %jd",
			    blk->lf_type == F_RDLCK ? "shared" :
			    blk->lf_type == F_WRLCK ? "exclusive" :
			    blk->lf_type == F_UNLCK ? "unlock" :
			    "unknown", (intmax_t)blk->lf_start,
			    (intmax_t)blk->lf_end);
			if (TAILQ_FIRST(&blk->lf_blkhd))
				panic("lf_printlist: bad list");
		}
		printf("\n");
	}
}
#endif /* LOCKF_DEBUG */

/*
 * Three options for allowfail:
 * 0 - always allocate.  1 - cut off at limit.  2 - cut off at double limit.
 */
static struct lockf *
lf_alloc(int allowfail)
{
	struct uidinfo *uip;
	struct lockf *lock;
	u_long lcnt;
	const uid_t uid = kauth_cred_geteuid(kauth_cred_get());

	uip = uid_find(uid);
	lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt);
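	/*
	 * Root (uid 0) is never limited, and allowfail == 0 callers must
	 * always succeed; otherwise enforce the per-uid limit, doubled
	 * for allowfail == 2.
	 */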
	if (uid && allowfail && lcnt >
	    (allowfail == 1 ? maxlocksperuid : (maxlocksperuid * 2))) {
		atomic_dec_ulong(&uip->ui_lockcnt);
		return NULL;
	}

	lock = pool_cache_get(lockf_cache, PR_WAITOK);
	lock->lf_uid = uid;
	return lock;
}

static void
lf_free(struct lockf *lock)
{
	struct uidinfo *uip;

	uip = uid_find(lock->lf_uid);
	atomic_dec_ulong(&uip->ui_lockcnt);
	pool_cache_put(lockf_cache, lock);
}

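/*
 * Pool cache constructor/destructor: keep each cached lockf's condvar
 * initialized for the lifetime of the cached object, instead of setting
 * it up and tearing it down on every lf_alloc()/lf_free().
 */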
static int
lf_ctor(void *arg, void *obj, int flag)
{
	struct lockf *lock;

	lock = obj;
	cv_init(&lock->lf_cv, lockstr);

	return 0;
}

static void
lf_dtor(void *arg, void *obj)
{
	struct lockf *lock;

	lock = obj;
	cv_destroy(&lock->lf_cv);
}

/*
 * Walk the list of locks for an inode to
 * find an overlapping lock (if any).
 *
 * NOTE: this returns only the FIRST overlapping lock.  There
 *	 may be more than one.
 */
static int
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
    struct lockf ***prev, struct lockf **overlap)
{
	off_t start, end;

	*overlap = lf;
	if (lf == NULL)
		return 0;
#ifdef LOCKF_DEBUG
	if (lockf_debug & 2)
		lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUG */
	start = lock->lf_start;
	end = lock->lf_end;
	while (lf != NULL) {
		if (((type == SELF) && lf->lf_id != lock->lf_id) ||
		    ((type == OTHERS) && lf->lf_id == lock->lf_id)) {
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
#ifdef LOCKF_DEBUG
		if (lockf_debug & 2)
			lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUG */
		/*
		 * OK, check for overlap
		 *
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
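		/*
		 * Pictorially, with the requested lock shown on top:
		 *
		 *	lock:        |=======|
		 *	case 1:      |-------|
		 *	case 2:    |-----------|
		 *	case 3:       |-----|
		 *	case 4:   |------|
		 *	case 5:          |---------|
		 */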
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
			/* Case 0 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("no overlap\n");
#endif /* LOCKF_DEBUG */
			if ((type & SELF) && end != -1 && lf->lf_start > end)
				return 0;
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
		if ((lf->lf_start == start) && (lf->lf_end == end)) {
			/* Case 1 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap == lock\n");
#endif /* LOCKF_DEBUG */
			return 1;
		}
		if ((lf->lf_start <= start) &&
		    (end != -1) &&
		    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
			/* Case 2 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap contains lock\n");
#endif /* LOCKF_DEBUG */
			return 2;
		}
		if (start <= lf->lf_start &&
		    (end == -1 ||
		    (lf->lf_end != -1 && end >= lf->lf_end))) {
			/* Case 3 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("lock contains overlap\n");
#endif /* LOCKF_DEBUG */
			return 3;
		}
		if ((lf->lf_start < start) &&
		    ((lf->lf_end >= start) || (lf->lf_end == -1))) {
			/* Case 4 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap starts before lock\n");
#endif /* LOCKF_DEBUG */
			return 4;
		}
		if ((lf->lf_start > start) &&
		    (end != -1) &&
		    ((lf->lf_end > end) || (lf->lf_end == -1))) {
			/* Case 5 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap ends after lock\n");
#endif /* LOCKF_DEBUG */
			return 5;
		}
		panic("lf_findoverlap: default");
	}
	return 0;
}

/*
 * Split a lock and a contained region into
 * two or three locks as necessary.
 */
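/*
 * For example, carving lock2 out of the middle of lock1 produces a third
 * piece (splitlock, taken from *sparelock):
 *
 *	before:	lock1 |---------------------------|
 *	after:	lock1 |-------| lock2 |---| splitlock |-------|
 */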
static void
lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 2) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUG */
	/*
	 * Check to see if splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		lock1->lf_start = lock2->lf_end + 1;
		lock2->lf_next = lock1;
		return;
	}
	if (lock1->lf_end == lock2->lf_end) {
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return;
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	splitlock = *sparelock;
	*sparelock = NULL;
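	/*
	 * memcpy() would clobber the condvar's internal state, so tear
	 * it down before the copy and re-initialize it afterwards.
	 */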
	cv_destroy(&splitlock->lf_cv);
	memcpy(splitlock, lock1, sizeof(*splitlock));
	cv_init(&splitlock->lf_cv, lockstr);

	splitlock->lf_start = lock2->lf_end + 1;
	TAILQ_INIT(&splitlock->lf_blkhd);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;
}

/*
 * Wake up all requests blocked on the given lock.
 */
static void
lf_wakelock(struct lockf *listhead)
{
	struct lockf *wakelock;

	while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) {
		KASSERT(wakelock->lf_next == listhead);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
		wakelock->lf_next = NULL;
#ifdef LOCKF_DEBUG
		if (lockf_debug & 2)
			lf_print("lf_wakelock: awakening", wakelock);
#endif
		cv_broadcast(&wakelock->lf_cv);
	}
}

/*
 * Remove a byte-range lock on an inode.
 *
 * Generally, find the lock (or an overlap to that lock)
 * and remove it (or shrink it), then wake up anyone we can.
 */
static int
lf_clearlock(struct lockf *unlock, struct lockf **sparelock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	int ovcase;

	if (lf == NULL)
		return 0;
#ifdef LOCKF_DEBUG
	if (unlock->lf_type != F_UNLCK)
		panic("lf_clearlock: bad type");
	if (lockf_debug & 1)
		lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
	prev = head;
	while ((ovcase = lf_findoverlap(lf, unlock, SELF,
	    &prev, &overlap)) != 0) {
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap);

		switch (ovcase) {

		case 1: /* overlap == lock */
			*prev = overlap->lf_next;
			lf_free(overlap);
			break;

		case 2: /* overlap contains lock: split it */
			if (overlap->lf_start == unlock->lf_start) {
				overlap->lf_start = unlock->lf_end + 1;
				break;
			}
			lf_split(overlap, unlock, sparelock);
			overlap->lf_next = unlock->lf_next;
			break;

		case 3: /* lock contains overlap */
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			lf_free(overlap);
			continue;

		case 4: /* overlap starts before lock */
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case 5: /* overlap ends after lock */
			overlap->lf_start = unlock->lf_end + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
	return 0;
}

/*
 * Walk the list of locks for an inode and
 * return the first blocking lock.
 */
static struct lockf *
lf_getblock(struct lockf *lock)
{
	struct lockf **prev, *overlap, *lf = *(lock->lf_head);

	prev = lock->lf_head;
	while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) {
		/*
		 * We've found an overlap; it blocks us if either lock
		 * is exclusive (two shared locks can coexist).
		 */
		if (lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)
			return overlap;
		/*
		 * Nope, point to the next one on the list and
		 * see if it blocks us
		 */
		lf = overlap->lf_next;
	}
	return NULL;
}

/*
 * Set a byte-range lock.
 */
static int
lf_setlock(struct lockf *lock, struct lockf **sparelock,
    kmutex_t *interlock)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap, *ltmp;
	int ovcase, needtolink, error;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_setlock", lock);
#endif /* LOCKF_DEBUG */

	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock)) != NULL) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			lf_free(lock);
			return EAGAIN;
		}
		/*
		 * We are blocked.  Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 * For byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us.  MAXDEPTH is set just to make sure we
		 * do not go off into never-never land.
		 */
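		/*
		 * Concretely: a process sleeping in lf_setlock() has its
		 * LWP's l_wchan pointing at its own pending lockf; that
		 * lock's lf_next is the lock blocking it, and the blocking
		 * lock's lf_id names the next process to examine.
		 */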
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			struct lwp *wlwp;
			volatile const struct lockf *waitblock;
			int i = 0;
			struct proc *p;

			p = (struct proc *)block->lf_id;
			KASSERT(p != NULL);
			while (i++ < maxlockdepth) {
				mutex_enter(p->p_lock);
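				/*
				 * Only single-LWP processes can be
				 * analysed: with more than one LWP there
				 * is no unique wait channel to follow.
				 */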
				if (p->p_nlwps > 1) {
					mutex_exit(p->p_lock);
					break;
				}
				wlwp = LIST_FIRST(&p->p_lwps);
				lwp_lock(wlwp);
				if (wlwp->l_wchan == NULL ||
				    wlwp->l_wmesg != lockstr) {
					lwp_unlock(wlwp);
					mutex_exit(p->p_lock);
					break;
				}
				waitblock = wlwp->l_wchan;
				lwp_unlock(wlwp);
				mutex_exit(p->p_lock);
				/* Get the owner of the blocking lock */
				waitblock = waitblock->lf_next;
				if ((waitblock->lf_flags & F_POSIX) == 0)
					break;
				p = (struct proc *)waitblock->lf_id;
				if (p == curproc) {
					lf_free(lock);
					return EDEADLK;
				}
			}
			/*
			 * If we're still following a dependency chain
			 * after maxlockdepth iterations, assume we're in
			 * a cycle to be safe.
			 */
			if (i >= maxlockdepth) {
				lf_free(lock);
				return EDEADLK;
			}
		}
		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			(void) lf_clearlock(lock, NULL);
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
#ifdef LOCKF_DEBUG
		if (lockf_debug & 1) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock", block);
		}
#endif /* LOCKF_DEBUG */
		error = cv_wait_sig(&lock->lf_cv, interlock);

		/*
		 * We may have been awoken by a signal (in
		 * which case we must remove ourselves from the
		 * blocked list) and/or by another process
		 * releasing a lock (in which case we have already
		 * been removed from the blocked list and our
		 * lf_next field set to NULL).
		 */
		if (lock->lf_next != NULL) {
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
			lock->lf_next = NULL;
		}
		if (error) {
			lf_free(lock);
			return error;
		}
	}
	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase)
			block = overlap->lf_next;
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case 0: /* no overlap */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case 1: /* overlap == lock */
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK)
				lf_wakelock(overlap);
			overlap->lf_type = lock->lf_type;
			lf_free(lock);
			lock = overlap; /* for debug output below */
			break;

		case 2: /* overlap contains lock */
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				lf_free(lock);
				lock = overlap; /* for debug output below */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				overlap->lf_start = lock->lf_end + 1;
			} else
				lf_split(overlap, lock, sparelock);
			lf_wakelock(overlap);
			break;

		case 3: /* lock contains overlap */
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap);
			} else {
				while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) {
					KASSERT(ltmp->lf_next == overlap);
					TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
					    lf_block);
					ltmp->lf_next = lock;
					TAILQ_INSERT_TAIL(&lock->lf_blkhd,
					    ltmp, lf_block);
				}
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else
				*prev = overlap->lf_next;
			lf_free(overlap);
			continue;

		case 4: /* overlap starts before lock */
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap);
			needtolink = 0;
			continue;

		case 5: /* overlap ends after lock */
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			overlap->lf_start = lock->lf_end + 1;
			lf_wakelock(overlap);
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 1) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock", lock);
	}
#endif /* LOCKF_DEBUG */
	return 0;
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct lockf *lock, struct flock *fl)
{
	struct lockf *block;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUG */

	if ((block = lf_getblock(lock)) != NULL) {
		fl->l_type = block->lf_type;
		fl->l_whence = SEEK_SET;
		fl->l_start = block->lf_start;
		if (block->lf_end == -1)
			fl->l_len = 0;
		else
			fl->l_len = block->lf_end - block->lf_start + 1;
		if (block->lf_flags & F_POSIX)
			fl->l_pid = ((struct proc *)block->lf_id)->p_pid;
		else
			fl->l_pid = -1;
	} else {
		fl->l_type = F_UNLCK;
	}
	return 0;
}

/*
 * Do an advisory lock operation.  This is the common implementation
 * behind VOP_ADVLOCK, reached from fcntl(2) F_GETLK/F_SETLK/F_SETLKW
 * and from flock(2).
 */
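/*
 * Userland view (hypothetical descriptor fd; the fcntl(2) layer supplies
 * the F_POSIX/F_WAIT flags and the owner id):
 *
 *	struct flock fl;
 *	fl.l_type = F_WRLCK;		->	lf_type
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;			->	lf_start = 0
 *	fl.l_len = 0;			->	lf_end = -1 (to EOF)
 *	fcntl(fd, F_SETLKW, &fl);	->	lf_setlock() with F_WAIT
 */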
int
lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size)
{
	struct flock *fl = ap->a_fl;
	struct lockf *lock = NULL;
	struct lockf *sparelock;
	kmutex_t *interlock = lockf_lock;
	off_t start, end;
	int error = 0;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return EINVAL;
	}

	if (fl->l_len == 0)
		end = -1;
	else {
		if (fl->l_len > 0)
			end = start + fl->l_len - 1;
		else {
			/* lockf() allows negative lengths */
			end = start - 1;
			start += fl->l_len;
		}
	}
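	/*
	 * e.g. l_start = 100, l_len = -10 locks bytes [90, 99];
	 * l_len == 0 locks from l_start to the end of the file.
	 */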
	if (start < 0)
		return EINVAL;

	/*
	 * Allocate locks before acquiring the interlock.  We need two
	 * locks in the worst case.
	 */
	switch (ap->a_op) {
	case F_SETLK:
	case F_UNLCK:
		/*
		 * XXX For F_UNLCK case, we can re-use the lock.
		 */
		if ((ap->a_flags & F_FLOCK) == 0) {
			/*
			 * Byte-range lock might need one more lock.
			 */
			sparelock = lf_alloc(0);
			if (sparelock == NULL) {
				error = ENOMEM;
				goto quit;
			}
			break;
		}
		/* FALLTHROUGH */

	case F_GETLK:
		sparelock = NULL;
		break;

	default:
		return EINVAL;
	}

	switch (ap->a_op) {
	case F_SETLK:
		lock = lf_alloc(1);
		break;
	case F_UNLCK:
		if (start == 0 || end == -1) {
			/*
			 * Never splits: lf_clearlock() only splits
			 * (case 2) when the unlock range lies strictly
			 * inside a held lock, which requires a nonzero
			 * start and a bounded end.
			 */
			lock = lf_alloc(0);
		} else {
			/* might split */
			lock = lf_alloc(2);
		}
		break;
	case F_GETLK:
		lock = lf_alloc(0);
		break;
	}
	if (lock == NULL) {
		error = ENOMEM;
		goto quit;
	}

	mutex_enter(interlock);

	/*
	 * Avoid the common case of unlocking when the inode has no locks;
	 * only F_SETLK needs to proceed and install the first lock.
	 */
	if (*head == NULL) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			error = 0;
			goto quit_unlock;
		}
	}

	/*
	 * Create the lockf structure.
	 */
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_head = head;
	lock->lf_type = fl->l_type;
	lock->lf_next = NULL;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = ap->a_flags;
	if (lock->lf_flags & F_POSIX) {
		KASSERT(curproc == (struct proc *)ap->a_id);
	}
	lock->lf_id = ap->a_id;

	/*
	 * Do the requested operation.
	 */
	switch (ap->a_op) {

	case F_SETLK:
		error = lf_setlock(lock, &sparelock, interlock);
		lock = NULL; /* lf_setlock linked it in or freed it */
		break;

	case F_UNLCK:
		error = lf_clearlock(lock, &sparelock);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl);
		break;

	default:
		break;
		/* NOTREACHED */
	}

quit_unlock:
	mutex_exit(interlock);
quit:
	if (lock)
		lf_free(lock);
	if (sparelock)
		lf_free(sparelock);

	return error;
}

/*
 * Initialize subsystem.  XXX We use a global lock.  This could be the
 * vnode interlock, but the deadlock detection code may need to inspect
 * locks belonging to other files.
 */
void
lf_init(void)
{

	lockf_cache = pool_cache_init(sizeof(struct lockf), 0, 0, 0, "lockf",
	    NULL, IPL_NONE, lf_ctor, lf_dtor, NULL);
	lockf_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}