/* $NetBSD: uvm_emap.c,v 1.11 2014/11/27 14:25:01 uebayasi Exp $ */

/*-
 * Copyright (c) 2009, 2010 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Mindaugas Rasiukevicius and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * UVM ephemeral mapping interface.
 */

/*
 * Overview:
 *
 * On multiprocessor systems, frequent use of pmap_kenter_pa/pmap_kremove
 * for ephemeral mappings is undesirable because it is likely to trigger
 * TLB flush IPIs, since pmap_kernel() is shared among all LWPs.  This
 * interface can be used instead, to reduce the number of IPIs.
 *
 * For a single-page mapping, the direct map (PMAP_DIRECT_MAP, available
 * when __HAVE_DIRECT_MAP is defined) is likely a better choice.
 */
47
48/*
49 * How to use:
50 *
51 * Map pages at the address:
52 *
53 * uvm_emap_enter(va, pgs, npages);
54 * gen = uvm_emap_produce();
55 *
56 * Read pages via the mapping:
57 *
58 * uvm_emap_consume(gen);
59 * some_access(va);
60 *
61 * After finishing using the mapping:
62 *
63 * uvm_emap_remove(va, len);
64 */
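
/*
 * As a slightly fuller sketch (illustration only: pgs, npages, buf and
 * len are assumed to come from the caller, and the VA window is assumed
 * to have been obtained from uvm_emap_alloc() where that pool is enabled),
 * a read-only consumer of a set of pages would follow roughly this shape:
 *
 *	vaddr_t va = uvm_emap_alloc(ptoa(npages), true);
 *	u_int gen;
 *
 *	uvm_emap_enter(va, pgs, npages);
 *	gen = uvm_emap_produce();
 *	...
 *	uvm_emap_consume(gen);
 *	memcpy(buf, (void *)va, len);		(any read-only access)
 *	...
 *	uvm_emap_remove(va, ptoa(npages));
 *	uvm_emap_free(va, ptoa(npages));
 */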

/*
 * Notes for pmap developers:
 *
 * Generic (more expensive) stubs are implemented for architectures which
 * do not support emap (__HAVE_PMAP_EMAP).
 *
 * Note that uvm_emap_update() is called from the lower pmap(9) layer,
 * while the other functions call into pmap(9).  A typical update pattern
 * in a pmap is:
 *
 *	u_int gen = uvm_emap_gen_return();
 *	tlbflush();
 *	uvm_emap_update(gen);
 *
 * This pattern also runs from IPI context, therefore these functions
 * must be IPI-safe.
 */
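
/*
 * As an illustration of that pattern (a sketch only: the handler name
 * pmap_tlb_shootdown_ipi() and the flush primitive tlbflushg() are
 * placeholders for whatever the MD pmap actually uses), a pmap that
 * already broadcasts TLB shootdown IPIs could fold the emap sync into
 * its IPI handler:
 *
 *	void
 *	pmap_tlb_shootdown_ipi(void)
 *	{
 *		u_int gen;
 *
 *		gen = uvm_emap_gen_return();	(sample before the flush)
 *		tlbflushg();			(the MD TLB flush itself)
 *		uvm_emap_update(gen);		(publish for this CPU)
 *	}
 */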

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_emap.c,v 1.11 2014/11/27 14:25:01 uebayasi Exp $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lwp.h>
#include <sys/vmem.h>
#include <sys/types.h>

#include <uvm/uvm.h>
#include <uvm/uvm_extern.h>

/* XXX: Arbitrary. */
#ifdef _LP64
#define UVM_EMAP_SIZE	(128 * 1024 * 1024)	/* 128 MB */
#else
#define UVM_EMAP_SIZE	(32 * 1024 * 1024)	/* 32 MB */
#endif

/* Global emap generation counter, kept in its own cache line. */
static u_int		_uvm_emap_gen[COHERENCY_UNIT / sizeof(u_int)]
    __aligned(COHERENCY_UNIT);

#define uvm_emap_gen	(_uvm_emap_gen[0])

u_int		uvm_emap_size = UVM_EMAP_SIZE;
static vaddr_t	uvm_emap_va;
static vmem_t *	uvm_emap_vmem;

/*
 * uvm_emap_sysinit: initialize the subsystem.
 */
void
uvm_emap_sysinit(void)
{
	struct uvm_cpu *ucpu;
	/* size_t qmax; */
	u_int i;

	uvm_emap_size = roundup(uvm_emap_size, PAGE_SIZE);
#if 0
	qmax = 16 * PAGE_SIZE;
	uvm_emap_va = uvm_km_alloc(kernel_map, uvm_emap_size, 0,
	    UVM_KMF_VAONLY | UVM_KMF_WAITVA);
	if (uvm_emap_va == 0) {
		panic("uvm_emap_init: KVA allocation failed");
	}

	uvm_emap_vmem = vmem_create("emap", uvm_emap_va, uvm_emap_size,
	    PAGE_SIZE, NULL, NULL, NULL, qmax, VM_SLEEP, IPL_NONE);
	if (uvm_emap_vmem == NULL) {
		panic("uvm_emap_init: vmem creation failed");
	}
#else
	uvm_emap_va = 0;
	uvm_emap_vmem = NULL;
#endif
	/* Initial generation value is 1. */
	uvm_emap_gen = 1;
	for (i = 0; i < maxcpus; i++) {
		ucpu = uvm.cpus[i];
		if (ucpu != NULL) {
			ucpu->emap_gen = 1;
		}
	}
}

/*
 * uvm_emap_alloc: allocate a window.
 */
vaddr_t
uvm_emap_alloc(vsize_t size, bool waitok)
{
	vmem_addr_t addr;

	KASSERT(size > 0);
	KASSERT(round_page(size) == size);

	if (vmem_alloc(uvm_emap_vmem, size,
	    VM_INSTANTFIT | (waitok ? VM_SLEEP : VM_NOSLEEP), &addr) == 0)
		return (vaddr_t)addr;

	return (vaddr_t)0;
}

/*
 * uvm_emap_free: free a window.
 */
void
uvm_emap_free(vaddr_t va, size_t size)
{

	KASSERT(va >= uvm_emap_va);
	KASSERT(size <= uvm_emap_size);
	KASSERT(va + size <= uvm_emap_va + uvm_emap_size);

	vmem_free(uvm_emap_vmem, va, size);
}

#ifdef __HAVE_PMAP_EMAP

/*
 * uvm_emap_enter: enter a new mapping, without TLB flush.
 */
void
uvm_emap_enter(vaddr_t va, struct vm_page **pgs, u_int npages)
{
	paddr_t pa;
	u_int n;

	for (n = 0; n < npages; n++, va += PAGE_SIZE) {
		pa = VM_PAGE_TO_PHYS(pgs[n]);
		pmap_emap_enter(va, pa, VM_PROT_READ);
	}
}

/*
 * uvm_emap_remove: remove a mapping.
 */
void
uvm_emap_remove(vaddr_t sva, vsize_t len)
{

	pmap_emap_remove(sva, len);
}

/*
 * uvm_emap_gen_return: get the global generation number.
 *
 * => can be called from IPI handler, therefore function must be safe.
 */
u_int
uvm_emap_gen_return(void)
{
	u_int gen;

	gen = uvm_emap_gen;
	if (__predict_false(gen == UVM_EMAP_INACTIVE)) {
		/*
		 * Instead of looping, just increment it on our side.
		 * Another thread could race and increment it again,
		 * but that has no negative effect.
		 */
		gen = atomic_inc_uint_nv(&uvm_emap_gen);
	}
	KASSERT(gen != UVM_EMAP_INACTIVE);
	return gen;
}

/*
 * uvm_emap_switch: if the CPU is 'behind' the LWP in emap visibility,
 * perform a TLB flush and thus update the local view.  Its main purpose
 * is to handle kernel preemption while an emap is in use.
 *
 * => called from mi_switch(), when an LWP returns after a block or preempt.
 */
void
uvm_emap_switch(lwp_t *l)
{
	struct uvm_cpu *ucpu;
	u_int curgen, gen;

	KASSERT(kpreempt_disabled());

	/* If LWP did not use emap, then nothing to do. */
	if (__predict_true(l->l_emap_gen == UVM_EMAP_INACTIVE)) {
		return;
	}

	/*
	 * No need to synchronise if the generation number of the current
	 * CPU is newer than that of this LWP.
	 *
	 * This test assumes two's complement arithmetic and allows
	 * ~2B missed updates before it will produce bad results.
	 */
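	/*
	 * A worked example of the wrap-around safe comparison below
	 * (the numbers are made up, purely for illustration): with
	 * curgen = 0x00000002 and gen = 0xfffffffe, the unsigned
	 * difference is 4, which is >= 0 when cast to signed int, so a
	 * CPU whose counter has already wrapped is still seen as newer.
	 */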
	ucpu = curcpu()->ci_data.cpu_uvm;
	curgen = ucpu->emap_gen;
	gen = l->l_emap_gen;
	if (__predict_true((signed int)(curgen - gen) >= 0)) {
		return;
	}

	/*
	 * See comments in uvm_emap_consume() about memory
	 * barriers and race conditions.
	 */
	curgen = uvm_emap_gen_return();
	pmap_emap_sync(false);
	ucpu->emap_gen = curgen;
}

/*
 * uvm_emap_consume: update the current CPU and LWP to the given generation
 * of the emap.  If the LWP migrates to a different CPU after a block or
 * preempt, uvm_emap_switch() will synchronise it there.
 *
 * => may be called from both interrupt and thread context.
 */
void
uvm_emap_consume(u_int gen)
{
	struct cpu_info *ci;
	struct uvm_cpu *ucpu;
	lwp_t *l = curlwp;
	u_int curgen;

	if (gen == UVM_EMAP_INACTIVE) {
		return;
	}

	/*
	 * No need to synchronise if the generation number of the current
	 * CPU is newer than that of this LWP.
	 *
	 * This test assumes two's complement arithmetic and allows
	 * ~2B missed updates before it will produce bad results.
	 */
	kpreempt_disable();
	ci = l->l_cpu;
	ucpu = ci->ci_data.cpu_uvm;
	if (__predict_true((signed int)(ucpu->emap_gen - gen) >= 0)) {
		l->l_emap_gen = ucpu->emap_gen;
		kpreempt_enable();
		return;
	}

	/*
	 * Record the current generation _before_ issuing the TLB flush.
	 * No need for a memory barrier before, as reading a stale value
	 * for uvm_emap_gen is not a problem.
	 *
	 * pmap_emap_sync() must implicitly perform a full memory barrier,
	 * which prevents us from fetching a value from after the TLB flush
	 * has occurred (which would be bad).
	 *
	 * We can race with an interrupt on the current CPU updating the
	 * counter to a newer value.  This could cause us to set a stale
	 * value into ucpu->emap_gen, overwriting a newer update from the
	 * interrupt.  However, it does not matter, since:
	 * (1) interrupts always run to completion or block;
	 * (2) interrupts will only ever install a newer value; and
	 * (3) we will roll the value forward later.
	 */
	curgen = uvm_emap_gen_return();
	pmap_emap_sync(true);
	ucpu->emap_gen = curgen;
	l->l_emap_gen = curgen;
	KASSERT((signed int)(curgen - gen) >= 0);
	kpreempt_enable();
}

/*
 * uvm_emap_produce: increment emap generation counter.
 *
 * => pmap updates must be globally visible.
 * => caller must have already entered mappings.
 * => may be called from both interrupt and thread context.
 */
u_int
uvm_emap_produce(void)
{
	u_int gen;
again:
	gen = atomic_inc_uint_nv(&uvm_emap_gen);
	if (__predict_false(gen == UVM_EMAP_INACTIVE)) {
		goto again;
	}
	return gen;
}

/*
 * uvm_emap_update: update the emap generation number for the current CPU.
 *
 * The function is called by MD code (e.g. the pmap) to take advantage of
 * TLB flushes initiated for other reasons, which sync the emap as a side
 * effect.  Note that the generation number must be sampled before the
 * actual TLB flush, to avoid racing with a newly produced number.
 *
 * => can be called from IPI handler, therefore function must be safe.
 * => should be called _after_ TLB flush.
 * => emap generation number should be taken _before_ TLB flush.
 * => must be called with preemption disabled.
 */
void
uvm_emap_update(u_int gen)
{
	struct uvm_cpu *ucpu;

	/*
	 * See comments in uvm_emap_consume() about memory barriers and
	 * race conditions.  The store is atomic as long as emap_gen is
	 * word-sized.
	 */
	CTASSERT(sizeof(ucpu->emap_gen) == sizeof(int));
	/* XXX: KASSERT(kpreempt_disabled()); */

	ucpu = curcpu()->ci_data.cpu_uvm;
	ucpu->emap_gen = gen;
}

#else

/*
 * Stubs for architectures which do not support emap.
 */

void
uvm_emap_enter(vaddr_t va, struct vm_page **pgs, u_int npages)
{
	paddr_t pa;
	u_int n;

	for (n = 0; n < npages; n++, va += PAGE_SIZE) {
		pa = VM_PAGE_TO_PHYS(pgs[n]);
		pmap_kenter_pa(va, pa, VM_PROT_READ, 0);
	}
	pmap_update(pmap_kernel());
}

void
uvm_emap_remove(vaddr_t sva, vsize_t len)
{

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
}

#endif