1 | /* $NetBSD: pktqueue.c,v 1.8 2014/07/04 01:50:22 ozaki-r Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2014 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Mindaugas Rasiukevicius. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | /* |
33 | * The packet queue (pktqueue) interface is a lockless IP input queue |
34 | * which also abstracts and handles network ISR scheduling. It provides |
35 | * a mechanism to enable receiver-side packet steering (RPS). |
36 | */ |
37 | |
38 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.8 2014/07/04 01:50:22 ozaki-r Exp $");
40 | |
41 | #include <sys/param.h> |
42 | #include <sys/types.h> |
43 | |
44 | #include <sys/atomic.h> |
45 | #include <sys/cpu.h> |
#include <sys/pcq.h>
#include <sys/kmem.h>
47 | #include <sys/intr.h> |
48 | #include <sys/mbuf.h> |
49 | #include <sys/proc.h> |
50 | #include <sys/percpu.h> |
51 | |
52 | #include <net/pktqueue.h> |
53 | |
54 | /* |
55 | * WARNING: update this if struct pktqueue changes. |
56 | */ |
57 | #define PKTQ_CLPAD \ |
58 | MAX(COHERENCY_UNIT, COHERENCY_UNIT - sizeof(kmutex_t) - sizeof(u_int)) |
59 | |
60 | struct pktqueue { |
61 | /* |
62 | * The lock used for a barrier mechanism. The barrier counter, |
63 | * as well as the drop counter, are managed atomically though. |
64 | * Ensure this group is in a separate cache line. |
65 | */ |
66 | kmutex_t pq_lock; |
67 | volatile u_int pq_barrier; |
68 | uint8_t _pad[PKTQ_CLPAD]; |
69 | |
	/* The maximum queue length, counters and the softint handler. */
71 | u_int pq_maxlen; |
72 | percpu_t * pq_counters; |
73 | void * pq_sih; |
74 | |
75 | /* Finally, per-CPU queues. */ |
76 | pcq_t * pq_queue[]; |
77 | }; |
78 | |
79 | /* The counters of the packet queue. */ |
80 | #define PQCNT_ENQUEUE 0 |
81 | #define PQCNT_DEQUEUE 1 |
82 | #define PQCNT_DROP 2 |
83 | #define PQCNT_NCOUNTERS 3 |
84 | |
85 | typedef struct { |
86 | uint64_t count[PQCNT_NCOUNTERS]; |
87 | } pktq_counters_t; |
88 | |
/* Special marker value used by the pktq_barrier() mechanism. */
90 | #define PKTQ_MARKER ((void *)(~0ULL)) |
91 | |
92 | /* |
93 | * The total size of pktqueue_t which depends on the number of CPUs. |
94 | */ |
95 | #define PKTQUEUE_STRUCT_LEN(ncpu) \ |
96 | roundup2(offsetof(pktqueue_t, pq_queue[ncpu]), coherency_unit) |
97 | |
/*
 * pktq_create: construct a packet queue with per-CPU sub-queues of
 * the given maximum length, and establish an MP-safe soft interrupt
 * handler to process them.  Returns NULL on failure.
 */
pktqueue_t *
pktq_create(size_t maxlen, void (*intrh)(void *), void *sc)
100 | { |
101 | const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU; |
102 | const size_t len = PKTQUEUE_STRUCT_LEN(ncpu); |
103 | pktqueue_t *pq; |
104 | percpu_t *pc; |
105 | void *sih; |
106 | |
107 | if ((pc = percpu_alloc(sizeof(pktq_counters_t))) == NULL) { |
108 | return NULL; |
109 | } |
110 | if ((sih = softint_establish(sflags, intrh, sc)) == NULL) { |
111 | percpu_free(pc, sizeof(pktq_counters_t)); |
112 | return NULL; |
113 | } |
114 | |
115 | pq = kmem_zalloc(len, KM_SLEEP); |
116 | for (u_int i = 0; i < ncpu; i++) { |
117 | pq->pq_queue[i] = pcq_create(maxlen, KM_SLEEP); |
118 | } |
119 | mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE); |
120 | pq->pq_maxlen = maxlen; |
121 | pq->pq_counters = pc; |
122 | pq->pq_sih = sih; |
123 | |
124 | return pq; |
125 | } |
126 | |
/*
 * pktq_destroy: release the resources of the packet queue.
 *
 * => All per-CPU sub-queues must be empty.
 */
void
pktq_destroy(pktqueue_t *pq)
129 | { |
130 | const size_t len = PKTQUEUE_STRUCT_LEN(ncpu); |
131 | |
132 | for (u_int i = 0; i < ncpu; i++) { |
133 | pcq_t *q = pq->pq_queue[i]; |
134 | KASSERT(pcq_peek(q) == NULL); |
135 | pcq_destroy(q); |
136 | } |
137 | percpu_free(pq->pq_counters, sizeof(pktq_counters_t)); |
138 | softint_disestablish(pq->pq_sih); |
139 | mutex_destroy(&pq->pq_lock); |
140 | kmem_free(pq, len); |
141 | } |
142 | |
143 | /* |
144 | * - pktq_inc_counter: increment the counter given an ID. |
145 | * - pktq_collect_counts: handler to sum up the counts from each CPU. |
146 | * - pktq_getcount: return the effective count given an ID. |
147 | */ |
148 | |
149 | static inline void |
150 | pktq_inc_count(pktqueue_t *pq, u_int i) |
151 | { |
152 | percpu_t *pc = pq->pq_counters; |
153 | pktq_counters_t *c; |
154 | |
155 | c = percpu_getref(pc); |
156 | c->count[i]++; |
157 | percpu_putref(pc); |
158 | } |
159 | |
160 | static void |
161 | pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci) |
162 | { |
163 | const pktq_counters_t *c = mem; |
164 | pktq_counters_t *sum = arg; |
165 | |
166 | for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) { |
167 | sum->count[i] += c->count[i]; |
168 | } |
169 | } |
170 | |
171 | uint64_t |
172 | pktq_get_count(pktqueue_t *pq, pktq_count_t c) |
173 | { |
174 | pktq_counters_t sum; |
175 | |
176 | if (c != PKTQ_MAXLEN) { |
177 | memset(&sum, 0, sizeof(sum)); |
178 | percpu_foreach(pq->pq_counters, pktq_collect_counts, &sum); |
179 | } |
180 | switch (c) { |
181 | case PKTQ_NITEMS: |
182 | return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE]; |
183 | case PKTQ_DROPS: |
184 | return sum.count[PQCNT_DROP]; |
185 | case PKTQ_MAXLEN: |
186 | return pq->pq_maxlen; |
187 | } |
188 | return 0; |
189 | } |
190 | |
191 | uint32_t |
192 | pktq_rps_hash(const struct mbuf *m __unused) |
193 | { |
194 | /* |
195 | * XXX: No distribution yet; the softnet_lock contention |
196 | * XXX: must be eliminated first. |
197 | */ |
198 | return 0; |
199 | } |
200 | |
201 | /* |
202 | * pktq_enqueue: inject the packet into the end of the queue. |
203 | * |
204 | * => Must be called from the interrupt or with the preemption disabled. |
205 | * => Consumes the packet and returns true on success. |
206 | * => Returns false on failure; caller is responsible to free the packet. |
207 | */ |
208 | bool |
209 | pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused) |
210 | { |
211 | #if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI) |
212 | const unsigned cpuid = curcpu()->ci_index; |
213 | #else |
214 | const unsigned cpuid = hash % ncpu; |
215 | #endif |
216 | |
217 | KASSERT(kpreempt_disabled()); |
218 | |
219 | if (__predict_false(!pcq_put(pq->pq_queue[cpuid], m))) { |
220 | pktq_inc_count(pq, PQCNT_DROP); |
221 | return false; |
222 | } |
223 | softint_schedule_cpu(pq->pq_sih, cpu_lookup(cpuid)); |
224 | pktq_inc_count(pq, PQCNT_ENQUEUE); |
225 | return true; |
226 | } |
227 | |
228 | /* |
229 | * pktq_dequeue: take a packet from the queue. |
230 | * |
231 | * => Must be called with preemption disabled. |
232 | * => Must ensure there are not concurrent dequeue calls. |
233 | */ |
234 | struct mbuf * |
235 | pktq_dequeue(pktqueue_t *pq) |
236 | { |
237 | const struct cpu_info *ci = curcpu(); |
238 | const unsigned cpuid = cpu_index(ci); |
239 | struct mbuf *m; |
240 | |
241 | m = pcq_get(pq->pq_queue[cpuid]); |
242 | if (__predict_false(m == PKTQ_MARKER)) { |
243 | /* Note the marker entry. */ |
244 | atomic_inc_uint(&pq->pq_barrier); |
245 | return NULL; |
246 | } |
247 | if (__predict_true(m != NULL)) { |
248 | pktq_inc_count(pq, PQCNT_DEQUEUE); |
249 | } |
250 | return m; |
251 | } |
252 | |
253 | /* |
254 | * pktq_barrier: waits for a grace period when all packets enqueued at |
255 | * the moment of calling this routine will be processed. This is used |
256 | * to ensure that e.g. packets referencing some interface were drained. |
257 | */ |
258 | void |
259 | pktq_barrier(pktqueue_t *pq) |
260 | { |
261 | u_int pending = 0; |
262 | |
263 | mutex_enter(&pq->pq_lock); |
264 | KASSERT(pq->pq_barrier == 0); |
265 | |
266 | for (u_int i = 0; i < ncpu; i++) { |
267 | pcq_t *q = pq->pq_queue[i]; |
268 | |
		/* If the queue is empty, there is nothing to do. */
270 | if (pcq_peek(q) == NULL) { |
271 | continue; |
272 | } |
		/* Otherwise, insert the marker entry. */
		while (!pcq_put(q, PKTQ_MARKER)) {
			kpause("pktqsync", false, 1, NULL);
		}
277 | kpreempt_disable(); |
278 | softint_schedule_cpu(pq->pq_sih, cpu_lookup(i)); |
279 | kpreempt_enable(); |
280 | pending++; |
281 | } |
282 | |
283 | /* Wait for each queue to process the markers. */ |
284 | while (pq->pq_barrier != pending) { |
		kpause("pktqsync", false, 1, NULL);
286 | } |
287 | pq->pq_barrier = 0; |
288 | mutex_exit(&pq->pq_lock); |
289 | } |
290 | |
291 | /* |
292 | * pktq_flush: free mbufs in all queues. |
293 | * |
294 | * => The caller must ensure there are no concurrent writers or flush calls. |
295 | */ |
296 | void |
297 | pktq_flush(pktqueue_t *pq) |
298 | { |
299 | struct mbuf *m; |
300 | |
301 | for (u_int i = 0; i < ncpu; i++) { |
302 | while ((m = pcq_get(pq->pq_queue[i])) != NULL) { |
303 | pktq_inc_count(pq, PQCNT_DEQUEUE); |
304 | m_freem(m); |
305 | } |
306 | } |
307 | } |
308 | |
309 | /* |
310 | * pktq_set_maxlen: create per-CPU queues using a new size and replace |
311 | * the existing queues without losing any packets. |
312 | */ |
313 | int |
314 | pktq_set_maxlen(pktqueue_t *pq, size_t maxlen) |
315 | { |
316 | const u_int slotbytes = ncpu * sizeof(pcq_t *); |
317 | pcq_t **qs; |
318 | |
319 | if (!maxlen || maxlen > PCQ_MAXLEN) |
320 | return EINVAL; |
321 | if (pq->pq_maxlen == maxlen) |
322 | return 0; |
323 | |
324 | /* First, allocate the new queues and replace them. */ |
325 | qs = kmem_zalloc(slotbytes, KM_SLEEP); |
326 | for (u_int i = 0; i < ncpu; i++) { |
327 | qs[i] = pcq_create(maxlen, KM_SLEEP); |
328 | } |
329 | mutex_enter(&pq->pq_lock); |
330 | for (u_int i = 0; i < ncpu; i++) { |
331 | /* Swap: store of a word is atomic. */ |
332 | pcq_t *q = pq->pq_queue[i]; |
333 | pq->pq_queue[i] = qs[i]; |
334 | qs[i] = q; |
335 | } |
336 | pq->pq_maxlen = maxlen; |
337 | mutex_exit(&pq->pq_lock); |
338 | |
339 | /* |
340 | * At this point, the new packets are flowing into the new |
341 | * queues. However, the old queues may have some packets |
342 | * present which are no longer being processed. We are going |
343 | * to re-enqueue them. This may change the order of packet |
344 | * arrival, but it is not considered an issue. |
345 | * |
346 | * There may be in-flight interrupts calling pktq_dequeue() |
347 | * which reference the old queues. Issue a barrier to ensure |
348 | * that we are going to be the only pcq_get() callers on the |
349 | * old queues. |
350 | */ |
351 | pktq_barrier(pq); |
352 | |
353 | for (u_int i = 0; i < ncpu; i++) { |
354 | struct mbuf *m; |
355 | |
356 | while ((m = pcq_get(qs[i])) != NULL) { |
357 | while (!pcq_put(pq->pq_queue[i], m)) { |
				kpause("pktqrenq", false, 1, NULL);
359 | } |
360 | } |
361 | pcq_destroy(qs[i]); |
362 | } |
363 | |
364 | /* Well, that was fun. */ |
365 | kmem_free(qs, slotbytes); |
366 | return 0; |
367 | } |
368 | |
369 | int |
370 | sysctl_pktq_maxlen(SYSCTLFN_ARGS, pktqueue_t *pq) |
371 | { |
372 | u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN); |
373 | struct sysctlnode node = *rnode; |
374 | int error; |
375 | |
376 | node.sysctl_data = &nmaxlen; |
377 | error = sysctl_lookup(SYSCTLFN_CALL(&node)); |
378 | if (error || newp == NULL) |
379 | return error; |
380 | return pktq_set_maxlen(pq, nmaxlen); |
381 | } |
382 | |
383 | int |
384 | sysctl_pktq_count(SYSCTLFN_ARGS, pktqueue_t *pq, u_int count_id) |
385 | { |
386 | int count = pktq_get_count(pq, count_id); |
387 | struct sysctlnode node = *rnode; |
388 | node.sysctl_data = &count; |
389 | return sysctl_lookup(SYSCTLFN_CALL(&node)); |
390 | } |
391 | |