// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#include "rxe.h"
#include "rxe_loc.h"

/* Return a random 8 bit key value that is
 * different than the last_key. Set last_key to -1
 * if this is the first key for an MR or MW
 */
u8 rxe_get_next_key(u32 last_key)
{
        u8 key;

        do {
                get_random_bytes(&key, 1);
        } while (key == last_key);

        return key;
}

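/* Check that the range [iova, iova + length) lies entirely inside the
 * memory registered for this MR. DMA MRs cover all of memory and always
 * pass the check.
 */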
int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
{
        struct rxe_map_set *set = mr->cur_map_set;

        switch (mr->type) {
        case IB_MR_TYPE_DMA:
                return 0;

        case IB_MR_TYPE_USER:
        case IB_MR_TYPE_MEM_REG:
                if (iova < set->iova || length > set->length ||
                    iova > set->iova + set->length - length)
                        return -EFAULT;
                return 0;

        default:
                pr_warn("%s: mr type (%d) not supported\n",
                        __func__, mr->type);
                return -EFAULT;
        }
}

#define IB_ACCESS_REMOTE (IB_ACCESS_REMOTE_READ \
                          | IB_ACCESS_REMOTE_WRITE \
                          | IB_ACCESS_REMOTE_ATOMIC)

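/* Initialize the lkey/rkey for a new MR. The upper 24 bits of a key are
 * the MR's pool index and the low 8 bits are a random "key portion",
 * e.g. pool index 0x12 with random byte 0xab gives lkey 0x12ab. The rkey
 * is only non-zero if the MR allows remote access.
 */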
static void rxe_mr_init(int access, struct rxe_mr *mr)
{
        u32 lkey = mr->pelem.index << 8 | rxe_get_next_key(-1);
        u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;

        /* set ibmr->l/rkey and also copy into private l/rkey.
         * For user MRs these will always be the same; for cases where
         * the caller 'owns' the key portion they may differ until the
         * REG_MR WQE is executed.
         */
        mr->lkey = mr->ibmr.lkey = lkey;
        mr->rkey = mr->ibmr.rkey = rkey;

        mr->state = RXE_MR_STATE_INVALID;
        mr->map_shift = ilog2(RXE_BUF_PER_MAP);
}

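/* Free one map_set: each rxe_map, the map pointer array, and the set
 * itself.
 */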
static void rxe_mr_free_map_set(int num_map, struct rxe_map_set *set)
{
        int i;

        for (i = 0; i < num_map; i++)
                kfree(set->map[i]);

        kfree(set->map);
        kfree(set);
}

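/* Allocate a map_set with num_map rxe_map entries, unwinding any partial
 * allocation on failure. Returns 0 on success or -ENOMEM.
 */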
static int rxe_mr_alloc_map_set(int num_map, struct rxe_map_set **setp)
{
        int i;
        struct rxe_map_set *set;

        set = kmalloc(sizeof(*set), GFP_KERNEL);
        if (!set)
                goto err_out;

        set->map = kmalloc_array(num_map, sizeof(struct rxe_map *), GFP_KERNEL);
        if (!set->map)
                goto err_free_set;

        for (i = 0; i < num_map; i++) {
                set->map[i] = kmalloc(sizeof(struct rxe_map), GFP_KERNEL);
                if (!set->map[i])
                        goto err_free_map;
        }

        *setp = set;

        return 0;

err_free_map:
        for (i--; i >= 0; i--)
                kfree(set->map[i]);

        kfree(set->map);
err_free_set:
        kfree(set);
err_out:
        return -ENOMEM;
}

/**
 * rxe_mr_alloc() - Allocate memory map array(s) for MR
 * @mr: Memory region
 * @num_buf: Number of buffer descriptors to support
 * @both: If non zero allocate both mr->cur_map_set and mr->next_map_set,
 *        else just allocate mr->cur_map_set. Used for fast MRs.
 *
 * Return: 0 on success else an error
 */
static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf, int both)
{
        int ret;
        int num_map;

        BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP));
        num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;

        mr->map_shift = ilog2(RXE_BUF_PER_MAP);
        mr->map_mask = RXE_BUF_PER_MAP - 1;
        mr->num_buf = num_buf;
        mr->max_buf = num_map * RXE_BUF_PER_MAP;
        mr->num_map = num_map;

        ret = rxe_mr_alloc_map_set(num_map, &mr->cur_map_set);
        if (ret)
                return -ENOMEM;

        if (both) {
                ret = rxe_mr_alloc_map_set(num_map, &mr->next_map_set);
                if (ret)
                        goto err_free;
        }

        return 0;

err_free:
        rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);
        mr->cur_map_set = NULL;
        return -ENOMEM;
}

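/* Initialize a DMA MR. DMA MRs carry no map sets; iova values are used
 * directly as kernel virtual addresses.
 */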
void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
{
        rxe_mr_init(access, mr);

        mr->ibmr.pd = &pd->ibpd;
        mr->access = access;
        mr->state = RXE_MR_STATE_VALID;
        mr->type = IB_MR_TYPE_DMA;
}

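/* Initialize a user MR: pin the user pages with ib_umem_get() and record
 * the kernel virtual address of each page in the current map set.
 */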
int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
                     int access, struct rxe_mr *mr)
{
        struct rxe_map_set *set;
        struct rxe_map **map;
        struct rxe_phys_buf *buf = NULL;
        struct ib_umem *umem;
        struct sg_page_iter sg_iter;
        int num_buf;
        void *vaddr;
        int err;

        umem = ib_umem_get(pd->ibpd.device, start, length, access);
        if (IS_ERR(umem)) {
                pr_warn("%s: Unable to pin memory region err = %d\n",
                        __func__, (int)PTR_ERR(umem));
                err = PTR_ERR(umem);
                goto err_out;
        }

        num_buf = ib_umem_num_pages(umem);

        rxe_mr_init(access, mr);

        err = rxe_mr_alloc(mr, num_buf, 0);
        if (err) {
                pr_warn("%s: Unable to allocate memory for map\n",
                        __func__);
                goto err_release_umem;
        }

        set = mr->cur_map_set;
        set->page_shift = PAGE_SHIFT;
        set->page_mask = PAGE_SIZE - 1;

        num_buf = 0;
        map = set->map;

        if (length > 0) {
                buf = map[0]->buf;

                for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) {
                        if (num_buf >= RXE_BUF_PER_MAP) {
                                map++;
                                buf = map[0]->buf;
                                num_buf = 0;
                        }

                        vaddr = page_address(sg_page_iter_page(&sg_iter));
                        if (!vaddr) {
                                pr_warn("%s: Unable to get virtual address\n",
                                        __func__);
                                err = -ENOMEM;
                                goto err_release_umem;
                        }

                        buf->addr = (uintptr_t)vaddr;
                        buf->size = PAGE_SIZE;
                        num_buf++;
                        buf++;
                }
        }

        mr->ibmr.pd = &pd->ibpd;
        mr->umem = umem;
        mr->access = access;
        mr->state = RXE_MR_STATE_VALID;
        mr->type = IB_MR_TYPE_USER;

        set->length = length;
        set->iova = iova;
        set->va = start;
        set->offset = ib_umem_offset(umem);

        return 0;

err_release_umem:
        ib_umem_release(umem);
err_out:
        return err;
}

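/* Initialize a fast-registration MR. Both map sets are allocated so a
 * later REG_MR WQE can build page lists in next_map_set and then swap
 * it in as cur_map_set.
 */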
int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr)
{
        int err;

        /* always allow remote access for FMRs */
        rxe_mr_init(IB_ACCESS_REMOTE, mr);

        err = rxe_mr_alloc(mr, max_pages, 1);
        if (err)
                goto err1;

        mr->ibmr.pd = &pd->ibpd;
        mr->max_buf = max_pages;
        mr->state = RXE_MR_STATE_FREE;
        mr->type = IB_MR_TYPE_MEM_REG;

        return 0;

err1:
        return err;
}

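/* Translate an iova within the MR into a (map index, buffer index, offset)
 * triple. When the buffers are uniformly sized pages (page_shift is set)
 * this is pure shift/mask arithmetic; otherwise the buffer list is walked
 * linearly. For example, assuming 4K pages and RXE_BUF_PER_MAP == 256, an
 * offset of 0x123456 into the map set yields page offset 0x456, buffer
 * index 0x23 and map index 0x1.
 */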
static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
                        size_t *offset_out)
{
        struct rxe_map_set *set = mr->cur_map_set;
        size_t offset = iova - set->iova + set->offset;
        int map_index;
        int buf_index;
        u64 length;
        struct rxe_map *map;

        if (likely(set->page_shift)) {
                *offset_out = offset & set->page_mask;
                offset >>= set->page_shift;
                *n_out = offset & mr->map_mask;
                *m_out = offset >> mr->map_shift;
        } else {
                map_index = 0;
                buf_index = 0;

                map = set->map[map_index];
                length = map->buf[buf_index].size;

                while (offset >= length) {
                        offset -= length;
                        buf_index++;

                        if (buf_index == RXE_BUF_PER_MAP) {
                                map_index++;
                                buf_index = 0;
                        }
                        map = set->map[map_index];
                        length = map->buf[buf_index].size;
                }

                *m_out = map_index;
                *n_out = buf_index;
                *offset_out = offset;
        }
}

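/* Return the kernel virtual address corresponding to iova, or NULL if the
 * MR is not valid, the range check fails, or the requested length would
 * cross a buffer boundary. DMA MRs (no map set) return iova unchanged.
 */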
void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
{
        size_t offset;
        int m, n;
        void *addr;

        if (mr->state != RXE_MR_STATE_VALID) {
                pr_warn("mr not in valid state\n");
                addr = NULL;
                goto out;
        }

        if (!mr->cur_map_set) {
                addr = (void *)(uintptr_t)iova;
                goto out;
        }

        if (mr_check_range(mr, iova, length)) {
                pr_warn("range violation\n");
                addr = NULL;
                goto out;
        }

        lookup_iova(mr, iova, &m, &n, &offset);

        if (offset + length > mr->cur_map_set->map[m]->buf[n].size) {
                pr_warn("crosses page boundary\n");
                addr = NULL;
                goto out;
        }

        addr = (void *)(uintptr_t)mr->cur_map_set->map[m]->buf[n].addr + offset;

out:
        return addr;
}

/* copy data from a range (vaddr, vaddr+length-1) to or from
 * an MR object starting at iova.
 */
int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
                enum rxe_mr_copy_dir dir)
{
        int err;
        int bytes;
        u8 *va;
        struct rxe_map **map;
        struct rxe_phys_buf *buf;
        int m;
        int i;
        size_t offset;

        if (length == 0)
                return 0;

        if (mr->type == IB_MR_TYPE_DMA) {
                u8 *src, *dest;

                src = (dir == RXE_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova);

                dest = (dir == RXE_TO_MR_OBJ) ? ((void *)(uintptr_t)iova) : addr;

                memcpy(dest, src, length);

                return 0;
        }

        WARN_ON_ONCE(!mr->cur_map_set);

        err = mr_check_range(mr, iova, length);
        if (err) {
                err = -EFAULT;
                goto err1;
        }

        lookup_iova(mr, iova, &m, &i, &offset);

        map = mr->cur_map_set->map + m;
        buf = map[0]->buf + i;

        while (length > 0) {
                u8 *src, *dest;

                va = (u8 *)(uintptr_t)buf->addr + offset;
                src = (dir == RXE_TO_MR_OBJ) ? addr : va;
                dest = (dir == RXE_TO_MR_OBJ) ? va : addr;

                bytes = buf->size - offset;

                if (bytes > length)
                        bytes = length;

                memcpy(dest, src, bytes);

                length -= bytes;
                addr += bytes;

                offset = 0;
                buf++;
                i++;

                if (i == RXE_BUF_PER_MAP) {
                        i = 0;
                        map++;
                        buf = map[0]->buf;
                }
        }

        return 0;

err1:
        return err;
}

/* copy data in or out of a wqe, i.e. sg list
 * under the control of a dma descriptor
 */
int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma,
              void *addr, int length, enum rxe_mr_copy_dir dir)
{
        int bytes;
        struct rxe_sge *sge = &dma->sge[dma->cur_sge];
        int offset = dma->sge_offset;
        int resid = dma->resid;
        struct rxe_mr *mr = NULL;
        u64 iova;
        int err;

        if (length == 0)
                return 0;

        if (length > resid) {
                err = -EINVAL;
                goto err2;
        }

        if (sge->length && (offset < sge->length)) {
                mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL);
                if (!mr) {
                        err = -EINVAL;
                        goto err1;
                }
        }

        while (length > 0) {
                bytes = length;

                if (offset >= sge->length) {
                        if (mr) {
                                rxe_drop_ref(mr);
                                mr = NULL;
                        }
                        sge++;
                        dma->cur_sge++;
                        offset = 0;

                        if (dma->cur_sge >= dma->num_sge) {
                                err = -ENOSPC;
                                goto err2;
                        }

                        if (sge->length) {
                                mr = lookup_mr(pd, access, sge->lkey,
                                               RXE_LOOKUP_LOCAL);
                                if (!mr) {
                                        err = -EINVAL;
                                        goto err1;
                                }
                        } else {
                                continue;
                        }
                }

                if (bytes > sge->length - offset)
                        bytes = sge->length - offset;

                if (bytes > 0) {
                        iova = sge->addr + offset;

                        err = rxe_mr_copy(mr, iova, addr, bytes, dir);
                        if (err)
                                goto err2;

                        offset += bytes;
                        resid -= bytes;
                        length -= bytes;
                        addr += bytes;
                }
        }

        dma->sge_offset = offset;
        dma->resid = resid;

        if (mr)
                rxe_drop_ref(mr);

        return 0;

err2:
        if (mr)
                rxe_drop_ref(mr);
err1:
        return err;
}

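/* Skip over length bytes of the sg list described by the dma descriptor
 * without copying any data, advancing sge_offset and resid.
 */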
int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{
        struct rxe_sge *sge = &dma->sge[dma->cur_sge];
        int offset = dma->sge_offset;
        int resid = dma->resid;

        while (length) {
                unsigned int bytes;

                if (offset >= sge->length) {
                        sge++;
                        dma->cur_sge++;
                        offset = 0;
                        if (dma->cur_sge >= dma->num_sge)
                                return -ENOSPC;
                }

                bytes = length;

                if (bytes > sge->length - offset)
                        bytes = sge->length - offset;

                offset += bytes;
                resid -= bytes;
                length -= bytes;
        }

        dma->sge_offset = offset;
        dma->resid = resid;

        return 0;
}

/* (1) find the mr corresponding to lkey/rkey
 *     depending on lookup_type
 * (2) verify that the (qp) pd matches the mr pd
 * (3) verify that the mr can support the requested access
 * (4) verify that mr state is valid
 */
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
                         enum rxe_mr_lookup_type type)
{
        struct rxe_mr *mr;
        struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
        int index = key >> 8;

        mr = rxe_pool_get_index(&rxe->mr_pool, index);
        if (!mr)
                return NULL;

        if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
                     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
                     mr_pd(mr) != pd || (access && !(access & mr->access)) ||
                     mr->state != RXE_MR_STATE_VALID)) {
                rxe_drop_ref(mr);
                mr = NULL;
        }

        return mr;
}

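/* Invalidate the MR identified by rkey. Only fast-reg MRs
 * (IB_MR_TYPE_MEM_REG) that are not bound to any MW may be invalidated;
 * on success the MR returns to the FREE state.
 */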
int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey)
{
        struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
        struct rxe_mr *mr;
        int ret;

        mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8);
        if (!mr) {
                pr_err("%s: No MR for rkey %#x\n", __func__, rkey);
                ret = -EINVAL;
                goto err;
        }

        if (rkey != mr->rkey) {
                pr_err("%s: rkey (%#x) doesn't match mr->rkey (%#x)\n",
                       __func__, rkey, mr->rkey);
                ret = -EINVAL;
                goto err_drop_ref;
        }

        if (atomic_read(&mr->num_mw) > 0) {
                pr_warn("%s: Attempt to invalidate an MR while bound to MWs\n",
                        __func__);
                ret = -EINVAL;
                goto err_drop_ref;
        }

        if (unlikely(mr->type != IB_MR_TYPE_MEM_REG)) {
                pr_warn("%s: mr->type (%d) is wrong type\n", __func__, mr->type);
                ret = -EINVAL;
                goto err_drop_ref;
        }

        mr->state = RXE_MR_STATE_FREE;
        ret = 0;

err_drop_ref:
        rxe_drop_ref(mr);
err:
        return ret;
}

/* The user can (re)register a fast MR by executing a REG_MR WQE.
 * The user is expected to hold a reference on the ib mr until the
 * WQE completes.
 * Once a fast MR is created this is the only way to change the
 * private keys. It is the responsibility of the user to maintain
 * the ib mr keys in sync with the rxe mr keys.
 */
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
{
        struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
        u32 key = wqe->wr.wr.reg.key & 0xff;
        u32 access = wqe->wr.wr.reg.access;
        struct rxe_map_set *set;

        /* user can only register MR in free state */
        if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
                pr_warn("%s: mr->lkey = 0x%x not free\n",
                        __func__, mr->lkey);
                return -EINVAL;
        }

        /* user can only register mr with qp in same protection domain */
        if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) {
                pr_warn("%s: qp->pd and mr->pd don't match\n",
                        __func__);
                return -EINVAL;
        }

        mr->access = access;
        mr->lkey = (mr->lkey & ~0xff) | key;
        mr->rkey = (access & IB_ACCESS_REMOTE) ? mr->lkey : 0;
        mr->state = RXE_MR_STATE_VALID;

        set = mr->cur_map_set;
        mr->cur_map_set = mr->next_map_set;
        mr->cur_map_set->iova = wqe->wr.wr.reg.mr->iova;
        mr->next_map_set = set;

        return 0;
}

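/* Append one page address to the fast MR's next_map_set. Fails with
 * -ENOMEM once num_buf pages have been set.
 */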
int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr)
{
        struct rxe_mr *mr = to_rmr(ibmr);
        struct rxe_map_set *set = mr->next_map_set;
        struct rxe_map *map;
        struct rxe_phys_buf *buf;

        if (unlikely(set->nbuf == mr->num_buf))
                return -ENOMEM;

        map = set->map[set->nbuf / RXE_BUF_PER_MAP];
        buf = &map->buf[set->nbuf % RXE_BUF_PER_MAP];

        buf->addr = addr;
        buf->size = ibmr->page_size;
        set->nbuf++;

        return 0;
}

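/* Deregister an MR. Fails if the MR is still bound to memory windows;
 * otherwise the MR is marked invalid and its references are dropped.
 */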
int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
        struct rxe_mr *mr = to_rmr(ibmr);

        if (atomic_read(&mr->num_mw) > 0) {
                pr_warn("%s: Attempt to deregister an MR while bound to MWs\n",
                        __func__);
                return -EINVAL;
        }

        mr->state = RXE_MR_STATE_INVALID;
        rxe_drop_ref(mr_pd(mr));
        rxe_drop_index(mr);
        rxe_drop_ref(mr);

        return 0;
}

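/* Pool cleanup callback invoked when the last reference to the MR is
 * dropped: release the pinned umem (if any) and free both map sets.
 */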
void rxe_mr_cleanup(struct rxe_pool_entry *arg)
{
        struct rxe_mr *mr = container_of(arg, typeof(*mr), pelem);

        ib_umem_release(mr->umem);

        if (mr->cur_map_set)
                rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);

        if (mr->next_map_set)
                rxe_mr_free_map_set(mr->num_map, mr->next_map_set);
}