1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2019 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 
13 #include <linux/pci.h>
14 #include <linux/aer.h>
15 #include <linux/module.h>
16 
17 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
18 
19 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
20 
21 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
22 MODULE_DESCRIPTION(HL_DRIVER_DESC);
23 MODULE_LICENSE("GPL v2");
24 
25 static int hl_major;
26 static struct class *hl_class;
27 static DEFINE_IDR(hl_devs_idr);
28 static DEFINE_MUTEX(hl_devs_idr_lock);
29 
30 static int timeout_locked = 30;
31 static int reset_on_lockup = 1;
32 static int memory_scrub;
33 static ulong boot_error_status_mask = ULONG_MAX;
34 
35 module_param(timeout_locked, int, 0444);
36 MODULE_PARM_DESC(timeout_locked,
37 	"Device lockup timeout in seconds (0 = disabled, default 30s)");
38 
39 module_param(reset_on_lockup, int, 0444);
40 MODULE_PARM_DESC(reset_on_lockup,
41 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
42 
43 module_param(memory_scrub, int, 0444);
44 MODULE_PARM_DESC(memory_scrub,
45 	"Scrub device memory in various states (0 = no, 1 = yes, default no)");
46 
47 module_param(boot_error_status_mask, ulong, 0444);
48 MODULE_PARM_DESC(boot_error_status_mask,
49 	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
50 
51 #define PCI_VENDOR_ID_HABANALABS	0x1da3
52 
53 #define PCI_IDS_GOYA			0x0001
54 #define PCI_IDS_GAUDI			0x1000
55 #define PCI_IDS_GAUDI_SEC		0x1010
56 
57 static const struct pci_device_id ids[] = {
58 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
59 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
60 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
61 	{ 0, }
62 };
63 MODULE_DEVICE_TABLE(pci, ids);
64 
65 /*
66  * get_asic_type - translate device id to asic type
67  *
68  * @device: id of the PCI device
69  *
70  * Translate device id to asic type.
71  * In case of unidentified device, return -1
72  */
get_asic_type(u16 device)73 static enum hl_asic_type get_asic_type(u16 device)
74 {
75 	enum hl_asic_type asic_type;
76 
77 	switch (device) {
78 	case PCI_IDS_GOYA:
79 		asic_type = ASIC_GOYA;
80 		break;
81 	case PCI_IDS_GAUDI:
82 		asic_type = ASIC_GAUDI;
83 		break;
84 	case PCI_IDS_GAUDI_SEC:
85 		asic_type = ASIC_GAUDI_SEC;
86 		break;
87 	default:
88 		asic_type = ASIC_INVALID;
89 		break;
90 	}
91 
92 	return asic_type;
93 }
94 
is_asic_secured(enum hl_asic_type asic_type)95 static bool is_asic_secured(enum hl_asic_type asic_type)
96 {
97 	switch (asic_type) {
98 	case ASIC_GAUDI_SEC:
99 		return true;
100 	default:
101 		return false;
102 	}
103 }
104 
105 /*
106  * hl_device_open - open function for habanalabs device
107  *
108  * @inode: pointer to inode structure
109  * @filp: pointer to file structure
110  *
111  * Called when process opens an habanalabs device.
112  */
hl_device_open(struct inode * inode,struct file * filp)113 int hl_device_open(struct inode *inode, struct file *filp)
114 {
115 	enum hl_device_status status;
116 	struct hl_device *hdev;
117 	struct hl_fpriv *hpriv;
118 	int rc;
119 
120 	mutex_lock(&hl_devs_idr_lock);
121 	hdev = idr_find(&hl_devs_idr, iminor(inode));
122 	mutex_unlock(&hl_devs_idr_lock);
123 
124 	if (!hdev) {
125 		pr_err("Couldn't find device %d:%d\n",
126 			imajor(inode), iminor(inode));
127 		return -ENXIO;
128 	}
129 
130 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
131 	if (!hpriv)
132 		return -ENOMEM;
133 
134 	hpriv->hdev = hdev;
135 	filp->private_data = hpriv;
136 	hpriv->filp = filp;
137 	mutex_init(&hpriv->restore_phase_mutex);
138 	kref_init(&hpriv->refcount);
139 	nonseekable_open(inode, filp);
140 
141 	hl_cb_mgr_init(&hpriv->cb_mgr);
142 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
143 
144 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
145 
146 	mutex_lock(&hdev->fpriv_list_lock);
147 
148 	if (!hl_device_operational(hdev, &status)) {
149 		dev_err_ratelimited(hdev->dev,
150 			"Can't open %s because it is %s\n",
151 			dev_name(hdev->dev), hdev->status[status]);
152 		rc = -EPERM;
153 		goto out_err;
154 	}
155 
156 	if (hdev->in_debug) {
157 		dev_err_ratelimited(hdev->dev,
158 			"Can't open %s because it is being debugged by another user\n",
159 			dev_name(hdev->dev));
160 		rc = -EPERM;
161 		goto out_err;
162 	}
163 
164 	if (hdev->compute_ctx) {
165 		dev_dbg_ratelimited(hdev->dev,
166 			"Can't open %s because another user is working on it\n",
167 			dev_name(hdev->dev));
168 		rc = -EBUSY;
169 		goto out_err;
170 	}
171 
172 	rc = hl_ctx_create(hdev, hpriv);
173 	if (rc) {
174 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
175 		goto out_err;
176 	}
177 
178 	/* Device is IDLE at this point so it is legal to change PLLs.
179 	 * There is no need to check anything because if the PLL is
180 	 * already HIGH, the set function will return without doing
181 	 * anything
182 	 */
183 	hl_device_set_frequency(hdev, PLL_HIGH);
184 
185 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
186 	mutex_unlock(&hdev->fpriv_list_lock);
187 
188 	hl_debugfs_add_file(hpriv);
189 
190 	hdev->open_counter++;
191 	hdev->last_successful_open_jif = jiffies;
192 
193 	return 0;
194 
195 out_err:
196 	mutex_unlock(&hdev->fpriv_list_lock);
197 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
198 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
199 	filp->private_data = NULL;
200 	mutex_destroy(&hpriv->restore_phase_mutex);
201 	put_pid(hpriv->taskpid);
202 
203 	kfree(hpriv);
204 
205 	return rc;
206 }
207 
hl_device_open_ctrl(struct inode * inode,struct file * filp)208 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
209 {
210 	struct hl_device *hdev;
211 	struct hl_fpriv *hpriv;
212 	int rc;
213 
214 	mutex_lock(&hl_devs_idr_lock);
215 	hdev = idr_find(&hl_devs_idr, iminor(inode));
216 	mutex_unlock(&hl_devs_idr_lock);
217 
218 	if (!hdev) {
219 		pr_err("Couldn't find device %d:%d\n",
220 			imajor(inode), iminor(inode));
221 		return -ENXIO;
222 	}
223 
224 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
225 	if (!hpriv)
226 		return -ENOMEM;
227 
228 	/* Prevent other routines from reading partial hpriv data by
229 	 * initializing hpriv fields before inserting it to the list
230 	 */
231 	hpriv->hdev = hdev;
232 	filp->private_data = hpriv;
233 	hpriv->filp = filp;
234 	hpriv->is_control = true;
235 	nonseekable_open(inode, filp);
236 
237 	hpriv->taskpid = find_get_pid(current->pid);
238 
239 	mutex_lock(&hdev->fpriv_list_lock);
240 
241 	if (!hl_device_operational(hdev, NULL)) {
242 		dev_err_ratelimited(hdev->dev_ctrl,
243 			"Can't open %s because it is disabled or in reset\n",
244 			dev_name(hdev->dev_ctrl));
245 		rc = -EPERM;
246 		goto out_err;
247 	}
248 
249 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
250 	mutex_unlock(&hdev->fpriv_list_lock);
251 
252 	return 0;
253 
254 out_err:
255 	mutex_unlock(&hdev->fpriv_list_lock);
256 	filp->private_data = NULL;
257 	put_pid(hpriv->taskpid);
258 
259 	kfree(hpriv);
260 
261 	return rc;
262 }
263 
set_driver_behavior_per_device(struct hl_device * hdev)264 static void set_driver_behavior_per_device(struct hl_device *hdev)
265 {
266 	hdev->fw_components = FW_TYPE_ALL_TYPES;
267 	hdev->cpu_queues_enable = 1;
268 	hdev->heartbeat = 1;
269 	hdev->mmu_enable = 1;
270 	hdev->clock_gating_mask = ULONG_MAX;
271 	hdev->sram_scrambler_enable = 1;
272 	hdev->dram_scrambler_enable = 1;
273 	hdev->bmc_enable = 1;
274 	hdev->hard_reset_on_fw_events = 1;
275 	hdev->reset_on_preboot_fail = 1;
276 	hdev->reset_if_device_not_idle = 1;
277 
278 	hdev->reset_pcilink = 0;
279 	hdev->axi_drain = 0;
280 }
281 
282 /*
283  * create_hdev - create habanalabs device instance
284  *
285  * @dev: will hold the pointer to the new habanalabs device structure
286  * @pdev: pointer to the pci device
287  * @asic_type: in case of simulator device, which device is it
288  * @minor: in case of simulator device, the minor of the device
289  *
290  * Allocate memory for habanalabs device and initialize basic fields
291  * Identify the ASIC type
292  * Allocate ID (minor) for the device (only for real devices)
293  */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev,enum hl_asic_type asic_type,int minor)294 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
295 		enum hl_asic_type asic_type, int minor)
296 {
297 	struct hl_device *hdev;
298 	int rc, main_id, ctrl_id = 0;
299 
300 	*dev = NULL;
301 
302 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
303 	if (!hdev)
304 		return -ENOMEM;
305 
306 	/* First, we must find out which ASIC are we handling. This is needed
307 	 * to configure the behavior of the driver (kernel parameters)
308 	 */
309 	if (pdev) {
310 		hdev->asic_type = get_asic_type(pdev->device);
311 		if (hdev->asic_type == ASIC_INVALID) {
312 			dev_err(&pdev->dev, "Unsupported ASIC\n");
313 			rc = -ENODEV;
314 			goto free_hdev;
315 		}
316 	} else {
317 		hdev->asic_type = asic_type;
318 	}
319 
320 	if (pdev)
321 		hdev->asic_prop.fw_security_enabled =
322 					is_asic_secured(hdev->asic_type);
323 	else
324 		hdev->asic_prop.fw_security_enabled = false;
325 
326 	/* Assign status description string */
327 	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL],
328 					"operational", HL_STR_MAX);
329 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
330 					"in reset", HL_STR_MAX);
331 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
332 					"disabled", HL_STR_MAX);
333 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
334 					"needs reset", HL_STR_MAX);
335 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
336 					"in device creation", HL_STR_MAX);
337 
338 	hdev->major = hl_major;
339 	hdev->reset_on_lockup = reset_on_lockup;
340 	hdev->memory_scrub = memory_scrub;
341 	hdev->boot_error_status_mask = boot_error_status_mask;
342 	hdev->stop_on_err = true;
343 
344 	hdev->pldm = 0;
345 
346 	set_driver_behavior_per_device(hdev);
347 
348 	hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
349 	hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
350 
351 	if (timeout_locked)
352 		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
353 	else
354 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
355 
356 	hdev->disabled = true;
357 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
358 
359 	/* Set default DMA mask to 32 bits */
360 	hdev->dma_mask = 32;
361 
362 	mutex_lock(&hl_devs_idr_lock);
363 
364 	/* Always save 2 numbers, 1 for main device and 1 for control.
365 	 * They must be consecutive
366 	 */
367 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
368 				GFP_KERNEL);
369 
370 	if (main_id >= 0)
371 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
372 					main_id + 2, GFP_KERNEL);
373 
374 	mutex_unlock(&hl_devs_idr_lock);
375 
376 	if ((main_id < 0) || (ctrl_id < 0)) {
377 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
378 			pr_err("too many devices in the system\n");
379 
380 		if (main_id >= 0) {
381 			mutex_lock(&hl_devs_idr_lock);
382 			idr_remove(&hl_devs_idr, main_id);
383 			mutex_unlock(&hl_devs_idr_lock);
384 		}
385 
386 		rc = -EBUSY;
387 		goto free_hdev;
388 	}
389 
390 	hdev->id = main_id;
391 	hdev->id_control = ctrl_id;
392 
393 	*dev = hdev;
394 
395 	return 0;
396 
397 free_hdev:
398 	kfree(hdev);
399 	return rc;
400 }
401 
402 /*
403  * destroy_hdev - destroy habanalabs device instance
404  *
405  * @dev: pointer to the habanalabs device structure
406  *
407  */
destroy_hdev(struct hl_device * hdev)408 void destroy_hdev(struct hl_device *hdev)
409 {
410 	/* Remove device from the device list */
411 	mutex_lock(&hl_devs_idr_lock);
412 	idr_remove(&hl_devs_idr, hdev->id);
413 	idr_remove(&hl_devs_idr, hdev->id_control);
414 	mutex_unlock(&hl_devs_idr_lock);
415 
416 	kfree(hdev);
417 }
418 
/*
 * hl_pmops_suspend - power-management suspend callback
 *
 * @dev: generic device whose driver data holds the habanalabs device
 *
 * Return: the result of hl_device_suspend(), or 0 when no habanalabs
 * device is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	/* Nothing to suspend; report it but do not fail the transition */
	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
432 
/*
 * hl_pmops_resume - power-management resume callback
 *
 * @dev: generic device whose driver data holds the habanalabs device
 *
 * Return: the result of hl_device_resume(), or 0 when no habanalabs
 * device is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	/* Nothing to resume; report it but do not fail the transition */
	pr_err("device pointer is NULL in resume\n");
	return 0;
}
446 
447 /*
448  * hl_pci_probe - probe PCI habanalabs devices
449  *
450  * @pdev: pointer to pci device
451  * @id: pointer to pci device id structure
452  *
453  * Standard PCI probe function for habanalabs device.
454  * Create a new habanalabs device and initialize it according to the
455  * device's type
456  */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)457 static int hl_pci_probe(struct pci_dev *pdev,
458 				const struct pci_device_id *id)
459 {
460 	struct hl_device *hdev;
461 	int rc;
462 
463 	dev_info(&pdev->dev, HL_NAME
464 		 " device found [%04x:%04x] (rev %x)\n",
465 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
466 
467 	rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
468 	if (rc)
469 		return rc;
470 
471 	pci_set_drvdata(pdev, hdev);
472 
473 	pci_enable_pcie_error_reporting(pdev);
474 
475 	rc = hl_device_init(hdev, hl_class);
476 	if (rc) {
477 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
478 		rc = -ENODEV;
479 		goto disable_device;
480 	}
481 
482 	return 0;
483 
484 disable_device:
485 	pci_disable_pcie_error_reporting(pdev);
486 	pci_set_drvdata(pdev, NULL);
487 	destroy_hdev(hdev);
488 
489 	return rc;
490 }
491 
492 /*
493  * hl_pci_remove - remove PCI habanalabs devices
494  *
495  * @pdev: pointer to pci device
496  *
497  * Standard PCI remove function for habanalabs device
498  */
hl_pci_remove(struct pci_dev * pdev)499 static void hl_pci_remove(struct pci_dev *pdev)
500 {
501 	struct hl_device *hdev;
502 
503 	hdev = pci_get_drvdata(pdev);
504 	if (!hdev)
505 		return;
506 
507 	hl_device_fini(hdev);
508 	pci_disable_pcie_error_reporting(pdev);
509 	pci_set_drvdata(pdev, NULL);
510 	destroy_hdev(hdev);
511 }
512 
513 /**
514  * hl_pci_err_detected - a PCI bus error detected on this device
515  *
516  * @pdev: pointer to pci device
517  * @state: PCI error type
518  *
519  * Called by the PCI subsystem whenever a non-correctable
520  * PCI bus error is detected
521  */
522 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)523 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
524 {
525 	struct hl_device *hdev = pci_get_drvdata(pdev);
526 	enum pci_ers_result result;
527 
528 	switch (state) {
529 	case pci_channel_io_normal:
530 		return PCI_ERS_RESULT_CAN_RECOVER;
531 
532 	case pci_channel_io_frozen:
533 		dev_warn(hdev->dev, "frozen state error detected\n");
534 		result = PCI_ERS_RESULT_NEED_RESET;
535 		break;
536 
537 	case pci_channel_io_perm_failure:
538 		dev_warn(hdev->dev, "failure state error detected\n");
539 		result = PCI_ERS_RESULT_DISCONNECT;
540 		break;
541 
542 	default:
543 		result = PCI_ERS_RESULT_NONE;
544 	}
545 
546 	hdev->asic_funcs->halt_engines(hdev, true, false);
547 
548 	return result;
549 }
550 
551 /**
552  * hl_pci_err_resume - resume after a PCI slot reset
553  *
554  * @pdev: pointer to pci device
555  *
556  */
hl_pci_err_resume(struct pci_dev * pdev)557 static void hl_pci_err_resume(struct pci_dev *pdev)
558 {
559 	struct hl_device *hdev = pci_get_drvdata(pdev);
560 
561 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
562 	hl_device_resume(hdev);
563 }
564 
565 /**
566  * hl_pci_err_slot_reset - a PCI slot reset has just happened
567  *
568  * @pdev: pointer to pci device
569  *
570  * Determine if the driver can recover from the PCI slot reset
571  */
hl_pci_err_slot_reset(struct pci_dev * pdev)572 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
573 {
574 	return PCI_ERS_RESULT_RECOVERED;
575 }
576 
577 static const struct dev_pm_ops hl_pm_ops = {
578 	.suspend = hl_pmops_suspend,
579 	.resume = hl_pmops_resume,
580 };
581 
582 static const struct pci_error_handlers hl_pci_err_handler = {
583 	.error_detected = hl_pci_err_detected,
584 	.slot_reset = hl_pci_err_slot_reset,
585 	.resume = hl_pci_err_resume,
586 };
587 
588 static struct pci_driver hl_pci_driver = {
589 	.name = HL_NAME,
590 	.id_table = ids,
591 	.probe = hl_pci_probe,
592 	.remove = hl_pci_remove,
593 	.shutdown = hl_pci_remove,
594 	.driver = {
595 		.name = HL_NAME,
596 		.pm = &hl_pm_ops,
597 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
598 	},
599 	.err_handler = &hl_pci_err_handler,
600 };
601 
602 /*
603  * hl_init - Initialize the habanalabs kernel driver
604  */
hl_init(void)605 static int __init hl_init(void)
606 {
607 	int rc;
608 	dev_t dev;
609 
610 	pr_info("loading driver\n");
611 
612 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
613 	if (rc < 0) {
614 		pr_err("unable to get major\n");
615 		return rc;
616 	}
617 
618 	hl_major = MAJOR(dev);
619 
620 	hl_class = class_create(THIS_MODULE, HL_NAME);
621 	if (IS_ERR(hl_class)) {
622 		pr_err("failed to allocate class\n");
623 		rc = PTR_ERR(hl_class);
624 		goto remove_major;
625 	}
626 
627 	hl_debugfs_init();
628 
629 	rc = pci_register_driver(&hl_pci_driver);
630 	if (rc) {
631 		pr_err("failed to register pci device\n");
632 		goto remove_debugfs;
633 	}
634 
635 	pr_debug("driver loaded\n");
636 
637 	return 0;
638 
639 remove_debugfs:
640 	hl_debugfs_fini();
641 	class_destroy(hl_class);
642 remove_major:
643 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
644 	return rc;
645 }
646 
647 /*
648  * hl_exit - Release all resources of the habanalabs kernel driver
649  */
hl_exit(void)650 static void __exit hl_exit(void)
651 {
652 	pci_unregister_driver(&hl_pci_driver);
653 
654 	/*
655 	 * Removing debugfs must be after all devices or simulator devices
656 	 * have been removed because otherwise we get a bug in the
657 	 * debugfs module for referencing NULL objects
658 	 */
659 	hl_debugfs_fini();
660 
661 	class_destroy(hl_class);
662 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
663 
664 	idr_destroy(&hl_devs_idr);
665 
666 	pr_debug("driver removed\n");
667 }
668 
/* Register the module's load and unload entry points */
module_init(hl_init);
module_exit(hl_exit);
671