diff options
author | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2020-07-24 20:22:25 +0200 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2020-07-24 20:22:25 +0200 |
commit | 860e73b49cd933c708e3e1e1e07cdea81b6acd1c (patch) | |
tree | d79deee0e36096f56cadcb9abb0be9864bac9e7c | |
parent | 54918b8ed1e52ae5f2a521bdf014e4fba797c920 (diff) | |
parent | 94f8be9eb065412cf069efd45053d33e8911fa9e (diff) |
Merge tag 'misc-habanalabs-next-2020-07-24' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next
Oded writes:
This tag contains the following changes for kernel 5.9-rc1:
- Remove rate limiters from GAUDI configuration (no longer needed).
- Set maximum amount of in-flight CS per ASIC type and increase
the maximum amount for GAUDI.
- Refactor signal/wait command submissions code
- Calculate trace frequency from PLLs to show accurate profiling data
- Rephrase error messages to make them more clear to the common user
- Add statistics of dropped CS (counter per possible reason for drop)
- Get ECC information from firmware
- Remove support for partial SoC reset in Gaudi
- Halt device CPU only when reset is certain to happen. Sometimes we abort
the reset procedure and in that case we can't leave device CPU in halt
mode.
- set each CQ to its own work queue to prevent a race between completions
on different CQs.
- Use queue pi/ci in order to determine queue occupancy. This is done to
make the code reusable between current and future ASICs.
- Add more validations for user inputs.
- Refactor PCIe controller configuration to make the code reusable between
current and future ASICs.
- Update firmware interface headers to latest version
- Move all common code to a dedicated common sub-folder
* tag 'misc-habanalabs-next-2020-07-24' of git://people.freedesktop.org/~gabbayo/linux: (28 commits)
habanalabs: Fix memory leak in error flow of context initialization
habanalabs: use no flags on MMU cache invalidation
habanalabs: enable device before hw_init()
habanalabs: create internal CB pool
habanalabs: update hl_boot_if.h from firmware
habanalabs: create common folder
habanalabs: check for DMA errors when clearing memory
habanalabs: verify queue can contain all cs jobs
habanalabs: Assign each CQ with its own work queue
habanalabs: halt device CPU only upon certain reset
habanalabs: remove unused hash
habanalabs: use queue pi/ci in order to determine queue occupancy
habanalabs: configure maximum queues per asic
habanalabs: remove soft-reset support from GAUDI
habanalabs: PCIe iATU refactoring
habanalabs: Extract ECC information from FW
habanalabs: Add dropped cs statistics info struct
habanalabs: extract cpu boot status lookup
habanalabs: rephrase error messages
habanalabs: Increase queues depth
...
-rw-r--r-- | drivers/misc/habanalabs/Makefile | 11 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/Makefile | 9 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/asid.c (renamed from drivers/misc/habanalabs/asid.c) | 0 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/command_buffer.c (renamed from drivers/misc/habanalabs/command_buffer.c) | 82 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/command_submission.c (renamed from drivers/misc/habanalabs/command_submission.c) | 97 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/context.c (renamed from drivers/misc/habanalabs/context.c) | 39 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/debugfs.c (renamed from drivers/misc/habanalabs/debugfs.c) | 0 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/device.c (renamed from drivers/misc/habanalabs/device.c) | 88 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/firmware_if.c (renamed from drivers/misc/habanalabs/firmware_if.c) | 101 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h (renamed from drivers/misc/habanalabs/habanalabs.h) | 172 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_drv.c (renamed from drivers/misc/habanalabs/habanalabs_drv.c) | 1 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_ioctl.c (renamed from drivers/misc/habanalabs/habanalabs_ioctl.c) | 24 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/hw_queue.c (renamed from drivers/misc/habanalabs/hw_queue.c) | 165 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/hwmon.c (renamed from drivers/misc/habanalabs/hwmon.c) | 0 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/irq.c (renamed from drivers/misc/habanalabs/irq.c) | 13 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/memory.c (renamed from drivers/misc/habanalabs/memory.c) | 3 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/mmu.c (renamed from drivers/misc/habanalabs/mmu.c) | 1 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/pci.c (renamed from drivers/misc/habanalabs/pci.c) | 136 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/sysfs.c (renamed from drivers/misc/habanalabs/sysfs.c) | 3 | ||||
-rw-r--r-- | drivers/misc/habanalabs/gaudi/Makefile | 2 | ||||
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi.c | 894 | ||||
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudiP.h | 16 | ||||
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi_coresight.c | 6 | ||||
-rw-r--r-- | drivers/misc/habanalabs/goya/goya.c | 172 | ||||
-rw-r--r-- | drivers/misc/habanalabs/goya/goyaP.h | 14 | ||||
-rw-r--r-- | drivers/misc/habanalabs/goya/goya_coresight.c | 6 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/common/armcp_if.h (renamed from drivers/misc/habanalabs/include/armcp_if.h) | 14 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/common/hl_boot_if.h (renamed from drivers/misc/habanalabs/include/hl_boot_if.h) | 14 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/common/qman_if.h (renamed from drivers/misc/habanalabs/include/qman_if.h) | 0 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h | 21 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h | 114 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/gaudi/gaudi_masks.h | 3 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/gaudi/gaudi_packets.h | 4 | ||||
-rw-r--r-- | include/uapi/misc/habanalabs.h | 27 |
34 files changed, 1260 insertions, 992 deletions
diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile index 421ebd903069..a786c0a7de9a 100644 --- a/drivers/misc/habanalabs/Makefile +++ b/drivers/misc/habanalabs/Makefile @@ -3,16 +3,15 @@ # Makefile for HabanaLabs AI accelerators driver # -obj-m := habanalabs.o +obj-$(CONFIG_HABANA_AI) := habanalabs.o -habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \ - command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \ - command_submission.o mmu.o firmware_if.o pci.o - -habanalabs-$(CONFIG_DEBUG_FS) += debugfs.o +include $(src)/common/Makefile +habanalabs-y += $(HL_COMMON_FILES) include $(src)/goya/Makefile habanalabs-y += $(HL_GOYA_FILES) include $(src)/gaudi/Makefile habanalabs-y += $(HL_GAUDI_FILES) + +habanalabs-$(CONFIG_DEBUG_FS) += common/debugfs.o diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile new file mode 100644 index 000000000000..97d03b5c8683 --- /dev/null +++ b/drivers/misc/habanalabs/common/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +subdir-ccflags-y += -I$(src)/common + +HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \ + common/asid.o common/habanalabs_ioctl.o \ + common/command_buffer.o common/hw_queue.o common/irq.o \ + common/sysfs.o common/hwmon.o common/memory.o \ + common/command_submission.o common/mmu.o common/firmware_if.o \ + common/pci.o diff --git a/drivers/misc/habanalabs/asid.c b/drivers/misc/habanalabs/common/asid.c index a2fdf31cf27c..a2fdf31cf27c 100644 --- a/drivers/misc/habanalabs/asid.c +++ b/drivers/misc/habanalabs/common/asid.c diff --git a/drivers/misc/habanalabs/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index 02d13f71b1df..7c38c4f7f9c0 100644 --- a/drivers/misc/habanalabs/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -10,12 +10,18 @@ #include <linux/mm.h> #include <linux/slab.h> +#include <linux/genalloc.h> static void cb_fini(struct hl_device *hdev, struct hl_cb *cb) { - hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size, - (void *) (uintptr_t) cb->kernel_address, - cb->bus_address); + if (cb->is_internal) + gen_pool_free(hdev->internal_cb_pool, + cb->kernel_address, cb->size); + else + hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size, + (void *) (uintptr_t) cb->kernel_address, + cb->bus_address); + kfree(cb); } @@ -44,9 +50,10 @@ static void cb_release(struct kref *ref) } static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, - int ctx_id) + int ctx_id, bool internal_cb) { struct hl_cb *cb; + u32 cb_offset; void *p; /* @@ -65,13 +72,25 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, if (!cb) return NULL; - if (ctx_id == HL_KERNEL_ASID_ID) + if (internal_cb) { + p = (void *) gen_pool_alloc(hdev->internal_cb_pool, cb_size); + if (!p) { + kfree(cb); + return NULL; + } + + cb_offset = p - hdev->internal_cb_pool_virt_addr; + cb->is_internal = true; + cb->bus_address = hdev->internal_cb_va_base + cb_offset; + } else if (ctx_id == HL_KERNEL_ASID_ID) { p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, GFP_ATOMIC); - else + } else { p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, GFP_USER | __GFP_ZERO); + } + if (!p) { dev_err(hdev->dev, "failed to allocate %d of dma memory for CB\n", @@ -87,7 +106,7 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, } int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, - u32 cb_size, u64 *handle, int ctx_id) + u32 cb_size, u64 *handle, int ctx_id, bool internal_cb) { struct hl_cb *cb; bool alloc_new_cb = true; @@ -112,28 +131,30 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, goto out_err; } - /* Minimum allocation must be PAGE SIZE */ - if (cb_size < PAGE_SIZE) - cb_size = PAGE_SIZE; - - if (ctx_id == HL_KERNEL_ASID_ID && - cb_size <= hdev->asic_prop.cb_pool_cb_size) { - - spin_lock(&hdev->cb_pool_lock); - if (!list_empty(&hdev->cb_pool)) { - cb = list_first_entry(&hdev->cb_pool, typeof(*cb), - pool_list); - list_del(&cb->pool_list); - spin_unlock(&hdev->cb_pool_lock); - alloc_new_cb = false; - } else { - spin_unlock(&hdev->cb_pool_lock); - dev_dbg(hdev->dev, "CB pool is empty\n"); + if (!internal_cb) { + /* Minimum allocation must be PAGE SIZE */ + if (cb_size < PAGE_SIZE) + cb_size = PAGE_SIZE; + + if (ctx_id == HL_KERNEL_ASID_ID && + cb_size <= hdev->asic_prop.cb_pool_cb_size) { + + spin_lock(&hdev->cb_pool_lock); + if (!list_empty(&hdev->cb_pool)) { + cb = list_first_entry(&hdev->cb_pool, + typeof(*cb), pool_list); + list_del(&cb->pool_list); + spin_unlock(&hdev->cb_pool_lock); + alloc_new_cb = false; + } else { + spin_unlock(&hdev->cb_pool_lock); + dev_dbg(hdev->dev, "CB pool is empty\n"); + } } } if (alloc_new_cb) { - cb = hl_cb_alloc(hdev, cb_size, ctx_id); + cb = hl_cb_alloc(hdev, cb_size, ctx_id, internal_cb); if (!cb) { rc = -ENOMEM; goto out_err; @@ -229,8 +250,8 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) rc = -EINVAL; } else { rc = hl_cb_create(hdev, &hpriv->cb_mgr, - args->in.cb_size, &handle, - hpriv->ctx->asid); + args->in.cb_size, &handle, + hpriv->ctx->asid, false); } memset(args, 0, sizeof(*args)); @@ -398,14 +419,15 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr) idr_destroy(&mgr->cb_handles); } -struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size) +struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size, + bool internal_cb) { u64 cb_handle; struct hl_cb *cb; int rc; rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle, - HL_KERNEL_ASID_ID); + HL_KERNEL_ASID_ID, internal_cb); if (rc) { dev_err(hdev->dev, "Failed to allocate CB for the kernel driver %d\n", rc); @@ -437,7 +459,7 @@ int hl_cb_pool_init(struct hl_device *hdev) for (i = 0 ; i < hdev->asic_prop.cb_pool_cb_cnt ; i++) { cb = hl_cb_alloc(hdev, hdev->asic_prop.cb_pool_cb_size, - HL_KERNEL_ASID_ID); + HL_KERNEL_ASID_ID, false); if (cb) { cb->is_pool = true; list_add(&cb->pool_list, &hdev->cb_pool); diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index b0f62cbbdc87..e096532c0e48 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -246,6 +246,18 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job) kfree(job); } +static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx) +{ + hdev->aggregated_cs_counters.device_in_reset_drop_cnt += + ctx->cs_counters.device_in_reset_drop_cnt; + hdev->aggregated_cs_counters.out_of_mem_drop_cnt += + ctx->cs_counters.out_of_mem_drop_cnt; + hdev->aggregated_cs_counters.parsing_drop_cnt += + ctx->cs_counters.parsing_drop_cnt; + hdev->aggregated_cs_counters.queue_full_drop_cnt += + ctx->cs_counters.queue_full_drop_cnt; +} + static void cs_do_release(struct kref *ref) { struct hl_cs *cs = container_of(ref, struct hl_cs, @@ -349,6 +361,9 @@ static void cs_do_release(struct kref *ref) dma_fence_signal(cs->fence); dma_fence_put(cs->fence); + cs_counters_aggregate(hdev, cs->ctx); + + kfree(cs->jobs_in_queue_cnt); kfree(cs); } @@ -373,9 +388,9 @@ static void cs_timedout(struct work_struct *work) hdev = cs->ctx->hdev; ctx_asid = cs->ctx->asid; - /* TODO: add information about last signaled seq and last emitted seq */ - dev_err(hdev->dev, "User %d command submission %llu got stuck!\n", - ctx_asid, cs->sequence); + dev_err(hdev->dev, + "Command submission %llu has not finished in time!\n", + cs->sequence); cs_put(cs); @@ -418,21 +433,29 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, spin_lock(&ctx->cs_lock); cs_cmpl->cs_seq = ctx->cs_sequence; - other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)]; + other = ctx->cs_pending[cs_cmpl->cs_seq & + (hdev->asic_prop.max_pending_cs - 1)]; if ((other) && (!dma_fence_is_signaled(other))) { - spin_unlock(&ctx->cs_lock); dev_dbg(hdev->dev, "Rejecting CS because of too many in-flights CS\n"); rc = -EAGAIN; goto free_fence; } + cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues, + sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC); + if (!cs->jobs_in_queue_cnt) { + rc = -ENOMEM; + goto free_fence; + } + dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock, ctx->asid, ctx->cs_sequence); cs->sequence = cs_cmpl->cs_seq; - ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] = + ctx->cs_pending[cs_cmpl->cs_seq & + (hdev->asic_prop.max_pending_cs - 1)] = &cs_cmpl->base_fence; ctx->cs_sequence++; @@ -447,6 +470,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, return 0; free_fence: + spin_unlock(&ctx->cs_lock); kfree(cs_cmpl); free_cs: kfree(cs); @@ -463,10 +487,12 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs) void hl_cs_rollback_all(struct hl_device *hdev) { + int i; struct hl_cs *cs, *tmp; /* flush all completions */ - flush_workqueue(hdev->cq_wq); + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + flush_workqueue(hdev->cq_wq[i]); /* Make sure we don't have leftovers in the H/W queues mirror list */ list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list, @@ -499,10 +525,18 @@ static int validate_queue_index(struct hl_device *hdev, struct asic_fixed_properties *asic = &hdev->asic_prop; struct hw_queue_properties *hw_queue_prop; + /* This must be checked here to prevent out-of-bounds access to + * hw_queues_props array + */ + if (chunk->queue_index >= asic->max_queues) { + dev_err(hdev->dev, "Queue index %d is invalid\n", + chunk->queue_index); + return -EINVAL; + } + hw_queue_prop = &asic->hw_queues_props[chunk->queue_index]; - if ((chunk->queue_index >= HL_MAX_QUEUES) || - (hw_queue_prop->type == QUEUE_TYPE_NA)) { + if (hw_queue_prop->type == QUEUE_TYPE_NA) { dev_err(hdev->dev, "Queue index %d is invalid\n", chunk->queue_index); return -EINVAL; @@ -630,12 +664,15 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = validate_queue_index(hdev, chunk, &queue_type, &is_kernel_allocated_cb); - if (rc) + if (rc) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; goto free_cs_object; + } if (is_kernel_allocated_cb) { cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk); if (!cb) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; rc = -EINVAL; goto free_cs_object; } @@ -649,6 +686,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, job = hl_cs_allocate_job(hdev, queue_type, is_kernel_allocated_cb); if (!job) { + hpriv->ctx->cs_counters.out_of_mem_drop_cnt++; dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; if (is_kernel_allocated_cb) @@ -681,6 +719,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = cs_parser(hpriv, job); if (rc) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; dev_err(hdev->dev, "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n", cs->ctx->asid, cs->sequence, job->id, rc); @@ -689,6 +728,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, } if (int_queues_only) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; dev_err(hdev->dev, "Reject CS %d.%llu because only internal queues jobs are present\n", cs->ctx->asid, cs->sequence); @@ -738,6 +778,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, struct hl_cs_job *job; struct hl_cs *cs; struct hl_cb *cb; + enum hl_queue_type q_type; u64 *signal_seq_arr = NULL, signal_seq; u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size; int rc; @@ -770,9 +811,10 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, chunk = &cs_chunk_array[0]; q_idx = chunk->queue_index; hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx]; + q_type = hw_queue_prop->type; - if ((q_idx >= HL_MAX_QUEUES) || - (hw_queue_prop->type != QUEUE_TYPE_EXT)) { + if ((q_idx >= hdev->asic_prop.max_queues) || + (!hw_queue_prop->supports_sync_stream)) { dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx); rc = -EINVAL; goto free_cs_chunk_array; @@ -869,25 +911,28 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, *cs_seq = cs->sequence; - job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true); + job = hl_cs_allocate_job(hdev, q_type, true); if (!job) { + ctx->cs_counters.out_of_mem_drop_cnt++; dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; goto put_cs; } - cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + if (cs->type == CS_TYPE_WAIT) + cb_size = hdev->asic_funcs->get_wait_cb_size(hdev); + else + cb_size = hdev->asic_funcs->get_signal_cb_size(hdev); + + cb = hl_cb_kernel_create(hdev, cb_size, + q_type == QUEUE_TYPE_HW && hdev->mmu_enable); if (!cb) { + ctx->cs_counters.out_of_mem_drop_cnt++; kfree(job); rc = -EFAULT; goto put_cs; } - if (cs->type == CS_TYPE_WAIT) - cb_size = hdev->asic_funcs->get_wait_cb_size(hdev); - else - cb_size = hdev->asic_funcs->get_signal_cb_size(hdev); - job->id = 0; job->cs = cs; job->user_cb = cb; @@ -1126,7 +1171,7 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev, rc = PTR_ERR(fence); if (rc == -EINVAL) dev_notice_ratelimited(hdev->dev, - "Can't wait on seq %llu because current CS is at seq %llu\n", + "Can't wait on CS %llu because current CS is at seq %llu\n", seq, ctx->cs_sequence); } else if (fence) { rc = dma_fence_wait_timeout(fence, true, timeout); @@ -1159,15 +1204,21 @@ int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) memset(args, 0, sizeof(*args)); if (rc < 0) { - dev_err_ratelimited(hdev->dev, - "Error %ld on waiting for CS handle %llu\n", - rc, seq); if (rc == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for CS handle %llu\n", + seq); args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; rc = -EINTR; } else if (rc == -ETIMEDOUT) { + dev_err_ratelimited(hdev->dev, + "CS %llu has timed-out while user process is waiting for it\n", + seq); args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT; } else if (rc == -EIO) { + dev_err_ratelimited(hdev->dev, + "CS %llu has been aborted while user process is waiting for it\n", + seq); args->out.status = HL_WAIT_CS_STATUS_ABORTED; } return rc; diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/common/context.c index ec92b3506b1f..3e375958e73b 100644 --- a/drivers/misc/habanalabs/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx) * to this function unless the ref count is 0 */ - for (i = 0 ; i < HL_MAX_PENDING_CS ; i++) + for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++) dma_fence_put(ctx->cs_pending[i]); + kfree(ctx->cs_pending); + if (ctx->asid != HL_KERNEL_ASID_ID) { /* The engines are stopped as there is no executing CS, but the * Coresight might be still working by accessing addresses @@ -110,8 +112,7 @@ void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx) return; dev_warn(hdev->dev, - "Context %d closed or terminated but its CS are executing\n", - ctx->asid); + "user process released device but its command submissions are still executing\n"); } int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) @@ -126,34 +127,49 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) spin_lock_init(&ctx->cs_lock); atomic_set(&ctx->thread_ctx_switch_token, 1); ctx->thread_ctx_switch_wait_token = 0; + ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs, + sizeof(struct dma_fence *), + GFP_KERNEL); + if (!ctx->cs_pending) + return -ENOMEM; if (is_kernel_ctx) { ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */ rc = hl_mmu_ctx_init(ctx); if (rc) { dev_err(hdev->dev, "Failed to init mmu ctx module\n"); - goto mem_ctx_err; + goto err_free_cs_pending; } } else { ctx->asid = hl_asid_alloc(hdev); if (!ctx->asid) { dev_err(hdev->dev, "No free ASID, failed to create context\n"); - return -ENOMEM; + rc = -ENOMEM; + goto err_free_cs_pending; } rc = hl_vm_ctx_init(ctx); if (rc) { dev_err(hdev->dev, "Failed to init mem ctx module\n"); rc = -ENOMEM; - goto mem_ctx_err; + goto err_asid_free; + } + + rc = hdev->asic_funcs->ctx_init(ctx); + if (rc) { + dev_err(hdev->dev, "ctx_init failed\n"); + goto err_vm_ctx_fini; } } return 0; -mem_ctx_err: - if (ctx->asid != HL_KERNEL_ASID_ID) - hl_asid_free(hdev, ctx->asid); +err_vm_ctx_fini: + hl_vm_ctx_fini(ctx); +err_asid_free: + hl_asid_free(hdev, ctx->asid); +err_free_cs_pending: + kfree(ctx->cs_pending); return rc; } @@ -170,6 +186,7 @@ int hl_ctx_put(struct hl_ctx *ctx) struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) { + struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop; struct dma_fence *fence; spin_lock(&ctx->cs_lock); @@ -179,13 +196,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) return ERR_PTR(-EINVAL); } - if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) { + if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) { spin_unlock(&ctx->cs_lock); return NULL; } fence = dma_fence_get( - ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]); + ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]); spin_unlock(&ctx->cs_lock); return fence; diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index fc4372c18ce2..fc4372c18ce2 100644 --- a/drivers/misc/habanalabs/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/common/device.c index 2b38a119704c..9919ff121067 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -249,7 +249,8 @@ static void device_cdev_sysfs_del(struct hl_device *hdev) */ static int device_early_init(struct hl_device *hdev) { - int rc; + int i, rc; + char workq_name[32]; switch (hdev->asic_type) { case ASIC_GOYA: @@ -274,11 +275,24 @@ static int device_early_init(struct hl_device *hdev) if (rc) goto early_fini; - hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0); - if (hdev->cq_wq == NULL) { - dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); - rc = -ENOMEM; - goto asid_fini; + if (hdev->asic_prop.completion_queues_count) { + hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count, + sizeof(*hdev->cq_wq), + GFP_ATOMIC); + if (!hdev->cq_wq) { + rc = -ENOMEM; + goto asid_fini; + } + } + + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) { + snprintf(workq_name, 32, "hl-free-jobs-%u", i); + hdev->cq_wq[i] = create_singlethread_workqueue(workq_name); + if (hdev->cq_wq == NULL) { + dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); + rc = -ENOMEM; + goto free_cq_wq; + } } hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0); @@ -321,7 +335,10 @@ free_chip_info: free_eq_wq: destroy_workqueue(hdev->eq_wq); free_cq_wq: - destroy_workqueue(hdev->cq_wq); + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + if (hdev->cq_wq[i]) + destroy_workqueue(hdev->cq_wq[i]); + kfree(hdev->cq_wq); asid_fini: hl_asid_fini(hdev); early_fini: @@ -339,6 +356,8 @@ early_fini: */ static void device_early_fini(struct hl_device *hdev) { + int i; + mutex_destroy(&hdev->mmu_cache_lock); mutex_destroy(&hdev->debug_lock); mutex_destroy(&hdev->send_cpu_message_lock); @@ -351,7 +370,10 @@ static void device_early_fini(struct hl_device *hdev) kfree(hdev->hl_chip_info); destroy_workqueue(hdev->eq_wq); - destroy_workqueue(hdev->cq_wq); + + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + destroy_workqueue(hdev->cq_wq[i]); + kfree(hdev->cq_wq); hl_asid_fini(hdev); @@ -838,6 +860,22 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, if (rc) return 0; + if (hard_reset) { + /* Disable PCI access from device F/W so he won't send + * us additional interrupts. We disable MSI/MSI-X at + * the halt_engines function and we can't have the F/W + * sending us interrupts after that. We need to disable + * the access here because if the device is marked + * disable, the message won't be send. Also, in case + * of heartbeat, the device CPU is marked as disable + * so this message won't be sent + */ + if (hl_fw_send_pci_access_msg(hdev, + ARMCP_PACKET_DISABLE_PCI_ACCESS)) + dev_warn(hdev->dev, + "Failed to disable PCI access by F/W\n"); + } + /* This also blocks future CS/VM/JOB completion operations */ hdev->disabled = true; @@ -995,6 +1033,12 @@ again: } } + /* Device is now enabled as part of the initialization requires + * communication with the device firmware to get information that + * is required for the initialization itself + */ + hdev->disabled = false; + rc = hdev->asic_funcs->hw_init(hdev); if (rc) { dev_err(hdev->dev, @@ -1002,8 +1046,6 @@ again: goto out_err; } - hdev->disabled = false; - /* Check that the communication with the device is working */ rc = hdev->asic_funcs->test_queues(hdev); if (rc) { @@ -1144,14 +1186,17 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) * because there the addresses of the completion queues are being * passed as arguments to request_irq */ - hdev->completion_queue = kcalloc(cq_cnt, - sizeof(*hdev->completion_queue), - GFP_KERNEL); + if (cq_cnt) { + hdev->completion_queue = kcalloc(cq_cnt, + sizeof(*hdev->completion_queue), + GFP_KERNEL); - if (!hdev->completion_queue) { - dev_err(hdev->dev, "failed to allocate completion queues\n"); - rc = -ENOMEM; - goto hw_queues_destroy; + if (!hdev->completion_queue) { + dev_err(hdev->dev, + "failed to allocate completion queues\n"); + rc = -ENOMEM; + goto hw_queues_destroy; + } } for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) { @@ -1162,6 +1207,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) "failed to initialize completion queue\n"); goto cq_fini; } + hdev->completion_queue[i].cq_idx = i; } /* @@ -1219,6 +1265,12 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) */ add_cdev_sysfs_on_err = true; + /* Device is now enabled as part of the initialization requires + * communication with the device firmware to get information that + * is required for the initialization itself + */ + hdev->disabled = false; + rc = hdev->asic_funcs->hw_init(hdev); if (rc) { dev_err(hdev->dev, "failed to initialize the H/W\n"); @@ -1226,8 +1278,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) goto out_disabled; } - hdev->disabled = false; - /* Check that the communication with the device is working */ rc = hdev->asic_funcs->test_queues(hdev); if (rc) { diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 15e0793da655..b2b84510b932 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -6,7 +6,7 @@ */ #include "habanalabs.h" -#include "include/hl_boot_if.h" +#include "include/common/hl_boot_if.h" #include <linux/firmware.h> #include <linux/genalloc.h> @@ -289,7 +289,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev) HL_ARMCP_INFO_TIMEOUT_USEC, &result); if (rc) { dev_err(hdev->dev, - "Failed to send ArmCP info pkt, error %d\n", rc); + "Failed to handle ArmCP info pkt, error %d\n", rc); goto out; } @@ -340,7 +340,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size) if (rc) { dev_err(hdev->dev, - "Failed to send ArmCP EEPROM packet, error %d\n", rc); + "Failed to handle ArmCP EEPROM packet, error %d\n", rc); goto out; } @@ -393,6 +393,53 @@ static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg) "Device boot error - NIC F/W initialization failed\n"); } +static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status) +{ + switch (status) { + case CPU_BOOT_STATUS_NA: + dev_err(hdev->dev, + "Device boot error - BTL did NOT run\n"); + break; + case CPU_BOOT_STATUS_IN_WFE: + dev_err(hdev->dev, + "Device boot error - Stuck inside WFE loop\n"); + break; + case CPU_BOOT_STATUS_IN_BTL: + dev_err(hdev->dev, + "Device boot error - Stuck in BTL\n"); + break; + case CPU_BOOT_STATUS_IN_PREBOOT: + dev_err(hdev->dev, + "Device boot error - Stuck in Preboot\n"); + break; + case CPU_BOOT_STATUS_IN_SPL: + dev_err(hdev->dev, + "Device boot error - Stuck in SPL\n"); + break; + case CPU_BOOT_STATUS_IN_UBOOT: + dev_err(hdev->dev, + "Device boot error - Stuck in u-boot\n"); + break; + case CPU_BOOT_STATUS_DRAM_INIT_FAIL: + dev_err(hdev->dev, + "Device boot error - DRAM initialization failed\n"); + break; + case CPU_BOOT_STATUS_UBOOT_NOT_READY: + dev_err(hdev->dev, + "Device boot error - u-boot stopped by user\n"); + break; + case CPU_BOOT_STATUS_TS_INIT_FAIL: + dev_err(hdev->dev, + "Device boot error - Thermal Sensor initialization failed\n"); + break; + default: + dev_err(hdev->dev, + "Device boot error - Invalid status code %d\n", + status); + break; + } +} + int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, u32 msg_to_cpu_reg, u32 cpu_msg_status_reg, u32 boot_err0_reg, bool skip_bmc, @@ -466,50 +513,7 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, * versions but we keep them here for backward compatibility */ if (rc) { - switch (status) { - case CPU_BOOT_STATUS_NA: - dev_err(hdev->dev, - "Device boot error - BTL did NOT run\n"); - break; - case CPU_BOOT_STATUS_IN_WFE: - dev_err(hdev->dev, - "Device boot error - Stuck inside WFE loop\n"); - break; - case CPU_BOOT_STATUS_IN_BTL: - dev_err(hdev->dev, - "Device boot error - Stuck in BTL\n"); - break; - case CPU_BOOT_STATUS_IN_PREBOOT: - dev_err(hdev->dev, - "Device boot error - Stuck in Preboot\n"); - break; - case CPU_BOOT_STATUS_IN_SPL: - dev_err(hdev->dev, - "Device boot error - Stuck in SPL\n"); - break; - case CPU_BOOT_STATUS_IN_UBOOT: - dev_err(hdev->dev, - "Device boot error - Stuck in u-boot\n"); - break; - case CPU_BOOT_STATUS_DRAM_INIT_FAIL: - dev_err(hdev->dev, - "Device boot error - DRAM initialization failed\n"); - break; - case CPU_BOOT_STATUS_UBOOT_NOT_READY: - dev_err(hdev->dev, - "Device boot error - u-boot stopped by user\n"); - break; - case CPU_BOOT_STATUS_TS_INIT_FAIL: - dev_err(hdev->dev, - "Device boot error - Thermal Sensor initialization failed\n"); - break; - default: - dev_err(hdev->dev, - "Device boot error - Invalid status code %d\n", - status); - break; - } - + hl_detect_cpu_boot_status(hdev, status); rc = -EIO; goto out; } @@ -569,7 +573,8 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, "Device reports FIT image is corrupted\n"); else dev_err(hdev->dev, - "Device failed to load, %d\n", status); + "Failed to load firmware to device, %d\n", + status); rc = -EIO; goto out; diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 1ecdcf8b763a..bf9abfa47b7a 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -8,8 +8,9 @@ #ifndef HABANALABSP_H_ #define HABANALABSP_H_ -#include "include/armcp_if.h" -#include "include/qman_if.h" +#include "include/common/armcp_if.h" +#include "include/common/qman_if.h" +#include <uapi/misc/habanalabs.h> #include <linux/cdev.h> #include <linux/iopoll.h> @@ -40,11 +41,6 @@ #define HL_SIM_MAX_TIMEOUT_US 10000000 /* 10s */ -#define HL_MAX_QUEUES 128 - -/* MUST BE POWER OF 2 and larger than 1 */ -#define HL_MAX_PENDING_CS 64 - #define HL_IDLE_BUSY_TS_ARR_SIZE 4096 /* Memory */ @@ -53,6 +49,10 @@ /* MMU */ #define MMU_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ +/* + * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream + * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream + */ #define HL_RSVD_SOBS 4 #define HL_RSVD_MONS 2 @@ -61,6 +61,11 @@ #define HL_MAX_SOB_VAL (1 << 15) +#define IS_POWER_OF_2(n) (n != 0 && ((n & (n - 1)) == 0)) +#define IS_MAX_PENDING_CS_VALID(n) (IS_POWER_OF_2(n) && (n > 1)) + +#define HL_PCI_NUM_BARS 6 + /** * struct pgt_info - MMU hop page info. * @node: hash linked-list node for the pgts shadow hash of pgts. @@ -86,6 +91,16 @@ struct hl_device; struct hl_fpriv; /** + * enum hl_pci_match_mode - pci match mode per region + * @PCI_ADDRESS_MATCH_MODE: address match mode + * @PCI_BAR_MATCH_MODE: bar match mode + */ +enum hl_pci_match_mode { + PCI_ADDRESS_MATCH_MODE, + PCI_BAR_MATCH_MODE +}; + +/** * enum hl_fw_component - F/W components to read version through registers. * @FW_COMP_UBOOT: u-boot. * @FW_COMP_PREBOOT: preboot. @@ -121,6 +136,32 @@ enum hl_cs_type { }; /* + * struct hl_inbound_pci_region - inbound region descriptor + * @mode: pci match mode for this region + * @addr: region target address + * @size: region size in bytes + * @offset_in_bar: offset within bar (address match mode) + * @bar: bar id + */ +struct hl_inbound_pci_region { + enum hl_pci_match_mode mode; + u64 addr; + u64 size; + u64 offset_in_bar; + u8 bar; +}; + +/* + * struct hl_outbound_pci_region - outbound region descriptor + * @addr: region target address + * @size: region size in bytes + */ +struct hl_outbound_pci_region { + u64 addr; + u64 size; +}; + +/* * struct hl_hw_sob - H/W SOB info. * @hdev: habanalabs device structure. * @kref: refcount of this SOB. The SOB will reset once the refcount is zero. @@ -141,11 +182,13 @@ struct hl_hw_sob { * false otherwise. * @requires_kernel_cb: true if a CB handle must be provided for jobs on this * queue, false otherwise (a CB address must be provided). + * @supports_sync_stream: True if queue supports sync stream */ struct hw_queue_properties { enum hl_queue_type type; u8 driver_only; u8 requires_kernel_cb; + u8 supports_sync_stream; }; /** @@ -241,14 +284,19 @@ struct hl_mmu_properties { * @psoc_pci_pll_nf: PCI PLL NF value. * @psoc_pci_pll_od: PCI PLL OD value. * @psoc_pci_pll_div_factor: PCI PLL DIV FACTOR 1 value. + * @psoc_timestamp_frequency: frequency of the psoc timestamp clock. * @high_pll: high PLL frequency used by the device. * @cb_pool_cb_cnt: number of CBs in the CB pool. * @cb_pool_cb_size: size of each CB in the CB pool. + * @max_pending_cs: maximum of concurrent pending command submissions + * @max_queues: maximum amount of queues in the system + * @sync_stream_first_sob: first sync object available for sync stream use + * @sync_stream_first_mon: first monitor available for sync stream use * @tpc_enabled_mask: which TPCs are enabled. * @completion_queues_count: number of completion queues. */ struct asic_fixed_properties { - struct hw_queue_properties hw_queues_props[HL_MAX_QUEUES]; + struct hw_queue_properties *hw_queues_props; struct armcp_info armcp_info; char uboot_ver[VERSION_MAX_LEN]; char preboot_ver[VERSION_MAX_LEN]; @@ -282,9 +330,14 @@ struct asic_fixed_properties { u32 psoc_pci_pll_nf; u32 psoc_pci_pll_od; u32 psoc_pci_pll_div_factor; + u32 psoc_timestamp_frequency; u32 high_pll; u32 cb_pool_cb_cnt; u32 cb_pool_cb_size; + u32 max_pending_cs; + u32 max_queues; + u16 sync_stream_first_sob; + u16 sync_stream_first_mon; u8 tpc_enabled_mask; u8 completion_queues_count; }; @@ -339,6 +392,7 @@ struct hl_cb_mgr { * @ctx_id: holds the ID of the owner's context. * @mmap: true if the CB is currently mmaped to user. * @is_pool: true if CB was acquired from the pool, false otherwise. + * @is_internal: internaly allocated */ struct hl_cb { struct kref refcount; @@ -355,6 +409,7 @@ struct hl_cb { u32 ctx_id; u8 mmap; u8 is_pool; + u8 is_internal; }; @@ -364,38 +419,19 @@ struct hl_cb { struct hl_cs_job; -/* - * Currently, there are two limitations on the maximum length of a queue: - * - * 1. The memory footprint of the queue. The current allocated space for the - * queue is PAGE_SIZE. Because each entry in the queue is HL_BD_SIZE, - * the maximum length of the queue can be PAGE_SIZE / HL_BD_SIZE, - * which currently is 4096/16 = 256 entries. - * - * To increase that, we need either to decrease the size of the - * BD (difficult), or allocate more than a single page (easier). - * - * 2. Because the size of the JOB handle field in the BD CTL / completion queue - * is 10-bit, we can have up to 1024 open jobs per hardware queue. - * Therefore, each queue can hold up to 1024 entries. - * - * HL_QUEUE_LENGTH is in units of struct hl_bd. - * HL_QUEUE_LENGTH * sizeof(struct hl_bd) should be <= HL_PAGE_SIZE - */ - -#define HL_PAGE_SIZE 4096 /* minimum page size */ -/* Must be power of 2 (HL_PAGE_SIZE / HL_BD_SIZE) */ -#define HL_QUEUE_LENGTH 256 +/* Queue length of external and HW queues */ +#define HL_QUEUE_LENGTH 4096 #define HL_QUEUE_SIZE_IN_BYTES (HL_QUEUE_LENGTH * HL_BD_SIZE) -/* - * HL_CQ_LENGTH is in units of struct hl_cq_entry. - * HL_CQ_LENGTH should be <= HL_PAGE_SIZE - */ +#if (HL_MAX_JOBS_PER_CS > HL_QUEUE_LENGTH) +#error "HL_QUEUE_LENGTH must be greater than HL_MAX_JOBS_PER_CS" +#endif + +/* HL_CQ_LENGTH is in units of struct hl_cq_entry */ #define HL_CQ_LENGTH HL_QUEUE_LENGTH #define HL_CQ_SIZE_IN_BYTES (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE) -/* Must be power of 2 (HL_PAGE_SIZE / HL_EQ_ENTRY_SIZE) */ +/* Must be power of 2 */ #define HL_EQ_LENGTH 64 #define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE) @@ -422,6 +458,7 @@ struct hl_cs_job; * exist). * @curr_sob_offset: the id offset to the currently used SOB from the * HL_RSVD_SOBS that are being used by this queue. + * @supports_sync_stream: True if queue supports sync stream */ struct hl_hw_queue { struct hl_hw_sob hw_sob[HL_RSVD_SOBS]; @@ -430,7 +467,7 @@ struct hl_hw_queue { u64 kernel_address; dma_addr_t bus_address; u32 pi; - u32 ci; + atomic_t ci; u32 hw_queue_id; u32 cq_id; u32 msi_vec; @@ -440,6 +477,7 @@ struct hl_hw_queue { u16 base_mon_id; u8 valid; u8 curr_sob_offset; + u8 supports_sync_stream; }; /** @@ -447,6 +485,7 @@ struct hl_hw_queue { * @hdev: pointer to the device structure * @kernel_address: holds the queue's kernel virtual address * @bus_address: holds the queue's DMA address + * @cq_idx: completion queue index in array * @hw_queue_id: the id of the matching H/W queue * @ci: ci inside the queue * @pi: pi inside the queue @@ -456,6 +495,7 @@ struct hl_cq { struct hl_device *hdev; u64 kernel_address; dma_addr_t bus_address; + u32 cq_idx; u32 hw_queue_id; u32 ci; u32 pi; @@ -519,6 +559,15 @@ enum hl_pll_frequency { PLL_LAST }; +#define PLL_REF_CLK 50 + +enum div_select_defs { + DIV_SEL_REF_CLK = 0, + DIV_SEL_PLL_CLK = 1, + DIV_SEL_DIVIDED_REF = 2, + DIV_SEL_DIVIDED_PLL = 3, +}; + /** * struct hl_asic_funcs - ASIC specific functions that are can be called from * common code. @@ -596,14 +645,13 @@ enum hl_pll_frequency { * @rreg: Read a register. Needed for simulator support. * @wreg: Write a register. Needed for simulator support. * @halt_coresight: stop the ETF and ETR traces. + * @ctx_init: context dependent initialization. * @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz * @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index. * @read_device_fw_version: read the device's firmware versions that are * contained in registers * @load_firmware_to_device: load the firmware to the device's memory * @load_boot_fit_to_device: load boot fit to device's memory - * @ext_queue_init: Initialize the given external queue. - * @ext_queue_reset: Reset the given external queue. * @get_signal_cb_size: Get signal CB size. * @get_wait_cb_size: Get wait CB size. * @gen_signal_cb: Generate a signal CB. @@ -700,14 +748,13 @@ struct hl_asic_funcs { u32 (*rreg)(struct hl_device *hdev, u32 reg); void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); void (*halt_coresight)(struct hl_device *hdev); + int (*ctx_init)(struct hl_ctx *ctx); int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk); u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx); void (*read_device_fw_version)(struct hl_device *hdev, enum hl_fw_component fwc); int (*load_firmware_to_device)(struct hl_device *hdev); int (*load_boot_fit_to_device)(struct hl_device *hdev); - void (*ext_queue_init)(struct hl_device *hdev, u32 hw_queue_id); - void (*ext_queue_reset)(struct hl_device *hdev, u32 hw_queue_id); u32 (*get_signal_cb_size)(struct hl_device *hdev); u32 (*get_wait_cb_size)(struct hl_device *hdev); void (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id); @@ -743,7 +790,6 @@ struct hl_va_range { * struct hl_ctx - user/kernel context. * @mem_hash: holds mapping from virtual address to virtual memory area * descriptor (hl_vm_phys_pg_list or hl_userptr). - * @mmu_phys_hash: holds a mapping from physical address to pgt_info structure. * @mmu_shadow_hash: holds a mapping from shadow address to pgt_info structure. * @hpriv: pointer to the private (Kernel Driver) data of the process (fd). * @hdev: pointer to the device structure. @@ -777,18 +823,18 @@ struct hl_va_range { */ struct hl_ctx { DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS); - DECLARE_HASHTABLE(mmu_phys_hash, MMU_HASH_TABLE_BITS); DECLARE_HASHTABLE(mmu_shadow_hash, MMU_HASH_TABLE_BITS); struct hl_fpriv *hpriv; struct hl_device *hdev; struct kref refcount; - struct dma_fence *cs_pending[HL_MAX_PENDING_CS]; + struct dma_fence **cs_pending; struct hl_va_range *host_va_range; struct hl_va_range *host_huge_va_range; struct hl_va_range *dram_va_range; struct mutex mem_hash_lock; struct mutex mmu_lock; struct list_head debugfs_list; + struct hl_cs_counters cs_counters; u64 cs_sequence; u64 *dram_default_hops; spinlock_t cs_lock; @@ -863,7 +909,7 @@ struct hl_userptr { * @aborted: true if CS was aborted due to some device error. */ struct hl_cs { - u16 jobs_in_queue_cnt[HL_MAX_QUEUES]; + u16 *jobs_in_queue_cnt; struct hl_ctx *ctx; struct list_head job_list; spinlock_t job_lock; @@ -1347,7 +1393,9 @@ struct hl_device_idle_busy_ts { /** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. - * @pcie_bar: array of available PCIe bars. + * @pcie_bar_phys: array of available PCIe bars physical addresses. + * (required only for PCI address match mode) + * @pcie_bar: array of available PCIe bars virtual addresses. * @rmmio: configuration area address on SRAM. * @cdev: related char device. * @cdev_ctrl: char device for control operations only (INFO IOCTL) @@ -1358,7 +1406,8 @@ struct hl_device_idle_busy_ts { * @asic_name: ASIC specific nmae. * @asic_type: ASIC specific type. * @completion_queue: array of hl_cq. - * @cq_wq: work queue of completion queues for executing work in process context + * @cq_wq: work queues of completion queues for executing work in process + * context. * @eq_wq: work queue of event queue for executing work in process context. * @kernel_ctx: Kernel driver context structure. * @kernel_queues: array of hl_hw_queue. @@ -1387,12 +1436,17 @@ struct hl_device_idle_busy_ts { * @hl_debugfs: device's debugfs manager. * @cb_pool: list of preallocated CBs. * @cb_pool_lock: protects the CB pool. + * @internal_cb_pool_virt_addr: internal command buffer pool virtual address. + * @internal_cb_pool_dma_addr: internal command buffer pool dma address. + * @internal_cb_pool: internal command buffer memory pool. + * @internal_cb_va_base: internal cb pool mmu virtual address base * @fpriv_list: list of file private data structures. Each structure is created * when a user opens the device * @fpriv_list_lock: protects the fpriv_list * @compute_ctx: current compute context executing. * @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy * and vice-versa + * @aggregated_cs_counters: aggregated cs counters among all contexts * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This @@ -1435,12 +1489,14 @@ struct hl_device_idle_busy_ts { * @cdev_sysfs_created: were char devices and sysfs nodes created. * @stop_on_err: true if engines should stop on error. * @supports_sync_stream: is sync stream supported. + * @sync_stream_queue_idx: helper index for sync stream queues initialization. * @supports_coresight: is CoreSight supported. * @supports_soft_reset: is soft reset supported. */ struct hl_device { struct pci_dev *pdev; - void __iomem *pcie_bar[6]; + u64 pcie_bar_phys[HL_PCI_NUM_BARS]; + void __iomem *pcie_bar[HL_PCI_NUM_BARS]; void __iomem *rmmio; struct cdev cdev; struct cdev cdev_ctrl; @@ -1451,7 +1507,7 @@ struct hl_device { char asic_name[16]; enum hl_asic_type asic_type; struct hl_cq *completion_queue; - struct workqueue_struct *cq_wq; + struct workqueue_struct **cq_wq; struct workqueue_struct *eq_wq; struct hl_ctx *kernel_ctx; struct hl_hw_queue *kernel_queues; @@ -1483,6 +1539,11 @@ struct hl_device { struct list_head cb_pool; spinlock_t cb_pool_lock; + void *internal_cb_pool_virt_addr; + dma_addr_t internal_cb_pool_dma_addr; + struct gen_pool *internal_cb_pool; + u64 internal_cb_va_base; + struct list_head fpriv_list; struct mutex fpriv_list_lock; @@ -1490,6 +1551,8 @@ struct hl_device { struct hl_device_idle_busy_ts *idle_busy_ts_arr; + struct hl_cs_counters aggregated_cs_counters; + atomic64_t dram_used_mem; u64 timeout_jiffies; u64 max_power; @@ -1522,6 +1585,7 @@ struct hl_device { u8 cdev_sysfs_created; u8 stop_on_err; u8 supports_sync_stream; + u8 sync_stream_queue_idx; u8 supports_coresight; u8 supports_soft_reset; @@ -1690,7 +1754,7 @@ int hl_hwmon_init(struct hl_device *hdev); void hl_hwmon_fini(struct hl_device *hdev); int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size, - u64 *handle, int ctx_id); + u64 *handle, int ctx_id, bool internal_cb); int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle); int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma); struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr, @@ -1698,7 +1762,8 @@ struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr, void hl_cb_put(struct hl_cb *cb); void hl_cb_mgr_init(struct hl_cb_mgr *mgr); void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr); -struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size); +struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size, + bool internal_cb); int hl_cb_pool_init(struct hl_device *hdev); int hl_cb_pool_fini(struct hl_device *hdev); @@ -1762,9 +1827,10 @@ int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data); int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar, u64 addr); -int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, - u64 dram_base_address, u64 host_phys_base_address, - u64 host_phys_size); +int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region, + struct hl_inbound_pci_region *pci_region); +int hl_pci_set_outbound_region(struct hl_device *hdev, + struct hl_outbound_pci_region *pci_region); int hl_pci_init(struct hl_device *hdev); void hl_pci_fini(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 8652c7e5d7f1..f38664b03865 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -238,7 +238,6 @@ static void set_driver_behavior_per_device(struct hl_device *hdev) hdev->axi_drain = 0; hdev->sram_scrambler_enable = 1; hdev->dram_scrambler_enable = 1; - hdev->rl_enable = 1; hdev->bmc_enable = 1; hdev->hard_reset_on_fw_events = 1; } diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 52eedd3a6c3a..5af1c03da473 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -276,6 +276,27 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args) min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0; } +static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_cs_counters cs_counters = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + memcpy(&cs_counters.cs_counters, &hdev->aggregated_cs_counters, + sizeof(struct hl_cs_counters)); + + if (hpriv->ctx) + memcpy(&cs_counters.ctx_cs_counters, &hpriv->ctx->cs_counters, + sizeof(struct hl_cs_counters)); + + return copy_to_user(out, &cs_counters, + min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -336,6 +357,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_TIME_SYNC: return time_sync_info(hdev, args); + case HL_INFO_CS_COUNTERS: + return cs_counters_info(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c index f4434b39ef1b..287681646071 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -23,10 +23,14 @@ inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val) ptr &= ((HL_QUEUE_LENGTH << 1) - 1); return ptr; } +static inline int queue_ci_get(atomic_t *ci, u32 queue_len) +{ + return atomic_read(ci) & ((queue_len << 1) - 1); +} static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len) { - int delta = (q->pi - q->ci); + int delta = (q->pi - queue_ci_get(&q->ci, queue_len)); if (delta >= 0) return (queue_len - delta); @@ -40,21 +44,14 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs) struct hl_hw_queue *q; int i; - hdev->asic_funcs->hw_queues_lock(hdev); - if (hdev->disabled) - goto out; + return; q = &hdev->kernel_queues[0]; - for (i = 0 ; i < HL_MAX_QUEUES ; i++, q++) { - if (q->queue_type == QUEUE_TYPE_INT) { - q->ci += cs->jobs_in_queue_cnt[i]; - q->ci &= ((q->int_queue_len << 1) - 1); - } + for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) { + if (q->queue_type == QUEUE_TYPE_INT) + atomic_add(cs->jobs_in_queue_cnt[i], &q->ci); } - -out: - hdev->asic_funcs->hw_queues_unlock(hdev); } /* @@ -161,6 +158,13 @@ static int int_queue_sanity_checks(struct hl_device *hdev, { int free_slots_cnt; + if (num_of_entries > q->int_queue_len) { + dev_err(hdev->dev, + "Cannot populate queue %u with %u jobs\n", + q->hw_queue_id, num_of_entries); + return -ENOMEM; + } + /* Check we have enough space in the queue */ free_slots_cnt = queue_free_slots(q, q->int_queue_len); @@ -174,38 +178,26 @@ static int int_queue_sanity_checks(struct hl_device *hdev, } /* - * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue. + * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue * @hdev: Pointer to hl_device structure. * @q: Pointer to hl_hw_queue structure. * @num_of_entries: How many entries to check for space. * - * Perform the following: - * - Make sure we have enough space in the completion queue. - * This check also ensures that there is enough space in the h/w queue, as - * both queues are of the same size. - * - Reserve space in the completion queue (needs to be reversed if there - * is a failure down the road before the actual submission of work). + * Notice: We do not reserve queue entries so this function mustn't be called + * more than once per CS for the same queue * - * Both operations are done using the "free_slots_cnt" field of the completion - * queue. The CI counters of the queue and the completion queue are not - * needed/used for the H/W queue type. */ static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q, int num_of_entries) { - atomic_t *free_slots = - &hdev->completion_queue[q->cq_id].free_slots_cnt; + int free_slots_cnt; - /* - * Check we have enough space in the completion queue. - * Add -1 to counter (decrement) unless counter was already 0. - * In that case, CQ is full so we can't submit a new CB. - * atomic_add_unless will return 0 if counter was already 0. - */ - if (atomic_add_negative(num_of_entries * -1, free_slots)) { - dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n", - num_of_entries, q->hw_queue_id); - atomic_add(num_of_entries, free_slots); + /* Check we have enough space in the queue */ + free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH); + + if (free_slots_cnt < num_of_entries) { + dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n", + q->hw_queue_id, num_of_entries); return -EAGAIN; } @@ -366,7 +358,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) { struct hl_device *hdev = job->cs->ctx->hdev; struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; - struct hl_cq *cq; u64 ptr; u32 offset, ctl, len; @@ -376,7 +367,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) * write address offset in the SM block (QMAN LBW message). * The write address offset is calculated as "COMP_OFFSET << 2". */ - offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1); + offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1); ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) | ((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK); @@ -395,17 +386,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) else ptr = (u64) (uintptr_t) job->user_cb; - /* - * No need to protect pi_offset because scheduling to the - * H/W queues is done under the scheduler mutex - * - * No need to check if CQ is full because it was already - * checked in hw_queue_sanity_checks - */ - cq = &hdev->completion_queue[q->cq_id]; - - cq->pi = hl_cq_inc_ptr(cq->pi); - ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr); } @@ -509,19 +489,23 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) struct hl_device *hdev = ctx->hdev; struct hl_cs_job *job, *tmp; struct hl_hw_queue *q; + u32 max_queues; int rc = 0, i, cq_cnt; hdev->asic_funcs->hw_queues_lock(hdev); if (hl_device_disabled_or_in_reset(hdev)) { + ctx->cs_counters.device_in_reset_drop_cnt++; dev_err(hdev->dev, "device is disabled or in reset, CS rejected!\n"); rc = -EPERM; goto out; } + max_queues = hdev->asic_prop.max_queues; + q = &hdev->kernel_queues[0]; - for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) { + for (i = 0, cq_cnt = 0 ; i < max_queues ; i++, q++) { if (cs->jobs_in_queue_cnt[i]) { switch (q->queue_type) { case QUEUE_TYPE_EXT: @@ -543,11 +527,12 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) break; } - if (rc) + if (rc) { + ctx->cs_counters.queue_full_drop_cnt++; goto unroll_cq_resv; + } - if (q->queue_type == QUEUE_TYPE_EXT || - q->queue_type == QUEUE_TYPE_HW) + if (q->queue_type == QUEUE_TYPE_EXT) cq_cnt++; } } @@ -598,10 +583,9 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) unroll_cq_resv: q = &hdev->kernel_queues[0]; - for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) { - if ((q->queue_type == QUEUE_TYPE_EXT || - q->queue_type == QUEUE_TYPE_HW) && - cs->jobs_in_queue_cnt[i]) { + for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) { + if ((q->queue_type == QUEUE_TYPE_EXT) && + (cs->jobs_in_queue_cnt[i])) { atomic_t *free_slots = &hdev->completion_queue[i].free_slots_cnt; atomic_add(cs->jobs_in_queue_cnt[i], free_slots); @@ -625,7 +609,7 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id) { struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; - q->ci = hl_queue_inc_ptr(q->ci); + atomic_inc(&q->ci); } static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, @@ -660,12 +644,9 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, } /* Make sure read/write pointers are initialized to start of queue */ - q->ci = 0; + atomic_set(&q->ci, 0); q->pi = 0; - if (!is_cpu_queue) - hdev->asic_funcs->ext_queue_init(hdev, q->hw_queue_id); - return 0; free_queue: @@ -697,7 +678,7 @@ static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) q->kernel_address = (u64) (uintptr_t) p; q->pi = 0; - q->ci = 0; + atomic_set(&q->ci, 0); return 0; } @@ -726,12 +707,48 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) q->kernel_address = (u64) (uintptr_t) p; /* Make sure read/write pointers are initialized to start of queue */ - q->ci = 0; + atomic_set(&q->ci, 0); q->pi = 0; return 0; } +static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx) +{ + struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx]; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_hw_sob *hw_sob; + int sob, queue_idx = hdev->sync_stream_queue_idx++; + + hw_queue->base_sob_id = + prop->sync_stream_first_sob + queue_idx * HL_RSVD_SOBS; + hw_queue->base_mon_id = + prop->sync_stream_first_mon + queue_idx * HL_RSVD_MONS; + hw_queue->next_sob_val = 1; + hw_queue->curr_sob_offset = 0; + + for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) { + hw_sob = &hw_queue->hw_sob[sob]; + hw_sob->hdev = hdev; + hw_sob->sob_id = hw_queue->base_sob_id + sob; + hw_sob->q_idx = q_idx; + kref_init(&hw_sob->kref); + } +} + +static void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx) +{ + struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx]; + + /* + * In case we got here due to a stuck CS, the refcnt might be bigger + * than 1 and therefore we reset it. + */ + kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref); + hw_queue->curr_sob_offset = 0; + hw_queue->next_sob_val = 1; +} + /* * queue_init - main initialization function for H/W queue object * @@ -747,8 +764,6 @@ static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q, { int rc; - BUILD_BUG_ON(HL_QUEUE_SIZE_IN_BYTES > HL_PAGE_SIZE); - q->hw_queue_id = hw_queue_id; switch (q->queue_type) { @@ -774,6 +789,9 @@ static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q, break; } + if (q->supports_sync_stream) + sync_stream_queue_init(hdev, q->hw_queue_id); + if (rc) return rc; @@ -835,7 +853,7 @@ int hl_hw_queues_create(struct hl_device *hdev) struct hl_hw_queue *q; int i, rc, q_ready_cnt; - hdev->kernel_queues = kcalloc(HL_MAX_QUEUES, + hdev->kernel_queues = kcalloc(asic->max_queues, sizeof(*hdev->kernel_queues), GFP_KERNEL); if (!hdev->kernel_queues) { @@ -845,9 +863,11 @@ int hl_hw_queues_create(struct hl_device *hdev) /* Initialize the H/W queues */ for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues; - i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) { + i < asic->max_queues ; i++, q_ready_cnt++, q++) { q->queue_type = asic->hw_queues_props[i].type; + q->supports_sync_stream = + asic->hw_queues_props[i].supports_sync_stream; rc = queue_init(hdev, q, i); if (rc) { dev_err(hdev->dev, @@ -870,9 +890,10 @@ release_queues: void hl_hw_queues_destroy(struct hl_device *hdev) { struct hl_hw_queue *q; + u32 max_queues = hdev->asic_prop.max_queues; int i; - for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) + for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) queue_fini(hdev, q); kfree(hdev->kernel_queues); @@ -881,15 +902,17 @@ void hl_hw_queues_destroy(struct hl_device *hdev) void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset) { struct hl_hw_queue *q; + u32 max_queues = hdev->asic_prop.max_queues; int i; - for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) { + for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) { if ((!q->valid) || ((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU))) continue; - q->pi = q->ci = 0; + q->pi = 0; + atomic_set(&q->ci, 0); - if (q->queue_type == QUEUE_TYPE_EXT) - hdev->asic_funcs->ext_queue_reset(hdev, q->hw_queue_id); + if (q->supports_sync_stream) + sync_stream_queue_reset(hdev, q->hw_queue_id); } } diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c index 8c6cd77e6af6..8c6cd77e6af6 100644 --- a/drivers/misc/habanalabs/hwmon.c +++ b/drivers/misc/habanalabs/common/hwmon.c diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/common/irq.c index 6981d67153b1..c8db717023f5 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/common/irq.c @@ -119,15 +119,10 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) if ((shadow_index_valid) && (!hdev->disabled)) { job = queue->shadow_queue[hl_pi_2_offset(shadow_index)]; - queue_work(hdev->cq_wq, &job->finish_work); + queue_work(hdev->cq_wq[cq->cq_idx], &job->finish_work); } - /* Update ci of the context's queue. There is no - * need to protect it with spinlock because this update is - * done only inside IRQ and there is a different IRQ per - * queue - */ - queue->ci = hl_queue_inc_ptr(queue->ci); + atomic_inc(&queue->ci); /* Clear CQ entry ready bit */ cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) & @@ -220,8 +215,6 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id) { void *p; - BUILD_BUG_ON(HL_CQ_SIZE_IN_BYTES > HL_PAGE_SIZE); - p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES, &q->bus_address, GFP_KERNEL | __GFP_ZERO); if (!p) @@ -282,8 +275,6 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q) { void *p; - BUILD_BUG_ON(HL_EQ_SIZE_IN_BYTES > HL_PAGE_SIZE); - p = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, HL_EQ_SIZE_IN_BYTES, &q->bus_address); diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/common/memory.c index 47da84a17719..e4e1693e5c6c 100644 --- a/drivers/misc/habanalabs/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -1730,8 +1730,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) */ if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash)) dev_notice(hdev->dev, - "ctx %d is freed while it has va in use\n", - ctx->asid); + "user released device without removing its memory mappings\n"); hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) { dev_dbg(hdev->dev, diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/common/mmu.c index a290d6b49d78..04303950e630 100644 --- a/drivers/misc/habanalabs/mmu.c +++ b/drivers/misc/habanalabs/common/mmu.c @@ -502,7 +502,6 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx) return 0; mutex_init(&ctx->mmu_lock); - hash_init(ctx->mmu_phys_hash); hash_init(ctx->mmu_shadow_hash); return dram_default_mapping_init(ctx); diff --git a/drivers/misc/habanalabs/pci.c b/drivers/misc/habanalabs/common/pci.c index 61a8bb07262c..1791f6623c69 100644 --- a/drivers/misc/habanalabs/pci.c +++ b/drivers/misc/habanalabs/common/pci.c @@ -9,9 +9,15 @@ #include "include/hw_ip/pci/pci_general.h" #include <linux/pci.h> +#include <linux/bitfield.h> #define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC (HL_PCI_ELBI_TIMEOUT_MSEC * 10) +#define IATU_REGION_CTRL_REGION_EN_MASK BIT(31) +#define IATU_REGION_CTRL_MATCH_MODE_MASK BIT(30) +#define IATU_REGION_CTRL_NUM_MATCH_EN_MASK BIT(19) +#define IATU_REGION_CTRL_BAR_NUM_MASK GENMASK(10, 8) + /** * hl_pci_bars_map() - Map PCI BARs. * @hdev: Pointer to hl_device structure. @@ -187,110 +193,94 @@ static void hl_pci_reset_link_through_bridge(struct hl_device *hdev) } /** - * hl_pci_set_dram_bar_base() - Set DDR BAR to map specific device address. + * hl_pci_set_inbound_region() - Configure inbound region * @hdev: Pointer to hl_device structure. - * @inbound_region: Inbound region number. - * @bar: PCI BAR number. - * @addr: Address in DRAM. Must be aligned to DRAM bar size. + * @region: Inbound region number. + * @pci_region: Inbound region parameters. * - * Configure the iATU so that the DRAM bar will start at the specified address. + * Configure the iATU inbound region. * * Return: 0 on success, negative value for failure. */ -int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar, - u64 addr) +int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region, + struct hl_inbound_pci_region *pci_region) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u32 offset; - int rc; + u64 bar_phys_base, region_base, region_end_address; + u32 offset, ctrl_reg_val; + int rc = 0; - switch (inbound_region) { - case 0: - offset = 0x100; - break; - case 1: - offset = 0x300; - break; - case 2: - offset = 0x500; - break; - default: - dev_err(hdev->dev, "Invalid inbound region %d\n", - inbound_region); - return -EINVAL; - } + /* region offset */ + offset = (0x200 * region) + 0x100; + + if (pci_region->mode == PCI_ADDRESS_MATCH_MODE) { + bar_phys_base = hdev->pcie_bar_phys[pci_region->bar]; + region_base = bar_phys_base + pci_region->offset_in_bar; + region_end_address = region_base + pci_region->size - 1; - if (bar != 0 && bar != 2 && bar != 4) { - dev_err(hdev->dev, "Invalid PCI BAR %d\n", bar); - return -EINVAL; + rc |= hl_pci_iatu_write(hdev, offset + 0x8, + lower_32_bits(region_base)); + rc |= hl_pci_iatu_write(hdev, offset + 0xC, + upper_32_bits(region_base)); + rc |= hl_pci_iatu_write(hdev, offset + 0x10, + lower_32_bits(region_end_address)); } /* Point to the specified address */ - rc = hl_pci_iatu_write(hdev, offset + 0x14, lower_32_bits(addr)); - rc |= hl_pci_iatu_write(hdev, offset + 0x18, upper_32_bits(addr)); + rc = hl_pci_iatu_write(hdev, offset + 0x14, + lower_32_bits(pci_region->addr)); + rc |= hl_pci_iatu_write(hdev, offset + 0x18, + upper_32_bits(pci_region->addr)); rc |= hl_pci_iatu_write(hdev, offset + 0x0, 0); - /* Enable + BAR match + match enable + BAR number */ - rc |= hl_pci_iatu_write(hdev, offset + 0x4, 0xC0080000 | (bar << 8)); + + /* Enable + bar/address match + match enable + bar number */ + ctrl_reg_val = FIELD_PREP(IATU_REGION_CTRL_REGION_EN_MASK, 1); + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_MATCH_MODE_MASK, + pci_region->mode); + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_NUM_MATCH_EN_MASK, 1); + + if (pci_region->mode == PCI_BAR_MATCH_MODE) + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_BAR_NUM_MASK, + pci_region->bar); + + rc |= hl_pci_iatu_write(hdev, offset + 0x4, ctrl_reg_val); /* Return the DBI window to the default location */ rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0); if (rc) - dev_err(hdev->dev, "failed to map DRAM bar to 0x%08llx\n", - addr); + dev_err(hdev->dev, "failed to map bar %u to 0x%08llx\n", + pci_region->bar, pci_region->addr); return rc; } /** - * hl_pci_init_iatu() - Initialize the iATU unit inside the PCI controller. + * hl_pci_set_outbound_region() - Configure outbound region 0 * @hdev: Pointer to hl_device structure. - * @sram_base_address: SRAM base address. - * @dram_base_address: DRAM base address. - * @host_phys_base_address: Base physical address of host memory for device - * transactions. - * @host_phys_size: Size of host memory for device transactions. + * @pci_region: Outbound region parameters. * - * This is needed in case the firmware doesn't initialize the iATU. + * Configure the iATU outbound region 0. * * Return: 0 on success, negative value for failure. */ -int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, - u64 dram_base_address, u64 host_phys_base_address, - u64 host_phys_size) +int hl_pci_set_outbound_region(struct hl_device *hdev, + struct hl_outbound_pci_region *pci_region) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u64 host_phys_end_addr; + u64 outbound_region_end_address; int rc = 0; - /* Inbound Region 0 - Bar 0 - Point to SRAM base address */ - rc = hl_pci_iatu_write(hdev, 0x114, lower_32_bits(sram_base_address)); - rc |= hl_pci_iatu_write(hdev, 0x118, upper_32_bits(sram_base_address)); - rc |= hl_pci_iatu_write(hdev, 0x100, 0); - /* Enable + Bar match + match enable */ - rc |= hl_pci_iatu_write(hdev, 0x104, 0xC0080000); - - /* Return the DBI window to the default location */ - rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); - rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0); - - hdev->asic_funcs->set_dma_mask_from_fw(hdev); - - /* Point to DRAM */ - if (!hdev->asic_funcs->set_dram_bar_base) - return -EINVAL; - if (hdev->asic_funcs->set_dram_bar_base(hdev, dram_base_address) == - U64_MAX) - return -EIO; - - /* Outbound Region 0 - Point to Host */ - host_phys_end_addr = host_phys_base_address + host_phys_size - 1; + /* Outbound Region 0 */ + outbound_region_end_address = + pci_region->addr + pci_region->size - 1; rc |= hl_pci_iatu_write(hdev, 0x008, - lower_32_bits(host_phys_base_address)); + lower_32_bits(pci_region->addr)); rc |= hl_pci_iatu_write(hdev, 0x00C, - upper_32_bits(host_phys_base_address)); - rc |= hl_pci_iatu_write(hdev, 0x010, lower_32_bits(host_phys_end_addr)); + upper_32_bits(pci_region->addr)); + rc |= hl_pci_iatu_write(hdev, 0x010, + lower_32_bits(outbound_region_end_address)); rc |= hl_pci_iatu_write(hdev, 0x014, 0); if ((hdev->power9_64bit_dma_enable) && (hdev->dma_mask == 64)) @@ -298,7 +288,8 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, else rc |= hl_pci_iatu_write(hdev, 0x018, 0); - rc |= hl_pci_iatu_write(hdev, 0x020, upper_32_bits(host_phys_end_addr)); + rc |= hl_pci_iatu_write(hdev, 0x020, + upper_32_bits(outbound_region_end_address)); /* Increase region size */ rc |= hl_pci_iatu_write(hdev, 0x000, 0x00002000); /* Enable */ @@ -308,10 +299,7 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0); - if (rc) - return -EIO; - - return 0; + return rc; } /** diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 5d78d5e1c782..c4e7c682d584 100644 --- a/drivers/misc/habanalabs/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -334,6 +334,9 @@ static ssize_t eeprom_read_handler(struct file *filp, struct kobject *kobj, char *data; int rc; + if (hl_device_disabled_or_in_reset(hdev)) + return -ENODEV; + if (!max_size) return -EINVAL; diff --git a/drivers/misc/habanalabs/gaudi/Makefile b/drivers/misc/habanalabs/gaudi/Makefile index f802cdc980ca..75104ae74e2b 100644 --- a/drivers/misc/habanalabs/gaudi/Makefile +++ b/drivers/misc/habanalabs/gaudi/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -subdir-ccflags-y += -I$(src) +subdir-ccflags-y += -I$(src)/common HL_GAUDI_FILES := gaudi/gaudi.o gaudi/gaudi_hwmgr.o gaudi/gaudi_security.o \ gaudi/gaudi_coresight.o diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 7dd99ec41627..4a1a52608fc0 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -21,6 +21,7 @@ #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/iommu.h> #include <linux/seq_file.h> +#include <linux/bitfield.h> /* * Gaudi security scheme: @@ -74,7 +75,6 @@ #define GAUDI_PLDM_RESET_WAIT_MSEC 1000 /* 1s */ #define GAUDI_PLDM_HRESET_TIMEOUT_MSEC 20000 /* 20s */ -#define GAUDI_PLDM_SRESET_TIMEOUT_MSEC 14000 /* 14s */ #define GAUDI_PLDM_TEST_QUEUE_WAIT_USEC 1000000 /* 1s */ #define GAUDI_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100) #define GAUDI_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30) @@ -315,6 +315,13 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = { QUEUE_TYPE_NA, /* GAUDI_QUEUE_ID_NIC_9_3 */ }; +struct ecc_info_extract_params { + u64 block_address; + u32 num_memories; + bool derr; + bool disable_clock_gating; +}; + static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid, u64 phys_addr); static int gaudi_send_job_on_qman0(struct hl_device *hdev, @@ -333,22 +340,25 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) struct asic_fixed_properties *prop = &hdev->asic_prop; int i; - if (GAUDI_QUEUE_ID_SIZE >= HL_MAX_QUEUES) { - dev_err(hdev->dev, - "Number of H/W queues must be smaller than %d\n", - HL_MAX_QUEUES); - return -EFAULT; - } + prop->max_queues = GAUDI_QUEUE_ID_SIZE; + prop->hw_queues_props = kcalloc(prop->max_queues, + sizeof(struct hw_queue_properties), + GFP_KERNEL); - for (i = 0 ; i < GAUDI_QUEUE_ID_SIZE ; i++) { + if (!prop->hw_queues_props) + return -ENOMEM; + + for (i = 0 ; i < prop->max_queues ; i++) { if (gaudi_queue_type[i] == QUEUE_TYPE_EXT) { prop->hw_queues_props[i].type = QUEUE_TYPE_EXT; prop->hw_queues_props[i].driver_only = 0; prop->hw_queues_props[i].requires_kernel_cb = 1; + prop->hw_queues_props[i].supports_sync_stream = 1; } else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) { prop->hw_queues_props[i].type = QUEUE_TYPE_CPU; prop->hw_queues_props[i].driver_only = 1; prop->hw_queues_props[i].requires_kernel_cb = 0; + prop->hw_queues_props[i].supports_sync_stream = 0; } else if (gaudi_queue_type[i] == QUEUE_TYPE_INT) { prop->hw_queues_props[i].type = QUEUE_TYPE_INT; prop->hw_queues_props[i].driver_only = 0; @@ -357,14 +367,13 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) prop->hw_queues_props[i].type = QUEUE_TYPE_NA; prop->hw_queues_props[i].driver_only = 0; prop->hw_queues_props[i].requires_kernel_cb = 0; + prop->hw_queues_props[i].supports_sync_stream = 0; } } - for (; i < HL_MAX_QUEUES; i++) - prop->hw_queues_props[i].type = QUEUE_TYPE_NA; - prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; - + prop->sync_stream_first_sob = 0; + prop->sync_stream_first_mon = 0; prop->dram_base_address = DRAM_PHYS_BASE; prop->dram_size = GAUDI_HBM_SIZE_32GB; prop->dram_end_address = prop->dram_base_address + @@ -429,6 +438,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME, CARD_NAME_MAX_LEN); + prop->max_pending_cs = GAUDI_MAX_PENDING_CS; + return 0; } @@ -451,6 +462,7 @@ static int gaudi_pci_bars_map(struct hl_device *hdev) static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr) { struct gaudi_device *gaudi = hdev->asic_specific; + struct hl_inbound_pci_region pci_region; u64 old_addr = addr; int rc; @@ -458,7 +470,10 @@ static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr) return old_addr; /* Inbound Region 2 - Bar 4 - Point to HBM */ - rc = hl_pci_set_dram_bar_base(hdev, 2, 4, addr); + pci_region.mode = PCI_BAR_MATCH_MODE; + pci_region.bar = HBM_BAR_ID; + pci_region.addr = addr; + rc = hl_pci_set_inbound_region(hdev, 2, &pci_region); if (rc) return U64_MAX; @@ -472,22 +487,43 @@ static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr) static int gaudi_init_iatu(struct hl_device *hdev) { - int rc = 0; + struct hl_inbound_pci_region inbound_region; + struct hl_outbound_pci_region outbound_region; + int rc; + + /* Inbound Region 0 - Bar 0 - Point to SRAM + CFG */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = SRAM_BAR_ID; + inbound_region.addr = SRAM_BASE_ADDR; + rc = hl_pci_set_inbound_region(hdev, 0, &inbound_region); + if (rc) + goto done; /* Inbound Region 1 - Bar 2 - Point to SPI FLASH */ - rc = hl_pci_iatu_write(hdev, 0x314, - lower_32_bits(SPI_FLASH_BASE_ADDR)); - rc |= hl_pci_iatu_write(hdev, 0x318, - upper_32_bits(SPI_FLASH_BASE_ADDR)); - rc |= hl_pci_iatu_write(hdev, 0x300, 0); - /* Enable + Bar match + match enable */ - rc |= hl_pci_iatu_write(hdev, 0x304, 0xC0080200); + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = CFG_BAR_ID; + inbound_region.addr = SPI_FLASH_BASE_ADDR; + rc = hl_pci_set_inbound_region(hdev, 1, &inbound_region); + if (rc) + goto done; + /* Inbound Region 2 - Bar 4 - Point to HBM */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = HBM_BAR_ID; + inbound_region.addr = DRAM_PHYS_BASE; + rc = hl_pci_set_inbound_region(hdev, 2, &inbound_region); if (rc) - return -EIO; + goto done; + + hdev->asic_funcs->set_dma_mask_from_fw(hdev); + + /* Outbound Region 0 - Point to Host */ + outbound_region.addr = HOST_PHYS_BASE; + outbound_region.size = HOST_PHYS_SIZE; + rc = hl_pci_set_outbound_region(hdev, &outbound_region); - return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE, - HOST_PHYS_BASE, HOST_PHYS_SIZE); +done: + return rc; } static int gaudi_early_init(struct hl_device *hdev) @@ -510,7 +546,8 @@ static int gaudi_early_init(struct hl_device *hdev) (unsigned long long) pci_resource_len(pdev, SRAM_BAR_ID), SRAM_BAR_SIZE); - return -ENODEV; + rc = -ENODEV; + goto free_queue_props; } if (pci_resource_len(pdev, CFG_BAR_ID) != CFG_BAR_SIZE) { @@ -520,20 +557,26 @@ static int gaudi_early_init(struct hl_device *hdev) (unsigned long long) pci_resource_len(pdev, CFG_BAR_ID), CFG_BAR_SIZE); - return -ENODEV; + rc = -ENODEV; + goto free_queue_props; } prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID); rc = hl_pci_init(hdev); if (rc) - return rc; + goto free_queue_props; return 0; + +free_queue_props: + kfree(hdev->asic_prop.hw_queues_props); + return rc; } static int gaudi_early_fini(struct hl_device *hdev) { + kfree(hdev->asic_prop.hw_queues_props); hl_pci_fini(hdev); return 0; @@ -548,11 +591,36 @@ static int gaudi_early_fini(struct hl_device *hdev) static void gaudi_fetch_psoc_frequency(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 trace_freq = 0; + u32 pll_clk = 0; + u32 div_fctr = RREG32(mmPSOC_CPU_PLL_DIV_FACTOR_2); + u32 div_sel = RREG32(mmPSOC_CPU_PLL_DIV_SEL_2); + u32 nr = RREG32(mmPSOC_CPU_PLL_NR); + u32 nf = RREG32(mmPSOC_CPU_PLL_NF); + u32 od = RREG32(mmPSOC_CPU_PLL_OD); + + if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) { + if (div_sel == DIV_SEL_REF_CLK) + trace_freq = PLL_REF_CLK; + else + trace_freq = PLL_REF_CLK / (div_fctr + 1); + } else if (div_sel == DIV_SEL_PLL_CLK || + div_sel == DIV_SEL_DIVIDED_PLL) { + pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1)); + if (div_sel == DIV_SEL_PLL_CLK) + trace_freq = pll_clk; + else + trace_freq = pll_clk / (div_fctr + 1); + } else { + dev_warn(hdev->dev, + "Received invalid div select value: %d", div_sel); + } - prop->psoc_pci_pll_nr = RREG32(mmPSOC_PCI_PLL_NR); - prop->psoc_pci_pll_nf = RREG32(mmPSOC_PCI_PLL_NF); - prop->psoc_pci_pll_od = RREG32(mmPSOC_PCI_PLL_OD); - prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); + prop->psoc_timestamp_frequency = trace_freq; + prop->psoc_pci_pll_nr = nr; + prop->psoc_pci_pll_nf = nf; + prop->psoc_pci_pll_od = od; + prop->psoc_pci_pll_div_factor = div_fctr; } static int _gaudi_init_tpc_mem(struct hl_device *hdev, @@ -567,7 +635,7 @@ static int _gaudi_init_tpc_mem(struct hl_device *hdev, u8 tpc_id; int rc; - cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false); if (!cb) return -EFAULT; @@ -1638,8 +1706,8 @@ static void gaudi_init_hbm_cred(struct hl_device *hdev) uint32_t hbm0_wr, hbm1_wr, hbm0_rd, hbm1_rd; hbm0_wr = 0x33333333; - hbm1_wr = 0x33333333; hbm0_rd = 0x77777777; + hbm1_wr = 0x55555555; hbm1_rd = 0xDDDDDDDD; WREG32(mmDMA_IF_E_N_HBM0_WR_CRED_CNT, hbm0_wr); @@ -1689,125 +1757,6 @@ static void gaudi_init_hbm_cred(struct hl_device *hdev) (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); } -static void gaudi_init_rate_limiter(struct hl_device *hdev) -{ - u32 nr, nf, od, sat, rst, timeout; - u64 freq; - - nr = RREG32(mmPSOC_HBM_PLL_NR); - nf = RREG32(mmPSOC_HBM_PLL_NF); - od = RREG32(mmPSOC_HBM_PLL_OD); - freq = (50 * (nf + 1)) / ((nr + 1) * (od + 1)); - - dev_dbg(hdev->dev, "HBM frequency is %lluMHz\n", freq); - - /* Configuration is for five (5) DDMA channels */ - if (freq == 800) { - sat = 4; - rst = 11; - timeout = 15; - } else if (freq == 900) { - sat = 4; - rst = 15; - timeout = 16; - } else if (freq == 950) { - sat = 4; - rst = 15; - timeout = 15; - } else { - dev_warn(hdev->dev, - "unsupported HBM frequency %lluMHz, no rate-limiters\n", - freq); - return; - } - - WREG32(mmDMA_IF_W_S_DOWN_RSP_MID_WGHT_0, 0x111); - WREG32(mmDMA_IF_W_S_DOWN_RSP_MID_WGHT_1, 0x111); - WREG32(mmDMA_IF_E_S_DOWN_RSP_MID_WGHT_0, 0x111); - WREG32(mmDMA_IF_E_S_DOWN_RSP_MID_WGHT_1, 0x111); - WREG32(mmDMA_IF_W_N_DOWN_RSP_MID_WGHT_0, 0x111); - WREG32(mmDMA_IF_W_N_DOWN_RSP_MID_WGHT_1, 0x111); - WREG32(mmDMA_IF_E_N_DOWN_RSP_MID_WGHT_0, 0x111); - WREG32(mmDMA_IF_E_N_DOWN_RSP_MID_WGHT_1, 0x111); - - if (!hdev->rl_enable) { - dev_info(hdev->dev, "Rate limiters disabled\n"); - return; - } - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_SAT, sat); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_RST, rst); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_RST, rst); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_RST, rst); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_RST, rst); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_RST, rst); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_RST, rst); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_RST, rst); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_RST, rst); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_TIMEOUT, timeout); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_EN, 1); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_EN, 1); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_EN, 1); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_EN, 1); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_EN, 1); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_EN, 1); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_EN, 1); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_EN, 1); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_SAT, sat); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_RST, rst); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_TIMEOUT, timeout); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_EN, 1); -} - static void gaudi_init_golden_registers(struct hl_device *hdev) { u32 tpc_offset; @@ -1817,8 +1766,6 @@ static void gaudi_init_golden_registers(struct hl_device *hdev) gaudi_init_hbm_cred(hdev); - gaudi_init_rate_limiter(hdev); - gaudi_disable_clock_gating(hdev); for (tpc_id = 0, tpc_offset = 0; @@ -1839,9 +1786,6 @@ static void gaudi_init_golden_registers(struct hl_device *hdev) WREG32(mmMME1_CTRL_EUS_ROLLUP_CNT_ADD, 3); WREG32(mmMME2_CTRL_EUS_ROLLUP_CNT_ADD, 3); WREG32(mmMME3_CTRL_EUS_ROLLUP_CNT_ADD, 3); - - /* WA for H3-2081 */ - WREG32(mmPCIE_WRAP_MAX_OUTSTAND, 0x10ff); } static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id, @@ -2634,29 +2578,16 @@ static void gaudi_disable_timestamp(struct hl_device *hdev) static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) { - u32 wait_timeout_ms, cpu_timeout_ms; + u32 wait_timeout_ms; dev_info(hdev->dev, "Halting compute engines and disabling interrupts\n"); - if (hdev->pldm) { + if (hdev->pldm) wait_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; - cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; - } else { + else wait_timeout_ms = GAUDI_RESET_WAIT_MSEC; - cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC; - } - if (hard_reset) { - /* - * I don't know what is the state of the CPU so make sure it is - * stopped in any means necessary - */ - WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE); - WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, - GAUDI_EVENT_HALT_MACHINE); - msleep(cpu_timeout_ms); - } gaudi_stop_mme_qmans(hdev); gaudi_stop_tpc_qmans(hdev); @@ -2681,10 +2612,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) gaudi_disable_timestamp(hdev); - if (hard_reset) - gaudi_disable_msi(hdev); - else - gaudi_sync_irqs(hdev); + gaudi_disable_msi(hdev); } static int gaudi_mmu_init(struct hl_device *hdev) @@ -2718,8 +2646,7 @@ static int gaudi_mmu_init(struct hl_device *hdev) WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8); WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR >> 40); - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, - VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK); + hdev->asic_funcs->mmu_invalidate_cache(hdev, true, 0); WREG32(mmMMU_UP_MMU_ENABLE, 1); WREG32(mmMMU_UP_SPI_MASK, 0xF); @@ -2969,16 +2896,6 @@ static int gaudi_hw_init(struct hl_device *hdev) gaudi_init_hbm_dma_qmans(hdev); - /* - * Before pushing u-boot/linux to device, need to set the hbm bar to - * base address of dram - */ - if (gaudi_set_hbm_bar_base(hdev, DRAM_PHYS_BASE) == U64_MAX) { - dev_err(hdev->dev, - "failed to map HBM bar to DRAM base address\n"); - return -EIO; - } - rc = gaudi_init_cpu(hdev); if (rc) { dev_err(hdev->dev, "failed to initialize CPU\n"); @@ -3037,48 +2954,56 @@ disable_queues: static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) { struct gaudi_device *gaudi = hdev->asic_specific; - u32 status, reset_timeout_ms, boot_strap = 0; + u32 status, reset_timeout_ms, cpu_timeout_ms, boot_strap = 0; + + if (!hard_reset) { + dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n"); + return; + } if (hdev->pldm) { - if (hard_reset) - reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC; - else - reset_timeout_ms = GAUDI_PLDM_SRESET_TIMEOUT_MSEC; + reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC; + cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; } else { reset_timeout_ms = GAUDI_RESET_TIMEOUT_MSEC; + cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC; } - if (hard_reset) { - /* Tell ASIC not to re-initialize PCIe */ - WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC); + /* Set device to handle FLR by H/W as we will put the device CPU to + * halt mode + */ + WREG32(mmPCIE_AUX_FLR_CTRL, (PCIE_AUX_FLR_CTRL_HW_CTRL_MASK | + PCIE_AUX_FLR_CTRL_INT_MASK_MASK)); - boot_strap = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS); - /* H/W bug WA: - * rdata[31:0] = strap_read_val; - * wdata[31:0] = rdata[30:21],1'b0,rdata[20:0] - */ - boot_strap = (((boot_strap & 0x7FE00000) << 1) | - (boot_strap & 0x001FFFFF)); - WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap & ~0x2); - - /* Restart BTL/BLR upon hard-reset */ - WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1); - - WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST, - 1 << PSOC_GLOBAL_CONF_SW_ALL_RST_IND_SHIFT); - dev_info(hdev->dev, - "Issued HARD reset command, going to wait %dms\n", - reset_timeout_ms); - } else { - /* Don't restart BTL/BLR upon soft-reset */ - WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 0); + /* I don't know what is the state of the CPU so make sure it is + * stopped in any means necessary + */ + WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE); + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, GAUDI_EVENT_HALT_MACHINE); - WREG32(mmPSOC_GLOBAL_CONF_SOFT_RST, - 1 << PSOC_GLOBAL_CONF_SOFT_RST_IND_SHIFT); - dev_info(hdev->dev, - "Issued SOFT reset command, going to wait %dms\n", - reset_timeout_ms); - } + msleep(cpu_timeout_ms); + + /* Tell ASIC not to re-initialize PCIe */ + WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC); + + boot_strap = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS); + + /* H/W bug WA: + * rdata[31:0] = strap_read_val; + * wdata[31:0] = rdata[30:21],1'b0,rdata[20:0] + */ + boot_strap = (((boot_strap & 0x7FE00000) << 1) | + (boot_strap & 0x001FFFFF)); + WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap & ~0x2); + + /* Restart BTL/BLR upon hard-reset */ + WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1); + + WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST, + 1 << PSOC_GLOBAL_CONF_SW_ALL_RST_IND_SHIFT); + dev_info(hdev->dev, + "Issued HARD reset command, going to wait %dms\n", + reset_timeout_ms); /* * After hard reset, we can't poll the BTM_FSM register because the PSOC @@ -3092,18 +3017,6 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) "Timeout while waiting for device to reset 0x%x\n", status); - if (!hard_reset) { - gaudi->hw_cap_initialized &= ~(HW_CAP_PCI_DMA | HW_CAP_MME | - HW_CAP_TPC_MASK | - HW_CAP_HBM_DMA); - - WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, - GAUDI_EVENT_SOFT_RESET); - return; - } - - /* We continue here only for hard-reset */ - WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap); gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q | @@ -3558,7 +3471,7 @@ static int gaudi_test_queues(struct hl_device *hdev) { int i, rc, ret_val = 0; - for (i = 0 ; i < HL_MAX_QUEUES ; i++) { + for (i = 0 ; i < hdev->asic_prop.max_queues ; i++) { if (hdev->asic_prop.hw_queues_props[i].type == QUEUE_TYPE_EXT) { rc = gaudi_test_queue(hdev, i); if (rc) @@ -4134,9 +4047,8 @@ static int gaudi_parse_cb_mmu(struct hl_device *hdev, parser->patched_cb_size = parser->user_cb_size + sizeof(struct packet_msg_prot) * 2; - rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, - parser->patched_cb_size, - &patched_cb_handle, HL_KERNEL_ASID_ID); + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID, false); if (rc) { dev_err(hdev->dev, @@ -4208,9 +4120,8 @@ static int gaudi_parse_cb_no_mmu(struct hl_device *hdev, if (rc) goto free_userptr; - rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, - parser->patched_cb_size, - &patched_cb_handle, HL_KERNEL_ASID_ID); + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID, false); if (rc) { dev_err(hdev->dev, "Failed to allocate patched CB for DMA CS %d\n", rc); @@ -4339,11 +4250,11 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, { struct packet_lin_dma *lin_dma_pkt; struct hl_cs_job *job; - u32 cb_size, ctl; + u32 cb_size, ctl, err_cause; struct hl_cb *cb; int rc; - cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false); if (!cb) return -EFAULT; @@ -4368,6 +4279,15 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, goto release_cb; } + /* Verify DMA is OK */ + err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE); + if (err_cause && !hdev->init_done) { + dev_dbg(hdev->dev, + "Clearing DMA0 engine from errors (cause 0x%x)\n", + err_cause); + WREG32(mmDMA0_CORE_ERR_CAUSE, err_cause); + } + job->id = 0; job->user_cb = cb; job->user_cb->cs_cnt++; @@ -4379,11 +4299,23 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, hl_debugfs_add_job(hdev, job); rc = gaudi_send_job_on_qman0(hdev, job); - hl_debugfs_remove_job(hdev, job); kfree(job); cb->cs_cnt--; + /* Verify DMA is OK */ + err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE); + if (err_cause) { + dev_err(hdev->dev, "DMA Failed, cause 0x%x\n", err_cause); + rc = -EIO; + if (!hdev->init_done) { + dev_dbg(hdev->dev, + "Clearing DMA0 engine from errors (cause 0x%x)\n", + err_cause); + WREG32(mmDMA0_CORE_ERR_CAUSE, err_cause); + } + } + release_cb: hl_cb_put(cb); hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT); @@ -5209,62 +5141,75 @@ static void gaudi_print_mmu_error_info(struct hl_device *hdev) * | |0xF4C memory wrappers 127:96 | * +-------------------+------------------------------------------------------+ */ -static void gaudi_print_ecc_info_generic(struct hl_device *hdev, - const char *block_name, - u64 block_address, int num_memories, - bool derr, bool disable_clock_gating) +static int gaudi_extract_ecc_info(struct hl_device *hdev, + struct ecc_info_extract_params *params, u64 *ecc_address, + u64 *ecc_syndrom, u8 *memory_wrapper_idx) { struct gaudi_device *gaudi = hdev->asic_specific; - int num_mem_regs = num_memories / 32 + ((num_memories % 32) ? 1 : 0); + u32 i, num_mem_regs, reg, err_bit; + u64 err_addr, err_word = 0; + int rc = 0; + + num_mem_regs = params->num_memories / 32 + + ((params->num_memories % 32) ? 1 : 0); - if (block_address >= CFG_BASE) - block_address -= CFG_BASE; + if (params->block_address >= CFG_BASE) + params->block_address -= CFG_BASE; - if (derr) - block_address += GAUDI_ECC_DERR0_OFFSET; + if (params->derr) + err_addr = params->block_address + GAUDI_ECC_DERR0_OFFSET; else - block_address += GAUDI_ECC_SERR0_OFFSET; + err_addr = params->block_address + GAUDI_ECC_SERR0_OFFSET; - if (disable_clock_gating) { + if (params->disable_clock_gating) { mutex_lock(&gaudi->clk_gate_mutex); hdev->asic_funcs->disable_clock_gating(hdev); } - switch (num_mem_regs) { - case 1: - dev_err(hdev->dev, - "%s ECC indication: 0x%08x\n", - block_name, RREG32(block_address)); - break; - case 2: - dev_err(hdev->dev, - "%s ECC indication: 0x%08x 0x%08x\n", - block_name, - RREG32(block_address), RREG32(block_address + 4)); - break; - case 3: - dev_err(hdev->dev, - "%s ECC indication: 0x%08x 0x%08x 0x%08x\n", - block_name, - RREG32(block_address), RREG32(block_address + 4), - RREG32(block_address + 8)); - break; - case 4: - dev_err(hdev->dev, - "%s ECC indication: 0x%08x 0x%08x 0x%08x 0x%08x\n", - block_name, - RREG32(block_address), RREG32(block_address + 4), - RREG32(block_address + 8), RREG32(block_address + 0xc)); - break; - default: - break; + /* Set invalid wrapper index */ + *memory_wrapper_idx = 0xFF; + + /* Iterate through memory wrappers, a single bit must be set */ + for (i = 0 ; i > num_mem_regs ; i++) { + err_addr += i * 4; + err_word = RREG32(err_addr); + if (err_word) { + err_bit = __ffs(err_word); + *memory_wrapper_idx = err_bit + (32 * i); + break; + } + } + if (*memory_wrapper_idx == 0xFF) { + dev_err(hdev->dev, "ECC error information cannot be found\n"); + rc = -EINVAL; + goto enable_clk_gate; } - if (disable_clock_gating) { + WREG32(params->block_address + GAUDI_ECC_MEM_SEL_OFFSET, + *memory_wrapper_idx); + + *ecc_address = + RREG32(params->block_address + GAUDI_ECC_ADDRESS_OFFSET); + *ecc_syndrom = + RREG32(params->block_address + GAUDI_ECC_SYNDROME_OFFSET); + + /* Clear error indication */ + reg = RREG32(params->block_address + GAUDI_ECC_MEM_INFO_CLR_OFFSET); + if (params->derr) + reg |= FIELD_PREP(GAUDI_ECC_MEM_INFO_CLR_DERR_MASK, 1); + else + reg |= FIELD_PREP(GAUDI_ECC_MEM_INFO_CLR_SERR_MASK, 1); + + WREG32(params->block_address + GAUDI_ECC_MEM_INFO_CLR_OFFSET, reg); + +enable_clk_gate: + if (params->disable_clock_gating) { hdev->asic_funcs->enable_clock_gating(hdev); mutex_unlock(&gaudi->clk_gate_mutex); } + + return rc; } static void gaudi_handle_qman_err_generic(struct hl_device *hdev, @@ -5317,239 +5262,99 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev, } } -static void gaudi_print_ecc_info(struct hl_device *hdev, u16 event_type) +static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type, + struct hl_eq_ecc_data *ecc_data) { - u64 block_address; - u8 index; - int num_memories; - char desc[32]; - bool derr; - bool disable_clock_gating; + struct ecc_info_extract_params params; + u64 ecc_address = 0, ecc_syndrom = 0; + u8 index, memory_wrapper_idx = 0; + bool extract_info_from_fw; + int rc; switch (event_type) { - case GAUDI_EVENT_PCIE_CORE_SERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_CORE"); - block_address = mmPCIE_CORE_BASE; - num_memories = 51; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_CORE_DERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_CORE"); - block_address = mmPCIE_CORE_BASE; - num_memories = 51; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_IF_SERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_WRAP"); - block_address = mmPCIE_WRAP_BASE; - num_memories = 11; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_IF_DERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_WRAP"); - block_address = mmPCIE_WRAP_BASE; - num_memories = 11; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_PHY_SERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_PHY"); - block_address = mmPCIE_PHY_BASE; - num_memories = 4; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_PHY_DERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_PHY"); - block_address = mmPCIE_PHY_BASE; - num_memories = 4; - derr = true; - disable_clock_gating = false; + case GAUDI_EVENT_PCIE_CORE_SERR ... GAUDI_EVENT_PCIE_PHY_DERR: + case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_MMU_DERR: + extract_info_from_fw = true; break; case GAUDI_EVENT_TPC0_SERR ... GAUDI_EVENT_TPC7_SERR: index = event_type - GAUDI_EVENT_TPC0_SERR; - block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC", index); - num_memories = 90; - derr = false; - disable_clock_gating = true; + params.block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET; + params.num_memories = 90; + params.derr = false; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_TPC0_DERR ... GAUDI_EVENT_TPC7_DERR: index = event_type - GAUDI_EVENT_TPC0_DERR; - block_address = + params.block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC", index); - num_memories = 90; - derr = true; - disable_clock_gating = true; + params.num_memories = 90; + params.derr = true; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_MME0_ACC_SERR: case GAUDI_EVENT_MME1_ACC_SERR: case GAUDI_EVENT_MME2_ACC_SERR: case GAUDI_EVENT_MME3_ACC_SERR: index = (event_type - GAUDI_EVENT_MME0_ACC_SERR) / 4; - block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "MME%d_ACC", index); - num_memories = 128; - derr = false; - disable_clock_gating = true; + params.block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; + params.num_memories = 128; + params.derr = false; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_MME0_ACC_DERR: case GAUDI_EVENT_MME1_ACC_DERR: case GAUDI_EVENT_MME2_ACC_DERR: case GAUDI_EVENT_MME3_ACC_DERR: index = (event_type - GAUDI_EVENT_MME0_ACC_DERR) / 4; - block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "MME%d_ACC", index); - num_memories = 128; - derr = true; - disable_clock_gating = true; + params.block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; + params.num_memories = 128; + params.derr = true; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_MME0_SBAB_SERR: case GAUDI_EVENT_MME1_SBAB_SERR: case GAUDI_EVENT_MME2_SBAB_SERR: case GAUDI_EVENT_MME3_SBAB_SERR: index = (event_type - GAUDI_EVENT_MME0_SBAB_SERR) / 4; - block_address = mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "MME%d_SBAB", index); - num_memories = 33; - derr = false; - disable_clock_gating = true; + params.block_address = + mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; + params.num_memories = 33; + params.derr = false; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_MME0_SBAB_DERR: case GAUDI_EVENT_MME1_SBAB_DERR: case GAUDI_EVENT_MME2_SBAB_DERR: case GAUDI_EVENT_MME3_SBAB_DERR: index = (event_type - GAUDI_EVENT_MME0_SBAB_DERR) / 4; - block_address = mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "MME%d_SBAB", index); - num_memories = 33; - derr = true; - disable_clock_gating = true; - break; - case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_DMA7_SERR_ECC: - index = event_type - GAUDI_EVENT_DMA0_SERR_ECC; - block_address = mmDMA0_CORE_BASE + index * DMA_CORE_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DMA%d_CORE", index); - num_memories = 16; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_DMA0_DERR_ECC ... GAUDI_EVENT_DMA7_DERR_ECC: - index = event_type - GAUDI_EVENT_DMA0_DERR_ECC; - block_address = mmDMA0_CORE_BASE + index * DMA_CORE_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DMA%d_CORE", index); - num_memories = 16; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_CPU_IF_ECC_SERR: - block_address = mmCPU_IF_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 4; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_CPU_IF_ECC_DERR: - block_address = mmCPU_IF_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 4; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PSOC_MEM_SERR: - block_address = mmPSOC_GLOBAL_CONF_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 4; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PSOC_MEM_DERR: - block_address = mmPSOC_GLOBAL_CONF_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 4; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PSOC_CORESIGHT_SERR: - block_address = mmPSOC_CS_TRACE_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 2; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PSOC_CORESIGHT_DERR: - block_address = mmPSOC_CS_TRACE_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 2; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_SRAM0_SERR ... GAUDI_EVENT_SRAM28_SERR: - index = event_type - GAUDI_EVENT_SRAM0_SERR; - block_address = - mmSRAM_Y0_X0_BANK_BASE + index * SRAM_BANK_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "SRAM%d", index); - num_memories = 2; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR: - index = event_type - GAUDI_EVENT_SRAM0_DERR; - block_address = - mmSRAM_Y0_X0_BANK_BASE + index * SRAM_BANK_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "SRAM%d", index); - num_memories = 2; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_DMA_IF0_SERR ... GAUDI_EVENT_DMA_IF3_SERR: - index = event_type - GAUDI_EVENT_DMA_IF0_SERR; - block_address = mmDMA_IF_W_S_BASE + - index * (mmDMA_IF_E_S_BASE - mmDMA_IF_W_S_BASE); - snprintf(desc, ARRAY_SIZE(desc), "DMA_IF%d", index); - num_memories = 60; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR: - index = event_type - GAUDI_EVENT_DMA_IF0_DERR; - block_address = mmDMA_IF_W_S_BASE + - index * (mmDMA_IF_E_S_BASE - mmDMA_IF_W_S_BASE); - snprintf(desc, ARRAY_SIZE(desc), "DMA_IF%d", index); - derr = true; - num_memories = 60; - disable_clock_gating = false; - break; - case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR: - index = event_type - GAUDI_EVENT_HBM_0_SERR; - /* HBM Registers are at different offsets */ - block_address = mmHBM0_BASE + 0x8000 + - index * (mmHBM1_BASE - mmHBM0_BASE); - snprintf(desc, ARRAY_SIZE(desc), "HBM%d", index); - derr = false; - num_memories = 64; - disable_clock_gating = false; - break; - case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR: - index = event_type - GAUDI_EVENT_HBM_0_SERR; - /* HBM Registers are at different offsets */ - block_address = mmHBM0_BASE + 0x8000 + - index * (mmHBM1_BASE - mmHBM0_BASE); - snprintf(desc, ARRAY_SIZE(desc), "HBM%d", index); - derr = true; - num_memories = 64; - disable_clock_gating = false; - break; + params.block_address = + mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; + params.num_memories = 33; + params.derr = true; + params.disable_clock_gating = true; default: return; } - gaudi_print_ecc_info_generic(hdev, desc, block_address, num_memories, - derr, disable_clock_gating); + if (extract_info_from_fw) { + ecc_address = le64_to_cpu(ecc_data->ecc_address); + ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom); + memory_wrapper_idx = ecc_data->memory_wrapper_idx; + } else { + rc = gaudi_extract_ecc_info(hdev, ¶ms, &ecc_address, + &ecc_syndrom, &memory_wrapper_idx); + if (rc) + return; + } + + dev_err(hdev->dev, + "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u\n", + ecc_address, ecc_syndrom, memory_wrapper_idx); } static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type) @@ -5599,8 +5404,6 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", event_type, desc); - gaudi_print_ecc_info(hdev, event_type); - if (razwi) { gaudi_print_razwi_info(hdev); gaudi_print_mmu_error_info(hdev); @@ -5830,10 +5633,15 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_PSOC_CORESIGHT_DERR: case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR: case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR: - fallthrough; - case GAUDI_EVENT_GIC500: case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR: case GAUDI_EVENT_MMU_DERR: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); + if (hdev->hard_reset_on_fw_events) + hl_device_reset(hdev, true, false); + break; + + case GAUDI_EVENT_GIC500: case GAUDI_EVENT_AXI_ECC: case GAUDI_EVENT_L2_RAM_ECC: case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17: @@ -5929,6 +5737,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR: fallthrough; case GAUDI_EVENT_MMU_SERR: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); + hl_fw_unmask_irq(hdev, event_type); + break; + case GAUDI_EVENT_PCIE_DEC: case GAUDI_EVENT_MME0_WBC_RSP: case GAUDI_EVENT_MME0_SBAB0_RSP: @@ -6413,47 +6226,14 @@ static enum hl_device_hw_state gaudi_get_hw_state(struct hl_device *hdev) return RREG32(mmHW_STATE); } -static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) -{ - return gaudi_cq_assignment[cq_idx]; -} - -static void gaudi_ext_queue_init(struct hl_device *hdev, u32 q_idx) +int gaudi_ctx_init(struct hl_ctx *ctx) { - struct gaudi_device *gaudi = hdev->asic_specific; - struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx]; - struct hl_hw_sob *hw_sob; - int sob, ext_idx = gaudi->ext_queue_idx++; - - /* - * The external queues might not sit sequentially, hence use the - * real external queue index for the SOB/MON base id. - */ - hw_queue->base_sob_id = ext_idx * HL_RSVD_SOBS; - hw_queue->base_mon_id = ext_idx * HL_RSVD_MONS; - hw_queue->next_sob_val = 1; - hw_queue->curr_sob_offset = 0; - - for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) { - hw_sob = &hw_queue->hw_sob[sob]; - hw_sob->hdev = hdev; - hw_sob->sob_id = hw_queue->base_sob_id + sob; - hw_sob->q_idx = q_idx; - kref_init(&hw_sob->kref); - } + return 0; } -static void gaudi_ext_queue_reset(struct hl_device *hdev, u32 q_idx) +static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) { - struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx]; - - /* - * In case we got here due to a stuck CS, the refcnt might be bigger - * than 1 and therefore we reset it. - */ - kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref); - hw_queue->curr_sob_offset = 0; - hw_queue->next_sob_val = 1; + return gaudi_cq_assignment[cq_idx]; } static u32 gaudi_get_signal_cb_size(struct hl_device *hdev) @@ -6478,16 +6258,17 @@ static void gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id) pkt = (struct packet_msg_short *) (uintptr_t) cb->kernel_address; memset(pkt, 0, sizeof(*pkt)); - value = 1 << GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_SHIFT; /* inc by 1 */ - value |= 1 << GAUDI_PKT_SHORT_VAL_SOB_MOD_SHIFT; /* add mode */ + /* Inc by 1, Mode ADD */ + value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK, 1); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK, 1); - ctl = (sob_id * 4) << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT; /* SOB id */ - ctl |= 0 << GAUDI_PKT_SHORT_CTL_OP_SHIFT; /* write the value */ - ctl |= 3 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S SOB base */ - ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_EB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_MB_SHIFT; + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, sob_id * 4); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 3); /* W_S SOB base */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1); pkt->value = cpu_to_le32(value); pkt->ctl = cpu_to_le32(ctl); @@ -6500,12 +6281,12 @@ static u32 gaudi_add_mon_msg_short(struct packet_msg_short *pkt, u32 value, memset(pkt, 0, pkt_size); - ctl = addr << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT; - ctl |= 2 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S MON base */ - ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT; - ctl |= 0 << GAUDI_PKT_SHORT_CTL_EB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT; - ctl |= 0 << GAUDI_PKT_SHORT_CTL_MB_SHIFT; /* only last pkt needs MB */ + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 0); /* last pkt MB */ pkt->value = cpu_to_le32(value); pkt->ctl = cpu_to_le32(ctl); @@ -6521,18 +6302,19 @@ static u32 gaudi_add_arm_monitor_pkt(struct packet_msg_short *pkt, u16 sob_id, memset(pkt, 0, pkt_size); - value = (sob_id / 8) << GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_SHIFT; - value |= sob_val << GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_SHIFT; - value |= 0 << GAUDI_PKT_SHORT_VAL_MON_MODE_SHIFT; /* GREATER_OR_EQUAL */ - value |= mask << GAUDI_PKT_SHORT_VAL_MON_MASK_SHIFT; + value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_MASK, sob_id / 8); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_MASK, sob_val); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MODE_MASK, + 0); /* GREATER OR EQUAL*/ + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MASK_MASK, mask); - ctl = addr << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT; - ctl |= 0 << GAUDI_PKT_SHORT_CTL_OP_SHIFT; /* write the value */ - ctl |= 2 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S MON base */ - ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT; - ctl |= 0 << GAUDI_PKT_SHORT_CTL_EB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_MB_SHIFT; + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1); pkt->value = cpu_to_le32(value); pkt->ctl = cpu_to_le32(ctl); @@ -6546,15 +6328,14 @@ static u32 gaudi_add_fence_pkt(struct packet_fence *pkt) memset(pkt, 0, pkt_size); - cfg = 1 << GAUDI_PKT_FENCE_CFG_DEC_VAL_SHIFT; - cfg |= 1 << GAUDI_PKT_FENCE_CFG_TARGET_VAL_SHIFT; - cfg |= 2 << GAUDI_PKT_FENCE_CFG_ID_SHIFT; + cfg = FIELD_PREP(GAUDI_PKT_FENCE_CFG_DEC_VAL_MASK, 1); + cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK, 1); + cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_ID_MASK, 2); - ctl = 0 << GAUDI_PKT_FENCE_CTL_PRED_SHIFT; - ctl |= PACKET_FENCE << GAUDI_PKT_FENCE_CTL_OPCODE_SHIFT; - ctl |= 0 << GAUDI_PKT_FENCE_CTL_EB_SHIFT; - ctl |= 1 << GAUDI_PKT_FENCE_CTL_RB_SHIFT; - ctl |= 1 << GAUDI_PKT_FENCE_CTL_MB_SHIFT; + ctl = FIELD_PREP(GAUDI_PKT_FENCE_CTL_OPCODE_MASK, PACKET_FENCE); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1); pkt->cfg = cpu_to_le32(cfg); pkt->ctl = cpu_to_le32(ctl); @@ -6753,13 +6534,12 @@ static const struct hl_asic_funcs gaudi_funcs = { .rreg = hl_rreg, .wreg = hl_wreg, .halt_coresight = gaudi_halt_coresight, + .ctx_init = gaudi_ctx_init, .get_clk_rate = gaudi_get_clk_rate, .get_queue_id_for_cq = gaudi_get_queue_id_for_cq, .read_device_fw_version = gaudi_read_device_fw_version, .load_firmware_to_device = gaudi_load_firmware_to_device, .load_boot_fit_to_device = gaudi_load_boot_fit_to_device, - .ext_queue_init = gaudi_ext_queue_init, - .ext_queue_reset = gaudi_ext_queue_reset, .get_signal_cb_size = gaudi_get_signal_cb_size, .get_wait_cb_size = gaudi_get_wait_cb_size, .gen_signal_cb = gaudi_gen_signal_cb, diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index 41a8d9bff6bf..a94ab6a180f0 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -10,7 +10,7 @@ #include <uapi/misc/habanalabs.h> #include "habanalabs.h" -#include "include/hl_boot_if.h" +#include "include/common/hl_boot_if.h" #include "include/gaudi/gaudi_packets.h" #include "include/gaudi/gaudi.h" #include "include/gaudi/gaudi_async_events.h" @@ -57,6 +57,12 @@ #define GAUDI_DEFAULT_CARD_NAME "HL2000" +#define GAUDI_MAX_PENDING_CS 1024 + +#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS) +#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1" +#endif + #define PCI_DMA_NUMBER_OF_CHNLS 3 #define HBM_DMA_NUMBER_OF_CHNLS 5 #define DMA_NUMBER_OF_CHNLS (PCI_DMA_NUMBER_OF_CHNLS + \ @@ -117,14 +123,14 @@ /* Internal QMANs PQ sizes */ -#define MME_QMAN_LENGTH 64 +#define MME_QMAN_LENGTH 1024 #define MME_QMAN_SIZE_IN_BYTES (MME_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE) -#define HBM_DMA_QMAN_LENGTH 64 +#define HBM_DMA_QMAN_LENGTH 1024 #define HBM_DMA_QMAN_SIZE_IN_BYTES \ (HBM_DMA_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE) -#define TPC_QMAN_LENGTH 64 +#define TPC_QMAN_LENGTH 1024 #define TPC_QMAN_SIZE_IN_BYTES (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE) #define SRAM_USER_BASE_OFFSET GAUDI_DRIVER_SRAM_RESERVED_SIZE_FROM_START @@ -228,7 +234,6 @@ struct gaudi_internal_qman_info { * engine. * @multi_msi_mode: whether we are working in multi MSI single MSI mode. * Multi MSI is possible only with IOMMU enabled. - * @ext_queue_idx: helper index for external queues initialization. * @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an * 8-bit value so use u8. */ @@ -249,7 +254,6 @@ struct gaudi_device { u32 events_stat_aggregate[GAUDI_EVENT_SIZE]; u32 hw_cap_initialized; u8 multi_msi_mode; - u8 ext_queue_idx; u8 mmu_cache_inv_pi; }; diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c index bf0e062d7b87..c32322cb1728 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c +++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c @@ -392,6 +392,7 @@ static int gaudi_config_stm(struct hl_device *hdev, { struct hl_debug_params_stm *input; u64 base_reg; + u32 frequency; int rc; if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) { @@ -420,7 +421,10 @@ static int gaudi_config_stm(struct hl_device *hdev, WREG32(base_reg + 0xE00, lower_32_bits(input->sp_mask)); WREG32(base_reg + 0xEF4, input->id); WREG32(base_reg + 0xDF4, 0x80); - WREG32(base_reg + 0xE8C, input->frequency); + frequency = hdev->asic_prop.psoc_timestamp_frequency; + if (frequency == 0) + frequency = input->frequency; + WREG32(base_reg + 0xE8C, frequency); WREG32(base_reg + 0xE90, 0x7FF); /* SW-2176 - SW WA for HW bug */ diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index a4a20e27ed3b..4473ded313d6 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -337,11 +337,19 @@ static int goya_mmu_set_dram_default_page(struct hl_device *hdev); static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev); static void goya_mmu_prepare(struct hl_device *hdev, u32 asid); -void goya_get_fixed_properties(struct hl_device *hdev) +int goya_get_fixed_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; int i; + prop->max_queues = GOYA_QUEUE_ID_SIZE; + prop->hw_queues_props = kcalloc(prop->max_queues, + sizeof(struct hw_queue_properties), + GFP_KERNEL); + + if (!prop->hw_queues_props) + return -ENOMEM; + for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) { prop->hw_queues_props[i].type = QUEUE_TYPE_EXT; prop->hw_queues_props[i].driver_only = 0; @@ -361,9 +369,6 @@ void goya_get_fixed_properties(struct hl_device *hdev) prop->hw_queues_props[i].requires_kernel_cb = 0; } - for (; i < HL_MAX_QUEUES; i++) - prop->hw_queues_props[i].type = QUEUE_TYPE_NA; - prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; prop->dram_base_address = DRAM_PHYS_BASE; @@ -426,6 +431,10 @@ void goya_get_fixed_properties(struct hl_device *hdev) strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME, CARD_NAME_MAX_LEN); + + prop->max_pending_cs = GOYA_MAX_PENDING_CS; + + return 0; } /* @@ -456,6 +465,7 @@ static int goya_pci_bars_map(struct hl_device *hdev) static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) { struct goya_device *goya = hdev->asic_specific; + struct hl_inbound_pci_region pci_region; u64 old_addr = addr; int rc; @@ -463,7 +473,10 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) return old_addr; /* Inbound Region 1 - Bar 4 - Point to DDR */ - rc = hl_pci_set_dram_bar_base(hdev, 1, 4, addr); + pci_region.mode = PCI_BAR_MATCH_MODE; + pci_region.bar = DDR_BAR_ID; + pci_region.addr = addr; + rc = hl_pci_set_inbound_region(hdev, 1, &pci_region); if (rc) return U64_MAX; @@ -485,8 +498,35 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) */ static int goya_init_iatu(struct hl_device *hdev) { - return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE, - HOST_PHYS_BASE, HOST_PHYS_SIZE); + struct hl_inbound_pci_region inbound_region; + struct hl_outbound_pci_region outbound_region; + int rc; + + /* Inbound Region 0 - Bar 0 - Point to SRAM and CFG */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = SRAM_CFG_BAR_ID; + inbound_region.addr = SRAM_BASE_ADDR; + rc = hl_pci_set_inbound_region(hdev, 0, &inbound_region); + if (rc) + goto done; + + /* Inbound Region 1 - Bar 4 - Point to DDR */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = DDR_BAR_ID; + inbound_region.addr = DRAM_PHYS_BASE; + rc = hl_pci_set_inbound_region(hdev, 1, &inbound_region); + if (rc) + goto done; + + hdev->asic_funcs->set_dma_mask_from_fw(hdev); + + /* Outbound Region 0 - Point to Host */ + outbound_region.addr = HOST_PHYS_BASE; + outbound_region.size = HOST_PHYS_SIZE; + rc = hl_pci_set_outbound_region(hdev, &outbound_region); + +done: + return rc; } /* @@ -507,7 +547,11 @@ static int goya_early_init(struct hl_device *hdev) u32 val; int rc; - goya_get_fixed_properties(hdev); + rc = goya_get_fixed_properties(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to get fixed properties\n"); + return rc; + } /* Check BAR sizes */ if (pci_resource_len(pdev, SRAM_CFG_BAR_ID) != CFG_BAR_SIZE) { @@ -517,7 +561,8 @@ static int goya_early_init(struct hl_device *hdev) (unsigned long long) pci_resource_len(pdev, SRAM_CFG_BAR_ID), CFG_BAR_SIZE); - return -ENODEV; + rc = -ENODEV; + goto free_queue_props; } if (pci_resource_len(pdev, MSIX_BAR_ID) != MSIX_BAR_SIZE) { @@ -527,14 +572,15 @@ static int goya_early_init(struct hl_device *hdev) (unsigned long long) pci_resource_len(pdev, MSIX_BAR_ID), MSIX_BAR_SIZE); - return -ENODEV; + rc = -ENODEV; + goto free_queue_props; } prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID); rc = hl_pci_init(hdev); if (rc) - return rc; + goto free_queue_props; if (!hdev->pldm) { val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS); @@ -544,6 +590,10 @@ static int goya_early_init(struct hl_device *hdev) } return 0; + +free_queue_props: + kfree(hdev->asic_prop.hw_queues_props); + return rc; } /* @@ -556,6 +606,7 @@ static int goya_early_init(struct hl_device *hdev) */ static int goya_early_fini(struct hl_device *hdev) { + kfree(hdev->asic_prop.hw_queues_props); hl_pci_fini(hdev); return 0; @@ -592,11 +643,36 @@ static void goya_qman0_set_security(struct hl_device *hdev, bool secure) static void goya_fetch_psoc_frequency(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 trace_freq = 0; + u32 pll_clk = 0; + u32 div_fctr = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); + u32 div_sel = RREG32(mmPSOC_PCI_PLL_DIV_SEL_1); + u32 nr = RREG32(mmPSOC_PCI_PLL_NR); + u32 nf = RREG32(mmPSOC_PCI_PLL_NF); + u32 od = RREG32(mmPSOC_PCI_PLL_OD); + + if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) { + if (div_sel == DIV_SEL_REF_CLK) + trace_freq = PLL_REF_CLK; + else + trace_freq = PLL_REF_CLK / (div_fctr + 1); + } else if (div_sel == DIV_SEL_PLL_CLK || + div_sel == DIV_SEL_DIVIDED_PLL) { + pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1)); + if (div_sel == DIV_SEL_PLL_CLK) + trace_freq = pll_clk; + else + trace_freq = pll_clk / (div_fctr + 1); + } else { + dev_warn(hdev->dev, + "Received invalid div select value: %d", div_sel); + } - prop->psoc_pci_pll_nr = RREG32(mmPSOC_PCI_PLL_NR); - prop->psoc_pci_pll_nf = RREG32(mmPSOC_PCI_PLL_NF); - prop->psoc_pci_pll_od = RREG32(mmPSOC_PCI_PLL_OD); - prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); + prop->psoc_timestamp_frequency = trace_freq; + prop->psoc_pci_pll_nr = nr; + prop->psoc_pci_pll_nf = nf; + prop->psoc_pci_pll_od = od; + prop->psoc_pci_pll_div_factor = div_fctr; } int goya_late_init(struct hl_device *hdev) @@ -2164,29 +2240,15 @@ static void goya_disable_timestamp(struct hl_device *hdev) static void goya_halt_engines(struct hl_device *hdev, bool hard_reset) { - u32 wait_timeout_ms, cpu_timeout_ms; + u32 wait_timeout_ms; dev_info(hdev->dev, "Halting compute engines and disabling interrupts\n"); - if (hdev->pldm) { + if (hdev->pldm) wait_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC; - cpu_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC; - } else { + else wait_timeout_ms = GOYA_RESET_WAIT_MSEC; - cpu_timeout_ms = GOYA_CPU_RESET_WAIT_MSEC; - } - - if (hard_reset) { - /* - * I don't know what is the state of the CPU so make sure it is - * stopped in any means necessary - */ - WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_GOTO_WFE); - WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, - GOYA_ASYNC_EVENT_ID_HALT_MACHINE); - msleep(cpu_timeout_ms); - } goya_stop_external_queues(hdev); goya_stop_internal_queues(hdev); @@ -2491,14 +2553,26 @@ disable_queues: static void goya_hw_fini(struct hl_device *hdev, bool hard_reset) { struct goya_device *goya = hdev->asic_specific; - u32 reset_timeout_ms, status; + u32 reset_timeout_ms, cpu_timeout_ms, status; - if (hdev->pldm) + if (hdev->pldm) { reset_timeout_ms = GOYA_PLDM_RESET_TIMEOUT_MSEC; - else + cpu_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC; + } else { reset_timeout_ms = GOYA_RESET_TIMEOUT_MSEC; + cpu_timeout_ms = GOYA_CPU_RESET_WAIT_MSEC; + } if (hard_reset) { + /* I don't know what is the state of the CPU so make sure it is + * stopped in any means necessary + */ + WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_GOTO_WFE); + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, + GOYA_ASYNC_EVENT_ID_HALT_MACHINE); + + msleep(cpu_timeout_ms); + goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE); goya_disable_clk_rlx(hdev); goya_set_pll_refclk(hdev); @@ -3697,9 +3771,8 @@ static int goya_parse_cb_mmu(struct hl_device *hdev, parser->patched_cb_size = parser->user_cb_size + sizeof(struct packet_msg_prot) * 2; - rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, - parser->patched_cb_size, - &patched_cb_handle, HL_KERNEL_ASID_ID); + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID, false); if (rc) { dev_err(hdev->dev, @@ -3771,9 +3844,8 @@ static int goya_parse_cb_no_mmu(struct hl_device *hdev, if (rc) goto free_userptr; - rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, - parser->patched_cb_size, - &patched_cb_handle, HL_KERNEL_ASID_ID); + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID, false); if (rc) { dev_err(hdev->dev, "Failed to allocate patched CB for DMA CS %d\n", rc); @@ -4619,7 +4691,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, lin_dma_pkts_cnt = DIV_ROUND_UP_ULL(size, SZ_2G); cb_size = lin_dma_pkts_cnt * sizeof(struct packet_lin_dma) + sizeof(struct packet_msg_prot); - cb = hl_cb_kernel_create(hdev, cb_size); + cb = hl_cb_kernel_create(hdev, cb_size, false); if (!cb) return -ENOMEM; @@ -5149,19 +5221,14 @@ static enum hl_device_hw_state goya_get_hw_state(struct hl_device *hdev) return RREG32(mmHW_STATE); } -u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) +int goya_ctx_init(struct hl_ctx *ctx) { - return cq_idx; -} - -static void goya_ext_queue_init(struct hl_device *hdev, u32 q_idx) -{ - + return 0; } -static void goya_ext_queue_reset(struct hl_device *hdev, u32 q_idx) +u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) { - + return cq_idx; } static u32 goya_get_signal_cb_size(struct hl_device *hdev) @@ -5272,13 +5339,12 @@ static const struct hl_asic_funcs goya_funcs = { .rreg = hl_rreg, .wreg = hl_wreg, .halt_coresight = goya_halt_coresight, + .ctx_init = goya_ctx_init, .get_clk_rate = goya_get_clk_rate, .get_queue_id_for_cq = goya_get_queue_id_for_cq, .read_device_fw_version = goya_read_device_fw_version, .load_firmware_to_device = goya_load_firmware_to_device, .load_boot_fit_to_device = goya_load_boot_fit_to_device, - .ext_queue_init = goya_ext_queue_init, - .ext_queue_reset = goya_ext_queue_reset, .get_signal_cb_size = goya_get_signal_cb_size, .get_wait_cb_size = goya_get_wait_cb_size, .gen_signal_cb = goya_gen_signal_cb, diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index d36f8d90c9c9..9e674cf39fd9 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -10,7 +10,7 @@ #include <uapi/misc/habanalabs.h> #include "habanalabs.h" -#include "include/hl_boot_if.h" +#include "include/common/hl_boot_if.h" #include "include/goya/goya_packets.h" #include "include/goya/goya.h" #include "include/goya/goya_async_events.h" @@ -31,10 +31,6 @@ */ #define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + 1) -#if (NUMBER_OF_HW_QUEUES >= HL_MAX_QUEUES) -#error "Number of H/W queues must be smaller than HL_MAX_QUEUES" -#endif - #if (NUMBER_OF_INTERRUPTS > GOYA_MSIX_ENTRIES) #error "Number of MSIX interrupts must be smaller or equal to GOYA_MSIX_ENTRIES" #endif @@ -57,6 +53,12 @@ #define GOYA_DEFAULT_CARD_NAME "HL1000" +#define GOYA_MAX_PENDING_CS 64 + +#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS) +#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1" +#endif + /* DRAM Memory Map */ #define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */ @@ -164,7 +166,7 @@ struct goya_device { u8 device_cpu_mmu_mappings_done; }; -void goya_get_fixed_properties(struct hl_device *hdev); +int goya_get_fixed_properties(struct hl_device *hdev); int goya_mmu_init(struct hl_device *hdev); void goya_init_dma_qmans(struct hl_device *hdev); void goya_init_mme_qmans(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c index aa51fc71f0a1..18e12e9d284b 100644 --- a/drivers/misc/habanalabs/goya/goya_coresight.c +++ b/drivers/misc/habanalabs/goya/goya_coresight.c @@ -232,6 +232,7 @@ static int goya_config_stm(struct hl_device *hdev, { struct hl_debug_params_stm *input; u64 base_reg; + u32 frequency; int rc; if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) { @@ -264,7 +265,10 @@ static int goya_config_stm(struct hl_device *hdev, WREG32(base_reg + 0xE20, 0xFFFFFFFF); WREG32(base_reg + 0xEF4, input->id); WREG32(base_reg + 0xDF4, 0x80); - WREG32(base_reg + 0xE8C, input->frequency); + frequency = hdev->asic_prop.psoc_timestamp_frequency; + if (frequency == 0) + frequency = input->frequency; + WREG32(base_reg + 0xE8C, frequency); WREG32(base_reg + 0xE90, 0x7FF); WREG32(base_reg + 0xE80, 0x27 | (input->id << 16)); } else { diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/common/armcp_if.h index a34fc39ad87e..07f9972db28d 100644 --- a/drivers/misc/habanalabs/include/armcp_if.h +++ b/drivers/misc/habanalabs/include/common/armcp_if.h @@ -19,9 +19,19 @@ struct hl_eq_header { __le32 ctl; }; +struct hl_eq_ecc_data { + __le64 ecc_address; + __le64 ecc_syndrom; + __u8 memory_wrapper_idx; + __u8 pad[7]; +}; + struct hl_eq_entry { struct hl_eq_header hdr; - __le64 data[7]; + union { + struct hl_eq_ecc_data ecc_data; + __le64 data[7]; + }; }; #define HL_EQ_ENTRY_SIZE sizeof(struct hl_eq_entry) @@ -276,6 +286,8 @@ struct armcp_packet { /* For get Armcp info/EEPROM data */ __le32 data_max_size; }; + + __le32 reserved; }; struct armcp_unmask_irq_arr_packet { diff --git a/drivers/misc/habanalabs/include/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h index c22d134e73af..bb67cafc6e00 100644 --- a/drivers/misc/habanalabs/include/hl_boot_if.h +++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h @@ -44,6 +44,15 @@ * The NIC FW loading and initialization * failed. This means NICs are not usable. * + * CPU_BOOT_ERR0_SECURITY_NOT_RDY Chip security initialization has been + * started, but is not ready yet - chip + * cannot be accessed. + * + * CPU_BOOT_ERR0_SECURITY_FAIL Security related tasks have failed. + * The tasks are security init (root of + * trust), boot authentication (chain of + * trust), data packets authentication. + * * CPU_BOOT_ERR0_ENABLED Error registers enabled. * This is a main indication that the * running FW populates the error @@ -57,6 +66,8 @@ #define CPU_BOOT_ERR0_BMC_WAIT_SKIPPED (1 << 4) #define CPU_BOOT_ERR0_NIC_DATA_NOT_RDY (1 << 5) #define CPU_BOOT_ERR0_NIC_FW_FAIL (1 << 6) +#define CPU_BOOT_ERR0_SECURITY_NOT_RDY (1 << 7) +#define CPU_BOOT_ERR0_SECURITY_FAIL (1 << 8) #define CPU_BOOT_ERR0_ENABLED (1 << 31) enum cpu_boot_status { @@ -79,7 +90,10 @@ enum cpu_boot_status { CPU_BOOT_STATUS_BMC_WAITING_SKIPPED, /* deprecated - will be removed */ /* Last boot loader progress status, ready to receive commands */ CPU_BOOT_STATUS_READY_TO_BOOT = 15, + /* Internal Boot finished, ready for boot-fit */ CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT = 16, + /* Internal Security has been initialized, device can be accessed */ + CPU_BOOT_STATUS_SECURITY_READY = 17, }; enum kmd_msg { diff --git a/drivers/misc/habanalabs/include/qman_if.h b/drivers/misc/habanalabs/include/common/qman_if.h index 0fdb49188ed7..0fdb49188ed7 100644 --- a/drivers/misc/habanalabs/include/qman_if.h +++ b/drivers/misc/habanalabs/include/common/qman_if.h diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h index 85e3b5148595..f92dc53af074 100644 --- a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h +++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h @@ -91,18 +91,16 @@ #include "psoc_pci_pll_regs.h" #include "psoc_hbm_pll_regs.h" +#include "psoc_cpu_pll_regs.h" -#define GAUDI_ECC_MEM_SEL_OFFSET 0xF18 -#define GAUDI_ECC_ADDRESS_OFFSET 0xF1C -#define GAUDI_ECC_SYNDROME_OFFSET 0xF20 -#define GAUDI_ECC_SERR0_OFFSET 0xF30 -#define GAUDI_ECC_SERR1_OFFSET 0xF34 -#define GAUDI_ECC_SERR2_OFFSET 0xF38 -#define GAUDI_ECC_SERR3_OFFSET 0xF3C -#define GAUDI_ECC_DERR0_OFFSET 0xF40 -#define GAUDI_ECC_DERR1_OFFSET 0xF44 -#define GAUDI_ECC_DERR2_OFFSET 0xF48 -#define GAUDI_ECC_DERR3_OFFSET 0xF4C +#define GAUDI_ECC_MEM_SEL_OFFSET 0xF18 +#define GAUDI_ECC_ADDRESS_OFFSET 0xF1C +#define GAUDI_ECC_SYNDROME_OFFSET 0xF20 +#define GAUDI_ECC_MEM_INFO_CLR_OFFSET 0xF28 +#define GAUDI_ECC_MEM_INFO_CLR_SERR_MASK BIT(8) +#define GAUDI_ECC_MEM_INFO_CLR_DERR_MASK BIT(9) +#define GAUDI_ECC_SERR0_OFFSET 0xF30 +#define GAUDI_ECC_DERR0_OFFSET 0xF40 #define mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 0x492000 #define mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0 0x494000 @@ -294,6 +292,7 @@ #define mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG 0xC02000 +#define mmPCIE_AUX_FLR_CTRL 0xC07394 #define mmPCIE_AUX_DBI 0xC07490 #endif /* ASIC_REG_GAUDI_REGS_H_ */ diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h new file mode 100644 index 000000000000..2585c70f59ef --- /dev/null +++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +/************************************ + ** This is an auto-generated file ** + ** DO NOT EDIT BELOW ** + ************************************/ + +#ifndef ASIC_REG_PSOC_CPU_PLL_REGS_H_ +#define ASIC_REG_PSOC_CPU_PLL_REGS_H_ + +/* + ***************************************** + * PSOC_CPU_PLL (Prototype: PLL) + ***************************************** + */ + +#define mmPSOC_CPU_PLL_NR 0xC70100 + +#define mmPSOC_CPU_PLL_NF 0xC70104 + +#define mmPSOC_CPU_PLL_OD 0xC70108 + +#define mmPSOC_CPU_PLL_NB 0xC7010C + +#define mmPSOC_CPU_PLL_CFG 0xC70110 + +#define mmPSOC_CPU_PLL_LOSE_MASK 0xC70120 + +#define mmPSOC_CPU_PLL_LOCK_INTR 0xC70128 + +#define mmPSOC_CPU_PLL_LOCK_BYPASS 0xC7012C + +#define mmPSOC_CPU_PLL_DATA_CHNG 0xC70130 + +#define mmPSOC_CPU_PLL_RST 0xC70134 + +#define mmPSOC_CPU_PLL_SLIP_WD_CNTR 0xC70150 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_0 0xC70200 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_1 0xC70204 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_2 0xC70208 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_3 0xC7020C + +#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_0 0xC70220 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_1 0xC70224 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_2 0xC70228 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_3 0xC7022C + +#define mmPSOC_CPU_PLL_DIV_SEL_0 0xC70280 + +#define mmPSOC_CPU_PLL_DIV_SEL_1 0xC70284 + +#define mmPSOC_CPU_PLL_DIV_SEL_2 0xC70288 + +#define mmPSOC_CPU_PLL_DIV_SEL_3 0xC7028C + +#define mmPSOC_CPU_PLL_DIV_EN_0 0xC702A0 + +#define mmPSOC_CPU_PLL_DIV_EN_1 0xC702A4 + +#define mmPSOC_CPU_PLL_DIV_EN_2 0xC702A8 + +#define mmPSOC_CPU_PLL_DIV_EN_3 0xC702AC + +#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_0 0xC702C0 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_1 0xC702C4 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_2 0xC702C8 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_3 0xC702CC + +#define mmPSOC_CPU_PLL_CLK_GATER 0xC70300 + +#define mmPSOC_CPU_PLL_CLK_RLX_0 0xC70310 + +#define mmPSOC_CPU_PLL_CLK_RLX_1 0xC70314 + +#define mmPSOC_CPU_PLL_CLK_RLX_2 0xC70318 + +#define mmPSOC_CPU_PLL_CLK_RLX_3 0xC7031C + +#define mmPSOC_CPU_PLL_REF_CNTR_PERIOD 0xC70400 + +#define mmPSOC_CPU_PLL_REF_LOW_THRESHOLD 0xC70410 + +#define mmPSOC_CPU_PLL_REF_HIGH_THRESHOLD 0xC70420 + +#define mmPSOC_CPU_PLL_PLL_NOT_STABLE 0xC70430 + +#define mmPSOC_CPU_PLL_FREQ_CALC_EN 0xC70440 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_CFG 0xC70500 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_0 0xC70510 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_1 0xC70514 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_2 0xC70518 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_3 0xC7051C + +#endif /* ASIC_REG_PSOC_CPU_PLL_REGS_H_ */ diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h index 96f08050ef0f..13ef6b2887fd 100644 --- a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h +++ b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h @@ -455,4 +455,7 @@ enum axi_id { QM_ARB_ERR_MSG_EN_CHOISE_WDT_MASK |\ QM_ARB_ERR_MSG_EN_AXI_LBW_ERR_MASK) +#define PCIE_AUX_FLR_CTRL_HW_CTRL_MASK 0x1 +#define PCIE_AUX_FLR_CTRL_INT_MASK_MASK 0x2 + #endif /* GAUDI_MASKS_H_ */ diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h index 0f0cd067bb43..f30f2c0458d7 100644 --- a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h +++ b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h @@ -85,7 +85,7 @@ struct packet_msg_long { }; #define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_SHIFT 0 -#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK 0x0000EFFF +#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK 0x00007FFF #define GAUDI_PKT_SHORT_VAL_SOB_MOD_SHIFT 31 #define GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK 0x80000000 @@ -141,7 +141,7 @@ struct packet_msg_prot { #define GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK 0x00FF0000 #define GAUDI_PKT_FENCE_CFG_ID_SHIFT 30 -#define GAUDI_PKT_FENCE_CFG_ID_MASK 0xC000000 +#define GAUDI_PKT_FENCE_CFG_ID_MASK 0xC0000000 #define GAUDI_PKT_FENCE_CTL_PRED_SHIFT 0 #define GAUDI_PKT_FENCE_CTL_PRED_MASK 0x0000001F diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index f6267a8d7416..d5c4f983b7a8 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -263,6 +263,7 @@ enum hl_device_status { * time the driver was loaded. * HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time * for synchronization. + * HL_INFO_CS_COUNTERS - Retrieve command submission counters */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -274,6 +275,7 @@ enum hl_device_status { #define HL_INFO_CLK_RATE 8 #define HL_INFO_RESET_COUNT 9 #define HL_INFO_TIME_SYNC 10 +#define HL_INFO_CS_COUNTERS 11 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -338,6 +340,25 @@ struct hl_info_time_sync { __u64 host_time; }; +/** + * struct hl_info_cs_counters - command submission counters + * @out_of_mem_drop_cnt: dropped due to memory allocation issue + * @parsing_drop_cnt: dropped due to error in packet parsing + * @queue_full_drop_cnt: dropped due to queue full + * @device_in_reset_drop_cnt: dropped due to device in reset + */ +struct hl_cs_counters { + __u64 out_of_mem_drop_cnt; + __u64 parsing_drop_cnt; + __u64 queue_full_drop_cnt; + __u64 device_in_reset_drop_cnt; +}; + +struct hl_info_cs_counters { + struct hl_cs_counters cs_counters; + struct hl_cs_counters ctx_cs_counters; +}; + struct hl_info_args { /* Location of relevant struct in userspace */ __u64 return_pointer; @@ -530,13 +551,13 @@ union hl_wait_cs_args { struct hl_wait_cs_out out; }; -/* Opcode to alloc device memory */ +/* Opcode to allocate device memory */ #define HL_MEM_OP_ALLOC 0 /* Opcode to free previously allocated device memory */ #define HL_MEM_OP_FREE 1 -/* Opcode to map host memory */ +/* Opcode to map host and device memory */ #define HL_MEM_OP_MAP 2 -/* Opcode to unmap previously mapped host memory */ +/* Opcode to unmap previously mapped host and device memory */ #define HL_MEM_OP_UNMAP 3 /* Memory flags */ |