// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2019 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"

#include <linux/slab.h>

/*
 * hl_hw_queue_add_ptr - add to pi or ci and check if it wraps around
 *
 * @ptr: the current pi/ci value
 * @val: the amount to add
 *
 * Add val to ptr. It can go until twice the queue length.
 */
inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val)
{
	ptr += val;
	ptr &= ((HL_QUEUE_LENGTH << 1) - 1);
	return ptr;
}
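
/*
 * Illustrative note: pi/ci deliberately wrap at twice the queue length so
 * that a full queue can be distinguished from an empty one. For a queue of
 * length N, pi/ci count modulo 2*N; pi == ci means the queue is empty, while
 * (pi - ci) == N (modulo 2*N) means it is full, even though both pointers
 * map to the same slot index (ptr & (N - 1)).
 */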

static inline int queue_ci_get(atomic_t *ci, u32 queue_len)
{
	return atomic_read(ci) & ((queue_len << 1) - 1);
}

static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
{
	int delta = (q->pi - queue_ci_get(&q->ci, queue_len));

	if (delta >= 0)
		return (queue_len - delta);
	else
		return (abs(delta) - queue_len);
}
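
/*
 * Worked example for queue_free_slots(), assuming a queue length of 8 (so
 * pi/ci wrap at 16): with pi = 3 and ci = 1, delta = 2 and 6 slots are free;
 * with pi = 1 and ci = 11 (ci already on its "second lap" of the 0..15
 * range), delta = -10 and abs(delta) - 8 = 2 slots are free.
 */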

void hl_hw_queue_update_ci(struct hl_cs *cs)
{
	struct hl_device *hdev = cs->ctx->hdev;
	struct hl_hw_queue *q;
	int i;

	if (hdev->disabled)
		return;

	q = &hdev->kernel_queues[0];

	/* There are no internal queues if H/W queues are being used */
	if (!hdev->asic_prop.max_queues || q->queue_type == QUEUE_TYPE_HW)
		return;

	/* We must increment CI for every queue that will never get a
	 * completion. There are 2 scenarios in which this can happen:
	 * 1. All queues of a non-completion CS will never get a completion.
	 * 2. Internal queues never get a completion.
	 */
	for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) {
		if (!cs_needs_completion(cs) || q->queue_type == QUEUE_TYPE_INT)
			atomic_add(cs->jobs_in_queue_cnt[i], &q->ci);
	}
}

/*
 * hl_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
 *                           H/W queue.
 * @hdev: pointer to habanalabs device structure
 * @q: pointer to habanalabs queue structure
 * @ctl: BD's control word
 * @len: BD's length
 * @ptr: BD's pointer
 *
 * This function assumes there is enough space on the queue to submit a new
 * BD to it. It initializes the next BD and calls the device specific
 * function to set the pi (and doorbell)
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
		u32 ctl, u32 len, u64 ptr)
{
	struct hl_bd *bd;
	u64 addr;
	int i;

	bd = q->kernel_address;
	bd += hl_pi_2_offset(q->pi);
	bd->ctl = cpu_to_le32(ctl);
	bd->len = cpu_to_le32(len);
	bd->ptr = cpu_to_le64(ptr);

	if (q->dram_bd)
		for (i = 0 ; i < 2 ; i++) {
			addr = q->pq_dram_address +
				((hl_pi_2_offset(q->pi) * sizeof(struct hl_bd)) + (i * sizeof(u64)));
			hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, addr,
							(u64 *)(bd) + i, DEBUGFS_WRITE64);
		}

	q->pi = hl_queue_inc_ptr(q->pi);

	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}
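
/*
 * Usage sketch (illustrative only): a caller that already holds the H/W
 * queues lock and has verified there is room in the queue would submit a CB
 * of 'size' bytes located at DMA address 'dma_addr' with:
 *
 *	hl_hw_queue_submit_bd(hdev, q, 0, size, dma_addr);
 *
 * A ctl value of 0 is what hl_hw_queue_send_cb_no_cmpl() below passes for a
 * CB that must not generate a completion entry.
 */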

/*
 * ext_queue_sanity_checks - perform some sanity checks on external queue
 *
 * @hdev : pointer to hl_device structure
 * @q : pointer to hl_hw_queue structure
 * @num_of_entries : how many entries to check for space
 * @reserve_cq_entry : whether to reserve an entry in the cq
 *
 * H/W queues spinlock should be taken before calling this function
 *
 * Perform the following:
 * - Make sure we have enough space in the h/w queue
 * - Make sure we have enough space in the completion queue
 * - Reserve space in the completion queue (needs to be reversed if there
 *   is a failure down the road before the actual submission of work). Only
 *   do this action if reserve_cq_entry is true
 *
 */
static int ext_queue_sanity_checks(struct hl_device *hdev,
					struct hl_hw_queue *q, int num_of_entries,
					bool reserve_cq_entry)
{
	atomic_t *free_slots =
			&hdev->completion_queue[q->cq_id].free_slots_cnt;
	int free_slots_cnt;

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	if (reserve_cq_entry) {
		/*
		 * Check we have enough space in the completion queue.
		 * Subtract num_of_entries from the free-slots counter. If the
		 * result is negative, the CQ is full so we can't submit a new
		 * CB because we won't get an ack on its completion.
		 * atomic_add_negative() returns true when the result is
		 * negative, in which case the reservation is rolled back.
		 */
		if (atomic_add_negative(num_of_entries * -1, free_slots)) {
			dev_dbg(hdev->dev, "No space for %d on CQ %d\n",
				num_of_entries, q->hw_queue_id);
			atomic_add(num_of_entries, free_slots);
			return -EAGAIN;
		}
	}

	return 0;
}
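
/*
 * Example of the CQ credit flow above, assuming free_slots_cnt starts at 4:
 * a CS with 3 external-queue jobs decrements it to 1 and proceeds; a
 * following CS with 2 jobs would drive it to -1, so atomic_add_negative()
 * reports failure, the 2 credits are added back and -EAGAIN is returned.
 * Credits reserved by a successful CS are expected to be given back once the
 * corresponding completions arrive, or in the unroll path of
 * hl_hw_queue_schedule_cs() if a later check fails before submission.
 */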

/*
 * int_queue_sanity_checks - perform some sanity checks on internal queue
 *
 * @hdev : pointer to hl_device structure
 * @q : pointer to hl_hw_queue structure
 * @num_of_entries : how many entries to check for space
 *
 * H/W queues spinlock should be taken before calling this function
 *
 * Perform the following:
 * - Make sure we have enough space in the h/w queue
 *
 */
static int int_queue_sanity_checks(struct hl_device *hdev,
					struct hl_hw_queue *q,
					int num_of_entries)
{
	int free_slots_cnt;

	if (num_of_entries > q->int_queue_len) {
		dev_err(hdev->dev,
			"Cannot populate queue %u with %u jobs\n",
			q->hw_queue_id, num_of_entries);
		return -ENOMEM;
	}

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, q->int_queue_len);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	return 0;
}

/*
 * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue
 * @hdev: Pointer to hl_device structure.
 * @q: Pointer to hl_hw_queue structure.
 * @num_of_entries: How many entries to check for space.
 *
 * Notice: We do not reserve queue entries so this function mustn't be called
 *         more than once per CS for the same queue
 *
 */
static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
					int num_of_entries)
{
	int free_slots_cnt;

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	return 0;
}

/*
 * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
 *
 * @hdev: pointer to hl_device structure
 * @hw_queue_id: ID of the queue to send the CB to
 * @cb_size: size of CB
 * @cb_ptr: pointer to CB location
 *
 * This function sends a single CB, that must NOT generate a completion entry.
 * Sending CPU messages can be done instead via 'hl_hw_queue_submit_bd()'
 */
int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
				u32 cb_size, u64 cb_ptr)
{
	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
	int rc = 0;

	hdev->asic_funcs->hw_queues_lock(hdev);

	if (hdev->disabled) {
		rc = -EPERM;
		goto out;
	}

	/*
	 * hl_hw_queue_send_cb_no_cmpl() is called for queues of a H/W queue
	 * type only on init phase, when the queues are empty and being tested,
	 * so there is no need for sanity checks.
	 */
	if (q->queue_type != QUEUE_TYPE_HW) {
		rc = ext_queue_sanity_checks(hdev, q, 1, false);
		if (rc)
			goto out;
	}

	hl_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);

out:
	hdev->asic_funcs->hw_queues_unlock(hdev);

	return rc;
}

/*
 * ext_queue_schedule_job - submit a JOB to an external queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
static void ext_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	struct hl_cq_entry cq_pkt;
	struct hl_cq *cq;
	u64 cq_addr;
	struct hl_cb *cb;
	u32 ctl;
	u32 len;
	u64 ptr;

	/*
	 * Update the JOB ID inside the BD CTL so the device would know what
	 * to write in the completion queue
	 */
	ctl = ((q->pi << BD_CTL_SHADOW_INDEX_SHIFT) & BD_CTL_SHADOW_INDEX_MASK);

	cb = job->patched_cb;
	len = job->job_cb_size;
	ptr = cb->bus_address;

	/* Skip completion flow in case this is a non completion CS */
	if (!cs_needs_completion(job->cs))
		goto submit_bd;

	cq_pkt.data = cpu_to_le32(
			((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
				& CQ_ENTRY_SHADOW_INDEX_MASK) |
			FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) |
			FIELD_PREP(CQ_ENTRY_READY_MASK, 1));

	/*
	 * No need to protect pi_offset because scheduling to the
	 * H/W queues is done under the scheduler mutex
	 *
	 * No need to check if CQ is full because it was already
	 * checked in ext_queue_sanity_checks
	 */
	cq = &hdev->completion_queue[q->cq_id];
	cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);

	hdev->asic_funcs->add_end_of_cb_packets(hdev, cb->kernel_address, len,
						job->user_cb_size,
						cq_addr,
						le32_to_cpu(cq_pkt.data),
						q->msi_vec,
						job->contains_dma_pkt);

	q->shadow_queue[hl_pi_2_offset(q->pi)] = job;

	cq->pi = hl_cq_inc_ptr(cq->pi);

submit_bd:
	hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}

/*
 * int_queue_schedule_job - submit a JOB to an internal queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
static void int_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	struct hl_bd bd;
	__le64 *pi;

	bd.ctl = 0;
	bd.len = cpu_to_le32(job->job_cb_size);

	if (job->is_kernel_allocated_cb)
		/* bus_address is actually a mmu mapped address
		 * allocated from an internal pool
		 */
		bd.ptr = cpu_to_le64(job->user_cb->bus_address);
	else
		bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);

	pi = q->kernel_address + (q->pi & (q->int_queue_len - 1)) * sizeof(bd);

	q->pi++;
	q->pi &= ((q->int_queue_len << 1) - 1);

	hdev->asic_funcs->pqe_write(hdev, pi, &bd);

	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}

/*
 * hw_queue_schedule_job - submit a JOB to a H/W queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
static void hw_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	u64 ptr;
	u32 offset, ctl, len;

	/*
	 * Upon PQE completion, COMP_DATA is used as the write data to the
	 * completion queue (QMAN HBW message), and COMP_OFFSET is used as the
	 * write address offset in the SM block (QMAN LBW message).
	 * The write address offset is calculated as "COMP_OFFSET << 2".
	 */
	offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
	ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
		((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);

	len = job->job_cb_size;

	/*
	 * A patched CB is created only if a user CB was allocated by driver and
	 * MMU is disabled. If MMU is enabled, the user CB should be used
	 * instead. If the user CB wasn't allocated by driver, assume that it
	 * holds an address.
	 */
	if (job->patched_cb)
		ptr = job->patched_cb->bus_address;
	else if (job->is_kernel_allocated_cb)
		ptr = job->user_cb->bus_address;
	else
		ptr = (u64) (uintptr_t) job->user_cb;

	hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}
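
/*
 * Worked example for the COMP_OFFSET calculation above, with a hypothetical
 * max_pending_cs of 64: a CS with sequence 70 gets offset = 70 & 63 = 6, so
 * upon PQE completion the device writes q->pi (COMP_DATA) to the completion
 * queue and signals the SM block at address offset (6 << 2), as described in
 * the comment inside hw_queue_schedule_job().
 */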

static int init_signal_cs(struct hl_device *hdev,
		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
{
	struct hl_sync_stream_properties *prop;
	struct hl_hw_sob *hw_sob;
	u32 q_idx;
	int rc = 0;

	q_idx = job->hw_queue_id;
	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
	hw_sob = &prop->hw_sob[prop->curr_sob_offset];

	cs_cmpl->hw_sob = hw_sob;
	cs_cmpl->sob_val = prop->next_sob_val;

	dev_dbg(hdev->dev,
		"generate signal CB, sob_id: %d, sob val: %u, q_idx: %d, seq: %llu\n",
		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx,
		cs_cmpl->cs_seq);

	/* we set an EB since we must make sure all operations are done
	 * when sending the signal
	 */
	hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
				cs_cmpl->hw_sob->sob_id, 0, true);

	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1,
								false);

	job->cs->sob_addr_offset = hw_sob->sob_addr;
	job->cs->initial_sob_count = prop->next_sob_val - 1;

	return rc;
}

void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
			struct hl_cs *cs, struct hl_cs_job *job,
			struct hl_cs_compl *cs_cmpl)
{
	struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
	u32 offset = 0;

	cs_cmpl->hw_sob = handle->hw_sob;

	/* Note that encaps_sig_wait_offset was validated earlier in the flow
	 * for an offset value which exceeds the max reserved signal count.
	 * Always decrement the offset by 1, since when the user sets offset 1,
	 * for example, they mean to wait only for the first signal, which will
	 * be pre_sob_val, and if they set offset 2 then the required value is
	 * (pre_sob_val + 1), and so on...
	 * If the user sets the wait offset to 0, treat it as a legacy wait cs
	 * and wait for the next signal.
	 */
	if (job->encaps_sig_wait_offset)
		offset = job->encaps_sig_wait_offset - 1;

	cs_cmpl->sob_val = handle->pre_sob_val + offset;
}
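
/*
 * Example of the wait-offset mapping above, with a hypothetical pre_sob_val
 * of 10: an offset of 1 waits for SOB value 10, an offset of 2 waits for 11,
 * an offset of 3 waits for 12, and an offset of 0 behaves like offset 1
 * (legacy wait CS, waiting for the next signal, i.e. SOB value 10).
 */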

static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
{
	struct hl_gen_wait_properties wait_prop;
	struct hl_sync_stream_properties *prop;
	struct hl_cs_compl *signal_cs_cmpl;
	u32 q_idx;

	q_idx = job->hw_queue_id;
	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;

	signal_cs_cmpl = container_of(cs->signal_fence,
					struct hl_cs_compl,
					base_fence);

	if (cs->encaps_signals) {
		/* use the encaps signal handle stored earlier in the flow
		 * and set the SOB information from the encaps
		 * signals handle
		 */
		hl_hw_queue_encaps_sig_set_sob_info(hdev, cs, job, cs_cmpl);

		dev_dbg(hdev->dev, "Wait for encaps signals handle, qidx(%u), CS sequence(%llu), sob val: 0x%x, offset: %u\n",
				cs->encaps_sig_hdl->q_idx,
				cs->encaps_sig_hdl->cs_seq,
				cs_cmpl->sob_val,
				job->encaps_sig_wait_offset);
	} else {
		/* Copy the SOB id and value of the signal CS */
		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
	}

	/* check again if the signal cs already completed.
	 * if yes then don't send any wait cs since the hw_sob
	 * could be in reset already. if signal is not completed
	 * then get refcount to hw_sob to prevent resetting the sob
	 * while wait cs is not submitted.
	 * note that this check is protected by two locks,
	 * hw queue lock and completion object lock,
	 * and the same completion object lock also protects
	 * the hw_sob reset handler function.
	 * The hw_queue lock prevents the hw_sob refcount value, which is
	 * changed by the signal/wait flows, from going out of sync.
	 */
	spin_lock(&signal_cs_cmpl->lock);

	if (completion_done(&cs->signal_fence->completion)) {
		spin_unlock(&signal_cs_cmpl->lock);
		return -EINVAL;
	}

	kref_get(&cs_cmpl->hw_sob->kref);

	spin_unlock(&signal_cs_cmpl->lock);

	dev_dbg(hdev->dev,
		"generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d, seq: %llu\n",
		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
		prop->base_mon_id, q_idx, cs->sequence);

	wait_prop.data = (void *) job->patched_cb;
	wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
	wait_prop.sob_mask = 0x1;
	wait_prop.sob_val = cs_cmpl->sob_val;
	wait_prop.mon_id = prop->base_mon_id;
	wait_prop.q_idx = q_idx;
	wait_prop.size = 0;

	hdev->asic_funcs->gen_wait_cb(hdev, &wait_prop);

	mb();
	hl_fence_put(cs->signal_fence);
	cs->signal_fence = NULL;

	return 0;
}

/*
 * init_signal_wait_cs - initialize a signal/wait CS
 * @cs: pointer to the signal/wait CS
 *
 * H/W queues spinlock should be taken before calling this function
 */
static int init_signal_wait_cs(struct hl_cs *cs)
{
	struct hl_ctx *ctx = cs->ctx;
	struct hl_device *hdev = ctx->hdev;
	struct hl_cs_job *job;
	struct hl_cs_compl *cs_cmpl =
			container_of(cs->fence, struct hl_cs_compl, base_fence);
	int rc = 0;

	/* There is only one job in a signal/wait CS */
	job = list_first_entry(&cs->job_list, struct hl_cs_job,
				cs_node);

	if (cs->type & CS_TYPE_SIGNAL)
		rc = init_signal_cs(hdev, job, cs_cmpl);
	else if (cs->type & CS_TYPE_WAIT)
		rc = init_wait_cs(hdev, cs, job, cs_cmpl);

	return rc;
}

static int encaps_sig_first_staged_cs_handler
			(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_cs_compl *cs_cmpl =
			container_of(cs->fence,
					struct hl_cs_compl, base_fence);
	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
	struct hl_encaps_signals_mgr *mgr;
	int rc = 0;

	mgr = &cs->ctx->sig_mgr;

	spin_lock(&mgr->lock);
	encaps_sig_hdl = idr_find(&mgr->handles, cs->encaps_sig_hdl_id);
	if (encaps_sig_hdl) {
		/*
		 * Set handler CS sequence,
		 * the CS which contains the encapsulated signals.
		 */
		encaps_sig_hdl->cs_seq = cs->sequence;
		/* store the handle and set encaps signal indication,
		 * to be used later in cs_do_release to put the last
		 * reference to encaps signals handlers.
		 */
		cs_cmpl->encaps_signals = true;
		cs_cmpl->encaps_sig_hdl = encaps_sig_hdl;

		/* set hw_sob pointer in completion object
		 * since it's used in cs_do_release flow to put
		 * refcount to sob
		 */
		cs_cmpl->hw_sob = encaps_sig_hdl->hw_sob;
		cs_cmpl->sob_val = encaps_sig_hdl->pre_sob_val +
						encaps_sig_hdl->count;

		dev_dbg(hdev->dev, "CS seq (%llu) added to encaps signal handler id (%u), count(%u), qidx(%u), sob(%u), val(%u)\n",
				cs->sequence, encaps_sig_hdl->id,
				encaps_sig_hdl->count,
				encaps_sig_hdl->q_idx,
				cs_cmpl->hw_sob->sob_id,
				cs_cmpl->sob_val);

	} else {
		dev_err(hdev->dev, "encaps handle id(%u) wasn't found!\n",
				cs->encaps_sig_hdl_id);
		rc = -EINVAL;
	}

	spin_unlock(&mgr->lock);

	return rc;
}

/*
 * hl_hw_queue_schedule_cs - schedule a command submission
 * @cs: pointer to the CS
 */
int hl_hw_queue_schedule_cs(struct hl_cs *cs)
{
	enum hl_device_status status;
	struct hl_cs_counters_atomic *cntr;
	struct hl_ctx *ctx = cs->ctx;
	struct hl_device *hdev = ctx->hdev;
	struct hl_cs_job *job, *tmp;
	struct hl_hw_queue *q;
	int rc = 0, i, cq_cnt;
	bool first_entry;
	u32 max_queues;

	cntr = &hdev->aggregated_cs_counters;

	hdev->asic_funcs->hw_queues_lock(hdev);

	if (!hl_device_operational(hdev, &status)) {
		atomic64_inc(&cntr->device_in_reset_drop_cnt);
		atomic64_inc(&ctx->cs_counters.device_in_reset_drop_cnt);
		dev_err(hdev->dev,
			"device is %s, CS rejected!\n", hdev->status[status]);
		rc = -EPERM;
		goto out;
	}

	max_queues = hdev->asic_prop.max_queues;

	q = &hdev->kernel_queues[0];
	for (i = 0, cq_cnt = 0 ; i < max_queues ; i++, q++) {
		if (cs->jobs_in_queue_cnt[i]) {
			switch (q->queue_type) {
			case QUEUE_TYPE_EXT:
				rc = ext_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i],
						cs_needs_completion(cs) ?
								true : false);
				break;
			case QUEUE_TYPE_INT:
				rc = int_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i]);
				break;
			case QUEUE_TYPE_HW:
				rc = hw_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i]);
				break;
			default:
				dev_err(hdev->dev, "Queue type %d is invalid\n",
					q->queue_type);
				rc = -EINVAL;
				break;
			}

			if (rc) {
				atomic64_inc(
					&ctx->cs_counters.queue_full_drop_cnt);
				atomic64_inc(&cntr->queue_full_drop_cnt);
				goto unroll_cq_resv;
			}

			if (q->queue_type == QUEUE_TYPE_EXT)
				cq_cnt++;
		}
	}

	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT)) {
		rc = init_signal_wait_cs(cs);
		if (rc)
			goto unroll_cq_resv;
	} else if (cs->type == CS_TYPE_COLLECTIVE_WAIT) {
		rc = hdev->asic_funcs->collective_wait_init_cs(cs);
		if (rc)
			goto unroll_cq_resv;
	}

	rc = hdev->asic_funcs->pre_schedule_cs(cs);
	if (rc) {
		dev_err(hdev->dev,
			"Failed in pre-submission operations of CS %d.%llu\n",
			ctx->asid, cs->sequence);
		goto unroll_cq_resv;
	}

	hdev->shadow_cs_queue[cs->sequence &
				(hdev->asic_prop.max_pending_cs - 1)] = cs;

	if (cs->encaps_signals && cs->staged_first) {
		rc = encaps_sig_first_staged_cs_handler(hdev, cs);
		if (rc)
			goto unroll_cq_resv;
	}

	spin_lock(&hdev->cs_mirror_lock);

	/* Verify staged CS exists and add to the staged list */
	if (cs->staged_cs && !cs->staged_first) {
		struct hl_cs *staged_cs;

		staged_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
		if (!staged_cs) {
			dev_err(hdev->dev,
				"Cannot find staged submission sequence %llu",
				cs->staged_sequence);
			rc = -EINVAL;
			goto unlock_cs_mirror;
		}

		if (is_staged_cs_last_exists(hdev, staged_cs)) {
			dev_err(hdev->dev,
				"Staged submission sequence %llu already submitted",
				cs->staged_sequence);
			rc = -EINVAL;
			goto unlock_cs_mirror;
		}

		list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node);

		/* update stream map of the first CS */
		if (hdev->supports_wait_for_multi_cs)
			staged_cs->fence->stream_master_qid_map |=
					cs->fence->stream_master_qid_map;
	}

	list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);

	/* Queue TDR if the CS is the first entry and if timeout is wanted */
	first_entry = list_first_entry(&hdev->cs_mirror_list,
					struct hl_cs, mirror_node) == cs;
	if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) &&
				first_entry && cs_needs_timeout(cs)) {
		cs->tdr_active = true;
		schedule_delayed_work(&cs->work_tdr, cs->timeout_jiffies);
	}

	spin_unlock(&hdev->cs_mirror_lock);

	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		switch (job->queue_type) {
		case QUEUE_TYPE_EXT:
			ext_queue_schedule_job(job);
			break;
		case QUEUE_TYPE_INT:
			int_queue_schedule_job(job);
			break;
		case QUEUE_TYPE_HW:
			hw_queue_schedule_job(job);
			break;
		default:
			break;
		}

	cs->submitted = true;

	goto out;

unlock_cs_mirror:
	spin_unlock(&hdev->cs_mirror_lock);
unroll_cq_resv:
	q = &hdev->kernel_queues[0];
	for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) {
		if ((q->queue_type == QUEUE_TYPE_EXT) &&
				(cs->jobs_in_queue_cnt[i])) {
			atomic_t *free_slots =
				&hdev->completion_queue[i].free_slots_cnt;
			atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
			cq_cnt--;
		}
	}

out:
	hdev->asic_funcs->hw_queues_unlock(hdev);

	return rc;
}

/*
 * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue
 *
 * @hdev: pointer to hl_device structure
 * @hw_queue_id: which queue to increment its ci
 */
void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
{
	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];

	atomic_inc(&q->ci);
}

static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
					bool is_cpu_queue)
{
	void *p;
	int rc;

	if (is_cpu_queue)
		p = hl_cpu_accessible_dma_pool_alloc(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address);
	else
		p = hl_asic_dma_alloc_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address,
						GFP_KERNEL | __GFP_ZERO);
	if (!p)
		return -ENOMEM;

	q->kernel_address = p;

	q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH, sizeof(struct hl_cs_job *), GFP_KERNEL);
	if (!q->shadow_queue) {
		dev_err(hdev->dev,
			"Failed to allocate shadow queue for H/W queue %d\n",
			q->hw_queue_id);
		rc = -ENOMEM;
		goto free_queue;
	}

	/* Make sure read/write pointers are initialized to start of queue */
	atomic_set(&q->ci, 0);
	q->pi = 0;

	return 0;

free_queue:
	if (is_cpu_queue)
		hl_cpu_accessible_dma_pool_free(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address);
	else
		hl_asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address,
						q->bus_address);

	return rc;
}

static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	void *p;

	p = hdev->asic_funcs->get_int_queue_base(hdev, q->hw_queue_id,
					&q->bus_address, &q->int_queue_len);
	if (!p) {
		dev_err(hdev->dev,
			"Failed to get base address for internal queue %d\n",
			q->hw_queue_id);
		return -EFAULT;
	}

	q->kernel_address = p;
	q->pi = 0;
	atomic_set(&q->ci, 0);

	return 0;
}

static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	return ext_and_cpu_queue_init(hdev, q, true);
}

static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	return ext_and_cpu_queue_init(hdev, q, false);
}

static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	void *p;

	p = hl_asic_dma_alloc_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address,
					GFP_KERNEL | __GFP_ZERO);
	if (!p)
		return -ENOMEM;

	q->kernel_address = p;

	/* Make sure read/write pointers are initialized to start of queue */
	atomic_set(&q->ci, 0);
	q->pi = 0;

	return 0;
}

static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
{
	struct hl_sync_stream_properties *sync_stream_prop;
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct hl_hw_sob *hw_sob;
	int sob, reserved_mon_idx, queue_idx;

	sync_stream_prop = &hdev->kernel_queues[q_idx].sync_stream_prop;

	/* We use 'collective_mon_idx' as a running index in order to reserve
	 * monitors for collective master/slave queues.
	 * collective master queue gets 2 reserved monitors
	 * collective slave queue gets 1 reserved monitor
	 */
	if (hdev->kernel_queues[q_idx].collective_mode ==
			HL_COLLECTIVE_MASTER) {
		reserved_mon_idx = hdev->collective_mon_idx;

		/* reserve the first monitor for collective master queue */
		sync_stream_prop->collective_mstr_mon_id[0] =
			prop->collective_first_mon + reserved_mon_idx;

		/* reserve the second monitor for collective master queue */
		sync_stream_prop->collective_mstr_mon_id[1] =
			prop->collective_first_mon + reserved_mon_idx + 1;

		hdev->collective_mon_idx += HL_COLLECTIVE_RSVD_MSTR_MONS;
	} else if (hdev->kernel_queues[q_idx].collective_mode ==
			HL_COLLECTIVE_SLAVE) {
		reserved_mon_idx = hdev->collective_mon_idx++;

		/* reserve a monitor for collective slave queue */
		sync_stream_prop->collective_slave_mon_id =
			prop->collective_first_mon + reserved_mon_idx;
	}

	if (!hdev->kernel_queues[q_idx].supports_sync_stream)
		return;

	queue_idx = hdev->sync_stream_queue_idx++;

	sync_stream_prop->base_sob_id = prop->sync_stream_first_sob +
			(queue_idx * HL_RSVD_SOBS);
	sync_stream_prop->base_mon_id = prop->sync_stream_first_mon +
			(queue_idx * HL_RSVD_MONS);
	sync_stream_prop->next_sob_val = 1;
	sync_stream_prop->curr_sob_offset = 0;

	for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) {
		hw_sob = &sync_stream_prop->hw_sob[sob];
		hw_sob->hdev = hdev;
		hw_sob->sob_id = sync_stream_prop->base_sob_id + sob;
		hw_sob->sob_addr =
			hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
		hw_sob->q_idx = q_idx;
		kref_init(&hw_sob->kref);
	}
}
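
/*
 * Example of the reservation arithmetic above, with hypothetical values
 * HL_RSVD_SOBS = 4 and HL_RSVD_MONS = 2: the third sync-stream queue to be
 * initialized (queue_idx = 2) gets base_sob_id = sync_stream_first_sob + 8
 * and base_mon_id = sync_stream_first_mon + 4, so each such queue owns a
 * disjoint block of SOBs and monitors.
 */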

static void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx)
{
	struct hl_sync_stream_properties *prop =
			&hdev->kernel_queues[q_idx].sync_stream_prop;

	/*
	 * In case we got here due to a stuck CS, the refcnt might be bigger
	 * than 1 and therefore we reset it.
	 */
	kref_init(&prop->hw_sob[prop->curr_sob_offset].kref);
	prop->curr_sob_offset = 0;
	prop->next_sob_val = 1;
}

/*
 * queue_init - main initialization function for H/W queue object
 *
 * @hdev: pointer to hl_device device structure
 * @q: pointer to hl_hw_queue queue structure
 * @hw_queue_id: The id of the H/W queue
 *
 * Allocate dma-able memory for the queue and initialize fields
 * Returns 0 on success
 */
static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
			u32 hw_queue_id)
{
	int rc;

	q->hw_queue_id = hw_queue_id;

	switch (q->queue_type) {
	case QUEUE_TYPE_EXT:
		rc = ext_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_INT:
		rc = int_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_CPU:
		rc = cpu_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_HW:
		rc = hw_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_NA:
		q->valid = 0;
		return 0;
	default:
		dev_crit(hdev->dev, "wrong queue type %d during init\n",
			q->queue_type);
		rc = -EINVAL;
		break;
	}

	sync_stream_queue_init(hdev, q->hw_queue_id);

	if (rc)
		return rc;

	q->valid = 1;

	return 0;
}

/*
 * queue_fini - destroy queue
 *
 * @hdev: pointer to hl_device device structure
 * @q: pointer to hl_hw_queue queue structure
 *
 * Free the queue memory
 */
static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
{
	if (!q->valid)
		return;

	/*
	 * If we arrived here, there are no jobs waiting on this queue
	 * so we can safely remove it.
	 * This is because this function can only be called when:
	 * 1. Either a context is deleted, which only can occur if all its
	 *    jobs were finished
	 * 2. A context wasn't able to be created due to failure or timeout,
	 *    which means there are no jobs on the queue yet
	 *
	 * The only exception are the queues of the kernel context, but
	 * if they are being destroyed, it means that the entire module is
	 * being removed. If the module is removed, it means there is no open
	 * user context. It also means that if a job was submitted by
	 * the kernel driver (e.g. context creation), the job itself was
	 * released by the kernel driver when a timeout occurred on its
	 * Completion. Thus, we don't need to release it again.
	 */

	if (q->queue_type == QUEUE_TYPE_INT)
		return;

	kfree(q->shadow_queue);

	if (q->queue_type == QUEUE_TYPE_CPU)
		hl_cpu_accessible_dma_pool_free(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address);
	else
		hl_asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address,
						q->bus_address);
}

int hl_hw_queues_create(struct hl_device *hdev)
{
	struct asic_fixed_properties *asic = &hdev->asic_prop;
	struct hl_hw_queue *q;
	int i, rc, q_ready_cnt;

	hdev->kernel_queues = kcalloc(asic->max_queues,
				sizeof(*hdev->kernel_queues), GFP_KERNEL);

	if (!hdev->kernel_queues) {
		dev_err(hdev->dev, "Not enough memory for H/W queues\n");
		return -ENOMEM;
	}

	/* Initialize the H/W queues */
	for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues;
			i < asic->max_queues ; i++, q_ready_cnt++, q++) {

		q->queue_type = asic->hw_queues_props[i].type;
		q->supports_sync_stream =
				asic->hw_queues_props[i].supports_sync_stream;
		q->collective_mode = asic->hw_queues_props[i].collective_mode;
		q->dram_bd = asic->hw_queues_props[i].dram_bd;

		rc = queue_init(hdev, q, i);
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize queue %d\n", i);
			goto release_queues;
		}

		/* Set DRAM PQ address for the queue if it should be at DRAM */
		if (q->dram_bd)
			q->pq_dram_address = asic->hw_queues_props[i].q_dram_bd_address;
	}

	return 0;

release_queues:
	for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
		queue_fini(hdev, q);

	kfree(hdev->kernel_queues);

	return rc;
}

void hl_hw_queues_destroy(struct hl_device *hdev)
{
	struct hl_hw_queue *q;
	u32 max_queues = hdev->asic_prop.max_queues;
	int i;

	for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++)
		queue_fini(hdev, q);

	kfree(hdev->kernel_queues);
}

void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
{
	struct hl_hw_queue *q;
	u32 max_queues = hdev->asic_prop.max_queues;
	int i;

	for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) {
		if ((!q->valid) ||
			((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
			continue;
		q->pi = 0;
		atomic_set(&q->ci, 0);

		if (q->supports_sync_stream)
			sync_stream_queue_reset(hdev, q->hw_queue_id);
	}
}