accel/habanalabs/gaudi2: assume hard-reset by FW upon MC SEI severe error

FW initiates a hard reset upon an MC SEI severe error.
Align the driver to expect this reset and avoid accessing the device
until the reset is done.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Ofir Bitton <obitton@habana.ai>
Signed-off-by: Ofir Bitton <obitton@habana.ai>
This commit is contained in:
Tomer Tayar
2024-03-10 12:18:35 +02:00
committed by Ofir Bitton
parent c754bcf9dd
commit c8c10dcaca

View File

@@ -10004,6 +10004,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
if (gaudi2_handle_hbm_mc_sei_err(hdev, event_type, &eq_entry->sei_data)) {
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
reset_required = true;
is_critical = eq_entry->sei_data.hdr.is_critical;
}
error_count++;
break;
@@ -10235,8 +10236,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
gaudi2_print_event(hdev, event_type, true,
"No error cause for H/W event %u", event_type);
if ((gaudi2_irq_map_table[event_type].reset != EVENT_RESET_TYPE_NONE) ||
reset_required) {
if ((gaudi2_irq_map_table[event_type].reset != EVENT_RESET_TYPE_NONE) || reset_required) {
if (reset_required ||
(gaudi2_irq_map_table[event_type].reset == EVENT_RESET_TYPE_HARD))
reset_flags |= HL_DRV_RESET_HARD;