cxl: Drop commands if the PCI channel is not in normal state
If the PCI channel has gone down, don't attempt to poke the hardware.
We need to guard every time cxl_whatever_(read|write) is called. This
is because a call to those functions will dereference an offset into an
mmio register, and the mmio mappings get invalidated in the EEH
teardown.
Check in the read/write functions in the header.
We give them the same semantics as usual PCI operations:
- a write to a channel that is down is ignored.
- a read from a channel that is down returns all fs.
Also, we try to access the MMIO space of a vPHB device as part of the
PCI disable path. Because that's a read that bypasses most of our usual
checks, we handle it explicitly.
As far as user visible warnings go:
- Check link state in file ops, return -EIO if down.
- Be reasonably quiet if there's an error in a teardown path,
or when we already know the hardware is going down.
- Throw a big WARN if someone tries to start a CXL operation
while the card is down. This gives a useful stacktrace for
debugging whatever is doing that.
Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index f74ff80..44568dd 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -41,6 +41,13 @@
rc = -EBUSY;
goto out;
}
+
+ if (!cxl_adapter_link_ok(afu->adapter)) {
+ afu->enabled = enabled;
+ rc = -EIO;
+ goto out;
+ }
+
pr_devel_ratelimited("AFU control... (0x%016llx)\n",
AFU_Cntl | command);
cpu_relax();
@@ -85,6 +92,10 @@
int cxl_afu_check_and_enable(struct cxl_afu *afu)
{
+ if (!cxl_adapter_link_ok(afu->adapter)) {
+ WARN(1, "Refusing to enable afu while link down!\n");
+ return -EIO;
+ }
if (afu->enabled)
return 0;
return afu_enable(afu);
@@ -103,6 +114,12 @@
pr_devel("PSL purge request\n");
+ if (!cxl_adapter_link_ok(afu->adapter)) {
+ dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n");
+ rc = -EIO;
+ goto out;
+ }
+
if ((AFU_Cntl & CXL_AFU_Cntl_An_ES_MASK) != CXL_AFU_Cntl_An_ES_Disabled) {
WARN(1, "psl_purge request while AFU not disabled!\n");
cxl_afu_disable(afu);
@@ -119,6 +136,11 @@
rc = -EBUSY;
goto out;
}
+ if (!cxl_adapter_link_ok(afu->adapter)) {
+ rc = -EIO;
+ goto out;
+ }
+
dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx PSL_DSISR: 0x%016llx\n", PSL_CNTL, dsisr);
if (dsisr & CXL_PSL_DSISR_TRANS) {
@@ -215,6 +237,8 @@
dev_warn(&adapter->dev, "WARNING: CXL adapter wide TLBIA timed out!\n");
return -EBUSY;
}
+ if (!cxl_adapter_link_ok(adapter))
+ return -EIO;
cpu_relax();
}
@@ -224,6 +248,8 @@
dev_warn(&adapter->dev, "WARNING: CXL adapter wide SLBIA timed out!\n");
return -EBUSY;
}
+ if (!cxl_adapter_link_ok(adapter))
+ return -EIO;
cpu_relax();
}
return 0;
@@ -240,6 +266,11 @@
dev_warn(&afu->dev, "WARNING: CXL AFU SLBIA timed out!\n");
return -EBUSY;
}
+ /* If the adapter has gone down, we can assume that we
+ * will PERST it and that will invalidate everything.
+ */
+ if (!cxl_adapter_link_ok(afu->adapter))
+ return -EIO;
cpu_relax();
}
return 0;
@@ -279,6 +310,8 @@
cxl_p1_write(adapter, CXL_PSL_SLBIA, CXL_TLB_SLB_IQ_LPIDPID);
while (1) {
+ if (!cxl_adapter_link_ok(adapter))
+ break;
slbia = cxl_p1_read(adapter, CXL_PSL_SLBIA);
if (!(slbia & CXL_TLB_SLB_P))
break;
@@ -308,6 +341,11 @@
rc = -EBUSY;
goto out;
}
+ if (!cxl_adapter_link_ok(ctx->afu->adapter)) {
+ dev_warn(&ctx->afu->dev, "WARNING: Device link down, aborting Process Element Command!\n");
+ rc = -EIO;
+ goto out;
+ }
state = be64_to_cpup(ctx->afu->sw_command_status);
if (state == ~0ULL) {
pr_err("cxl: Error adding process element to AFU\n");
@@ -355,8 +393,13 @@
mutex_lock(&ctx->afu->spa_mutex);
pr_devel("%s Terminate pe: %i started\n", __func__, ctx->pe);
- rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_TERMINATE,
- CXL_PE_SOFTWARE_STATE_V | CXL_PE_SOFTWARE_STATE_T);
+ /* We could be asked to terminate when the hw is down. That
+ * should always succeed: it's not running if the hw has gone
+ * away and is being reset.
+ */
+ if (cxl_adapter_link_ok(ctx->afu->adapter))
+ rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_TERMINATE,
+ CXL_PE_SOFTWARE_STATE_V | CXL_PE_SOFTWARE_STATE_T);
ctx->elem->software_state = 0; /* Remove Valid bit */
pr_devel("%s Terminate pe: %i finished\n", __func__, ctx->pe);
mutex_unlock(&ctx->afu->spa_mutex);
@@ -369,7 +412,14 @@
mutex_lock(&ctx->afu->spa_mutex);
pr_devel("%s Remove pe: %i started\n", __func__, ctx->pe);
- if (!(rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_REMOVE, 0)))
+
+ /* We could be asked to remove when the hw is down. Again, if
+ * the hw is down, the PE is gone, so we succeed.
+ */
+ if (cxl_adapter_link_ok(ctx->afu->adapter))
+ rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_REMOVE, 0);
+
+ if (!rc)
ctx->pe_inserted = false;
slb_invalid(ctx);
pr_devel("%s Remove pe: %i finished\n", __func__, ctx->pe);
@@ -612,6 +662,11 @@
if (!(mode & afu->modes_supported))
return -EINVAL;
+ if (!cxl_adapter_link_ok(afu->adapter)) {
+ WARN(1, "Device link is down, refusing to activate!\n");
+ return -EIO;
+ }
+
if (mode == CXL_MODE_DIRECTED)
return activate_afu_directed(afu);
if (mode == CXL_MODE_DEDICATED)
@@ -622,6 +677,11 @@
int cxl_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u64 amr)
{
+ if (!cxl_adapter_link_ok(ctx->afu->adapter)) {
+ WARN(1, "Device link is down, refusing to attach process!\n");
+ return -EIO;
+ }
+
ctx->kernel = kernel;
if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
return attach_afu_directed(ctx, wed, amr);
@@ -666,6 +726,12 @@
{
u64 pidtid;
+ /* If the adapter has gone away, we can't get any meaningful
+ * information.
+ */
+ if (!cxl_adapter_link_ok(afu->adapter))
+ return -EIO;
+
info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
info->dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);