Skip to content

Commit

Permalink
Merge tag 'drm-next-2024-11-29' of https://gitlab.freedesktop.org/drm…
Browse files Browse the repository at this point in the history
…/kernel

Pull drm fixes from Dave Airlie:
 "Merge window fixes, mostly amdgpu and xe, with a few other minor ones;
  it all looks fairly normal.

  i915:
   - hdcp: Fix when the first read and write are retried

  xe:
   - Wake up waiters after wait condition set to true
   - Mark the preempt fence workqueue as reclaim
   - Update xe2 graphics name string
   - Fix a couple of guc submit races
   - Fix pat index usage in migrate
   - Ensure non-cached migrate pagetable bo mappings
   - Take a PM ref in the delayed snapshot capture worker

  amdgpu:
   - SMU 13.0.6 fixes
   - XGMI fixes
   - SMU 13.0.7 fixes
   - Misc code cleanups
   - Plane refcount fixes
   - DCN 4.0.1 fixes
   - DC power fixes
   - DTO fixes
   - NBIO 7.11 fixes
   - SMU 14.0.x fixes
   - Reset fixes
   - Enable DC on LoongArch
   - Sysfs hotplug warning fix
   - Misc small fixes
   - VCN 4.0.3 fix
   - Slab usage fix
   - Jpeg delayed work fix

  amdkfd:
   - wptr handling fixes

  radeon:
   - Use ttm_bo_move_null()
   - Constify struct pci_device_id
   - Fix spurious hotplug
   - HPD fix

  rockchip:
   - fix 32-bit build"

* tag 'drm-next-2024-11-29' of https://gitlab.freedesktop.org/drm/kernel: (48 commits)
  drm/xe: Take PM ref in delayed snapshot capture worker
  drm/xe/migrate: use XE_BO_FLAG_PAGETABLE
  drm/xe/migrate: fix pat index usage
  drm/xe/guc_submit: fix race around suspend_pending
  drm/xe/guc_submit: fix race around pending_disable
  drm/xe: Update xe2_graphics name string
  drm/rockchip: avoid 64-bit division
  Revert "drm/radeon: Delay Connector detecting when HPD singals is unstable"
  drm/amdgpu/jpeg: cancel the jpeg worker
  drm/amdgpu: fix usage slab after free
  drm/amdgpu/vcn: reset fw_shared when VCPU buffers corrupted on vcn v4.0.3
  drm/amdgpu: Fix sysfs warning when hotplugging
  drm/amdgpu: Add sysfs interface for vcn reset mask
  drm/amdgpu/gmc7: fix wait_for_idle callers
  drm/amd/pm: Remove arcturus min power limit
  drm/amd/pm: skip setting the power source on smu v14.0.2/3
  drm/amd/pm: disable pcie speed switching on Intel platform for smu v14.0.2/3
  drm/amdkfd: Use the correct wptr size
  drm/xe: Mark preempt fence workqueue as reclaim
  drm/xe/ufence: Wake up waiters after setting ufence->signalled
  ...
  • Loading branch information
torvalds committed Nov 29, 2024
2 parents 517363b + 9794b89 commit 2ba9f67
Show file tree
Hide file tree
Showing 75 changed files with 690 additions and 193 deletions.
4 changes: 4 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/aldebaran.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
}

list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
amdgpu_set_init_level(tmp_adev,
AMDGPU_INIT_LEVEL_RESET_RECOVERY);
dev_info(tmp_adev->dev,
"GPU reset succeeded, trying to resume\n");
r = aldebaran_mode2_restore_ip(tmp_adev);
Expand Down Expand Up @@ -375,6 +377,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
tmp_adev);

if (!r) {
amdgpu_set_init_level(tmp_adev,
AMDGPU_INIT_LEVEL_DEFAULT);
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);

r = amdgpu_ib_ring_tests(tmp_adev);
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,7 @@ struct amdgpu_mqd {
/* Identifier of the init level currently selected for a device. */
enum amdgpu_init_lvl_id {
/* Regular level; also the fallback of amdgpu_set_init_level(). */
AMDGPU_INIT_LEVEL_DEFAULT,
/* Minimal set of blocks initialized before an XGMI hive can be reset. */
AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
/*
 * Selected while a device is re-initialized after a reset (other than a
 * reset on init); amdgpu_reset_in_recovery() tests for this value.
 */
AMDGPU_INIT_LEVEL_RESET_RECOVERY,
};

struct amdgpu_init_level {
Expand Down
29 changes: 24 additions & 5 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ struct amdgpu_init_level amdgpu_init_default = {
.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Init level installed by amdgpu_set_init_level() when
 * AMDGPU_INIT_LEVEL_RESET_RECOVERY is requested. It uses the same
 * hwini_ip_block_mask (AMDGPU_IP_BLK_MASK_ALL) as amdgpu_init_default.
 */
struct amdgpu_init_level amdgpu_init_recovery = {
.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
* Minimal blocks needed to be initialized before a XGMI hive can be reset. This
* is used for cases like reset on initialization where the entire hive needs to
Expand All @@ -182,6 +187,9 @@ void amdgpu_set_init_level(struct amdgpu_device *adev,
case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
adev->init_lvl = &amdgpu_init_minimal_xgmi;
break;
case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
adev->init_lvl = &amdgpu_init_recovery;
break;
case AMDGPU_INIT_LEVEL_DEFAULT:
fallthrough;
default:
Expand Down Expand Up @@ -3250,7 +3258,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
return r;
}

if (!amdgpu_in_reset(adev))
if (!amdgpu_reset_in_recovery(adev))
amdgpu_ras_set_error_query_ready(adev, true);

amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
Expand Down Expand Up @@ -4669,8 +4677,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
int idx;
bool px;

amdgpu_fence_driver_sw_fini(adev);
amdgpu_device_ip_fini(adev);
amdgpu_fence_driver_sw_fini(adev);
amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
adev->accel_working = false;
dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
Expand Down Expand Up @@ -5419,7 +5427,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
struct list_head *device_list_handle;
bool full_reset, vram_lost = false;
struct amdgpu_device *tmp_adev;
int r;
int r, init_level;

device_list_handle = reset_context->reset_device_list;

Expand All @@ -5428,10 +5436,18 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)

full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

/*
 * For a reset on init, use the default init level; otherwise keep the
 * recovery level.
 */
if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
init_level = AMDGPU_INIT_LEVEL_DEFAULT;
else
init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;

r = 0;
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/* After reset, it's default init level */
amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
amdgpu_set_init_level(tmp_adev, init_level);
if (full_reset) {
/* post card */
amdgpu_ras_set_fed(tmp_adev, false);
Expand Down Expand Up @@ -5518,6 +5534,9 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)

out:
if (!r) {
/* IP init is complete now, set level as default */
amdgpu_set_init_level(tmp_adev,
AMDGPU_INIT_LEVEL_DEFAULT);
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
r = amdgpu_ib_ring_tests(tmp_adev);
if (r) {
Expand Down
8 changes: 5 additions & 3 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
Original file line number Diff line number Diff line change
Expand Up @@ -1778,9 +1778,11 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)

void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
{
amdgpu_gfx_sysfs_xcp_fini(adev);
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
amdgpu_gfx_sysfs_reset_mask_fini(adev);
if (adev->dev->kobj.sd) {
amdgpu_gfx_sysfs_xcp_fini(adev);
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
amdgpu_gfx_sysfs_reset_mask_fini(adev);
}
}

int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
Expand Down
6 changes: 4 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,8 @@ int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev)

void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
{
if (adev->jpeg.num_jpeg_inst)
device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
if (adev->dev->kobj.sd) {
if (adev->jpeg.num_jpeg_inst)
device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
}
}
3 changes: 2 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
if (ret)
return;

device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
if (adev->dev->kobj.sd)
device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);

ttm_resource_manager_cleanup(man);
ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL);
Expand Down
10 changes: 5 additions & 5 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
Original file line number Diff line number Diff line change
Expand Up @@ -1298,7 +1298,7 @@ int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
struct ras_manager *obj;

/* in resume phase, no need to create aca fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
return 0;

obj = get_ras_manager(adev, blk);
Expand Down Expand Up @@ -3610,7 +3610,7 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;

/* init event manager with node 0 on xgmi system */
if (!amdgpu_in_reset(adev)) {
if (!amdgpu_reset_in_recovery(adev)) {
if (!hive || adev->gmc.xgmi.node_id == 0)
ras_event_mgr_init(ras->event_mgr);
}
Expand Down Expand Up @@ -3825,7 +3825,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,

r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
if (r) {
if (adev->in_suspend || amdgpu_in_reset(adev)) {
if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) {
/* in resume phase, if fail to enable ras,
* clean up all ras fs nodes, and disable ras */
goto cleanup;
Expand All @@ -3837,7 +3837,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
amdgpu_persistent_edc_harvesting(adev, ras_block);

/* in resume phase, no need to create ras fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
return 0;

ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
Expand Down Expand Up @@ -3967,7 +3967,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
amdgpu_ras_event_mgr_init(adev);

if (amdgpu_ras_aca_is_supported(adev)) {
if (amdgpu_in_reset(adev)) {
if (amdgpu_reset_in_recovery(adev)) {
if (amdgpu_aca_is_enabled(adev))
r = amdgpu_aca_reset(adev);
else
Expand Down
5 changes: 5 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
Original file line number Diff line number Diff line change
Expand Up @@ -342,3 +342,8 @@ void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf,
strscpy(buf, "unknown", len);
}
}

bool amdgpu_reset_in_recovery(struct amdgpu_device *adev)
{
return (adev->init_lvl->level == AMDGPU_INIT_LEVEL_RESET_RECOVERY);
}
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,6 @@ extern struct amdgpu_reset_handler xgmi_reset_on_init_handler;
int amdgpu_reset_do_xgmi_reset_on_init(
struct amdgpu_reset_context *reset_context);

bool amdgpu_reset_in_recovery(struct amdgpu_device *adev);

#endif
6 changes: 4 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,8 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
if (!amdgpu_gpu_recovery)
return;

if (adev->sdma.num_instances)
device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
if (adev->dev->kobj.sd) {
if (adev->sdma.num_instances)
device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
}
}
6 changes: 3 additions & 3 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,15 +214,15 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev)

drm_sched_entity_destroy(&adev->vce.entity);

amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr,
(void **)&adev->vce.cpu_addr);

for (i = 0; i < adev->vce.num_rings; i++)
amdgpu_ring_fini(&adev->vce.ring[i]);

amdgpu_ucode_release(&adev->vce.fw);
mutex_destroy(&adev->vce.idle_mutex);

amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr,
(void **)&adev->vce.cpu_addr);

return 0;
}

Expand Down
37 changes: 37 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
Original file line number Diff line number Diff line change
Expand Up @@ -1283,3 +1283,40 @@ int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,

return psp_execute_ip_fw_load(&adev->psp, &ucode);
}

/*
 * sysfs show handler for the vcn_reset_mask attribute.
 *
 * Resolves the amdgpu device from @dev's driver data and formats
 * adev->vcn.supported_reset into @buf via amdgpu_show_reset_mask().
 * Returns that helper's result, or -ENODEV when no amdgpu device is found.
 */
static ssize_t amdgpu_get_vcn_reset_mask(struct device *dev,
					 struct device_attribute *attr,
					 char *buf)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(drm);

	if (adev)
		return amdgpu_show_reset_mask(buf, adev->vcn.supported_reset);

	return -ENODEV;
}

/*
 * Read-only (0444) device attribute "vcn_reset_mask": reads are served by
 * amdgpu_get_vcn_reset_mask(); no store handler is provided.
 */
static DEVICE_ATTR(vcn_reset_mask, 0444,
amdgpu_get_vcn_reset_mask, NULL);

/*
 * amdgpu_vcn_sysfs_reset_mask_init - create the vcn_reset_mask sysfs file
 * @adev: device to attach the attribute to
 *
 * Nothing is created when the device reports no VCN instances.
 * Returns 0 in that case, otherwise the result of device_create_file().
 */
int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
{
	if (!adev->vcn.num_vcn_inst)
		return 0;

	return device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
}

/*
 * amdgpu_vcn_sysfs_reset_mask_fini - remove the vcn_reset_mask sysfs file
 * @adev: device the attribute was attached to
 *
 * The file is removed only when the device's kobject still has a sysfs
 * directory entry (kobj.sd) and the device reports VCN instances.
 */
void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
{
	if (!adev->dev->kobj.sd)
		return;

	if (!adev->vcn.num_vcn_inst)
		return;

	device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
}
4 changes: 4 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,8 @@ struct amdgpu_vcn {

/* IP reg dump */
uint32_t *ip_dump;

uint32_t supported_reset;
};

struct amdgpu_fw_shared_rb_ptrs_struct {
Expand Down Expand Up @@ -519,5 +521,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
enum AMDGPU_UCODE_ID ucode_id);
int amdgpu_vcn_save_vcpu_bo(struct amdgpu_device *adev);
int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);

#endif
6 changes: 4 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
Original file line number Diff line number Diff line change
Expand Up @@ -904,8 +904,10 @@ int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev)

void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev)
{
if (adev->vpe.num_instances)
device_remove_file(adev->dev, &dev_attr_vpe_reset_mask);
if (adev->dev->kobj.sd) {
if (adev->vpe.num_instances)
device_remove_file(adev->dev, &dev_attr_vpe_reset_mask);
}
}

static const struct amdgpu_ring_funcs vpe_ring_funcs = {
Expand Down
41 changes: 41 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218

/*
 * Link states compared against the low byte of the value read by
 * amdgpu_get_xgmi_link_status().
 */
#define XGMI_STATE_DISABLE 0xD1
#define XGMI_STATE_LS0 0x81
/* Non-error return values of amdgpu_get_xgmi_link_status(). */
#define XGMI_LINK_ACTIVE 1
#define XGMI_LINK_INACTIVE 0

static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4
Expand Down Expand Up @@ -289,6 +294,42 @@ static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = {
SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)},
};

/*
 * Read the raw PCS state register value for one XGMI link on XGMI 6.4.
 *
 * @global_link_num is split by a fixed divisor of 2: the remainder is
 * shifted into bit 20 of the register offset, and the quotient is passed to
 * the ASIC's encode_ext_smn_addressing() callback to form the rest of the
 * address. The register is then read through RREG32_PCIE_EXT().
 */
static u32 xgmi_v6_4_get_link_status(struct amdgpu_device *adev, int global_link_num)
{
	const u32 state_hist1_base = 0x11a00070;
	const int split = 2;
	u32 sub_link = global_link_num % split;
	u64 reg_addr = (state_hist1_base | (sub_link << 20)) +
		adev->asic_funcs->encode_ext_smn_addressing(global_link_num / split);

	return RREG32_PCIE_EXT(reg_addr);
}

/*
 * amdgpu_get_xgmi_link_status - query the state of one XGMI link
 * @adev: device to query
 * @global_link_num: link number forwarded to the IP-specific reader
 *
 * Only XGMI IP version 6.4.0 is handled. The low byte of the state value
 * decides the result.
 *
 * Returns:
 *   -EOPNOTSUPP         for any other XGMI IP version,
 *   -ENOLINK            when the state is XGMI_STATE_DISABLE,
 *   XGMI_LINK_ACTIVE    when the state is XGMI_STATE_LS0,
 *   XGMI_LINK_INACTIVE  for every other state.
 */
int amdgpu_get_xgmi_link_status(struct amdgpu_device *adev, int global_link_num)
{
	u32 state;

	if (amdgpu_ip_version(adev, XGMI_HWIP, 0) != IP_VERSION(6, 4, 0))
		return -EOPNOTSUPP;

	state = xgmi_v6_4_get_link_status(adev, global_link_num) & 0xFF;

	if (state == XGMI_STATE_DISABLE)
		return -ENOLINK;

	return state == XGMI_STATE_LS0 ? XGMI_LINK_ACTIVE : XGMI_LINK_INACTIVE;
}

/**
* DOC: AMDGPU XGMI Support
*
Expand Down
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,5 +84,7 @@ int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev);
int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive,
int req_nps_mode);
int amdgpu_get_xgmi_link_status(struct amdgpu_device *adev,
int global_link_num);

#endif
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/df_v3_6.c
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,8 @@ static void df_v3_6_sw_init(struct amdgpu_device *adev)

static void df_v3_6_sw_fini(struct amdgpu_device *adev)
{

device_remove_file(adev->dev, &dev_attr_df_cntr_avail);
if (adev->dev->kobj.sd)
device_remove_file(adev->dev, &dev_attr_df_cntr_avail);

}

Expand Down
Loading

0 comments on commit 2ba9f67

Please sign in to comment.