From ad88f5109b2fc80feca65cafc2ed93d2fa21821f Mon Sep 17 00:00:00 2001
From: Hans Kratz
Date: Thu, 9 Sep 2021 11:07:23 +0000
Subject: [PATCH 1/2] Enable inlining check for arm

---
 crates/stdarch-test/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
index ab7ed4b5e5..757a9bc745 100644
--- a/crates/stdarch-test/src/lib.rs
+++ b/crates/stdarch-test/src/lib.rs
@@ -103,7 +103,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
             // failed inlining something.
             s[0].starts_with("call ") && s[1].starts_with("pop") // FIXME: original logic but does not match comment
         })
-    } else if cfg!(target_arch = "aarch64") {
+    } else if cfg!(target_arch = "aarch64") || cfg!(target_arch = "arm") {
         instrs.iter().any(|s| s.starts_with("bl "))
     } else {
         // FIXME: Add detection for other archs

From ea2d37d5a73c9c969dd50e4ec01ff5a1eb0c2c9e Mon Sep 17 00:00:00 2001
From: Hans Kratz
Date: Thu, 9 Sep 2021 11:46:38 +0000
Subject: [PATCH 2/2] Fix arm vfma inlining by using special _v8 dup fns

---
 .../src/arm_shared/neon/generated.rs        |  8 ++++----
 crates/core_arch/src/arm_shared/neon/mod.rs | 20 +++++++++++++++++++
 crates/stdarch-gen/neon.spec                |  4 ++--
 crates/stdarch-gen/src/main.rs              | 12 ++++++++++-
 4 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index e8b76ae377..c7652b39c7 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -8697,7 +8697,7 @@ vfmaq_f32_(b, c, a)
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
 pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
-    vfma_f32(a, b, vdup_n_f32(c))
+    vfma_f32(a, b, vdup_n_f32_v8(c))
 }
 
 /// Floating-point fused Multiply-Add to accumulator(vector)
@@ -8707,7 +8707,7 @@ pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
 pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
-    vfmaq_f32(a, b, vdupq_n_f32(c))
+    vfmaq_f32(a, b, vdupq_n_f32_v8(c))
 }
 
 /// Floating-point fused multiply-subtract from accumulator
@@ -8739,7 +8739,7 @@ pub unsafe fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
 pub unsafe fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
-    vfms_f32(a, b, vdup_n_f32(c))
+    vfms_f32(a, b, vdup_n_f32_v8(c))
 }
 
 /// Floating-point fused Multiply-subtract to accumulator(vector)
@@ -8749,7 +8749,7 @@ pub unsafe fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfms))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmls))]
 pub unsafe fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
-    vfmsq_f32(a, b, vdupq_n_f32(c))
+    vfmsq_f32(a, b, vdupq_n_f32_v8(c))
 }
 
 /// Subtract
diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs
index 369bf07e18..32a7a3f16a 100644
--- a/crates/core_arch/src/arm_shared/neon/mod.rs
+++ b/crates/core_arch/src/arm_shared/neon/mod.rs
@@ -3704,6 +3704,16 @@ pub unsafe fn vdupq_n_f32(value: f32) -> float32x4_t {
     float32x4_t(value, value, value, value)
 }
 
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+unsafe fn vdupq_n_f32_v8(value: f32) -> float32x4_t {
+    float32x4_t(value, value, value, value)
+}
+
 /// Duplicate vector element to vector or scalar
 #[inline]
 #[target_feature(enable = "neon")]
@@ -3814,6 +3824,16 @@ pub unsafe fn vdup_n_f32(value: f32) -> float32x2_t {
     float32x2_t(value, value)
 }
 
+/// Duplicate vector element to vector or scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vdup.32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(dup))]
+unsafe fn vdup_n_f32_v8(value: f32) -> float32x2_t {
+    float32x2_t(value, value)
+}
+
 /// Duplicate vector element to vector or scalar
 #[inline]
 #[target_feature(enable = "neon")]
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index 789a394885..227bfce661 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -2576,7 +2576,7 @@ generate float*_t
 /// Floating-point fused Multiply-Add to accumulator(vector)
 name = vfma
 n-suffix
-multi_fn = vfma-self-noext, a, b, {vdup-nself-noext, c}
+multi_fn = vfma-self-noext, a, b, {vdup-nselfv8-noext, c}
 a = 2.0, 3.0, 4.0, 5.0
 b = 6.0, 4.0, 7.0, 8.0
 c = 8.0
@@ -2653,7 +2653,7 @@ generate float*_t
 /// Floating-point fused Multiply-subtract to accumulator(vector)
 name = vfms
 n-suffix
-multi_fn = vfms-self-noext, a, b, {vdup-nself-noext, c}
+multi_fn = vfms-self-noext, a, b, {vdup-nselfv8-noext, c}
 a = 50.0, 35.0, 60.0, 69.0
 b = 6.0, 4.0, 7.0, 8.0
 c = 8.0
diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs
index 82149064d2..d0a867f7f6 100644
--- a/crates/stdarch-gen/src/main.rs
+++ b/crates/stdarch-gen/src/main.rs
@@ -1108,6 +1108,7 @@ fn gen_aarch64(
                 out_t,
                 fixed,
                 None,
+                true,
             ));
         }
         calls
@@ -1947,6 +1948,7 @@ fn gen_arm(
                 out_t,
                 fixed,
                 None,
+                false,
             ));
         }
         calls
@@ -2364,6 +2366,7 @@ fn get_call(
     out_t: &str,
     fixed: &Vec<String>,
     n: Option<i32>,
+    aarch64: bool,
 ) -> String {
     let params: Vec<_> = in_str.split(',').map(|v| v.trim().to_string()).collect();
     assert!(params.len() > 0);
@@ -2531,7 +2534,8 @@ fn get_call(
                             in_t,
                             out_t,
                             fixed,
-                            Some(i as i32)
+                            Some(i as i32),
+                            aarch64
                         )
                     );
                     call.push_str(&sub_match);
@@ -2580,6 +2584,7 @@ fn get_call(
                 out_t,
                 fixed,
                 n.clone(),
+                aarch64,
             );
             if !param_str.is_empty() {
                 param_str.push_str(", ");
@@ -2650,6 +2655,11 @@ fn get_call(
             fn_name.push_str(type_to_suffix(in_t[1]));
         } else if fn_format[1] == "nself" {
             fn_name.push_str(type_to_n_suffix(in_t[1]));
+        } else if fn_format[1] == "nselfv8" {
+            fn_name.push_str(type_to_n_suffix(in_t[1]));
+            if !aarch64 {
+                fn_name.push_str("_v8");
+            }
         } else if fn_format[1] == "out" {
             fn_name.push_str(type_to_suffix(out_t));
         } else if fn_format[1] == "in0" {