+/* Companion to neon_scalar_for_mul: this function generates the Rm
+ encoding from GAS's internal SCALAR. QUAD_P is 1 if it's for the Q
+ format, otherwise it's 0. On a bad scalar, reports an error and
+ returns 0. */
+
+static unsigned
+neon_scalar_for_fmac_fp16_long (unsigned scalar, unsigned quad_p)
+{
+ unsigned reg = NEON_SCALAR_REG (scalar);
+ unsigned idx = NEON_SCALAR_INDEX (scalar);
+
+ if (quad_p)
+ {
+ /* Q form: register number in bits [2:0], index bit 0 in bit 3 and
+ index bit 1 in bit 5. */
+ if (reg <= 7 && idx <= 3)
+ return ((reg & 0x7)
+ | ((idx & 0x1) << 3)
+ | (((idx >> 1) & 0x1) << 5));
+ }
+ else
+ {
+ /* D form: low register bit goes to bit 5, the remaining register
+ bits to [2:0], and the single index bit to bit 3. */
+ if (reg <= 15 && idx <= 1)
+ return (((reg & 0x1) << 5)
+ | ((reg >> 1) & 0x7)
+ | ((idx & 0x1) << 3));
+ }
+
+ first_error (_("scalar out of range for multiply instruction"));
+ return 0;
+}
+
+/* Assemble vfmal (SUBTYPE == 0) or vfmsl (SUBTYPE == 1). Both mnemonics
+ accept either the three-same D/Q register form, or a form whose third
+ operand is a scalar indexed register (see the two branches below). */
+
+static void
+do_neon_fmac_maybe_scalar_long (int subtype)
+{
+ enum neon_shape rs;
+ /* Value for the top eight instruction bits, re-applied after
+ neon_three_same / neon_dp_fixup have run (see below). */
+ int high8;
+ /* NOTE: vfmal/vfmsl use slightly different NEON three-same encoding. 'size'
+ field (bits[21:20]) has different meaning. For scalar index variant, it's
+ used to differentiate add and subtract, otherwise it's with fixed value
+ 0x2. */
+ int size = -1;
+
+ /* Per the warning text below, a conditional vfmal/vfmsl with FP16 type
+ is architecturally UNPREDICTABLE, so warn rather than error. */
+ if (inst.cond != COND_ALWAYS)
+ as_warn (_("vfmal/vfmsl with FP16 type cannot be conditional, the "
+ "behaviour is UNPREDICTABLE"));
+
+ /* Both the FP16 FML extension and ARMv8 Advanced SIMD are required. */
+ constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_fp16_fml),
+ _(BAD_FP16));
+
+ constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
+ _(BAD_FPU));
+
+ /* vfmal/vfmsl are in three-same D/Q register format or the third operand can
+ be a scalar index register. */
+ if (inst.operands[2].isscalar)
+ {
+ /* Scalar-indexed form: vfmsl (subtype != 0) is distinguished from
+ vfmal through the size field (set to 16 here), not bit 23. */
+ high8 = 0xfe000000;
+ if (subtype)
+ size = 16;
+ rs = neon_select_shape (NS_DHS, NS_QDS, NS_NULL);
+ }
+ else
+ {
+ /* Vector form: bit 23 selects vfmsl; size is the fixed value 0x2
+ mentioned in the NOTE above (passed as 32 to neon_three_same). */
+ high8 = 0xfc000000;
+ size = 32;
+ if (subtype)
+ inst.instruction |= (0x1 << 23);
+ rs = neon_select_shape (NS_DHH, NS_QDD, NS_NULL);
+ }
+
+ neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16);
+
+ /* "opcode" from template has included "ubit", so simply pass 0 here. Also,
+ the "S" bit in size field has been reused to differentiate vfmal and vfmsl,
+ so we simply pass -1 as size. */
+ unsigned quad_p = (rs == NS_QDD || rs == NS_QDS);
+ neon_three_same (quad_p, 0, size);
+
+ /* Undo neon_dp_fixup. Redo the high eight bits. */
+ inst.instruction &= 0x00ffffff;
+ inst.instruction |= high8;
+
+#define LOW1(R) ((R) & 0x1)
+#define HI4(R) (((R) >> 1) & 0xf)
+ /* Unlike the usual NEON three-same encoding, encoding for Vn and Vm will
+ depend on whether the instruction is in Q form and whether Vm is a scalar
+ indexed operand. */
+ if (inst.operands[2].isscalar)
+ {
+ /* Clear the Rm field laid down by neon_three_same, then insert the
+ scalar encoding produced by the helper. */
+ unsigned rm
+ = neon_scalar_for_fmac_fp16_long (inst.operands[2].reg, quad_p);
+ inst.instruction &= 0xffffffd0;
+ inst.instruction |= rm;
+
+ if (!quad_p)
+ {
+ /* Redo Rn as well. */
+ inst.instruction &= 0xfff0ff7f;
+ inst.instruction |= HI4 (inst.operands[1].reg) << 16;
+ inst.instruction |= LOW1 (inst.operands[1].reg) << 7;
+ }
+ }
+ else if (!quad_p)
+ {
+ /* Redo Rn and Rm. */
+ inst.instruction &= 0xfff0ff50;
+ inst.instruction |= HI4 (inst.operands[1].reg) << 16;
+ inst.instruction |= LOW1 (inst.operands[1].reg) << 7;
+ inst.instruction |= HI4 (inst.operands[2].reg);
+ inst.instruction |= LOW1 (inst.operands[2].reg) << 5;
+ }
+}
+
+/* Handle the vfmal mnemonic: subtype 0 of the shared helper. */
+
+static void
+do_neon_vfmal (void)
+{
+ do_neon_fmac_maybe_scalar_long (0);
+}
+
+/* Handle the vfmsl mnemonic: subtype 1 of the shared helper. */
+
+static void
+do_neon_vfmsl (void)
+{
+ do_neon_fmac_maybe_scalar_long (1);
+}
+