radeonsi: fuse or lower ffma optimally on all chips

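LLVM is going to support the legacy instructions (MAD_LEGACY, FMA_LEGACY)
soon.

This change switches 32-bit FMA to MAD for gfx10. It also fuses 16-bit FMA
on gfx9 (previously lowered) and fuses 64-bit FMA on all chips (previously
lowered on gfx6-9).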

54793 shaders in 33659 tests
Totals:
SGPRS: 2632554 -> 2629570 (-0.11 %)
VGPRS: 1536364 -> 1535312 (-0.07 %)
Spilled SGPRs: 3602 -> 3562 (-1.11 %)
Spilled VGPRs: 44 -> 40 (-9.09 %)
Private memory VGPRs: 256 -> 256 (0.00 %)
Scratch size: 312 -> 308 (-1.28 %) dwords per thread
Code Size: 55422660 -> 55345408 (-0.14 %) bytes
Max Waves: 963983 -> 964200 (0.02 %)

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6756>
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index f738b69..ad8868f 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -862,7 +862,7 @@
       break;
    case nir_op_ffma:
       /* FMA is slow on gfx6-8, so it shouldn't be used. */
-      assert(ctx->ac.chip_class >= GFX9);
+      assert(instr->dest.dest.ssa.bit_size != 32 || ctx->ac.chip_class >= GFX9);
       result = emit_intrin_3f_param(&ctx->ac, "llvm.fma", ac_to_float_type(&ctx->ac, def_type),
                                     src[0], src[1], src[2]);
       break;
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 51a8390..4c68474 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -937,20 +937,28 @@
       .lower_bitfield_insert_to_bitfield_select = true,
       .lower_bitfield_extract = true,
       .lower_sub = true,
-      /* gfx6-8: use MAD (FMA is 4x slower)
-       * gfx9-10: either is OK (MAD and FMA have the same performance)
-       * gfx10.3: use FMA (MAD doesn't exist, separate MUL+ADD are 2x slower)
+      /*        |---------------------------------- Performance & Availability --------------------------------|
+       *        |MAD/MAC/MADAK/MADMK|MAD_LEGACY|MAC_LEGACY|    FMA     |FMAC/FMAAK/FMAMK|FMA_LEGACY|PK_FMA_F16,|Best choice
+       * Arch   |    F32,F16,F64    | F32,F16  | F32,F16  |F32,F16,F64 |    F32,F16     | F32,F16  |PK_FMAC_F16|F16,F32,F64
+       * ------------------------------------------------------------------------------------------------------------------
+       * gfx6,7 |     1 , - , -     |  1 , -   |  1 , -   |1/4, - ,1/16|     - , -      |  - , -   |   - , -   | - ,MAD,FMA
+       * gfx8   |     1 , 1 , -     |  1 , -   |  - , -   |1/4, 1 ,1/16|     - , -      |  - , -   |   - , -   |MAD,MAD,FMA
+       * gfx9   |     1 , 1 , -     |  1 , -   |  1 , -   | 1 , 1 ,1/16|     - , -      |  - , 1   |   2 , -   |FMA,MAD,FMA
+       * gfx10  |     1 , 1 , -     |  1 , -   |  1 , -   | 1 , 1 ,1/16|     1 , 1      |  - , -   |   2 , 2   |FMA,MAD,FMA
+       * gfx10.3|     - , - , -     |  - , -   |  - , -   | 1 , 1 ,1/16|     1 , 1      |  1 , -   |   2 , 2   |  all FMA
        *
-       * FMA has no advantage on gfx9-10 and MAD allows more algebraic optimizations.
-       * Keep FMA enabled on gfx10 to test it, which helps us validate correctness
-       * for gfx10.3 on gfx10.
+       * Tahiti, Hawaii, Carrizo, Vega20: FMA_F32 is full rate, FMA_F64 is 1/4
+       *
+       * gfx8 prefers MAD for F16 because of MAC/MADAK/MADMK.
+       * gfx9 and newer prefer FMA for F16 because of the packed instruction (PK_FMA_F16).
+       * gfx10 and older prefer MAD for F32 because of the legacy instruction (MAD_LEGACY).
        */
-      .lower_ffma16 = sscreen->info.chip_class <= GFX9,
-      .lower_ffma32 = sscreen->info.chip_class <= GFX9,
-      .lower_ffma64 = sscreen->info.chip_class <= GFX9,
-      .fuse_ffma16 = sscreen->info.chip_class >= GFX10,
-      .fuse_ffma32 = sscreen->info.chip_class >= GFX10,
-      .fuse_ffma64 = sscreen->info.chip_class >= GFX10,
+      .lower_ffma16 = sscreen->info.chip_class < GFX9,
+      .lower_ffma32 = sscreen->info.chip_class < GFX10_3,
+      .lower_ffma64 = false,
+      .fuse_ffma16 = sscreen->info.chip_class >= GFX9,
+      .fuse_ffma32 = sscreen->info.chip_class >= GFX10_3,
+      .fuse_ffma64 = true,
       .lower_fmod = true,
       .lower_pack_snorm_4x8 = true,
       .lower_pack_unorm_4x8 = true,