i965/fs: Switch to the constant cache for uniform pull constants. This reverts to using the oword block read messages for uniform pull constant loads, as used to be the case until 4c1fdae0a01b3f92ec03b61aac1d3df5. There are two important differences though: Now the L3 cacheability bits are set up correctly for UBOs (since 11f5d8a5d4fbb861ec161f68593e429cbd65d1cd), and we target the constant cache instead of the data cache. The latter used to get no L3 way allocation on boot on all platforms that existed at the time, so oword read messages wouldn't get cached on L3 regardless of the MOCS bits, what probably explains the apparent slowness of oword fetches. Constant cache loads seem to perform better than SIMD4x2 sampler loads in a number of cases, they alleviate some of the cache thrashing caused by the competition with textures for the L1/L2 sampler caches, and they allow fetching up to 128B worth of constants with a single oword fetch message. Note that IVB devices suffer from a hardware bug that leads to serialization of L3 read requests overlapping the same cacheline as result of a (on IVB buggy) mechanism of the L3 to preserve coherency. Since read requests for matching cachelines from any L3 client are not pipelined, throughput may decrease in cases where there are no non-overlapping requests left in the queue that can be processed between them. This situation should be relatively uncommon as long as we make sure that we don't use the 1/2 oword messages in cases where the shader intends to read from any other location of the same cacheline at some other point. This is generally a good idea anyway on all generations because using the 1 and 2 oword messages is expected to waste bandwidth since the minimum L3 request size for the DC is exactly 4 owords (i.e. one cacheline). A future commit will have this effect. I haven't been able to find any real-world example where this would still result in a regression on IVB, but if someone happens to find one it shouldn't be too difficult to add an IVB-specific check to have it fall back to the sampler cache for pull constant loads. Note that on SKL+ this change has the additional benefit of reducing the register footprint of pull constant loads. The following table summarizes the effect of the whole series on several shader-db stats: Total instructions Total cycles BWR: 4571248 -> 4568342 (-0.06%) 123375740 -> 123373296 (-0.00%) ELK: 3989020 -> 3985402 (-0.09%) 98757068 -> 98754058 (-0.00%) ILK: 6383591 -> 6376787 (-0.11%) 143649910 -> 143648914 (-0.00%) SNB: 7528395 -> 7501446 (-0.36%) 103503796 -> 102460370 (-1.01%) IVB: 6949221 -> 6943317 (-0.08%) 60592262 -> 60584422 (-0.01%) HSW: 6409753 -> 6403702 (-0.09%) 60609070 -> 60604414 (-0.01%) BDW: 8043467 -> 7976364 (-0.83%) 68427730 -> 68483042 (0.08%) CHV: 8045019 -> 7977916 (-0.83%) 68297426 -> 68352756 (0.08%) SKL: 8204037 -> 7939086 (-3.23%) 66583900 -> 65624378 (-1.44%) Lost->Gained Total spills Total fills BWR: 5 -> 5 1488 -> 1488 (0.00%) 1957 -> 1957 (0.00%) ELK: 5 -> 5 1489 -> 1489 (0.00%) 1958 -> 1958 (0.00%) ILK: 1 -> 4 1449 -> 1449 (0.00%) 1921 -> 1921 (0.00%) SNB: 0 -> 0 549 -> 549 (0.00%) 52 -> 52 (0.00%) IVB: 13 -> 3 1271 -> 1271 (0.00%) 1162 -> 1162 (0.00%) HSW: 11 -> 0 1271 -> 1271 (0.00%) 1162 -> 1162 (0.00%) BDW: 12 -> 0 1340 -> 1340 (0.00%) 1452 -> 1452 (0.00%) CHV: 12 -> 0 1340 -> 1340 (0.00%) 1452 -> 1452 (0.00%) SKL: 0 -> 120 1269 -> 375 (-70.45%) 1563 -> 690 (-55.85%) v3: Non-trivial rebase. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>

commit: ad38ba113491869ab0dffed937f7b3dd50e8a735 [log] [tgz]
author: Francisco Jerez <currojerez@riseup.net> Wed Oct 26 14:25:06 2016 -0700
committer: Francisco Jerez <currojerez@riseup.net> Wed Dec 14 16:50:26 2016 -0800
tree: 1545105f12c1b259d1e883014e6781425a2ac4c2
parent: 3c78d31374422b028b19afa5799689c404a5b73e [diff]
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 72b6df6..341f543 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c

@@ -2266,7 +2266,7 @@
 }
 
 /**
- * Read a float[4] vector from the data port Data Cache (const buffer).
+ * Read a float[4] vector from the data port constant cache.
  * Location (in buffer) should be a multiple of 16.
  * Used for fetching shader constants.
  */
@@ -2278,8 +2278,7 @@
 {
    const struct gen_device_info *devinfo = p->devinfo;
    const unsigned target_cache =
-      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
-       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
+      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
        BRW_DATAPORT_READ_TARGET_DATA_CACHE);
 
    /* On newer hardware, offset is in units of owords. */

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 50266ad..b22dc9a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp

@@ -3202,44 +3202,18 @@
          continue;
 
       if (devinfo->gen >= 7) {
-         /* The offset arg is a vec4-aligned immediate byte offset. */
-         fs_reg const_offset_reg = inst->src[1];
-         assert(const_offset_reg.file == IMM &&
-                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
-         assert(const_offset_reg.ud % 16 == 0);
+         const fs_builder ubld = fs_builder(this, block, inst).exec_all();
+         const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
 
-         fs_reg payload, offset;
-         if (devinfo->gen >= 9) {
-            /* We have to use a message header on Skylake to get SIMD4x2
-             * mode.  Reserve space for the register.
-            */
-            offset = payload = fs_reg(VGRF, alloc.allocate(2));
-            offset.offset += REG_SIZE;
-            inst->mlen = 2;
-         } else {
-            offset = payload = fs_reg(VGRF, alloc.allocate(1));
-            inst->mlen = 1;
-         }
+         ubld.group(8, 0).MOV(payload,
+                              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         ubld.group(1, 0).MOV(component(payload, 2),
+                              brw_imm_ud(inst->src[1].ud / 16));
 
-         /* This is actually going to be a MOV, but since only the first dword
-          * is accessed, we have a special opcode to do just that one.  Note
-          * that this needs to be an operation that will be considered a def
-          * by live variable analysis, or register allocation will explode.
-          */
-         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
-                                               8, offset, const_offset_reg);
-         setup->force_writemask_all = true;
-
-         setup->ir = inst->ir;
-         setup->annotation = inst->annotation;
-         inst->insert_before(block, setup);
-
-         /* Similarly, this will only populate the first 4 channels of the
-          * result register (since we only use smear values from 0-3), but we
-          * don't tell the optimizer.
-          */
          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
          inst->src[1] = payload;
+         inst->header_size = 1;
+         inst->mlen = 1;
 
          invalidate_live_intervals();
       } else {

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 91c3985..941c05f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h

@@ -421,7 +421,7 @@
    void generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg surf_index,
-                                                 struct brw_reg offset);
+                                                 struct brw_reg payload);
    void generate_varying_pull_constant_load_gen4(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index);

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 4ef1a29..8b9fa8e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp

@@ -1145,42 +1145,13 @@
 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                        struct brw_reg dst,
                                                        struct brw_reg index,
-                                                       struct brw_reg offset)
+                                                       struct brw_reg payload)
 {
    assert(index.type == BRW_REGISTER_TYPE_UD);
-
-   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
-   /* Reference just the dword we need, to avoid angering validate_reg(). */
-   offset = brw_vec1_grf(offset.nr, 0);
-
-   /* We use the SIMD4x2 mode because we want to end up with 4 components in
-    * the destination loaded consecutively from the same offset (which appears
-    * in the first component, and the rest are ignored).
-    */
-   dst.width = BRW_WIDTH_4;
-
-   struct brw_reg src = offset;
-   bool header_present = false;
-
-   if (devinfo->gen >= 9) {
-      /* Skylake requires a message header in order to use SIMD4x2 mode. */
-      src = retype(brw_vec4_grf(offset.nr, 0), BRW_REGISTER_TYPE_UD);
-      header_present = true;
-
-      brw_push_insn_state(p);
-      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_default_exec_size(p, BRW_EXECUTE_8);
-      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-      brw_set_default_access_mode(p, BRW_ALIGN_1);
-
-      brw_MOV(p, get_element_ud(src, 2),
-              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
-      brw_pop_insn_state(p);
-   }
+   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
 
    if (index.file == BRW_IMMEDIATE_VALUE) {
-
-      uint32_t surf_index = index.ud;
+      const uint32_t surf_index = index.ud;
 
       brw_push_insn_state(p);
       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
@@ -1189,19 +1160,18 @@
       brw_inst_set_exec_size(devinfo, send, BRW_EXECUTE_4);
       brw_pop_insn_state(p);
 
-      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
-      brw_set_src0(p, send, src);
-      brw_set_sampler_message(p, send,
+      brw_set_dest(p, send, vec4(retype(dst, BRW_REGISTER_TYPE_UD)));
+      brw_set_src0(p, send, vec4(retype(payload, BRW_REGISTER_TYPE_UD)));
+      brw_set_dp_read_message(p, send,
                               surf_index,
-                              0, /* LD message ignores sampler unit */
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              1, /* rlen */
-                              inst->mlen,
-                              header_present,
-                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                              0);
-   } else {
+                              BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+                              1, /* mlen */
+                              true, /* header */
+                              1); /* rlen */
 
+   } else {
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
@@ -1217,16 +1187,18 @@
 
       /* dst = send(payload, a0.0 | <descriptor>) */
       brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, src, addr);
-      brw_set_sampler_message(p, insn,
-                              0,
-                              0, /* LD message ignores sampler unit */
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              1, /* rlen */
-                              inst->mlen,
-                              header_present,
-                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                              0);
+         p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+         vec4(retype(dst, BRW_REGISTER_TYPE_UD)),
+         vec4(retype(payload, BRW_REGISTER_TYPE_UD)), addr);
+      brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
+      brw_set_dp_read_message(p, insn,
+                              0, /* surface */
+                              BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+                              1, /* mlen */
+                              true, /* header */
+                              1); /* rlen */
 
       brw_pop_insn_state(p);
    }
commit	ad38ba113491869ab0dffed937f7b3dd50e8a735	[log] [tgz]
author	Francisco Jerez <currojerez@riseup.net>	Wed Oct 26 14:25:06 2016 -0700
committer	Francisco Jerez <currojerez@riseup.net>	Wed Dec 14 16:50:26 2016 -0800
tree	1545105f12c1b259d1e883014e6781425a2ac4c2
parent	3c78d31374422b028b19afa5799689c404a5b73e [diff]