| /* |
| * Copyright © 2010 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "brw_shader.h" |
| #include "brw_builder.h" |
| |
| /** |
| * Factor an unsigned 32-bit integer. |
| * |
| * Attempts to factor \c x into two values that are at most 0xFFFF. If no |
| * such factorization is possible, either because the value is too large or is |
| * prime, both \c result_a and \c result_b will be zero. |
| */ |
| static void |
| factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b) |
| { |
| /* This is necessary to prevent various opportunities for division by zero |
| * below. |
| */ |
| assert(x > 0xffff); |
| |
| /* This represents the actual expected constraints on the input. Namely, |
| * both the upper and lower words should be > 1. |
| */ |
| assert(x >= 0x00020002); |
| |
| *result_a = 0; |
| *result_b = 0; |
| |
| /* The value is too large to factor with the constraints. */ |
| if (x > (0xffffu * 0xffffu)) |
| return; |
| |
| /* A non-prime number will have the form p*q*d where p is some prime |
| * number, q > 1, and 1 <= d <= q. To meet the constraints of this |
| * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p). |
| * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally, |
| * floor(x / (0xffff * p)) <= d <= floor(0xffff / p). |
| * |
| * The observation is finding the largest possible value of p reduces the |
| * possible range of d. After selecting p, all values of d in this range |
| * are tested until a factorization is found. The size of the range of |
| * possible values of d sets an upper bound on the run time of the |
| * function. |
| */ |
| static const uint16_t primes[256] = { |
| 2, 3, 5, 7, 11, 13, 17, 19, |
| 23, 29, 31, 37, 41, 43, 47, 53, |
| 59, 61, 67, 71, 73, 79, 83, 89, |
| 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */ |
| 137, 139, 149, 151, 157, 163, 167, 173, |
| 179, 181, 191, 193, 197, 199, 211, 223, |
| 227, 229, 233, 239, 241, 251, 257, 263, |
| 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */ |
| 313, 317, 331, 337, 347, 349, 353, 359, |
| 367, 373, 379, 383, 389, 397, 401, 409, |
| 419, 421, 431, 433, 439, 443, 449, 457, |
| 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */ |
| 509, 521, 523, 541, 547, 557, 563, 569, |
| 571, 577, 587, 593, 599, 601, 607, 613, |
| 617, 619, 631, 641, 643, 647, 653, 659, |
| 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */ |
| 727, 733, 739, 743, 751, 757, 761, 769, |
| 773, 787, 797, 809, 811, 821, 823, 827, |
| 829, 839, 853, 857, 859, 863, 877, 881, |
| 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */ |
| 947, 953, 967, 971, 977, 983, 991, 997, |
| 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, |
| 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, |
| 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */ |
| 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, |
| 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, |
| 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, |
| 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */ |
| 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459, |
| 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, |
| 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, |
| 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */ |
| }; |
| |
| unsigned p; |
| unsigned x_div_p; |
| |
| for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) { |
| p = primes[i]; |
| x_div_p = x / p; |
| |
| if ((x_div_p * p) == x) |
| break; |
| } |
| |
| /* A prime factor was not found. */ |
| if (x_div_p * p != x) |
| return; |
| |
| /* Terminate early if d=1 is a solution. */ |
| if (x_div_p < 0x10000) { |
| *result_a = x_div_p; |
| *result_b = p; |
| return; |
| } |
| |
| /* Pick the maximum possible value for 'd'. It's important that the loop |
| * below execute while d <= max_d because max_d is a valid value. Having |
| * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be |
| * incorrectly reported as not being factorable. The problem would occur |
| * with any value that is a factor of two primes in the table and one prime |
| * not in the table. |
| */ |
| const unsigned max_d = 0xffff / p; |
| |
| /* Pick an initial value of 'd' that (combined with rejecting too large |
| * values above) guarantees that 'q' will always be small enough. |
| * DIV_ROUND_UP is used to prevent 'd' from being zero. |
| */ |
| for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) { |
| unsigned q = x_div_p / d; |
| |
| if ((q * d) == x_div_p) { |
| assert(p * d * q == x); |
| assert((p * d) < 0x10000); |
| |
| *result_a = q; |
| *result_b = p * d; |
| break; |
| } |
| |
| /* Since every value of 'd' is tried, as soon as 'd' is larger |
| * than 'q', we're just re-testing combinations that have |
| * already been tested. |
| */ |
| if (d > q) |
| break; |
| } |
| } |
| |
| static void |
| brw_lower_mul_dword_inst(brw_shader &s, brw_inst *inst) |
| { |
| const intel_device_info *devinfo = s.devinfo; |
| const brw_builder ibld(inst); |
| |
| /* It is correct to use inst->src[1].d in both end of the comparison. |
| * Using .ud in the UINT16_MAX comparison would cause any negative value to |
| * fail the check. |
| */ |
| if (inst->src[1].file == IMM && |
| (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) { |
| /* The MUL instruction isn't commutative. On Gen >= 7 only |
| * the low 16-bits of src1 are used. |
| * |
| * If multiplying by an immediate value that fits in 16-bits, do a |
| * single MUL instruction with that value in the proper location. |
| */ |
| const bool ud = (inst->src[1].d >= 0); |
| ibld.MUL(inst->dst, inst->src[0], |
| ud ? brw_imm_uw(inst->src[1].ud) |
| : brw_imm_w(inst->src[1].d)); |
| } else { |
| /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot |
| * do 32-bit integer multiplication in one instruction, but instead |
| * must do a sequence (which actually calculates a 64-bit result): |
| * |
| * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D |
| * mach(8) null g3<8,8,1>D g4<8,8,1>D |
| * mov(8) g2<1>D acc0<8,8,1>D |
| * |
| * But on Gen > 6, the ability to use second accumulator register |
| * (acc1) for non-float data types was removed, preventing a simple |
| * implementation in SIMD16. A 16-channel result can be calculated by |
| * executing the three instructions twice in SIMD8, once with quarter |
| * control of 1Q for the first eight channels and again with 2Q for |
| * the second eight channels. |
| * |
| * Which accumulator register is implicitly accessed (by AccWrEnable |
| * for instance) is determined by the quarter control. Unfortunately |
| * Ivybridge (and presumably Baytrail) has a hardware bug in which an |
| * implicit accumulator access by an instruction with 2Q will access |
| * acc1 regardless of whether the data type is usable in acc1. |
| * |
| * Specifically, the 2Q mach(8) writes acc1 which does not exist for |
| * integer data types. |
| * |
| * Since we only want the low 32-bits of the result, we can do two |
| * 32-bit x 16-bit multiplies (like the mul and mach are doing), and |
| * adjust the high result and add them (like the mach is doing): |
| * |
| * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW |
| * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW |
| * shl(8) g9<1>D g8<8,8,1>D 16D |
| * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D |
| * |
| * We avoid the shl instruction by realizing that we only want to add |
| * the low 16-bits of the "high" result to the high 16-bits of the |
| * "low" result and using proper regioning on the add: |
| * |
| * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW |
| * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW |
| * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW |
| * |
| * Since it does not use the (single) accumulator register, we can |
| * schedule multi-component multiplications much better. |
| */ |
| |
| bool needs_mov = false; |
| brw_reg orig_dst = inst->dst; |
| |
| /* Get a new VGRF for the "low" 32x16-bit multiplication result if |
| * reusing the original destination is impossible due to hardware |
| * restrictions, source/destination overlap, or it being the null |
| * register. |
| */ |
| brw_reg low = inst->dst; |
| if (orig_dst.is_null() || |
| regions_overlap(inst->dst, inst->size_written, |
| inst->src[0], inst->size_read(devinfo, 0)) || |
| regions_overlap(inst->dst, inst->size_written, |
| inst->src[1], inst->size_read(devinfo, 1)) || |
| inst->dst.stride >= 4) { |
| needs_mov = true; |
| low = retype(brw_allocate_vgrf_units(s, regs_written(inst)), inst->dst.type); |
| } |
| |
| /* Get a new VGRF but keep the same stride as inst->dst */ |
| brw_reg high = retype(brw_allocate_vgrf_units(s, regs_written(inst)), inst->dst.type); |
| high.stride = inst->dst.stride; |
| high.offset = inst->dst.offset % REG_SIZE; |
| |
| bool do_addition = true; |
| { |
| /* From Wa_1604601757: |
| * |
| * "When multiplying a DW and any lower precision integer, source modifier |
| * is not supported." |
| * |
| * An unsupported negate modifier on src[1] would ordinarily be |
| * lowered by the subsequent lower_regioning pass. In this case that |
| * pass would spawn another dword multiply. Instead, lower the |
| * modifier first. |
| */ |
| const bool source_mods_unsupported = (devinfo->ver >= 12); |
| |
| if (inst->src[1].abs || (inst->src[1].negate && |
| source_mods_unsupported)) |
| brw_lower_src_modifiers(s, inst, 1); |
| |
| if (inst->src[1].file == IMM) { |
| unsigned a; |
| unsigned b; |
| |
| /* If the immeditate value can be factored into two values, A and |
| * B, that each fit in 16-bits, the multiplication result can |
| * instead be calculated as (src1 * (A * B)) = ((src1 * A) * B). |
| * This saves an operation (the addition) and a temporary register |
| * (high). |
| * |
| * Skip the optimization if either the high word or the low word |
| * is 0 or 1. In these conditions, at least one of the |
| * multiplications generated by the straightforward method will be |
| * eliminated anyway. |
| */ |
| if (inst->src[1].ud > 0x0001ffff && |
| (inst->src[1].ud & 0xffff) > 1) { |
| factor_uint32(inst->src[1].ud, &a, &b); |
| |
| if (a != 0) { |
| ibld.MUL(low, inst->src[0], brw_imm_uw(a)); |
| ibld.MUL(low, low, brw_imm_uw(b)); |
| do_addition = false; |
| } |
| } |
| |
| if (do_addition) { |
| ibld.MUL(low, inst->src[0], |
| brw_imm_uw(inst->src[1].ud & 0xffff)); |
| ibld.MUL(high, inst->src[0], |
| brw_imm_uw(inst->src[1].ud >> 16)); |
| } |
| } else { |
| ibld.MUL(low, inst->src[0], |
| subscript(inst->src[1], BRW_TYPE_UW, 0)); |
| ibld.MUL(high, inst->src[0], |
| subscript(inst->src[1], BRW_TYPE_UW, 1)); |
| } |
| } |
| |
| if (do_addition) { |
| ibld.ADD(subscript(low, BRW_TYPE_UW, 1), |
| subscript(low, BRW_TYPE_UW, 1), |
| subscript(high, BRW_TYPE_UW, 0)); |
| } |
| |
| if (needs_mov || inst->conditional_mod) |
| set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low)); |
| } |
| } |
| |
| static void |
| brw_lower_mul_qword_inst(brw_shader &s, brw_inst *inst) |
| { |
| const intel_device_info *devinfo = s.devinfo; |
| const brw_builder ibld(inst); |
| |
| /* Considering two 64-bit integers ab and cd where each letter ab |
| * corresponds to 32 bits, we get a 128-bit result WXYZ. We * cd |
| * only need to provide the YZ part of the result. ------- |
| * BD |
| * Only BD needs to be 64 bits. For AD and BC we only care + AD |
| * about the lower 32 bits (since they are part of the upper + BC |
| * 32 bits of our result). AC is not needed since it starts + AC |
| * on the 65th bit of the result. ------- |
| * WXYZ |
| */ |
| unsigned int q_regs = regs_written(inst); |
| unsigned int d_regs = (q_regs + 1) / 2; |
| |
| brw_reg bd = retype(brw_allocate_vgrf_units(s, q_regs), BRW_TYPE_UQ); |
| brw_reg ad = retype(brw_allocate_vgrf_units(s, d_regs), BRW_TYPE_UD); |
| brw_reg bc = retype(brw_allocate_vgrf_units(s, d_regs), BRW_TYPE_UD); |
| |
| /* Here we need the full 64 bit result for 32b * 32b. */ |
| if (devinfo->has_integer_dword_mul) { |
| ibld.MUL(bd, subscript(inst->src[0], BRW_TYPE_UD, 0), |
| subscript(inst->src[1], BRW_TYPE_UD, 0)); |
| } else { |
| brw_reg bd_high = retype(brw_allocate_vgrf_units(s, d_regs), BRW_TYPE_UD); |
| brw_reg bd_low = retype(brw_allocate_vgrf_units(s, d_regs), BRW_TYPE_UD); |
| const unsigned acc_width = reg_unit(devinfo) * 8; |
| brw_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_TYPE_UD), |
| inst->group % acc_width); |
| |
| brw_inst *mul = ibld.MUL(acc, |
| subscript(inst->src[0], BRW_TYPE_UD, 0), |
| subscript(inst->src[1], BRW_TYPE_UW, 0)); |
| mul->writes_accumulator = true; |
| |
| ibld.MACH(bd_high, subscript(inst->src[0], BRW_TYPE_UD, 0), |
| subscript(inst->src[1], BRW_TYPE_UD, 0)); |
| ibld.MOV(bd_low, acc); |
| |
| ibld.UNDEF(bd); |
| ibld.MOV(subscript(bd, BRW_TYPE_UD, 0), bd_low); |
| ibld.MOV(subscript(bd, BRW_TYPE_UD, 1), bd_high); |
| } |
| |
| ibld.MUL(ad, subscript(inst->src[0], BRW_TYPE_UD, 1), |
| subscript(inst->src[1], BRW_TYPE_UD, 0)); |
| ibld.MUL(bc, subscript(inst->src[0], BRW_TYPE_UD, 0), |
| subscript(inst->src[1], BRW_TYPE_UD, 1)); |
| |
| ibld.ADD(ad, ad, bc); |
| ibld.ADD(subscript(bd, BRW_TYPE_UD, 1), |
| subscript(bd, BRW_TYPE_UD, 1), ad); |
| |
| if (devinfo->has_64bit_int) { |
| ibld.MOV(inst->dst, bd); |
| } else { |
| if (!inst->is_partial_write()) |
| ibld.emit_undef_for_dst(inst); |
| ibld.MOV(subscript(inst->dst, BRW_TYPE_UD, 0), |
| subscript(bd, BRW_TYPE_UD, 0)); |
| ibld.MOV(subscript(inst->dst, BRW_TYPE_UD, 1), |
| subscript(bd, BRW_TYPE_UD, 1)); |
| } |
| } |
| |
| static void |
| brw_lower_mulh_inst(brw_shader &s, brw_inst *inst) |
| { |
| const intel_device_info *devinfo = s.devinfo; |
| const brw_builder ibld(inst); |
| |
| /* According to the BDW+ BSpec page for the "Multiply Accumulate |
| * High" instruction: |
| * |
| * "An added preliminary mov is required for source modification on |
| * src1: |
| * mov (8) r3.0<1>:d -r3<8;8,1>:d |
| * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw |
| * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d" |
| */ |
| if (inst->src[1].negate || inst->src[1].abs) |
| brw_lower_src_modifiers(s, inst, 1); |
| |
| /* Should have been lowered to 8-wide. */ |
| assert(inst->exec_size <= brw_get_lowered_simd_width(&s, inst)); |
| const unsigned acc_width = reg_unit(devinfo) * 8; |
| const brw_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), inst->dst.type), |
| inst->group % acc_width); |
| brw_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]); |
| ibld.MACH(inst->dst, inst->src[0], inst->src[1]); |
| |
| /* Until Gfx8, integer multiplies read 32-bits from one source, |
| * and 16-bits from the other, and relying on the MACH instruction |
| * to generate the high bits of the result. |
| * |
| * On Gfx8, the multiply instruction does a full 32x32-bit |
| * multiply, but in order to do a 64-bit multiply we can simulate |
| * the previous behavior and then use a MACH instruction. |
| */ |
| assert(mul->src[1].type == BRW_TYPE_D || |
| mul->src[1].type == BRW_TYPE_UD); |
| mul->src[1].type = BRW_TYPE_UW; |
| mul->src[1].stride *= 2; |
| |
| if (mul->src[1].file == IMM) { |
| mul->src[1] = brw_imm_uw(mul->src[1].ud); |
| } |
| } |
| |
| bool |
| brw_lower_integer_multiplication(brw_shader &s) |
| { |
| const intel_device_info *devinfo = s.devinfo; |
| bool progress = false; |
| |
| foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) { |
| if (inst->opcode == BRW_OPCODE_MUL) { |
| /* If the instruction is already in a form that does not need lowering, |
| * return early. |
| */ |
| if (brw_type_size_bytes(inst->src[1].type) < 4 && brw_type_size_bytes(inst->src[0].type) <= 4) |
| continue; |
| |
| if ((inst->dst.type == BRW_TYPE_Q || |
| inst->dst.type == BRW_TYPE_UQ) && |
| (inst->src[0].type == BRW_TYPE_Q || |
| inst->src[0].type == BRW_TYPE_UQ) && |
| (inst->src[1].type == BRW_TYPE_Q || |
| inst->src[1].type == BRW_TYPE_UQ)) { |
| brw_lower_mul_qword_inst(s, inst); |
| inst->remove(); |
| progress = true; |
| } else if (!inst->dst.is_accumulator() && |
| (inst->dst.type == BRW_TYPE_D || |
| inst->dst.type == BRW_TYPE_UD) && |
| (!devinfo->has_integer_dword_mul || |
| devinfo->verx10 >= 125)) { |
| brw_lower_mul_dword_inst(s, inst); |
| inst->remove(); |
| progress = true; |
| } |
| } else if (inst->opcode == SHADER_OPCODE_MULH) { |
| brw_lower_mulh_inst(s, inst); |
| inst->remove(); |
| progress = true; |
| } |
| |
| } |
| |
| if (progress) |
| s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | |
| BRW_DEPENDENCY_VARIABLES); |
| |
| return progress; |
| } |
| |
| |