| .section #gm107_builtin_code |
| // DIV U32 |
| // |
| // UNR recurrence (q = a / b): |
| // look for z such that 2^32 - b <= b * z < 2^32 |
| // then q - 1 <= (a * z) / 2^32 <= q |
| // |
| // INPUT: $r0: dividend, $r1: divisor |
| // OUTPUT: $r0: result, $r1: modulus |
| // CLOBBER: $r2 - $r3, $p0 - $p1 |
| // SIZE: 22 / 14 * 8 bytes |
| // |
| gm107_div_u32: |
// Entry: $r0 = dividend a, $r1 = divisor b (see the INPUT/OUTPUT header above).
//
// Layout note: every "sched" line holds three parenthesised groups and is
// followed by exactly three instructions, so it presumably carries one set of
// scheduling control fields (st/wr/rd/wt) per instruction -- TODO confirm
// against the assembler. If that reading is right, instructions cannot be
// added, removed or reordered without re-deriving the sched lines.
//
// Seed: $r2 = 1 << (flo($r1) ^ 0x1f), the first guess for z.
// Presumably flo yields the index of the highest set bit of b -- TODO confirm.
| sched (st 0xd wr 0x0 wt 0x3f) (st 0x1 wt 0x1) (st 0x6) |
| flo u32 $r2 $r1 |
| lop xor 1 $r2 $r2 0x1f |
| mov $r3 0x1 0xf |
| sched (st 0x1) (st 0xf wr 0x0) (st 0x6 wr 0x0 wt 0x1) |
| shl $r2 $r3 $r2 |
// $r1 = -b; it stays negated for the whole refinement sequence below.
| i2i u32 u32 $r1 neg $r1 |
// Refinement of z, unrolled five times as an imul/imad pair:
//   $r3 = (-b) * z
//   z   = hi(z * $r3) + z
// ("hi" presumably selects the upper 32 bits of the product -- TODO confirm.)
| imul u32 u32 $r3 $r1 $r2 |
| sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| imul u32 u32 $r3 $r1 $r2 |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) |
| imul u32 u32 $r3 $r1 $r2 |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| imul u32 u32 $r3 $r1 $r2 |
| sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| imul u32 u32 $r3 $r1 $r2 |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| sched (st 0x6) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2) |
// Keep a copy of the dividend in $r3, then $r0 = hi(a * z): the quotient estimate.
| mov $r3 $r0 0xf |
| imul u32 u32 hi $r0 $r0 $r2 |
// $r2 = b again (negating the already negated $r1; $r1 itself still holds -b).
| i2i u32 u32 $r2 neg $r1 |
| sched (st 0x6 wr 0x0 wt 0x3) (st 0xd wt 0x1) (st 0x1) |
// $r1 = (-b) * quotient + a, i.e. the remainder that goes with the estimate.
| imad u32 u32 $r1 $r1 $r0 $r3 |
// First correction: $p0 = (remainder >= b, unsigned compare); when set,
// remainder -= b and quotient += 1.
| isetp ge u32 and $p0 1 $r1 $r2 1 |
| $p0 iadd $r1 $r1 neg $r2 |
| sched (st 0x5) (st 0xd) (st 0x1) |
| $p0 iadd $r0 $r0 0x1 |
// Second correction: the compare and both adds are predicated on $p0, so this
// step is only attempted when the first correction was applied.
| $p0 isetp ge u32 and $p0 1 $r1 $r2 1 |
| $p0 iadd $r1 $r1 neg $r2 |
| sched (st 0x1) (st 0xf) (st 0xf) |
| $p0 iadd $r0 $r0 0x1 |
// Exit: $r0 = quotient, $r1 = remainder.
| ret |
// Occupies the third slot of the final sched group.
| nop 0 |
| |
| // DIV S32, like DIV U32 after taking ABS(inputs) |
| // |
| // INPUT: $r0: dividend, $r1: divisor |
| // OUTPUT: $r0: result, $r1: modulus |
| // CLOBBER: $r2 - $r3, $p0 - $p3 |
| // |
| gm107_div_s32: |
// Entry: $r0 = dividend a, $r1 = divisor b, both signed (see the header above).
// The body is the same sequence as gm107_div_u32 run on abs(a) and abs(b),
// bracketed by sign bookkeeping in $p2/$p3.
//
// Layout note: every "sched" line holds three parenthesised groups and is
// followed by exactly three instructions, so it presumably carries one set of
// scheduling control fields per instruction -- TODO confirm. If so, the
// instruction order and the sched lines have to be changed together.
| sched (st 0xd wt 0x3f) (st 0x1) (st 0x1 wr 0x0) |
// $p2 = (a < 0): recorded for the sign of the remainder.
| isetp lt and $p2 0x1 $r0 0 1 |
// $p3 = (b < 0) xor $p2: set when the operand signs differ, i.e. the sign of
// the quotient.
| isetp lt xor $p3 1 $r1 0 $p2 |
// Replace both operands by their absolute values.
| i2i s32 s32 $r0 abs $r0 |
| sched (st 0xf wr 0x1) (st 0xd wr 0x1 wt 0x2) (st 0x1 wt 0x2) |
| i2i s32 s32 $r1 abs $r1 |
// Seed: $r2 = 1 << (flo($r1) ^ 0x1f), the first guess for z.
// Presumably flo yields the index of the highest set bit of |b| -- TODO confirm.
| flo u32 $r2 $r1 |
| lop xor 1 $r2 $r2 0x1f |
| sched (st 0x6) (st 0x1) (st 0xf wr 0x1) |
| mov $r3 0x1 0xf |
| shl $r2 $r3 $r2 |
// $r1 = -|b|; it stays negated for the whole refinement sequence below.
| i2i u32 u32 $r1 neg $r1 |
| sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) |
// Refinement of z, unrolled five times as an imul/imad pair:
//   $r3 = (-|b|) * z
//   z   = hi(z * $r3) + z
// ("hi" presumably selects the upper 32 bits of the product -- TODO confirm.)
| imul u32 u32 $r3 $r1 $r2 |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| imul u32 u32 $r3 $r1 $r2 |
| sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| imul u32 u32 $r3 $r1 $r2 |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) |
| imul u32 u32 $r3 $r1 $r2 |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
| imul u32 u32 $r3 $r1 $r2 |
| sched (st 0x6 wr 0x1 rd 0x2 wt 0x2) (st 0x2 wt 0x5) (st 0x6 wr 0x0 rd 0x1 wt 0x2) |
| imad u32 u32 hi $r2 $r2 $r3 $r2 |
// Keep a copy of |a| in $r3, then $r0 = hi(|a| * z): the quotient estimate.
| mov $r3 $r0 0xf |
| imul u32 u32 hi $r0 $r0 $r2 |
| sched (st 0xf wr 0x1 rd 0x2 wt 0x2) (st 0x6 wr 0x0 wt 0x5) (st 0xd wt 0x3) |
// $r2 = |b| again (negating the already negated $r1; $r1 still holds -|b|).
| i2i u32 u32 $r2 neg $r1 |
// $r1 = (-|b|) * quotient + |a|, i.e. the remainder that goes with the estimate.
| imad u32 u32 $r1 $r1 $r0 $r3 |
// First correction: $p0 = (remainder >= |b|, unsigned compare); when set,
// remainder -= |b| and quotient += 1.
| isetp ge u32 and $p0 1 $r1 $r2 1 |
| sched (st 0x1) (st 0x5) (st 0xd) |
| $p0 iadd $r1 $r1 neg $r2 |
| $p0 iadd $r0 $r0 0x1 |
// Second correction: the compare and both adds are predicated on $p0, so this
// step is only attempted when the first correction was applied.
| $p0 isetp ge u32 and $p0 1 $r1 $r2 1 |
| sched (st 0x1) (st 0x2) (st 0xf wr 0x0) |
| $p0 iadd $r1 $r1 neg $r2 |
| $p0 iadd $r0 $r0 0x1 |
// Restore signs: negate the quotient when the operand signs differed ($p3) ...
| $p3 i2i s32 s32 $r0 neg $r0 |
| sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf) |
// ... and negate the remainder when the dividend was negative ($p2).
| $p2 i2i s32 s32 $r1 neg $r1 |
// Exit: $r0 = quotient, $r1 = remainder.
| ret |
// Occupies the third slot of the final sched group.
| nop 0 |
| |
| // STUB |
| gm107_rcp_f64: |
| gm107_rsq_f64: |
// Both labels mark the same location and share this body, which returns
// immediately without touching any register: no f64 reciprocal or reciprocal
// square root is computed here.
| sched (st 0x0) (st 0x0) (st 0x0) |
| ret |
// The two nops occupy the remaining slots of the three-instruction sched group.
| nop 0 |
| nop 0 |
| |
| .section #gm107_builtin_offsets |
// One 64-bit entry per builtin label, in the order
// div_u32, div_s32, rcp_f64, rsq_f64.
// NOTE(review): whatever consumes this table presumably looks entries up by
// position, which would make the order part of the interface -- confirm
// before inserting or reordering entries.
// gm107_rcp_f64 and gm107_rsq_f64 label the same stub, so the last two
// entries refer to the same location.
| .b64 #gm107_div_u32 |
| .b64 #gm107_div_s32 |
| .b64 #gm107_rcp_f64 |
| .b64 #gm107_rsq_f64 |