freedreno: slurp in decode tools

cffdump, crashdec, etc

At this point there is some duplication with other files in-tree (i.e.
a2xx and a3xx+ disassembly), which will be cleaned up in a later commit.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6070>
diff --git a/src/freedreno/decode/buffers.c b/src/freedreno/decode/buffers.c
new file mode 100644
index 0000000..8e696f8
--- /dev/null
+++ b/src/freedreno/decode/buffers.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Helper lib to track gpu buffers contents/address, and map between gpu and
+ * host address while decoding cmdstream/crashdumps
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "buffers.h"
+
+struct buffer {
+	void *hostptr;           /* host copy of the contents; free()d by reset_buffers() */
+	unsigned int len;        /* size in bytes of the range starting at hostptr / gpuaddr */
+	uint64_t gpuaddr;        /* gpu address of the first byte */
+
+	/* for 'once' mode, for buffers containing cmdstream keep track per offset
+	 * into buffer of which modes it has already been dumped;
+	 */
+	struct {
+		unsigned offset;         /* byte offset from gpuaddr */
+		unsigned dumped_mask;    /* OR of the enable_mask values seen by has_dumped() */
+	} offsets[64];
+	unsigned noffsets;       /* number of offsets[] entries in use */
+};
+
+static struct buffer buffers[512];   /* fixed-size table; add_buffer() drops buffers beyond it */
+static int nbuffers;                 /* number of buffers[] entries in use */
+
+static int
+buffer_contains_gpuaddr(struct buffer *buf, uint64_t gpuaddr, uint32_t len)
+{	/* half-open test [buf->gpuaddr, buf->gpuaddr + buf->len); the len argument is unused */
+	return (gpuaddr >= buf->gpuaddr) && ((buf->gpuaddr + buf->len) > gpuaddr);
+}
+
+static int
+buffer_contains_hostptr(struct buffer *buf, void *hostptr)
+{	/* same half-open test on the host side (relies on GNU void* arithmetic) */
+	return (hostptr >= buf->hostptr) && ((buf->hostptr + buf->len) > hostptr);
+}
+
+
+uint64_t
+gpuaddr(void *hostptr)
+{	/* host pointer -> gpu address; 0 when no recorded buffer contains it */
+	for (struct buffer *buf = buffers; buf < &buffers[nbuffers]; buf++) {
+		if (buffer_contains_hostptr(buf, hostptr))
+			return buf->gpuaddr + (hostptr - buf->hostptr);
+	}
+	return 0;
+}
+
+uint64_t
+gpubaseaddr(uint64_t gpuaddr)
+{	/* start address of the buffer holding gpuaddr; 0 for a zero or unknown address */
+	if (gpuaddr == 0)
+		return 0;
+	for (struct buffer *buf = buffers; buf < &buffers[nbuffers]; buf++) {
+		if (buffer_contains_gpuaddr(buf, gpuaddr, 0))
+			return buf->gpuaddr;
+	}
+	return 0;
+}
+
+void *
+hostptr(uint64_t gpuaddr)
+{	/* gpu address -> pointer into the recorded host copy; NULL when not tracked */
+	if (gpuaddr == 0)
+		return NULL;
+	for (struct buffer *buf = buffers; buf < &buffers[nbuffers]; buf++) {
+		if (buffer_contains_gpuaddr(buf, gpuaddr, 0))
+			return buf->hostptr + (gpuaddr - buf->gpuaddr);
+	}
+	return NULL;
+}
+
+unsigned
+hostlen(uint64_t gpuaddr)
+{	/* bytes from gpuaddr to the end of its buffer; 0 when not tracked */
+	if (gpuaddr == 0)
+		return 0;
+	for (struct buffer *buf = buffers; buf < &buffers[nbuffers]; buf++) {
+		if (buffer_contains_gpuaddr(buf, gpuaddr, 0))
+			return buf->len + buf->gpuaddr - gpuaddr;
+	}
+	return 0;
+}
+
+bool
+has_dumped(uint64_t gpuaddr, unsigned enable_mask)
+{	/* true once gpuaddr has been seen with every bit of enable_mask; otherwise marks it */
+	if (!gpuaddr)
+		return false;
+
+	for (int i = 0; i < nbuffers; i++) {
+		if (buffer_contains_gpuaddr(&buffers[i], gpuaddr, 0)) {
+			struct buffer *b = &buffers[i];
+			assert(gpuaddr >= b->gpuaddr);
+			unsigned offset = gpuaddr - b->gpuaddr;
+
+			unsigned n = 0;
+			while (n < b->noffsets) {
+				if (offset == b->offsets[n].offset)
+					break;
+				n++;
+			}
+
+			/* if needed, allocate a new offset entry: */
+			if (n == b->noffsets) {
+				/* table full: report "not dumped" instead of writing past offsets[] */
+				if (n >= ARRAY_SIZE(b->offsets))
+					return false;
+				b->noffsets++;
+				b->offsets[n].dumped_mask = 0;
+				b->offsets[n].offset = offset;
+			}
+
+			if ((b->offsets[n].dumped_mask & enable_mask) == enable_mask)
+				return true;
+			b->offsets[n].dumped_mask |= enable_mask;
+
+			return false;
+		}
+	}
+
+	return false;
+}
+
+void
+reset_buffers(void)
+{	/* free every recorded host copy and empty the buffer table */
+	while (nbuffers > 0) {
+		struct buffer *buf = &buffers[--nbuffers];
+		free(buf->hostptr);
+		buf->hostptr = NULL;
+		buf->len = 0;
+		buf->noffsets = 0;
+	}
+}
+
+/**
+ * Record buffer contents, taking ownership of hostptr (it is freed in
+ * reset_buffers()).  An entry with the same gpuaddr is overwritten in place.
+ */
+void
+add_buffer(uint64_t gpuaddr, unsigned int len, void *hostptr)
+{
+	int slot = 0;
+
+	/* reuse the entry already recorded for this gpuaddr, if any: */
+	while ((slot < nbuffers) && (buffers[slot].gpuaddr != gpuaddr))
+		slot++;
+
+	if (slot == nbuffers) {
+		/* some traces, like test-perf, with some blob versions,
+		 * seem to generate an unreasonable # of gpu buffers (a
+		 * leak?), so just ignore them.
+		 */
+		if (nbuffers >= ARRAY_SIZE(buffers)) {
+			free(hostptr);
+			return;
+		}
+		nbuffers++;
+	}
+
+	struct buffer *buf = &buffers[slot];
+	buf->hostptr = hostptr;
+	buf->len     = len;
+	buf->gpuaddr = gpuaddr;
+}
diff --git a/src/freedreno/decode/buffers.h b/src/freedreno/decode/buffers.h
new file mode 100644
index 0000000..f63f3f3
--- /dev/null
+++ b/src/freedreno/decode/buffers.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __BUFFERS_H__
+#define __BUFFERS_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+uint64_t gpuaddr(void *hostptr);            /* host pointer -> gpu address, 0 if not tracked */
+uint64_t gpubaseaddr(uint64_t gpuaddr);     /* start of the buffer containing gpuaddr, 0 if not tracked */
+void * hostptr(uint64_t gpuaddr);           /* gpu address -> host pointer, NULL if not tracked */
+unsigned hostlen(uint64_t gpuaddr);         /* bytes from gpuaddr to the end of its buffer, 0 if not tracked */
+bool has_dumped(uint64_t gpuaddr, unsigned enable_mask);  /* 'once' mode bookkeeping per address */
+
+void reset_buffers(void);                   /* free and forget every recorded buffer */
+void add_buffer(uint64_t gpuaddr, unsigned int len, void *hostptr);  /* takes ownership of hostptr */
+
+#ifndef ARRAY_SIZE
+#  define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))  /* element count; true arrays only, not pointers */
+#endif
+
+#endif /* __BUFFERS_H__ */
diff --git a/src/freedreno/decode/cffdec.c b/src/freedreno/decode/cffdec.c
new file mode 100644
index 0000000..d0b2695
--- /dev/null
+++ b/src/freedreno/decode/cffdec.c
@@ -0,0 +1,2717 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <err.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+#include <errno.h>
+
+#include "redump.h"
+#include "disasm.h"
+#include "script.h"
+#include "rnnutil.h"
+#include "buffers.h"
+#include "cffdec.h"
+
+/* ************************************************************************* */
+/* originally based on kernel recovery dump code: */
+
+static const struct cffdec_options *options;   /* set by cffdec_init(); owned by the caller */
+
+static bool needs_wfi = false;     /* when set, dump_registers() flags non-banked register writes */
+static bool summary = false;       /* copied from options->summary; silences level >= 3 output in quiet() */
+static bool in_summary = false;    /* the tex sampler/const register handlers only run while set */
+static int vertices;               /* NOTE(review): no user visible in this part of the file */
+
+static inline unsigned regcnt(void)
+{
+	/* Register offset limit that reg_set() asserts against:
+	 * 0xffff for gpu_id >= 500, 0x7fff for anything older.
+	 */
+	return (options->gpu_id >= 500) ? 0xffff : 0x7fff;
+}
+
+static int is_64b(void)
+{	/* 1 for gpu_id >= 500, where addresses are handled as 64 bit values; else 0 */
+	return !(options->gpu_id < 500);
+}
+
+
+static int draws[3];
+static struct {
+	uint64_t base;   /* compared against options->ibs[].base in highlight_gpuaddr() */
+	uint32_t size;   /* in dwords */
+	/* Generally cmdstream consists of multiple IB calls to different
+	 * buffers, which are themselves often re-used for each tile.  The
+	 * triggered flag serves two purposes to help make it more clear
+	 * what part of the cmdstream is before vs after the GPU hang:
+	 *
+	 * 1) if in IB2 we are past the point within the IB2 buffer where
+	 *    the GPU hung, but IB1 is not past the point within its
+	 *    buffer where the GPU had hung, then we know the GPU hang
+	 *    happens on a future use of that IB2 buffer.
+	 *
+	 * 2) if in an IB1 or IB2 buffer that is not the one where the GPU
+	 *    hung, but we've already passed the trigger point at the same
+	 *    IB level, we know that we are past the point where the GPU
+	 *    had hung.
+	 *
+	 * So this is a one way switch, false->true.  And a higher #'d
+	 * IB level isn't considered triggered unless the lower #'d IB
+	 * level is.
+	 */
+	bool triggered;
+} ibs[4];
+static int ib;   /* current IB level; index into ibs[] and options->ibs[] */
+
+static int draw_count;           /* reset to 0 by cffdec_init() */
+static int current_draw_count;   /* compared with options->draw_filter in quiet() */
+
+/* query mode.. to handle symbolic register name queries, we need to
+ * defer parsing query string until after gpu_id is known and rnn db
+ * loaded:
+ */
+static int *queryvals;   /* one entry per options->querystrs[], filled in by init_rnn() */
+
+static bool
+quiet(int lvl)
+{	/* true when output at verbosity level lvl has to be suppressed */
+	const bool filtered = (options->draw_filter != -1) &&
+			(options->draw_filter != current_draw_count);
+	const bool scripted = options->querystrs || options->script;
+
+	if (filtered || ((lvl >= 2) && scripted))
+		return true;
+	return (lvl >= 3) && summary;
+}
+
+void
+printl(int lvl, const char *fmt, ...)
+{	/* printf() to stdout, unless quiet(lvl) says this level is suppressed */
+	if (!quiet(lvl)) {
+		va_list ap;
+		va_start(ap, fmt);
+		vprintf(fmt, ap);
+		va_end(ap);
+	}
+}
+
+static const char *levels[] = {   /* indent prefix, indexed by nesting level */
+		"\t",
+		"\t\t",
+		"\t\t\t",
+		"\t\t\t\t",
+		"\t\t\t\t\t",
+		"\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t\t\t",
+		"x",   /* levels 9 and deeper print a literal "x" instead of tabs */
+		"x",
+		"x",
+		"x",
+		"x",
+		"x",
+};
+
+enum state_src_t {   /* source kind handed to dump_tex_samp() */
+	STATE_SRC_DIRECT,     /* the value the register handlers in this file pass */
+	STATE_SRC_INDIRECT,
+	STATE_SRC_BINDLESS,
+};
+
+/* SDS (CP_SET_DRAW_STATE) helpers: */
+static void load_all_groups(int level);
+static void disable_all_groups(void);
+
+static void dump_tex_samp(uint32_t *texsamp, enum state_src_t src, int num_unit, int level);   /* forward decl for reg_dump_tex_samp_hi() */
+static void dump_tex_const(uint32_t *texsamp, int num_unit, int level);   /* forward decl for reg_dump_tex_const_hi() */
+
+static bool
+highlight_gpuaddr(uint64_t gpuaddr)
+{
+	/* Decide whether the row at gpuaddr gets the crash highlight.  Needs
+	 * color output and a recorded position (options->ibs[]) for this level.
+	 */
+	if (!options->color || !options->ibs[ib].base)
+		return false;
+
+	/* a higher IB level only triggers once the level below it has: */
+	if ((ib > 0) && options->ibs[ib-1].base && !ibs[ib-1].triggered)
+		return false;
+
+	/* one-way switch: once triggered, every later row is highlighted */
+	if (ibs[ib].triggered)
+		return true;
+
+	if (options->ibs[ib].base != ibs[ib].base)
+		return false;
+
+	/* window = the last options->ibs[ib].rem dwords of the IB, end inclusive: */
+	const uint64_t lo = ibs[ib].base + 4 * (ibs[ib].size - options->ibs[ib].rem);
+	const uint64_t hi = ibs[ib].base + 4 * ibs[ib].size;
+	if ((gpuaddr < lo) || (gpuaddr > hi))
+		return false;
+
+	ibs[ib].triggered = true;
+	printf("ESTIMATED CRASH LOCATION!\n");
+	return true;
+}
+
+static void
+dump_hex(uint32_t *dwords, uint32_t sizedwords, int level)
+{	/* hexdump of sizedwords dwords, 8 per row; runs of all-zero rows collapse to "*" */
+	int i, j;
+	int lastzero = 1;
+
+	if (quiet(2))
+		return;
+
+	for (i = 0; i < sizedwords; i += 8) {
+		int zero = 1;
+
+		/* always show first row: */
+		if (i == 0)
+			zero = 0;
+
+		for (j = 0; (j < 8) && (i+j < sizedwords) && zero; j++)
+			if (dwords[i+j])
+				zero = 0;
+
+		if (zero && !lastzero)
+			printf("*\n");
+
+		lastzero = zero;
+
+		if (zero)
+			continue;
+
+		uint64_t addr = gpuaddr(&dwords[i]);
+		bool highlight = highlight_gpuaddr(addr);   /* may print the crash marker line */
+
+		if (highlight)
+			printf("\x1b[0;1;31m");   /* ANSI: bold red */
+
+		if (is_64b()) {   /* PRIx64 instead of %lx: uint64_t is not 'long' on every target */
+			printf("%016" PRIx64 ":%s", addr, levels[level]);
+		} else {
+			printf("%08x:%s", (uint32_t)addr, levels[level]);
+		}
+
+		if (highlight)
+			printf("\x1b[0m");   /* ANSI: reset attributes */
+
+		printf("%04x:", i * 4);   /* byte offset of the row within the dump */
+
+		for (j = 0; (j < 8) && (i+j < sizedwords); j++) {
+			printf(" %08x", dwords[i+j]);
+		}
+
+		printf("\n");
+	}
+}
+
+static void
+dump_float(float *dwords, uint32_t sizedwords, int level)
+{	/* print sizedwords floats, 8 per row, each row prefixed with its gpu address */
+	int i;
+	for (i = 0; i < sizedwords; i++) {
+		if ((i % 8) == 0) {
+			if (is_64b()) {   /* PRIx64 instead of %lx: uint64_t is not 'long' on every target */
+				printf("%016" PRIx64 ":%s", gpuaddr(dwords), levels[level]);
+			} else {
+				printf("%08x:%s", (uint32_t)gpuaddr(dwords), levels[level]);
+			}
+		} else {
+			printf(" ");
+		}
+		printf("%8f", *(dwords++));
+		if ((i % 8) == 7)
+			printf("\n");
+	}
+	if (i % 8)   /* finish a partially filled last row */
+		printf("\n");
+}
+
+/* I believe the surface format is low bits:
+#define RB_COLOR_INFO__COLOR_FORMAT_MASK                   0x0000000fL
+comments in sys2gmem_tex_const indicate that address is [31:12], but
+looks like at least some of the bits above the format have different meaning..
+Splits dword into *flags (the bits selected by mask) and *gpuaddr (the rest).
+*/
+static void parse_dword_addr(uint32_t dword, uint32_t *gpuaddr, uint32_t *flags, uint32_t mask)
+{
+	const uint32_t low = dword & mask;
+	assert(!is_64b());  /* this is only used on a2xx */
+	*gpuaddr = dword ^ low;
+	*flags   = low;
+}
+
+static uint32_t type0_reg_vals[0xffff + 1];   /* latest value per register offset, see reg_set()/reg_val() */
+static uint8_t type0_reg_rewritten[sizeof(type0_reg_vals)/8];  /* written since last draw */
+static uint8_t type0_reg_written[sizeof(type0_reg_vals)/8];   /* one bit per register; NOTE(review): sized from bytes, so 4x larger than needed */
+static uint32_t lastvals[ARRAY_SIZE(type0_reg_vals)];   /* read via reg_lastval(), zeroed by clear_lastvals() */
+
+static bool reg_rewritten(uint32_t regbase)
+{	/* was regbase written since the last clear_rewritten()? */
+	return (type0_reg_rewritten[regbase >> 3] >> (regbase & 7)) & 1;
+}
+
+bool reg_written(uint32_t regbase)
+{	/* was regbase written since the last clear_written()? */
+	return (type0_reg_written[regbase >> 3] >> (regbase & 7)) & 1;
+}
+
+static void clear_rewritten(void)
+{	/* drop only the "rewritten" flags; the "written" flags are kept */
+	memset(type0_reg_rewritten, 0, sizeof type0_reg_rewritten);
+}
+
+static void clear_written(void)
+{	/* drop both sets of flags */
+	clear_rewritten();
+	memset(type0_reg_written, 0, sizeof type0_reg_written);
+}
+
+uint32_t reg_lastval(uint32_t regbase)
+{	/* value remembered for regbase in lastvals[] */
+	return lastvals[regbase];
+}
+
+static void
+clear_lastvals(void)
+{	/* zero every remembered last value */
+	memset(lastvals, 0, sizeof lastvals);
+}
+
+uint32_t
+reg_val(uint32_t regbase)
+{	/* most recent value stored for regbase by reg_set() */
+	return type0_reg_vals[regbase];
+}
+
+void
+reg_set(uint32_t regbase, uint32_t val)
+{	/* store val and flag regbase in both the written and rewritten bitmaps */
+	assert(regbase < regcnt());
+	type0_reg_written[regbase >> 3] |= (1 << (regbase & 7));
+	type0_reg_rewritten[regbase >> 3] |= (1 << (regbase & 7));
+	type0_reg_vals[regbase] = val;
+}
+
+static void
+reg_dump_scratch(const char *name, uint32_t dword, int level)
+{
+	/* Print the tracked values of the four registers at +4..+7 from the
+	 * first scratch register on one line.  name and dword are not used.
+	 */
+	if (quiet(3))
+		return;
+
+	unsigned scratch0 = regbase("CP_SCRATCH[0].REG");
+
+	/* if not found, try the old a2xx/a3xx name: */
+	if (scratch0 == 0)
+		scratch0 = regbase("CP_SCRATCH_REG0");
+	if (scratch0 == 0)
+		return;
+
+	printf("%s:%u,%u,%u,%u\n", levels[level],
+			reg_val(scratch0 + 4), reg_val(scratch0 + 5),
+			reg_val(scratch0 + 6), reg_val(scratch0 + 7));
+}
+
+static void
+dump_gpuaddr_size(uint64_t gpuaddr, int level, int sizedwords, int quietlvl)
+{	/* hexdump up to sizedwords dwords at gpuaddr; silent when quiet or not tracked */
+	if (quiet(quietlvl))
+		return;
+
+	uint32_t *buf = hostptr(gpuaddr);
+	if (buf) {
+		/* never read past the end of the recorded buffer contents: */
+		uint32_t avail = hostlen(gpuaddr) / 4;
+		dump_hex(buf, (avail < (uint32_t)sizedwords) ? avail : (uint32_t)sizedwords, level+1);
+	}
+}
+
+static void
+dump_gpuaddr(uint64_t gpuaddr, int level)
+{	/* default dump: 64 dwords, shown only when level 3 output is not quiet */
+	dump_gpuaddr_size(gpuaddr, level, 64, 3);
+}
+
+static void
+reg_dump_gpuaddr(const char *name, uint32_t dword, int level)
+{	/* register handler: dword is the whole (32 bit) address; name is unused */
+	dump_gpuaddr(dword, level);
+}
+
+uint32_t gpuaddr_lo;   /* low half latched by reg_gpuaddr_lo() for the following _HI handler */
+static void
+reg_gpuaddr_lo(const char *name, uint32_t dword, int level)
+{	/* register handler for _LO registers: only remembers the value */
+	gpuaddr_lo = dword;
+}
+
+static void
+reg_dump_gpuaddr_hi(const char *name, uint32_t dword, int level)
+{	/* register handler for _HI registers: combine with gpuaddr_lo and dump */
+	dump_gpuaddr(gpuaddr_lo | (((uint64_t)dword) << 32), level);
+}
+
+
+static void
+dump_shader(const char *ext, void *buf, int bufsz)
+{	/* when options->dump_shaders is set, write buf to "NNNN.<ext>" in the cwd */
+	if (options->dump_shaders) {
+		static int n = 0;   /* running file number across calls */
+		char filename[32];  /* "0000.vo3" plus its NUL already needs 9 bytes */
+		int fd;
+		snprintf(filename, sizeof(filename), "%04d.%s", n++, ext);
+		fd = open(filename, O_WRONLY| O_TRUNC | O_CREAT, 0644);
+		write(fd, buf, bufsz);  /* best effort: fails harmlessly if open() failed */
+		close(fd);
+	}
+}
+
+static void
+disasm_gpuaddr(const char *name, uint64_t gpuaddr, int level)
+{
+	/* Hexdump (at most 64 dwords) and disassemble the shader at gpuaddr,
+	 * then hand it to dump_shader() when name matches a known stage.
+	 */
+	static const struct {
+		const char *match;   /* substring looked for in the register name */
+		const char *ext;     /* file extension passed to dump_shader() */
+	} stages[] = {
+		{ "SP_VS_OBJ", "vo3" },
+		{ "SP_FS_OBJ", "fo3" },
+		{ "SP_GS_OBJ", "go3" },
+		{ "SP_CS_OBJ", "co3" },
+	};
+
+	gpuaddr &= 0xfffffffffffffff0;  /* drop the low 4 bits of the address */
+	if (quiet(3))
+		return;
+
+	void *code = hostptr(gpuaddr);
+	if (!code)
+		return;
+	uint32_t sizedwords = hostlen(gpuaddr) / 4;
+	dump_hex(code, min(64, sizedwords), level+1);
+	disasm_a3xx(code, sizedwords, level+2, stdout, options->gpu_id);
+
+	/* this is a bit ugly way, but oh well.. (first matching entry wins) */
+	for (unsigned s = 0; s < ARRAY_SIZE(stages); s++) {
+		if (strstr(name, stages[s].match)) {
+			dump_shader(stages[s].ext, code, sizedwords * 4);
+			break;
+		}
+	}
+}
+
+static void
+reg_disasm_gpuaddr(const char *name, uint32_t dword, int level)
+{	/* register handler: dword is the whole (32 bit) shader address */
+	disasm_gpuaddr(name, dword, level);
+}
+
+static void
+reg_disasm_gpuaddr_hi(const char *name, uint32_t dword, int level)
+{	/* register handler: dword is the upper half, gpuaddr_lo the latched lower half */
+	disasm_gpuaddr(name, gpuaddr_lo | (((uint64_t)dword) << 32), level);
+}
+
+/* Look up the value of the TEX_COUNT register that belongs to the named
+ * TEX_SAMP/TEX_CONST reg; returns 0 if name contains neither CONST nor SAMP.
+ *
+ * Note, this kinda assumes an equal # of samplers and textures, but not
+ * really sure if there is a much better option.  I suppose on a6xx we
+ * could instead decode the bitfields in SP_xS_CONFIG
+ */
+static int
+get_tex_count(const char *name)
+{
+	char count_reg[strlen(name) + 5];
+	const char *tail = strstr(name, "CONST");
+
+	if (tail == NULL)
+		tail = strstr(name, "SAMP");
+	if (tail == NULL)
+		return 0;
+
+	/* keep everything before CONST/SAMP and append COUNT in its place: */
+	const int prefix_len = tail - name;
+	memcpy(count_reg, name, prefix_len);
+	strcpy(count_reg + prefix_len, "COUNT");
+
+	return reg_val(regbase(count_reg));
+}
+
+static void
+reg_dump_tex_samp_hi(const char *name, uint32_t dword, int level)
+{
+	/* _HI half of a TEX_SAMP address: pair it with the latched gpuaddr_lo
+	 * and pass the buffer to dump_tex_samp().  No-op unless in_summary.
+	 */
+	if (in_summary) {
+		const int count = get_tex_count(name);
+		const uint64_t addr = (((uint64_t)dword) << 32) | gpuaddr_lo;
+		uint32_t *samplers = hostptr(addr);
+
+		if (samplers)
+			dump_tex_samp(samplers, STATE_SRC_DIRECT, count, level+1);
+	}
+}
+
+static void
+reg_dump_tex_const_hi(const char *name, uint32_t dword, int level)
+{
+	/* _HI half of a TEX_CONST address: pair it with the latched gpuaddr_lo
+	 * and pass the buffer to dump_tex_const().  No-op unless in_summary.
+	 */
+	if (in_summary) {
+		const int count = get_tex_count(name);
+		const uint64_t addr = (((uint64_t)dword) << 32) | gpuaddr_lo;
+		uint32_t *consts = hostptr(addr);
+
+		if (consts)
+			dump_tex_const(consts, count, level+1);
+	}
+}
+
+/*
+ * Registers with special handling (rnndec_decode() handles rest).  One table per generation, each ending in {NULL}:
+ */
+#define REG(x, fxn) { #x, fxn }   /* stringifies the register name; regbase starts out 0 */
+static struct {
+	const char *regname;   /* name looked up with regbase() in init_rnn() */
+	void (*fxn)(const char *name, uint32_t dword, int level);   /* called from dump_register() on a match */
+	uint32_t regbase;      /* filled in by init_rnn() */
+} reg_a2xx[] = {
+		REG(CP_SCRATCH_REG0, reg_dump_scratch),
+		REG(CP_SCRATCH_REG1, reg_dump_scratch),
+		REG(CP_SCRATCH_REG2, reg_dump_scratch),
+		REG(CP_SCRATCH_REG3, reg_dump_scratch),
+		REG(CP_SCRATCH_REG4, reg_dump_scratch),
+		REG(CP_SCRATCH_REG5, reg_dump_scratch),
+		REG(CP_SCRATCH_REG6, reg_dump_scratch),
+		REG(CP_SCRATCH_REG7, reg_dump_scratch),
+		{NULL},
+}, reg_a3xx[] = {
+		REG(CP_SCRATCH_REG0, reg_dump_scratch),
+		REG(CP_SCRATCH_REG1, reg_dump_scratch),
+		REG(CP_SCRATCH_REG2, reg_dump_scratch),
+		REG(CP_SCRATCH_REG3, reg_dump_scratch),
+		REG(CP_SCRATCH_REG4, reg_dump_scratch),
+		REG(CP_SCRATCH_REG5, reg_dump_scratch),
+		REG(CP_SCRATCH_REG6, reg_dump_scratch),
+		REG(CP_SCRATCH_REG7, reg_dump_scratch),
+		REG(VSC_SIZE_ADDRESS, reg_dump_gpuaddr),
+		REG(SP_VS_PVT_MEM_ADDR_REG, reg_dump_gpuaddr),
+		REG(SP_FS_PVT_MEM_ADDR_REG, reg_dump_gpuaddr),
+		REG(SP_VS_OBJ_START_REG, reg_disasm_gpuaddr),
+		REG(SP_FS_OBJ_START_REG, reg_disasm_gpuaddr),
+		REG(TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+		{NULL},
+}, reg_a4xx[] = {
+		REG(CP_SCRATCH[0].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x1].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x2].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x3].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),
+		REG(SP_VS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+		REG(SP_FS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+		REG(SP_GS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+		REG(SP_HS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+		REG(SP_DS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+		REG(SP_CS_PVT_MEM_ADDR, reg_dump_gpuaddr),
+		REG(SP_VS_OBJ_START, reg_disasm_gpuaddr),
+		REG(SP_FS_OBJ_START, reg_disasm_gpuaddr),
+		REG(SP_GS_OBJ_START, reg_disasm_gpuaddr),
+		REG(SP_HS_OBJ_START, reg_disasm_gpuaddr),
+		REG(SP_DS_OBJ_START, reg_disasm_gpuaddr),
+		REG(SP_CS_OBJ_START, reg_disasm_gpuaddr),
+		REG(TPL1_TP_VS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+		REG(TPL1_TP_HS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+		REG(TPL1_TP_DS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+		REG(TPL1_TP_GS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+		REG(TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
+		{NULL},
+}, reg_a5xx[] = {
+		REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),
+		REG(SP_VS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_VS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_HS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_HS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_DS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_DS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_GS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_GS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_FS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_FS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_CS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_CS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(TPL1_VS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(TPL1_VS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(TPL1_VS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(TPL1_VS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(TPL1_HS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(TPL1_HS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(TPL1_HS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(TPL1_HS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(TPL1_DS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(TPL1_DS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(TPL1_DS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(TPL1_DS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(TPL1_GS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(TPL1_GS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(TPL1_GS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(TPL1_GS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(TPL1_FS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(TPL1_FS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(TPL1_FS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(TPL1_FS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(TPL1_CS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(TPL1_CS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(TPL1_CS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(TPL1_CS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(TPL1_TP_BORDER_COLOR_BASE_ADDR_LO,  reg_gpuaddr_lo),
+		REG(TPL1_TP_BORDER_COLOR_BASE_ADDR_HI,  reg_dump_gpuaddr_hi),
+//		REG(RB_MRT_FLAG_BUFFER[0].ADDR_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT_FLAG_BUFFER[0].ADDR_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_MRT_FLAG_BUFFER[1].ADDR_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT_FLAG_BUFFER[1].ADDR_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_MRT_FLAG_BUFFER[2].ADDR_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT_FLAG_BUFFER[2].ADDR_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_MRT_FLAG_BUFFER[3].ADDR_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT_FLAG_BUFFER[3].ADDR_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_MRT_FLAG_BUFFER[4].ADDR_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT_FLAG_BUFFER[4].ADDR_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_MRT_FLAG_BUFFER[5].ADDR_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT_FLAG_BUFFER[5].ADDR_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_MRT_FLAG_BUFFER[6].ADDR_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT_FLAG_BUFFER[6].ADDR_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_MRT_FLAG_BUFFER[7].ADDR_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT_FLAG_BUFFER[7].ADDR_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_BLIT_FLAG_DST_LO, reg_gpuaddr_lo),
+//		REG(RB_BLIT_FLAG_DST_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_MRT[0].BASE_LO, reg_gpuaddr_lo),
+//		REG(RB_MRT[0].BASE_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_DEPTH_BUFFER_BASE_LO, reg_gpuaddr_lo),
+//		REG(RB_DEPTH_BUFFER_BASE_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_DEPTH_FLAG_BUFFER_BASE_LO, reg_gpuaddr_lo),
+//		REG(RB_DEPTH_FLAG_BUFFER_BASE_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_BLIT_DST_LO, reg_gpuaddr_lo),
+//		REG(RB_BLIT_DST_HI, reg_dump_gpuaddr_hi),
+
+//		REG(RB_2D_SRC_LO, reg_gpuaddr_lo),
+//		REG(RB_2D_SRC_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_2D_SRC_FLAGS_LO, reg_gpuaddr_lo),
+//		REG(RB_2D_SRC_FLAGS_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_2D_DST_LO, reg_gpuaddr_lo),
+//		REG(RB_2D_DST_HI, reg_dump_gpuaddr_hi),
+//		REG(RB_2D_DST_FLAGS_LO, reg_gpuaddr_lo),
+//		REG(RB_2D_DST_FLAGS_HI, reg_dump_gpuaddr_hi),
+
+		{NULL},
+}, reg_a6xx[] = {
+		REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
+		REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),
+
+		REG(SP_VS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_VS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_HS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_HS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_DS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_DS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_GS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_GS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_FS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_FS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+		REG(SP_CS_OBJ_START_LO, reg_gpuaddr_lo),
+		REG(SP_CS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
+
+		REG(SP_VS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(SP_VS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(SP_VS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(SP_VS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(SP_HS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(SP_HS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(SP_HS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(SP_HS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(SP_DS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(SP_DS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(SP_DS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(SP_DS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(SP_GS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(SP_GS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(SP_GS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(SP_GS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(SP_FS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(SP_FS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(SP_FS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(SP_FS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+		REG(SP_CS_TEX_CONST_LO, reg_gpuaddr_lo),
+		REG(SP_CS_TEX_CONST_HI, reg_dump_tex_const_hi),
+		REG(SP_CS_TEX_SAMP_LO,  reg_gpuaddr_lo),
+		REG(SP_CS_TEX_SAMP_HI,  reg_dump_tex_samp_hi),
+
+		{NULL},
+}, *type0_reg;   /* table for the current gpu, selected in cffdec_init() */
+
+static struct rnn *rnn;   /* register database handle created by init_rnn() */
+
+static void
+init_rnn(const char *gpuname)
+{
+	/* Load the register database for gpuname, then resolve the query
+	 * strings and the special-cased register names against it.
+	 */
+	rnn = rnn_new(!options->color);
+	rnn_load(rnn, gpuname);
+
+	if (options->querystrs) {
+		queryvals = calloc(options->nquery, sizeof(queryvals[0]));
+
+		for (int q = 0; q < options->nquery; q++) {
+			const char *str = options->querystrs[q];
+
+			/* numeric first; anything parsing to 0 is looked up by name */
+			int val = strtol(str, NULL, 0);
+			queryvals[q] = (val != 0) ? val : regbase(str);
+			printf("querystr: %s -> 0x%x\n", str, queryvals[q]);
+		}
+	}
+
+	for (unsigned idx = 0; type0_reg[idx].regname; idx++) {
+		const char *rname = type0_reg[idx].regname;
+		if (!(type0_reg[idx].regbase = regbase(rname))) {
+			printf("invalid register name: %s\n", rname);
+			exit(1);
+		}
+	}
+}
+
+void
+reset_regs(void)
+{	/* clear the written/rewritten flags, the last values and the per-IB state */
+	memset(&ibs, 0, sizeof(ibs));
+	clear_lastvals();
+	clear_written();
+}
+
+void
+cffdec_init(const struct cffdec_options *_options)
+{	/* (re)initialize decoder state for _options and load the matching register db */
+	options = _options;
+	summary = options->summary;
+
+	/* in case we're decoding multiple files: */
+	free(queryvals);
+	queryvals = NULL;  /* init_rnn() only reallocates it when query strings are given */
+	reset_regs();
+	draw_count = 0;
+
+	/* TODO we need an API to free/cleanup any previous rnn */
+	switch (options->gpu_id) {
+	case 200 ... 299:
+		type0_reg = reg_a2xx;
+		init_rnn("a2xx");
+		break;
+	case 300 ... 399:
+		type0_reg = reg_a3xx;
+		init_rnn("a3xx");
+		break;
+	case 400 ... 499:
+		type0_reg = reg_a4xx;
+		init_rnn("a4xx");
+		break;
+	case 500 ... 599:
+		type0_reg = reg_a5xx;
+		init_rnn("a5xx");
+		break;
+	case 600 ... 699:
+		type0_reg = reg_a6xx;
+		init_rnn("a6xx");
+		break;
+	default:
+		errx(-1, "unsupported gpu");
+	}
+}
+
+const char *
+pktname(unsigned opc)
+{	/* name of opc in the "adreno_pm4_type3_packets" enum of the loaded database */
+	return rnn_enumname(rnn, "adreno_pm4_type3_packets", opc);
+}
+
+const char *
+regname(uint32_t regbase, int color)
+{	/* name for register offset regbase; color is forwarded to rnn_regname() */
+	return rnn_regname(rnn, regbase, color);
+}
+
+uint32_t
+regbase(const char *name)
+{	/* register offset for name; callers in this file treat 0 as "not found" */
+	return rnn_regbase(rnn, name);
+}
+
+static int
+endswith(uint32_t regbase, const char *suffix)
+{	/* nonzero when the (uncolored) name of register regbase ends with suffix */
+	const char *name = regname(regbase, 0);
+	size_t nlen = name ? strlen(name) : 0, slen = strlen(suffix);
+	if (slen > nlen)
+		return 0;
+	return memcmp(name + (nlen - slen), suffix, slen) == 0;  /* compare the tail only */
+}
+
+void
+dump_register_val(uint32_t regbase, uint32_t dword, int level)
+{	/* print one register as "name: decoded value"; raw hex when the db has no type/name */
+	struct rnndecaddrinfo *info = rnn_reginfo(rnn, regbase);
+
+	if (info && info->typeinfo) {
+		uint64_t gpuaddr = 0;
+		char *decoded = rnndec_decodeval(rnn->vc, info->typeinfo, dword);
+		printf("%s%s: %s", levels[level], info->name, decoded);
+
+		/* Try and figure out if we are looking at a gpuaddr.. this
+		 * might be useful for other gen's too, but at least a5xx has
+		 * the _HI/_LO suffix we can look for.  Maybe a better approach
+		 * would be some special annotation in the xml..
+		 */
+		if (options->gpu_id >= 500) {
+			if (endswith(regbase, "_HI") && endswith(regbase-1, "_LO")) {
+				gpuaddr = (((uint64_t)dword) << 32) | reg_val(regbase-1);
+			} else if (endswith(regbase, "_LO") && endswith(regbase+1, "_HI")) {
+				gpuaddr = (((uint64_t)reg_val(regbase+1)) << 32) | dword;
+			}
+		}
+
+		if (gpuaddr && hostptr(gpuaddr)) {
+			uint64_t base = gpubaseaddr(gpuaddr);
+			/* PRIx64/PRIu64: uint64_t is not 'long' on every target */
+			printf("\t\tbase=%" PRIx64 ", offset=%" PRIu64 ", size=%u",
+					base, gpuaddr - base, hostlen(base));
+		}
+
+		printf("\n");
+
+		free(decoded);
+	} else if (info) {
+		printf("%s%s: %08x\n", levels[level], info->name, dword);
+	} else {
+		printf("%s<%04x>: %08x\n", levels[level], regbase, dword);
+	}
+
+	if (info) {
+		free(info->name);
+		free(info);
+	}
+}
+
+static void
+dump_register(uint32_t regbase, uint32_t dword, int level)
+{	/* print the decoded register, then run its special handler if it has one */
+	if (!quiet(3))
+		dump_register_val(regbase, dword, level);
+
+	unsigned idx = 0;
+	while (type0_reg[idx].regname && (type0_reg[idx].regbase != regbase))
+		idx++;
+
+	/* stopped on a match unless we ran into the NULL terminator: */
+	if (type0_reg[idx].regname)
+		type0_reg[idx].fxn(type0_reg[idx].regname, dword, level);
+}
+
+static bool
+is_banked_reg(uint32_t regbase)
+{	/* register offsets in [0x2000, 0x2400) count as banked */
+	return (regbase >= 0x2000) && (regbase <= 0x23ff);
+}
+
+static void
+dump_registers(uint32_t regbase, uint32_t *dwords, uint32_t sizedwords, int level)
+{	/* record and print sizedwords consecutive registers, one value each from dwords */
+	for (uint32_t n = 0; n < sizedwords; n++, regbase++) {
+		const uint32_t val = dwords[n];
+		const int saved_summary = summary;
+
+		/* access to non-banked registers needs a WFI:
+		 * TODO banked register range for a2xx??
+		 */
+		if (needs_wfi && !is_banked_reg(regbase))
+			printl(2, "NEEDS WFI: %s (%x)\n", regname(regbase, 1), regbase);
+
+		reg_set(regbase, val);
+		dump_register(regbase, val, level);
+
+		summary = saved_summary;  /* put back whatever it was before this register */
+	}
+}
+
+/* Decode a run of dwords against the rnn domain called 'name' and print one
+ * line per decoded entry, prefixed with levels[level].
+ *
+ * If the domain cannot be found nothing happens.  The script_packet hook
+ * (when set) runs before the quiet(2) check, so it still sees the packet
+ * when printing is suppressed.  Decoding stops at the first offset for which
+ * no type information is available.
+ */
+static void
+dump_domain(uint32_t *dwords, uint32_t sizedwords, int level,
+		const char *name)
+{
+	struct rnndomain *dom;
+	int i;
+
+	dom = rnn_finddomain(rnn->db, name);
+
+	if (!dom)
+		return;
+
+	if (script_packet)
+		script_packet(dwords, sizedwords, rnn, dom);
+
+	if (quiet(2))
+		return;
+
+	for (i = 0; i < sizedwords; i++) {
+		struct rnndecaddrinfo *info = rnndec_decodeaddr(rnn->vc, dom, i, 0);
+		char *decoded;
+		if (!(info && info->typeinfo)) {
+			/* Nothing more to decode.  The lookup result is still
+			 * ours to release, otherwise it leaks on this path:
+			 */
+			if (info) {
+				free(info->name);
+				free(info);
+			}
+			break;
+		}
+		uint64_t value = dwords[i];
+		/* a field wider than 32 bits consumes the following dword too: */
+		if (info->typeinfo->high >= 32 && i < sizedwords - 1) {
+			value |= (uint64_t) dwords[i + 1] << 32;
+			i++; /* skip the next dword since we're printing it now */
+		}
+		decoded = rnndec_decodeval(rnn->vc, info->typeinfo, value);
+		/* Unlike the register printing path, we don't print the name
+		 * of the register, so if it doesn't contain other named
+		 * things (i.e. it isn't a bitset) then print the register
+		 * name as if it's a bitset with a single entry. This avoids
+		 * having to create a dummy register with a single entry to
+		 * get a name in the decoding.
+		 */
+		if (info->typeinfo->type == RNN_TTYPE_BITSET ||
+		    info->typeinfo->type == RNN_TTYPE_INLINE_BITSET) {
+			printf("%s%s\n", levels[level], decoded);
+		} else {
+			printf("%s{ %s%s%s = %s }\n", levels[level],
+					rnn->vc->colors->rname, info->name,
+					rnn->vc->colors->reset, decoded);
+		}
+		free(decoded);
+		free(info->name);
+		free(info);
+	}
+}
+
+
+/* Bounds of the current bin.  Written by cp_set_bin() and, on a5xx/a6xx,
+ * refreshed from the window scissor registers in __do_query().
+ */
+static uint32_t bin_x1, bin_y1;
+static uint32_t bin_x2, bin_y2;
+static unsigned mode;
+/* label printed by print_mode() and __do_query(): */
+static const char *render_mode;
+/* Bitmask of render modes; enable_mask starts out with all of them set. */
+static enum {
+	MODE_BINNING = 1 << 0,
+	MODE_GMEM    = 1 << 1,
+	MODE_BYPASS  = 1 << 2,
+	MODE_ALL     = MODE_BINNING | MODE_GMEM | MODE_BYPASS,
+} enable_mask = MODE_ALL;
+/* both flags are reported by print_mode(): */
+static bool skip_ib2_enable_global;
+static bool skip_ib2_enable_local;
+
+/* Print the current render mode and the skip_ib2 flags.  Only produces
+ * output for gpu_id >= 500 and when quiet(2) is not in effect.
+ */
+static void
+print_mode(int level)
+{
+	if (options->gpu_id < 500)
+		return;
+
+	if (quiet(2))
+		return;
+
+	printf("%smode: %s\n", levels[level], render_mode);
+	printf("%sskip_ib2: g=%d, l=%d\n", levels[level], skip_ib2_enable_global, skip_ib2_enable_local);
+}
+
+/* Decide whether the query dump for the current draw should be skipped.
+ *
+ *   QUERY_ALL:     never skip.
+ *   QUERY_WRITTEN: skip unless a queried register that has been written was
+ *                  also rewritten.
+ *   QUERY_DELTA:   skip unless a queried register that has been written
+ *                  differs from its lastvals[] entry.
+ *
+ * Any other query mode skips.
+ */
+static bool
+skip_query(void)
+{
+	if (options->query_mode == QUERY_ALL)
+		return false;
+
+	if ((options->query_mode != QUERY_WRITTEN) &&
+	    (options->query_mode != QUERY_DELTA))
+		return true;
+
+	for (int q = 0; q < options->nquery; q++) {
+		const uint32_t reg = queryvals[q];
+
+		if (!reg_written(reg))
+			continue;
+
+		if (options->query_mode == QUERY_WRITTEN) {
+			if (reg_rewritten(reg))
+				return false;
+		} else if (reg_val(reg) != lastvals[reg]) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/* Print one line for every queried register that has been written: draw
+ * number, primitive type, bin bounds, index count, (gpu_id >= 500) the render
+ * mode, then the value followed by a '!' if it differs from lastvals[] and a
+ * '+' if the register was rewritten, and finally the decoded register.
+ */
+static void
+__do_query(const char *primtype, uint32_t num_indices)
+{
+	int printed = 0;
+
+	/* for 500 <= gpu_id < 700 take the bin bounds from the window scissor: */
+	if ((options->gpu_id >= 500) && (options->gpu_id < 700)) {
+		const uint32_t tl = reg_val(regbase("GRAS_SC_WINDOW_SCISSOR_TL"));
+		const uint32_t br = reg_val(regbase("GRAS_SC_WINDOW_SCISSOR_BR"));
+
+		bin_x1 = tl & 0xffff;
+		bin_y1 = tl >> 16;
+		bin_x2 = br & 0xffff;
+		bin_y2 = br >> 16;
+	}
+
+	for (int q = 0; q < options->nquery; q++) {
+		const uint32_t reg = queryvals[q];
+
+		if (!reg_written(reg))
+			continue;
+
+		const uint32_t val = reg_val(reg);
+
+		printf("%4d: %s(%u,%u-%u,%u):%u:", draw_count, primtype,
+				bin_x1, bin_y1, bin_x2, bin_y2, num_indices);
+		if (options->gpu_id >= 500)
+			printf("%s:", render_mode);
+		printf("\t%08x", val);
+		printf((val != lastvals[reg]) ? "!" : " ");
+		printf(reg_rewritten(reg) ? "+" : " ");
+		dump_register_val(reg, val, 0);
+		printed++;
+	}
+
+	/* blank separator line when more than one register was printed: */
+	if (printed > 1)
+		printf("\n");
+}
+
+/* 'query-compare' mode: print the queried registers once with only the
+ * binning state groups loaded and once with the gmem/bypass groups loaded.
+ *
+ * The skip decision is taken with every mode enabled, so a register that is
+ * written or changed in any mode triggers the dump.
+ *
+ * (NOTE: this could cause false-positive for 'query-delta' if the reg is
+ * written with different values in binning vs sysmem/gmem mode, as previous
+ * values are not tracked per-mode, but that is considered acceptable.)
+ *
+ * enable_mask and render_mode are restored before returning, and all groups
+ * are disabled again.
+ */
+static void
+do_query_compare(const char *primtype, uint32_t num_indices)
+{
+	static const struct {
+		unsigned mask;
+		const char *label;
+	} passes[] = {
+		{ MODE_BINNING,            "BINNING" },  /* binning pass values */
+		{ MODE_GMEM | MODE_BYPASS, "DRAW" },     /* draw pass values */
+	};
+	const unsigned prev_mask = enable_mask;
+	const char *prev_mode = render_mode;
+
+	enable_mask = MODE_ALL;
+	clear_rewritten();
+	load_all_groups(0);
+
+	if (!skip_query()) {
+		for (unsigned p = 0; p < sizeof(passes) / sizeof(passes[0]); p++) {
+			enable_mask = passes[p].mask;
+			render_mode = passes[p].label;
+			clear_rewritten();
+			load_all_groups(0);
+			__do_query(primtype, num_indices);
+		}
+
+		printf("\n");
+	}
+
+	enable_mask = prev_mask;
+	render_mode = prev_mode;
+
+	disable_all_groups();
+}
+
+/* Per-draw hook: runs the script_draw callback (if set) and then the
+ * register query, either in compare mode or as a plain query that is
+ * subject to skip_query().
+ *
+ * NOTE: call this before dump_register_summary()
+ */
+static void
+do_query(const char *primtype, uint32_t num_indices)
+{
+	if (script_draw)
+		script_draw(primtype, num_indices);
+
+	if (options->query_compare)
+		do_query_compare(primtype, num_indices);
+	else if (!skip_query())
+		__do_query(primtype, num_indices);
+}
+
+/* Packet handler: dwords[0] selects the shader kind (0 = vertex,
+ * 1 = fragment), dwords[1] packs start (high 16 bits) and size (low 16
+ * bits), and the shader itself follows from dwords[2].  The shader is
+ * disassembled with disasm_a2xx() and, for the two known kinds, also handed
+ * to dump_shader().
+ */
+static void
+cp_im_loadi(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t start = dwords[1] >> 16;
+	const uint32_t size  = dwords[1] & 0xffff;
+	const char *type;
+	const char *ext;
+	enum shader_t disasm_type;
+
+	if (dwords[0] == 0) {
+		type = "vertex";
+		ext = "vo";
+		disasm_type = SHADER_VERTEX;
+	} else if (dwords[0] == 1) {
+		type = "fragment";
+		ext = "fo";
+		disasm_type = SHADER_FRAGMENT;
+	} else {
+		type = "<unknown>";
+		ext = NULL;
+		disasm_type = 0;
+	}
+
+	printf("%s%s shader, start=%04x, size=%04x\n", levels[level], type, start, size);
+	disasm_a2xx(dwords + 2, sizedwords - 2, level+2, disasm_type);
+
+	/* the raw shader is only dumped when we know which extension to use: */
+	if (ext)
+		dump_shader(ext, dwords + 2, (sizedwords - 2) * 4);
+}
+
+/* Packet handler: the low 16 bits of dwords[0] give the first register
+ * offset; every following dword is printed and then recorded as the value
+ * of the next consecutive register.
+ */
+static void
+cp_wide_reg_write(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	uint32_t reg = dwords[0] & 0xffff;
+
+	for (uint32_t n = 1; n < sizedwords; n++, reg++) {
+		dump_register(reg, dwords[n], level+1);
+		reg_set(reg, dwords[n]);
+	}
+}
+
+/* Kind of state carried by a load-state packet, as produced by the
+ * *_get_state_type() helpers and switched on in cp_load_state().  Zero is
+ * deliberately unused so that an empty lookup-table slot is distinguishable.
+ */
+enum state_t {
+	TEX_SAMP        = 1,
+	TEX_CONST       = 2,
+	TEX_MIPADDR     = 3,   /* a3xx only */
+	SHADER_PROG     = 4,
+	SHADER_CONST    = 5,
+
+	/* image/ssbo state: */
+	SSBO_0          = 6,
+	SSBO_1          = 7,
+	SSBO_2          = 8,
+
+	UBO             = 9,
+
+	/* unknown things, just to hexdumps: */
+	UNKNOWN_DWORDS  = 10,
+	UNKNOWN_2DWORDS = 11,
+	UNKNOWN_4DWORDS = 12,
+};
+
+/* State block ids; used as the row index of the lookup table in
+ * a3xx_get_state_type().
+ */
+enum adreno_state_block {
+	SB_VERT_TEX       = 0x0,
+	SB_VERT_MIPADDR   = 0x1,
+	SB_FRAG_TEX       = 0x2,
+	SB_FRAG_MIPADDR   = 0x3,
+	SB_VERT_SHADER    = 0x4,
+	SB_GEOM_SHADER    = 0x5,
+	SB_FRAG_SHADER    = 0x6,
+	SB_COMPUTE_SHADER = 0x7,
+};
+
+/* TODO there is probably a clever way to let rnndec parse things so
+ * we don't have to care about packet format differences across gens
+ */
+
+/* Classify a load-state packet using the layout cp_load_state() selects for
+ * gpu_id < 400.
+ *
+ * The state block id is bits 19..21 of dwords[0], the state type is the low
+ * two bits of dwords[1] and the state source is bits 16..18 of dwords[0].
+ * Results are returned through stage/state/src; block/type combinations that
+ * are not in the table yield stage 0 and state 0.
+ */
+static void
+a3xx_get_state_type(uint32_t *dwords, enum shader_t *stage, enum state_t *state,
+		    enum state_src_t *src)
+{
+	unsigned state_block_id = (dwords[0] >> 19) & 0x7;
+	unsigned state_type = dwords[1] & 0x3;
+	/* The table dimensions must cover every value the masks above can
+	 * produce (0..7 and 0..3), otherwise a state_type of 3 would index
+	 * past the end of a row.
+	 */
+	static const struct {
+		enum shader_t stage;
+		enum state_t state;
+	} lookup[0x8][0x4] = {
+		[SB_VERT_TEX][0]    = { SHADER_VERTEX,    TEX_SAMP },
+		[SB_VERT_TEX][1]    = { SHADER_VERTEX,    TEX_CONST },
+		[SB_FRAG_TEX][0]    = { SHADER_FRAGMENT,  TEX_SAMP },
+		[SB_FRAG_TEX][1]    = { SHADER_FRAGMENT,  TEX_CONST },
+		[SB_VERT_SHADER][0] = { SHADER_VERTEX,    SHADER_PROG },
+		[SB_VERT_SHADER][1] = { SHADER_VERTEX,    SHADER_CONST },
+		[SB_FRAG_SHADER][0] = { SHADER_FRAGMENT,  SHADER_PROG },
+		[SB_FRAG_SHADER][1] = { SHADER_FRAGMENT,  SHADER_CONST },
+	};
+
+	*stage = lookup[state_block_id][state_type].stage;
+	*state = lookup[state_block_id][state_type].state;
+	unsigned state_src = (dwords[0] >> 16) & 0x7;
+	/* anything other than SS_DIRECT is treated as indirect: */
+	if (state_src == 0 /* SS_DIRECT */)
+		*src = STATE_SRC_DIRECT;
+	else
+		*src = STATE_SRC_INDIRECT;
+}
+
+/* Map the two-bit source field (bits 16..17 of dword0) onto state_src_t.
+ * The one value without a named meaning (3) is reported as direct.
+ */
+static enum state_src_t
+_get_state_src(unsigned dword0)
+{
+	const unsigned field = (dword0 >> 16) & 0x3;
+
+	if (field == 2)   /* SS4_INDIRECT / SS6_INDIRECT */
+		return STATE_SRC_INDIRECT;
+
+	if (field == 1)   /* SS6_BINDLESS */
+		return STATE_SRC_BINDLESS;
+
+	/* 0 is SS4_DIRECT / SS6_DIRECT */
+	return STATE_SRC_DIRECT;
+}
+
+/* Shared lookup behind a4xx_get_state_type() and a6xx_get_state_type():
+ * translate a (state block id, state type) pair into the shader stage and
+ * kind of state.  Pairs that are not listed yield stage 0 and state 0.
+ */
+static void
+_get_state_type(unsigned state_block_id, unsigned state_type,
+		enum shader_t *stage, enum state_t *state)
+{
+	/* Most blocks share one of three row shapes, differing only in the
+	 * shader stage they belong to:
+	 */
+#define LOOKUP_TEX(id, stg) \
+		[id][0] = { stg, TEX_SAMP }, \
+		[id][1] = { stg, TEX_CONST }, \
+		[id][2] = { stg, UBO }
+#define LOOKUP_SHADER(id, stg) \
+		[id][0] = { stg, SHADER_PROG }, \
+		[id][1] = { stg, SHADER_CONST }, \
+		[id][2] = { stg, UBO }
+#define LOOKUP_SSBO(id, stg) \
+		[id][0] = { stg, SSBO_0 }, \
+		[id][1] = { stg, SSBO_1 }, \
+		[id][2] = { stg, SSBO_2 }
+
+	static const struct {
+		enum shader_t stage;
+		enum state_t  state;
+	} lookup[0x10][0x4] = {
+		LOOKUP_TEX(0x0, SHADER_VERTEX),       /* SB4_VS_TEX */
+		LOOKUP_TEX(0x1, SHADER_TCS),          /* SB4_HS_TEX */
+		LOOKUP_TEX(0x2, SHADER_TES),          /* SB4_DS_TEX */
+		LOOKUP_TEX(0x3, SHADER_GEOM),         /* SB4_GS_TEX */
+		LOOKUP_TEX(0x4, SHADER_FRAGMENT),     /* SB4_FS_TEX */
+		LOOKUP_TEX(0x5, SHADER_COMPUTE),      /* SB4_CS_TEX */
+
+		/* unknown things: 0x6 looks like combined UBO state for 3d
+		 * stages (a5xx and before??  a6xx seems to have UBO state
+		 * per shader stage):
+		 */
+		[0x6][2] = { 0, UBO },
+		[0x7][1] = { 0, UNKNOWN_2DWORDS },
+
+		LOOKUP_SHADER(0x8, SHADER_VERTEX),    /* SB4_VS_SHADER */
+		LOOKUP_SHADER(0x9, SHADER_TCS),       /* SB4_HS_SHADER */
+		LOOKUP_SHADER(0xa, SHADER_TES),       /* SB4_DS_SHADER */
+		LOOKUP_SHADER(0xb, SHADER_GEOM),      /* SB4_GS_SHADER */
+		LOOKUP_SHADER(0xc, SHADER_FRAGMENT),  /* SB4_FS_SHADER */
+		LOOKUP_SHADER(0xd, SHADER_COMPUTE),   /* SB4_CS_SHADER */
+		[0xd][3] = { SHADER_COMPUTE,   SSBO_0 },      /* a6xx location */
+
+		/* SB4_SSBO (shared across all stages), a5xx (and a4xx?)
+		 * location:
+		 */
+		LOOKUP_SSBO(0xe, 0),
+		LOOKUP_SSBO(0xf, SHADER_COMPUTE),     /* SB4_CS_SSBO */
+	};
+
+#undef LOOKUP_TEX
+#undef LOOKUP_SHADER
+#undef LOOKUP_SSBO
+
+	*stage = lookup[state_block_id][state_type].stage;
+	*state = lookup[state_block_id][state_type].state;
+}
+
+/* Classify a load-state packet using the layout cp_load_state() selects for
+ * 400 <= gpu_id < 600: block id in bits 18..21 of dwords[0], state type in
+ * the low two bits of dwords[1].
+ */
+static void
+a4xx_get_state_type(uint32_t *dwords, enum shader_t *stage, enum state_t *state,
+		    enum state_src_t *src)
+{
+	const uint32_t dword0 = dwords[0];
+
+	_get_state_type((dword0 >> 18) & 0xf, dwords[1] & 0x3, stage, state);
+	*src = _get_state_src(dword0);
+}
+
+/* Classify a load-state packet using the layout cp_load_state() selects for
+ * gpu_id >= 600: block id in bits 18..21 and state type in bits 14..15, both
+ * of dwords[0].
+ */
+static void
+a6xx_get_state_type(uint32_t *dwords, enum shader_t *stage, enum state_t *state,
+		    enum state_src_t *src)
+{
+	const uint32_t dword0 = dwords[0];
+
+	_get_state_type((dword0 >> 18) & 0xf, (dword0 >> 14) & 0x3, stage, state);
+	*src = _get_state_src(dword0);
+}
+
+/* Decode and hexdump up to num_unit sampler descriptors starting at
+ * texsamp.  The descriptor size and rnn domain depend on the gpu generation
+ * (gpu_id / 100); for generations outside 3..6 nothing is printed.
+ */
+static void
+dump_tex_samp(uint32_t *texsamp, enum state_src_t src, int num_unit, int level)
+{
+	for (int unit = 0; unit < num_unit; unit++) {
+		/* work-around to reduce noise for opencl blob which always
+		 * writes the max # regardless of # of textures used
+		 */
+		if ((num_unit == 16) && (texsamp[0] == 0) && (texsamp[1] == 0))
+			break;
+
+		switch (options->gpu_id / 100) {
+		case 3:
+			dump_domain(texsamp, 2, level+2, "A3XX_TEX_SAMP");
+			dump_hex(texsamp, 2, level+1);
+			texsamp += 2;
+			break;
+		case 4:
+			dump_domain(texsamp, 2, level+2, "A4XX_TEX_SAMP");
+			dump_hex(texsamp, 2, level+1);
+			texsamp += 2;
+			break;
+		case 5:
+			dump_domain(texsamp, 4, level+2, "A5XX_TEX_SAMP");
+			dump_hex(texsamp, 4, level+1);
+			texsamp += 4;
+			break;
+		case 6:
+			dump_domain(texsamp, 4, level+2, "A6XX_TEX_SAMP");
+			dump_hex(texsamp, 4, level+1);
+			/* bindless descriptors are spaced 16 dwords apart: */
+			texsamp += (src == STATE_SRC_BINDLESS) ? 16 : 4;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/* Decode and hexdump up to num_unit texture descriptors starting at
+ * texconst.  The descriptor size and rnn domain depend on the gpu generation
+ * (gpu_id / 100); for generations outside 3..6 nothing is printed.  With the
+ * dump_textures option set, the buffer the descriptor points at is dumped as
+ * well (generations 4..6).
+ */
+static void
+dump_tex_const(uint32_t *texconst, int num_unit, int level)
+{
+	for (int unit = 0; unit < num_unit; unit++) {
+		/* work-around to reduce noise for opencl blob which always
+		 * writes the max # regardless of # of textures used
+		 */
+		if ((num_unit == 16) &&
+			(texconst[0] == 0) && (texconst[1] == 0) &&
+			(texconst[2] == 0) && (texconst[3] == 0))
+			break;
+
+		switch (options->gpu_id / 100) {
+		case 3:
+			dump_domain(texconst, 4, level+2, "A3XX_TEX_CONST");
+			dump_hex(texconst, 4, level+1);
+			texconst += 4;
+			break;
+		case 4:
+			dump_domain(texconst, 8, level+2, "A4XX_TEX_CONST");
+			if (options->dump_textures) {
+				uint32_t addr = texconst[4] & ~0x1f;
+				dump_gpuaddr(addr, level-2);
+			}
+			dump_hex(texconst, 8, level+1);
+			texconst += 8;
+			break;
+		case 5:
+			dump_domain(texconst, 12, level+2, "A5XX_TEX_CONST");
+			if (options->dump_textures) {
+				uint64_t addr = (((uint64_t)texconst[5] & 0x1ffff) << 32) | texconst[4];
+				dump_gpuaddr_size(addr, level-2, hostlen(addr) / 4, 3);
+			}
+			dump_hex(texconst, 12, level+1);
+			texconst += 12;
+			break;
+		case 6:
+			dump_domain(texconst, 16, level+2, "A6XX_TEX_CONST");
+			if (options->dump_textures) {
+				uint64_t addr = (((uint64_t)texconst[5] & 0x1ffff) << 32) | texconst[4];
+				dump_gpuaddr_size(addr, level-2, hostlen(addr) / 4, 3);
+			}
+			dump_hex(texconst, 16, level+1);
+			texconst += 16;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/* Work out where the payload of a load-state packet lives.
+ *
+ * Returns the gpu address of the external state for indirect and bindless
+ * sources, or 0 when the state is carried inline in the packet.  A source
+ * value that is none of the known ones is treated like a direct source
+ * rather than leaving the address undefined.
+ */
+static uint64_t
+load_state_ext_src_addr(uint32_t *dwords, enum shader_t stage,
+		enum state_src_t src)
+{
+	uint64_t addr = 0;
+
+	switch (src) {
+	case STATE_SRC_INDIRECT:
+		addr = dwords[1] & 0xfffffffc;
+		if (is_64b())
+			addr |= ((uint64_t)dwords[2]) << 32;
+		break;
+	case STATE_SRC_BINDLESS: {
+		const unsigned base_reg =
+			stage == SHADER_COMPUTE ? regbase("HLSQ_CS_BINDLESS_BASE[0]") : regbase("HLSQ_BINDLESS_BASE[0]");
+
+		/* the top nibble of dwords[1] selects the base register
+		 * (a pair of registers when addresses are 64b):
+		 */
+		if (is_64b()) {
+			const unsigned reg = base_reg + (dwords[1] >> 28) * 2;
+			addr = reg_val(reg) & 0xfffffffc;
+			addr |= ((uint64_t)reg_val(reg + 1)) << 32;
+		} else {
+			const unsigned reg = base_reg + (dwords[1] >> 28);
+			addr = reg_val(reg) & 0xfffffffc;
+		}
+
+		/* the low 24 bits are an offset in dwords from that base: */
+		addr += 4 * (dwords[1] & 0xffffff);
+		break;
+	}
+	case STATE_SRC_DIRECT:
+	default:
+		break;
+	}
+
+	return addr;
+}
+
+/* Packet handler for load-state packets: classify the packet for the
+ * current gpu generation, locate its payload (inline or via gpu address) and
+ * dump it according to the kind of state it carries.
+ *
+ * Returns without output when quiet(2) is in effect and no script is active,
+ * or when an external payload cannot be mapped to a host pointer.
+ */
+static void
+cp_load_state(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	enum shader_t stage;
+	enum state_t state;
+	enum state_src_t src;
+	uint32_t num_unit = (dwords[0] >> 22) & 0x1ff;
+	uint64_t ext_src_addr;
+	void *contents;
+	int i;
+
+	if (quiet(2) && !options->script)
+		return;
+
+	if (options->gpu_id >= 600)
+		a6xx_get_state_type(dwords, &stage, &state, &src);
+	else if (options->gpu_id >= 400)
+		a4xx_get_state_type(dwords, &stage, &state, &src);
+	else
+		a3xx_get_state_type(dwords, &stage, &state, &src);
+
+	ext_src_addr = load_state_ext_src_addr(dwords, stage, src);
+
+	/* inline payload starts after the header dwords: */
+	if (ext_src_addr)
+		contents = hostptr(ext_src_addr);
+	else
+		contents = is_64b() ? dwords + 3 : dwords + 2;
+
+	if (!contents)
+		return;
+
+	switch (state) {
+	case SHADER_PROG: {
+		const char *ext = NULL;
+
+		if (quiet(2))
+			return;
+
+		if (options->gpu_id >= 400)
+			num_unit *= 16;
+		else if (options->gpu_id >= 300)
+			num_unit *= 4;
+
+		/* shaders:
+		 *
+		 * note: num_unit seems to be # of instruction groups, where
+		 * an instruction group has 4 64bit instructions.
+		 */
+		if (stage == SHADER_VERTEX) {
+			ext = "vo3";
+		} else if (stage == SHADER_GEOM) {
+			ext = "go3";
+		} else if (stage == SHADER_COMPUTE) {
+			ext = "co3";
+		} else if (stage == SHADER_FRAGMENT){
+			ext = "fo3";
+		}
+
+		/* contents is known to be non-NULL at this point */
+		disasm_a3xx(contents, num_unit * 2, level+2, stdout, options->gpu_id);
+
+		/* dump raw shader: */
+		if (ext)
+			dump_shader(ext, contents, num_unit * 2 * 4);
+
+		break;
+	}
+	case SHADER_CONST: {
+		if (quiet(2))
+			return;
+
+		/* uniforms/consts:
+		 *
+		 * note: num_unit seems to be # of pairs of dwords??
+		 */
+
+		if (options->gpu_id >= 400)
+			num_unit *= 2;
+
+		dump_float(contents, num_unit*2, level+1);
+		dump_hex(contents, num_unit*2, level+1);
+
+		break;
+	}
+	case TEX_MIPADDR: {
+		uint32_t *addrs = contents;
+
+		if (quiet(2))
+			return;
+
+		/* mipmap consts block just appears to be array of num_unit gpu addr's: */
+		for (i = 0; i < num_unit; i++) {
+			void *ptr = hostptr(addrs[i]);
+			printf("%s%2d: %08x\n", levels[level+1], i, addrs[i]);
+			if (options->dump_textures) {
+				printf("base=%08x\n", (uint32_t)gpubaseaddr(addrs[i]));
+				dump_hex(ptr, hostlen(addrs[i])/4, level+1);
+			}
+		}
+		break;
+	}
+	case TEX_SAMP: {
+		dump_tex_samp(contents, src, num_unit, level);
+		break;
+	}
+	case TEX_CONST: {
+		dump_tex_const(contents, num_unit, level);
+		break;
+	}
+	case SSBO_0: {
+		uint32_t *ssboconst = (uint32_t *)contents;
+
+		for (i = 0; i < num_unit; i++) {
+			int sz = 4;
+			if (400 <= options->gpu_id && options->gpu_id < 500) {
+				dump_domain(ssboconst, 4, level+2, "A4XX_SSBO_0");
+			} else if (500 <= options->gpu_id && options->gpu_id < 600) {
+				dump_domain(ssboconst, 4, level+2, "A5XX_SSBO_0");
+			} else if (600 <= options->gpu_id && options->gpu_id < 700) {
+				sz = 16;
+				dump_domain(ssboconst, 16, level+2, "A6XX_IBO");
+			}
+			dump_hex(ssboconst, sz, level+1);
+			ssboconst += sz;
+		}
+		break;
+	}
+	case SSBO_1: {
+		uint32_t *ssboconst = (uint32_t *)contents;
+
+		for (i = 0; i < num_unit; i++) {
+			if (400 <= options->gpu_id && options->gpu_id < 500)
+				dump_domain(ssboconst, 2, level+2, "A4XX_SSBO_1");
+			else if (500 <= options->gpu_id && options->gpu_id < 600)
+				dump_domain(ssboconst, 2, level+2, "A5XX_SSBO_1");
+			dump_hex(ssboconst, 2, level+1);
+			ssboconst += 2;
+		}
+		break;
+	}
+	case SSBO_2: {
+		uint32_t *ssboconst = (uint32_t *)contents;
+
+		for (i = 0; i < num_unit; i++) {
+			/* TODO a4xx and a5xx might be same: */
+			if ((500 <= options->gpu_id) && (options->gpu_id < 600)) {
+				dump_domain(ssboconst, 2, level+2, "A5XX_SSBO_2");
+				dump_hex(ssboconst, 2, level+1);
+			}
+			if (options->dump_textures) {
+				uint64_t addr = (((uint64_t)ssboconst[1] & 0x1ffff) << 32) | ssboconst[0];
+				dump_gpuaddr_size(addr, level-2, hostlen(addr) / 4, 3);
+			}
+			ssboconst += 2;
+		}
+		break;
+	}
+	case UBO: {
+		uint32_t *uboconst = (uint32_t *)contents;
+
+		for (i = 0; i < num_unit; i++) {
+			// TODO probably similar on a4xx..
+			if (500 <= options->gpu_id && options->gpu_id < 600)
+				dump_domain(uboconst, 2, level+2, "A5XX_UBO");
+			else if (600 <= options->gpu_id && options->gpu_id < 700)
+				dump_domain(uboconst, 2, level+2, "A6XX_UBO");
+			dump_hex(uboconst, 2, level+1);
+			/* bindless descriptors are spaced 16 dwords apart: */
+			uboconst += src == STATE_SRC_BINDLESS ? 16 : 2;
+		}
+		break;
+	}
+	case UNKNOWN_DWORDS: {
+		if (quiet(2))
+			return;
+		dump_hex(contents, num_unit, level+1);
+		break;
+	}
+	case UNKNOWN_2DWORDS: {
+		if (quiet(2))
+			return;
+		dump_hex(contents, num_unit * 2, level+1);
+		break;
+	}
+	case UNKNOWN_4DWORDS: {
+		if (quiet(2))
+			return;
+		dump_hex(contents, num_unit * 4, level+1);
+		break;
+	}
+	default:
+		if (quiet(2))
+			return;
+		/* hmm.. */
+		dump_hex(contents, num_unit, level+1);
+		break;
+	}
+}
+
+/* Packet handler: remember the bin bounds.  dwords[1] and dwords[2] each
+ * pack x in the low 16 bits and y in the high 16 bits.
+ */
+static void
+cp_set_bin(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t first = dwords[1];
+	const uint32_t second = dwords[2];
+
+	bin_x1 = first & 0xffff;
+	bin_y1 = first >> 16;
+	bin_x2 = second & 0xffff;
+	bin_y2 = second >> 16;
+}
+
+/* Decode and print an a2xx texture constant.
+ *
+ * dwords points at the constant payload (at least six dwords are read),
+ * val is the constant offset shown in the heading line, and level selects
+ * the levels[] indent.  sizedwords is not used.
+ */
+static void
+dump_a2xx_tex_const(uint32_t *dwords, uint32_t sizedwords, uint32_t val, int level)
+{
+	uint32_t w, h, p;
+	uint32_t gpuaddr, flags, mip_gpuaddr, mip_flags;
+	uint32_t min, mag, swiz, clamp_x, clamp_y, clamp_z;
+	/* Both tables are indexed with two-bit fields, so they need four
+	 * entries; the last one stands in for the value without a known name.
+	 */
+	static const char *filter[4] = {
+			"point", "bilinear", "bicubic", "???",
+	};
+	static const char *clamp[4] = {
+			"wrap", "mirror", "clamp-last-texel", "???",
+	};
+	static const char swiznames[] = "xyzw01??";
+
+	/* see sys2gmem_tex_const[] in adreno_a2xxx.c */
+
+	/* Texture, FormatXYZW=Unsigned, ClampXYZ=Wrap/Repeat,
+	 * RFMode=ZeroClamp-1, Dim=1:2d, pitch
+	 */
+	p = (dwords[0] >> 22) << 5;
+	clamp_x = (dwords[0] >> 10) & 0x3;
+	clamp_y = (dwords[0] >> 13) & 0x3;
+	clamp_z = (dwords[0] >> 16) & 0x3;
+
+	/* Format=6:8888_WZYX, EndianSwap=0:None, ReqSize=0:256bit, DimHi=0,
+	 * NearestClamp=1:OGL Mode
+	 */
+	parse_dword_addr(dwords[1], &gpuaddr, &flags, 0xfff);
+
+	/* Width, Height, EndianSwap=0:None */
+	w = (dwords[2] & 0x1fff) + 1;
+	h = ((dwords[2] >> 13) & 0x1fff) + 1;
+
+	/* NumFormat=0:RF, DstSelXYZW=XYZW, ExpAdj=0, MagFilt=MinFilt=0:Point,
+	 * Mip=2:BaseMap
+	 */
+	mag = (dwords[3] >> 19) & 0x3;
+	min = (dwords[3] >> 21) & 0x3;
+	swiz = (dwords[3] >> 1) & 0xfff;
+
+	/* VolMag=VolMin=0:Point, MinMipLvl=0, MaxMipLvl=1, LodBiasH=V=0,
+	 * Dim3d=0
+	 */
+	// XXX
+
+	/* BorderColor=0:ABGRBlack, ForceBC=0:disable, TriJuice=0, Aniso=0,
+	 * Dim=1:2d, MipPacking=0
+	 */
+	parse_dword_addr(dwords[5], &mip_gpuaddr, &mip_flags, 0xfff);
+
+	printf("%sset texture const %04x\n", levels[level], val);
+	printf("%sclamp x/y/z: %s/%s/%s\n", levels[level+1],
+			clamp[clamp_x], clamp[clamp_y], clamp[clamp_z]);
+	printf("%sfilter min/mag: %s/%s\n", levels[level+1], filter[min], filter[mag]);
+	/* the swizzle is four three-bit selectors into swiznames[]: */
+	printf("%sswizzle: %c%c%c%c\n", levels[level+1],
+			swiznames[(swiz >> 0) & 0x7], swiznames[(swiz >> 3) & 0x7],
+			swiznames[(swiz >> 6) & 0x7], swiznames[(swiz >> 9) & 0x7]);
+	printf("%saddr=%08x (flags=%03x), size=%dx%d, pitch=%d, format=%s\n",
+			levels[level+1], gpuaddr, flags, w, h, p,
+			rnn_enumname(rnn, "a2xx_sq_surfaceformat", flags & 0xf));
+	printf("%smipaddr=%08x (flags=%03x)\n", levels[level+1],
+			mip_gpuaddr, mip_flags);
+}
+
+/* Decode and print a2xx shader constants.
+ *
+ * Each dword is parsed as a gpu address plus flags.  When the address maps
+ * to a known buffer, the following dword is taken as its size in bytes and
+ * up to 64 dwords of the buffer are dumped as hex and as floats.
+ *
+ * val is the constant offset shown in the heading line.
+ */
+static void
+dump_a2xx_shader_const(uint32_t *dwords, uint32_t sizedwords, uint32_t val, int level)
+{
+	uint32_t i;
+	printf("%sset shader const %04x\n", levels[level], val);
+	for (i = 0; i < sizedwords; ) {
+		uint32_t gpuaddr, flags;
+		parse_dword_addr(dwords[i++], &gpuaddr, &flags, 0xf);
+		void *addr = hostptr(gpuaddr);
+		if (!addr)
+			continue;
+
+		/* the address was the last dword of the packet, so there is
+		 * no size to read; don't run off the end of the payload:
+		 */
+		if (i >= sizedwords)
+			break;
+
+		const char * fmt =
+			rnn_enumname(rnn, "a2xx_sq_surfaceformat", flags & 0xf);
+		uint32_t size = dwords[i++];
+		printf("%saddr=%08x, size=%d, format=%s\n", levels[level+1],
+				gpuaddr, size, fmt);
+		// TODO maybe dump these as bytes instead of dwords?
+		size = (size + 3) / 4; // for now convert to dwords
+		dump_hex(addr, min(size, 64), level + 1);
+		if (size > min(size, 64))
+			printf("%s\t\t...\n", levels[level+1]);
+		dump_float(addr, min(size, 64), level + 1);
+		if (size > min(size, 64))
+			printf("%s\t\t...\n", levels[level+1]);
+	}
+}
+
+/* Packet handler: bits 16..19 of dwords[0] select the kind of constant and
+ * the low 16 bits give its offset.
+ *
+ *   0x0: payload dumped as floats
+ *   0x1: texture constant (offset < 0x78) or shader constant
+ *   0x2: bool constant (heading only)
+ *   0x3: loop constant (heading only)
+ *   0x4: register write at offset + 0x2000; when bit 31 of dwords[0] is
+ *        set, the written value is dwords[2] plus the tracked value of the
+ *        register named by dwords[1]
+ *
+ * Other kinds are ignored.
+ */
+static void
+cp_set_const(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t header = dwords[0];
+	const uint32_t kind = (header >> 16) & 0xf;
+	uint32_t val = header & 0xffff;
+
+	if (kind == 0x0) {
+		dump_float((float *)(dwords+1), sizedwords-1, level+1);
+	} else if (kind == 0x1) {
+		/* need to figure out how const space is partitioned between
+		 * attributes, textures, etc..
+		 */
+		if (val < 0x78)
+			dump_a2xx_tex_const(dwords+1, sizedwords-1, val, level);
+		else
+			dump_a2xx_shader_const(dwords+1, sizedwords-1, val, level);
+	} else if (kind == 0x2) {
+		printf("%sset bool const %04x\n", levels[level], val);
+	} else if (kind == 0x3) {
+		printf("%sset loop const %04x\n", levels[level], val);
+	} else if (kind == 0x4) {
+		val += 0x2000;
+
+		if (!(header & 0x80000000)) {
+			dump_registers(val, dwords+1, sizedwords-1, level+1);
+			return;
+		}
+
+		uint32_t srcreg = dwords[1];
+		uint32_t dstval = dwords[2];
+
+		/* TODO: not sure what happens w/ payload != 2.. */
+		assert(sizedwords == 3);
+		assert(srcreg < ARRAY_SIZE(type0_reg_vals));
+
+		/* note: rnn_regname uses a static buf so we can't do
+		 * two regname() calls for one printf..
+		 */
+		printf("%s%s = %08x + ", levels[level], regname(val, 1), dstval);
+		printf("%s (%08x)\n", regname(srcreg, 1), type0_reg_vals[srcreg]);
+
+		dstval += type0_reg_vals[srcreg];
+
+		dump_registers(val, &dstval, 1, level+1);
+	}
+}
+
+static void dump_register_summary(int level);
+
+/* Packet handler: print the event name for dwords[0].  For gpu_id > 500 a
+ * "BLIT" event is additionally treated like a draw: the query runs with the
+ * primitive name "EVENT:BLIT", followed by the mode and register summary.
+ */
+static void
+cp_event_write(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const char *name = rnn_enumname(rnn, "vgt_event_type", dwords[0]);
+
+	printl(2, "%sevent %s\n", levels[level], name);
+
+	if (!name || (options->gpu_id <= 500))
+		return;
+
+	if (strcmp(name, "BLIT") != 0)
+		return;
+
+	char eventname[64];
+
+	snprintf(eventname, sizeof(eventname), "EVENT:%s", name);
+	do_query(eventname, 0);
+	print_mode(level);
+	dump_register_summary(level);
+}
+
+/* Print the current value of every register that has been written and,
+ * unless the allregs option is set, rewritten since the rewritten state was
+ * last cleared.  Each line is marked '!' when the value differs from
+ * lastvals[] (which is then updated) and '+' when the register was
+ * rewritten.
+ *
+ * Afterwards the rewritten state is cleared and draw_count is bumped.  The
+ * global 'summary' flag is forced off for the duration and restored on exit;
+ * 'in_summary' is set while the dump is running.
+ */
+static void
+dump_register_summary(int level)
+{
+	const bool prev_summary = summary;
+
+	summary = false;
+	in_summary = true;
+
+	/* dump current state of registers: */
+	printl(2, "%sdraw[%i] register values\n", levels[level], draw_count);
+
+	for (uint32_t reg = 0; reg < regcnt(); reg++) {
+		const uint32_t val = reg_val(reg);
+
+		/* skip registers that haven't been updated since last draw/blit: */
+		if (!options->allregs && !reg_rewritten(reg))
+			continue;
+
+		if (!reg_written(reg))
+			continue;
+
+		if (val == lastvals[reg]) {
+			printl(2, " ");
+		} else {
+			printl(2, "!");
+			lastvals[reg] = val;
+		}
+
+		if (reg_rewritten(reg)) {
+			printl(2, "+");
+		} else {
+			printl(2, " ");
+		}
+
+		printl(2, "\t%08x", val);
+
+		if (!quiet(2))
+			dump_register(reg, val, level);
+	}
+
+	clear_rewritten();
+
+	in_summary = false;
+
+	draw_count++;
+	summary = prev_summary;
+}
+
+/* Shared front half of the CP_DRAW_INDX style handlers: run the query for
+ * this draw, print the draw header fields, and update the running vertex
+ * and per-IB draw counters.
+ *
+ * Returns the index count taken from dwords[2].
+ */
+static uint32_t
+draw_indx_common(uint32_t *dwords, int level)
+{
+	const uint32_t draw_init     = dwords[1];
+	const uint32_t num_indices   = dwords[2];
+	const uint32_t prim_type     = draw_init & 0x1f;
+	const uint32_t source_select = (draw_init >> 6) & 0x3;
+	const char *primtype = rnn_enumname(rnn, "pc_di_primtype", prim_type);
+
+	do_query(primtype, num_indices);
+
+	printl(2, "%sdraw:          %d\n", levels[level], draws[ib]);
+	printl(2, "%sprim_type:     %s (%d)\n", levels[level], primtype,
+			prim_type);
+	printl(2, "%ssource_select: %s (%d)\n", levels[level],
+			rnn_enumname(rnn, "pc_di_src_sel", source_select),
+			source_select);
+	printl(2, "%snum_indices:   %d\n", levels[level], num_indices);
+
+	vertices += num_indices;
+	draws[ib]++;
+
+	return num_indices;
+}
+
+/* Index element size as decoded by the draw handlers.  Note that three of
+ * the names share the value 0.
+ */
+enum pc_di_index_size {
+	INDEX_SIZE_IGN     = 0x0,
+	INDEX_SIZE_INVALID = 0x0,
+	INDEX_SIZE_16_BIT  = 0x0,
+	INDEX_SIZE_32_BIT  = 0x1,
+	INDEX_SIZE_8_BIT   = 0x2,
+};
+
+/* Packet handler for a draw whose (optional) index buffer is referenced by
+ * gpu address: with a five dword packet, dwords[3] is the buffer address and
+ * dwords[4] its size in bytes.  Only valid for 32b address packets.
+ *
+ * Sets needs_wfi once the draw has been processed.
+ */
+static void
+cp_draw_indx(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t num_indices = draw_indx_common(dwords, level);
+
+	assert(!is_64b());
+
+	/* if we have an index buffer, dump that: */
+	if (sizedwords == 5) {
+		const uint32_t nbytes = dwords[4];
+		void *ptr = hostptr(dwords[3]);
+
+		printl(2, "%sgpuaddr:       %08x\n", levels[level], dwords[3]);
+		printl(2, "%sidx_size:      %d\n", levels[level], dwords[4]);
+
+		if (ptr && !quiet(2)) {
+			/* the index size is split across bits 11 and 13: */
+			const enum pc_di_index_size size =
+					((dwords[1] >> 11) & 1) | ((dwords[1] >> 12) & 2);
+			uint32_t n;
+
+			printf("%sidxs:         ", levels[level]);
+			switch (size) {
+			case INDEX_SIZE_8_BIT: {
+				const uint8_t *idx = ptr;
+				for (n = 0; n < nbytes; n++)
+					printf(" %u", idx[n]);
+				break;
+			}
+			case INDEX_SIZE_16_BIT: {
+				const uint16_t *idx = ptr;
+				for (n = 0; n < nbytes/2; n++)
+					printf(" %u", idx[n]);
+				break;
+			}
+			case INDEX_SIZE_32_BIT: {
+				const uint32_t *idx = ptr;
+				for (n = 0; n < nbytes/4; n++)
+					printf(" %u", idx[n]);
+				break;
+			}
+			default:
+				break;
+			}
+			printf("\n");
+			dump_hex(ptr, nbytes/4, level+1);
+		}
+	}
+
+	/* don't bother dumping registers for the dummy draw_indx's.. */
+	if (num_indices > 0)
+		dump_register_summary(level);
+
+	needs_wfi = true;
+}
+
+/* Packet handler for a draw whose index buffer is embedded in the packet,
+ * starting at dwords[3].  Only valid for 32b address packets.
+ */
+static void
+cp_draw_indx_2(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t num_indices = draw_indx_common(dwords, level);
+	/* the index size is split across bits 11 and 13: */
+	const enum pc_di_index_size size =
+			((dwords[1] >> 11) & 1) | ((dwords[1] >> 12) & 2);
+	void *inline_idx = &dwords[3];
+
+	assert(!is_64b());
+
+	/* CP_DRAW_INDX_2 has embedded/inline idx buffer: */
+	if (!quiet(2)) {
+		int nbytes = 0;
+		uint32_t n;
+
+		printf("%sidxs:         ", levels[level]);
+		switch (size) {
+		case INDEX_SIZE_8_BIT: {
+			const uint8_t *idx = inline_idx;
+			for (n = 0; n < num_indices; n++)
+				printf(" %u", idx[n]);
+			nbytes = num_indices;
+			break;
+		}
+		case INDEX_SIZE_16_BIT: {
+			const uint16_t *idx = inline_idx;
+			for (n = 0; n < num_indices; n++)
+				printf(" %u", idx[n]);
+			nbytes = num_indices * 2;
+			break;
+		}
+		case INDEX_SIZE_32_BIT: {
+			const uint32_t *idx = inline_idx;
+			for (n = 0; n < num_indices; n++)
+				printf(" %u", idx[n]);
+			nbytes = num_indices * 4;
+			break;
+		}
+		default:
+			break;
+		}
+		printf("\n");
+		dump_hex(inline_idx, nbytes / 4, level+1);
+	}
+
+	/* don't bother dumping registers for the dummy draw_indx's.. */
+	if (num_indices > 0)
+		dump_register_summary(level);
+}
+
+/* Packet handler: primitive type in the low five bits of dwords[0], index
+ * count in dwords[2].  Runs the query, prints the mode, and dumps the
+ * register summary for non-empty draws.
+ */
+static void
+cp_draw_indx_offset(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t prim_type = dwords[0] & 0x1f;
+	const uint32_t num_indices = dwords[2];
+
+	do_query(rnn_enumname(rnn, "pc_di_primtype", prim_type), num_indices);
+	print_mode(level);
+
+	/* don't bother dumping registers for the dummy draw_indx's.. */
+	if (num_indices == 0)
+		return;
+
+	dump_register_summary(level);
+}
+
+/* Packet handler: an indexed draw whose parameters live in gpu memory.  Two
+ * buffers are referenced by the packet; 0x10 dwords of each are dumped.
+ * With 64b addresses they sit in dwords[1..2] and dwords[4..5], otherwise in
+ * dwords[1] and dwords[3].
+ */
+static void
+cp_draw_indx_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t prim_type = dwords[0] & 0x1f;
+	uint64_t first, second;
+
+	do_query(rnn_enumname(rnn, "pc_di_primtype", prim_type), 0);
+	print_mode(level);
+
+	if (is_64b()) {
+		first  = (((uint64_t)dwords[2] & 0x1ffff) << 32) | dwords[1];
+		second = (((uint64_t)dwords[5] & 0x1ffff) << 32) | dwords[4];
+	} else {
+		first  = dwords[1];
+		second = dwords[3];
+	}
+
+	dump_gpuaddr_size(first, level, 0x10, 2);
+	dump_gpuaddr_size(second, level, 0x10, 2);
+
+	dump_register_summary(level);
+}
+
+/* Packet handler: a draw whose parameters live in gpu memory at the 64b
+ * address held in dwords[1..2]; 0x10 dwords of that buffer are dumped.
+ */
+static void
+cp_draw_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t prim_type = dwords[0] & 0x1f;
+	const uint64_t addr_hi = (uint64_t)dwords[2] & 0x1ffff;
+
+	do_query(rnn_enumname(rnn, "pc_di_primtype", prim_type), 0);
+	print_mode(level);
+
+	dump_gpuaddr_size((addr_hi << 32) | dwords[1], level, 0x10, 2);
+
+	dump_register_summary(level);
+}
+
+/* Packet handler: treat the packet as a "COMPUTE" draw with one index for
+ * query purposes, then dump the register summary.  The payload is not
+ * inspected.
+ */
+static void
+cp_run_cl(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	(void)dwords;
+	(void)sizedwords;
+
+	do_query("COMPUTE", 1);
+	dump_register_summary(level);
+}
+
+/* Packet handler: print the payload as a string marker.  Output stops at
+ * the first NUL byte or the end of the packet, bytes that are not ascii are
+ * dropped, and a newline is always appended.
+ *
+ * Nothing is printed when quiet(3) is in effect or the decode_markers option
+ * is off.
+ */
+static void
+cp_nop(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const char *buf = (void *)dwords;
+	const uint32_t nbytes = 4 * sizedwords;
+
+	if (quiet(3))
+		return;
+
+	// blob doesn't use CP_NOP for string_marker but it does
+	// use it for things that end up looking like, but aren't
+	// ascii chars:
+	if (!options->decode_markers)
+		return;
+
+	for (uint32_t n = 0; (n < nbytes) && (buf[n] != '\0'); n++) {
+		if (!isascii(buf[n]))
+			continue;
+		printf("%c", buf[n]);
+	}
+	printf("\n");
+}
+
+/* Packet handler for an indirect buffer: print its address and size, then
+ * decode the commands it points at one IB level deeper.
+ *
+ * With 64b addresses the packet is (addr lo, addr hi, size); otherwise it is
+ * (addr, size).  Buffers that were already dumped are skipped in 'once' and
+ * 'query-compare' modes.  A buffer that cannot be mapped to a host pointer
+ * is reported on stderr.
+ */
+static void
+cp_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	/* traverse indirect buffers */
+	uint64_t ibaddr;
+	uint32_t ibsize;
+	uint32_t *ptr = NULL;
+
+	if (is_64b()) {
+		/* a5xx+.. low then high 32b of gpu addr, then size: */
+		ibaddr = dwords[0];
+		ibaddr |= ((uint64_t)dwords[1]) << 32;
+		ibsize = dwords[2];
+	} else {
+		ibaddr = dwords[0];
+		ibsize = dwords[1];
+	}
+
+	if (!quiet(3)) {
+		if (is_64b()) {
+			/* PRIx64 rather than %lx: ibaddr is a uint64_t, which
+			 * is not 'unsigned long' on every platform.
+			 */
+			printf("%sibaddr:%016"PRIx64"\n", levels[level], ibaddr);
+		} else {
+			printf("%sibaddr:%08x\n", levels[level], (uint32_t)ibaddr);
+		}
+		printf("%sibsize:%08x\n", levels[level], ibsize);
+	}
+
+	if (options->once && has_dumped(ibaddr, enable_mask))
+		return;
+
+	/* 'query-compare' mode implies 'once' mode, although we need only to
+	 * process the cmdstream for *any* enable_mask mode, since we are
+	 * comparing binning vs draw reg values at the same time, ie. it is
+	 * not useful to process the same draw in both binning and draw pass.
+	 */
+	if (options->query_compare && has_dumped(ibaddr, MODE_ALL))
+		return;
+
+	/* map gpuaddr back to hostptr: */
+	ptr = hostptr(ibaddr);
+
+	if (ptr) {
+		/* If the GPU hung within the target IB, the trigger point will be
+		 * just after the current CP_INDIRECT_BUFFER.  Because the IB is
+		 * executed but never returns.  Account for this by checking if
+		 * the IB returned:
+		 */
+		highlight_gpuaddr(gpuaddr(&dwords[is_64b() ? 3 : 2]));
+
+		/* track the nesting level while the target IB is decoded: */
+		ib++;
+		ibs[ib].base = ibaddr;
+		ibs[ib].size = ibsize;
+
+		dump_commands(ptr, ibsize, level);
+		ib--;
+	} else {
+		fprintf(stderr, "could not find: %016"PRIx64" (%d)\n", ibaddr, ibsize);
+	}
+}
+
+/*
+ * Handler for CP_WAIT_FOR_IDLE: clears the needs_wfi flag, which cp_rmw()
+ * checks to print its "NEEDS WFI" warning.
+ */
+static void
+cp_wfi(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	needs_wfi = false;
+}
+
+/*
+ * Handler for CP_MEM_WRITE: print the destination gpu address followed by
+ * the data being written.  On 64b parts the data is hex-dumped and, if its
+ * first dword looks like a type4/type7 packet header, also decoded as
+ * cmdstream; otherwise it is dumped as floats.
+ */
+static void
+cp_mem_write(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	if (quiet(2))
+		return;
+
+	if (is_64b()) {
+		uint64_t gpuaddr = dwords[0] | (((uint64_t)dwords[1]) << 32);
+		/* PRIx64 rather than %lx: uint64_t is not 'long' everywhere */
+		printf("%sgpuaddr:%016"PRIx64"\n", levels[level], gpuaddr);
+		dump_hex(&dwords[2], sizedwords-2, level+1);
+
+		if (pkt_is_type4(dwords[2]) || pkt_is_type7(dwords[2]))
+			dump_commands(&dwords[2], sizedwords-2, level+1);
+	} else {
+		uint32_t gpuaddr = dwords[0];
+		printf("%sgpuaddr:%08x\n", levels[level], gpuaddr);
+		dump_float((float *)&dwords[1], sizedwords-1, level+1);
+	}
+}
+
+/*
+ * Handler for CP_REG_RMW: read-modify-write of a register.  The low 16 bits
+ * of the first dword select the register, the next two dwords are the AND
+ * and OR masks.  The tracked register value is updated to
+ * (old & and_mask) | or_mask.
+ */
+static void
+cp_rmw(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t reg      = dwords[0] & 0xffff;
+	const uint32_t and_mask = dwords[1];
+	const uint32_t or_mask  = dwords[2];
+
+	printl(3, "%srmw (%s & 0x%08x) | 0x%08x)\n", levels[level],
+			regname(reg, 1), and_mask, or_mask);
+
+	/* warn when no CP_WAIT_FOR_IDLE has cleared the flag */
+	if (needs_wfi) {
+		printl(2, "NEEDS WFI: rmw (%s & 0x%08x) | 0x%08x)\n",
+				regname(reg, 1), and_mask, or_mask);
+	}
+
+	const uint32_t updated = (reg_val(reg) & and_mask) | or_mask;
+	reg_set(reg, updated);
+}
+
+/*
+ * Handler shared by CP_REG_TO_MEM and CP_MEM_TO_REG: print the base register
+ * (low 16 bits of the first dword) and the gpu address (dwords[1]/dwords[2]),
+ * then hex-dump the referenced memory if it can be mapped.  The dword count
+ * comes from bits 19..28 of the first dword.
+ */
+static void
+cp_reg_mem(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	uint32_t val = dwords[0] & 0xffff;
+	printl(3, "%sbase register: %s\n", levels[level], regname(val, 1));
+
+	if (quiet(2))
+		return;
+
+	uint64_t gpuaddr = dwords[1] | (((uint64_t)dwords[2]) << 32);
+	/* PRIx64 rather than %lx: uint64_t is not 'long' everywhere */
+	printf("%sgpuaddr:%016"PRIx64"\n", levels[level], gpuaddr);
+	void *ptr = hostptr(gpuaddr);
+	if (ptr) {
+		uint32_t cnt = (dwords[0] >> 19) & 0x3ff;
+		dump_hex(ptr, cnt, level + 1);
+	}
+}
+
+/*
+ * One draw-state group as recorded by cp_set_draw_state(): where the group's
+ * cmdstream lives (addr), how many dwords it holds (count), plus the flags
+ * and enable mask decoded from the packet.  A count of zero means the group
+ * is empty and load_group() skips it.
+ */
+struct draw_state {
+	uint16_t enable_mask;
+	uint16_t flags;
+	uint32_t count;
+	uint64_t addr;
+};
+
+/* indexed by group_id, which cp_set_draw_state() decodes as a 5 bit field */
+struct draw_state state[32];
+
+/* flag bits, taken from bits 16..19 of each CP_SET_DRAW_STATE entry header */
+#define FLAG_DIRTY              0x1
+#define FLAG_DISABLE            0x2
+#define FLAG_DISABLE_ALL_GROUPS 0x4
+#define FLAG_LOAD_IMMED         0x8
+
+/* last value written by cp_set_mode() */
+static int draw_mode;
+
+/* Forget draw-state group 'group_id' by zeroing its slot in state[]. */
+static void
+disable_group(unsigned group_id)
+{
+	memset(&state[group_id], 0, sizeof(state[group_id]));
+}
+
+/* Forget every draw-state group, ie. zero all of state[]. */
+static void
+disable_all_groups(void)
+{
+	unsigned group = 0;
+
+	while (group < ARRAY_SIZE(state)) {
+		disable_group(group);
+		group++;
+	}
+}
+
+/*
+ * Decode the cmdstream of draw-state group 'group_id', if the group is
+ * populated.  On a6xx+ (gpu_id >= 600) the group is skipped unless its
+ * enable_mask intersects the current global enable_mask.
+ */
+static void
+load_group(unsigned group_id, int level)
+{
+	struct draw_state *ds = &state[group_id];
+
+	/* empty/disabled group */
+	if (!ds->count)
+		return;
+
+	printl(2, "%sgroup_id: %u\n", levels[level], group_id);
+	printl(2, "%scount: %d\n", levels[level], ds->count);
+	/* PRIx64 rather than %llx: matches the uint64_t argument exactly */
+	printl(2, "%saddr: %016"PRIx64"\n", levels[level], ds->addr);
+	printl(2, "%sflags: %x\n", levels[level], ds->flags);
+
+	if (options->gpu_id >= 600) {
+		printl(2, "%senable_mask: 0x%x\n", levels[level], ds->enable_mask);
+
+		if (!(ds->enable_mask & enable_mask)) {
+			printl(2, "%s\tskipped!\n\n", levels[level]);
+			return;
+		}
+	}
+
+	void *ptr = hostptr(ds->addr);
+	if (ptr) {
+		if (!quiet(2))
+			dump_hex(ptr, ds->count, level+1);
+
+		/* decode the group one IB level deeper than the current one */
+		ib++;
+		dump_commands(ptr, ds->count, level+1);
+		ib--;
+	}
+}
+
+/*
+ * Decode every populated draw-state group, then (outside of 'query-compare'
+ * mode) disable them all.
+ */
+static void
+load_all_groups(int level)
+{
+	/* Sanity check: decoding a group must never end up back in here, and
+	 * bad things happen if it does, so refuse to recurse:
+	 */
+	static bool busy = false;
+
+	if (busy) {
+		printf("ERROR: nothing in draw state should trigger recursively loading groups!\n");
+		return;
+	}
+
+	busy = true;
+	unsigned group = 0;
+	while (group < ARRAY_SIZE(state)) {
+		load_group(group, level);
+		group++;
+	}
+	busy = false;
+
+	/* In 'query-compare' mode the groups must survive until the query has
+	 * been processed, so leave disabling them to a later point:
+	 */
+	if (options->query_compare)
+		return;
+
+	disable_all_groups();
+}
+
+/*
+ * Handler for CP_SET_DRAW_STATE.  The payload is a list of entries, each a
+ * header dword (count in bits 0..15, flags in 16..19, enable mask in 20..23,
+ * group id in 24..28) followed by a one dword (32b) or two dword (64b) gpu
+ * address.  Each entry either disables groups or (re)defines one group,
+ * which is decoded immediately when FLAG_LOAD_IMMED is set.
+ */
+static void
+cp_set_draw_state(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	uint32_t i;
+
+	for (i = 0; i < sizedwords; ) {
+		const uint32_t entry_size = is_64b() ? 3 : 2;
+
+		/* don't read past the payload if the last entry is truncated */
+		if (sizedwords - i < entry_size) {
+			printf("%struncated draw state entry (%u dwords left)\n",
+					levels[level], sizedwords - i);
+			break;
+		}
+
+		struct draw_state *ds;
+		uint32_t count = dwords[i] & 0xffff;
+		uint32_t group_id = (dwords[i] >> 24) & 0x1f;
+		/* named to avoid shadowing the global enable_mask */
+		uint32_t group_enable_mask = (dwords[i] >> 20) & 0xf;
+		uint32_t flags = (dwords[i] >> 16) & 0xf;
+		uint64_t addr;
+
+		addr = dwords[i + 1];
+		if (entry_size == 3)
+			addr |= ((uint64_t)dwords[i + 2]) << 32;
+		i += entry_size;
+
+		if (flags & FLAG_DISABLE_ALL_GROUPS) {
+			disable_all_groups();
+			continue;
+		}
+
+		/* validate the index before any access to state[group_id] */
+		assert(group_id < ARRAY_SIZE(state));
+
+		if (flags & FLAG_DISABLE) {
+			disable_group(group_id);
+			continue;
+		}
+
+		disable_group(group_id);
+
+		ds = &state[group_id];
+
+		ds->enable_mask = group_enable_mask;
+		ds->flags = flags;
+		ds->count = count;
+		ds->addr  = addr;
+
+		/* immediate groups are decoded now and do not stay resident */
+		if (flags & FLAG_LOAD_IMMED) {
+			load_group(group_id, level);
+			disable_group(group_id);
+		}
+	}
+}
+
+/* Handler for CP_SET_MODE: remember the first payload dword in draw_mode. */
+static void
+cp_set_mode(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	draw_mode = dwords[0];
+}
+
+/*
+ * execute compute shader
+ *
+ * Handler for CP_EXEC_CS: runs the "compute" query and dumps the register
+ * summary; the payload is not inspected.
+ */
+static void
+cp_exec_cs(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	do_query("compute", 0);
+	dump_register_summary(level);
+}
+
+/*
+ * Handler for CP_EXEC_CS_INDIRECT: like cp_exec_cs(), but first prints and
+ * dumps the gpu address carried in dwords[1] (plus 17 high bits from
+ * dwords[2] on 64b parts).
+ */
+static void
+cp_exec_cs_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	uint64_t addr;
+
+	if (is_64b()) {
+		addr = (((uint64_t)dwords[2] & 0x1ffff) << 32) | dwords[1];
+	} else {
+		addr = dwords[1];
+	}
+
+	/* PRIx64 rather than %llx: matches the uint64_t argument exactly */
+	printl(3, "%saddr: %016"PRIx64"\n", levels[level], addr);
+	dump_gpuaddr_size(addr, level, 0x10, 2);
+
+	do_query("compute", 0);
+	dump_register_summary(level);
+}
+
+/*
+ * Handler for CP_SET_MARKER: look up the render mode named by the low 4 bits
+ * of the first dword and switch the global enable_mask to match it
+ * (binning, gmem or bypass).  Other modes leave enable_mask untouched.
+ */
+static void
+cp_set_marker(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	render_mode = rnn_enumname(rnn, "a6xx_render_mode", dwords[0] & 0xf);
+
+	/* NOTE(review): assumes rnn_enumname() returns NULL for a value that has
+	 * no name in the enum; strcmp() on NULL would crash, so bail out.
+	 */
+	if (!render_mode)
+		return;
+
+	if (!strcmp(render_mode, "RM6_BINNING")) {
+		enable_mask = MODE_BINNING;
+	} else if (!strcmp(render_mode, "RM6_GMEM")) {
+		enable_mask = MODE_GMEM;
+	} else if (!strcmp(render_mode, "RM6_BYPASS")) {
+		enable_mask = MODE_BYPASS;
+	}
+}
+
+/*
+ * Handler for CP_SET_RENDER_MODE (a5xx+ only, asserted).  Records the render
+ * mode name and, depending on the payload size (1, 5 or 8 dwords), dumps the
+ * first referenced buffer and decodes the second one as cmdstream.
+ */
+static void
+cp_set_render_mode(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	uint64_t addr;
+	uint32_t *ptr, len;
+
+	assert(is_64b());
+
+	/* TODO seems to have two ptrs, 9 dwords total (incl pkt7 hdr)..
+	 * not sure if this can come in different sizes.
+	 *
+	 * First ptr doesn't seem to be cmdstream, second one does.
+	 *
+	 * Comment from downstream kernel:
+	 *
+	 * SRM -- set render mode (ex binning, direct render etc)
+	 * SRM is set by UMD usually at start of IB to tell CP the type of
+	 * preemption.
+	 * KMD needs to set SRM to NULL to indicate CP that rendering is
+	 * done by IB.
+	 * ------------------------------------------------------------------
+	 *
+	 * Seems to always be one of these two:
+	 * 70ec0008 00000001 001c0000 00000000 00000010 00000003 0000000d 001c2000 00000000
+	 * 70ec0008 00000001 001c0000 00000000 00000000 00000003 0000000d 001c2000 00000000
+	 *
+	 */
+
+	assert(options->gpu_id >= 500);
+
+	render_mode = rnn_enumname(rnn, "render_mode_cmd", dwords[0]);
+
+	/* short form: just the mode */
+	if (sizedwords == 1)
+		return;
+
+	addr = dwords[1];
+	addr |= ((uint64_t)dwords[2]) << 32;
+
+	mode = dwords[3];
+
+	dump_gpuaddr(addr, level+1);
+
+	/* medium form: no second pointer */
+	if (sizedwords == 5)
+		return;
+
+	assert(sizedwords == 8);
+
+	len = dwords[5];
+	addr = dwords[6];
+	addr |= ((uint64_t)dwords[7]) << 32;
+
+	/* PRIx64 rather than %lx: uint64_t is not 'long' everywhere */
+	printl(3, "%saddr: 0x%016"PRIx64"\n", levels[level], addr);
+	printl(3, "%slen:  0x%x\n", levels[level], len);
+
+	ptr = hostptr(addr);
+
+	if (ptr) {
+		if (!quiet(2)) {
+			ib++;
+			dump_commands(ptr, len, level+1);
+			ib--;
+			dump_hex(ptr, len, level+1);
+		}
+	}
+}
+
+/*
+ * Handler for CP_COMPUTE_CHECKPOINT (a5xx+ only, asserted).  Expects an
+ * 8 dword payload whose dwords 5/6 hold a gpu address and dword 7 a length;
+ * the referenced buffer is decoded as cmdstream and hex-dumped.
+ */
+static void
+cp_compute_checkpoint(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	uint64_t addr;
+	uint32_t *ptr, len;
+
+	assert(is_64b());
+	assert(options->gpu_id >= 500);
+
+	assert(sizedwords == 8);
+
+	addr = dwords[5];
+	addr |= ((uint64_t)dwords[6]) << 32;
+	len = dwords[7];
+
+	/* PRIx64 rather than %lx: uint64_t is not 'long' everywhere */
+	printl(3, "%saddr: 0x%016"PRIx64"\n", levels[level], addr);
+	printl(3, "%slen:  0x%x\n", levels[level], len);
+
+	ptr = hostptr(addr);
+
+	if (ptr) {
+		if (!quiet(2)) {
+			ib++;
+			dump_commands(ptr, len, level+1);
+			ib--;
+			dump_hex(ptr, len, level+1);
+		}
+	}
+}
+
+/*
+ * Handler for CP_BLIT: runs the query for the blit command named by the
+ * first payload dword, calls print_mode() and dumps the register summary.
+ */
+static void
+cp_blit(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	do_query(rnn_enumname(rnn, "cp_blit_cmd", dwords[0]), 0);
+	print_mode(level);
+	dump_register_summary(level);
+}
+
+/*
+ * Handler for CP_CONTEXT_REG_BUNCH: the payload is a list of
+ * (register, value) dword pairs.  Each pair is dumped and recorded, with
+ * summary mode forced off for the duration.
+ */
+static void
+cp_context_reg_bunch(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	/* NOTE: the same register appears to be written several times; it is
+	 * unclear whether the individual writes are tied to the FLUSH_SO_n
+	 * events (if that is what those really are).
+	 */
+	const bool prev_summary = summary;
+	int pair;
+
+	summary = false;
+
+	for (pair = 0; pair < sizedwords; pair += 2) {
+		const uint32_t *entry = &dwords[pair];
+
+		dump_register(entry[0], entry[1], level + 1);
+		reg_set(entry[0], entry[1]);
+	}
+
+	summary = prev_summary;
+}
+
+/*
+ * Handler for CP_REG_WRITE: dump and record a single register write.  The
+ * register offset is the low 16 bits of dwords[1], the value is dwords[2].
+ */
+static void
+cp_reg_write(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	const uint32_t offset = dwords[1] & 0xffff;
+	const uint32_t value = dwords[2];
+
+	dump_register(offset, value, level + 1);
+	reg_set(offset, value);
+}
+
+/*
+ * Handler for CP_SET_CTXSWITCH_IB: print the gpu address held in
+ * dwords[0]/dwords[1] and, if it maps to host memory, decode the buffer as
+ * cmdstream.  The buffer size is the low 16 bits of dwords[2].
+ */
+static void
+cp_set_ctxswitch_ib(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	uint64_t addr;
+	uint32_t size = dwords[2] & 0xffff;
+	void *ptr;
+
+	addr = dwords[0] | ((uint64_t)dwords[1] << 32);
+
+	/* PRIx64 rather than %lx: uint64_t is not 'long' everywhere */
+	printf("addr=%"PRIx64"\n", addr);
+	ptr = hostptr(addr);
+	if (ptr) {
+		dump_commands(ptr, size, level+1);
+	}
+}
+
+/* Handler for CP_SKIP_IB2_ENABLE_GLOBAL: record the first payload dword. */
+static void
+cp_skip_ib2_enable_global(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	skip_ib2_enable_global = dwords[0];
+}
+
+/* Handler for CP_SKIP_IB2_ENABLE_LOCAL: record the first payload dword. */
+static void
+cp_skip_ib2_enable_local(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	skip_ib2_enable_local = dwords[0];
+}
+
+/*
+ * Table of packet handlers, looked up by packet name in get_type3_op() and
+ * used by dump_commands() for both type3 and type7 packets.
+ *
+ * CP(x, fxn, ...) builds one entry: the name "CP_" #x, the handler, and an
+ * optional initializer for the 'options' member.  When
+ * options.load_all_groups is set, dump_commands() calls load_all_groups()
+ * before decoding the packet.
+ */
+#define CP(x, fxn, ...)   { "CP_" #x, fxn, ##__VA_ARGS__ }
+static const struct type3_op {
+	const char *name;
+	/* handler; receives the payload (packet header excluded) */
+	void (*fxn)(uint32_t *dwords, uint32_t sizedwords, int level);
+	struct {
+		bool load_all_groups;
+	} options;
+} type3_op[] = {
+		CP(NOP, cp_nop),
+		CP(INDIRECT_BUFFER, cp_indirect),
+		CP(INDIRECT_BUFFER_PFD, cp_indirect),
+		CP(WAIT_FOR_IDLE, cp_wfi),
+		CP(REG_RMW, cp_rmw),
+		CP(REG_TO_MEM, cp_reg_mem),
+		CP(MEM_TO_REG, cp_reg_mem),  /* same layout as CP_REG_TO_MEM */
+		CP(MEM_WRITE, cp_mem_write),
+		CP(EVENT_WRITE, cp_event_write),
+		CP(RUN_OPENCL, cp_run_cl),
+		CP(DRAW_INDX, cp_draw_indx, {.load_all_groups=true}),
+		CP(DRAW_INDX_2, cp_draw_indx_2, {.load_all_groups=true}),
+		CP(SET_CONSTANT, cp_set_const),
+		CP(IM_LOAD_IMMEDIATE, cp_im_loadi),
+		CP(WIDE_REG_WRITE, cp_wide_reg_write),
+
+		/* for a3xx */
+		CP(LOAD_STATE, cp_load_state),
+		CP(SET_BIN, cp_set_bin),
+
+		/* for a4xx */
+		CP(LOAD_STATE4, cp_load_state),
+		CP(SET_DRAW_STATE, cp_set_draw_state),
+		CP(DRAW_INDX_OFFSET, cp_draw_indx_offset, {.load_all_groups=true}),
+		CP(EXEC_CS, cp_exec_cs, {.load_all_groups=true}),
+		CP(EXEC_CS_INDIRECT, cp_exec_cs_indirect, {.load_all_groups=true}),
+
+		/* for a5xx */
+		CP(SET_RENDER_MODE, cp_set_render_mode),
+		CP(COMPUTE_CHECKPOINT, cp_compute_checkpoint),
+		CP(BLIT, cp_blit),
+		CP(CONTEXT_REG_BUNCH, cp_context_reg_bunch),
+		CP(DRAW_INDIRECT, cp_draw_indirect, {.load_all_groups=true}),
+		CP(DRAW_INDX_INDIRECT, cp_draw_indx_indirect, {.load_all_groups=true}),
+		CP(SKIP_IB2_ENABLE_GLOBAL, cp_skip_ib2_enable_global),
+		CP(SKIP_IB2_ENABLE_LOCAL, cp_skip_ib2_enable_local),
+
+		/* for a6xx */
+		CP(LOAD_STATE6_GEOM, cp_load_state),
+		CP(LOAD_STATE6_FRAG, cp_load_state),
+		CP(LOAD_STATE6, cp_load_state),
+		CP(SET_MODE, cp_set_mode),
+		CP(SET_MARKER, cp_set_marker),
+		CP(REG_WRITE, cp_reg_write),
+
+		CP(SET_CTXSWITCH_IB, cp_set_ctxswitch_ib),
+};
+
+/* Do-nothing handler, used by get_type3_op() for packets without an entry. */
+static void
+noop_fxn(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+}
+
+/*
+ * Find the handler entry for packet opcode 'opc' by matching its name
+ * against the type3_op[] table.  Never returns NULL: opcodes with no name or
+ * no table entry resolve to a dummy entry whose handler does nothing and
+ * whose options are all cleared.
+ */
+static const struct type3_op *
+get_type3_op(unsigned opc)
+{
+	static const struct type3_op dummy_op = {
+		.fxn = noop_fxn,
+	};
+	const struct type3_op *found = &dummy_op;
+	const char *pkt = pktname(opc);
+
+	if (pkt) {
+		const struct type3_op *const last = type3_op + ARRAY_SIZE(type3_op);
+
+		for (const struct type3_op *cur = type3_op; cur != last; cur++) {
+			if (strcmp(pkt, cur->name) == 0) {
+				found = cur;
+				break;
+			}
+		}
+	}
+
+	return found;
+}
+
+/*
+ * Decode 'sizedwords' dwords of cmdstream starting at 'dwords', printing at
+ * indentation 'level'.  Each packet is classified by its header (type0,
+ * type2, type3, type4 or type7), decoded, and then skipped over by 'count'
+ * dwords (the packet size including its header).
+ *
+ * Unrecognized headers abort decoding on pre-a5xx parts; on a5xx+ the
+ * decoder scans forward for the next plausible type4/type7 header instead.
+ */
+void
+dump_commands(uint32_t *dwords, uint32_t sizedwords, int level)
+{
+	int dwords_left = sizedwords;
+	uint32_t count = 0; /* dword count including packet header */
+	uint32_t val;
+
+//	assert(dwords);
+	if (!dwords) {
+		printf("NULL cmd buffer!\n");
+		return;
+	}
+
+	draws[ib] = 0;
+
+	while (dwords_left > 0) {
+
+		current_draw_count = draw_count;
+
+		/* hack, this looks like a -1 underflow, in some versions
+		 * when it tries to write zero registers via pkt0
+		 */
+//		if ((dwords[0] >> 16) == 0xffff)
+//			goto skip;
+
+		if (pkt_is_type0(dwords[0])) {
+			printl(3, "t0");
+			count = type0_pkt_size(dwords[0]) + 1;
+			val = type0_pkt_offset(dwords[0]);
+			assert(val < regcnt());
+			printl(3, "%swrite %s%s (%04x)\n", levels[level+1], regname(val, 1),
+					(dwords[0] & 0x8000) ? " (same register)" : "", val);
+			dump_registers(val, dwords+1, count-1, level+2);
+			if (!quiet(3))
+				dump_hex(dwords, count, level+1);
+		} else if (pkt_is_type4(dwords[0])) {
+			/* basically the same(ish) as type0 prior to a5xx */
+			printl(3, "t4");
+			count = type4_pkt_size(dwords[0]) + 1;
+			val = type4_pkt_offset(dwords[0]);
+			assert(val < regcnt());
+			printl(3, "%swrite %s (%04x)\n", levels[level+1], regname(val, 1), val);
+			dump_registers(val, dwords+1, count-1, level+2);
+			if (!quiet(3))
+				dump_hex(dwords, count, level+1);
+#if 0
+		} else if (pkt_is_type1(dwords[0])) {
+			printl(3, "t1");
+			count = 3;
+			val = dwords[0] & 0xfff;
+			printl(3, "%swrite %s\n", levels[level+1], regname(val, 1));
+			dump_registers(val, dwords+1, 1, level+2);
+			val = (dwords[0] >> 12) & 0xfff;
+			printl(3, "%swrite %s\n", levels[level+1], regname(val, 1));
+			dump_registers(val, dwords+2, 1, level+2);
+			if (!quiet(3))
+				dump_hex(dwords, count, level+1);
+		} else if (pkt_is_type2(dwords[0])) {
+			printl(3, "t2");
+			printf("%sNOP\n", levels[level+1]);
+			count = 1;
+			if (!quiet(3))
+				dump_hex(dwords, count, level+1);
+#endif
+		} else if (pkt_is_type3(dwords[0])) {
+			count = type3_pkt_size(dwords[0]) + 1;
+			val = cp_type3_opcode(dwords[0]);
+			const struct type3_op *op = get_type3_op(val);
+			/* draw-like packets first pull in all pending state groups */
+			if (op->options.load_all_groups)
+				load_all_groups(level+1);
+			printl(3, "t3");
+			const char *name = pktname(val);
+			if (!quiet(2)) {
+				printf("\t%sopcode: %s%s%s (%02x) (%d dwords)%s\n", levels[level],
+						rnn->vc->colors->bctarg, name, rnn->vc->colors->reset,
+						val, count, (dwords[0] & 0x1) ? " (predicated)" : "");
+			}
+			if (name)
+				dump_domain(dwords+1, count-1, level+2, name);
+			op->fxn(dwords+1, count-1, level+1);
+			if (!quiet(2))
+				dump_hex(dwords, count, level+1);
+		} else if (pkt_is_type7(dwords[0])) {
+			count = type7_pkt_size(dwords[0]) + 1;
+			val = cp_type7_opcode(dwords[0]);
+			const struct type3_op *op = get_type3_op(val);
+			/* draw-like packets first pull in all pending state groups */
+			if (op->options.load_all_groups)
+				load_all_groups(level+1);
+			printl(3, "t7");
+			const char *name = pktname(val);
+			if (!quiet(2)) {
+				printf("\t%sopcode: %s%s%s (%02x) (%d dwords)\n", levels[level],
+						rnn->vc->colors->bctarg, name, rnn->vc->colors->reset,
+						val, count);
+			}
+			if (name) {
+				/* special hack for two packets that decode the same way
+				 * on a6xx:
+				 */
+				if (!strcmp(name, "CP_LOAD_STATE6_FRAG") ||
+						!strcmp(name, "CP_LOAD_STATE6_GEOM"))
+					name = "CP_LOAD_STATE6";
+				dump_domain(dwords+1, count-1, level+2, name);
+			}
+			op->fxn(dwords+1, count-1, level+1);
+			if (!quiet(2))
+				dump_hex(dwords, count, level+1);
+		} else if (pkt_is_type2(dwords[0])) {
+			printl(3, "t2");
+			printl(3, "%snop\n", levels[level+1]);
+			/* a type2 packet is a single dword; without this 'count'
+			 * would keep the previous packet's size (or 0 for the first
+			 * packet, looping forever)
+			 */
+			count = 1;
+		} else {
+			/* for 5xx+ we can do a passable job of looking for start of next valid packet: */
+			if (options->gpu_id >= 500) {
+				while (dwords_left > 0) {
+					if (pkt_is_type7(dwords[0]) || pkt_is_type4(dwords[0]))
+						break;
+					printf("bad type! %08x\n", dwords[0]);
+					dwords++;
+					dwords_left--;
+				}
+				/* the scan above already consumed the bad dwords and left
+				 * 'dwords' on the next candidate header; advancing by a
+				 * stale 'count' would skip that packet
+				 */
+				count = 0;
+			} else {
+				printf("bad type! %08x\n", dwords[0]);
+				return;
+			}
+		}
+
+		dwords += count;
+		dwords_left -= count;
+
+	}
+
+	/* a packet claimed more dwords than the buffer had left */
+	if (dwords_left < 0)
+		printf("**** this ain't right!! dwords_left=%d\n", dwords_left);
+}
diff --git a/src/freedreno/decode/cffdec.h b/src/freedreno/decode/cffdec.h
new file mode 100644
index 0000000..695aec3
--- /dev/null
+++ b/src/freedreno/decode/cffdec.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __CFFDEC_H__
+#define __CFFDEC_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/*
+ * Selects when queried registers are dumped; stored in
+ * cffdec_options.query_mode.
+ */
+enum query_mode {
+	/* default mode, dump all queried regs on each draw: */
+	QUERY_ALL = 0,
+
+	/* only dump if any of the queried regs were written
+	 * since last draw:
+	 */
+	QUERY_WRITTEN,
+
+	/* only dump if any of the queried regs changed since
+	 * last draw:
+	 */
+	QUERY_DELTA,
+};
+
+/*
+ * Decoder configuration, handed to cffdec_init().  The int members are
+ * used as boolean flags unless noted otherwise.
+ */
+struct cffdec_options {
+	unsigned gpu_id;
+	int draw_filter;
+	int color;
+	int dump_shaders;
+	int summary;
+	int allregs;
+	int dump_textures;
+	int decode_markers;
+	char *script;
+
+	int query_compare;  /* binning vs SYSMEM/GMEM compare mode */
+	int query_mode;     /* enum query_mode */
+	char **querystrs;   /* array of nquery register names/offsets */
+	int nquery;
+
+	/* In "once" mode, only decode a cmdstream buffer once (per draw
+	 * mode, in the case of a6xx+ where a single cmdstream buffer can
+	 * be used for both binning and draw pass), rather than each time
+	 * encountered (ie. once per tile/bin in GMEM draw passes)
+	 */
+	int once;
+
+	/* for crashdec, where we know CP_IBx_REM_SIZE, we can use this
+	 * to highlight the cmdstream not parsed yet, to make it easier
+	 * to see how far along the CP is.
+	 */
+	struct {
+		uint64_t base;
+		uint32_t rem;
+	} ibs[4];
+};
+
+void printl(int lvl, const char *fmt, ...);
+const char * pktname(unsigned opc);
+uint32_t regbase(const char *name);
+const char * regname(uint32_t regbase, int color);
+bool reg_written(uint32_t regbase);
+uint32_t reg_lastval(uint32_t regbase);
+uint32_t reg_val(uint32_t regbase);
+void reg_set(uint32_t regbase, uint32_t val);
+void reset_regs(void);
+void cffdec_init(const struct cffdec_options *options);
+void dump_register_val(uint32_t regbase, uint32_t dword, int level);
+void dump_commands(uint32_t *dwords, uint32_t sizedwords, int level);
+
+/*
+ * Helpers for packet parsing:
+ */
+
+
+/*
+ * Packet type tags, as found in the top bits of a packet header dword.
+ * Note that the pkt_is_*() / *_pkt_*() macros below expand their argument
+ * more than once, so it must not have side effects.
+ */
+#define CP_TYPE0_PKT 0x00000000
+#define CP_TYPE2_PKT 0x80000000
+#define CP_TYPE3_PKT 0xc0000000
+#define CP_TYPE4_PKT 0x40000000
+#define CP_TYPE7_PKT 0x70000000
+
+/* type0: top two bits clear; size-1 in bits 16..29, offset in bits 0..14 */
+#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT)
+#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1)
+#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF)
+
+/* type2: only the exact header value 0x80000000 is accepted */
+#define pkt_is_type2(pkt) ((pkt) == CP_TYPE2_PKT)
+
+/*
+ * Compute the odd-parity bit for 'val': the eight nibbles of val are xor'd
+ * together and the result indexes the 16 entry lookup table 0x9669.  The
+ * return value is 1 when val has an even number of set bits and 0 otherwise,
+ * so that val plus its parity bit always has an odd number of set bits.
+ *
+ * Used by pkt_is_type4()/pkt_is_type7() to validate packet headers.
+ */
+static inline uint32_t pm4_calc_odd_parity_bit(uint32_t val)
+{
+	return (0x9669 >> (0xf & ((val) ^
+			((val) >> 4) ^ ((val) >> 8) ^ ((val) >> 12) ^
+			((val) >> 16) ^ ((val) >> 20) ^ ((val) >> 24) ^
+			((val) >> 28)))) & 1;
+}
+
+/*
+ * Check both for the type3 opcode and make sure that the reserved bits [1:7]
+ * and 15 are 0
+ */
+#define pkt_is_type3(pkt) \
+        ((((pkt) & 0xC0000000) == CP_TYPE3_PKT) && \
+         (((pkt) & 0x80FE) == 0))
+
+/* type3: opcode in bits 8..15, size-1 in bits 16..29 */
+#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF)
+#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1)
+
+/*
+ * type4: top nibble is 4, bit 27 must equal the odd-parity bit of the
+ * register offset and bit 7 the odd-parity bit of the size.
+ */
+#define pkt_is_type4(pkt) \
+        ((((pkt) & 0xF0000000) == CP_TYPE4_PKT) && \
+         ((((pkt) >> 27) & 0x1) == \
+         pm4_calc_odd_parity_bit(type4_pkt_offset(pkt))) \
+         && ((((pkt) >> 7) & 0x1) == \
+         pm4_calc_odd_parity_bit(type4_pkt_size(pkt))))
+
+/* type4: register offset in bits 8..26, size in bits 0..6 */
+#define type4_pkt_offset(pkt) (((pkt) >> 8) & 0x7FFFF)
+#define type4_pkt_size(pkt) ((pkt) & 0x7F)
+
+/*
+ * type7: top nibble is 7, bits 24..27 must be zero, bit 23 must equal the
+ * odd-parity bit of the opcode and bit 15 the odd-parity bit of the size.
+ */
+#define pkt_is_type7(pkt) \
+        ((((pkt) & 0xF0000000) == CP_TYPE7_PKT) && \
+         (((pkt) & 0x0F000000) == 0) && \
+         ((((pkt) >> 23) & 0x1) == \
+         pm4_calc_odd_parity_bit(cp_type7_opcode(pkt))) \
+         && ((((pkt) >> 15) & 0x1) == \
+         pm4_calc_odd_parity_bit(type7_pkt_size(pkt))))
+
+/* type7: opcode in bits 16..22, size in bits 0..13 */
+#define cp_type7_opcode(pkt) (((pkt) >> 16) & 0x7F)
+#define type7_pkt_size(pkt) ((pkt) & 0x3FFF)
+
+#endif /* __CFFDEC_H__ */
diff --git a/src/freedreno/decode/cffdump.c b/src/freedreno/decode/cffdump.c
new file mode 100644
index 0000000..7fec7dc
--- /dev/null
+++ b/src/freedreno/decode/cffdump.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <err.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+#include <errno.h>
+
+#include "redump.h"
+#include "disasm.h"
+#include "script.h"
+#include "io.h"
+#include "rnnutil.h"
+#include "pager.h"
+#include "buffers.h"
+#include "cffdec.h"
+
+/*
+ * Decoder options, filled in from the command line.  gpu_id defaults to 220
+ * and is replaced by the first RD_GPU_ID section found in the input.
+ */
+static struct cffdec_options options = {
+	.gpu_id = 220,
+};
+
+/* cleared for every section read by handle_file() */
+static bool needs_wfi = false;
+/* set once an RD_CMD section has been seen */
+static bool is_blob = false;
+/* --show-compositor: when clear, cmdstream from some processes is skipped */
+static int show_comp = false;
+/* whether output goes through the pager; defaults to isatty(stdout) */
+static int interactive;
+/* printed after each decoded cmdstream */
+static int vertices;
+
+static int handle_file(const char *filename, int start, int end, int draw);
+
+/*
+ * Print the command line help to stderr and exit with status 2.  'name' is
+ * the program name to show in the usage line.  Does not return.
+ */
+static void print_usage(const char *name)
+{
+	fprintf(stderr, "Usage:\n\n"
+			"\t%s [OPTIONS]... FILE...\n\n"
+			"Options:\n"
+			"\t-v, --verbose    - more verbose disassembly\n"
+			"\t--dump-shaders   - dump each shader to a raw file\n"
+			"\t--no-color       - disable colorized output (default for non-console\n"
+			"\t                   output)\n"
+			"\t--color          - enable colorized output (default for tty output)\n"
+			"\t--no-pager       - disable pager (default for non-console output)\n"
+			"\t--pager          - enable pager (default for tty output)\n"
+			"\t-s, --summary    - don't show individual register writes, but just\n"
+			"\t                   register values on draws\n"
+			"\t-a, --allregs    - show all registers (including ones not written\n"
+			"\t                   since previous draw) on each draw\n"
+			"\t-S, --start=N    - start decoding from frame N\n"
+			"\t-E, --end=N      - stop decoding after frame N\n"
+			"\t-F, --frame=N    - decode only frame N\n"
+			"\t-D, --draw=N     - decode only draw N\n"
+			"\t--textures       - dump texture contents (if possible)\n"
+			"\t-L, --script=LUA - run specified lua script to analyze state\n"
+			"\t-q, --query=REG  - query mode, dump only specified query registers on\n"
+			"\t                   each draw; multiple --query/-q args can be given to\n"
+			"\t                   dump multiple registers; register can be specified\n"
+			"\t                   either by name or numeric offset\n"
+			"\t--query-all      - in query mode, show all queried regs on each draw\n"
+			"\t                   (default query mode)\n"
+			"\t--query-written  - in query mode, show queried regs on draws if any of\n"
+			"\t                   them have been written since previous draw\n"
+			"\t--query-delta    - in query mode, show queried regs on draws if any of\n"
+			"\t                   them have changed since previous draw\n"
+			"\t--query-compare  - dump registers for BINNING vs GMEM/BYPASS per draw;\n"
+			"\t                   only applicable for regs set via SDS group (a6xx+),\n"
+			"\t                   implies --once, can be combined with --query-all,\n"
+			"\t                   --query-written, or --query-delta\n"
+			"\t--once           - decode cmdstream only once (per draw mode); if same\n"
+			"\t                   cmdstream is executed for each tile, this will decode\n"
+			"\t                   it only for the first tile and skip the remainder,\n"
+			"\t                   which can be useful when looking at state that does\n"
+			"\t                   not change per tile\n"
+			"\t--not-once       - decode cmdstream for each IB (default)\n"
+			"\t-h, --help       - show this message\n"
+			, name);
+	exit(2);
+}
+
+/*
+ * Long option table for getopt_long().  Entries with a non-NULL flag pointer
+ * store their value directly (getopt_long() then returns 0); the others
+ * return the character of their short alias.
+ */
+static const struct option opts[] = {
+	/* Long opts that simply set a flag (no corresponding short alias): */
+	{ "dump-shaders",    no_argument, &options.dump_shaders,  1 },
+	{ "no-color",        no_argument, &options.color,         0 },
+	{ "color",           no_argument, &options.color,         1 },
+	{ "no-pager",        no_argument, &interactive,           0 },
+	{ "pager",           no_argument, &interactive,           1 },
+	{ "textures",        no_argument, &options.dump_textures, 1 },
+	{ "show-compositor", no_argument, &show_comp,             1 },
+	{ "query-all",       no_argument, &options.query_mode,    QUERY_ALL },
+	{ "query-written",   no_argument, &options.query_mode,    QUERY_WRITTEN },
+	{ "query-delta",     no_argument, &options.query_mode,    QUERY_DELTA },
+	{ "query-compare",   no_argument, &options.query_compare, 1 },
+	{ "once",            no_argument, &options.once,          1 },
+	{ "not-once",        no_argument, &options.once,          0 },
+
+	/* Long opts with short alias: */
+	{ "verbose",   no_argument,       0, 'v' },
+	{ "summary",   no_argument,       0, 's' },
+	{ "allregs",   no_argument,       0, 'a' },
+	{ "start",     required_argument, 0, 'S' },
+	{ "end",       required_argument, 0, 'E' },
+	{ "frame",     required_argument, 0, 'F' },
+	{ "draw",      required_argument, 0, 'D' },
+	{ "script",    required_argument, 0, 'L' },
+	{ "query",     required_argument, 0, 'q' },
+	{ "help",      no_argument,       0, 'h' },
+
+	/* getopt_long() requires the array to end with an all-zero entry */
+	{ 0, 0, 0, 0 },
+};
+
+/*
+ * Entry point: parse the command line, then decode each input file in turn.
+ *
+ * Returns the result of the last handle_file() call.  If no input file was
+ * given, or the option combination is invalid, the usage text is printed
+ * and the process exits with status 2 (see print_usage()).
+ */
+int main(int argc, char **argv)
+{
+	int ret = -1;
+	int start = 0, end = 0x7ffffff, draw = -1;
+	int c;
+
+	interactive = isatty(STDOUT_FILENO);
+
+	options.color = interactive;
+
+	while ((c = getopt_long(argc, argv, "vsaS:E:F:D:L:q:h", opts, NULL)) != -1) {
+		switch (c) {
+		case 0:
+			/* option that set a flag, nothing to do */
+			break;
+		case 'v':
+			disasm_set_debug(PRINT_RAW | EXPAND_REPEAT | PRINT_VERBOSE);
+			break;
+		case 's':
+			options.summary = true;
+			break;
+		case 'a':
+			options.allregs = true;
+			break;
+		case 'S':
+			start = atoi(optarg);
+			break;
+		case 'E':
+			end = atoi(optarg);
+			break;
+		case 'F':
+			start = end = atoi(optarg);
+			break;
+		case 'D':
+			draw = atoi(optarg);
+			break;
+		case 'L':
+			options.script = optarg;
+			if (script_load(options.script)) {
+				errx(-1, "error loading %s\n", options.script);
+			}
+			break;
+		case 'q': {
+			/* grow the query list by one; keep the old pointer intact
+			 * until the allocation is known to have succeeded
+			 */
+			char **querystrs = realloc(options.querystrs,
+					(options.nquery + 1) * sizeof(*options.querystrs));
+			if (!querystrs)
+				err(-1, "could not allocate query list");
+			options.querystrs = querystrs;
+			options.querystrs[options.nquery] = optarg;
+			options.nquery++;
+			/* query mode output is not paged */
+			interactive = 0;
+			break;
+		}
+		case 'h':
+		default:
+			/* does not return */
+			print_usage(argv[0]);
+		}
+	}
+
+	/* reject inconsistent options up front, before decoding anything */
+	if ((options.query_mode || options.query_compare) && !options.nquery) {
+		fprintf(stderr, "query options only valid in query mode!\n");
+		print_usage(argv[0]);
+	}
+
+	if (interactive) {
+		pager_open();
+	}
+
+	while (optind < argc) {
+		ret = handle_file(argv[optind], start, end, draw);
+		if (ret) {
+			fprintf(stderr, "error reading: %s\n", argv[optind]);
+			fprintf(stderr, "continuing..\n");
+		}
+		optind++;
+	}
+
+	/* no input file given, or the last one could not be opened */
+	if (ret)
+		print_usage(argv[0]);
+
+	script_finish();
+
+	if (interactive) {
+		pager_close();
+	}
+
+	return ret;
+}
+
+/*
+ * Decode an address section: buf[0] is the low 32 bits of the gpu address
+ * and buf[1] the length.  Sections larger than 8 bytes also carry the high
+ * 32 bits of the address in buf[2].
+ */
+static void parse_addr(uint32_t *buf, int sz, unsigned int *len, uint64_t *gpuaddr)
+{
+	const uint64_t lo = buf[0];
+
+	*gpuaddr = lo;
+	*len = buf[1];
+
+	if (sz <= 8)
+		return;
+
+	*gpuaddr |= (uint64_t)buf[2] << 32;
+}
+
+/*
+ * Decode one capture file ("-" reads from fd 0).  The file is a sequence of
+ * sections, each an 8 byte header (type, size) followed by 'size' bytes of
+ * payload.  Buffer sections are registered with add_buffer() and cmdstream
+ * sections whose submit index lies in [start, end] are decoded.
+ *
+ * 'draw' is stored as the draw filter.  Returns -1 if the file could not be
+ * opened and 0 otherwise; a corrupt file is reported on stdout but still
+ * returns 0.
+ */
+static int handle_file(const char *filename, int start, int end, int draw)
+{
+	enum rd_sect_type type = RD_NONE;
+	void *buf = NULL;
+	struct io *io;
+	int submit = 0, got_gpu_id = 0;
+	int sz, ret = 0;
+	bool needs_reset = false;
+	bool skip = false;
+
+	options.draw_filter = draw;
+
+	cffdec_init(&options);
+
+	printf("Reading %s...\n", filename);
+
+	script_start_cmdstream(filename);
+
+	if (!strcmp(filename, "-"))
+		io = io_openfd(0);
+	else
+		io = io_open(filename);
+
+	if (!io) {
+		fprintf(stderr, "could not open: %s\n", filename);
+		return -1;
+	}
+
+	/* address/length announced by the most recent RD_GPUADDR section */
+	struct {
+		unsigned int len;
+		uint64_t gpuaddr;
+	} gpuaddr = {0};
+
+	while (true) {
+		uint32_t arr[2];
+
+		ret = io_readn(io, arr, 8);
+		if (ret <= 0)
+			goto end;
+
+		/* skip over all-ones filler headers */
+		while ((arr[0] == 0xffffffff) && (arr[1] == 0xffffffff)) {
+			ret = io_readn(io, arr, 8);
+			if (ret <= 0)
+				goto end;
+		}
+
+		type = arr[0];
+		sz = arr[1];
+
+		if (sz < 0) {
+			ret = -1;
+			goto end;
+		}
+
+		free(buf);
+
+		needs_wfi = false;
+
+		/* one extra byte so string sections are always NUL terminated;
+		 * size_t arithmetic so that sz == INT_MAX cannot overflow
+		 */
+		buf = malloc((size_t)sz + 1);
+		if (!buf) {
+			/* most likely a bogus size in a damaged file */
+			fprintf(stderr, "could not allocate %d byte section\n", sz);
+			ret = -1;
+			goto end;
+		}
+		((char *)buf)[sz] = '\0';
+		ret = io_readn(io, buf, sz);
+		if (ret < 0)
+			goto end;
+
+		switch(type) {
+		case RD_TEST:
+			printl(1, "test: %s\n", (char *)buf);
+			break;
+		case RD_CMD:
+			is_blob = true;
+			printl(2, "cmd: %s\n", (char *)buf);
+			skip = false;
+			if (!show_comp) {
+				/* skip cmdstream from processes whose name starts
+				 * with one of these:
+				 */
+				skip |= (strstr(buf, "fdperf") == buf);
+				skip |= (strstr(buf, "chrome") == buf);
+				skip |= (strstr(buf, "surfaceflinger") == buf);
+				skip |= ((char *)buf)[0] == 'X';
+			}
+			break;
+		case RD_VERT_SHADER:
+			printl(2, "vertex shader:\n%s\n", (char *)buf);
+			break;
+		case RD_FRAG_SHADER:
+			printl(2, "fragment shader:\n%s\n", (char *)buf);
+			break;
+		case RD_GPUADDR:
+			/* first buffer after a submit: drop the previous set */
+			if (needs_reset) {
+				reset_buffers();
+				needs_reset = false;
+			}
+			parse_addr(buf, sz, &gpuaddr.len, &gpuaddr.gpuaddr);
+			break;
+		case RD_BUFFER_CONTENTS:
+			add_buffer(gpuaddr.gpuaddr, gpuaddr.len, buf);
+			/* buf was handed to add_buffer(), so it must not be freed here */
+			buf = NULL;
+			break;
+		case RD_CMDSTREAM_ADDR:
+			if ((start <= submit) && (submit <= end)) {
+				unsigned int sizedwords;
+				uint64_t cmdaddr;
+				parse_addr(buf, sz, &sizedwords, &cmdaddr);
+				printl(2, "############################################################\n");
+				printl(2, "cmdstream: %d dwords\n", sizedwords);
+				if (!skip) {
+					script_start_submit();
+					dump_commands(hostptr(cmdaddr), sizedwords, 0);
+					script_end_submit();
+				}
+				printl(2, "############################################################\n");
+				printl(2, "vertices: %d\n", vertices);
+			}
+			needs_reset = true;
+			submit++;
+			break;
+		case RD_GPU_ID:
+			/* only the first gpu id in the file is honoured */
+			if (!got_gpu_id) {
+				options.gpu_id = *((unsigned int *)buf);
+				printl(2, "gpu_id: %d\n", options.gpu_id);
+				cffdec_init(&options);
+				got_gpu_id = 1;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+end:
+	script_end_cmdstream();
+
+	io_close(io);
+	fflush(stdout);
+
+	/* release the last section buffer (NULL if ownership was passed on) */
+	free(buf);
+
+	if (ret < 0) {
+		printf("corrupt file\n");
+	}
+	return 0;
+}
diff --git a/src/freedreno/decode/crashdec.c b/src/freedreno/decode/crashdec.c
new file mode 100644
index 0000000..3b17d83
--- /dev/null
+++ b/src/freedreno/decode/crashdec.c
@@ -0,0 +1,1114 @@
+/*
+ * Copyright © 2020 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Decoder for devcoredump traces from drm/msm.  In case of a gpu crash/hang,
+ * the coredump should be found in:
+ *
+ *    /sys/class/devcoredump/devcd<n>/data
+ *
+ * The crashdump will hang around for 5min, it can be cleared by writing to
+ * the file, ie:
+ *
+ *    echo 1 > /sys/class/devcoredump/devcd<n>/data
+ *
+ * (the driver won't log any new crashdumps until the previous one is cleared
+ * or times out after 5min)
+ */
+
+
+#include <assert.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <setjmp.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "buffers.h"
+#include "cffdec.h"
+#include "disasm.h"
+#include "pager.h"
+#include "rnnutil.h"
+#include "util.h"
+#include "instr-a3xx.h"
+
+
+/* Input stream the dump is read from; main() points it at stdin or at
+ * the file given with -f.
+ */
+static FILE *in;
+/* Set by -v/--verbose; enables dumping of the larger sections. */
+static bool verbose;
+
+/* Register databases, loaded by decode() once the "revision:" line has
+ * been seen.  Each one stays NULL unless decode() loads it for the
+ * detected GPU generation.
+ */
+static struct rnn *rnn_gmu;
+static struct rnn *rnn_control;
+static struct rnn *rnn_pipe;
+
+/* Options passed to cffdec_init(); gpu_id is filled in from the
+ * "revision:" line of the dump, the rest from the command line.
+ */
+static struct cffdec_options options = {
+	.draw_filter = -1,
+};
+
+/* GPU generation predicates, keyed off the gpu_id parsed from the dump: */
+
+static inline bool
+is_a6xx(void)
+{
+	return (options.gpu_id >= 600) && (options.gpu_id < 700);
+}
+
+static inline bool
+is_a5xx(void)
+{
+	return (options.gpu_id >= 500) && (options.gpu_id < 600);
+}
+
+/* a5xx and later are treated as having 64b addresses: */
+static inline bool
+is_64b(void)
+{
+	return options.gpu_id >= 500;
+}
+
+/*
+ * Helpers to read register values:
+ */
+
+/* Read a register that is 64b wide on 64b GPUs (ie. a5xx+).  The named
+ * register supplies the low dword; on 64b parts the register right
+ * after it supplies the high dword.
+ */
+static uint64_t
+regval64(const char *name)
+{
+	const unsigned lo = regbase(name);
+
+	assert(lo);
+
+	uint64_t result = reg_val(lo);
+	if (!is_64b())
+		return result;
+
+	const uint64_t hi = reg_val(lo + 1);
+	return result | (hi << 32);
+}
+
+/* Resolve a register name with regbase() and return reg_val() for it. */
+static uint32_t
+regval(const char *name)
+{
+	const unsigned offset = regbase(name);
+
+	assert(offset);
+
+	return reg_val(offset);
+}
+
+/*
+ * Line reading and string helpers:
+ */
+
+/* Most recently read line (owned here, freed on the next read): */
+static char *lastline;
+/* Line handed back by pushline(), returned again by the next popline(): */
+static char *pushedline;
+
+/*
+ * Return the next input line.  A line pushed back with pushline() is
+ * returned first; otherwise a fresh line is read from 'in'.  The process
+ * exits with status 0 when getline() fails, so this never returns NULL.
+ */
+static const char *
+popline(void)
+{
+	/* a pushed-back line takes priority over reading new input: */
+	if (pushedline) {
+		char *again = pushedline;
+		pushedline = NULL;
+		return again;
+	}
+
+	/* nothing references the previous line any more: */
+	free(lastline);
+
+	char *fresh = NULL;
+	size_t cap = 0;
+	if (getline(&fresh, &cap, in) < 0)
+		exit(0);
+
+	lastline = fresh;
+	return lastline;
+}
+
+/*
+ * Push the most recently read line back, so that the next popline()
+ * returns it again.  Only a single line of push-back is supported,
+ * hence the assert.
+ */
+static void
+pushline(void)
+{
+	assert(!pushedline);
+	pushedline = lastline;
+}
+
+/*
+ * Read the next input line and decode it as ascii85 data.
+ *
+ * Each group of up to five characters decodes to one dword, and a lone
+ * 'z' is shorthand for an all-zero dword.  Returns a zero-initialized,
+ * heap allocated buffer of sizedwords dwords which the caller owns.
+ * Encoded data beyond sizedwords dwords is dropped with a warning rather
+ * than written past the end of the buffer.
+ */
+static uint32_t *
+popline_ascii85(uint32_t sizedwords)
+{
+	const char *line = popline();
+
+	/* At this point we expect the ascii85 data to be indented *some*
+	 * amount, and to terminate at the end of the line.  So just eat
+	 * up the leading whitespace.
+	 */
+	assert(*line == ' ');
+	while (*line == ' ')
+		line++;
+
+	/* always allocate at least one dword so a zero size still yields a
+	 * valid pointer; calloc() also checks the size multiplication:
+	 */
+	uint32_t *buf = calloc(sizedwords ? sizedwords : 1, sizeof(*buf));
+	if (!buf) {
+		fprintf(stderr, "out of memory\n");
+		exit(1);
+	}
+	uint32_t idx = 0;
+
+	/* stop at the NUL as well, the last line of a file might not end
+	 * with a newline:
+	 */
+	while ((*line != '\n') && (*line != '\0')) {
+		uint32_t accum = 0;
+
+		if (*line == 'z') {
+			line++;
+		} else {
+			for (int i = 0; (i < 5) && (*line != '\n') && (*line != '\0'); i++) {
+				accum *= 85;
+				accum += *line - '!';
+				line++;
+			}
+		}
+
+		if (idx >= sizedwords) {
+			fprintf(stderr, "ascii85 data larger than expected %u dwords, truncating\n",
+					sizedwords);
+			break;
+		}
+
+		buf[idx++] = accum;
+	}
+
+	return buf;
+}
+
+/*
+ * Return true if 'line' begins with the prefix 'start'.  An empty
+ * prefix matches every line.  Only the first strlen(start) characters
+ * are compared, so a mismatch does not scan the rest of the line.
+ */
+static bool
+startswith(const char *line, const char *start)
+{
+	return strncmp(line, start, strlen(start)) == 0;
+}
+
+/*
+ * sscanf() wrapper that treats a partial match as a fatal error.
+ *
+ * The expected number of conversions is derived from the format string:
+ * every '%' starts a conversion, except for the literal "%%" and for
+ * assignment-suppressed conversions ("%*..."), neither of which is
+ * counted in the scanf return value.  If the number of converted fields
+ * differs, a message is printed to stderr and the process exits with
+ * status 1.
+ */
+static void
+parseline(const char *line, const char *fmt, ...)
+{
+	int n = 0;
+
+	/* scan fmt string to extract expected # of conversions: */
+	for (size_t i = 0; fmt[i] != '\0'; i++) {
+		if (fmt[i] != '%')
+			continue;
+
+		if (fmt[i + 1] == '%') {
+			/* literal percent, skip both characters: */
+			i++;
+			continue;
+		}
+
+		if (fmt[i + 1] == '*') {
+			/* suppressed assignment, not counted by scanf: */
+			continue;
+		}
+
+		n++;
+	}
+
+	va_list ap;
+	va_start(ap, fmt);
+	int converted = vsscanf(line, fmt, ap);
+	va_end(ap);
+
+	if (converted != n) {
+		fprintf(stderr, "parse error scanning: '%s'\n", fmt);
+		exit(1);
+	}
+}
+
+/*
+ * Iterate over the lines of the current section, binding each one to
+ * _line for the statement that follows the macro.  A line that does not
+ * start with a space marks the start of the next section: it is pushed
+ * back with pushline() for the caller and the loop ends.  The trailing
+ * 'else' makes the statement following the macro the loop body, so
+ * 'continue' and 'break' inside that body apply to this loop.
+ */
+#define foreach_line_in_section(_line) \
+	for (const char *_line = popline(); _line; _line = popline()) \
+		/* check for start of next section */                     \
+		if (_line[0] != ' ') {                                    \
+			pushline();                                           \
+			break;                                                \
+		} else
+
+/*
+ * Provide our own disasm assert() handler, so that we can recover
+ * after attempting to disassemble things that might not be valid
+ * instructions:
+ */
+
+/* True while jmp_env holds a context that is safe to longjmp() to: */
+static bool jmp_env_valid;
+static jmp_buf jmp_env;
+
+/*
+ * Assert handler: reports the failed assertion on stdout, then either
+ * unwinds to the context saved in jmp_env (when one is armed via
+ * jmp_env_valid) or aborts the process.  Never returns to the caller.
+ */
+void
+ir3_assert_handler(const char *expr, const char *file, int line,
+		const char *func)
+{
+	/* line is an int, so print it with a matching conversion: */
+	printf("%s:%d: %s: Assertion `%s' failed.\n", file, line, func, expr);
+	if (jmp_env_valid)
+		longjmp(jmp_env, 1);
+	abort();
+}
+
+/*
+ * Evaluate x with the assert handler armed: if ir3_assert_handler() is
+ * hit while x runs, it longjmp()s back here and execution continues
+ * after the TRY().  Nesting is not supported (see the assert).  x must
+ * not leave the block by other means (return/break/goto), since that
+ * would skip clearing jmp_env_valid.
+ */
+#define TRY(x) do { \
+		assert(!jmp_env_valid); \
+		if (setjmp(jmp_env) == 0) { \
+			jmp_env_valid = true; \
+			x; \
+		} \
+		jmp_env_valid = false; \
+	} while (0)
+
+/*
+ * Decode ringbuffer section:
+ */
+
+/* State of each ringbuffer as described in the "ringbuffer:" section: */
+static struct {
+	uint64_t iova;
+	uint32_t rptr;
+	uint32_t wptr;
+	uint32_t size;
+	uint32_t *buf;
+} ringbuffers[5];
+
+/*
+ * Parse the "ringbuffer:" section.  Per-ring attributes are recorded in
+ * ringbuffers[] (indexed by the most recent "id:" line), and the ring
+ * contents are decoded and registered with add_buffer().  All lines
+ * other than the ascii85 data are echoed to stdout.
+ */
+static void
+decode_ringbuffer(void)
+{
+	int id = 0;
+
+	foreach_line_in_section (line) {
+		if (startswith(line, "  - id:")) {
+			parseline(line, "  - id: %d", &id);
+			/* id comes from the file and indexes ringbuffers[], so
+			 * check it even when asserts are compiled out:
+			 */
+			if ((id < 0) || (id >= (int)ARRAY_SIZE(ringbuffers))) {
+				fprintf(stderr, "invalid ringbuffer id: %d\n", id);
+				exit(1);
+			}
+		} else if (startswith(line, "    iova:")) {
+			parseline(line, "    iova: %"SCNx64, &ringbuffers[id].iova);
+		} else if (startswith(line, "    rptr:")) {
+			parseline(line, "    rptr: %u", &ringbuffers[id].rptr);
+		} else if (startswith(line, "    wptr:")) {
+			parseline(line, "    wptr: %u", &ringbuffers[id].wptr);
+		} else if (startswith(line, "    size:")) {
+			parseline(line, "    size: %u", &ringbuffers[id].size);
+		} else if (startswith(line, "    data: !!ascii85 |")) {
+			ringbuffers[id].buf = popline_ascii85(ringbuffers[id].size / 4);
+			add_buffer(ringbuffers[id].iova, ringbuffers[id].size, ringbuffers[id].buf);
+			continue;
+		}
+
+		printf("%s", line);
+	}
+}
+
+/*
+ * Heuristic check whether 'pkt' looks like a packet header.  Only
+ * a5xx+ (type4/type7 packets) is actually checked; for older GPUs every
+ * dword is accepted.
+ */
+static bool
+valid_header(uint32_t pkt)
+{
+	/* TODO maybe we can check validish looking pkt3 opc or pkt0
+	 * register offset.. the cmds sent by kernel are usually
+	 * fairly limited (other than initialization) which confines
+	 * the search space a bit..
+	 */
+	if (options.gpu_id < 500)
+		return true;
+
+	return pkt_is_type4(pkt) || pkt_is_type7(pkt);
+}
+
+/*
+ * Decode the cmdstream of the ringbuffer the CP was executing.
+ *
+ * Reads CP_RB_BASE and the IB1/IB2 base/remaining-size registers from the
+ * recorded register state, resets that state, and then decodes the
+ * unconsumed part of every recorded ringbuffer whose iova matches
+ * CP_RB_BASE.  Rings that are too small to hold a dword, or that have
+ * no recorded contents, are skipped.
+ */
+static void
+dump_cmdstream(void)
+{
+	uint64_t rb_base = regval64("CP_RB_BASE");
+
+	printf("got rb_base=%"PRIx64"\n", rb_base);
+
+	options.ibs[1].base = regval64("CP_IB1_BASE");
+	options.ibs[1].rem  = regval("CP_IB1_REM_SIZE");
+	options.ibs[2].base = regval64("CP_IB2_BASE");
+	options.ibs[2].rem  = regval("CP_IB2_REM_SIZE");
+
+	/* Adjust remaining size to account for cmdstream slurped into ROQ
+	 * but not yet consumed by SQE
+	 *
+	 * TODO add support for earlier GPUs once we tease out the needed
+	 * registers.. see crashit.c in msmtest for hints.
+	 *
+	 * TODO it would be nice to be able to extract out register bitfields
+	 * by name rather than hard-coding this.
+	 */
+	if (is_a6xx()) {
+		options.ibs[1].rem += regval("CP_CSQ_IB1_STAT") >> 16;
+		options.ibs[2].rem += regval("CP_CSQ_IB2_STAT") >> 16;
+	}
+
+	printf("IB1: %"PRIx64", %u\n", options.ibs[1].base, options.ibs[1].rem);
+	printf("IB2: %"PRIx64", %u\n", options.ibs[2].base, options.ibs[2].rem);
+
+	/* now that we've got the regvals we want, reset register state
+	 * so we aren't seeing values from decode_registers();
+	 */
+	reset_regs();
+
+	for (unsigned id = 0; id < ARRAY_SIZE(ringbuffers); id++) {
+		if (ringbuffers[id].iova != rb_base)
+			continue;
+
+		/* The kernel level ringbuffer (RB) wraps around, which
+		 * cffdec doesn't really deal with.. so figure out how
+		 * many dwords are unread
+		 */
+		unsigned ringszdw = ringbuffers[id].size >> 2;  /* in dwords */
+
+		/* a ring smaller than one dword (including an unset size)
+		 * cannot be decoded, and would divide by zero below:
+		 */
+		if (!ringszdw)
+			continue;
+
+		if (!ringbuffers[id].buf) {
+			fprintf(stderr, "ringbuffer %u has no data, skipping\n", id);
+			continue;
+		}
+
+		printf("found ring!\n");
+
+/* helper macro to deal with modulo size math: */
+#define mod_add(b, v)  ((ringszdw + (int)(b) + (int)(v)) % ringszdw)
+
+		/* The rptr will (most likely) have moved past the IB to
+		 * userspace cmdstream, so back up a bit, and then advance
+		 * until we find a valid start of a packet.. this is going
+		 * to be less reliable on a4xx and before (pkt0/pkt3),
+		 * compared to pkt4/pkt7 with parity bits
+		 */
+		const int lookback = 12;
+		unsigned rptr = mod_add(ringbuffers[id].rptr, -lookback);
+
+		for (int idx = 0; idx < lookback; idx++) {
+			if (valid_header(ringbuffers[id].buf[rptr]))
+				break;
+			rptr = mod_add(rptr, 1);
+		}
+
+		unsigned cmdszdw = mod_add(ringbuffers[id].wptr, -rptr);
+
+		printf("got cmdszdw=%u\n", cmdszdw);
+
+		/* linearize the (possibly wrapping) unread range; allocate at
+		 * least one dword so an empty range is not mistaken for OOM:
+		 */
+		uint32_t *buf = malloc(sizeof(*buf) * (cmdszdw ? cmdszdw : 1));
+		if (!buf) {
+			fprintf(stderr, "out of memory\n");
+			exit(1);
+		}
+
+		for (unsigned idx = 0; idx < cmdszdw; idx++) {
+			unsigned p = mod_add(rptr, idx);
+			buf[idx] = ringbuffers[id].buf[p];
+		}
+
+		dump_commands(buf, cmdszdw, 0);
+		free(buf);
+	}
+
+/* mod_add() depends on the local ringszdw, don't leak it any further: */
+#undef mod_add
+}
+
+/*
+ * Decode 'bos' (buffers) section:
+ */
+
+/*
+ * Parse the "bos:" section.  For each buffer the iova and size are
+ * recorded, and once the ascii85 data line is reached the decoded
+ * contents are registered with add_buffer() (and hex-dumped first in
+ * verbose mode).  All other lines are echoed to stdout.
+ */
+static void
+decode_bos(void)
+{
+	uint32_t size = 0;
+	uint64_t iova = 0;
+
+	foreach_line_in_section (line) {
+		if (startswith(line, "  - iova:")) {
+			/* SCNx64 is the scanf counterpart of PRIx64: */
+			parseline(line, "  - iova: %"SCNx64, &iova);
+		} else if (startswith(line, "    size:")) {
+			parseline(line, "    size: %u", &size);
+		} else if (startswith(line, "    data: !!ascii85 |")) {
+			uint32_t *buf = popline_ascii85(size / 4);
+
+			if (verbose)
+				dump_hex_ascii(buf, size, 1);
+
+			/* buf is handed over to add_buffer(), not freed here: */
+			add_buffer(iova, size, buf);
+
+			continue;
+		}
+
+		printf("%s", line);
+	}
+}
+
+/*
+ * Decode registers section:
+ */
+
+/*
+ * Print "name: value" for the register at 'offset' in the given
+ * database.  The value is decoded when type info is available, printed
+ * as raw hex when only the name is known, and the offset itself is
+ * printed in place of the name when the register is unknown.
+ *
+ * NOTE(review): neither 'info' nor 'decoded' is released here; confirm
+ * whether rnn_reginfo()/rnndec_decodeval() hand ownership to the caller.
+ */
+static void
+dump_register(struct rnn *rnn, uint32_t offset, uint32_t value)
+{
+	struct rnndecaddrinfo *info = rnn_reginfo(rnn, offset);
+
+	if (!info) {
+		printf("<%04x>: %08x\n", offset, value);
+		return;
+	}
+
+	if (!info->typeinfo) {
+		printf("%s: %08x\n", info->name, value);
+		return;
+	}
+
+	char *decoded = rnndec_decodeval(rnn->vc, info->typeinfo, value);
+	printf("%s: %s\n", info->name, decoded);
+}
+
+/*
+ * Parse the "registers-gmu:" section, printing the raw value of each
+ * entry followed by its decoding against the GMU register database.
+ */
+static void
+decode_gmu_registers(void)
+{
+	foreach_line_in_section (line) {
+		uint32_t off, val;
+
+		parseline(line, "  - { offset: %x, value: %x }", &off, &val);
+
+		/* the database is indexed by offset/4 (presumably byte
+		 * offset to dword index):
+		 */
+		const uint32_t reg = off / 4;
+
+		printf("\t%08x\t", val);
+		dump_register(rnn_gmu, reg, val);
+	}
+}
+
+/*
+ * Parse the "registers:" section.  Each value is recorded with
+ * reg_set() (so it can be looked up by name later on) and printed
+ * together with its decoding.
+ */
+static void
+decode_registers(void)
+{
+	foreach_line_in_section (line) {
+		uint32_t off, val;
+
+		parseline(line, "  - { offset: %x, value: %x }", &off, &val);
+
+		const uint32_t reg = off / 4;
+
+		reg_set(reg, val);
+		printf("\t%08x", val);
+		dump_register_val(reg, val, 0);
+	}
+}
+
+/*
+ * Parse the "clusters:" section.  Similar to the registers section, but
+ * for banked context regs: cluster-name/context header lines are echoed
+ * as-is, register entries are printed decoded (but not recorded).
+ */
+static void
+decode_clusters(void)
+{
+	foreach_line_in_section (line) {
+		const bool is_header =
+			startswith(line, "  - cluster-name:") ||
+			startswith(line, "    - context:");
+
+		if (is_header) {
+			printf("%s", line);
+			continue;
+		}
+
+		uint32_t off, val;
+		parseline(line, "      - { offset: %x, value: %x }", &off, &val);
+
+		printf("\t%08x", val);
+		dump_register_val(off / 4, val, 0);
+	}
+}
+
+/*
+ * Decode indexed-registers.. these aren't like normal registers, but a
+ * sort of FIFO where successive reads pop out associated debug state.
+ */
+
+/*
+ * Dump CP_SEQ_STAT: the first dword is printed as the PC, followed by
+ * 32 dwords printed as $01..$20 in two columns.  On a6xx, if the first
+ * of those dwords is a type7 packet header with a known opcode, the
+ * packet name is printed as well.  Reads 33 dwords from 'stat'.
+ */
+static void
+dump_cp_seq_stat(uint32_t *stat)
+{
+	printf("\t PC: %04x\n", stat[0]);
+
+	const uint32_t *regs = &stat[1];
+
+	/* Not sure if the non-type7 case can happen, it is ignored: */
+	if (is_a6xx() && valid_header(regs[0]) && pkt_is_type7(regs[0])) {
+		const char *name = pktname(cp_type7_opcode(regs[0]));
+		if (name)
+			printf("\tPKT: %s\n", name);
+	}
+
+	for (int n = 1; n <= 16; n++) {
+		printf("\t$%02x: %08x\t\t$%02x: %08x\n",
+				n, regs[n - 1], n + 16, regs[n + 15]);
+	}
+}
+
+/*
+ * Dump the 0x80 dwords at 'regs' as control registers 0x100-0x17f.
+ * Does nothing when no control register database has been loaded.
+ */
+static void
+dump_control_regs(uint32_t *regs)
+{
+	/* Control regs 0x100-0x17f are a scratch space to be used by the
+	 * firmware however it wants, unlike lower regs which involve some
+	 * fixed-function units. Therefore only these registers get dumped
+	 * directly.
+	 */
+	const uint32_t scratch_base = 0x100;
+	const uint32_t scratch_count = 0x80;
+
+	if (rnn_control == NULL)
+		return;
+
+	for (uint32_t idx = 0; idx < scratch_count; idx++) {
+		const uint32_t val = regs[idx];
+
+		printf("\t%08x\t", val);
+		dump_register(rnn_control, scratch_base + idx, val);
+	}
+}
+
+/*
+ * Dump CP_UCODE_DBG_DATA.  The data consists of six sections, one every
+ * 0x1000 dwords: five are hex-dumped (the jumptable only in verbose
+ * mode) and the last is dumped as scratch control registers.
+ */
+static void
+dump_cp_ucode_dbg(uint32_t *dbg)
+{
+	/* Notes on the data:
+	 * There seems to be a section every 4096 DWORD's. The sections aren't
+	 * all the same size, so the rest of the 4096 DWORD's are filled with
+	 * mirrors of the actual data.
+	 */
+	const unsigned stride = 0x1000;
+
+	static const struct {
+		const char *title;
+		unsigned sizedwords;
+		bool verbose_only;
+	} hexdumps[] = {
+		/* Contains scattered data from a630_sqe.fw: */
+		{ "\tSQE instruction cache:\n",         0x400, false },
+		{ "\tUnknown 1:\n",                     0x80,  false },
+		{ "\tUnknown 2:\n",                     0x200, false },
+		{ "\tUnknown 3:\n",                     0x80,  false },
+		/* Don't bother printing this normally */
+		{ "\tSQE packet jumptable contents:\n", 0x80,  true  },
+	};
+
+	for (unsigned i = 0; i < ARRAY_SIZE(hexdumps); i++) {
+		if (hexdumps[i].verbose_only && !verbose)
+			continue;
+
+		printf("%s", hexdumps[i].title);
+		dump_hex_ascii(dbg + i * stride, 4 * hexdumps[i].sizedwords, 1);
+	}
+
+	/* the sixth section follows the hex-dumped ones: */
+	printf("\tSQE scratch control regs:\n");
+	dump_control_regs(dbg + ARRAY_SIZE(hexdumps) * stride);
+}
+
+/*
+ * Print one register write found in the CP mem pool.
+ *
+ * Pipe register writes are looked up in the pipe register database;
+ * the payload is skipped for registers whose type is named "void".
+ * Other writes are printed as a context register write.  A missing
+ * pipe database, an unknown pipe register or one without type info is
+ * printed raw instead of being dereferenced.
+ */
+static void
+dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context, bool pipe)
+{
+	if (!pipe) {
+		printf("\t\twrite %s (%05x) context %d\n", regname(reg, 1), reg, context);
+		dump_register_val(reg, data, 2);
+		return;
+	}
+
+	if (!rnn_pipe) {
+		/* no pipe register database loaded for this GPU: */
+		printf("\t\twrite <%02x> pipe\n", reg);
+		printf("\t\t\t%08x\n", data);
+		return;
+	}
+
+	struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
+	printf("\t\twrite %s (%02x) pipe\n", info ? info->name : "<unknown>", reg);
+
+	if (info && info->typeinfo && !strcmp(info->typeinfo->name, "void")) {
+		/* registers that ignore their payload */
+		return;
+	}
+
+	printf("\t\t\t");
+	dump_register(rnn_pipe, reg, data);
+}
+
+/*
+ * Decode one mem pool chunk (4 dwords), which can hold up to two
+ * register writes, and print each write whose enable bit is set.
+ */
+static void
+dump_mem_pool_chunk(const uint32_t *chunk)
+{
+	/* Overlay for the chunk; the declared field widths add up to 128
+	 * bits.  NOTE(review): this relies on the compiler's bitfield
+	 * layout for packed structs (presumably LSB-first on a
+	 * little-endian host) -- confirm for other targets.
+	 */
+	struct __attribute__((packed)) {
+		bool reg0_enabled : 1;
+		bool reg1_enabled : 1;
+		uint32_t data0 : 32;
+		uint32_t data1 : 32;
+		uint32_t reg0 : 18;
+		uint32_t reg1 : 18;
+		bool reg0_pipe : 1;
+		bool reg1_pipe : 1;
+		uint32_t reg0_context : 1;
+		uint32_t reg1_context : 1;
+		uint32_t padding : 22;
+	} fields;
+
+	/* copy rather than cast, to avoid aliasing/alignment issues: */
+	memcpy(&fields, chunk, 4 * sizeof(uint32_t));
+
+	if (fields.reg0_enabled) {
+		dump_mem_pool_reg_write(fields.reg0, fields.data0, fields.reg0_context, fields.reg0_pipe);
+	}
+
+	if (fields.reg1_enabled) {
+		dump_mem_pool_reg_write(fields.reg1, fields.data1, fields.reg1_context, fields.reg1_pipe);
+	}
+}
+
+/*
+ * Decode the CP_MEMPOOL indexed-register dump: hex-dump the "data2"
+ * area, then walk each of the six per-cluster queues from its read
+ * pointer to its write pointer, printing every chunk and the register
+ * writes it holds.  Exits with status 1 if a block index is out of
+ * range; a block list that does not terminate is reported and that
+ * queue is abandoned.
+ */
+static void
+dump_cp_mem_pool(uint32_t *mempool)
+{
+	/* The mem pool is a shared pool of memory used for storing in-flight
+	 * register writes. There are 6 different queues, one for each
+	 * cluster. Writing to $data (or for some special registers, $addr)
+	 * pushes data onto the appropriate queue, and each queue is pulled
+	 * from by the appropriate cluster. The queues are thus written to
+	 * in-order, but may be read out-of-order.
+	 *
+	 * The queues are conceptually divided into 128-bit "chunks", and the
+	 * read and write pointers are in units of chunks.  These chunks are
+	 * organized internally into 8-chunk "blocks", and memory is allocated
+	 * dynamically in terms of blocks. Each queue is represented as a
+	 * singly-linked list of blocks, as well as 3-bit start/end chunk
+	 * pointers that point within the first/last block.  The next pointers
+	 * are located in a separate array, rather than inline.
+	 */
+
+	/* TODO: The firmware CP_MEM_POOL save/restore routines do something
+	 * like:
+	 *
+	 * cread $02, [ $00 + 0 ]
+	 * and $02, $02, 0x118
+	 * ...
+	 * brne $02, 0, #label
+	 * mov $03, 0x2000
+	 * mov $03, 0x1000
+	 * label:
+	 * ...
+	 *
+	 * I think that control register 0 is the GPU version, and some
+	 * versions have a smaller mem pool. It seems some models have a mem
+	 * pool that's half the size, and a bunch of offsets are shifted
+	 * accordingly. Unfortunately the kernel driver's dumping code doesn't
+	 * seem to take this into account, even the downstream android driver,
+	 * and we don't know which versions 0x8, 0x10, or 0x100 correspond
+	 * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
+	 */
+	bool small_mem_pool = false;
+
+	/* The array of next pointers for each block. */
+	const uint32_t *next_pointers = small_mem_pool ? &mempool[0x800] : &mempool[0x1000];
+
+	/* Maximum number of blocks in the pool, also the size of the pointers
+	 * array.
+	 */
+	const uint32_t num_blocks = small_mem_pool ? 0x30 : 0x80;
+
+	/* Number of chunks per block */
+	const uint32_t chunks_per_block = 8;
+
+	/* Number of queues */
+	const unsigned num_queues = 6;
+
+	/* Unfortunately the per-queue state is a little more complicated than
+	 * a simple pair of begin/end pointers. Instead of a single beginning
+	 * block, there are *two*, with the property that either the two are
+	 * equal or the second is the "next" of the first. Similarly there are
+	 * two end blocks. Thus the queue either looks like this:
+	 *
+	 * A -> B -> ... -> C -> D
+	 *
+	 * Or like this, or some combination:
+	 *
+	 * A/B -> ... -> C/D
+	 *
+	 * However, there's only one beginning/end chunk offset. Now the
+	 * question is, which of A or B is the actual start? I.e. is the chunk
+	 * offset an offset inside A or B? It depends. I'll show a typical read
+	 * cycle, starting here (read pointer marked with a *) with a chunk
+	 * offset of 0:
+	 *
+	 *	  A                    B
+	 *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+	 * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
+	 *
+	 * Once the pointer advances far enough, the hardware decides to free
+	 * A, after which the read-side state looks like:
+	 *
+	 *	(free)                A/B
+	 *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+	 * |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
+	 *
+	 * Then after advancing the pointer a bit more, the hardware fetches
+	 * the "next" pointer for A and stores it in B:
+	 *
+	 *	(free)                 A                     B
+	 *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+	 * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
+	 *
+	 * Then the read pointer advances into B, at which point we've come
+	 * back to the first state having advanced a whole block:
+	 *
+	 *	(free)                 A                     B
+	 *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+	 * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
+	 *
+	 *
+	 * There is a similar cycle for the write pointer. Now, the question
+	 * is, how do we know which state we're in? We need to know this to
+	 * know whether the pointer (*) is in A or B if they're different. It
+	 * seems like there should be some bit somewhere describing this, but
+	 * after lots of experimentation I've come up empty-handed. For now we
+	 * assume that if the pointer is in the first half, then we're in
+	 * either the first or second state and use B, and otherwise we're in
+	 * the second or third state and use A. So far I haven't seen anything
+	 * that violates this assumption.
+	 */
+
+	struct {
+		uint32_t unk0;
+		uint32_t padding0[7]; /* Mirrors of unk0 */
+
+		struct {
+			uint32_t chunk : 3;
+			uint32_t first_block : 32 - 3;
+		} writer[6];
+		uint32_t padding1[2]; /* Mirrors of writer[4], writer[5] */
+
+		uint32_t unk1;
+		uint32_t padding2[7]; /* Mirrors of unk1 */
+
+		uint32_t writer_second_block[6];
+		uint32_t padding3[2];
+
+		uint32_t unk2[6];
+		uint32_t padding4[2];
+
+		struct {
+			uint32_t chunk : 3;
+			uint32_t first_block : 32 - 3;
+		} reader[6];
+		uint32_t padding5[2]; /* Mirrors of reader[4], reader[5] */
+
+		uint32_t unk3;
+		uint32_t padding6[7]; /* Mirrors of unk3 */
+
+		uint32_t reader_second_block[6];
+		uint32_t padding7[2];
+
+		uint32_t block_count[6];
+		uint32_t padding8[2];
+
+		uint32_t unk4;
+		uint32_t padding9[7]; /* Mirrors of unk4 */
+	} data1;
+
+	const uint32_t *data1_ptr = small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
+	memcpy(&data1, data1_ptr, sizeof(data1));
+
+	/* Based on the kernel, the first dword is the mem pool size (in
+	 * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
+	 */
+	const uint32_t *data2_ptr = small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
+	const int data2_size = 0x60;
+
+	/* This seems to be the size of each queue in chunks. */
+	const uint32_t *queue_sizes = &data2_ptr[0x18];
+
+	printf("\tdata2:\n");
+	dump_hex_ascii(data2_ptr, 4 * data2_size, 1);
+
+	/* These seem to be some kind of counter of allocated/deallocated blocks */
+	if (verbose) {
+		printf("\tunk0: %x\n", data1.unk0);
+		printf("\tunk1: %x\n", data1.unk1);
+		printf("\tunk3: %x\n", data1.unk3);
+		printf("\tunk4: %x\n\n", data1.unk4);
+	}
+
+	for (unsigned queue = 0; queue < num_queues; queue++) {
+		const char *cluster_names[6] = {
+			"FE", "SP_VS", "PC_VS", "GRAS", "SP_PS", "PS"
+		};
+		printf("\tCLUSTER_%s:\n\n", cluster_names[queue]);
+
+		if (verbose) {
+			printf("\t\twriter_first_block: 0x%x\n", data1.writer[queue].first_block);
+			printf("\t\twriter_second_block: 0x%x\n", data1.writer_second_block[queue]);
+			printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
+			printf("\t\treader_first_block: 0x%x\n", data1.reader[queue].first_block);
+			printf("\t\treader_second_block: 0x%x\n", data1.reader_second_block[queue]);
+			printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
+			printf("\t\tblock_count: %d\n", data1.block_count[queue]);
+			printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
+			printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
+		}
+
+		uint32_t cur_chunk = data1.reader[queue].chunk;
+		uint32_t cur_block = cur_chunk > 3 ?
+			data1.reader[queue].first_block :
+			data1.reader_second_block[queue];
+		uint32_t last_chunk = data1.writer[queue].chunk;
+		uint32_t last_block = last_chunk > 3 ?
+			data1.writer[queue].first_block :
+			data1.writer_second_block[queue];
+
+		if (verbose)
+			printf("\tblock %x\n", cur_block);
+		if (cur_block >= num_blocks) {
+			fprintf(stderr, "block %x too large\n", cur_block);
+			exit(1);
+		}
+
+		/* A queue can never hold more chunks than the whole pool, so
+		 * walking further than that means the next pointers loop (or
+		 * never reach the write pointer) in a corrupt dump:
+		 */
+		const unsigned max_chunks = num_blocks * chunks_per_block;
+
+		unsigned calculated_queue_size = 0;
+		while (cur_block != last_block || cur_chunk != last_chunk) {
+			if (calculated_queue_size >= max_chunks) {
+				fprintf(stderr, "queue %u: block list does not terminate\n", queue);
+				break;
+			}
+
+			calculated_queue_size++;
+
+			/* dword index of the chunk: 0x20 dwords per block, 4
+			 * dwords per chunk:
+			 */
+			const uint32_t chunk_dw = cur_block * 0x20 + cur_chunk * 4;
+			uint32_t *chunk_ptr = &mempool[chunk_dw];
+
+			dump_mem_pool_chunk(chunk_ptr);
+
+			/* label the raw data with the chunk's byte offset: */
+			printf("\t%05x: %08x %08x %08x %08x\n",
+			       4 * chunk_dw,
+			       chunk_ptr[0], chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);
+
+			cur_chunk++;
+			if (cur_chunk == chunks_per_block) {
+				cur_block = next_pointers[cur_block];
+				if (verbose)
+					printf("\tblock %x\n", cur_block);
+				if (cur_block >= num_blocks) {
+					fprintf(stderr, "block %x too large\n", cur_block);
+					exit(1);
+				}
+				cur_chunk = 0;
+			}
+		}
+		if (calculated_queue_size != queue_sizes[queue]) {
+			printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n", calculated_queue_size);
+		}
+		printf("\n");
+	}
+}
+
+/*
+ * Parse the "indexed-registers:" section.  For each entry the name and
+ * size (in dwords) are recorded; when the ascii85 data line is reached
+ * the data is decoded, passed to the matching specialized dumper (if
+ * any) and hex-dumped if it is one of the interesting sections or
+ * verbose mode is on.  All other lines are echoed to stdout.
+ */
+static void
+decode_indexed_registers(void)
+{
+	char *name = NULL;
+	uint32_t sizedwords = 0;
+
+	foreach_line_in_section (line) {
+		if (startswith(line, "  - regs-name:")) {
+			free(name);
+			name = NULL;
+			parseline(line, "  - regs-name: %ms", &name);
+		} else if (startswith(line, "    dwords:")) {
+			parseline(line, "    dwords: %u", &sizedwords);
+		} else if (startswith(line, "    data: !!ascii85 |")) {
+			uint32_t *buf = popline_ascii85(sizedwords);
+
+			/* a data line without a preceding regs-name line must
+			 * not pass NULL to strcmp():
+			 */
+			const char *regs = name ? name : "";
+
+			/* some of the sections are pretty large, and are (at least
+			 * so far) not useful, so skip them if not in verbose mode:
+			 */
+			bool dump = verbose ||
+				!strcmp(regs, "CP_SEQ_STAT") ||
+				!strcmp(regs, "CP_DRAW_STATE") ||
+				!strcmp(regs, "CP_ROQ") ||
+				0;
+
+			if (!strcmp(regs, "CP_SEQ_STAT"))
+				dump_cp_seq_stat(buf);
+
+			if (!strcmp(regs, "CP_UCODE_DBG_DATA"))
+				dump_cp_ucode_dbg(buf);
+
+			/* note that name was typo'd in earlier kernels: */
+			if (!strcmp(regs, "CP_MEMPOOL") || !strcmp(regs, "CP_MEMPOOOL"))
+				dump_cp_mem_pool(buf);
+
+			if (dump)
+				dump_hex_ascii(buf, 4 * sizedwords, 1);
+			free(buf);
+
+			continue;
+		}
+
+		printf("%s", line);
+	}
+
+	/* the last parsed name is still owned by us: */
+	free(name);
+}
+
+/*
+ * Decode shader-blocks:
+ */
+
+/*
+ * Parse the "shader-blocks:" section.  For each block the type and size
+ * (in dwords) are recorded; when the ascii85 data line is reached the
+ * data is decoded, disassembled if it is instruction memory, and
+ * hex-dumped if it is instruction memory or verbose mode is on.  All
+ * other lines are echoed to stdout.
+ */
+static void
+decode_shader_blocks(void)
+{
+	char *type = NULL;
+	uint32_t sizedwords = 0;
+
+	foreach_line_in_section (line) {
+		if (startswith(line, "  - type:")) {
+			free(type);
+			type = NULL;
+			parseline(line, "  - type: %ms", &type);
+		} else if (startswith(line, "      size:")) {
+			parseline(line, "      size: %u", &sizedwords);
+		} else if (startswith(line, "    data: !!ascii85 |")) {
+			uint32_t *buf = popline_ascii85(sizedwords);
+
+			/* a data line without a preceding type line must not
+			 * pass NULL to strcmp():
+			 */
+			const bool is_inst_mem = type &&
+				(!strcmp(type, "A6XX_SP_INST_DATA") ||
+				 !strcmp(type, "A6XX_HLSQ_INST_RAM"));
+
+			/* some of the sections are pretty large, and are (at least
+			 * so far) not useful, so skip them if not in verbose mode:
+			 */
+			bool dump = verbose || is_inst_mem;
+
+			if (is_inst_mem) {
+				/* TODO this section actually contains multiple shaders
+				 * (or parts of shaders?), so perhaps we should search
+				 * for ends of shaders and decode each?
+				 */
+				TRY(disasm_a3xx(buf, sizedwords, 1, stdout, options.gpu_id));
+			}
+
+			if (dump)
+				dump_hex_ascii(buf, 4 * sizedwords, 1);
+
+			free(buf);
+
+			continue;
+		}
+
+		printf("%s", line);
+	}
+
+	free(type);
+}
+
+/*
+ * Decode debugbus section:
+ */
+
+/*
+ * Parse the "debugbus:" section.  The data of each block is decoded and
+ * only hex-dumped in verbose mode; all other lines are echoed to
+ * stdout.
+ */
+static void
+decode_debugbus(void)
+{
+	char *block = NULL;
+	uint32_t sizedwords = 0;
+
+	foreach_line_in_section (line) {
+		if (startswith(line, "  - debugbus-block:")) {
+			free(block);
+			block = NULL;
+			parseline(line, "  - debugbus-block: %ms", &block);
+		} else if (startswith(line, "    count:")) {
+			parseline(line, "    count: %u", &sizedwords);
+		} else if (startswith(line, "    data: !!ascii85 |")) {
+			uint32_t *buf = popline_ascii85(sizedwords);
+
+			/* some of the sections are pretty large, and are (at least
+			 * so far) not useful, so skip them if not in verbose mode:
+			 */
+			bool dump = verbose ||
+				0;
+
+			if (dump)
+				dump_hex_ascii(buf, 4 * sizedwords, 1);
+
+			free(buf);
+
+			continue;
+		}
+
+		printf("%s", line);
+	}
+
+	/* the last parsed block name is still owned by us: */
+	free(block);
+}
+
+/*
+ * Main crashdump decode loop:
+ */
+
+/* Load the register databases matching the detected GPU generation: */
+static void
+load_rnn_databases(void)
+{
+	if (is_a6xx()) {
+		rnn_gmu = rnn_new(!options.color);
+		rnn_load_file(rnn_gmu, "adreno/a6xx_gmu.xml", "A6XX");
+		rnn_control = rnn_new(!options.color);
+		rnn_load_file(rnn_control, "adreno/adreno_control_regs.xml", "A6XX_CONTROL_REG");
+		rnn_pipe = rnn_new(!options.color);
+		rnn_load_file(rnn_pipe, "adreno/adreno_pipe_regs.xml", "A6XX_PIPE_REG");
+		return;
+	}
+
+	if (is_a5xx()) {
+		rnn_control = rnn_new(!options.color);
+		rnn_load_file(rnn_control, "adreno/adreno_control_regs.xml", "A5XX_CONTROL_REG");
+		return;
+	}
+
+	rnn_control = NULL;
+}
+
+/*
+ * Top-level loop: echo each section header line and hand the lines that
+ * follow it to the matching section decoder.  The "revision:" line
+ * provides the gpu_id and triggers decoder initialization.
+ */
+static void
+decode(void)
+{
+	for (;;) {
+		const char *line = popline();
+
+		if (!line)
+			break;
+
+		printf("%s", line);
+
+		if (startswith(line, "revision:")) {
+			parseline(line, "revision: %u", &options.gpu_id);
+			printf("Got gpu_id=%u\n", options.gpu_id);
+
+			cffdec_init(&options);
+			load_rnn_databases();
+			continue;
+		}
+
+		if (startswith(line, "bos:")) {
+			decode_bos();
+			continue;
+		}
+
+		if (startswith(line, "ringbuffer:")) {
+			decode_ringbuffer();
+			continue;
+		}
+
+		if (startswith(line, "registers:")) {
+			decode_registers();
+
+			/* after we've recorded buffer contents, and CP register values,
+			 * we can take a stab at decoding the cmdstream:
+			 */
+			dump_cmdstream();
+			continue;
+		}
+
+		if (startswith(line, "registers-gmu:")) {
+			decode_gmu_registers();
+			continue;
+		}
+
+		if (startswith(line, "indexed-registers:")) {
+			decode_indexed_registers();
+			continue;
+		}
+
+		if (startswith(line, "shader-blocks:")) {
+			decode_shader_blocks();
+			continue;
+		}
+
+		if (startswith(line, "clusters:")) {
+			decode_clusters();
+			continue;
+		}
+
+		if (startswith(line, "debugbus:")) {
+			decode_debugbus();
+			continue;
+		}
+	}
+}
+
+/*
+ * Usage and argument parsing:
+ */
+
+/* Print the usage message to stderr and exit with status 2. */
+static void
+usage(void)
+{
+	static const char text[] =
+		"Usage:\n\n"
+		"\tcrashdec [-achmsv] [-f FILE]\n\n"
+		"Options:\n"
+		"\t-a, --allregs   - show all registers (including ones not written since\n"
+		"\t                  previous draw) at each draw\n"
+		"\t-c, --color     - use colors\n"
+		"\t-f, --file=FILE - read input from specified file (rather than stdin)\n"
+		"\t-h, --help      - this usage message\n"
+		"\t-m, --markers   - try to decode CP_NOP string markers\n"
+		"\t-s, --summary   - don't show individual register writes, but just show\n"
+		"\t                  register values on draws\n"
+		"\t-v, --verbose   - dump more verbose output, including contents of\n"
+		"\t                  less interesting buffers\n"
+		"\n";
+
+	fputs(text, stderr);
+	exit(2);
+}
+
+/* Long options, each mapping to the short option handled in main(): */
+static const struct option opts[] = {
+	{ .name = "allregs", .has_arg = no_argument,       .flag = NULL, .val = 'a' },
+	{ .name = "color",   .has_arg = no_argument,       .flag = NULL, .val = 'c' },
+	{ .name = "file",    .has_arg = required_argument, .flag = NULL, .val = 'f' },
+	{ .name = "help",    .has_arg = no_argument,       .flag = NULL, .val = 'h' },
+	{ .name = "markers", .has_arg = no_argument,       .flag = NULL, .val = 'm' },
+	{ .name = "summary", .has_arg = no_argument,       .flag = NULL, .val = 's' },
+	{ .name = "verbose", .has_arg = no_argument,       .flag = NULL, .val = 'v' },
+	/* all-zero terminator: */
+	{ .name = NULL },
+};
+
+/* True when stdout is a terminal, in which case output goes via a pager: */
+static bool interactive;
+
+/* Flush pending output and, when one was opened, close the pager. */
+static void
+cleanup(void)
+{
+	fflush(stdout);
+
+	if (!interactive)
+		return;
+
+	pager_close();
+}
+
+/*
+ * Entry point: parse the command line, set up the pager when writing to
+ * a terminal, and decode the crashdump from stdin or the file given
+ * with -f.  Output is flushed and the pager closed via the atexit()
+ * handler on every exit path after it has been registered.
+ */
+int
+main(int argc, char **argv)
+{
+	int c;
+
+	interactive = isatty(STDOUT_FILENO);
+	options.color = interactive;
+
+	/* default to read from stdin: */
+	in = stdin;
+
+	while ((c = getopt_long(argc, argv, "acf:hmsv", opts, NULL)) != -1) {
+		switch (c) {
+		case 'a':
+			options.allregs = true;
+			break;
+		case 'c':
+			options.color = true;
+			break;
+		case 'f':
+			in = fopen(optarg, "r");
+			if (!in) {
+				/* report the failure rather than crashing on the
+				 * first read from a NULL stream:
+				 */
+				perror(optarg);
+				return 1;
+			}
+			break;
+		case 'm':
+			options.decode_markers = true;
+			break;
+		case 's':
+			options.summary = true;
+			break;
+		case 'v':
+			verbose = true;
+			break;
+		case 'h':
+		default:
+			usage();
+		}
+	}
+
+	if (interactive) {
+		pager_open();
+	}
+
+	atexit(cleanup);
+
+	decode();
+
+	/* cleanup() runs exactly once, from the atexit() handler: */
+	return 0;
+}
diff --git a/src/freedreno/decode/disasm-a2xx.c b/src/freedreno/decode/disasm-a2xx.c
new file mode 100644
index 0000000..314c9c1
--- /dev/null
+++ b/src/freedreno/decode/disasm-a2xx.c
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "disasm.h"
+#include "instr-a2xx.h"
+#include "rnnutil.h"
+
+static const char *levels[] = {	/* indent prefix per nesting level; indexed with the caller's 'level' */
+		"",
+		"\t",
+		"\t\t",
+		"\t\t\t",
+		"\t\t\t\t",
+		"\t\t\t\t\t",
+		"\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t\t\t",
+		"x",	/* entries 10..15 are "x" markers rather than deeper indents */
+		"x",
+		"x",
+		"x",
+		"x",
+		"x",
+};
+
+enum debug_t debug;	/* output flags (e.g. PRINT_RAW); set through disasm_set_debug() */
+
+static struct rnn *rnn;	/* handle used for enum-name lookups; created on the first disasm_a2xx() call */
+
+/*
+ * ALU instructions:
+ */
+
+static const char chan_names[] = {	/* channel letters, indexed by swizzle/selector value */
+		'x', 'y', 'z', 'w',
+		/* these only apply to FETCH dst's: */
+		'0', '1', '?', '_',
+};
+
+static void print_srcreg(uint32_t num, uint32_t type,
+		uint32_t swiz, uint32_t negate, uint32_t abs)
+{
+	/* Emits one ALU source operand as [-][|]{R|C}<num>[.swizzle][|];
+	 * the sign sits outside the absolute-value bars.
+	 */
+	const char *bar = abs ? "|" : "";
+
+	printf("%s%s", negate ? "-" : "", bar);
+	printf("%c%u", type ? 'R' : 'C', num);
+	if (swiz != 0) {
+		uint32_t chan;
+		putchar('.');
+		/* each 2-bit field is an offset from the channel's own index */
+		for (chan = 0; chan < 4; chan++, swiz >>= 2)
+			putchar(chan_names[(swiz + chan) & 0x3]);
+	}
+	printf("%s", bar);
+}
+
+static void print_dstreg(uint32_t num, uint32_t mask, uint32_t dst_exp)
+{
+	uint32_t bit;
+
+	/* destination is either a register ("R") or an export slot */
+	printf("%s%u", dst_exp ? "export" : "R", num);
+	if (mask == 0xf)
+		return;		/* all four channels written: no suffix */
+	putchar('.');
+	for (bit = 0; bit < 4; bit++)
+		putchar((mask & (1u << bit)) ? chan_names[bit] : '_');
+}
+
+static void print_export_comment(uint32_t num, enum shader_t type)
+{
+	/* Appends a "; <name>" remark when the export slot is one of the few
+	 * with a fixed meaning for this shader stage; otherwise prints nothing.
+	 */
+	const char *name = NULL;
+
+	if (type == SHADER_VERTEX) {
+		if (num == 62)
+			name = "gl_Position";
+		else if (num == 63)
+			name = "gl_PointSize";
+	} else if (type == SHADER_FRAGMENT) {
+		if (num == 0)
+			name = "gl_FragColor";
+	}
+
+	/* if we had a symbol table here, we could look
+	 * up the name of the varying..
+	 */
+	if (!name)
+		return;
+
+	printf("\t; %s", name);
+}
+
+struct {	/* ALU opcode tables, indexed directly by the opcode field */
+	uint32_t num_srcs;	/* disasm_alu() uses this to decide which vector sources to print */
+	const char *name;	/* mnemonic; NULL for opcodes that are not listed */
+} vector_instructions[0x20] = {
+#define INSTR(opc, num_srcs) [opc] = { num_srcs, #opc }	/* name is the stringified opcode */
+		INSTR(ADDv, 2),
+		INSTR(MULv, 2),
+		INSTR(MAXv, 2),
+		INSTR(MINv, 2),
+		INSTR(SETEv, 2),
+		INSTR(SETGTv, 2),
+		INSTR(SETGTEv, 2),
+		INSTR(SETNEv, 2),
+		INSTR(FRACv, 1),
+		INSTR(TRUNCv, 1),
+		INSTR(FLOORv, 1),
+		INSTR(MULADDv, 3),
+		INSTR(CNDEv, 3),
+		INSTR(CNDGTEv, 3),
+		INSTR(CNDGTv, 3),
+		INSTR(DOT4v, 2),
+		INSTR(DOT3v, 2),
+		INSTR(DOT2ADDv, 3),  // ???
+		INSTR(CUBEv, 2),
+		INSTR(MAX4v, 1),
+		INSTR(PRED_SETE_PUSHv, 2),
+		INSTR(PRED_SETNE_PUSHv, 2),
+		INSTR(PRED_SETGT_PUSHv, 2),
+		INSTR(PRED_SETGTE_PUSHv, 2),
+		INSTR(KILLEv, 2),
+		INSTR(KILLGTv, 2),
+		INSTR(KILLGTEv, 2),
+		INSTR(KILLNEv, 2),
+		INSTR(DSTv, 2),
+		INSTR(MOVAv, 1),
+}, scalar_instructions[0x40] = {	/* scalar half; disasm_alu() reads only .name from this table */
+		INSTR(ADDs, 1),
+		INSTR(ADD_PREVs, 1),
+		INSTR(MULs, 1),
+		INSTR(MUL_PREVs, 1),
+		INSTR(MUL_PREV2s, 1),
+		INSTR(MAXs, 1),
+		INSTR(MINs, 1),
+		INSTR(SETEs, 1),
+		INSTR(SETGTs, 1),
+		INSTR(SETGTEs, 1),
+		INSTR(SETNEs, 1),
+		INSTR(FRACs, 1),
+		INSTR(TRUNCs, 1),
+		INSTR(FLOORs, 1),
+		INSTR(EXP_IEEE, 1),
+		INSTR(LOG_CLAMP, 1),
+		INSTR(LOG_IEEE, 1),
+		INSTR(RECIP_CLAMP, 1),
+		INSTR(RECIP_FF, 1),
+		INSTR(RECIP_IEEE, 1),
+		INSTR(RECIPSQ_CLAMP, 1),
+		INSTR(RECIPSQ_FF, 1),
+		INSTR(RECIPSQ_IEEE, 1),
+		INSTR(MOVAs, 1),
+		INSTR(MOVA_FLOORs, 1),
+		INSTR(SUBs, 1),
+		INSTR(SUB_PREVs, 1),
+		INSTR(PRED_SETEs, 1),
+		INSTR(PRED_SETNEs, 1),
+		INSTR(PRED_SETGTs, 1),
+		INSTR(PRED_SETGTEs, 1),
+		INSTR(PRED_SET_INVs, 1),
+		INSTR(PRED_SET_POPs, 1),
+		INSTR(PRED_SET_CLRs, 1),
+		INSTR(PRED_SET_RESTOREs, 1),
+		INSTR(KILLEs, 1),
+		INSTR(KILLGTs, 1),
+		INSTR(KILLGTEs, 1),
+		INSTR(KILLNEs, 1),
+		INSTR(KILLONEs, 1),
+		INSTR(SQRT_IEEE, 1),
+		INSTR(MUL_CONST_0, 1),
+		INSTR(MUL_CONST_1, 1),
+		INSTR(ADD_CONST_0, 1),
+		INSTR(ADD_CONST_1, 1),
+		INSTR(SUB_CONST_0, 1),
+		INSTR(SUB_CONST_1, 1),
+		INSTR(SIN, 1),
+		INSTR(COS, 1),
+		INSTR(RETAIN_PREV, 1),
+#undef INSTR
+};
+
+static int disasm_alu(uint32_t *dwords, uint32_t alu_off,
+		int level, int sync, enum shader_t type)
+{
+	/* Prints one 3-dword ALU instruction: the vector op, then on a second
+	 * line the co-issued scalar op if there is one.  Always returns 0.
+	 */
+	instr_alu_t *alu = (instr_alu_t *)dwords;
+
+	printf("%s", levels[level]);
+	if (debug & PRINT_RAW) {
+		printf("%02x: %08x %08x %08x\t", alu_off,
+				dwords[0], dwords[1], dwords[2]);
+	}
+	printf("   %sALU:\t", sync ? "(S)" : "   ");
+
+	/* opcodes absent from the table have a NULL name: print them by
+	 * number, as the scalar half below does, rather than pass NULL to %s */
+	if (vector_instructions[alu->vector_opc].name) {
+		printf("%s", vector_instructions[alu->vector_opc].name);
+	} else {
+		printf("OP(%u)", alu->vector_opc);
+	}
+	if (alu->pred_select & 0x2) {
+		/* seems to work similar to conditional execution in ARM instruction
+		 * set, so let's use a similar syntax for now:
+		 */
+		printf("%s", (alu->pred_select & 0x1) ? "EQ" : "NE");
+	}
+	printf("\t");
+
+	print_dstreg(alu->vector_dest, alu->vector_write_mask, alu->export_data);
+	printf(" = ");
+	if (vector_instructions[alu->vector_opc].num_srcs == 3) {
+		print_srcreg(alu->src3_reg, alu->src3_sel, alu->src3_swiz,
+				alu->src3_reg_negate, alu->src3_reg_abs);
+		printf(", ");
+	}
+	print_srcreg(alu->src1_reg, alu->src1_sel, alu->src1_swiz,
+			alu->src1_reg_negate, alu->src1_reg_abs);
+	if (vector_instructions[alu->vector_opc].num_srcs > 1) {
+		printf(", ");
+		print_srcreg(alu->src2_reg, alu->src2_sel, alu->src2_swiz,
+				alu->src2_reg_negate, alu->src2_reg_abs);
+	}
+	if (alu->vector_clamp)
+		printf(" CLAMP");
+	if (alu->export_data)
+		print_export_comment(alu->vector_dest, type);
+	printf("\n");
+
+	if (alu->scalar_write_mask || !alu->vector_write_mask) {
+		/* 2nd optional scalar op: */
+		printf("%s", levels[level]);
+		if (debug & PRINT_RAW)
+			printf("                          \t");
+		if (scalar_instructions[alu->scalar_opc].name) {
+			printf("\t    \t%s\t", scalar_instructions[alu->scalar_opc].name);
+		} else {
+			printf("\t    \tOP(%u)\t", alu->scalar_opc);
+		}
+		print_dstreg(alu->scalar_dest, alu->scalar_write_mask, alu->export_data);
+		printf(" = ");
+		print_srcreg(alu->src3_reg, alu->src3_sel, alu->src3_swiz,
+				alu->src3_reg_negate, alu->src3_reg_abs);
+		// TODO ADD/MUL must have another src?!?
+		if (alu->scalar_clamp)
+			printf(" CLAMP");
+		if (alu->export_data)
+			print_export_comment(alu->scalar_dest, type);
+		printf("\n");
+	}
+
+	return 0;
+}
+
+
+/*
+ * FETCH instructions:
+ */
+
+static void print_fetch_dst(uint32_t dst_reg, uint32_t dst_swiz)
+{
+	unsigned shift;
+
+	printf("\tR%u.", dst_reg);
+	/* four 3-bit channel selectors, lowest bits first */
+	for (shift = 0; shift < 12; shift += 3)
+		putchar(chan_names[(dst_swiz >> shift) & 0x7]);
+}
+
+static void print_fetch_vtx(instr_fetch_t *fetch)	/* operand printer for VTX_FETCH (see fetch_instructions[]) */
+{
+	instr_fetch_vtx_t *vtx = &fetch->vtx;
+
+	if (vtx->pred_select) {
+		/* seems to work similar to conditional execution in ARM instruction
+		 * set, so let's use a similar syntax for now:
+		 */
+		printf(vtx->pred_condition ? "EQ" : "NE");
+	}
+
+	print_fetch_dst(vtx->dst_reg, vtx->dst_swiz);
+	printf(" = R%u.", vtx->src_reg);
+	printf("%c", chan_names[vtx->src_swiz & 0x3]);	/* a single source channel is printed */
+
+	const char *fmt = rnn_enumname(rnn, "a2xx_sq_surfaceformat", vtx->format);
+	if (fmt) {
+		printf(" %s", fmt);
+	} else  {	/* no enum name returned: fall back to the raw format value */
+		printf(" TYPE(0x%x)", vtx->format);
+	}
+	printf(" %s", vtx->format_comp_all ? "SIGNED" : "UNSIGNED");
+	if (!vtx->num_format_all)
+		printf(" NORMALIZED");
+	printf(" STRIDE(%u)", vtx->stride);
+	if (vtx->offset)
+		printf(" OFFSET(%u)", vtx->offset);
+	printf(" CONST(%u, %u)", vtx->const_index, vtx->const_index_sel);
+	if (0) {	/* compiled-out dump of the remaining fields */
+		// XXX
+		printf(" src_reg_am=%u", vtx->src_reg_am);
+		printf(" dst_reg_am=%u", vtx->dst_reg_am);
+		printf(" num_format_all=%u", vtx->num_format_all);
+		printf(" signed_rf_mode_all=%u", vtx->signed_rf_mode_all);
+		printf(" exp_adjust_all=%u", vtx->exp_adjust_all);
+	}
+}
+
+static void print_fetch_tex(instr_fetch_t *fetch)	/* operand printer shared by all TEX_* entries of fetch_instructions[] */
+{
+	static const char *filter[] = {	/* names for the mag/min/mip/vol filter fields */
+			[TEX_FILTER_POINT] = "POINT",
+			[TEX_FILTER_LINEAR] = "LINEAR",
+			[TEX_FILTER_BASEMAP] = "BASEMAP",
+	};
+	static const char *aniso_filter[] = {
+			[ANISO_FILTER_DISABLED] = "DISABLED",
+			[ANISO_FILTER_MAX_1_1] = "MAX_1_1",
+			[ANISO_FILTER_MAX_2_1] = "MAX_2_1",
+			[ANISO_FILTER_MAX_4_1] = "MAX_4_1",
+			[ANISO_FILTER_MAX_8_1] = "MAX_8_1",
+			[ANISO_FILTER_MAX_16_1] = "MAX_16_1",
+	};
+	static const char *arbitrary_filter[] = {
+			[ARBITRARY_FILTER_2X4_SYM] = "2x4_SYM",
+			[ARBITRARY_FILTER_2X4_ASYM] = "2x4_ASYM",
+			[ARBITRARY_FILTER_4X2_SYM] = "4x2_SYM",
+			[ARBITRARY_FILTER_4X2_ASYM] = "4x2_ASYM",
+			[ARBITRARY_FILTER_4X4_SYM] = "4x4_SYM",
+			[ARBITRARY_FILTER_4X4_ASYM] = "4x4_ASYM",
+	};
+	static const char *sample_loc[] = {
+			[SAMPLE_CENTROID] = "CENTROID",
+			[SAMPLE_CENTER] = "CENTER",
+	};
+	instr_fetch_tex_t *tex = &fetch->tex;
+	uint32_t src_swiz = tex->src_swiz;	/* local copy, consumed by the shift loop below */
+	int i;
+
+	if (tex->pred_select) {
+		/* seems to work similar to conditional execution in ARM instruction
+		 * set, so let's use a similar syntax for now:
+		 */
+		printf(tex->pred_condition ? "EQ" : "NE");
+	}
+
+	print_fetch_dst(tex->dst_reg, tex->dst_swiz);
+	printf(" = R%u.", tex->src_reg);
+	for (i = 0; i < 3; i++) {	/* three source channels, two bits each */
+		printf("%c", chan_names[src_swiz & 0x3]);
+		src_swiz >>= 2;
+	}
+	printf(" CONST(%u)", tex->const_idx);
+	if (tex->fetch_valid_only)
+		printf(" VALID_ONLY");
+	if (tex->tx_coord_denorm)
+		printf(" DENORM");
+	if (tex->mag_filter != TEX_FILTER_USE_FETCH_CONST)	/* each filter is printed only when it is not USE_FETCH_CONST */
+		printf(" MAG(%s)", filter[tex->mag_filter]);
+	if (tex->min_filter != TEX_FILTER_USE_FETCH_CONST)
+		printf(" MIN(%s)", filter[tex->min_filter]);
+	if (tex->mip_filter != TEX_FILTER_USE_FETCH_CONST)
+		printf(" MIP(%s)", filter[tex->mip_filter]);
+	if (tex->aniso_filter != ANISO_FILTER_USE_FETCH_CONST)	/* NOTE(review): table lookups are unchecked; confirm the field cannot exceed the last listed enumerator */
+		printf(" ANISO(%s)", aniso_filter[tex->aniso_filter]);
+	if (tex->arbitrary_filter != ARBITRARY_FILTER_USE_FETCH_CONST)
+		printf(" ARBITRARY(%s)", arbitrary_filter[tex->arbitrary_filter]);
+	if (tex->vol_mag_filter != TEX_FILTER_USE_FETCH_CONST)
+		printf(" VOL_MAG(%s)", filter[tex->vol_mag_filter]);
+	if (tex->vol_min_filter != TEX_FILTER_USE_FETCH_CONST)
+		printf(" VOL_MIN(%s)", filter[tex->vol_min_filter]);
+	if (!tex->use_comp_lod) {	/* NOTE(review): use_comp_lod is zero in this branch, so LOD() always prints 0 */
+		printf(" LOD(%u)", tex->use_comp_lod);
+		printf(" LOD_BIAS(%u)", tex->lod_bias);
+	}
+	if (tex->use_reg_lod) {
+		printf(" REG_LOD(%u)", tex->use_reg_lod);
+	}
+	if (tex->use_reg_gradients)
+		printf(" USE_REG_GRADIENTS");
+	printf(" LOCATION(%s)", sample_loc[tex->sample_location]);
+	if (tex->offset_x || tex->offset_y || tex->offset_z)	/* offsets are omitted when all three are zero */
+		printf(" OFFSET(%u,%u,%u)", tex->offset_x, tex->offset_y, tex->offset_z);
+}
+
+struct {	/* FETCH opcode table, indexed by the instruction's opc field */
+	const char *name;	/* text printed after "FETCH:" */
+	void (*fxn)(instr_fetch_t *cf);	/* prints the operands; NULL for opcodes not listed below */
+} fetch_instructions[] = {
+#define INSTR(opc, name, fxn) [opc] = { name, fxn }
+		INSTR(VTX_FETCH, "VERTEX", print_fetch_vtx),
+		INSTR(TEX_FETCH, "SAMPLE", print_fetch_tex),
+		INSTR(TEX_GET_BORDER_COLOR_FRAC, "?", print_fetch_tex),
+		INSTR(TEX_GET_COMP_TEX_LOD, "?", print_fetch_tex),
+		INSTR(TEX_GET_GRADIENTS, "?", print_fetch_tex),
+		INSTR(TEX_GET_WEIGHTS, "?", print_fetch_tex),
+		INSTR(TEX_SET_TEX_LOD, "SET_TEX_LOD", print_fetch_tex),
+		INSTR(TEX_SET_GRADIENTS_H, "?", print_fetch_tex),
+		INSTR(TEX_SET_GRADIENTS_V, "?", print_fetch_tex),
+		INSTR(TEX_RESERVED_4, "?", print_fetch_tex),
+#undef INSTR
+};
+
+static int disasm_fetch(uint32_t *dwords, uint32_t alu_off, int level, int sync)
+{	/* Prints one 3-dword FETCH instruction; always returns 0. */
+	instr_fetch_t *fetch = (instr_fetch_t *)dwords;
+	unsigned opc = fetch->opc;
+
+	printf("%s", levels[level]);
+	if (debug & PRINT_RAW)
+		printf("%02x: %08x %08x %08x\t", alu_off, dwords[0], dwords[1], dwords[2]);
+	printf("   %sFETCH:\t", sync ? "(S)" : "   ");
+	if (opc < sizeof(fetch_instructions) / sizeof(fetch_instructions[0]) &&
+			fetch_instructions[opc].fxn) {
+		printf("%s", fetch_instructions[opc].name);
+		fetch_instructions[opc].fxn(fetch);
+	} else {	/* sparse table: unlisted opcodes have no name or printer */
+		printf("OP(%u)", opc);
+	}
+	printf("\n");
+	return 0;
+}
+
+/* CF instructions: */
+
+static int cf_exec(instr_cf_t *cf)
+{	/* 1 for every opcode of the EXEC family, 0 otherwise */
+	switch (cf->opc) {
+	case EXEC: case EXEC_END: case COND_EXEC: case COND_EXEC_END:
+	case COND_PRED_EXEC: case COND_PRED_EXEC_END:
+	case COND_EXEC_PRED_CLEAN: case COND_EXEC_PRED_CLEAN_END:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int cf_cond_exec(instr_cf_t *cf)
+{
+	/* the EXEC family minus its two unconditional members */
+	const unsigned opc = cf->opc;
+
+	if (opc == EXEC || opc == EXEC_END)
+		return 0;
+	return cf_exec(cf);
+}
+
+static void print_cf_nop(instr_cf_t *cf)	/* table entry for CF opcodes that print no operands */
+{
+}
+
+static void print_cf_exec(instr_cf_t *cf)	/* operand printer for the EXEC family of CF opcodes */
+{
+	printf(" ADDR(0x%x) CNT(0x%x)", cf->exec.address, cf->exec.count);
+	if (cf->exec.yeild)	/* (sic) field name as spelled in the instruction struct */
+		printf(" YIELD");
+	if (cf->exec.vc)
+		printf(" VC(0x%x)", cf->exec.vc);
+	if (cf->exec.bool_addr)
+		printf(" BOOL_ADDR(0x%x)", cf->exec.bool_addr);
+	if (cf->exec.address_mode == ABSOLUTE_ADDR)
+		printf(" ABSOLUTE_ADDR");
+	if (cf_cond_exec(cf))	/* the condition is printed only for the conditional variants */
+		printf(" COND(%d)", cf->exec.condition);
+}
+
+static void print_cf_loop(instr_cf_t *cf)	/* operand printer for LOOP_START and LOOP_END */
+{
+	printf(" ADDR(0x%x) LOOP_ID(%d)", cf->loop.address, cf->loop.loop_id);
+	if (cf->loop.address_mode == ABSOLUTE_ADDR)
+		printf(" ABSOLUTE_ADDR");
+}
+
+static void print_cf_jmp_call(instr_cf_t *cf)	/* operand printer for COND_CALL, RETURN and COND_JMP */
+{
+	printf(" ADDR(0x%x) DIR(%d)", cf->jmp_call.address, cf->jmp_call.direction);
+	if (cf->jmp_call.force_call)
+		printf(" FORCE_CALL");
+	if (cf->jmp_call.predicated_jmp)	/* the condition is shown only for predicated jumps */
+		printf(" COND(%d)", cf->jmp_call.condition);
+	if (cf->jmp_call.bool_addr)
+		printf(" BOOL_ADDR(0x%x)", cf->jmp_call.bool_addr);
+	if (cf->jmp_call.address_mode == ABSOLUTE_ADDR)
+		printf(" ABSOLUTE_ADDR");
+}
+
+static void print_cf_alloc(instr_cf_t *cf)	/* operand printer for the ALLOC CF opcode */
+{
+	static const char *bufname[] = {	/* indexed by alloc.buffer_select */
+			[SQ_NO_ALLOC] = "NO ALLOC",
+			[SQ_POSITION] = "POSITION",
+			[SQ_PARAMETER_PIXEL] = "PARAM/PIXEL",
+			[SQ_MEMORY] = "MEMORY",
+	};
+	printf(" %s SIZE(0x%x)", bufname[cf->alloc.buffer_select], cf->alloc.size);	/* NOTE(review): unchecked lookup; confirm buffer_select cannot exceed SQ_MEMORY */
+	if (cf->alloc.no_serial)
+		printf(" NO_SERIAL");
+	if (cf->alloc.alloc_mode) // ???
+		printf(" ALLOC_MODE");
+}
+
+struct {	/* CF opcode table, indexed by cf->opc in print_cf() */
+	const char *name;	/* the stringified opcode, printed first */
+	void (*fxn)(instr_cf_t *cf);	/* prints the opcode-specific operands */
+} cf_instructions[] = {
+#define INSTR(opc, fxn) [opc] = { #opc, fxn }
+		INSTR(NOP, print_cf_nop),
+		INSTR(EXEC, print_cf_exec),
+		INSTR(EXEC_END, print_cf_exec),
+		INSTR(COND_EXEC, print_cf_exec),
+		INSTR(COND_EXEC_END, print_cf_exec),
+		INSTR(COND_PRED_EXEC, print_cf_exec),
+		INSTR(COND_PRED_EXEC_END, print_cf_exec),
+		INSTR(LOOP_START, print_cf_loop),
+		INSTR(LOOP_END, print_cf_loop),
+		INSTR(COND_CALL, print_cf_jmp_call),
+		INSTR(RETURN, print_cf_jmp_call),
+		INSTR(COND_JMP, print_cf_jmp_call),
+		INSTR(ALLOC, print_cf_alloc),
+		INSTR(COND_EXEC_PRED_CLEAN, print_cf_exec),
+		INSTR(COND_EXEC_PRED_CLEAN_END, print_cf_exec),
+		INSTR(MARK_VS_FETCH_DONE, print_cf_nop),  // ??
+#undef INSTR
+};
+
+static void print_cf(instr_cf_t *cf, int level)	/* prints one CF instruction on its own line */
+{
+	printf("%s", levels[level]);
+	if (debug & PRINT_RAW) {
+		uint16_t *words = (uint16_t *)cf;	/* raw view: the instruction is dumped as three 16-bit words */
+		printf("    %04x %04x %04x            \t",
+				words[0], words[1], words[2]);
+	}
+	printf("%s", cf_instructions[cf->opc].name);	/* NOTE(review): unchecked lookup; confirm every opc value has a table entry */
+	cf_instructions[cf->opc].fxn(cf);
+	printf("\n");
+}
+
+/*
+ * The adreno shader microcode consists of two parts:
+ *   1) A CF (control-flow) program, at the header of the compiled shader,
+ *      which refers to ALU/FETCH instructions that follow it by address.
+ *   2) ALU and FETCH instructions
+ */
+
+int disasm_a2xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type)
+{	/* Disassembles a complete a2xx shader held in dwords[0..sizedwords); returns 0. */
+	instr_cf_t *cfs = (instr_cf_t *)dwords;
+	int idx, max_idx = 0, num_cfs = 0;
+
+	if (!rnn) {
+		rnn = rnn_new(1);
+		rnn_load(rnn, "a2xx");
+	}
+	if (sizedwords > 0)	/* CF slots that fit in the caller's buffer */
+		num_cfs = (sizedwords * sizeof(dwords[0])) / sizeof(cfs[0]);
+
+	/* the first exec CF gives the start of the ALU/FETCH section, which is
+	 * also the end of the CF program; never scan past the buffer for it */
+	for (idx = 0; idx < num_cfs; idx++) {
+		if (cf_exec(&cfs[idx])) {
+			max_idx = 2 * cfs[idx].exec.address;
+			break;
+		}
+	}
+	if (max_idx > num_cfs)
+		max_idx = num_cfs;
+	for (idx = 0; idx < max_idx; idx++) {
+		instr_cf_t *cf = &cfs[idx];
+		uint32_t sequence = cf->exec.serialize;
+		uint32_t i, count = cf_exec(cf) ? cf->exec.count : 0;
+
+		print_cf(cf, level);
+		for (i = 0; i < count; i++, sequence >>= 2) {
+			uint32_t alu_off = (cf->exec.address + i);
+			if ((alu_off + 1) * 3 > (uint32_t)sizedwords)
+				break;	/* instruction lies outside the buffer */
+			if (sequence & 0x1)
+				disasm_fetch(dwords + alu_off * 3, alu_off, level, sequence & 0x2);
+			else
+				disasm_alu(dwords + alu_off * 3, alu_off, level, sequence & 0x2, type);
+		}
+	}
+	return 0;
+}
+
+void disasm_set_debug(enum debug_t d)	/* selects the output flags tested by the print routines */
+{
+	debug = d;	/* stored in the file-level global 'debug' */
+}
diff --git a/src/freedreno/decode/disasm-a3xx.c b/src/freedreno/decode/disasm-a3xx.c
new file mode 100644
index 0000000..9645dc5
--- /dev/null
+++ b/src/freedreno/decode/disasm-a3xx.c
@@ -0,0 +1,1641 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "disasm.h"
+#include "instr-a3xx.h"
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))	/* element count; needs a real array, not a pointer */
+
+extern enum debug_t debug;	/* defined in disasm-a2xx.c */
+
+static const char *levels[] = {	/* indent prefix per nesting level; indexed with ctx->level */
+		"",
+		"\t",
+		"\t\t",
+		"\t\t\t",
+		"\t\t\t\t",
+		"\t\t\t\t\t",
+		"\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t\t\t",
+		"x",	/* entries 10..15 are "x" markers rather than deeper indents */
+		"x",
+		"x",
+		"x",
+		"x",
+		"x",
+};
+
+static const char *component = "xyzw";	/* component letter, indexed by a 0..3 component number */
+
+static const char *type[] = {	/* type suffixes used in the mov/cov mnemonics, indexed by TYPE_* */
+		[TYPE_F16] = "f16",
+		[TYPE_F32] = "f32",
+		[TYPE_U16] = "u16",
+		[TYPE_U32] = "u32",
+		[TYPE_S16] = "s16",
+		[TYPE_S32] = "s32",
+		[TYPE_U8]  = "u8",
+		[TYPE_S8]  = "s8",
+};
+
+
+#define MAX_REG 4096	/* number of register indices a mask can track */
+
+typedef struct {	/* two bitmaps with one bit per register index */
+	uint8_t full[MAX_REG/8];	/* selected when 'full' is true in regmask_set()/regmask_get() */
+	uint8_t half[MAX_REG/8];	/* selected when 'full' is false */
+} regmask_t;
+
+struct disasm_ctx {	/* state shared by the print helpers during one disassembly */
+	FILE *out;	/* stream all disassembly text is written to */
+	int level;	/* index into levels[] for indentation */
+	unsigned gpu_id;	/* passed to instr_opc() when decoding opcodes */
+
+	struct shader_stats *stats;	/* counters reported by print_reg_stats() */
+
+	/* we have to process the dst register after src to avoid tripping up
+	 * the read-before-write detection
+	 */
+	unsigned last_dst;	/* regidx() of the pending destination */
+	bool last_dst_full;
+	bool last_dst_valid;	/* cleared once process_reg_dst() has consumed it */
+
+	/* current instruction repeat flag: */
+	unsigned repeat;
+	/* current instruction repeat indx/offset (for --expand): */
+	unsigned repeatidx;
+
+	/* tracking for register usage */
+	struct {
+		regmask_t used;
+		regmask_t used_merged;	/* a full register sets two consecutive half-register bits here */
+		regmask_t rbw;      /* read before write */
+		regmask_t war;      /* write after read */
+		regmask_t cnst;     /* used consts */
+	} regs;
+};
+
+static const char *float_imms[] = {	/* symbolic names print_reg() shows for small float immediate values */
+	"0.0",
+	"0.5",
+	"1.0",
+	"2.0",
+	"e",
+	"pi",
+	"1/pi",
+	"1/log2(e)",
+	"log2(e)",
+	"1/log2(10)",
+	"log2(10)",
+	"4.0",
+};
+
+static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full,
+		bool is_float, bool r,
+		bool c, bool im, bool neg, bool abs, bool addr_rel)
+{	/* Formats one operand to ctx->out; performs no usage tracking. */
+	const char type = c ? 'c' : 'r';	/* register-file letter: const or general register */
+
+	// XXX I prefer - and || for neg/abs, but preserving format used
+	// by libllvm-a3xx for easy diffing..
+
+	if (abs && neg)
+		fprintf(ctx->out, "(absneg)");
+	else if (neg)
+		fprintf(ctx->out, "(neg)");
+	else if (abs)
+		fprintf(ctx->out, "(abs)");
+
+	if (r)
+		fprintf(ctx->out, "(r)");
+
+	if (im) {
+		if (is_float && full && reg.iim_val < ARRAY_SIZE(float_imms)) {	/* full-size float immediates inside the table print by name */
+			fprintf(ctx->out, "(%s)", float_imms[reg.iim_val]);
+		} else {
+			fprintf(ctx->out, "%d", reg.iim_val);
+		}
+	} else if (addr_rel) {
+		/* I would just use %+d but trying to make it diff'able with
+		 * libllvm-a3xx...
+		 */
+		if (reg.iim_val < 0)	/* iim_val is the signed offset from a0.x */
+			fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
+		else if (reg.iim_val > 0)
+			fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
+		else
+			fprintf(ctx->out, "%s%c<a0.x>", full ? "" : "h", type);
+	} else if ((reg.num == REG_A0) && !c) {
+		/* This matches libllvm output, the second (scalar) address register
+		 * seems to be called a1.x instead of a0.y.
+		 */
+		fprintf(ctx->out, "a%d.x", reg.comp);
+	} else if ((reg.num == REG_P0) && !c) {
+		fprintf(ctx->out, "p0.%c", component[reg.comp]);
+	} else {	/* ordinary register: optional "h" prefix for half precision */
+		fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]);
+	}
+}
+
+/* Tracking for registers used, read-before-write (input), and
+ * write-after-read (output.. but not 100%)..
+ */
+
+static void regmask_set(regmask_t *regmask, unsigned num, bool full, unsigned val)
+{
+	/* pick the bitmap first; the bit update is the same for both */
+	uint8_t *bits = full ? regmask->full : regmask->half;
+	const unsigned byte = num / 8;
+	const unsigned shift = num % 8;
+
+	ir3_assert(num < MAX_REG);
+	/* clear the register's bit, then OR in the new value */
+	bits[byte] = (bits[byte] & ~(1 << shift)) | (val << shift);
+}
+
+static unsigned regmask_get(regmask_t *regmask, unsigned num, bool full)
+{
+	/* Returns 1 if register index 'num' is set in the chosen bitmap, else 0. */
+	const uint8_t *bits;
+
+	ir3_assert(num < MAX_REG);
+	bits = full ? regmask->full : regmask->half;
+
+	/* one bit per register, eight registers to a byte */
+	return (bits[num / 8] >> (num % 8)) & 0x1;
+}
+
+static unsigned regidx(reg_t reg)
+{	/* flat scalar index: four components per register number */
+	return reg.comp + (4 * reg.num);
+}
+
+static reg_t idxreg(unsigned idx)
+{
+	/* inverse of regidx(): the low two bits select the component */
+	reg_t reg = { .comp = idx % 4, .num = idx / 4 };
+
+	return reg;
+}
+
+/* Print one run of consecutive register indices, as " N" or " N-M". */
+static void print_reg_sequence(struct disasm_ctx *ctx, int first, int last)
+{
+	if (first == MAX_REG)	/* sentinel: no run has been started yet */
+		return;
+	if (first == last)
+		fprintf(ctx->out, " %d", first);
+	else
+		fprintf(ctx->out, " %d-%d", first, last);
+}
+
+/* Print the indices set in the full or half bitmap as compact ranges, then
+ * "(cnt=, max=)".  Returns the highest set index below 48*4 (0 if none).
+ * The range printer is a plain static helper, not a GCC nested function. */
+static int print_regs(struct disasm_ctx *ctx, regmask_t *regmask, bool full)
+{
+	int num, max = 0, cnt = 0;
+	int first = MAX_REG, last = MAX_REG;
+
+	for (num = 0; num < MAX_REG; num++) {
+		if (!regmask_get(regmask, num, full))
+			continue;
+		if (num != (last + 1)) {
+			/* gap: flush the previous run and start a new one */
+			print_reg_sequence(ctx, first, last);
+			first = num;
+		}
+		last = num;
+		if (num < (48*4))
+			max = num;
+		cnt++;
+	}
+	print_reg_sequence(ctx, first, last);
+	fprintf(ctx->out, " (cnt=%d, max=%d)", cnt, max);
+
+	return max;
+}
+
+static void print_reg_stats(struct disasm_ctx *ctx)	/* writes the register-usage and shaderdb summary to ctx->out */
+{
+	int fullreg, halfreg;	/* values returned by print_regs() for the 'used' masks */
+
+	fprintf(ctx->out, "%sRegister Stats:\n", levels[ctx->level]);
+	fprintf(ctx->out, "%s- used (half):", levels[ctx->level]);
+	halfreg = print_regs(ctx, &ctx->regs.used, false);
+	fprintf(ctx->out, "\n");
+	fprintf(ctx->out, "%s- used (full):", levels[ctx->level]);
+	fullreg = print_regs(ctx, &ctx->regs.used, true);
+	fprintf(ctx->out, "\n");
+	fprintf(ctx->out, "%s- used (merged):", levels[ctx->level]);
+	print_regs(ctx, &ctx->regs.used_merged, false);	/* the merged mask only uses the half bitmap */
+	fprintf(ctx->out, "\n");
+	fprintf(ctx->out, "%s- input (half):", levels[ctx->level]);
+	print_regs(ctx, &ctx->regs.rbw, false);	/* read-before-write registers are reported as inputs */
+	fprintf(ctx->out, "\n");
+	fprintf(ctx->out, "%s- input (full):", levels[ctx->level]);
+	print_regs(ctx, &ctx->regs.rbw, true);
+	fprintf(ctx->out, "\n");
+	fprintf(ctx->out, "%s- const (half):", levels[ctx->level]);
+	print_regs(ctx, &ctx->regs.cnst, false);
+	fprintf(ctx->out, "\n");
+	fprintf(ctx->out, "%s- const (full):", levels[ctx->level]);
+	print_regs(ctx, &ctx->regs.cnst, true);
+	fprintf(ctx->out, "\n");
+	fprintf(ctx->out, "%s- output (half):", levels[ctx->level]);
+	print_regs(ctx, &ctx->regs.war, false);	/* registers written and not read afterwards are reported as outputs */
+	fprintf(ctx->out, "  (estimated)\n");
+	fprintf(ctx->out, "%s- output (full):", levels[ctx->level]);
+	print_regs(ctx, &ctx->regs.war, true);
+	fprintf(ctx->out, "  (estimated)\n");
+
+	/* convert to vec4, which is the granularity that registers are
+	 * assigned to shader:
+	 */
+	fullreg = (fullreg + 3) / 4;
+	halfreg = (halfreg + 3) / 4;
+
+	// Note this count of instructions includes rptN, which matches
+	// up to how mesa prints this:
+	fprintf(ctx->out, "%s- shaderdb: %d instructions, %d nops, %d non-nops, "
+			"(%d instlen), %d half, %d full\n",
+			levels[ctx->level], ctx->stats->instructions, ctx->stats->nops,
+			ctx->stats->instructions - ctx->stats->nops, ctx->stats->instlen,
+			halfreg, fullreg);
+	fprintf(ctx->out, "%s- shaderdb: %d (ss), %d (sy)\n", levels[ctx->level],
+			ctx->stats->ss, ctx->stats->sy);
+}
+
+static void process_reg_dst(struct disasm_ctx *ctx)	/* applies the destination stashed by print_reg_dst() to the usage masks */
+{
+	int i;
+
+	if (!ctx->last_dst_valid)	/* nothing pending */
+		return;
+
+	for (i = 0; i <= ctx->repeat; i++) {	/* one destination register per repeat step */
+		unsigned dst = ctx->last_dst + i;
+
+		regmask_set(&ctx->regs.war, dst, ctx->last_dst_full, 1);	/* stays set until a later read clears it in print_reg_src() */
+		regmask_set(&ctx->regs.used, dst, ctx->last_dst_full, 1);
+
+		if (ctx->last_dst_full) {	/* a full register occupies two slots of the merged mask */
+			regmask_set(&ctx->regs.used_merged, (dst*2)+0, false, 1);
+			regmask_set(&ctx->regs.used_merged, (dst*2)+1, false, 1);
+		} else {
+			regmask_set(&ctx->regs.used_merged, dst, false, 1);
+		}
+	}
+
+	ctx->last_dst_valid = false;	/* consumed */
+}
+
+static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
+{	/* Prints a destination operand and stashes it for process_reg_dst(). */
+	/* presumably the special registers a0.c and p0.c don't count.. */
+	if (!(addr_rel || (reg.num == 61) || (reg.num == 62))) {
+		ctx->last_dst = regidx(reg);	/* recorded before the repeat offset below is applied */
+		ctx->last_dst_full = full;
+		ctx->last_dst_valid = true;
+	}
+	reg = idxreg(regidx(reg) + ctx->repeatidx);	/* printed register is shifted by the current repeat index */
+	print_reg(ctx, reg, full, false, false, false, false, false, false, addr_rel);
+}
+
+static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool f, bool r,
+		bool c, bool im, bool neg, bool abs, bool addr_rel)
+{	/* Updates the usage masks for a source operand, then prints it via print_reg(). */
+	/* presumably the special registers a0.c and p0.c don't count.. */
+	if (!(addr_rel || c || im || (reg.num == 61) || (reg.num == 62))) {
+		int i, num = regidx(reg);
+		for (i = 0; i <= ctx->repeat; i++) {
+			unsigned src = num + i;
+
+			if (!regmask_get(&ctx->regs.used, src, full))	/* first touch is a read: record it as read-before-write */
+				regmask_set(&ctx->regs.rbw, src, full, 1);
+
+			regmask_set(&ctx->regs.war, src, full, 0);	/* a read clears the write-after-read mark */
+			regmask_set(&ctx->regs.used, src, full, 1);
+
+			if (full) {	/* a full register occupies two slots of the merged mask */
+				regmask_set(&ctx->regs.used_merged, (src*2)+0, false, 1);
+				regmask_set(&ctx->regs.used_merged, (src*2)+1, false, 1);
+			} else {
+				regmask_set(&ctx->regs.used_merged, src, false, 1);
+			}
+
+			if (!r)	/* without (r) only the first register is tracked */
+				break;
+		}
+	} else if (c) {	/* const source: record it and grow stats->constlen */
+		int i, num = regidx(reg);
+		for (i = 0; i <= ctx->repeat; i++) {
+			unsigned src = num + i;
+
+			regmask_set(&ctx->regs.cnst, src, full, 1);
+
+			if (!r)
+				break;
+		}
+
+		unsigned max = (num + ctx->repeat + 1 + 3) / 4;	/* rounded up to groups of four scalars */
+		if (max > ctx->stats->constlen)
+			ctx->stats->constlen = max;
+	}
+
+	if (r)	/* (r) sources are printed shifted by the current repeat index */
+		reg = idxreg(regidx(reg) + ctx->repeatidx);
+
+	print_reg(ctx, reg, full, f, r, c, im, neg, abs, addr_rel);
+}
+
+/* TODO switch to using reginfo struct everywhere, since more readable
+ * than passing a bunch of bools to print_reg_src
+ */
+
+struct reginfo {	/* bundles the arguments print_src() forwards to print_reg_src() */
+	reg_t reg;
+	bool full;	/* false prints the "h" (half) prefix */
+	bool r;	/* (r) flag */
+	bool c;	/* const register rather than a general register */
+	bool f; /* src reg is interpreted as float, used for printing immediates */
+	bool im;	/* operand is an immediate */
+	bool neg;
+	bool abs;
+	bool addr_rel;	/* addressed relative to a0.x */
+};
+
+static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
+{
+	/* Unpacks a reginfo and prints it as a source operand.
+	 * The register is handed over unmodified: print_reg_src() adds
+	 * ctx->repeatidx itself for (r) sources and does its usage tracking
+	 * from the base register, so offsetting here too applied it twice.
+	 */
+	print_reg_src(ctx, info->reg, info->full, info->f, info->r, info->c, info->im,
+			info->neg, info->abs, info->addr_rel);
+}
+
+//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
+//{
+//	print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
+//}
+
+static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)	/* prints the operands of a category 0 instruction */
+{
+	static const struct {
+		const char *suffix;	/* appended directly after the branch mnemonic */
+		int nsrc;	/* how many p0.c predicate operands are printed */
+		bool idx;	/* print ".<idx>" after the suffix */
+	} brinfo[7] = {	/* NOTE(review): indexed by cat0->brtype unchecked; confirm it cannot exceed BRANCH_X */
+		[BRANCH_PLAIN] = { "r",   1, false },
+		[BRANCH_OR]    = { "rao", 2, false },
+		[BRANCH_AND]   = { "raa", 2, false },
+		[BRANCH_CONST] = { "rac", 0, true  },
+		[BRANCH_ANY]   = { "any", 1, false },
+		[BRANCH_ALL]   = { "all", 1, false },
+		[BRANCH_X]     = { "rax", 0, false },
+	};
+	instr_cat0_t *cat0 = &instr->cat0;
+
+	switch (instr_opc(instr, ctx->gpu_id)) {	/* opcodes not listed print no operands */
+	case OPC_KILL:
+	case OPC_PREDT:
+	case OPC_PREDF:
+		fprintf(ctx->out, " %sp0.%c", cat0->inv0 ? "!" : "",
+				component[cat0->comp0]);
+		break;
+	case OPC_B:
+		fprintf(ctx->out, "%s", brinfo[cat0->brtype].suffix);
+		if (brinfo[cat0->brtype].idx) {
+			fprintf(ctx->out, ".%u", cat0->idx);
+		}
+		if (brinfo[cat0->brtype].nsrc >= 1) {
+			fprintf(ctx->out, " %sp0.%c,", cat0->inv0 ? "!" : "",
+					component[cat0->comp0]);
+		}
+		if (brinfo[cat0->brtype].nsrc >= 2) {
+			fprintf(ctx->out, " %sp0.%c,", cat0->inv1 ? "!" : "",
+					component[cat0->comp1]);
+		}
+		fprintf(ctx->out, " #%d", cat0->a3xx.immed);	/* immediate is printed last */
+		break;
+	case OPC_JUMP:
+	case OPC_CALL:
+	case OPC_BKT:
+	case OPC_GETONE:
+	case OPC_SHPS:
+		fprintf(ctx->out, " #%d", cat0->a3xx.immed);
+		break;
+	}
+
+	if ((debug & PRINT_VERBOSE) && (cat0->dummy3|cat0->dummy4))	/* verbose mode: show the dummy fields when nonzero */
+		fprintf(ctx->out, "\t{0: %x,%x}", cat0->dummy3, cat0->dummy4);
+}
+
+static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)	/* prints a category 1 (mov/cov/mova) instruction */
+{
+	instr_cat1_t *cat1 = &instr->cat1;
+
+	if (cat1->ul)
+		fprintf(ctx->out, "(ul)");
+
+	if (cat1->src_type == cat1->dst_type) {	/* same type on both sides prints "mov", different types "cov" */
+		if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
+			/* special case (mnemonic?): */
+			fprintf(ctx->out, "mova");
+		} else {
+			fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+		}
+	} else {
+		fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+	}
+
+	fprintf(ctx->out, " ");
+
+	if (cat1->even)
+		fprintf(ctx->out, "(even)");
+
+	if (cat1->pos_inf)
+		fprintf(ctx->out, "(pos_infinity)");
+
+	print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
+			cat1->dst_rel);
+
+	fprintf(ctx->out, ", ");
+
+	/* ugg, have to special case this.. vs print_reg().. */
+	if (cat1->src_im) {	/* immediate is formatted according to the source type */
+		if (type_float(cat1->src_type))
+			fprintf(ctx->out, "(%f)", cat1->fim_val);
+		else if (type_uint(cat1->src_type))
+			fprintf(ctx->out, "0x%08x", cat1->uim_val);
+		else
+			fprintf(ctx->out, "%d", cat1->iim_val);
+	} else if (cat1->src_rel && !cat1->src_c) {
+		/* I would just use %+d but trying to make it diff'able with
+		 * libllvm-a3xx...
+		 */
+		char type = cat1->src_rel_c ? 'c' : 'r';	/* shadows the file-level type[] table inside this block */
+		const char *full = (type_size(cat1->src_type) == 32) ? "" : "h";
+		if (cat1->off < 0)
+			fprintf(ctx->out, "%s%c<a0.x - %d>", full, type, -cat1->off);
+		else if (cat1->off > 0)
+			fprintf(ctx->out, "%s%c<a0.x + %d>", full, type, cat1->off);
+		else
+			fprintf(ctx->out, "%s%c<a0.x>", full, type);
+	} else {	/* plain register or const source: goes through the tracked path */
+		struct reginfo src = {
+			.reg = (reg_t)cat1->src,
+			.full = type_size(cat1->src_type) == 32,
+			.r = cat1->src_r,
+			.c = cat1->src_c,
+			.im = cat1->src_im,
+		};
+		print_src(ctx, &src);
+	}
+
+	if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))	/* verbose mode: show must_be_0 when it is nonzero */
+		fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0);
+}
+
+static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)	/* prints the operands of a category 2 instruction */
+{
+	instr_cat2_t *cat2 = &instr->cat2;
+	int opc = _OPC(2, cat2->opc);
+	static const char *cond[] = {	/* NOTE(review): 7 entries, indexed by cat2->cond unchecked; confirm the field's range */
+			"lt",
+			"le",
+			"gt",
+			"ge",
+			"eq",
+			"ne",
+			"?6?",
+	};
+
+	switch (opc) {	/* only the compare opcodes carry a condition suffix */
+	case OPC_CMPS_F:
+	case OPC_CMPS_U:
+	case OPC_CMPS_S:
+	case OPC_CMPV_F:
+	case OPC_CMPV_U:
+	case OPC_CMPV_S:
+		fprintf(ctx->out, ".%s", cond[cat2->cond]);
+		break;
+	}
+
+	fprintf(ctx->out, " ");
+	if (cat2->ei)
+		fprintf(ctx->out, "(ei)");
+	print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);	/* dst_half flips the destination's precision relative to the sources */
+	fprintf(ctx->out, ", ");
+
+	struct reginfo src1 = {
+		.full = cat2->full,
+		.r = cat2->repeat ? cat2->src1_r : 0,	/* (r) is honoured only when the instruction repeats */
+		.f = is_cat2_float(opc),
+		.im = cat2->src1_im,
+		.abs = cat2->src1_abs,
+		.neg = cat2->src1_neg,
+	};
+
+	if (cat2->c1.src1_c) {	/* the source field has three layouts: const, a0.x-relative, plain */
+		src1.reg = (reg_t)(cat2->c1.src1);
+		src1.c = true;
+	} else if (cat2->rel1.src1_rel) {
+		src1.reg = (reg_t)(cat2->rel1.src1);
+		src1.c = cat2->rel1.src1_c;
+		src1.addr_rel = true;
+	} else {
+		src1.reg = (reg_t)(cat2->src1);
+	}
+	print_src(ctx, &src1);
+
+	struct reginfo src2 = {
+		.r = cat2->repeat ? cat2->src2_r : 0,
+		.full = cat2->full,
+		.f = is_cat2_float(opc),
+		.abs = cat2->src2_abs,
+		.neg = cat2->src2_neg,
+		.im = cat2->src2_im,
+	};
+	switch (opc) {
+	case OPC_ABSNEG_F:
+	case OPC_ABSNEG_S:
+	case OPC_CLZ_B:
+	case OPC_CLZ_S:
+	case OPC_SIGN_F:
+	case OPC_FLOOR_F:
+	case OPC_CEIL_F:
+	case OPC_RNDNE_F:
+	case OPC_RNDAZ_F:
+	case OPC_TRUNC_F:
+	case OPC_NOT_B:
+	case OPC_BFREV_B:
+	case OPC_SETRM:
+	case OPC_CBITS_B:
+		/* these only have one src reg */
+		break;
+	default:	/* every other opcode prints a second source, decoded like the first */
+		fprintf(ctx->out, ", ");
+		if (cat2->c2.src2_c) {
+			src2.reg = (reg_t)(cat2->c2.src2);
+			src2.c = true;
+		} else if (cat2->rel2.src2_rel) {
+			src2.reg = (reg_t)(cat2->rel2.src2);
+			src2.c = cat2->rel2.src2_c;
+			src2.addr_rel = true;
+		} else {
+			src2.reg = (reg_t)(cat2->src2);
+		}
+		print_src(ctx, &src2);
+		break;
+	}
+}
+
+/* Print the cat3 operands as " dst, src1, src2, src3" (sources are decoded first). */
+static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat3_t *cat3 = &instr->cat3;
+	const bool is_full = instr_cat3_full(cat3);
+	struct reginfo s1 = {
+		.full = is_full,
+		.neg = cat3->src1_neg,
+		.r = cat3->repeat ? cat3->src1_r : 0,
+	};
+	if (cat3->c1.src1_c) {
+		s1.c = true;
+		s1.reg = (reg_t)(cat3->c1.src1);
+	} else if (cat3->rel1.src1_rel) {
+		s1.addr_rel = true;
+		s1.c = cat3->rel1.src1_c;
+		s1.reg = (reg_t)(cat3->rel1.src1);
+	} else {
+		s1.reg = (reg_t)(cat3->src1);
+	}
+
+	struct reginfo s2 = {
+		.full = is_full,
+		.neg = cat3->src2_neg,
+		.c = cat3->src2_c,
+		.r = cat3->repeat ? cat3->src2_r : 0,
+		.reg = (reg_t)cat3->src2,
+	};
+
+	struct reginfo s3 = {
+		.full = is_full,
+		.neg = cat3->src3_neg,
+		.r = cat3->src3_r,
+	};
+	if (cat3->c2.src3_c) {
+		s3.c = true;
+		s3.reg = (reg_t)(cat3->c2.src3);
+	} else if (cat3->rel2.src3_rel) {
+		s3.addr_rel = true;
+		s3.c = cat3->rel2.src3_c;
+		s3.reg = (reg_t)(cat3->rel2.src3);
+	} else {
+		s3.reg = (reg_t)(cat3->src3);
+	}
+
+	fprintf(ctx->out, " ");
+	print_reg_dst(ctx, (reg_t)(cat3->dst), is_full ^ cat3->dst_half, false);
+	fprintf(ctx->out, ", ");
+	print_src(ctx, &s1);
+	fprintf(ctx->out, ", ");
+	print_src(ctx, &s2);
+	fprintf(ctx->out, ", ");
+	print_src(ctx, &s3);
+}
+
+/* Print the cat4 operands as " dst, src"; verbose mode also shows non-zero dummy bits. */
+static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat4_t *cat4 = &instr->cat4;
+	struct reginfo operand = {
+		.full = cat4->full,
+		.im = cat4->src_im,
+		.r = cat4->src_r,
+		.abs = cat4->src_abs,
+		.neg = cat4->src_neg,
+	};
+	if (cat4->c.src_c) {
+		operand.c = true;
+		operand.reg = (reg_t)(cat4->c.src);
+	} else if (cat4->rel.src_rel) {
+		operand.addr_rel = true;
+		operand.c = cat4->rel.src_c;
+		operand.reg = (reg_t)(cat4->rel.src);
+	} else {
+		operand.reg = (reg_t)(cat4->src);
+	}
+
+	fprintf(ctx->out, " ");
+	print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
+	fprintf(ctx->out, ", ");
+	print_src(ctx, &operand);
+
+	if ((debug & PRINT_VERBOSE) && (cat4->dummy1 | cat4->dummy2))
+		fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
+}
+
+/* Print the cat5 flag suffixes and operands.  Which operands (src1, src2, s#,
+ * t#) an opcode has comes from info[]; desc_features[] describes desc_mode.
+ */
+static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
+{
+	/* Indexed directly by cat5->opc.  Sized 0x20 on the assumption that opc
+	 * is a 5-bit field (TODO confirm); 0x1f left opc == 31 out of bounds.
+	 */
+	static const struct {
+		bool src1, src2, samp, tex;
+	} info[0x20] = {
+			[opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
+			[opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
+			[opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
+			[opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
+			[opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
+			[opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
+			[opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
+			[opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
+			[opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
+			[opc_op(OPC_DSX)]      = { true,  false, false, false, },
+			[opc_op(OPC_DSY)]      = { true,  false, false, false, },
+			[opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
+			[opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
+			[opc_op(OPC_RGETPOS)]  = { true,  false, false, false, },
+			[opc_op(OPC_RGETINFO)] = { false, false, false, false, },
+	};
+
+	static const struct {
+		bool indirect;
+		bool bindless;
+		bool use_a1;
+		bool uniform;
+	} desc_features[8] = {
+		[CAT5_NONUNIFORM] = { .indirect = true, },
+		[CAT5_UNIFORM] = { .indirect = true, .uniform = true, },
+		[CAT5_BINDLESS_IMM] = { .bindless = true, },
+		[CAT5_BINDLESS_UNIFORM] = {
+			.bindless = true,
+			.indirect = true,
+			.uniform = true,
+		},
+		[CAT5_BINDLESS_NONUNIFORM] = {
+			.bindless = true,
+			.indirect = true,
+		},
+		[CAT5_BINDLESS_A1_IMM] = {
+			.bindless = true,
+			.use_a1 = true,
+		},
+		[CAT5_BINDLESS_A1_UNIFORM] = {
+			.bindless = true,
+			.indirect = true,
+			.uniform = true,
+			.use_a1 = true,
+		},
+		[CAT5_BINDLESS_A1_NONUNIFORM] = {
+			.bindless = true,
+			.indirect = true,
+			.use_a1 = true,
+		},
+	};
+
+	instr_cat5_t *cat5 = &instr->cat5;
+	int i;
+
+	const bool is_s2en = cat5->is_s2en_bindless;
+	const unsigned mode = cat5->s2en_bindless.desc_mode;
+	bool desc_indirect = is_s2en && desc_features[mode].indirect;
+	bool bindless = is_s2en && desc_features[mode].bindless;
+	bool use_a1 = is_s2en && desc_features[mode].use_a1;
+	bool uniform = is_s2en && desc_features[mode].uniform;
+
+	if (cat5->is_3d)   fprintf(ctx->out, ".3d");
+	if (cat5->is_a)    fprintf(ctx->out, ".a");
+	if (cat5->is_o)    fprintf(ctx->out, ".o");
+	if (cat5->is_p)    fprintf(ctx->out, ".p");
+	if (cat5->is_s)    fprintf(ctx->out, ".s");
+	if (desc_indirect) fprintf(ctx->out, ".s2en");
+	if (uniform)       fprintf(ctx->out, ".uniform");
+
+	if (bindless) {
+		unsigned base = (cat5->s2en_bindless.base_hi << 1) | cat5->base_lo;
+		fprintf(ctx->out, ".base%u", base);
+	}
+
+	fprintf(ctx->out, " ");
+
+	switch (_OPC(5, cat5->opc)) {
+	case OPC_DSXPP_1:
+	case OPC_DSYPP_1:
+		break;
+	default:
+		fprintf(ctx->out, "(%s)", type[cat5->type]);
+		break;
+	}
+
+	fprintf(ctx->out, "(");
+	for (i = 0; i < 4; i++)
+		if (cat5->wrmask & (1 << i))
+			fprintf(ctx->out, "%c", "xyzw"[i]);
+	fprintf(ctx->out, ")");
+
+	print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
+
+	if (info[cat5->opc].src1) {
+		fprintf(ctx->out, ", ");
+		struct reginfo src = { .reg = (reg_t)(cat5->src1), .full = cat5->full };
+		print_src(ctx, &src);
+	}
+
+	if (cat5->is_o || info[cat5->opc].src2) {
+		fprintf(ctx->out, ", ");
+		struct reginfo src = { .reg = (reg_t)(cat5->src2), .full = cat5->full };
+		print_src(ctx, &src);
+	}
+	if (cat5->is_s2en_bindless) {
+		if (!desc_indirect) {
+			if (info[cat5->opc].samp) {
+				if (use_a1)
+					fprintf(ctx->out, ", s#%d", cat5->s2en_bindless.src3);
+				else
+					fprintf(ctx->out, ", s#%d", cat5->s2en_bindless.src3 & 0xf);
+			}
+
+			if (info[cat5->opc].tex && !use_a1) {
+				fprintf(ctx->out, ", t#%d", cat5->s2en_bindless.src3 >> 4);
+			}
+		}
+	} else {
+		if (info[cat5->opc].samp)
+			fprintf(ctx->out, ", s#%d", cat5->norm.samp);
+		if (info[cat5->opc].tex)
+			fprintf(ctx->out, ", t#%d", cat5->norm.tex);
+	}
+
+	if (desc_indirect) {
+		fprintf(ctx->out, ", ");
+		struct reginfo src = { .reg = (reg_t)(cat5->s2en_bindless.src3), .full = bindless };
+		print_src(ctx, &src);
+	}
+
+	if (use_a1)
+		fprintf(ctx->out, ", a1.x");
+
+	/* in verbose mode, show the dummy1 field when it is non-zero */
+	if (debug & PRINT_VERBOSE) {
+		if (cat5->is_s2en_bindless) {
+			if (cat5->s2en_bindless.dummy1)
+				fprintf(ctx->out, "\t{5: %x}", cat5->s2en_bindless.dummy1);
+		} else if (cat5->norm.dummy1) {
+			fprintf(ctx->out, "\t{5: %x}", cat5->norm.dummy1);
+		}
+	}
+}
+
+static void print_instr_cat6_a3xx(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat6_t *cat6 = &instr->cat6;
+	char sd = 0, ss = 0;  /* dst/src address space */
+	bool nodst = false;
+	struct reginfo dst, src1, src2;
+	int src1off = 0, dstoff = 0;
+	/* Prints the opcode suffix and operands of a legacy-encoded cat6 instr. */
+	memset(&dst, 0, sizeof(dst));
+	memset(&src1, 0, sizeof(src1));
+	memset(&src2, 0, sizeof(src2));
+	/* which operands are full (32-bit) vs. follow the instruction's type */
+	switch (_OPC(6, cat6->opc)) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = type_size(cat6->type) == 32;
+		src2.full = type_size(cat6->type) == 32;
+		break;
+	case OPC_L2G:
+	case OPC_G2L:
+		dst.full = true;
+		src1.full = true;
+		src2.full = true;
+		break;
+	case OPC_STG:
+	case OPC_STL:
+	case OPC_STP:
+	case OPC_STLW:
+	case OPC_STIB:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = type_size(cat6->type) == 32;
+		src2.full = type_size(cat6->type) == 32;
+		break;
+	default:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = true;
+		src2.full = true;
+		break;
+	}
+	/* opcode-specific suffix */
+	switch (_OPC(6, cat6->opc)) {
+	case OPC_PREFETCH:
+		break;
+	case OPC_RESINFO:
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+		break;
+	case OPC_LDGB:
+		fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+		break;
+	case OPC_STGB:
+	case OPC_STIB:
+		fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
+		break;
+	case OPC_ATOMIC_ADD:
+	case OPC_ATOMIC_SUB:
+	case OPC_ATOMIC_XCHG:
+	case OPC_ATOMIC_INC:
+	case OPC_ATOMIC_DEC:
+	case OPC_ATOMIC_CMPXCHG:
+	case OPC_ATOMIC_MIN:
+	case OPC_ATOMIC_MAX:
+	case OPC_ATOMIC_AND:
+	case OPC_ATOMIC_OR:
+	case OPC_ATOMIC_XOR:
+		ss = cat6->g ? 'g' : 'l';
+		fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+		fprintf(ctx->out, ".%c", ss);
+		break;
+	default:
+		dst.im = cat6->g && !cat6->dst_off;
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		break;
+	}
+	fprintf(ctx->out, " ");
+	/* address-space letters for dst (sd) and src (ss); 0 means none is printed */
+	switch (_OPC(6, cat6->opc)) {
+	case OPC_STG:
+		sd = 'g';
+		break;
+	case OPC_STP:
+		sd = 'p';
+		break;
+	case OPC_STL:
+	case OPC_STLW:
+		sd = 'l';
+		break;
+
+	case OPC_LDG:
+	case OPC_LDC:
+		ss = 'g';
+		break;
+	case OPC_LDP:
+		ss = 'p';
+		break;
+	case OPC_LDL:
+	case OPC_LDLW:
+	case OPC_LDLV:
+		ss = 'l';
+		break;
+
+	case OPC_L2G:
+		ss = 'l';
+		sd = 'g';
+		break;
+
+	case OPC_G2L:
+		ss = 'g';
+		sd = 'l';
+		break;
+
+	case OPC_PREFETCH:
+		ss = 'g';
+		nodst = true;
+		break;
+	}
+	/* stgb/stib have their own layout: "g[ssbo], src1, src2, src3" */
+	if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) {
+		struct reginfo src3;
+
+		memset(&src3, 0, sizeof(src3));
+
+		src1.reg = (reg_t)(cat6->stgb.src1);
+		src2.reg = (reg_t)(cat6->stgb.src2);
+		src2.im  = cat6->stgb.src2_im;
+		src3.reg = (reg_t)(cat6->stgb.src3);
+		src3.im  = cat6->stgb.src3_im;
+		src3.full = true;
+
+		fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
+		print_src(ctx, &src1);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src3);
+
+		if (debug & PRINT_VERBOSE)
+			fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
+
+		return;
+	}
+
+	if (is_atomic(_OPC(6, cat6->opc))) {
+		/* atomics use the ldgb layout; ss was set from cat6->g above */
+		src1.reg = (reg_t)(cat6->ldgb.src1);
+		src1.im  = cat6->ldgb.src1_im;
+		src2.reg = (reg_t)(cat6->ldgb.src2);
+		src2.im  = cat6->ldgb.src2_im;
+		dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
+		if (ss == 'g') {
+			struct reginfo src3;
+			memset(&src3, 0, sizeof(src3));
+
+			src3.reg = (reg_t)(cat6->ldgb.src3);
+			src3.full = true;
+
+			/* For images, the ".typed" variant is used and src2 is
+			 * the ivecN coordinates, ie ivec2 for 2d.
+			 *
+			 * For SSBOs, the ".untyped" variant is used and src2 is
+			 * a simple dword offset..  src3 appears to be
+			 * uvec2(offset * 4, 0).  Not sure the point of that.
+			 */
+
+			fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+			print_src(ctx, &src1);  /* value */
+			fprintf(ctx->out, ", ");
+			print_src(ctx, &src2);  /* offset/coords */
+			fprintf(ctx->out, ", ");
+			print_src(ctx, &src3);  /* 64b byte offset.. */
+
+			if (debug & PRINT_VERBOSE) {
+				fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
+						cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+			}
+		} else { /* ss == 'l' */
+			fprintf(ctx->out, "l[");
+			print_src(ctx, &src1);  /* simple byte offset */
+			fprintf(ctx->out, "], ");
+			print_src(ctx, &src2);  /* value */
+
+			if (debug & PRINT_VERBOSE) {
+				fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
+						cat6->ldgb.src3, cat6->ldgb.pad0,
+						cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+			}
+		}
+
+		return;
+	} else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
+		dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
+		fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
+
+		return;
+	} else if (_OPC(6, cat6->opc) == OPC_LDGB) {
+
+		src1.reg = (reg_t)(cat6->ldgb.src1);
+		src1.im  = cat6->ldgb.src1_im;
+		src2.reg = (reg_t)(cat6->ldgb.src2);
+		src2.im  = cat6->ldgb.src2_im;
+		dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
+		fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+		print_src(ctx, &src1);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
+
+		if (debug & PRINT_VERBOSE)
+			fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+
+		return;
+	} else if (_OPC(6, cat6->opc) == OPC_LDG && cat6->a.src1_im && cat6->a.src2_im) {
+		struct reginfo src3;
+
+		memset(&src3, 0, sizeof(src3));
+		src1.reg = (reg_t)(cat6->a.src1);
+		src2.reg = (reg_t)(cat6->a.src2);
+		src2.im  = cat6->a.src2_im;
+		src3.reg = (reg_t)(cat6->a.off);
+		src3.full = true;
+		dst.reg  = (reg_t)(cat6->d.dst);
+
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", g[");
+		print_src(ctx, &src1);
+		fprintf(ctx->out, "+");
+		print_src(ctx, &src3);
+		fprintf(ctx->out, "], ");
+		print_src(ctx, &src2);
+
+		return;
+	}
+	if (cat6->dst_off) {
+		dst.reg = (reg_t)(cat6->c.dst);
+		dstoff  = cat6->c.off;
+	} else {
+		dst.reg = (reg_t)(cat6->d.dst);
+	}
+	/* the 'a' source layout carries an offset, the 'b' layout does not */
+	if (cat6->src_off) {
+		src1.reg = (reg_t)(cat6->a.src1);
+		src1.im  = cat6->a.src1_im;
+		src2.reg = (reg_t)(cat6->a.src2);
+		src2.im  = cat6->a.src2_im;
+		src1off  = cat6->a.off;
+	} else {
+		src1.reg = (reg_t)(cat6->b.src1);
+		src1.im  = cat6->b.src1_im;
+		src2.reg = (reg_t)(cat6->b.src2);
+		src2.im  = cat6->b.src2_im;
+	}
+
+	if (!nodst) {
+		if (sd)
+			fprintf(ctx->out, "%c[", sd);
+		/* note: dst might actually be a src (ie. address to store to) */
+		print_src(ctx, &dst);
+		if (cat6->dst_off && cat6->g) {
+			struct reginfo dstoff_reg = {0};
+			dstoff_reg.reg = (reg_t) cat6->c.off;
+			dstoff_reg.full  = true;
+			fprintf(ctx->out, "+");
+			print_src(ctx, &dstoff_reg);
+		} else if (dstoff)
+			fprintf(ctx->out, "%+d", dstoff);
+		if (sd)
+			fprintf(ctx->out, "]");
+		fprintf(ctx->out, ", ");
+	}
+
+	if (ss)
+		fprintf(ctx->out, "%c[", ss);
+
+	/* can have a larger than normal immed, so hack: */
+	if (src1.im) {
+		fprintf(ctx->out, "%u", src1.reg.dummy13);
+	} else {
+		print_src(ctx, &src1);
+	}
+
+	if (cat6->src_off && cat6->g)
+		print_src(ctx, &src2);
+	else if (src1off)
+		fprintf(ctx->out, "%+d", src1off);
+	if (ss)
+		fprintf(ctx->out, "]");
+
+	switch (_OPC(6, cat6->opc)) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		break;
+	default:
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
+		break;
+	}
+}
+
+/* Print the suffix and operands of an a6xx-encoded cat6 instruction, in the
+ * form ".<type info>.<size>.<desc mode>[.baseN] src2, src1, ssbo". */
+static void print_instr_cat6_a6xx(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx;
+	struct reginfo src1, src2, ssbo;
+	bool uses_type = _OPC(6, cat6->opc) != OPC_LDC;
+
+	static const struct {
+		bool indirect;
+		bool bindless;
+		const char *name;
+	} desc_features[8] = {
+		[CAT6_IMM] = { .name = "imm" },
+		[CAT6_UNIFORM] = {
+			.indirect = true,
+			.name = "uniform"
+		},
+		[CAT6_NONUNIFORM] = {
+			.indirect = true,
+			.name = "nonuniform"
+		},
+		[CAT6_BINDLESS_IMM] = { .bindless = true, .name = "imm" },
+		[CAT6_BINDLESS_UNIFORM] = {
+			.bindless = true,
+			.indirect = true,
+			.name = "uniform"
+		},
+		[CAT6_BINDLESS_NONUNIFORM] = {
+			.bindless = true,
+			.indirect = true,
+			.name = "nonuniform"
+		},
+	};
+
+	bool indirect_ssbo = desc_features[cat6->desc_mode].indirect;
+	bool bindless = desc_features[cat6->desc_mode].bindless;
+	bool type_full = cat6->type != TYPE_U16;
+
+	/* Only six of the eight slots are named; an unnamed slot has a NULL
+	 * name, which must not be handed to "%s" (undefined behaviour). */
+	const char *mode_name = desc_features[cat6->desc_mode].name;
+
+	memset(&src1, 0, sizeof(src1));
+	memset(&src2, 0, sizeof(src2));
+	memset(&ssbo, 0, sizeof(ssbo));
+
+	if (uses_type) {
+		fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+	} else {
+		fprintf(ctx->out, ".offset%d", cat6->d);
+	}
+	fprintf(ctx->out, ".%d", cat6->type_size + 1);
+
+	fprintf(ctx->out, ".%s", mode_name ? mode_name : "???");
+	if (bindless)
+		fprintf(ctx->out, ".base%d", cat6->base);
+	fprintf(ctx->out, " ");
+
+	src2.reg = (reg_t)(cat6->src2);
+	src2.full = type_full;
+	print_src(ctx, &src2);
+	fprintf(ctx->out, ", ");
+
+	src1.reg = (reg_t)(cat6->src1);
+	src1.full = true; // XXX
+	print_src(ctx, &src1);
+	fprintf(ctx->out, ", ");
+	ssbo.reg = (reg_t)(cat6->ssbo);
+	ssbo.im = !indirect_ssbo;
+	ssbo.full = true;
+	print_src(ctx, &ssbo);
+
+	if (debug & PRINT_VERBOSE) {
+		fprintf(ctx->out, " (pad1=%x, pad2=%x, pad3=%x, pad4=%x, pad5=%x)",
+				cat6->pad1, cat6->pad2, cat6->pad3, cat6->pad4, cat6->pad5);
+	}
+}
+
+/* Print a cat6 instr with the decoder matching its encoding (legacy vs. a6xx). */
+static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
+{
+	const bool legacy = is_cat6_legacy(instr, ctx->gpu_id);
+
+	if (legacy)
+		print_instr_cat6_a3xx(ctx, instr);
+	else
+		print_instr_cat6_a6xx(ctx, instr);
+	if (debug & PRINT_VERBOSE)
+		fprintf(ctx->out, "%s", legacy ? " LEGACY" : " NEW");
+}
+/* Print the cat7 flag suffixes: .g and .l, plus .r and .w for fence only. */
+static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat7_t *cat7 = &instr->cat7;
+	const bool is_fence = (_OPC(7, cat7->opc) == OPC_FENCE);
+
+	if (cat7->g)
+		fputs(".g", ctx->out);
+	if (cat7->l)
+		fputs(".l", ctx->out);
+	/* the r/w flags are only printed for fence */
+	if (is_fence && cat7->r)
+		fputs(".r", ctx->out);
+	if (is_fence && cat7->w)
+		fputs(".w", ctx->out);
+}
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+static const struct opc_info {
+	uint16_t cat;       /* category number, first argument of OPC() */
+	uint16_t opc;       /* the OPC_xxx value, which is also this entry's index */
+	const char *name;   /* mnemonic; NULL in slots that no OPC() line fills */
+	void (*print)(struct disasm_ctx *ctx, instr_t *instr);  /* operand printer */
+} opcs[1 << (3+NOPC_BITS)] = {
+#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
+	/* category 0: */
+	OPC(0, OPC_NOP,          nop),
+	OPC(0, OPC_B,            b),
+	OPC(0, OPC_JUMP,         jump),
+	OPC(0, OPC_CALL,         call),
+	OPC(0, OPC_RET,          ret),
+	OPC(0, OPC_KILL,         kill),
+	OPC(0, OPC_END,          end),
+	OPC(0, OPC_EMIT,         emit),
+	OPC(0, OPC_CUT,          cut),
+	OPC(0, OPC_CHMASK,       chmask),
+	OPC(0, OPC_CHSH,         chsh),
+	OPC(0, OPC_FLOW_REV,     flow_rev),
+	OPC(0, OPC_PREDT,        predt),
+	OPC(0, OPC_PREDF,        predf),
+	OPC(0, OPC_PREDE,        prede),
+	OPC(0, OPC_BKT,          bkt),
+	OPC(0, OPC_STKS,         stks),
+	OPC(0, OPC_STKR,         stkr),
+	OPC(0, OPC_XSET,         xset),
+	OPC(0, OPC_XCLR,         xclr),
+	OPC(0, OPC_GETONE,       getone),
+	OPC(0, OPC_DBG,          dbg),
+	OPC(0, OPC_SHPS,         shps),
+	OPC(0, OPC_SHPE,         shpe),
+
+	/* category 1: */
+	OPC(1, OPC_MOV, ),
+
+	/* category 2: */
+	OPC(2, OPC_ADD_F,        add.f),
+	OPC(2, OPC_MIN_F,        min.f),
+	OPC(2, OPC_MAX_F,        max.f),
+	OPC(2, OPC_MUL_F,        mul.f),
+	OPC(2, OPC_SIGN_F,       sign.f),
+	OPC(2, OPC_CMPS_F,       cmps.f),
+	OPC(2, OPC_ABSNEG_F,     absneg.f),
+	OPC(2, OPC_CMPV_F,       cmpv.f),
+	OPC(2, OPC_FLOOR_F,      floor.f),
+	OPC(2, OPC_CEIL_F,       ceil.f),
+	OPC(2, OPC_RNDNE_F,      rndne.f),
+	OPC(2, OPC_RNDAZ_F,      rndaz.f),
+	OPC(2, OPC_TRUNC_F,      trunc.f),
+	OPC(2, OPC_ADD_U,        add.u),
+	OPC(2, OPC_ADD_S,        add.s),
+	OPC(2, OPC_SUB_U,        sub.u),
+	OPC(2, OPC_SUB_S,        sub.s),
+	OPC(2, OPC_CMPS_U,       cmps.u),
+	OPC(2, OPC_CMPS_S,       cmps.s),
+	OPC(2, OPC_MIN_U,        min.u),
+	OPC(2, OPC_MIN_S,        min.s),
+	OPC(2, OPC_MAX_U,        max.u),
+	OPC(2, OPC_MAX_S,        max.s),
+	OPC(2, OPC_ABSNEG_S,     absneg.s),
+	OPC(2, OPC_AND_B,        and.b),
+	OPC(2, OPC_OR_B,         or.b),
+	OPC(2, OPC_NOT_B,        not.b),
+	OPC(2, OPC_XOR_B,        xor.b),
+	OPC(2, OPC_CMPV_U,       cmpv.u),
+	OPC(2, OPC_CMPV_S,       cmpv.s),
+	OPC(2, OPC_MUL_U24,      mul.u24),
+	OPC(2, OPC_MUL_S24,      mul.s24),
+	OPC(2, OPC_MULL_U,       mull.u),
+	OPC(2, OPC_BFREV_B,      bfrev.b),
+	OPC(2, OPC_CLZ_S,        clz.s),
+	OPC(2, OPC_CLZ_B,        clz.b),
+	OPC(2, OPC_SHL_B,        shl.b),
+	OPC(2, OPC_SHR_B,        shr.b),
+	OPC(2, OPC_ASHR_B,       ashr.b),
+	OPC(2, OPC_BARY_F,       bary.f),
+	OPC(2, OPC_MGEN_B,       mgen.b),
+	OPC(2, OPC_GETBIT_B,     getbit.b),
+	OPC(2, OPC_SETRM,        setrm),
+	OPC(2, OPC_CBITS_B,      cbits.b),
+	OPC(2, OPC_SHB,          shb),
+	OPC(2, OPC_MSAD,         msad),
+
+	/* category 3: */
+	OPC(3, OPC_MAD_U16,      mad.u16),
+	OPC(3, OPC_MADSH_U16,    madsh.u16),
+	OPC(3, OPC_MAD_S16,      mad.s16),
+	OPC(3, OPC_MADSH_M16,    madsh.m16),
+	OPC(3, OPC_MAD_U24,      mad.u24),
+	OPC(3, OPC_MAD_S24,      mad.s24),
+	OPC(3, OPC_MAD_F16,      mad.f16),
+	OPC(3, OPC_MAD_F32,      mad.f32),
+	OPC(3, OPC_SEL_B16,      sel.b16),
+	OPC(3, OPC_SEL_B32,      sel.b32),
+	OPC(3, OPC_SEL_S16,      sel.s16),
+	OPC(3, OPC_SEL_S32,      sel.s32),
+	OPC(3, OPC_SEL_F16,      sel.f16),
+	OPC(3, OPC_SEL_F32,      sel.f32),
+	OPC(3, OPC_SAD_S16,      sad.s16),
+	OPC(3, OPC_SAD_S32,      sad.s32),
+
+	/* category 4: */
+	OPC(4, OPC_RCP,          rcp),
+	OPC(4, OPC_RSQ,          rsq),
+	OPC(4, OPC_LOG2,         log2),
+	OPC(4, OPC_EXP2,         exp2),
+	OPC(4, OPC_SIN,          sin),
+	OPC(4, OPC_COS,          cos),
+	OPC(4, OPC_SQRT,         sqrt),
+	OPC(4, OPC_HRSQ,         hrsq),
+	OPC(4, OPC_HLOG2,        hlog2),
+	OPC(4, OPC_HEXP2,        hexp2),
+
+	/* category 5: */
+	OPC(5, OPC_ISAM,         isam),
+	OPC(5, OPC_ISAML,        isaml),
+	OPC(5, OPC_ISAMM,        isamm),
+	OPC(5, OPC_SAM,          sam),
+	OPC(5, OPC_SAMB,         samb),
+	OPC(5, OPC_SAML,         saml),
+	OPC(5, OPC_SAMGQ,        samgq),
+	OPC(5, OPC_GETLOD,       getlod),
+	OPC(5, OPC_CONV,         conv),
+	OPC(5, OPC_CONVM,        convm),
+	OPC(5, OPC_GETSIZE,      getsize),
+	OPC(5, OPC_GETBUF,       getbuf),
+	OPC(5, OPC_GETPOS,       getpos),
+	OPC(5, OPC_GETINFO,      getinfo),
+	OPC(5, OPC_DSX,          dsx),
+	OPC(5, OPC_DSY,          dsy),
+	OPC(5, OPC_GATHER4R,     gather4r),
+	OPC(5, OPC_GATHER4G,     gather4g),
+	OPC(5, OPC_GATHER4B,     gather4b),
+	OPC(5, OPC_GATHER4A,     gather4a),
+	OPC(5, OPC_SAMGP0,       samgp0),
+	OPC(5, OPC_SAMGP1,       samgp1),
+	OPC(5, OPC_SAMGP2,       samgp2),
+	OPC(5, OPC_SAMGP3,       samgp3),
+	OPC(5, OPC_DSXPP_1,      dsxpp.1),
+	OPC(5, OPC_DSYPP_1,      dsypp.1),
+	OPC(5, OPC_RGETPOS,      rgetpos),
+	OPC(5, OPC_RGETINFO,     rgetinfo),
+
+	/* category 6: */
+	OPC(6, OPC_LDG,          ldg),
+	OPC(6, OPC_LDL,          ldl),
+	OPC(6, OPC_LDP,          ldp),
+	OPC(6, OPC_STG,          stg),
+	OPC(6, OPC_STL,          stl),
+	OPC(6, OPC_STP,          stp),
+	OPC(6, OPC_LDIB,         ldib),
+	OPC(6, OPC_G2L,          g2l),
+	OPC(6, OPC_L2G,          l2g),
+	OPC(6, OPC_PREFETCH,     prefetch),
+	OPC(6, OPC_LDLW,         ldlw),
+	OPC(6, OPC_STLW,         stlw),
+	OPC(6, OPC_RESFMT,       resfmt),
+	OPC(6, OPC_RESINFO,      resinfo),
+	OPC(6, OPC_ATOMIC_ADD,     atomic.add),
+	OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
+	OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
+	OPC(6, OPC_ATOMIC_INC,     atomic.inc),
+	OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
+	OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
+	OPC(6, OPC_ATOMIC_MIN,     atomic.min),
+	OPC(6, OPC_ATOMIC_MAX,     atomic.max),
+	OPC(6, OPC_ATOMIC_AND,     atomic.and),
+	OPC(6, OPC_ATOMIC_OR,      atomic.or),
+	OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
+	OPC(6, OPC_LDGB,         ldgb),
+	OPC(6, OPC_STGB,         stgb),
+	OPC(6, OPC_STIB,         stib),
+	OPC(6, OPC_LDC,          ldc),
+	OPC(6, OPC_LDLV,         ldlv),
+
+	/* category 7: */
+	OPC(7, OPC_BAR,          bar),
+	OPC(7, OPC_FENCE,        fence),
+
+#undef OPC
+};
+
+/* opcs[] entry for 'instr'; the expansion uses 'ctx', which must be in scope */
+#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr, ctx->gpu_id)]))
+
+/* Print mnemonic + operands; opcodes not in opcs[] print as "unknown(cat,opc)". */
+static void print_single_instr(struct disasm_ctx *ctx, instr_t *instr)
+{
+	const struct opc_info *info = GETINFO(instr);
+
+	if (info->name) {
+		fprintf(ctx->out, "%s", info->name);
+		info->print(ctx, instr);
+		return;
+	}
+	uint32_t opc = instr_opc(instr, ctx->gpu_id);
+	fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc);
+	switch (instr->opc_cat) {
+	case 0: print_instr_cat0(ctx, instr); break;
+	case 1: print_instr_cat1(ctx, instr); break;
+	case 2: print_instr_cat2(ctx, instr); break;
+	case 3: print_instr_cat3(ctx, instr); break;
+	case 4: print_instr_cat4(ctx, instr); break;
+	case 5: print_instr_cat5(ctx, instr); break;
+	case 6: print_instr_cat6(ctx, instr); break;
+	case 7: print_instr_cat7(ctx, instr); break;
+	}
+}
+
+static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
+{
+	instr_t *instr = (instr_t *)dwords;
+	uint32_t opc = instr_opc(instr, ctx->gpu_id);
+	unsigned nop = 0;	/* extra nops implied by the src*_r bits (cat2/cat3) */
+	unsigned cycles = ctx->stats->instructions;
+	/* line prefix: level:category:instr#:cycle#[raw dwords, high one first] */
+	fprintf(ctx->out, "%s:%d:%04d:%04d[%08xx_%08xx] ", levels[ctx->level],
+			instr->opc_cat, n, cycles++, dwords[1], dwords[0]);
+
+#if 0
+	/* print unknown bits: */
+	if (debug & PRINT_RAW)
+		fprintf(ctx->out, "[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000);
+
+	if (debug & PRINT_VERBOSE)
+		fprintf(ctx->out, "%d,%02d ", instr->opc_cat, opc);
+#endif
+
+	/* NOTE: order flags are printed is a bit fugly.. but for now I
+	 * try to match the order in llvm-a3xx disassembler for easy
+	 * diff'ing..
+	 */
+
+	ctx->repeat = instr_repeat(instr);
+	ctx->stats->instructions += 1 + ctx->repeat;
+	ctx->stats->instlen++;
+
+	if (instr->sync) {
+		fprintf(ctx->out, "(sy)");
+		ctx->stats->sy++;
+	}
+	if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7))) {
+		fprintf(ctx->out, "(ss)");
+		ctx->stats->ss++;
+	}
+	if (instr->jmp_tgt)
+		fprintf(ctx->out, "(jp)");
+	if ((instr->opc_cat == 0) && instr->cat0.eq)
+		fprintf(ctx->out, "(eq)");
+	if (instr_sat(instr))
+		fprintf(ctx->out, "(sat)");
+	if (ctx->repeat)
+		fprintf(ctx->out, "(rpt%d)", ctx->repeat);
+	else if ((instr->opc_cat == 2) && (instr->cat2.src1_r || instr->cat2.src2_r))
+		nop = (instr->cat2.src2_r * 2) + instr->cat2.src1_r;
+	else if ((instr->opc_cat == 3) && (instr->cat3.src1_r || instr->cat3.src2_r))
+		nop = (instr->cat3.src2_r * 2) + instr->cat3.src1_r;
+	ctx->stats->instructions += nop;
+	ctx->stats->nops += nop;
+	if (opc == OPC_NOP)
+		ctx->stats->nops += 1 + ctx->repeat;
+	if (nop)
+		fprintf(ctx->out, "(nop%d) ", nop);
+
+	if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
+		fprintf(ctx->out, "(ul)");
+
+	print_single_instr(ctx, instr);
+	fprintf(ctx->out, "\n");
+
+	process_reg_dst(ctx);
+	/* optionally print each implied nop / repeat iteration on a line of its own */
+	if ((instr->opc_cat <= 4) && (debug & EXPAND_REPEAT)) {
+		int i;
+		for (i = 0; i < nop; i++) {
+			fprintf(ctx->out, "%s:%d:%04d:%04d[                   ] ",
+					levels[ctx->level], instr->opc_cat, n, cycles++);
+			fprintf(ctx->out, "nop\n");
+		}
+		for (i = 0; i < ctx->repeat; i++) {
+			ctx->repeatidx = i + 1;
+			fprintf(ctx->out, "%s:%d:%04d:%04d[                   ] ",
+					levels[ctx->level], instr->opc_cat, n, cycles++);
+
+			print_single_instr(ctx, instr);
+			fprintf(ctx->out, "\n");
+		}
+		ctx->repeatidx = 0;
+	}
+	/* true only for a cat0 end or chsh instruction */
+	return (instr->opc_cat == 0) &&
+		((opc == OPC_END) || (opc == OPC_CHSH));
+}
+
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id)
+{
+	struct shader_stats discarded;  /* filled in by the callee, not used here */
+	return disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &discarded);
+}
+
+/* Disassemble 'sizedwords' dwords (one instruction per dword pair) to 'out' and
+ * fill in 'stats'.  Decoding stops early once an end/chsh has been printed and
+ * more than three consecutive all-zero instructions follow.  Always returns 0.
+ */
+int disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
+		unsigned gpu_id, struct shader_stats *stats)
+{
+	struct disasm_ctx ctx;
+	int i;
+	int nop_count = 0;
+	bool has_end = false;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.out = out;
+	ctx.level = level;
+	ctx.gpu_id = gpu_id;
+	ctx.stats = stats;
+	memset(ctx.stats, 0, sizeof(*ctx.stats));
+
+	/* i + 1 < sizedwords: an odd sizedwords must not make the last
+	 * iteration read a dword past the end of the buffer. */
+	for (i = 0; i + 1 < sizedwords; i += 2) {
+		has_end |= print_instr(&ctx, &dwords[i], i/2);
+		if (!has_end)
+			continue;
+		nop_count = (dwords[i] == 0 && dwords[i + 1] == 0) ? nop_count + 1 : 0;
+		if (nop_count > 3)
+			break;
+	}
+
+	print_reg_stats(&ctx);
+	return 0;
+}
diff --git a/src/freedreno/decode/disasm.h b/src/freedreno/decode/disasm.h
new file mode 100644
index 0000000..21ae5a1
--- /dev/null
+++ b/src/freedreno/decode/disasm.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2012 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef DISASM_H_
+#define DISASM_H_
+
+#include <stdint.h>   /* uint32_t in the prototypes below */
+#include <stdio.h>
+
+enum shader_t {
+	SHADER_VERTEX,
+	SHADER_TCS,
+	SHADER_TES,
+	SHADER_GEOM,
+	SHADER_FRAGMENT,
+	SHADER_COMPUTE,
+};
+
+/* bitmask of debug flags */
+enum debug_t {
+	PRINT_RAW      = 0x1,    /* dump raw hexdump */
+	PRINT_VERBOSE  = 0x2,
+	EXPAND_REPEAT  = 0x4,
+};
+
+struct shader_stats {
+	int instructions, instlen;  /* instructions counts rptN, and instlen does not */
+	int nops;
+	int ss, sy;
+	int constlen;
+};
+
+int disasm_a2xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type);
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id);
+int disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
+		unsigned gpu_id, struct shader_stats *stats);
+void disasm_set_debug(enum debug_t debug);
+
+#endif /* DISASM_H_ */
diff --git a/src/freedreno/decode/instr-a2xx.h b/src/freedreno/decode/instr-a2xx.h
new file mode 100644
index 0000000..03d1991
--- /dev/null
+++ b/src/freedreno/decode/instr-a2xx.h
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A2XX_H_
+#define INSTR_A2XX_H_
+
+#define PACKED __attribute__((__packed__))
+
+
+/*
+ * ALU instructions:
+ */
+
+typedef enum {
+	ADDs = 0,
+	ADD_PREVs = 1,
+	MULs = 2,
+	MUL_PREVs = 3,
+	MUL_PREV2s = 4,
+	MAXs = 5,
+	MINs = 6,
+	SETEs = 7,
+	SETGTs = 8,
+	SETGTEs = 9,
+	SETNEs = 10,
+	FRACs = 11,
+	TRUNCs = 12,
+	FLOORs = 13,
+	EXP_IEEE = 14,
+	LOG_CLAMP = 15,
+	LOG_IEEE = 16,
+	RECIP_CLAMP = 17,
+	RECIP_FF = 18,
+	RECIP_IEEE = 19,
+	RECIPSQ_CLAMP = 20,
+	RECIPSQ_FF = 21,
+	RECIPSQ_IEEE = 22,
+	MOVAs = 23,
+	MOVA_FLOORs = 24,
+	SUBs = 25,
+	SUB_PREVs = 26,
+	PRED_SETEs = 27,
+	PRED_SETNEs = 28,
+	PRED_SETGTs = 29,
+	PRED_SETGTEs = 30,
+	PRED_SET_INVs = 31,
+	PRED_SET_POPs = 32,
+	PRED_SET_CLRs = 33,
+	PRED_SET_RESTOREs = 34,
+	KILLEs = 35,
+	KILLGTs = 36,
+	KILLGTEs = 37,
+	KILLNEs = 38,
+	KILLONEs = 39,
+	SQRT_IEEE = 40,
+	MUL_CONST_0 = 42,
+	MUL_CONST_1 = 43,
+	ADD_CONST_0 = 44,
+	ADD_CONST_1 = 45,
+	SUB_CONST_0 = 46,
+	SUB_CONST_1 = 47,
+	SIN = 48,
+	COS = 49,
+	RETAIN_PREV = 50,
+} instr_scalar_opc_t;
+
+typedef enum {
+	ADDv = 0,
+	MULv = 1,
+	MAXv = 2,
+	MINv = 3,
+	SETEv = 4,
+	SETGTv = 5,
+	SETGTEv = 6,
+	SETNEv = 7,
+	FRACv = 8,
+	TRUNCv = 9,
+	FLOORv = 10,
+	MULADDv = 11,
+	CNDEv = 12,
+	CNDGTEv = 13,
+	CNDGTv = 14,
+	DOT4v = 15,
+	DOT3v = 16,
+	DOT2ADDv = 17,
+	CUBEv = 18,
+	MAX4v = 19,
+	PRED_SETE_PUSHv = 20,
+	PRED_SETNE_PUSHv = 21,
+	PRED_SETGT_PUSHv = 22,
+	PRED_SETGTE_PUSHv = 23,
+	KILLEv = 24,
+	KILLGTv = 25,
+	KILLGTEv = 26,
+	KILLNEv = 27,
+	DSTv = 28,
+	MOVAv = 29,
+} instr_vector_opc_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	uint8_t             vector_dest              : 6;
+	uint8_t             vector_dest_rel          : 1;
+	uint8_t             low_precision_16b_fp     : 1;
+	uint8_t             scalar_dest              : 6;
+	uint8_t             scalar_dest_rel          : 1;
+	uint8_t             export_data              : 1;
+	uint8_t             vector_write_mask        : 4;
+	uint8_t             scalar_write_mask        : 4;
+	uint8_t             vector_clamp             : 1;
+	uint8_t             scalar_clamp             : 1;
+	instr_scalar_opc_t  scalar_opc               : 6;
+	/* dword1: */
+	uint8_t             src3_swiz                : 8;
+	uint8_t             src2_swiz                : 8;
+	uint8_t             src1_swiz                : 8;
+	uint8_t             src3_reg_negate          : 1;
+	uint8_t             src2_reg_negate          : 1;
+	uint8_t             src1_reg_negate          : 1;
+	uint8_t             pred_select              : 2;
+	uint8_t             relative_addr            : 1;
+	uint8_t             const_1_rel_abs          : 1;
+	uint8_t             const_0_rel_abs          : 1;
+	/* dword2: */
+	uint8_t             src3_reg                 : 6;
+	uint8_t             src3_reg_select          : 1;
+	uint8_t             src3_reg_abs             : 1;
+	uint8_t             src2_reg                 : 6;
+	uint8_t             src2_reg_select          : 1;
+	uint8_t             src2_reg_abs             : 1;
+	uint8_t             src1_reg                 : 6;
+	uint8_t             src1_reg_select          : 1;
+	uint8_t             src1_reg_abs             : 1;
+	instr_vector_opc_t  vector_opc               : 5;
+	uint8_t             src3_sel                 : 1;
+	uint8_t             src2_sel                 : 1;
+	uint8_t             src1_sel                 : 1;
+} instr_alu_t;
+
+
+
+/*
+ * CF instructions:
+ */
+
+typedef enum {
+	NOP = 0,
+	EXEC = 1,
+	EXEC_END = 2,
+	COND_EXEC = 3,
+	COND_EXEC_END = 4,
+	COND_PRED_EXEC = 5,
+	COND_PRED_EXEC_END = 6,
+	LOOP_START = 7,
+	LOOP_END = 8,
+	COND_CALL = 9,
+	RETURN = 10,
+	COND_JMP = 11,
+	ALLOC = 12,
+	COND_EXEC_PRED_CLEAN = 13,
+	COND_EXEC_PRED_CLEAN_END = 14,
+	MARK_VS_FETCH_DONE = 15,
+} instr_cf_opc_t;
+
+typedef enum {
+	RELATIVE_ADDR = 0,
+	ABSOLUTE_ADDR = 1,
+} instr_addr_mode_t;
+
+typedef enum {
+	SQ_NO_ALLOC = 0,
+	SQ_POSITION = 1,
+	SQ_PARAMETER_PIXEL = 2,
+	SQ_MEMORY = 3,
+} instr_alloc_type_t;
+
+typedef struct PACKED {
+	uint16_t            address                  : 9;
+	uint8_t             reserved0                : 3;
+	uint8_t             count                    : 3;
+	uint8_t             yeild                    : 1;   /* sic ("yield") */
+	uint16_t            serialize                : 12;
+	uint8_t             vc                       : 6;   /* vertex cache? */
+	uint8_t             bool_addr                : 8;
+	uint8_t             condition                : 1;
+	instr_addr_mode_t   address_mode             : 1;
+	instr_cf_opc_t      opc                      : 4;
+} instr_cf_exec_t;   /* bitfield widths sum to 48 bits */
+
+typedef struct PACKED {
+	uint16_t            address                  : 10;
+	uint8_t             reserved0                : 6;
+	uint8_t             loop_id                  : 5;
+	uint32_t            reserved1                : 22;
+	instr_addr_mode_t   address_mode             : 1;
+	instr_cf_opc_t      opc                      : 4;
+} instr_cf_loop_t;
+
+typedef struct PACKED {
+	uint16_t            address                  : 10;
+	uint8_t             reserved0                : 3;
+	uint8_t             force_call               : 1;
+	uint8_t             predicated_jmp           : 1;
+	uint32_t            reserved1                : 18;
+	uint8_t             direction                : 1;
+	uint8_t             bool_addr                : 8;
+	uint8_t             condition                : 1;
+	instr_addr_mode_t   address_mode             : 1;
+	instr_cf_opc_t      opc                      : 4;
+} instr_cf_jmp_call_t;
+
+typedef struct PACKED {
+	uint8_t             size                     : 4;
+	uint64_t            reserved0                : 36;
+	uint8_t             no_serial                : 1;
+	instr_alloc_type_t  buffer_select            : 2;
+	uint8_t             alloc_mode               : 1;
+	instr_cf_opc_t      opc                      : 4;
+} instr_cf_alloc_t;
+
+typedef union PACKED {
+	instr_cf_exec_t     exec;
+	instr_cf_loop_t     loop;
+	instr_cf_jmp_call_t jmp_call;
+	instr_cf_alloc_t    alloc;
+	struct PACKED {   /* common view: 44 variant-specific bits, then opc (the last field of every variant) */
+		uint64_t        dummy                    : 44;
+		instr_cf_opc_t  opc                      : 4;
+	};
+} instr_cf_t;
+
+
+
+/*
+ * FETCH instructions:
+ */
+
+typedef enum {
+	VTX_FETCH = 0,
+	TEX_FETCH = 1,
+	TEX_GET_BORDER_COLOR_FRAC = 16,
+	TEX_GET_COMP_TEX_LOD = 17,
+	TEX_GET_GRADIENTS = 18,
+	TEX_GET_WEIGHTS = 19,
+	TEX_SET_TEX_LOD = 24,
+	TEX_SET_GRADIENTS_H = 25,
+	TEX_SET_GRADIENTS_V = 26,
+	TEX_RESERVED_4 = 27,
+} instr_fetch_opc_t;
+
+typedef enum {
+	TEX_FILTER_POINT = 0,
+	TEX_FILTER_LINEAR = 1,
+	TEX_FILTER_BASEMAP = 2,            /* only applicable for mip-filter */
+	TEX_FILTER_USE_FETCH_CONST = 3,
+} instr_tex_filter_t;
+
+typedef enum {
+	ANISO_FILTER_DISABLED = 0,
+	ANISO_FILTER_MAX_1_1 = 1,
+	ANISO_FILTER_MAX_2_1 = 2,
+	ANISO_FILTER_MAX_4_1 = 3,
+	ANISO_FILTER_MAX_8_1 = 4,
+	ANISO_FILTER_MAX_16_1 = 5,
+	ANISO_FILTER_USE_FETCH_CONST = 7,
+} instr_aniso_filter_t;
+
+typedef enum {
+	ARBITRARY_FILTER_2X4_SYM = 0,
+	ARBITRARY_FILTER_2X4_ASYM = 1,
+	ARBITRARY_FILTER_4X2_SYM = 2,
+	ARBITRARY_FILTER_4X2_ASYM = 3,
+	ARBITRARY_FILTER_4X4_SYM = 4,
+	ARBITRARY_FILTER_4X4_ASYM = 5,
+	ARBITRARY_FILTER_USE_FETCH_CONST = 7,
+} instr_arbitrary_filter_t;
+
+typedef enum {
+	SAMPLE_CENTROID = 0,
+	SAMPLE_CENTER = 1,
+} instr_sample_loc_t;
+
+typedef unsigned instr_surf_fmt_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	instr_fetch_opc_t   opc                      : 5;
+	uint8_t             src_reg                  : 6;
+	uint8_t             src_reg_am               : 1;
+	uint8_t             dst_reg                  : 6;
+	uint8_t             dst_reg_am               : 1;
+	uint8_t             fetch_valid_only         : 1;
+	uint8_t             const_idx                : 5;
+	uint8_t             tx_coord_denorm          : 1;
+	uint8_t             src_swiz                 : 6;
+	/* dword1: */
+	uint16_t            dst_swiz                 : 12;
+	instr_tex_filter_t  mag_filter               : 2;
+	instr_tex_filter_t  min_filter               : 2;
+	instr_tex_filter_t  mip_filter               : 2;
+	instr_aniso_filter_t aniso_filter            : 3;
+	instr_arbitrary_filter_t arbitrary_filter    : 3;
+	instr_tex_filter_t  vol_mag_filter           : 2;
+	instr_tex_filter_t  vol_min_filter           : 2;
+	uint8_t             use_comp_lod             : 1;
+	uint8_t             use_reg_lod              : 2;
+	uint8_t             pred_select              : 1;
+	/* dword2: */
+	uint8_t             use_reg_gradients        : 1;
+	instr_sample_loc_t  sample_location          : 1;
+	uint8_t             lod_bias                 : 7;
+	uint8_t             unused                   : 7;
+	uint8_t             offset_x                 : 5;
+	uint8_t             offset_y                 : 5;
+	uint8_t             offset_z                 : 5;
+	uint8_t             pred_condition           : 1;
+} instr_fetch_tex_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	instr_fetch_opc_t   opc                      : 5;
+	uint8_t             src_reg                  : 6;
+	uint8_t             src_reg_am               : 1;
+	uint8_t             dst_reg                  : 6;
+	uint8_t             dst_reg_am               : 1;
+	uint8_t             must_be_one              : 1;
+	uint8_t             const_index              : 5;
+	uint8_t             const_index_sel          : 2;
+	uint8_t             reserved0                : 3;
+	uint8_t             src_swiz                 : 2;
+	/* dword1: */
+	uint16_t            dst_swiz                 : 12;
+	uint8_t             format_comp_all          : 1;   /* '1' for signed, '0' for unsigned? */
+	uint8_t             num_format_all           : 1;   /* '0' for normalized, '1' for unnormalized */
+	uint8_t             signed_rf_mode_all       : 1;
+	uint8_t             reserved1                : 1;
+	instr_surf_fmt_t    format                   : 6;
+	uint8_t             reserved2                : 1;
+	uint8_t             exp_adjust_all           : 7;
+	uint8_t             reserved3                : 1;
+	uint8_t             pred_select              : 1;
+	/* dword2: */
+	uint8_t             stride                   : 8;
+	/* possibly offset and reserved4 are swapped on a200? */
+	uint8_t             offset                   : 8;
+	uint8_t             reserved4                : 8;
+	uint8_t             reserved5                : 7;
+	uint8_t             pred_condition           : 1;
+} instr_fetch_vtx_t;
+
+typedef union PACKED {
+	instr_fetch_tex_t   tex;
+	instr_fetch_vtx_t   vtx;
+	struct PACKED {
+		/* dword0: */
+		instr_fetch_opc_t opc                    : 5;
+		uint32_t        dummy0                   : 27;
+		/* dword1: */
+		uint32_t        dummy1                   : 32;
+		/* dword2: */
+		uint32_t        dummy2                   : 32;
+	};
+} instr_fetch_t;
+
+#endif /* INSTR_A2XX_H_ */
diff --git a/src/freedreno/decode/instr-a3xx.h b/src/freedreno/decode/instr-a3xx.h
new file mode 100644
index 0000000..218bdc3
--- /dev/null
+++ b/src/freedreno/decode/instr-a3xx.h
@@ -0,0 +1,1115 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A3XX_H_
+#define INSTR_A3XX_H_
+
+#define PACKED __attribute__((__packed__))
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <assert.h>
+
+void ir3_assert_handler(const char *expr, const char *file, int line,
+		const char *func) __attribute__((weak)) __attribute__ ((__noreturn__));
+
+/* A wrapper for assert() that allows overriding handling of a failed
+ * assert.  This is needed for tools like crashdec, which may attempt
+ * to disassemble memory that does not actually contain valid
+ * instructions.
+ */
+#define ir3_assert(expr) do { \
+		if (!(expr)) { \
+			if (ir3_assert_handler) { \
+				ir3_assert_handler(#expr, __FILE__, __LINE__, __func__); \
+			} \
+			assert(expr); \
+		} \
+	} while (0)
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+/* pack (category, opcode within category) into a single opc_t value: */
+#define _OPC(cat, opc)   (((cat) << NOPC_BITS) | (opc))
+
+typedef enum {
+	/* category 0: */
+	OPC_NOP             = _OPC(0, 0),
+	OPC_B               = _OPC(0, 1),
+	OPC_JUMP            = _OPC(0, 2),
+	OPC_CALL            = _OPC(0, 3),
+	OPC_RET             = _OPC(0, 4),
+	OPC_KILL            = _OPC(0, 5),
+	OPC_END             = _OPC(0, 6),
+	OPC_EMIT            = _OPC(0, 7),
+	OPC_CUT             = _OPC(0, 8),
+	OPC_CHMASK          = _OPC(0, 9),
+	OPC_CHSH            = _OPC(0, 10),
+	OPC_FLOW_REV        = _OPC(0, 11),
+
+	OPC_BKT             = _OPC(0, 16),
+	OPC_STKS            = _OPC(0, 17),
+	OPC_STKR            = _OPC(0, 18),
+	OPC_XSET            = _OPC(0, 19),
+	OPC_XCLR            = _OPC(0, 20),
+	OPC_GETONE          = _OPC(0, 21),
+	OPC_DBG             = _OPC(0, 22),
+	OPC_SHPS            = _OPC(0, 23),   /* shader prologue start */
+	OPC_SHPE            = _OPC(0, 24),   /* shader prologue end */
+
+	OPC_PREDT           = _OPC(0, 29),   /* predicated true */
+	OPC_PREDF           = _OPC(0, 30),   /* predicated false */
+	OPC_PREDE           = _OPC(0, 31),   /* predicated end */
+
+	/* category 1: */
+	OPC_MOV             = _OPC(1, 0),
+
+	/* category 2: */
+	OPC_ADD_F           = _OPC(2, 0),
+	OPC_MIN_F           = _OPC(2, 1),
+	OPC_MAX_F           = _OPC(2, 2),
+	OPC_MUL_F           = _OPC(2, 3),
+	OPC_SIGN_F          = _OPC(2, 4),
+	OPC_CMPS_F          = _OPC(2, 5),
+	OPC_ABSNEG_F        = _OPC(2, 6),
+	OPC_CMPV_F          = _OPC(2, 7),
+	/* 8 - invalid */
+	OPC_FLOOR_F         = _OPC(2, 9),
+	OPC_CEIL_F          = _OPC(2, 10),
+	OPC_RNDNE_F         = _OPC(2, 11),
+	OPC_RNDAZ_F         = _OPC(2, 12),
+	OPC_TRUNC_F         = _OPC(2, 13),
+	/* 14-15 - invalid */
+	OPC_ADD_U           = _OPC(2, 16),
+	OPC_ADD_S           = _OPC(2, 17),
+	OPC_SUB_U           = _OPC(2, 18),
+	OPC_SUB_S           = _OPC(2, 19),
+	OPC_CMPS_U          = _OPC(2, 20),
+	OPC_CMPS_S          = _OPC(2, 21),
+	OPC_MIN_U           = _OPC(2, 22),
+	OPC_MIN_S           = _OPC(2, 23),
+	OPC_MAX_U           = _OPC(2, 24),
+	OPC_MAX_S           = _OPC(2, 25),
+	OPC_ABSNEG_S        = _OPC(2, 26),
+	/* 27 - invalid */
+	OPC_AND_B           = _OPC(2, 28),
+	OPC_OR_B            = _OPC(2, 29),
+	OPC_NOT_B           = _OPC(2, 30),
+	OPC_XOR_B           = _OPC(2, 31),
+	/* 32 - invalid */
+	OPC_CMPV_U          = _OPC(2, 33),
+	OPC_CMPV_S          = _OPC(2, 34),
+	/* 35-47 - invalid */
+	OPC_MUL_U24         = _OPC(2, 48), /* 24b mul into 32b result */
+	OPC_MUL_S24         = _OPC(2, 49), /* 24b mul into 32b result with sign extension */
+	OPC_MULL_U          = _OPC(2, 50),
+	OPC_BFREV_B         = _OPC(2, 51),
+	OPC_CLZ_S           = _OPC(2, 52),
+	OPC_CLZ_B           = _OPC(2, 53),
+	OPC_SHL_B           = _OPC(2, 54),
+	OPC_SHR_B           = _OPC(2, 55),
+	OPC_ASHR_B          = _OPC(2, 56),
+	OPC_BARY_F          = _OPC(2, 57),
+	OPC_MGEN_B          = _OPC(2, 58),
+	OPC_GETBIT_B        = _OPC(2, 59),
+	OPC_SETRM           = _OPC(2, 60),
+	OPC_CBITS_B         = _OPC(2, 61),
+	OPC_SHB             = _OPC(2, 62),
+	OPC_MSAD            = _OPC(2, 63),
+
+	/* category 3: */
+	OPC_MAD_U16         = _OPC(3, 0),
+	OPC_MADSH_U16       = _OPC(3, 1),
+	OPC_MAD_S16         = _OPC(3, 2),
+	OPC_MADSH_M16       = _OPC(3, 3),   /* should this be .s16? */
+	OPC_MAD_U24         = _OPC(3, 4),
+	OPC_MAD_S24         = _OPC(3, 5),
+	OPC_MAD_F16         = _OPC(3, 6),
+	OPC_MAD_F32         = _OPC(3, 7),
+	OPC_SEL_B16         = _OPC(3, 8),
+	OPC_SEL_B32         = _OPC(3, 9),
+	OPC_SEL_S16         = _OPC(3, 10),
+	OPC_SEL_S32         = _OPC(3, 11),
+	OPC_SEL_F16         = _OPC(3, 12),
+	OPC_SEL_F32         = _OPC(3, 13),
+	OPC_SAD_S16         = _OPC(3, 14),
+	OPC_SAD_S32         = _OPC(3, 15),
+
+	/* category 4: */
+	OPC_RCP             = _OPC(4, 0),
+	OPC_RSQ             = _OPC(4, 1),
+	OPC_LOG2            = _OPC(4, 2),
+	OPC_EXP2            = _OPC(4, 3),
+	OPC_SIN             = _OPC(4, 4),
+	OPC_COS             = _OPC(4, 5),
+	OPC_SQRT            = _OPC(4, 6),
+	/* NOTE that these are 8+opc from their highp equivs, so it's possible
+	 * that the high order bit in the opc field has been repurposed for
+	 * half-precision use?  But note that other ops (rcp/sin/cos/sqrt)
+	 * still use the same opc as highp
+	 */
+	OPC_HRSQ            = _OPC(4, 9),
+	OPC_HLOG2           = _OPC(4, 10),
+	OPC_HEXP2           = _OPC(4, 11),
+
+	/* category 5: */
+	OPC_ISAM            = _OPC(5, 0),
+	OPC_ISAML           = _OPC(5, 1),
+	OPC_ISAMM           = _OPC(5, 2),
+	OPC_SAM             = _OPC(5, 3),
+	OPC_SAMB            = _OPC(5, 4),
+	OPC_SAML            = _OPC(5, 5),
+	OPC_SAMGQ           = _OPC(5, 6),
+	OPC_GETLOD          = _OPC(5, 7),
+	OPC_CONV            = _OPC(5, 8),
+	OPC_CONVM           = _OPC(5, 9),
+	OPC_GETSIZE         = _OPC(5, 10),
+	OPC_GETBUF          = _OPC(5, 11),
+	OPC_GETPOS          = _OPC(5, 12),
+	OPC_GETINFO         = _OPC(5, 13),
+	OPC_DSX             = _OPC(5, 14),
+	OPC_DSY             = _OPC(5, 15),
+	OPC_GATHER4R        = _OPC(5, 16),
+	OPC_GATHER4G        = _OPC(5, 17),
+	OPC_GATHER4B        = _OPC(5, 18),
+	OPC_GATHER4A        = _OPC(5, 19),
+	OPC_SAMGP0          = _OPC(5, 20),
+	OPC_SAMGP1          = _OPC(5, 21),
+	OPC_SAMGP2          = _OPC(5, 22),
+	OPC_SAMGP3          = _OPC(5, 23),
+	OPC_DSXPP_1         = _OPC(5, 24),
+	OPC_DSYPP_1         = _OPC(5, 25),
+	OPC_RGETPOS         = _OPC(5, 26),
+	OPC_RGETINFO        = _OPC(5, 27),
+
+	/* category 6: */
+	OPC_LDG             = _OPC(6, 0),        /* load-global */
+	OPC_LDL             = _OPC(6, 1),
+	OPC_LDP             = _OPC(6, 2),
+	OPC_STG             = _OPC(6, 3),        /* store-global */
+	OPC_STL             = _OPC(6, 4),
+	OPC_STP             = _OPC(6, 5),
+	OPC_LDIB            = _OPC(6, 6),
+	OPC_G2L             = _OPC(6, 7),
+	OPC_L2G             = _OPC(6, 8),
+	OPC_PREFETCH        = _OPC(6, 9),
+	OPC_LDLW            = _OPC(6, 10),
+	OPC_STLW            = _OPC(6, 11),
+	OPC_RESFMT          = _OPC(6, 14),
+	OPC_RESINFO         = _OPC(6, 15),
+	OPC_ATOMIC_ADD      = _OPC(6, 16),
+	OPC_ATOMIC_SUB      = _OPC(6, 17),
+	OPC_ATOMIC_XCHG     = _OPC(6, 18),
+	OPC_ATOMIC_INC      = _OPC(6, 19),
+	OPC_ATOMIC_DEC      = _OPC(6, 20),
+	OPC_ATOMIC_CMPXCHG  = _OPC(6, 21),
+	OPC_ATOMIC_MIN      = _OPC(6, 22),
+	OPC_ATOMIC_MAX      = _OPC(6, 23),
+	OPC_ATOMIC_AND      = _OPC(6, 24),
+	OPC_ATOMIC_OR       = _OPC(6, 25),
+	OPC_ATOMIC_XOR      = _OPC(6, 26),
+	OPC_LDGB            = _OPC(6, 27),
+	OPC_STGB            = _OPC(6, 28),
+	OPC_STIB            = _OPC(6, 29),
+	OPC_LDC             = _OPC(6, 30),
+	OPC_LDLV            = _OPC(6, 31),
+
+	/* category 7: */
+	OPC_BAR             = _OPC(7, 0),
+	OPC_FENCE           = _OPC(7, 1),
+} opc_t;
+
+#define opc_cat(opc) ((int)((opc) >> NOPC_BITS))
+#define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))
+
+typedef enum {
+	TYPE_F16 = 0,
+	TYPE_F32 = 1,
+	TYPE_U16 = 2,
+	TYPE_U32 = 3,
+	TYPE_S16 = 4,
+	TYPE_S32 = 5,
+	TYPE_U8  = 6,
+	TYPE_S8  = 7,  // XXX I assume?
+} type_t;
+
+/* Width in bits (32, 16 or 8) of a value of the given type.  Any other
+ * type value is reported through ir3_assert(); if execution continues
+ * past that, 0 is returned.
+ */
+static inline uint32_t type_size(type_t type)
+{
+	if ((type == TYPE_F32) || (type == TYPE_U32) || (type == TYPE_S32)) {
+		return 32;
+	}
+	if ((type == TYPE_F16) || (type == TYPE_U16) || (type == TYPE_S16)) {
+		return 16;
+	}
+	if ((type == TYPE_U8) || (type == TYPE_S8)) {
+		return 8;
+	}
+
+	ir3_assert(0); /* invalid type */
+	return 0;
+}
+
+static inline int type_float(type_t type)
+{
+	return (type == TYPE_F32) || (type == TYPE_F16);
+}
+
+static inline int type_uint(type_t type)
+{
+	return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
+}
+
+static inline int type_sint(type_t type)
+{
+	return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
+}
+
+typedef union PACKED {
+	/* normal gpr or const src register: */
+	struct PACKED {
+		uint32_t comp  : 2;
+		uint32_t num   : 10;
+	};
+	/* for immediate val: */
+	int32_t  iim_val   : 11;
+	/* to make compiler happy: */
+	uint32_t dummy32;
+	uint32_t dummy10   : 10;
+	int32_t  idummy10  : 10;
+	uint32_t dummy11   : 11;
+	uint32_t dummy12   : 12;
+	uint32_t dummy13   : 13;
+	uint32_t dummy8    : 8;
+	int32_t  idummy13  : 13;
+	int32_t  idummy8   : 8;
+} reg_t;
+
+/* special registers: */
+#define REG_A0 61       /* address register */
+#define REG_P0 62       /* predicate register */
+
+static inline int reg_special(reg_t reg)
+{
+	return (reg.num == REG_A0) || (reg.num == REG_P0);
+}
+
+typedef enum {
+	BRANCH_PLAIN = 0,   /* br */
+	BRANCH_OR    = 1,   /* brao */
+	BRANCH_AND   = 2,   /* braa */
+	BRANCH_CONST = 3,   /* brac */
+	BRANCH_ANY   = 4,   /* bany */
+	BRANCH_ALL   = 5,   /* ball */
+	BRANCH_X     = 6,   /* brax ??? */
+} brtype_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		struct PACKED {
+			int16_t  immed    : 16;
+			uint32_t dummy1   : 16;
+		} a3xx;
+		struct PACKED {
+			int32_t  immed    : 20;
+			uint32_t dummy1   : 12;
+		} a4xx;
+		struct PACKED {
+			int32_t immed     : 32;
+		} a5xx;
+	};
+
+	/* dword1: */
+	uint32_t idx      : 5;  /* brac.N index */
+	uint32_t brtype   : 3;  /* branch type, see brtype_t */
+	uint32_t repeat   : 3;
+	uint32_t dummy3   : 1;
+	uint32_t ss       : 1;
+	uint32_t inv1     : 1;
+	uint32_t comp1    : 2;
+	uint32_t eq       : 1;
+	uint32_t opc_hi   : 1;  /* at least one bit */
+	uint32_t dummy4   : 2;
+	uint32_t inv0     : 1;
+	uint32_t comp0    : 2;  /* component for first src */
+	uint32_t opc      : 4;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat0_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		/* for normal src register: */
+		struct PACKED {
+			uint32_t src : 11;
+			/* at least the low bit of pad must be zero, or it will
+			 * look like an address-relative src
+			 */
+			uint32_t pad : 21;
+		};
+		/* for address relative: */
+		struct PACKED {
+			int32_t  off : 10;
+			uint32_t src_rel_c : 1;
+			uint32_t src_rel : 1;
+			uint32_t unknown : 20;
+		};
+		/* for immediate: */
+		int32_t  iim_val;
+		uint32_t uim_val;
+		float    fim_val;
+	};
+
+	/* dword1: */
+	uint32_t dst        : 8;
+	uint32_t repeat     : 3;
+	uint32_t src_r      : 1;
+	uint32_t ss         : 1;
+	uint32_t ul         : 1;
+	uint32_t dst_type   : 3;
+	uint32_t dst_rel    : 1;
+	uint32_t src_type   : 3;
+	uint32_t src_c      : 1;
+	uint32_t src_im     : 1;
+	uint32_t even       : 1;
+	uint32_t pos_inf    : 1;
+	uint32_t must_be_0  : 2;
+	uint32_t jmp_tgt    : 1;
+	uint32_t sync       : 1;
+	uint32_t opc_cat    : 3;
+} instr_cat1_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		struct PACKED {
+			uint32_t src1         : 11;
+			uint32_t must_be_zero1: 2;
+			uint32_t src1_im      : 1;   /* immediate */
+			uint32_t src1_neg     : 1;   /* negate */
+			uint32_t src1_abs     : 1;   /* absolute value */
+		};
+		struct PACKED {
+			uint32_t src1         : 10;
+			uint32_t src1_c       : 1;   /* relative-const */
+			uint32_t src1_rel     : 1;   /* relative address */
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel1;
+		struct PACKED {
+			uint32_t src1         : 12;
+			uint32_t src1_c       : 1;   /* const */
+			uint32_t dummy        : 3;
+		} c1;
+	};
+
+	union PACKED {
+		struct PACKED {
+			uint32_t src2         : 11;
+			uint32_t must_be_zero2: 2;
+			uint32_t src2_im      : 1;   /* immediate */
+			uint32_t src2_neg     : 1;   /* negate */
+			uint32_t src2_abs     : 1;   /* absolute value */
+		};
+		struct PACKED {
+			uint32_t src2         : 10;
+			uint32_t src2_c       : 1;   /* relative-const */
+			uint32_t src2_rel     : 1;   /* relative address */
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel2;
+		struct PACKED {
+			uint32_t src2         : 12;
+			uint32_t src2_c       : 1;   /* const */
+			uint32_t dummy        : 3;
+		} c2;
+	};
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t repeat   : 2;
+	uint32_t sat      : 1;
+	uint32_t src1_r   : 1;   /* doubles as nop0 if repeat==0 */
+	uint32_t ss       : 1;
+	uint32_t ul       : 1;   /* dunno */
+	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+	uint32_t ei       : 1;
+	uint32_t cond     : 3;
+	uint32_t src2_r   : 1;   /* doubles as nop1 if repeat==0 */
+	uint32_t full     : 1;   /* not half */
+	uint32_t opc      : 6;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat2_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		struct PACKED {
+			uint32_t src1         : 11;
+			uint32_t must_be_zero1: 2;
+			uint32_t src2_c       : 1;
+			uint32_t src1_neg     : 1;
+			uint32_t src2_r       : 1;  /* doubles as nop1 if repeat==0 */
+		};
+		struct PACKED {
+			uint32_t src1         : 10;
+			uint32_t src1_c       : 1;
+			uint32_t src1_rel     : 1;
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel1;
+		struct PACKED {
+			uint32_t src1         : 12;
+			uint32_t src1_c       : 1;
+			uint32_t dummy        : 3;
+		} c1;
+	};
+
+	union PACKED {
+		struct PACKED {
+			uint32_t src3         : 11;
+			uint32_t must_be_zero2: 2;
+			uint32_t src3_r       : 1;
+			uint32_t src2_neg     : 1;
+			uint32_t src3_neg     : 1;
+		};
+		struct PACKED {
+			uint32_t src3         : 10;
+			uint32_t src3_c       : 1;
+			uint32_t src3_rel     : 1;
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel2;
+		struct PACKED {
+			uint32_t src3         : 12;
+			uint32_t src3_c       : 1;
+			uint32_t dummy        : 3;
+		} c2;
+	};
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t repeat   : 2;
+	uint32_t sat      : 1;
+	uint32_t src1_r   : 1;   /* doubles as nop0 if repeat==0 */
+	uint32_t ss       : 1;
+	uint32_t ul       : 1;
+	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+	uint32_t src2     : 8;
+	uint32_t opc      : 4;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat3_t;
+
+/* Whether a cat3 instruction is "full" (presumably: not half precision, like
+ * the 'full' bit other categories encode explicitly).  False for the *16
+ * opcodes below plus SAD_S32 (questioned by the original author).
+ */
+static inline bool instr_cat3_full(const instr_cat3_t *cat3)
+{
+	switch (_OPC(3, cat3->opc)) {
+	case OPC_MAD_F16: case OPC_MAD_U16: case OPC_MAD_S16:
+	case OPC_SEL_B16: case OPC_SEL_S16: case OPC_SEL_F16:
+	case OPC_SAD_S16:
+	case OPC_SAD_S32:  // really??
+		return false;
+	default:
+		return true;
+	}
+}
+
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		struct PACKED {
+			uint32_t src          : 11;
+			uint32_t must_be_zero1: 2;
+			uint32_t src_im       : 1;   /* immediate */
+			uint32_t src_neg      : 1;   /* negate */
+			uint32_t src_abs      : 1;   /* absolute value */
+		};
+		struct PACKED {
+			uint32_t src          : 10;
+			uint32_t src_c        : 1;   /* relative-const */
+			uint32_t src_rel      : 1;   /* relative address */
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel;
+		struct PACKED {
+			uint32_t src          : 12;
+			uint32_t src_c        : 1;   /* const */
+			uint32_t dummy        : 3;
+		} c;
+	};
+	uint32_t dummy1   : 16;  /* seem to be ignored */
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t repeat   : 2;
+	uint32_t sat      : 1;
+	uint32_t src_r    : 1;
+	uint32_t ss       : 1;
+	uint32_t ul       : 1;
+	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+	uint32_t dummy2   : 5;   /* seem to be ignored */
+	uint32_t full     : 1;   /* not half */
+	uint32_t opc      : 6;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat4_t;
+
+/* With is_s2en_bindless = 1, this determines whether bindless is enabled and
+ * if so, how to get the (base, index) pair for both sampler and texture.
+ * There is a single base embedded in the instruction, which is always used
+ * for the texture.
+ */
+typedef enum {
+	/* Use traditional GL binding model, get texture and sampler index
+	 * from src3 which is not presumed to be uniform. This is
+	 * backwards-compatible with earlier generations, where this field was
+	 * always 0 and nonuniform-indexed sampling always worked.
+	 */
+	CAT5_NONUNIFORM = 0,
+
+	/* The sampler base comes from the low 3 bits of a1.x, and the sampler
+	 * and texture index come from src3 which is presumed to be uniform.
+	 */
+	CAT5_BINDLESS_A1_UNIFORM = 1,
+
+	/* The texture and sampler share the same base, and the sampler and
+	 * texture index come from src3 which is *not* presumed to be uniform.
+	 */
+	CAT5_BINDLESS_NONUNIFORM = 2,
+
+	/* The sampler base comes from the low 3 bits of a1.x, and the sampler
+	 * and texture index come from src3 which is *not* presumed to be
+	 * uniform.
+	 */
+	CAT5_BINDLESS_A1_NONUNIFORM = 3,
+
+	/* Use traditional GL binding model, get texture and sampler index
+	 * from src3 which is presumed to be uniform.
+	 */
+	CAT5_UNIFORM = 4,
+
+	/* The texture and sampler share the same base, and the sampler and
+	 * texture index come from src3 which is presumed to be uniform.
+	 */
+	CAT5_BINDLESS_UNIFORM = 5,
+
+	/* The texture and sampler share the same base, get sampler index from low
+	 * 4 bits of src3 and texture index from high 4 bits.
+	 */
+	CAT5_BINDLESS_IMM = 6,
+
+	/* The sampler base comes from the low 3 bits of a1.x, and the texture
+	 * index comes from the next 8 bits of a1.x. The sampler index is an
+	 * immediate in src3.
+	 */
+	CAT5_BINDLESS_A1_IMM = 7,
+} cat5_desc_mode_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		/* normal case: */
+		struct PACKED {
+			uint32_t full     : 1;   /* not half */
+			uint32_t src1     : 8;
+			uint32_t src2     : 8;
+			uint32_t dummy1   : 4;   /* seem to be ignored */
+			uint32_t samp     : 4;
+			uint32_t tex      : 7;
+		} norm;
+		/* s2en case: */
+		struct PACKED {
+			uint32_t full         : 1;   /* not half */
+			uint32_t src1         : 8;
+			uint32_t src2         : 8;
+			uint32_t dummy1       : 2;
+			uint32_t base_hi      : 2;   /* presumably pairs with base_lo in dword1 -- TODO confirm */
+			uint32_t src3         : 8;
+			uint32_t desc_mode    : 3;   /* presumably a cat5_desc_mode_t value (0..7) */
+		} s2en_bindless;
+		/* same in either case: */
+		// XXX I think, confirm this
+		struct PACKED {
+			uint32_t full     : 1;   /* not half */
+			uint32_t src1     : 8;
+			uint32_t src2     : 8;
+			uint32_t pad      : 15;  /* the bits that differ between norm and s2en_bindless */
+		};
+	};
+
+	/* dword1: */
+	uint32_t dst              : 8;
+	uint32_t wrmask           : 4;   /* write-mask */
+	uint32_t type             : 3;
+	uint32_t base_lo          : 1;   /* used with bindless */
+	uint32_t is_3d            : 1;
+
+	uint32_t is_a             : 1;
+	uint32_t is_s             : 1;
+	uint32_t is_s2en_bindless : 1;   /* presumably selects the s2en_bindless view of dword0 */
+	uint32_t is_o             : 1;
+	uint32_t is_p             : 1;
+
+	uint32_t opc              : 5;
+	uint32_t jmp_tgt          : 1;
+	uint32_t sync             : 1;
+	uint32_t opc_cat          : 3;
+} instr_cat5_t;
+
+/* dword0 encoding for src_off: [src1 + off], src2: */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t mustbe1  : 1;   /* declared in the same position as src_off in instr_cat6_t */
+	int32_t  off      : 13;  /* signed bit-field */
+	uint32_t src1     : 8;
+	uint32_t src1_im  : 1;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t dword1;         /* not decoded by this view; see instr_cat6c_t / instr_cat6d_t */
+} instr_cat6a_t;
+
+/* dword0 encoding for !src_off: [src1], src2 */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t mustbe0  : 1;   /* declared in the same position as src_off in instr_cat6_t */
+	uint32_t src1     : 13;
+	uint32_t ignore0  : 8;
+	uint32_t src1_im  : 1;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t dword1;         /* not decoded by this view; see instr_cat6c_t / instr_cat6d_t */
+} instr_cat6b_t;
+
+/* dword1 encoding for dst_off: */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t dword0;         /* not decoded by this view; see instr_cat6a_t / instr_cat6b_t */
+
+	/* note: there is some weird stuff going on where sometimes
+	 * cat6->a.off is involved.. but that seems like a bug in
+	 * the blob, since it is used even if !cat6->src_off
+	 * It would make sense for there to be some more bits to
+	 * bring us to 11 bits worth of offset, but not sure..
+	 */
+	int32_t off       : 8;   /* signed bit-field */
+	uint32_t mustbe1  : 1;   /* declared in the same position as dst_off in instr_cat6_t */
+	uint32_t dst      : 8;
+	uint32_t pad1     : 15;  /* type/g/pad4/opc/jmp_tgt/sync/opc_cat in instr_cat6_t */
+} instr_cat6c_t;
+
+/* dword1 encoding for !dst_off: */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t dword0;         /* not decoded by this view; see instr_cat6a_t / instr_cat6b_t */
+
+	uint32_t dst      : 8;
+	uint32_t mustbe0  : 1;   /* declared in the same position as dst_off in instr_cat6_t */
+	uint32_t idx      : 8;
+	uint32_t pad0     : 15;  /* type/g/pad4/opc/jmp_tgt/sync/opc_cat in instr_cat6_t */
+} instr_cat6d_t;
+
+/* ldgb and atomics..
+ *
+ * ldgb:      pad0=0, pad3=1
+ * atomic .g: pad0=1, pad3=1
+ *        .l: pad0=1, pad3=0
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t pad0     : 1;   /* declared in the same position as src_off in instr_cat6_t */
+	uint32_t src3     : 8;
+	uint32_t d        : 2;
+	uint32_t typed    : 1;
+	uint32_t type_size : 2;
+	uint32_t src1     : 8;
+	uint32_t src1_im  : 1;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t mustbe0  : 1;   /* declared in the same position as dst_off in instr_cat6_t */
+	uint32_t src_ssbo : 8;
+	uint32_t pad2     : 3;  // type
+	uint32_t g        : 1;   /* declared in the same position as g in instr_cat6_t */
+	uint32_t pad3     : 1;
+	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6ldgb_t;
+
+/* stgb, pad0=0, pad3=2
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t mustbe1  : 1;  // ???
+	uint32_t src1     : 8;
+	uint32_t d        : 2;
+	uint32_t typed    : 1;
+	uint32_t type_size : 2;
+	uint32_t pad0     : 9;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t src3     : 8;
+	uint32_t src3_im  : 1;   /* declared in the same position as dst_off in instr_cat6_t */
+	uint32_t dst_ssbo : 8;
+	uint32_t pad2     : 3;  // type
+	uint32_t pad3     : 2;   /* spans the g and pad4 positions of instr_cat6_t */
+	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6stgb_t;
+
+typedef union PACKED {
+	instr_cat6a_t a;          /* dword0 view for src_off */
+	instr_cat6b_t b;          /* dword0 view for !src_off */
+	instr_cat6c_t c;          /* dword1 view for dst_off */
+	instr_cat6d_t d;          /* dword1 view for !dst_off */
+	instr_cat6ldgb_t ldgb;    /* ldgb and atomics */
+	instr_cat6stgb_t stgb;
+	struct PACKED {
+		/* dword0: */
+		uint32_t src_off  : 1;
+		uint32_t pad1     : 31;
+
+		/* dword1: */
+		uint32_t pad2     : 8;
+		uint32_t dst_off  : 1;
+		uint32_t pad3     : 8;
+		uint32_t type     : 3;
+		uint32_t g        : 1;  /* or in some cases it means dst immed */
+		uint32_t pad4     : 1;
+		uint32_t opc      : 5;
+		uint32_t jmp_tgt  : 1;
+		uint32_t sync     : 1;
+		uint32_t opc_cat  : 3;
+	};
+} instr_cat6_t;
+
+/* Similar to cat5_desc_mode_t, describes how the descriptor is loaded.
+ */
+typedef enum {
+	/* Use old GL binding model with an immediate index. */
+	CAT6_IMM = 0,
+	/* Presumably the old GL binding model with a uniform register index (cf. CAT6_BINDLESS_UNIFORM) -- TODO confirm */
+	CAT6_UNIFORM = 1,
+	/* Presumably the old GL binding model with a possibly non-uniform register index -- TODO confirm */
+	CAT6_NONUNIFORM = 2,
+
+	/* Use the bindless model, with an immediate index.
+	 */
+	CAT6_BINDLESS_IMM = 4,
+
+	/* Use the bindless model, with a uniform register index.
+	 */
+	CAT6_BINDLESS_UNIFORM = 5,
+
+	/* Use the bindless model, with a register index that isn't guaranteed
+	 * to be uniform. This presumably checks if the indices are equal and
+	 * splits up the load/store, because it works the way you would
+	 * expect.
+	 */
+	CAT6_BINDLESS_NONUNIFORM = 6,
+} cat6_desc_mode_t;
+
+/**
+ * For atomic ops (which return a value):
+ *
+ *    pad1=1, pad3=c, pad5=3
+ *    src1    - vecN offset/coords
+ *    src2.x  - is actually dest register
+ *    src2.y  - is 'data' except for cmpxchg where src2.y is 'compare'
+ *              and src2.z is 'data'
+ *
+ * For stib (which does not return a value):
+ *    pad1=0, pad3=c, pad5=2
+ *    src1    - vecN offset/coords
+ *    src2    - value to store
+ *
+ * For ldib:
+ *    pad1=1, pad3=c, pad5=2
+ *    src1    - vecN offset/coords
+ *
+ * for ldc (load from UBO using descriptor):
+ *    pad1=0, pad3=8, pad5=2
+ *
+ * pad2 and pad5 are only observed to be 0.  NOTE(review): pad5 is listed as
+ * 2 or 3 in every case above, so this presumably meant pad4 -- confirm.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t pad1     : 1;
+	uint32_t base     : 3;
+	uint32_t pad2     : 2;
+	uint32_t desc_mode : 3;  /* presumably a cat6_desc_mode_t value */
+	uint32_t d        : 2;
+	uint32_t typed    : 1;
+	uint32_t type_size : 2;
+	uint32_t opc      : 5;
+	uint32_t pad3     : 5;  /* is_cat6_legacy() tests (pad3 & 0x8) */
+	uint32_t src1     : 8;  /* coordinate/offset */
+
+	/* dword1: */
+	uint32_t src2     : 8;  /* or the dst for load instructions */
+	uint32_t pad4     : 1;  //mustbe0 ??
+	uint32_t ssbo     : 8;  /* ssbo/image binding point */
+	uint32_t type     : 3;
+	uint32_t pad5     : 7;  /* is_cat6_legacy() tests (pad5 & 0x2) */
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat6_a6xx_t;
+
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t pad1     : 32; /* no dword0 fields are decoded by this view */
+
+	/* dword1: */
+	uint32_t pad2     : 12;
+	uint32_t ss       : 1;  /* maybe in the encoding, but blob only uses (sy) */
+	uint32_t pad3     : 6;
+	uint32_t w        : 1;  /* write */
+	uint32_t r        : 1;  /* read */
+	uint32_t l        : 1;  /* local */
+	uint32_t g        : 1;  /* global */
+	uint32_t opc      : 4;  /* presumed, but only a couple known OPCs */
+	uint32_t jmp_tgt  : 1;  /* (jp) */
+	uint32_t sync     : 1;  /* (sy) */
+	uint32_t opc_cat  : 3;  /* declared in the same position as opc_cat in instr_t */
+} instr_cat7_t;
+
+typedef union PACKED {
+	instr_cat0_t cat0;
+	instr_cat1_t cat1;
+	instr_cat2_t cat2;
+	instr_cat3_t cat3;
+	instr_cat4_t cat4;
+	instr_cat5_t cat5;
+	instr_cat6_t cat6;            /* the layout is_cat6_legacy() reports as legacy */
+	instr_cat6_a6xx_t cat6_a6xx;  /* the layout used when is_cat6_legacy() returns false */
+	instr_cat7_t cat7;
+	struct PACKED {
+		/* dword0: */
+		uint32_t pad1     : 32;
+
+		/* dword1: */
+		uint32_t pad2     : 12;
+		uint32_t ss       : 1;  /* cat1-cat4 (cat0??) and cat7 (?) */
+		uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
+		uint32_t pad3     : 13;
+		uint32_t jmp_tgt  : 1;
+		uint32_t sync     : 1;
+		uint32_t opc_cat  : 3;  /* switched on by instr_repeat()/instr_sat()/instr_opc() */
+
+	};
+} instr_t;
+
+static inline uint32_t instr_repeat(instr_t *instr)
+{
+	/* repeat field of a cat0..cat4 instruction; 0 for every other category */
+	uint32_t rpt = 0;
+	if (instr->opc_cat == 0)      rpt = instr->cat0.repeat;
+	else if (instr->opc_cat == 1) rpt = instr->cat1.repeat;
+	else if (instr->opc_cat == 2) rpt = instr->cat2.repeat;
+	else if (instr->opc_cat == 3) rpt = instr->cat3.repeat;
+	else if (instr->opc_cat == 4) rpt = instr->cat4.repeat;
+	return rpt;
+}
+
+static inline bool instr_sat(instr_t *instr)
+{
+	/* sat bit of a cat2/cat3/cat4 instruction; false for anything else */
+	const unsigned cat = instr->opc_cat;
+	if (cat == 2) return instr->cat2.sat;
+	if (cat == 3) return instr->cat3.sat;
+	if (cat == 4) return instr->cat4.sat;
+	return false;
+}
+
+/* We can probably drop the gpu_id arg, but keeping it for now so we can
+ * assert if we see something we think should be new encoding on an older
+ * gpu.
+ */
+static inline bool is_cat6_legacy(instr_t *instr, unsigned gpu_id)
+{
+	instr_cat6_a6xx_t *a6xx = &instr->cat6_a6xx;
+
+	/* At least one of these two bits is pad in all the possible
+	 * "legacy" cat6 encodings, and an analysis of all the pre-a6xx
+	 * cmdstream traces I have indicates that the pad bit is zero
+	 * in all cases.  So both bits being set means the new encoding:
+	 */
+	const bool new_encoding = (a6xx->pad3 & 0x8) && (a6xx->pad5 & 0x2);
+
+	if (!new_encoding)
+		return true;
+	ir3_assert(gpu_id >= 600);
+	ir3_assert(instr->cat6.opc == 0);
+	return false;
+}
+
+static inline uint32_t instr_opc(instr_t *instr, unsigned gpu_id)
+{
+	/* Per-category opcode extraction.  cat1 always reports 0; cat0 glues
+	 * opc_hi on above the low four bits; cat6 picks between its two
+	 * layouts via is_cat6_legacy().
+	 */
+	const unsigned cat = instr->opc_cat;
+
+	if (cat == 0) return instr->cat0.opc | instr->cat0.opc_hi << 4;
+	if (cat == 2) return instr->cat2.opc;
+	if (cat == 3) return instr->cat3.opc;
+	if (cat == 4) return instr->cat4.opc;
+	if (cat == 5) return instr->cat5.opc;
+	if (cat == 7) return instr->cat7.opc;
+	if (cat != 6) return 0;
+	return is_cat6_legacy(instr, gpu_id) ? instr->cat6.opc : instr->cat6_a6xx.opc;
+}
+
+static inline bool is_mad(opc_t opc)
+{
+	/* True for each of the six OPC_MAD_* opcodes (U16, S16, U24, S24,
+	 * F16, F32); false for everything else.  The grouping below is
+	 * only there to keep the expressions short.
+	 */
+	const bool mad_16 = (opc == OPC_MAD_U16) || (opc == OPC_MAD_S16);
+	const bool mad_24 = (opc == OPC_MAD_U24) || (opc == OPC_MAD_S24);
+	const bool mad_fp = (opc == OPC_MAD_F16) || (opc == OPC_MAD_F32);
+
+	if (mad_16 || mad_24)
+		return true;
+	return mad_fp;
+}
+
+static inline bool is_madsh(opc_t opc)
+{
+	/* true for either of the two OPC_MADSH_* opcodes */
+	if (opc == OPC_MADSH_U16)
+		return true;
+	if (opc == OPC_MADSH_M16)
+		return true;
+
+	return false;
+}
+
+static inline bool is_atomic(opc_t opc)
+{
+	/* Membership test against the eleven OPC_ATOMIC_* opcodes; the
+	 * grouping is purely for readability.
+	 */
+
+	const bool arith =
+		opc == OPC_ATOMIC_ADD || opc == OPC_ATOMIC_SUB ||
+		opc == OPC_ATOMIC_INC || opc == OPC_ATOMIC_DEC;
+	const bool exchange =
+		opc == OPC_ATOMIC_XCHG || opc == OPC_ATOMIC_CMPXCHG;
+	const bool minmax =
+		opc == OPC_ATOMIC_MIN || opc == OPC_ATOMIC_MAX;
+	const bool bitwise =
+		opc == OPC_ATOMIC_AND || opc == OPC_ATOMIC_OR ||
+		opc == OPC_ATOMIC_XOR;
+
+	return arith || exchange || minmax || bitwise;
+}
+
+static inline bool is_ssbo(opc_t opc)
+{
+	/* true for exactly five opcodes, tested in two groups:
+	 * RESFMT/RESINFO first ...
+	 */
+	if (opc == OPC_RESFMT || opc == OPC_RESINFO)
+		return true;
+	/* ... then LDGB/STGB/STIB: */
+	if (opc == OPC_LDGB || opc == OPC_STGB || opc == OPC_STIB)
+		return true;
+
+	return false;
+}
+
+static inline bool is_isam(opc_t opc)
+{
+	/* true for OPC_ISAM, OPC_ISAML and OPC_ISAMM only */
+	bool match = false;
+
+	match |= (opc == OPC_ISAM);
+	match |= (opc == OPC_ISAML);
+	match |= (opc == OPC_ISAMM);
+
+	return match;
+}
+
+
+static inline bool is_cat2_float(opc_t opc)
+{
+	/* True for the thirteen opcodes listed below (all of them carry an
+	 * _F suffix); false for everything else.  They are split into
+	 * three groups only to keep each expression short.
+	 */
+	const bool arith =
+		opc == OPC_ADD_F || opc == OPC_MIN_F ||
+		opc == OPC_MAX_F || opc == OPC_MUL_F ||
+		opc == OPC_SIGN_F || opc == OPC_ABSNEG_F;
+	const bool compare =
+		opc == OPC_CMPS_F || opc == OPC_CMPV_F;
+	const bool rounding =
+		opc == OPC_FLOOR_F || opc == OPC_CEIL_F ||
+		opc == OPC_RNDNE_F || opc == OPC_RNDAZ_F ||
+		opc == OPC_TRUNC_F;
+
+	if (arith || compare)
+		return true;
+
+	return rounding;
+}
+
+static inline bool is_cat3_float(opc_t opc)
+{
+	/* true for the F16/F32 variants of OPC_MAD_* and OPC_SEL_*,
+	 * false for everything else */
+	const bool mad_f = (opc == OPC_MAD_F16) || (opc == OPC_MAD_F32);
+	const bool sel_f = (opc == OPC_SEL_F16) || (opc == OPC_SEL_F32);
+
+	if (mad_f)
+		return true;
+
+	return sel_f;
+}
+
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id);
+
+#endif /* INSTR_A3XX_H_ */
diff --git a/src/freedreno/decode/io.c b/src/freedreno/decode/io.c
new file mode 100644
index 0000000..5fc5752
--- /dev/null
+++ b/src/freedreno/decode/io.c
@@ -0,0 +1,163 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <archive.h>
+#include <archive_entry.h>
+
+#include "io.h"
+
+struct io {
+	struct archive *a;            /* libarchive read handle, from archive_read_new() */
+	struct archive_entry *entry;  /* entry header filled in by archive_read_next_header() */
+	unsigned offset;              /* total bytes returned by io_readn() so far */
+};
+
+static void io_error(struct io *io)  /* print libarchive's error string, then destroy io */
+{
+	fprintf(stderr, "%s\n", archive_error_string(io->a));
+	io_close(io);  /* frees io: callers must not touch it afterwards */
+}
+
+static struct io * io_new(void)
+{
+	/* Allocate an io plus a libarchive reader with the gzip and "none"
+	 * filters and the "all" and "raw" formats enabled.  Returns NULL on
+	 * any failure; in that case nothing is leaked (io_error() prints the
+	 * libarchive error and frees the io).
+	 */
+	struct io *io = calloc(1, sizeof(*io));
+	int ret;
+
+	if (!io)
+		return NULL;
+
+	io->a = archive_read_new();
+	if (!io->a) {
+		/* no archive object, so io_error()/io_close() can't be used */
+		free(io);
+		return NULL;
+	}
+
+	ret = archive_read_support_filter_gzip(io->a);
+	if (ret == ARCHIVE_OK)
+		ret = archive_read_support_filter_none(io->a);
+	if (ret == ARCHIVE_OK)
+		ret = archive_read_support_format_all(io->a);
+	if (ret == ARCHIVE_OK)
+		ret = archive_read_support_format_raw(io->a);
+
+	if (ret != ARCHIVE_OK) {
+		io_error(io);
+		return NULL;
+	}
+
+	return io;
+}
+
+struct io * io_open(const char *filename)
+{
+	/* Open 'filename' for reading and advance to its first entry header.
+	 * Returns NULL if the reader could not be created or if
+	 * either libarchive call fails; in the latter case the libarchive
+	 * error string is printed to stderr.
+	 */
+	struct io *io = io_new();
+
+	if (!io)
+		return NULL;
+
+	/* 10240 is the block size argument handed to libarchive: */
+	if ((archive_read_open_filename(io->a, filename, 10240) != ARCHIVE_OK) ||
+	    (archive_read_next_header(io->a, &io->entry) != ARCHIVE_OK)) {
+		/* reports the error and frees io */
+		io_error(io);
+		return NULL;
+	}
+
+	return io;
+}
+
+struct io * io_openfd(int fd)
+{
+	/* Same as io_open(), but reads from an already-open file
+	 * descriptor.  Returns NULL if the reader could not be created or
+	 * if either libarchive call fails; in the latter case the
+	 * libarchive error string is printed to stderr.
+	 */
+	struct io *io = io_new();
+
+	if (!io)
+		return NULL;
+
+	/* 10240 is the block size argument handed to libarchive: */
+	if ((archive_read_open_fd(io->a, fd, 10240) != ARCHIVE_OK) ||
+	    (archive_read_next_header(io->a, &io->entry) != ARCHIVE_OK)) {
+		/* reports the error and frees io */
+		io_error(io);
+		return NULL;
+	}
+
+	return io;
+}
+
+void io_close(struct io *io)
+{
+	(void)archive_read_free(io->a);  /* release the libarchive reader first */
+	free(io);                        /* io is invalid from here on */
+}
+
+unsigned io_offset(struct io *io)  /* bytes returned by io_readn() on this io so far */
+{
+	return io->offset;
+}
+
+#include <assert.h>
+int io_readn(struct io *io, void *buf, int nbytes)
+{
+	/* loop over short reads; stops early on end-of-data (0) or error (<0) */
+	char *dst = buf;
+	int total = 0;
+	while (total < nbytes) {
+		int got = archive_read_data(io->a, dst, nbytes - total);
+		if (got < 0) {
+			fprintf(stderr, "%s\n", archive_error_string(io->a));
+			return got;
+		}
+		if (got == 0)
+			break;
+		dst += got;
+		total += got;
+		io->offset += got;
+	}
+	return total;
+}
diff --git a/src/freedreno/decode/io.h b/src/freedreno/decode/io.h
new file mode 100644
index 0000000..d26ba4b
--- /dev/null
+++ b/src/freedreno/decode/io.h
@@ -0,0 +1,51 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef IO_H_
+#define IO_H_
+
+/* Simple API to abstract reading from file which might be compressed.
+ * Maybe someday I'll add writing..
+ */
+
+struct io;
+
+struct io * io_open(const char *filename);
+struct io * io_openfd(int fd);
+void io_close(struct io *io);
+unsigned io_offset(struct io *io);
+int io_readn(struct io *io, void *buf, int nbytes);
+
+
+/* Nonzero iff path ends with ext; a path shorter than ext never matches. */
+static inline int check_extension(const char *path, const char *ext)
+{
+	return strlen(path) >= strlen(ext) && !strcmp(path + strlen(path) - strlen(ext), ext);
+}
+
+#endif /* IO_H_ */
diff --git a/src/freedreno/decode/meson.build b/src/freedreno/decode/meson.build
new file mode 100644
index 0000000..0ec9995
--- /dev/null
+++ b/src/freedreno/decode/meson.build
@@ -0,0 +1,144 @@
+# Copyright © 2020 Google, Inc
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+dep_lua = dependency('lua53', required: false)  # try the versioned names first
+if not dep_lua.found()
+  dep_lua = dependency('lua52', required: false)
+endif
+if not dep_lua.found()
+  dep_lua = dependency('lua', required: false)  # last resort: unversioned name
+endif
+
+dep_libarchive = dependency('libarchive', required: false)
+
+# Shared cmdstream decoding:
+libfreedreno_cffdec = static_library(
+  'freedreno_cffdec',
+  [
+    'buffers.c',
+    'buffers.h',
+    'cffdec.c',
+    'cffdec.h',
+    'disasm-a2xx.c',
+    'disasm-a3xx.c',
+    'disasm.h',
+    'instr-a2xx.h',
+    'instr-a3xx.h',
+    'pager.c',
+    'pager.h',
+    'rnnutil.c',
+    'rnnutil.h',
+    'util.h',
+  ],
+  include_directories: [
+    inc_freedreno_rnn,
+  ],
+  c_args : [ no_override_init_args ],
+  gnu_symbol_visibility: 'hidden',
+  dependencies: [],
+  link_with: libfreedreno_rnn,
+  build_by_default: false,
+)
+
+if dep_libarchive.found()
+  libfreedreno_io = static_library(
+    'freedreno_io',  # meson adds the 'lib' prefix itself (cf. 'freedreno_cffdec')
+    [
+      'io.c',
+      'io.h',
+    ],
+    include_directories: [],
+    c_args : [no_override_init_args],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [
+      dep_libarchive,
+    ],
+    build_by_default: false,
+  )
+endif
+
+if dep_lua.found() and dep_libarchive.found()
+  cffdump = executable(
+    'cffdump',
+    [
+      'cffdump.c',
+      'script.c',
+      'script.h'
+    ],
+    include_directories: [
+      inc_freedreno_rnn,
+    ],
+    c_args : [no_override_init_args],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [
+      dep_lua,
+    ],
+    link_with: [
+      libfreedreno_cffdec,
+      libfreedreno_io,
+    ],
+    build_by_default: with_tools.contains('freedreno'),
+    install : with_tools.contains('freedreno'),
+  )
+endif
+
+crashdec = executable(
+  'crashdec',
+  'crashdec.c',
+  include_directories: [
+    inc_freedreno_rnn,
+  ],
+  gnu_symbol_visibility: 'hidden',
+  dependencies: [],
+  link_with: [
+    libfreedreno_cffdec,
+  ],
+  build_by_default: with_tools.contains('freedreno'),
+  install : with_tools.contains('freedreno'),
+)
+
+if dep_libarchive.found()
+  pgmdump = executable(
+    'pgmdump',
+    'pgmdump.c',
+    include_directories: [],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [],
+    link_with: [
+      libfreedreno_cffdec,
+      libfreedreno_io,
+    ],
+    build_by_default: with_tools.contains('freedreno'),
+    install: false,
+  )
+  pgmdump2 = executable(
+    'pgmdump2',
+    'pgmdump2.c',
+    include_directories: [],
+    gnu_symbol_visibility: 'hidden',
+    dependencies: [],
+    link_with: [
+      libfreedreno_cffdec,
+      libfreedreno_io,
+    ],
+    build_by_default: with_tools.contains('freedreno'),
+    install: false,
+  )
+endif
diff --git a/src/freedreno/decode/pager.c b/src/freedreno/decode/pager.c
new file mode 100644
index 0000000..fa07c10
--- /dev/null
+++ b/src/freedreno/decode/pager.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pager.h"
+
+static pid_t pager_pid;
+
+
+static void
+pager_death(int n)  /* SIGCHLD handler installed by pager_open(); n is unused */
+{
+	exit(0);  /* NOTE(review): exit() is not async-signal-safe; consider _exit() */
+}
+
+void
+pager_open(void)
+{
+	int fd[2];
+
+	if (pipe(fd) < 0) {
+		fprintf(stderr, "Failed to create pager pipe: %m\n");
+		exit(-1);
+	}
+
+	pager_pid = fork();
+	if (pager_pid < 0) {
+		fprintf(stderr, "Failed to fork pager: %m\n");
+		exit(-1);
+	}
+
+	if (pager_pid == 0) {
+		/* child: the read end of the pipe becomes the pager's stdin */
+		dup2(fd[0], STDIN_FILENO);
+		close(fd[0]);
+		close(fd[1]);
+
+		setenv("LESS", "FRSMKX", 1);
+		execlp("less", "less", (char *)NULL);
+
+		/* only reached if exec failed; must not return into the caller */
+		fprintf(stderr, "Failed to exec pager: %m\n");
+		_exit(127);
+	} else {
+		/* we want to kill the parent process when pager exits: */
+		signal(SIGCHLD, pager_death);
+		dup2(fd[1], STDOUT_FILENO);
+		close(fd[0]);
+		close(fd[1]);
+	}
+}
+
+int
+pager_close(void)
+{
+	siginfo_t status;
+
+	/* push out anything still buffered, then signal EOF to the pager: */
+	fflush(stdout);
+	close(STDOUT_FILENO);
+	while (true) {
+		memset(&status, 0, sizeof(status));
+		if (waitid(P_PID, pager_pid, &status, WEXITED) < 0) {
+			if (errno == EINTR)
+				continue;
+			return -errno;
+		}
+		return 0;  /* pager has exited */
+	}
+}
diff --git a/src/freedreno/decode/pager.h b/src/freedreno/decode/pager.h
new file mode 100644
index 0000000..022786e
--- /dev/null
+++ b/src/freedreno/decode/pager.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __PAGER_H__
+#define __PAGER_H__
+
+void pager_open(void);
+int pager_close(void);
+
+#endif /* __PAGER_H__ */
diff --git a/src/freedreno/decode/pgmdump.c b/src/freedreno/decode/pgmdump.c
new file mode 100644
index 0000000..b8d7cd3
--- /dev/null
+++ b/src/freedreno/decode/pgmdump.c
@@ -0,0 +1,1054 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "redump.h"
+#include "disasm.h"
+#include "io.h"
+
+#define ASCII_XOR 0xff
+#include "util.h"
+
+struct pgm_header {
+	uint32_t size;
+	uint32_t unknown1;
+	uint32_t unknown2;
+	uint32_t revision;
+	uint32_t unknown4;
+	uint32_t unknown5;
+	uint32_t unknown6;
+	uint32_t unknown7;
+	uint32_t unknown8;
+	uint32_t num_attribs;
+	uint32_t num_uniforms;
+	uint32_t num_samplers;
+	uint32_t num_varyings;
+	uint32_t num_uniformblocks;
+};
+
+struct vs_header {
+	uint32_t unknown1;  /* seems to be # of sections up to and including shader */
+	uint32_t unknown2;  /* seems to be low byte or so of SQ_PROGRAM_CNTL */
+	uint32_t unknown3;
+	uint32_t unknown4;
+	uint32_t unknown5;
+	uint32_t unknown6;
+	uint32_t unknown7;
+	uint32_t unknown8;
+	uint32_t unknown9;  /* seems to be # of sections following shader */
+};
+
+struct fs_header {
+	uint32_t unknown1;
+};
+/*
+	// Covers a lot of type_info
+	// varying, attribute, uniform, sampler
+	type_info & 0xFF
+	if ((type_info >> 8) == 0x8b) // vector
+		0x50 = vec2
+		0x51 = vec3
+		0x52 = vec4
+		0x53 = ivec2
+		0x54 = ivec3
+		0x55 = ivec4
+		0x56 = bool // Why is this in vector?
+		0x57 = bvec2
+		0x58 = bvec3
+		0x59 = bvec4
+		0x5a = mat2
+		0x5b = mat3
+		0x5c = mat4
+		0x5a = mat2x2 // Same as mat2
+		0x65 = mat2x3
+		0x66 = mat2x4
+		0x67 = mat3x2
+		0x5b = mat3x3 // Same as mat3
+		0x68 = mat3x4
+		0x69 = mat4x2
+		0x6a = mat4x3
+		0x5c = mat4x4 // same as mat4
+		0x5e = sampler2D
+		0x5f = sampler3D
+		0x60 = samplerCube // XXX: Doesn't work
+		0x62 = sampler2DShadow
+		0xc6 = uvec2
+		0xc7 = uvec3
+		0xc8 = uvec4
+	else if ((type_info >> 8) == 0x8d) // GLES3 samplers
+		0xC1 = sampler2DArray
+		0xC4 = sampler2DArrayShadow
+		0xC5 = samplerCubeShadow
+		0xCA = isampler2D
+		0xCB = isampler3D
+		0xCC = isamplerCube
+		0xD2 = usampler2D
+		0xD3 = usampler3D
+		0xD4 = usamplerCube
+		0xD7 = isampler2DArray
+		0xD7 = usampler2DArray // Is the same as isampler2DArray?
+	else // 0x14 = single
+		0x04 = int
+		0x05 = uint
+		0x06 = float
+*/
+struct attribute {
+	uint32_t type_info;
+	uint32_t reg;       /* seems to be the register the fetch instruction loads to */
+	uint32_t const_idx; /* the CONST() indx value for sampler */
+	uint32_t unknown2;
+	uint32_t unknown3;
+	uint32_t unknown4;
+	uint32_t unknown5;
+	char name[];
+};
+
+struct uniform {
+	uint32_t type_info;
+	uint32_t unknown2;
+	uint32_t unknown3;
+	uint32_t unknown4;
+	uint32_t const_base; /* const base register (for uniforms that take more than one const reg, ie. matrices) */
+	uint32_t unknown6;
+	uint32_t const_reg; /* the const register holding the value; dump_uniform() prints const_base instead when this is -1 */
+	uint32_t unknown7;
+	uint32_t unknown8;
+	uint32_t unknown9;
+	union { /* two layouts for the tail; is_uniform_v2() picks v2 when v2.unknown10 == 0 */
+		struct {
+			char name[1]; /* presumably the name continues past this first byte -- TODO confirm */
+		} v1;
+		struct {
+			uint32_t unknown10; /* overlays the start of v1.name */
+			uint32_t unknown11;
+			uint32_t unknown12;
+			char name[];
+		} v2;
+	};
+};
+
+struct uniformblockmember {
+	uint32_t type_info;
+	uint32_t is_array;
+	uint32_t array_size; /* elements in the array */
+	uint32_t unknown2; /* Same as array_size */
+	uint32_t unknown3; /* Seems to be a offset within UBO in vertex (by components) */
+	uint32_t unknown4;
+	uint32_t unknown5; /* Seems to be a offset within UBO in fragment (by vec4) */
+	uint32_t unknown6;
+	uint32_t unknown7;
+	uint32_t unknown8;
+	uint32_t unknown9; /* UBO block index? */
+	uint32_t unknown10;
+	uint32_t unknown11;
+	uint32_t unknown12;
+	char name[];
+};
+
+struct uniformblock
+{
+	uint32_t type_info;
+	uint32_t unknown1;
+	uint32_t unknown2;
+	uint32_t unknown3;
+	uint32_t unknown4;
+	uint32_t num_members;
+	uint32_t num_members2;
+	uint32_t unknown5;
+	uint32_t unknown6;
+	uint32_t unknown7;
+	char name[];
+};
+
+
+struct sampler {
+	uint32_t type_info;
+	uint32_t is_array;
+	uint32_t array_size; /* elements in the array */
+	uint32_t unknown4; /* same as array_size */
+	uint32_t unknown5;
+	uint32_t unknown6;
+	uint32_t const_idx; /* the CONST() indx value for the sampler */
+	uint32_t unknown7;
+	char name[];
+};
+
+struct varying {
+	uint32_t type_info;
+	uint32_t unknown2;
+	uint32_t unknown3;
+	uint32_t reg;       /* the register holding the value (on entry to the shader) */
+	char name[];
+};
+
+struct output {
+	uint32_t type_info;
+	uint32_t unknown2;
+	uint32_t unknown3;
+	uint32_t unknown4;
+	uint32_t unknown5;
+	uint32_t unknown6;
+	uint32_t unknown7;
+	uint32_t unknown8;
+	char name[];
+};
+
+struct constant {
+	uint32_t unknown1;
+	uint32_t unknown2;
+	uint32_t unknown3;
+	uint32_t const_idx;
+	float val[];
+};
+
+struct state {
+	char *buf;                      /* unparsed remainder of the input; advanced by next_sect() */
+	int sz;                         /* bytes left at buf */
+	struct pgm_header *hdr;
+	struct attribute *attribs[32];  /* don't really know the upper limit.. */
+	struct uniform *uniforms[32];
+	struct sampler *samplers[32];
+	struct varying *varyings[32];
+	struct {
+		struct uniformblock *header;
+		struct uniformblockmember **members; /* GL ES 3.0 spec mandates minimum 16K support. a3xx supports 65K */
+	} uniformblocks[24]; /* Maximum a330 supports */
+	struct output  *outputs[1];  /* I guess only one??  (was [0], which provides no storage) */
+};
+
+static const char *infile;
+static int full_dump = 1;
+static int dump_shaders = 0;
+static int gpu_id;
+
+static char *find_sect_end(char *buf, int sz)
+{
+	uint8_t *ptr = (uint8_t *)buf;
+	int i;
+
+	/* Scan every byte offset for the little-endian 32b end-of-section
+	 * marker and return a pointer to it, or NULL if there is none.
+	 * Offsets stop at sz - 4, so a buffer shorter than one marker is
+	 * never touched. */
+	for (i = 0; i + 4 <= sz; i++) {
+		uint32_t d = (uint32_t)ptr[i + 0] <<  0 |
+		             (uint32_t)ptr[i + 1] <<  8 |
+		             (uint32_t)ptr[i + 2] << 16 |
+		             (uint32_t)ptr[i + 3] << 24;  /* cast: never shift into int's sign bit */
+
+		/* someone at QC likes baseball */
+		if (d == 0xba5eba11)
+			return (char *)(ptr + i);
+	}
+	return NULL;
+}
+
+static void *next_sect(struct state *state, int *sect_size)
+{
+	char *end = find_sect_end(state->buf, state->sz);
+	void *sect;
+
+	if (!end)  /* no end marker left in the remaining input */
+		return NULL;
+
+	*sect_size = end - state->buf;
+
+	/* copy the section to keep things nicely 32b aligned (padding zeroed): */
+	sect = calloc(1, ALIGN(*sect_size, 4));
+	if (sect)  /* on allocation failure the section is skipped and NULL returned */
+		memcpy(sect, state->buf, *sect_size);
+	/* advance past the section and its 4 byte end marker: */
+	state->sz -= *sect_size + 4;
+	state->buf = end + 4;
+	return sect;  /* heap copy */
+}
+
+static int valid_type(uint32_t type_info)
+{
+	/* only bits 8..15 of type_info are examined: */
+	const uint32_t cls = (type_info >> 8) & 0xff;
+
+	if (cls == 0x8b)       /* vector */
+		return 1;
+	if (cls == 0x8d)       /* GLES3 samplers */
+		return 1;
+	return cls == 0x14;    /* float */
+}
+
+#if 0
+/* Compiled out: returns 1 only for the exact type_info value 0x128. */
+static int valid_uniformblock(uint32_t type_info)
+{
+	return (type_info == 0x128) ? 1 : 0;
+}
+#endif
+
+/* Print one attribute as "R<reg>, CONST(<const_idx>): <name>". */
+static void dump_attribute(struct attribute *attrib)
+{
+	const char *label = attrib->name;
+
+	printf("\tR%d, CONST(%d): %s\n", attrib->reg, attrib->const_idx, label);
+}
+
+/* Returns 1 when the uniform record should be read using the v2 layout,
+ * which is detected by its v2.unknown10 word being zero; 0 otherwise.
+ */
+static inline int is_uniform_v2(struct uniform *uniform)
+{
+	/* TODO maybe this should be based on revision #? */
+	return (uniform->v2.unknown10 == 0) ? 1 : 0;
+}
+
+/* Print one uniform.  A const_reg of -1 is printed as "C<const_base>+",
+ * anything else as "C<const_reg>".  The name is taken from the v1 or v2
+ * layout as decided by is_uniform_v2().
+ */
+static void dump_uniform(struct uniform *uniform)
+{
+	char *name;
+
+	if (is_uniform_v2(uniform))
+		name = uniform->v2.name;
+	else
+		name = uniform->v1.name;
+
+	if (uniform->const_reg != -1) {
+		printf("\tC%d: %s\n", uniform->const_reg, name);
+		return;
+	}
+
+	printf("\tC%d+: %s\n", uniform->const_base, name);
+}
+
+/* Print one sampler as "CONST(<const_idx>): <name>". */
+static void dump_sampler(struct sampler *sampler)
+{
+	const char *label = sampler->name;
+
+	printf("\tCONST(%d): %s\n", sampler->const_idx, label);
+}
+
+/* Print one varying as "R<reg>: <name>". */
+static void dump_varying(struct varying *varying)
+{
+	const char *label = varying->name;
+
+	printf("\tR%d: %s\n", varying->reg, label);
+}
+
+/* Print a uniform block header as "Uniform Block: <name>(<num_members>)". */
+static void dump_uniformblock(struct uniformblock *uniformblock)
+{
+	const char *label = uniformblock->name;
+
+	printf("\tUniform Block: %s(%d)\n", label, uniformblock->num_members);
+}
+
+/* Print the name of one uniform block member. */
+static void dump_uniformblockmember(struct uniformblockmember *member)
+{
+	const char *label = member->name;
+
+	printf("Uniform Block member: %s\n", label);
+}
+
+/* Print one output; the register is not known, so it is shown as "R?". */
+static void dump_output(struct output *output)
+{
+	const char *label = output->name;
+
+	printf("\tR?: %s\n", label);
+}
+
+/* Print one constant as "C<const_idx>: v0, v1, v2, v3" (first four floats). */
+static void dump_constant(struct constant *constant)
+{
+	const float *v = constant->val;
+
+	printf("\tC%d: %f, %f, %f, %f\n", constant->const_idx,
+			v[0], v[1], v[2], v[3]);
+}
+
+/* Print the condensed symbol table used in --short mode: varyings,
+ * attributes, uniforms and samplers (counts taken from the program header),
+ * then the first nconsts - 1 entries of 'constants' whose unknown2 word is
+ * zero, followed by a blank line.
+ */
+static void dump_short_summary(struct state *state, int nconsts,
+		struct constant **constants)
+{
+	int idx;
+
+	idx = 0;
+	while (idx < state->hdr->num_varyings) {
+		dump_varying(state->varyings[idx]);
+		idx++;
+	}
+
+	idx = 0;
+	while (idx < state->hdr->num_attribs) {
+		dump_attribute(state->attribs[idx]);
+		idx++;
+	}
+
+	idx = 0;
+	while (idx < state->hdr->num_uniforms) {
+		dump_uniform(state->uniforms[idx]);
+		idx++;
+	}
+
+	idx = 0;
+	while (idx < state->hdr->num_samplers) {
+		dump_sampler(state->samplers[idx]);
+		idx++;
+	}
+
+	for (idx = 0; idx < nconsts - 1; idx++) {
+		struct constant *c = constants[idx];
+
+		if (c->unknown2 != 0)
+			continue;
+		dump_constant(c);
+	}
+
+	printf("\n");
+}
+
+/* Write a raw shader binary to "<infile minus its last 3 chars>-<n>.<ext>".
+ *
+ * Does nothing unless --dump-shaders was given.  'sizedwords' is the length
+ * of 'dwords' in 32-bit words.  Failures are reported on stderr and are not
+ * fatal, since dumping is a side feature of the decode.
+ */
+static void dump_raw_shader(uint32_t *dwords, uint32_t sizedwords, int n, char *ext)
+{
+	static char filename[256];
+	size_t nbytes = (size_t)sizedwords * 4;
+	int fd, len;
+
+	if (!dump_shaders)
+		return;
+
+	/* bounded formatting: a long input path must not overflow filename[] */
+	len = snprintf(filename, sizeof(filename), "%.*s-%d.%s",
+			(int)strlen(infile)-3, infile, n, ext);
+	if ((len < 0) || ((size_t)len >= sizeof(filename))) {
+		fprintf(stderr, "shader dump filename too long for: %s\n", infile);
+		return;
+	}
+
+	fd = open(filename, O_WRONLY | O_TRUNC | O_CREAT, 0644);
+	if (fd < 0) {
+		perror(filename);
+		return;
+	}
+
+	if (write(fd, dwords, nbytes) != (ssize_t)nbytes)
+		perror(filename);
+
+	/* one descriptor per dumped shader would otherwise be leaked */
+	close(fd);
+}
+
+/* Dump the a2xx shader sections that follow the symbol tables: three vertex
+ * shaders, then one fragment shader.
+ *
+ * For each shader the stream is consumed in this order: a header section,
+ * (header->unknown1 - 1) constant sections, the shader section itself, and
+ * for vertex shaders a further header->unknown9 sections that are only hex
+ * dumped.  The shader section is disassembled starting 32 bytes in.
+ *
+ * NOTE(review): none of the next_sect() results are checked for NULL, and
+ * the constant count read from the header is not checked against the
+ * 32-entry constants[] array.
+ */
+static void dump_shaders_a2xx(struct state *state)
+{
+	int i, sect_size;
+	uint8_t *ptr;
+
+	/* dump vertex shaders: */
+	for (i = 0; i < 3; i++) {
+		struct vs_header *vs_hdr = next_sect(state, &sect_size);
+		struct constant *constants[32];
+		int j, level = 0;
+
+		printf("\n");
+
+		if (full_dump) {
+			printf("#######################################################\n");
+			printf("######## VS%d HEADER: (size %d)\n", i, sect_size);
+			dump_hex((void *)vs_hdr, sect_size);
+		}
+
+		/* unknown1 - 1 is used as the number of constant sections */
+		for (j = 0; j < (int)vs_hdr->unknown1 - 1; j++) {
+			constants[j] = next_sect(state, &sect_size);
+			if (full_dump) {
+				printf("######## VS%d CONST: (size=%d)\n", i, sect_size);
+				dump_constant(constants[j]);
+				dump_hex((char *)constants[j], sect_size);
+			}
+		}
+
+		ptr = next_sect(state, &sect_size);
+		printf("######## VS%d SHADER: (size=%d)\n", i, sect_size);
+		if (full_dump) {
+			dump_hex(ptr, sect_size);
+			level = 1;
+		} else {
+			dump_short_summary(state, vs_hdr->unknown1 - 1, constants);
+		}
+		/* the first 32 bytes of the section are skipped before disassembly */
+		disasm_a2xx((uint32_t *)(ptr + 32), (sect_size - 32) / 4, level+1, SHADER_VERTEX);
+		dump_raw_shader((uint32_t *)(ptr + 32), (sect_size - 32) / 4, i, "vo");
+		free(ptr);
+
+		/* trailing sections, only shown in the full dump */
+		for (j = 0; j < vs_hdr->unknown9; j++) {
+			ptr = next_sect(state, &sect_size);
+			if (full_dump) {
+				printf("######## VS%d CONST?: (size=%d)\n", i, sect_size);
+				dump_hex(ptr, sect_size);
+			}
+			free(ptr);
+		}
+
+		for (j = 0; j < vs_hdr->unknown1 - 1; j++) {
+			free(constants[j]);
+		}
+
+		free(vs_hdr);
+	}
+
+	/* dump fragment shaders: */
+	for (i = 0; i < 1; i++) {
+		struct fs_header *fs_hdr = next_sect(state, &sect_size);
+		struct constant *constants[32];
+		int j, level = 0;
+
+		printf("\n");
+
+		if (full_dump) {
+			printf("#######################################################\n");
+			printf("######## FS%d HEADER: (size %d)\n", i, sect_size);
+			dump_hex((void *)fs_hdr, sect_size);
+		}
+
+		/* unknown1 - 1 is used as the number of constant sections */
+		for (j = 0; j < fs_hdr->unknown1 - 1; j++) {
+			constants[j] = next_sect(state, &sect_size);
+			if (full_dump) {
+				printf("######## FS%d CONST: (size=%d)\n", i, sect_size);
+				dump_constant(constants[j]);
+				dump_hex((char *)constants[j], sect_size);
+			}
+		}
+
+		ptr = next_sect(state, &sect_size);
+		printf("######## FS%d SHADER: (size=%d)\n", i, sect_size);
+		if (full_dump) {
+			dump_hex(ptr, sect_size);
+			level = 1;
+		} else {
+			dump_short_summary(state, fs_hdr->unknown1 - 1, constants);
+		}
+		/* the first 32 bytes of the section are skipped before disassembly */
+		disasm_a2xx((uint32_t *)(ptr + 32), (sect_size - 32) / 4, level+1, SHADER_FRAGMENT);
+		dump_raw_shader((uint32_t *)(ptr + 32), (sect_size - 32) / 4, i, "fo");
+		free(ptr);
+
+		for (j = 0; j < fs_hdr->unknown1 - 1; j++) {
+			free(constants[j]);
+		}
+
+		free(fs_hdr);
+	}
+}
+
+/* Dump the a3xx+ shader sections that follow the symbol tables: two vertex
+ * shaders, then one fragment shader.
+ *
+ * Each shader starts with a header section whose expected size depends on
+ * the program header revision.  If the section is larger than that, the
+ * instructions are packed directly behind the header ("compact").  Otherwise
+ * the header is followed by zero or more 32 or 44 byte constant sections and
+ * then a separate instruction section, which carries a prefix that is
+ * skipped before disassembly.
+ *
+ * Every section obtained here is released again before moving on to the
+ * next shader.
+ */
+static void dump_shaders_a3xx(struct state *state)
+{
+	int i, j;
+
+	/* dump vertex shaders: */
+	for (i = 0; i < 2; i++) {
+		int instrs_size, hdr_size, sect_size, nconsts = 0, level = 0, compact = 0;
+		uint8_t *vs_hdr;
+		struct constant *constants[32];
+		int const_sizes[32];        /* section size of each constants[] entry */
+		uint8_t *instrs = NULL;
+		void *instrs_sect = NULL;   /* separately allocated instruction section */
+
+		vs_hdr = next_sect(state, &hdr_size);
+		printf("hdr_size=%d\n", hdr_size);
+
+		/* seems like there are two cases, either:
+		 *  1) 152 byte header,
+		 *  2) zero or more 32 byte compiler const sections
+		 *  3) followed by shader instructions
+		 * or, if there are no compiler consts, this can be
+		 * all smashed in one large section
+		 */
+		int n;
+		if (state->hdr->revision >= 0xb)
+			n = 160;
+		else if (state->hdr->revision >= 7)
+			n = 156;
+		else
+			n = 152;
+		if (hdr_size > n) {
+			instrs = &vs_hdr[n];
+			instrs_size = hdr_size - n;
+			hdr_size = n;
+			compact = 1;
+		} else {
+			while (1) {
+				void *ptr = next_sect(state, &sect_size);
+
+				/* also stop once constants[] is full, rather than
+				 * writing past the end of the array:
+				 */
+				if (((sect_size != 32) && (sect_size != 44)) ||
+						(nconsts == 32)) {
+					/* end of constants: */
+					instrs_sect = ptr;
+					instrs = ptr;
+					instrs_size = sect_size;
+					break;
+				}
+				dump_hex_ascii(ptr, sect_size, 0);
+				const_sizes[nconsts] = sect_size;
+				constants[nconsts++] = ptr;
+			}
+		}
+
+		printf("\n");
+
+		if (full_dump) {
+			printf("#######################################################\n");
+			printf("######## VS%d HEADER: (size %d)\n", i, hdr_size);
+			dump_hex((void *)vs_hdr, hdr_size);
+			for (j = 0; j < nconsts; j++) {
+				/* report/dump the real section size, not sizeof a pointer */
+				printf("######## VS%d CONST: (size=%d)\n", i, const_sizes[j]);
+				dump_constant(constants[j]);
+				dump_hex((char *)constants[j], const_sizes[j]);
+			}
+		}
+
+		printf("######## VS%d SHADER: (size=%d)\n", i, instrs_size);
+		if (full_dump) {
+			dump_hex(instrs, instrs_size);
+			level = 1;
+		} else {
+			dump_short_summary(state, nconsts, constants);
+		}
+
+		if (!compact) {
+			if (state->hdr->revision >= 7) {
+				instrs += ALIGN(instrs_size, 8) - instrs_size;
+				instrs_size = ALIGN(instrs_size, 8);
+			}
+			instrs += 32;
+			instrs_size -= 32;
+		}
+
+		/* the fourth argument is the output stream, as in the fragment
+		 * shader path below:
+		 */
+		disasm_a3xx((uint32_t *)instrs, instrs_size / 4, level+1, stdout, gpu_id);
+		dump_raw_shader((uint32_t *)instrs, instrs_size / 4, i, "vo3");
+
+		for (j = 0; j < nconsts; j++)
+			free(constants[j]);
+		free(instrs_sect);
+		free(vs_hdr);
+	}
+
+	/* dump fragment shaders: */
+	for (i = 0; i < 1; i++) {
+		int instrs_size, hdr_size, sect_size, nconsts = 0, level = 0, compact = 0;
+		uint8_t *fs_hdr;
+		struct constant *constants[32];
+		int const_sizes[32];        /* section size of each constants[] entry */
+		uint8_t *instrs = NULL;
+		void *instrs_sect = NULL;   /* separately allocated instruction section */
+
+		fs_hdr = next_sect(state, &hdr_size);
+
+		printf("hdr_size=%d\n", hdr_size);
+		/* two cases, similar to vertex shader, but magic # is 200
+		 * (or 208 for newer?)..
+		 */
+		int n;
+		if (state->hdr->revision >= 0xb)
+			n = 256;
+		else if (state->hdr->revision >= 8)
+			n = 208;
+		else if (state->hdr->revision == 7)
+			n = 204;
+		else
+			n = 200;
+
+		if (hdr_size > n) {
+			instrs = &fs_hdr[n];
+			instrs_size = hdr_size - n;
+			hdr_size = n;
+			compact = 1;
+		} else {
+			while (1) {
+				void *ptr = next_sect(state, &sect_size);
+
+				/* also stop once constants[] is full, rather than
+				 * writing past the end of the array:
+				 */
+				if (((sect_size != 32) && (sect_size != 44)) ||
+						(nconsts == 32)) {
+					/* end of constants: */
+					instrs_sect = ptr;
+					instrs = ptr;
+					instrs_size = sect_size;
+					break;
+				}
+
+				dump_hex_ascii(ptr, sect_size, 0);
+				const_sizes[nconsts] = sect_size;
+				constants[nconsts++] = ptr;
+			}
+		}
+
+		printf("\n");
+
+		if (full_dump) {
+			printf("#######################################################\n");
+			printf("######## FS%d HEADER: (size %d)\n", i, hdr_size);
+			dump_hex((void *)fs_hdr, hdr_size);
+			for (j = 0; j < nconsts; j++) {
+				/* report/dump the real section size, not sizeof a pointer */
+				printf("######## FS%d CONST: (size=%d)\n", i, const_sizes[j]);
+				dump_constant(constants[j]);
+				dump_hex((char *)constants[j], const_sizes[j]);
+			}
+		}
+
+		printf("######## FS%d SHADER: (size=%d)\n", i, instrs_size);
+		if (full_dump) {
+			dump_hex(instrs, instrs_size);
+			level = 1;
+		} else {
+			dump_short_summary(state, nconsts, constants);
+		}
+
+		if (!compact) {
+			if (state->hdr->revision >= 7) {
+				instrs += 44;
+				instrs_size -= 44;
+			} else {
+				instrs += 32;
+				instrs_size -= 32;
+			}
+		}
+		disasm_a3xx((uint32_t *)instrs, instrs_size / 4, level+1, stdout, gpu_id);
+		dump_raw_shader((uint32_t *)instrs, instrs_size / 4, i, "fo3");
+
+		for (j = 0; j < nconsts; j++)
+			free(constants[j]);
+		free(instrs_sect);
+		free(fs_hdr);
+	}
+}
+
+/* Decode and print one complete program binary.
+ *
+ * 'state' must have buf/sz pointing at the binary and all other members
+ * zeroed.  Sections are consumed strictly in stream order: program header,
+ * one extra header section, attributes, uniforms, samplers, varyings, an
+ * optional output, uniform blocks with their members, the shaders, and in
+ * full-dump mode the shader source plus any remaining sections.
+ *
+ * The symbol sections are kept in 'state' for use by the shader dumpers.
+ * The per-block member tables allocated here are released before returning,
+ * in both full and --short mode.
+ */
+static void dump_program(struct state *state)
+{
+	int i, sect_size, nblocks;
+	uint8_t *ptr;
+
+	state->hdr = next_sect(state, &sect_size);
+
+	printf("######## HEADER: (size %d)\n", sect_size);
+	printf("\tsize:           %d\n", state->hdr->size);
+	printf("\trevision:       %d\n", state->hdr->revision);
+	printf("\tattributes:     %d\n", state->hdr->num_attribs);
+	printf("\tuniforms:       %d\n", state->hdr->num_uniforms);
+	printf("\tsamplers:       %d\n", state->hdr->num_samplers);
+	printf("\tvaryings:       %d\n", state->hdr->num_varyings);
+	printf("\tuniform blocks: %d\n", state->hdr->num_uniformblocks);
+	if (full_dump)
+		dump_hex((void *)state->hdr, sect_size);
+	printf("\n");
+
+	/* there seems to be two 0xba5eba11's at the end of the header, possibly
+	 * with some other stuff between them:
+	 */
+	ptr = next_sect(state, &sect_size);
+	if (full_dump) {
+		dump_hex_ascii(ptr, sect_size, 0);
+	}
+	free(ptr);
+
+	for (i = 0; (i < state->hdr->num_attribs) && (state->sz > 0); i++) {
+		state->attribs[i] = next_sect(state, &sect_size);
+
+		/* hmm, for a3xx (or maybe just newer driver version), we have some
+		 * extra sections that don't seem useful, so skip these:
+		 */
+		while (!valid_type(state->attribs[i]->type_info)) {
+			dump_hex_ascii(state->attribs[i], sect_size, 0);
+			state->attribs[i] = next_sect(state, &sect_size);
+		}
+
+		clean_ascii(state->attribs[i]->name, sect_size - 28);
+		if (full_dump) {
+			printf("######## ATTRIBUTE: (size %d)\n", sect_size);
+			dump_attribute(state->attribs[i]);
+			dump_hex((char *)state->attribs[i], sect_size);
+		}
+	}
+
+	for (i = 0; (i < state->hdr->num_uniforms) && (state->sz > 0); i++) {
+		state->uniforms[i] = next_sect(state, &sect_size);
+
+		/* hmm, for a3xx (or maybe just newer driver version), we have some
+		 * extra sections that don't seem useful, so skip these:
+		 */
+		while (!valid_type(state->uniforms[i]->type_info)) {
+			dump_hex_ascii(state->uniforms[i], sect_size, 0);
+			state->uniforms[i] = next_sect(state, &sect_size);
+		}
+
+		if (is_uniform_v2(state->uniforms[i])) {
+			clean_ascii(state->uniforms[i]->v2.name, sect_size - 53);
+		} else {
+			clean_ascii(state->uniforms[i]->v1.name, sect_size - 41);
+		}
+
+		if (full_dump) {
+			printf("######## UNIFORM: (size %d)\n", sect_size);
+			dump_uniform(state->uniforms[i]);
+			dump_hex((char *)state->uniforms[i], sect_size);
+		}
+	}
+
+	for (i = 0; (i < state->hdr->num_samplers) && (state->sz > 0); i++) {
+		state->samplers[i] = next_sect(state, &sect_size);
+
+		/* hmm, for a3xx (or maybe just newer driver version), we have some
+		 * extra sections that don't seem useful, so skip these:
+		 */
+		while (!valid_type(state->samplers[i]->type_info)) {
+			dump_hex_ascii(state->samplers[i], sect_size, 0);
+			state->samplers[i] = next_sect(state, &sect_size);
+		}
+
+		clean_ascii(state->samplers[i]->name, sect_size - 33);
+		if (full_dump) {
+			printf("######## SAMPLER: (size %d)\n", sect_size);
+			dump_sampler(state->samplers[i]);
+			dump_hex((char *)state->samplers[i], sect_size);
+		}
+
+	}
+
+	// These sections show up after all of the other sampler sections
+	// Loops through them all since we don't deal with them
+	if (state->hdr->revision >= 7) {
+		for (i = 0; (i < state->hdr->num_samplers) && (state->sz > 0); i++) {
+			ptr = next_sect(state, &sect_size);
+			dump_hex_ascii(ptr, sect_size, 0);
+			free(ptr);
+		}
+	}
+
+
+	for (i = 0; (i < state->hdr->num_varyings) && (state->sz > 0); i++) {
+		state->varyings[i] = next_sect(state, &sect_size);
+
+		/* hmm, for a3xx (or maybe just newer driver version), we have some
+		 * extra sections that don't seem useful, so skip these:
+		 */
+		while (!valid_type(state->varyings[i]->type_info)) {
+			dump_hex_ascii(state->varyings[i], sect_size, 0);
+			state->varyings[i] = next_sect(state, &sect_size);
+		}
+
+		clean_ascii(state->varyings[i]->name, sect_size - 16);
+		if (full_dump) {
+			printf("######## VARYING: (size %d)\n", sect_size);
+			dump_varying(state->varyings[i]);
+			dump_hex((char *)state->varyings[i], sect_size);
+		}
+	}
+
+	/* show up again for revision >= 14?? */
+	if (state->hdr->revision >= 14) {
+		for (i = 0; (i < state->hdr->num_varyings) && (state->sz > 0); i++) {
+			ptr = next_sect(state, &sect_size);
+			dump_hex_ascii(ptr, sect_size, 0);
+			free(ptr);
+		}
+	}
+
+	/* not sure exactly which revision started this, but seems at least
+	 * rev7 and rev8 implicitly include a new section for gl_FragColor:
+	 */
+	if (state->hdr->revision >= 7) {
+		/* I guess only one? */
+		state->outputs[0] = next_sect(state, &sect_size);
+
+		clean_ascii(state->outputs[0]->name, sect_size - 32);
+		if (full_dump) {
+			printf("######## OUTPUT: (size %d)\n", sect_size);
+			dump_output(state->outputs[0]);
+			dump_hex((char *)state->outputs[0], sect_size);
+		}
+	}
+
+	for (i = 0; (i < state->hdr->num_uniformblocks) && (state->sz > 0); i++) {
+		state->uniformblocks[i].header = next_sect(state, &sect_size);
+
+		clean_ascii(state->uniformblocks[i].header->name, sect_size - 40);
+		if (full_dump) {
+			printf("######## UNIFORM BLOCK: (size %d)\n", sect_size);
+			dump_uniformblock(state->uniformblocks[i].header);
+			dump_hex((char *)state->uniformblocks[i].header, sect_size);
+		}
+
+		/*
+		 * OpenGL ES 3.0 spec mandates a minimum amount of 16K members supported
+		 * a330 supports a minimum of 65K
+		 */
+		state->uniformblocks[i].members = malloc(state->uniformblocks[i].header->num_members * sizeof(void*));
+
+		int member = 0;
+		for (member = 0; (member < state->uniformblocks[i].header->num_members) && (state->sz > 0); member++) {
+			state->uniformblocks[i].members[member] = next_sect(state, &sect_size);
+
+			clean_ascii(state->uniformblocks[i].members[member]->name, sect_size - 56);
+			if (full_dump) {
+				printf("######## UNIFORM BLOCK MEMBER: (size %d)\n", sect_size);
+				dump_uniformblockmember(state->uniformblocks[i].members[member]);
+				dump_hex((char *)state->uniformblocks[i].members[member], sect_size);
+			}
+		}
+		/*
+		 * Qualcomm saves the UBO members twice for each UBO
+		 * Don't ask me why
+		 */
+		for (member = 0; (member < state->uniformblocks[i].header->num_members) && (state->sz > 0); member++) {
+			state->uniformblocks[i].members[member] = next_sect(state, &sect_size);
+
+			clean_ascii(state->uniformblocks[i].members[member]->name, sect_size - 56);
+			if (full_dump) {
+				printf("######## UNIFORM BLOCK MEMBER2: (size %d)\n", sect_size);
+				dump_uniformblockmember(state->uniformblocks[i].members[member]);
+				dump_hex((char *)state->uniformblocks[i].members[member], sect_size);
+			}
+		}
+	}
+
+	/* remember how many member tables were actually allocated; 'i' is
+	 * not a reliable index for the cleanup at the end of the function:
+	 */
+	nblocks = i;
+
+	if (gpu_id >= 300) {
+		dump_shaders_a3xx(state);
+	} else {
+		dump_shaders_a2xx(state);
+	}
+
+	if (full_dump) {
+		/* dump ascii version of shader program: */
+		ptr = next_sect(state, &sect_size);
+		printf("\n#######################################################\n");
+		printf("######## SHADER SRC: (size=%d)\n", sect_size);
+		dump_ascii(ptr, sect_size);
+		free(ptr);
+
+		/* dump remaining sections (there shouldn't be any): */
+		while (state->sz > 0) {
+			ptr = next_sect(state, &sect_size);
+			printf("######## section (size=%d)\n", sect_size);
+			printf("as hex:\n");
+			dump_hex(ptr, sect_size);
+			printf("as float:\n");
+			dump_float(ptr, sect_size);
+			printf("as ascii:\n");
+			dump_ascii(ptr, sect_size);
+			free(ptr);
+		}
+	}
+
+	/* cleanup the uniform block member tables we allocated */
+	for (i = 0; i < nblocks; i++) {
+		free(state->uniformblocks[i].members);
+		state->uniformblocks[i].members = NULL;
+	}
+}
+
+/* pgmdump entry point.
+ *
+ * Usage: pgmdump [--verbose] [--expand] [--short] [--dump-shaders] [--raw]
+ *                [--gpu300] <file>
+ *
+ * Three input modes, chosen in this order:
+ *  - --raw: the file is a 4-byte size followed by a program binary;
+ *  - a file not ending in .rd/.rd.gz: a raw shader (.vo/.fo for a2xx,
+ *    .vo3/.fo3/.co3 for a3xx+) that is disassembled directly;
+ *  - otherwise an .rd capture, read as a sequence of (type, size, payload)
+ *    records.
+ *
+ * Returns 0 on success, -1 on bad arguments or I/O failure; in raw-shader
+ * mode the disassembler's return value is passed through.
+ */
+int main(int argc, char **argv)
+{
+	enum rd_sect_type type = RD_NONE;
+	enum debug_t debug = 0;
+	void *buf = NULL;
+	int sz = 0;
+	struct io *io;
+	int raw_program = 0;
+	int status = 0;
+
+	/* lame argument parsing: */
+
+	while (1) {
+		if ((argc > 1) && !strcmp(argv[1], "--verbose")) {
+			debug |= PRINT_RAW | PRINT_VERBOSE;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--expand")) {
+			debug |= EXPAND_REPEAT;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--short")) {
+			/* only short dump, original shader, symbol table, and disassembly */
+			full_dump = 0;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--dump-shaders")) {
+			dump_shaders = 1;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--raw")) {
+			raw_program = 1;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--gpu300")) {
+			gpu_id = 320;
+			argv++;
+			argc--;
+			continue;
+		}
+		break;
+	}
+
+	if (argc != 2) {
+		fprintf(stderr, "usage: pgmdump [--verbose] [--expand] [--short] "
+				"[--dump-shaders] [--raw] [--gpu300] testlog.rd\n");
+		return -1;
+	}
+
+	disasm_set_debug(debug);
+
+	infile = argv[1];
+
+	io = io_open(infile);
+	if (!io) {
+		fprintf(stderr, "could not open: %s\n", infile);
+		return -1;
+	}
+
+	if (raw_program)
+	{
+		if ((io_readn(io, &sz, 4) <= 0) || (sz < 0)) {
+			fprintf(stderr, "could not read program size from: %s\n", infile);
+			return -1;
+		}
+
+		/* The payload is stored behind a 4-byte copy of the size, so the
+		 * buffer must hold 4 + sz bytes.
+		 *
+		 * note: allow hex dumps to go a bit past the end of the buffer..
+		 * might see some garbage, but better than missing the last few bytes..
+		 */
+		buf = calloc(1, (size_t)sz + 4 + 3);
+		if (!buf) {
+			fprintf(stderr, "out of memory\n");
+			return -1;
+		}
+		io_readn(io, (char *)buf + 4, sz);
+		(*(int*)buf) = sz;
+
+		struct state state = {
+				.buf = buf,
+				.sz = sz,
+		};
+		printf("############################################################\n");
+		printf("program:\n");
+		dump_program(&state);
+		printf("############################################################\n");
+		return 0;
+	}
+
+	/* figure out what sort of input we are dealing with: */
+	if (!(check_extension(infile, ".rd") || check_extension(infile, ".rd.gz"))) {
+		enum shader_t shader = ~0;
+		int ret;
+		if (check_extension(infile, ".vo")) {
+			shader = SHADER_VERTEX;
+		} else if (check_extension(infile, ".fo")) {
+			shader = SHADER_FRAGMENT;
+		} else if (check_extension(infile, ".vo3")) {
+		} else if (check_extension(infile, ".fo3")) {
+		} else if (check_extension(infile, ".co3")) {
+		} else {
+			fprintf(stderr, "invalid input file: %s\n", infile);
+			return -1;
+		}
+		buf = calloc(1, 100 * 1024);
+		if (!buf) {
+			fprintf(stderr, "out of memory\n");
+			return -1;
+		}
+		ret = io_readn(io, buf, 100 * 1024);
+		if (ret < 0) {
+			fprintf(stderr, "error: %m");
+			return -1;
+		}
+		if (shader != ~0) {
+			return disasm_a2xx(buf, ret/4, 0, shader);
+		} else {
+			/* disassembly does not depend on shader stage on a3xx+: */
+			return disasm_a3xx(buf, ret/4, 0, stdout, gpu_id);
+		}
+	}
+
+	while ((io_readn(io, &type, sizeof(type)) > 0) && (io_readn(io, &sz, 4) > 0)) {
+		free(buf);
+		buf = NULL;
+
+		/* a negative size can only come from a corrupt file */
+		if (sz < 0) {
+			fprintf(stderr, "invalid section size: %d\n", sz);
+			status = -1;
+			break;
+		}
+
+		/* note: allow hex dumps to go a bit past the end of the buffer..
+		 * might see some garbage, but better than missing the last few bytes..
+		 */
+		buf = calloc(1, (size_t)sz + 3);
+		if (!buf) {
+			fprintf(stderr, "out of memory\n");
+			status = -1;
+			break;
+		}
+		io_readn(io, buf, sz);
+
+		switch(type) {
+		case RD_TEST:
+			if (full_dump)
+				printf("test: %s\n", (char *)buf);
+			break;
+		case RD_VERT_SHADER:
+			printf("vertex shader:\n%s\n", (char *)buf);
+			break;
+		case RD_FRAG_SHADER:
+			printf("fragment shader:\n%s\n", (char *)buf);
+			break;
+		case RD_PROGRAM: {
+			struct state state = {
+					.buf = buf,
+					.sz = sz,
+			};
+			printf("############################################################\n");
+			printf("program:\n");
+			dump_program(&state);
+			printf("############################################################\n");
+			break;
+		}
+		case RD_GPU_ID:
+			gpu_id = *((unsigned int *)buf);
+			printf("gpu_id: %d\n", gpu_id);
+			break;
+		default:
+			break;
+		}
+	}
+
+	free(buf);
+	io_close(io);
+
+	return status;
+}
+
diff --git a/src/freedreno/decode/pgmdump2.c b/src/freedreno/decode/pgmdump2.c
new file mode 100644
index 0000000..7410bcd
--- /dev/null
+++ b/src/freedreno/decode/pgmdump2.c
@@ -0,0 +1,585 @@
+/*
+ * Copyright (c) 2018 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Decoder for "new" GL_OES_get_program_binary format.
+ *
+ * Overall structure is:
+ *
+ *   - header at top, contains, amongst other things, offsets of
+ *     per shader stage sections.
+ *   - per shader stage section (shader_info) starts with a header,
+ *     followed by a variable-length list of descriptors.  Each
+ *     descriptor has a type/count/size plus offset from the start
+ *     of shader_info section where the data is found
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stddef.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "redump.h"
+#include "disasm.h"
+#include "io.h"
+#include "util.h"
+
+/* Tool-wide settings: */
+const char *infile;          /* NOTE(review): not static, unlike the other settings here */
+static int dump_full = 0;
+static int dump_offsets = 0; /* when set, decoded fields are prefixed with their buffer offset */
+static int gpu_id = 320;     /* passed to the disassembler; also selects shaderdb rounding */
+static int shaderdb = 0;     /* output shaderdb style traces to stderr */
+
+/* Decoder state shared by all decode_*() helpers and the field macros. */
+struct state {
+	char *buf;      /* start of the program binary; field offsets are printed relative to it */
+	int sz;         /* size of buf in bytes; used to bounds-check offsets */
+	int lvl;        /* current nesting level, passed to tab() for indentation */
+
+	/* current shader_info section, some offsets calculated relative to
+	 * this, rather than relative to start of buffer.
+	 */
+	void *shader;
+
+	/* size of each entry within a shader_descriptor_blk: */
+	int desc_size;
+
+	const char *shader_type;    /* stage label ("FRAG", "VERT", "BVERT") used in shaderdb output */
+	int full_regs;              /* copied from the last decoded shader_config */
+	int half_regs;              /* copied from the last decoded shader_config */
+};
+
+/* Structs overlaid on the binary must have no compiler-inserted padding. */
+#define PACKED __attribute__((__packed__))
+
+/* Field-printing helpers.  All of them expect a 'struct state *state'
+ * variable to be in scope at the point of use, 's' to be a pointer to the
+ * struct being decoded and 'field' to be one of its member names (which is
+ * also printed as the label).
+ */
+
+/* print the field's offset from the start of the buffer, if enabled */
+#define OFF(field) do {                                               \
+		if (dump_offsets)                                             \
+			printf("%08x: ", (uint32_t)((char *)&field - state->buf));\
+	} while (0)
+
+/* decode field as hex */
+#define X(s, field)  do {                                             \
+		OFF(s->field);                                                \
+		printf("%s%12s:\t0x%x\n", tab(state->lvl), #field, s->field); \
+	} while (0)
+
+/* decode field as digit */
+#define D(s, field)  do {                                             \
+		OFF(s->field);                                                \
+		printf("%s%12s:\t%u\n", tab(state->lvl), #field, s->field);   \
+	} while (0)
+
+/* decode field as float/hex */
+#define F(s, field)  do {                                             \
+		OFF(s->field);                                                \
+		printf("%s%12s:\t%f (0x%0x)\n", tab(state->lvl), #field,      \
+				d2f(s->field), s->field);                             \
+	} while (0)
+
+/* decode field as register: (type is 'r' or 'c')
+ * the low two bits select the component, the remaining bits the register
+ */
+#define R(s, field, type) do {                                        \
+		OFF(s->field);                                                \
+		printf("%s%12s:\t%c%u.%c\n", tab(state->lvl), #field, type,   \
+				(s->field >> 2), "xyzw"[s->field & 0x3]);             \
+	} while (0)
+
+/* decode inline string (presumably null terminated?) */
+#define S(s, field)  do {                                             \
+		OFF(s->field);                                                \
+		printf("%s%12s:\t%s\n", tab(state->lvl), #field, s->field);   \
+	} while (0)
+
+/* decode string-table string
+ * (placeholder: expands to the bare token TODO, so it cannot be used yet)
+ */
+#define T(s, field)  TODO
+
+/* decode field as unknown
+ * 'start' and 'end' are the hex byte offsets of the first and last dword;
+ * they select the member named unk_<start>_<end> and give the dword count.
+ */
+#define U(s, start, end) \
+	dump_unknown(state, s->unk_ ## start ## _ ## end, 0x ## start, (4 + 0x ## end - 0x ## start) / 4)
+
+/* decode field as offset to other section
+ * prints the offset, then calls decode_<type>() on the data found at that
+ * offset from the start of the buffer, one indentation level deeper
+ */
+#define O(s, field, type) do {              \
+		X(s, field);                        \
+		assert(s->field < state->sz);       \
+		void *_p = &state->buf[s->field];   \
+		state->lvl++;                       \
+		decode_ ## type (state, _p);        \
+		state->lvl--;                       \
+	} while (0)
+
+/* Forward declarations: decode_header() reaches decode_shader_info()
+ * through the O() macro before the function is defined.
+ */
+struct shader_info;
+static void decode_shader_info(struct state *state, struct shader_info *info);
+
+/* Print 'n' undecoded dwords starting at 'buf', one per line.
+ *
+ * Each line shows the nominal offset (start + 4 * index), the value as hex,
+ * its four bytes as printable ASCII ('.' for anything else) and the value
+ * reinterpreted as a float.  If the value could be an offset to a string
+ * inside the buffer, that string is appended as a hint.
+ */
+static void dump_unknown(struct state *state, void *buf, unsigned start, unsigned n)
+{
+	uint32_t *words = buf;
+	uint8_t *bytes = buf;
+	unsigned idx, k;
+
+	for (idx = 0; idx < n; idx++) {
+		uint32_t val = words[idx];
+
+		if (dump_offsets)
+			printf("%08x:", (uint32_t)((char *)&words[idx] - state->buf));
+
+		printf("%s        %04x:\t%08x", tab(state->lvl), start + idx * 4, val);
+
+		printf("\t|");
+		for (k = 0; k < 4; k++) {
+			uint8_t ch = bytes[idx * 4 + k];
+			printf("%c", (isascii(ch) && !iscntrl(ch)) ? ch : '.');
+		}
+		printf("|\t%f", d2f(val));
+
+		/* TODO maybe scan for first non-null and non-ascii char starting from
+		 * end of shader binary to (roughly) establish the start of the string
+		 * table.. that would be a bit better filter for deciding if something
+		 * might be a pointer into the string table.  Also, the previous char
+		 * to what it points to should probably be null.
+		 */
+		if (val < state->sz) {
+			char *str = &state->buf[val];
+
+			if (isascii(str[0]) && (strlen(str) > 2) && isascii(str[1]))
+				printf("\t<== %s", str);
+		}
+
+		printf("\n");
+	}
+}
+
+/* Top-level header at the start of the program binary.  The unk_<a>_<b>
+ * members are undecoded dwords; their names give the byte offsets of the
+ * first and last dword they cover (see the U() macro).
+ */
+struct PACKED header {
+	uint32_t version;   /* I guess, always b10bcace ? */
+	uint32_t unk_0004_0014[5];
+	uint32_t size;
+	uint32_t size2;     /* just to be sure? */
+	uint32_t unk_0020_0020[1];
+	uint32_t chksum;    /* I guess?  Small changes seem to result in big diffs here */
+	uint32_t unk_0028_0050[11];
+	uint32_t fs_info;   /* offset of FS shader_info section */
+	uint32_t unk_0058_0090[15];
+	uint32_t vs_info;   /* offset of VS shader_info section */
+	uint32_t unk_0098_00b0[7];
+	uint32_t vs_info2;  /* offset of VS shader_info section (again?) */
+	uint32_t unk_00b8_0110[23];
+	uint32_t bs_info;   /* offset of binning shader_info section */
+};
+
+/* Print the top-level header field by field, in layout order, descending
+ * into the fragment, vertex and binning shader_info sections as their
+ * offsets are reached.  state->shader_type is set just before each descent
+ * so nested output can be labelled with the stage.
+ */
+static void decode_header(struct state *state, struct header *hdr)
+{
+	X(hdr, version);
+	U(hdr, 0004, 0014);
+	X(hdr, size);
+	X(hdr, size2);
+	U(hdr, 0020, 0020);
+	X(hdr, chksum);
+	U(hdr, 0028, 0050);
+	state->shader_type = "FRAG";
+	O(hdr, fs_info, shader_info);
+	U(hdr, 0058, 0090);
+	state->shader_type = "VERT";
+	O(hdr, vs_info, shader_info);
+	U(hdr, 0098, 00b0);
+	assert(hdr->vs_info == hdr->vs_info2);  /* not sure what this is if it is ever different */
+	X(hdr, vs_info2);
+	U(hdr, 00b8, 0110);
+	state->shader_type = "BVERT";
+	O(hdr, bs_info, shader_info);
+
+	/* not sure how much of the rest of contents before start of fs_info
+	 * is the header, vs other things.. just dump it all as unknown for
+	 * now:
+	 */
+	/* NOTE(review): the dword count assumes fs_info >= sizeof(*hdr) */
+	dump_unknown(state, (void *)hdr + sizeof(*hdr),
+		sizeof(*hdr), (hdr->fs_info - sizeof(*hdr)) / 4);
+}
+
+/* Record of an ENTRY_POINT descriptor block. */
+struct PACKED shader_entry_point {
+	/* entry point name, ie. "main" of TBD length, followed by unknown */
+	char name[8];
+};
+
+/* Print the entry point name as a string. */
+static void decode_shader_entry_point(struct state *state,
+		struct shader_entry_point *e)
+{
+	S(e, name);
+}
+
+/* Leading, decoded part of a SHADER_CONFIG record; the full record may be
+ * longer (see decode_shader_config()).
+ */
+struct PACKED shader_config {
+	uint32_t unk_0000_0008[3];
+	uint32_t full_regs;
+	uint32_t half_regs;
+};
+
+/* Print a shader_config record and remember its register counts in 'state'
+ * for the shaderdb summary.  Relies on state->desc_size holding the size of
+ * the whole record, which the caller sets up.
+ */
+static void decode_shader_config(struct state *state, struct shader_config *cfg)
+{
+	U(cfg, 0000, 0008);
+	D(cfg, full_regs);
+	D(cfg, half_regs);
+
+	state->full_regs = cfg->full_regs;
+	state->half_regs = cfg->half_regs;
+
+	/* dump rest of unknown (size differs btwn versions) */
+	dump_unknown(state, (void *)cfg + sizeof(*cfg), sizeof(*cfg),
+			(state->desc_size - sizeof(*cfg))/4);
+}
+
+/* Record used for SHADER_INPUT, SHADER_OUTPUT and INTERNAL descriptor
+ * blocks (see decode_shader_descriptor_block()).
+ */
+struct PACKED shader_io_block {
+	/* name of TBD length followed by unknown.. 42 dwords total */
+	char name[20];
+	uint32_t unk_0014_00a4[37];
+};
+
+/* Print an input/output record: its name, then the undecoded remainder. */
+static void decode_shader_io_block(struct state *state,
+		struct shader_io_block *io)
+{
+	S(io, name);
+	U(io, 0014, 00a4);
+}
+
+/* Record of a CONSTANTS descriptor block: ten dwords in total. */
+struct PACKED shader_constant_block {
+	uint32_t value;             /* printed as float and as hex */
+	uint32_t unk_0004_000c[3];
+	uint32_t regid;             /* printed as a 'c' register with component */
+	uint32_t unk_0014_0024[5];
+};
+
+/* Print a constant record: value, register id and the undecoded dwords. */
+static void decode_shader_constant_block(struct state *state,
+		struct shader_constant_block *c)
+{
+	F(c, value);
+	U(c, 0004, 000c);
+	R(c, regid, 'c');
+	U(c, 0014, 0024);
+}
+
+/* Values of shader_descriptor_block::type handled by
+ * decode_shader_descriptor_block().
+ *
+ * NOTE(review): as written this declares a file-scope *variable* named
+ * shader_info_block_type of an anonymous enum type, not an enum tag or
+ * typedef -- confirm whether that is intended.
+ */
+enum {
+	ENTRY_POINT    =  0,     /* shader_entry_point */
+	SHADER_CONFIG  =  1,     /* XXX placeholder name */
+	SHADER_INPUT   =  2,     /* shader_io_block */
+	SHADER_OUTPUT  =  3,     /* shader_io_block */
+	CONSTANTS      =  6,     /* shader_constant_block */
+	INTERNAL       =  8,     /* internal input, like bary.f coord */
+	SHADER         = 10,
+} shader_info_block_type;
+
+/* Refers to location of some type of records, with an offset relative to
+ * start of shader_info block.
+ *
+ * The size of one record is size / count; the decoder asserts that the
+ * division is exact.
+ */
+struct PACKED shader_descriptor_block {
+	uint32_t type;      /* block type */
+	uint32_t offset;    /* offset (relative to start of shader_info block) */
+	uint32_t size;      /* size in bytes */
+	uint32_t count;     /* number of records */
+	uint32_t unk_0010_0010[1];
+};
+
+/* Print one descriptor block and then decode every record it refers to.
+ *
+ * The records live at state->shader + blk->offset and are blk->size /
+ * blk->count bytes each; that per-record size is published in
+ * state->desc_size for the record decoders.  A block with count == 0 has no
+ * records and leaves desc_size at 0.
+ *
+ * SHADER blocks are special: the whole block is handed to the disassembler
+ * in one go, and with shaderdb enabled a one-line statistics summary is
+ * written to stderr.
+ */
+static void decode_shader_descriptor_block(struct state *state,
+		struct shader_descriptor_block *blk)
+{
+	D(blk, type);
+	X(blk, offset);
+	D(blk, size);
+	D(blk, count);
+	U(blk, 0010, 0010);
+
+	/* offset relative to current shader block: */
+	void *ptr = state->shader + blk->offset;
+
+	/* only divide when there is at least one record; an empty block
+	 * would otherwise be a division by zero:
+	 */
+	if (blk->count == 0) {
+		assert(blk->size == 0);
+		state->desc_size = 0;
+	} else {
+		assert((blk->size % blk->count) == 0);
+		state->desc_size = blk->size / blk->count;
+	}
+
+	state->lvl++;
+	for (unsigned i = 0; i < blk->count; i++) {
+		switch (blk->type) {
+		case ENTRY_POINT:
+			printf("%sentry point %u:\n", tab(state->lvl-1), i);
+			decode_shader_entry_point(state, ptr);
+			break;
+		case SHADER_CONFIG:
+			printf("%sconfig %u:\n", tab(state->lvl-1), i);
+			decode_shader_config(state, ptr);
+			break;
+		case SHADER_INPUT:
+			printf("%sinput %u:\n", tab(state->lvl-1), i);
+			decode_shader_io_block(state, ptr);
+			break;
+		case SHADER_OUTPUT:
+			printf("%soutput %u:\n", tab(state->lvl-1), i);
+			decode_shader_io_block(state, ptr);
+			break;
+		case INTERNAL:
+			printf("%sinternal input %u:\n", tab(state->lvl-1), i);
+			decode_shader_io_block(state, ptr);
+			break;
+		case CONSTANTS:
+			printf("%sconstant %u:\n", tab(state->lvl-1), i);
+			decode_shader_constant_block(state, ptr);
+			break;
+		case SHADER: {
+			struct shader_stats stats;
+			printf("%sshader %u:\n", tab(state->lvl-1), i);
+			disasm_a3xx_stat(ptr, blk->size/4, state->lvl, stdout, gpu_id, &stats);
+			if (shaderdb) {
+				unsigned dwords = 2 * stats.instlen;
+
+				if (gpu_id >= 400) {
+					dwords = ALIGN(dwords, 16 * 2);
+				} else {
+					dwords = ALIGN(dwords, 4 * 2);
+				}
+
+				unsigned half_regs = state->half_regs;
+				unsigned full_regs = state->full_regs;
+
+				/* On a6xx w/ merged/conflicting half and full regs, the
+				 * full_regs footprint will be max of full_regs and half
+				 * of half_regs.. we only care about which value is higher.
+				 */
+				if (gpu_id >= 600) {
+					/* footprint of half_regs in units of full_regs: */
+					unsigned half_full = (half_regs + 1) / 2;
+					if (half_full > full_regs)
+						full_regs = half_full;
+					half_regs = 0;
+				}
+
+				fprintf(stderr,
+						"%s shader: %u inst, %u nops, %u non-nops, %u dwords, "
+						"%u half, %u full, %u constlen, "
+						"%u (ss), %u (sy), %d max_sun, %d loops\n",
+					state->shader_type, stats.instructions,
+					stats.nops, stats.instructions - stats.nops,
+					dwords, half_regs, full_regs,
+					stats.constlen, stats.ss, stats.sy,
+					0, 0);  /* max_sun or loops not possible */
+			}
+			/* this is a special case in a way, blk->count is # of
+			 * instructions but disasm_a3xx() decodes all instructions,
+			 * so just bail.
+			 */
+			i = blk->count;
+			break;
+		}
+		default:
+			dump_unknown(state, ptr, 0, state->desc_size/4);
+			break;
+		}
+		ptr += state->desc_size;
+	}
+	state->lvl--;
+}
+
+/* there looks like one of these per shader, followed by "main" and
+ * some more info, and then the shader itself.
+ *
+ * desc_off is a byte offset from the start of this struct to an array of
+ * num_blocks shader_descriptor_block entries; whatever lies between the
+ * end of this struct and desc_off is dumped as unknown by the decoder.
+ */
+struct PACKED shader_info {
+	uint32_t unk_0000_0010[5];
+	uint32_t desc_off;       /* offset to first descriptor block */
+	uint32_t num_blocks;     /* number of descriptor blocks at desc_off */
+};
+
+/* Decode a shader_info header and every descriptor block it lists.
+ *
+ * Records info in state->shader so that the descriptor decoder can resolve
+ * offsets relative to it.  desc_off is expected to be dword aligned.
+ */
+static void decode_shader_info(struct state *state, struct shader_info *info)
+{
+	struct shader_descriptor_block *desc;
+	unsigned idx;
+
+	assert((info->desc_off % 4) == 0);
+
+	U(info, 0000, 0010);
+	X(info, desc_off);
+	D(info, num_blocks);
+
+	/* whatever sits between the fixed header and the descriptors: */
+	dump_unknown(state, &info[1], 0, (info->desc_off - sizeof(*info))/4);
+
+	state->shader = info;
+
+	/* descriptor array starts desc_off bytes into the info block: */
+	desc = (struct shader_descriptor_block *)((char *)info + info->desc_off);
+
+	idx = 0;
+	while (idx < info->num_blocks) {
+		printf("%sdescriptor %u:\n", tab(state->lvl), idx);
+		state->lvl++;
+		decode_shader_descriptor_block(state, &desc[idx]);
+		state->lvl--;
+		idx++;
+	}
+}
+
+/* Dump one program blob held in state->buf / state->sz.  With dump_full
+ * set the whole buffer is first passed to dump_unknown(); the structured
+ * decode then starts from the header at the beginning of the buffer.
+ */
+static void dump_program(struct state *state)
+{
+	if (dump_full) {
+		dump_unknown(state, state->buf, 0, state->sz/4);
+	}
+
+	decode_header(state, (struct header *)state->buf);
+}
+
+/* Entry point.
+ *
+ * Leading "--" options are consumed first; exactly one input path must
+ * remain.  The input is then handled in one of three ways:
+ *   - with --raw, the file is a 4 byte size followed by one program blob;
+ *   - if check_extension() accepts neither ".rd" nor ".rd.gz", up to
+ *     100KiB is read and handed straight to disasm_a3xx();
+ *   - otherwise the file is read as a stream of (type, size, payload)
+ *     sections.
+ *
+ * Returns 0 on success and -1 on usage, allocation or read errors; in the
+ * plain disassembly mode the result of disasm_a3xx() is returned.
+ */
+int main(int argc, char **argv)
+{
+	enum rd_sect_type type = RD_NONE;
+	enum debug_t debug = 0;
+	void *buf = NULL;
+	int sz;
+	struct io *io;
+	int raw_program = 0;
+
+	/* lame argument parsing: */
+
+	while (1) {
+		if ((argc > 1) && !strcmp(argv[1], "--verbose")) {
+			debug |= PRINT_RAW | PRINT_VERBOSE;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--expand")) {
+			debug |= EXPAND_REPEAT;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--full")) {
+			/* full dump; the default is only a short dump: original
+			 * shader, symbol table, and disassembly
+			 */
+			dump_full = 1;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--dump-offsets")) {
+			dump_offsets = 1;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--raw")) {
+			raw_program = 1;
+			argv++;
+			argc--;
+			continue;
+		}
+		if ((argc > 1) && !strcmp(argv[1], "--shaderdb")) {
+			shaderdb = 1;
+			argv++;
+			argc--;
+			continue;
+		}
+		break;
+	}
+
+	if (argc != 2) {
+		fprintf(stderr, "usage: pgmdump2 [--verbose] [--expand] [--full] [--dump-offsets] [--raw] [--shaderdb] testlog.rd\n");
+		return -1;
+	}
+
+	disasm_set_debug(debug);
+
+	infile = argv[1];
+
+	io = io_open(infile);
+	if (!io) {
+		fprintf(stderr, "could not open: %s\n", infile);
+		return -1;
+	}
+
+	if (raw_program)
+	{
+		/* sz would be used uninitialized if the size word is missing: */
+		if ((io_readn(io, &sz, 4) <= 0) || (sz < 0)) {
+			fprintf(stderr, "could not read program size: %s\n", infile);
+			return -1;
+		}
+
+		/* The payload is placed after a 4 byte size word that is
+		 * re-created at the start of the buffer, so the allocation has
+		 * to cover those 4 bytes in addition to sz.  The extra 3 bytes
+		 * allow hex dumps to go a bit past the end of the buffer..
+		 * might see some garbage, but better than missing the last few
+		 * bytes..
+		 */
+		buf = calloc(1, sz + 4 + 3);
+		if (!buf) {
+			fprintf(stderr, "could not allocate %d bytes\n", sz);
+			return -1;
+		}
+		io_readn(io, buf + 4, sz);
+		(*(int*)buf) = sz;
+
+		struct state state = {
+				.buf = buf,
+				.sz = sz,
+		};
+		printf("############################################################\n");
+		printf("program:\n");
+		dump_program(&state);
+		printf("############################################################\n");
+		return 0;
+	}
+
+	/* figure out what sort of input we are dealing with: */
+	if (!(check_extension(infile, ".rd") || check_extension(infile, ".rd.gz"))) {
+		int ret;
+		buf = calloc(1, 100 * 1024);
+		if (!buf) {
+			fprintf(stderr, "could not allocate %d bytes\n", 100 * 1024);
+			return -1;
+		}
+		ret = io_readn(io, buf, 100 * 1024);
+		if (ret < 0) {
+			fprintf(stderr, "error: %m");
+			return -1;
+		}
+		return disasm_a3xx(buf, ret/4, 0, stdout, gpu_id);
+	}
+
+	while ((io_readn(io, &type, sizeof(type)) > 0) && (io_readn(io, &sz, 4) > 0)) {
+		free(buf);
+
+		/* note: allow hex dumps to go a bit past the end of the buffer..
+		 * might see some garbage, but better than missing the last few bytes..
+		 */
+		buf = calloc(1, sz + 3);
+		if (!buf) {
+			fprintf(stderr, "could not allocate %d bytes\n", sz);
+			return -1;
+		}
+		io_readn(io, buf, sz);
+
+		switch(type) {
+		case RD_TEST:
+			if (dump_full)
+				printf("test: %s\n", (char *)buf);
+			break;
+		case RD_VERT_SHADER:
+			printf("vertex shader:\n%s\n", (char *)buf);
+			break;
+		case RD_FRAG_SHADER:
+			printf("fragment shader:\n%s\n", (char *)buf);
+			break;
+		case RD_PROGRAM: {
+			struct state state = {
+					.buf = buf,
+					.sz = sz,
+			};
+			printf("############################################################\n");
+			printf("program:\n");
+			dump_program(&state);
+			printf("############################################################\n");
+			break;
+		}
+		case RD_GPU_ID:
+			gpu_id = *((unsigned int *)buf);
+			printf("gpu_id: %d\n", gpu_id);
+			break;
+		default:
+			break;
+		}
+	}
+
+	io_close(io);
+
+	return 0;
+}
diff --git a/src/freedreno/decode/redump.h b/src/freedreno/decode/redump.h
new file mode 100644
index 0000000..c77344e
--- /dev/null
+++ b/src/freedreno/decode/redump.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright © 2012 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef REDUMP_H_
+#define REDUMP_H_
+
+/* Section tags used in .rd capture files.  Values are assigned implicitly
+ * from 0 in declaration order, so inserting or reordering entries changes
+ * the numeric value of every tag after it.
+ */
+enum rd_sect_type {
+	RD_NONE,
+	RD_TEST,       /* ascii text */
+	RD_CMD,        /* ascii text */
+	RD_GPUADDR,    /* u32 gpuaddr, u32 size */
+	RD_CONTEXT,    /* raw dump */
+	RD_CMDSTREAM,  /* raw dump */
+	RD_CMDSTREAM_ADDR, /* gpu addr of cmdstream */
+	RD_PARAM,      /* u32 param_type, u32 param_val, u32 bitlen */
+	RD_FLUSH,      /* empty, clear previous params */
+	RD_PROGRAM,    /* shader program, raw dump */
+	RD_VERT_SHADER,
+	RD_FRAG_SHADER,
+	RD_BUFFER_CONTENTS,
+	RD_GPU_ID,
+};
+
+/* RD_PARAM types (the param_type word of an RD_PARAM section): */
+enum rd_param_type {
+	RD_PARAM_SURFACE_WIDTH,
+	RD_PARAM_SURFACE_HEIGHT,
+	RD_PARAM_SURFACE_PITCH,
+	RD_PARAM_COLOR,
+	RD_PARAM_BLIT_X,
+	RD_PARAM_BLIT_Y,
+	RD_PARAM_BLIT_WIDTH,
+	RD_PARAM_BLIT_HEIGHT,
+	RD_PARAM_BLIT_X2,      /* BLIT_X + BLIT_WIDTH */
+	RD_PARAM_BLIT_Y2,      /* presumably BLIT_Y + BLIT_HEIGHT (the comment used to say BLIT_WIDTH) -- TODO confirm */
+};
+
+/* Capture hooks.  They are declared weak so that code can test at runtime
+ * whether an implementation is linked in before calling them.
+ */
+void rd_start(const char *name, const char *fmt, ...) __attribute__((weak));
+void rd_end(void) __attribute__((weak));
+void rd_write_section(enum rd_sect_type type, const void *buf, int sz) __attribute__((weak));
+
+/* for code that should run with and without libwrap, use the following
+ * macros which check if the fxns are present before calling
+ */
+#define RD_START(n,f,...)        do { if (rd_start) rd_start(n,f,##__VA_ARGS__); } while (0)
+#define RD_END()                 do { if (rd_end) rd_end(); } while (0)
+#define RD_WRITE_SECTION(t,b,s)  do { if (rd_write_section) rd_write_section(t,b,s); } while (0)
+
+/* Element count of a true array; gives a wrong answer for a pointer. */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+/* Round v up to a multiple of a.  The mask form only gives the right
+ * answer when a is a power of two.  Any earlier ALIGN definition is
+ * replaced.
+ */
+#undef ALIGN
+#define ALIGN(v,a) (((v) + (a) - 1) & ~((a) - 1))
+
+/* NOTE: both of these evaluate their arguments more than once, so avoid
+ * arguments with side effects.
+ */
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+
+#endif /* REDUMP_H_ */
diff --git a/src/freedreno/decode/rnnutil.c b/src/freedreno/decode/rnnutil.c
new file mode 100644
index 0000000..7891597
--- /dev/null
+++ b/src/freedreno/decode/rnnutil.c
@@ -0,0 +1,217 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#include "rnnutil.h"
+
+/* Pick the domain to decode regbase with: dom[0] if rnndec_checkaddr()
+ * accepts the address there, dom[1] otherwise.
+ */
+static struct rnndomain *finddom(struct rnn *rnn, uint32_t regbase)
+{
+	return rnndec_checkaddr(rnn->vc, rnn->dom[0], regbase, 0)
+			? rnn->dom[0]
+			: rnn->dom[1];
+}
+
+/* Initialise an already-allocated rnn: creates the database and the decode
+ * context(s).  With nocolor set, vc and vc_nocolor refer to the same
+ * context; otherwise vc is a second context using the default colors.
+ */
+void _rnn_init(struct rnn *rnn, int nocolor)
+{
+	rnn_init();
+
+	rnn->db = rnn_newdb();
+
+	/* the colorless context always exists.. */
+	rnn->vc_nocolor = rnndec_newcontext(rnn->db);
+	rnn->vc_nocolor->colors = &envy_null_colors;
+
+	/* ..and doubles as the main context when color is not wanted: */
+	if (nocolor) {
+		rnn->vc = rnn->vc_nocolor;
+		return;
+	}
+
+	rnn->vc = rnndec_newcontext(rnn->db);
+	rnn->vc->colors = &envy_def_colors;
+}
+
+/* Allocate and initialise a zeroed rnn.  Returns NULL if the allocation
+ * fails.
+ */
+struct rnn *rnn_new(int nocolor)
+{
+	struct rnn *rnn = calloc(1, sizeof(*rnn));
+
+	if (rnn)
+		_rnn_init(rnn, nocolor);
+
+	return rnn;
+}
+
+/* Parse the register database in file and select the domain(s) used for
+ * lookups.
+ *
+ * dom[0] is the requested domain.  For A2XX and A3XX the common "AXXX"
+ * domain is used as dom[1]; for everything else dom[1] is simply dom[0]
+ * again.  The domain name is also registered as the "chip" variant on the
+ * decode context(s).  The domain pointer is stored in rnn->variant, so it
+ * must outlive rnn.
+ */
+static void init(struct rnn *rnn, char *file, char *domain)
+{
+	/* prepare rnn stuff for lookup */
+	rnn_parsefile(rnn->db, file);
+	rnn_prepdb(rnn->db);
+	rnn->dom[0] = rnn_finddomain(rnn->db, domain);
+	if ((strcmp(domain, "A2XX") == 0) || (strcmp(domain, "A3XX") == 0)) {
+		rnn->dom[1] = rnn_finddomain(rnn->db, "AXXX");
+	} else {
+		rnn->dom[1] = rnn->dom[0];
+	}
+	/* Report a missing primary domain regardless of dom[1]: when dom[1]
+	 * merely aliases dom[0], additionally requiring dom[1] to be set
+	 * would silence the message in exactly the case it is needed.
+	 */
+	if (!rnn->dom[0]) {
+		fprintf(stderr, "Could not find domain %s in %s\n", domain, file);
+	}
+	rnn->variant = domain;
+
+	rnndec_varadd(rnn->vc, "chip", domain);
+	if (rnn->vc != rnn->vc_nocolor)
+		rnndec_varadd(rnn->vc_nocolor, "chip", domain);
+}
+
+/* Load an explicitly named database file and domain; public wrapper
+ * around init().
+ */
+void rnn_load_file(struct rnn *rnn, char *file, char *domain)
+{
+	init(rnn, file, domain);
+}
+
+/* Load the register database matching gpuname.  The name is searched for
+ * the substrings "a2" .. "a6" in that order and the first hit decides the
+ * file/domain; if nothing matches, nothing is loaded.
+ */
+void rnn_load(struct rnn *rnn, const char *gpuname)
+{
+	static const struct {
+		const char *key;
+		char *file;
+		char *domain;
+	} gens[] = {
+		{ "a2", "adreno/a2xx.xml", "A2XX" },
+		{ "a3", "adreno/a3xx.xml", "A3XX" },
+		{ "a4", "adreno/a4xx.xml", "A4XX" },
+		{ "a5", "adreno/a5xx.xml", "A5XX" },
+		{ "a6", "adreno/a6xx.xml", "A6XX" },
+	};
+	unsigned i;
+
+	for (i = 0; i < sizeof(gens) / sizeof(gens[0]); i++) {
+		if (!strstr(gpuname, gens[i].key))
+			continue;
+		init(rnn, gens[i].file, gens[i].domain);
+		return;
+	}
+}
+
+/* Look up a register offset by name, trying dom[0] first and falling back
+ * to dom[1] when the first lookup yields 0.
+ */
+uint32_t rnn_regbase(struct rnn *rnn, const char *name)
+{
+	uint32_t base = rnndec_decodereg(rnn->vc_nocolor, rnn->dom[0], name);
+
+	if (base)
+		return base;
+
+	return rnndec_decodereg(rnn->vc_nocolor, rnn->dom[1], name);
+}
+
+/* Return the name of the register at regbase, or NULL if the address
+ * cannot be decoded.  color selects the colorized context.
+ *
+ * The result points at a static buffer that is overwritten by the next
+ * call, so it must be consumed or copied before calling again.  Names that
+ * do not fit the buffer are truncated.
+ */
+const char *rnn_regname(struct rnn *rnn, uint32_t regbase, int color)
+{
+	static char buf[128];
+	struct rnndecaddrinfo *info;
+
+	info = rnndec_decodeaddr(color ? rnn->vc : rnn->vc_nocolor,
+			finddom(rnn, regbase), regbase, 0);
+	if (!info)
+		return NULL;
+
+	/* bounded copy: the decoded name has no guaranteed upper length */
+	snprintf(buf, sizeof(buf), "%s", info->name);
+	free(info->name);
+	free(info);
+	return buf;
+}
+
+/* Decode regbase with the main (possibly colorized) context.  The callers
+ * in this code base release the result with free(info->name) followed by
+ * free(info).
+ */
+struct rnndecaddrinfo *rnn_reginfo(struct rnn *rnn, uint32_t regbase)
+{
+	return rnndec_decodeaddr(rnn->vc, finddom(rnn, regbase), regbase, 0);
+}
+
+/* Map val to its symbolic name within the enum called name.  Only entries
+ * with a valid value that also match the current variant are considered.
+ * Returns NULL if the enum or a matching value is not found.
+ */
+const char *rnn_enumname(struct rnn *rnn, const char *name, uint32_t val)
+{
+	struct rnndeccontext *ctx = rnn->vc;
+	struct rnnenum *en = rnn_findenum(ctx->db, name);
+	int idx;
+
+	if (!en)
+		return NULL;
+
+	for (idx = 0; idx < en->valsnum; idx++) {
+		struct rnnvalue *cand = en->vals[idx];
+
+		if (!cand->valvalid)
+			continue;
+		if (cand->value != val)
+			continue;
+		if (rnndec_varmatch(ctx, &cand->varinfo))
+			return cand->name;
+	}
+
+	return NULL;
+}
+
+/* Linear search of a domain's direct sub-elements for an exact name
+ * match; NULL if there is none.
+ */
+static struct rnndelem *regelem(struct rnndomain *domain, const char *name)
+{
+	int idx = 0;
+
+	while (idx < domain->subelemsnum) {
+		struct rnndelem *cand = domain->subelems[idx];
+
+		if (strcmp(cand->name, name) == 0)
+			return cand;
+		idx++;
+	}
+
+	return NULL;
+}
+
+/* Find an element by name, preferring dom[0] over dom[1]: */
+struct rnndelem *rnn_regelem(struct rnn *rnn, const char *name)
+{
+	struct rnndelem *found = regelem(rnn->dom[0], name);
+
+	if (!found)
+		found = regelem(rnn->dom[1], name);
+
+	return found;
+}
+
+/* Linear search of a domain's direct sub-elements for one whose offset
+ * equals offset exactly; NULL if there is none.
+ */
+static struct rnndelem *regoff(struct rnndomain *domain, uint32_t offset)
+{
+	int idx = 0;
+
+	while (idx < domain->subelemsnum) {
+		struct rnndelem *cand = domain->subelems[idx];
+
+		if (cand->offset == offset)
+			return cand;
+		idx++;
+	}
+
+	return NULL;
+}
+
+/* Find an element by offset, preferring dom[0] over dom[1]: */
+struct rnndelem *rnn_regoff(struct rnn *rnn, uint32_t offset)
+{
+	struct rnndelem *found = regoff(rnn->dom[0], offset);
+
+	if (!found)
+		found = regoff(rnn->dom[1], offset);
+
+	return found;
+}
+
+/* Store regval in *val and report how the caller should interpret it.
+ * For the simple scalar types the type from info is returned unchanged;
+ * everything else (including the fixed point types, which are still TODO)
+ * yields RNN_TTYPE_INVALID.  *val is written in every case.
+ */
+enum rnnttype rnn_decodelem(struct rnn *rnn, struct rnntypeinfo *info,
+		uint32_t regval, union rnndecval *val)
+{
+	enum rnnttype result = RNN_TTYPE_INVALID;
+
+	val->u = regval;
+
+	switch (info->type) {
+	case RNN_TTYPE_BOOLEAN:
+	case RNN_TTYPE_FLOAT:
+	case RNN_TTYPE_UINT:
+	case RNN_TTYPE_INT:
+	case RNN_TTYPE_HEX:
+	case RNN_TTYPE_ENUM:
+	case RNN_TTYPE_INLINE_ENUM:
+		result = info->type;
+		break;
+	case RNN_TTYPE_FIXED:
+	case RNN_TTYPE_UFIXED:
+		/* TODO */
+	default:
+		break;
+	}
+
+	return result;
+}
diff --git a/src/freedreno/decode/rnnutil.h b/src/freedreno/decode/rnnutil.h
new file mode 100644
index 0000000..ea66747
--- /dev/null
+++ b/src/freedreno/decode/rnnutil.h
@@ -0,0 +1,66 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef RNNUTIL_H_
+#define RNNUTIL_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#include "rnn.h"
+#include "rnndec.h"
+
+/* Register database state shared by the decode helpers. */
+struct rnn {
+	struct rnndb *db;
+	/* vc is the main decode context, vc_nocolor one using null colors;
+	 * both may refer to the same context (see _rnn_init()).
+	 */
+	struct rnndeccontext *vc, *vc_nocolor;
+	/* dom[0] is the requested domain, dom[1] the one lookups fall back
+	 * to (it may be the same as dom[0]).
+	 */
+	struct rnndomain *dom[2];
+	/* domain name given when the database was loaded (not copied) */
+	const char *variant;
+};
+
+/* A raw 32b register value viewed as unsigned, signed or float.
+ * rnn_decodelem() fills in .u; the other members reinterpret the same
+ * bits.
+ */
+union rnndecval {
+	uint32_t u;
+	int32_t i;
+	float f;
+};
+
+/* setup: _rnn_init() initialises caller-provided storage, rnn_new()
+ * allocates it as well (NULL on allocation failure):
+ */
+void _rnn_init(struct rnn *rnn, int nocolor);
+struct rnn *rnn_new(int nocolor);
+/* load a register database, either explicitly or picked from a gpu name: */
+void rnn_load_file(struct rnn *rnn, char *file, char *domain);
+void rnn_load(struct rnn *rnn, const char *gpuname);
+/* name <-> offset lookups; rnn_regname() returns a static buffer or NULL: */
+uint32_t rnn_regbase(struct rnn *rnn, const char *name);
+const char *rnn_regname(struct rnn *rnn, uint32_t regbase, int color);
+struct rnndecaddrinfo *rnn_reginfo(struct rnn *rnn, uint32_t regbase);
+/* symbolic name of an enum value, or NULL: */
+const char *rnn_enumname(struct rnn *rnn, const char *name, uint32_t val);
+
+/* element lookup by name / by offset, NULL if not found: */
+struct rnndelem *rnn_regelem(struct rnn *rnn, const char *name);
+struct rnndelem *rnn_regoff(struct rnn *rnn, uint32_t offset);
+/* stores regval in *val and returns the type to interpret it as: */
+enum rnnttype rnn_decodelem(struct rnn *rnn, struct rnntypeinfo *info,
+		uint32_t regval, union rnndecval *val);
+
+#endif /* RNNUTIL_H_ */
diff --git a/src/freedreno/decode/script.c b/src/freedreno/decode/script.c
new file mode 100644
index 0000000..a882dd2
--- /dev/null
+++ b/src/freedreno/decode/script.c
@@ -0,0 +1,775 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#define _GNU_SOURCE
+#define LUA_COMPAT_APIINTCASTS
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <lua.h>
+#include <lauxlib.h>
+#include <lualib.h>
+#include <assert.h>
+
+#include "script.h"
+#include "cffdec.h"
+#include "rnnutil.h"
+
+static lua_State *L;
+
+/* Debug trace helper: change the condition to 1 to get messages tagged
+ * with function name and line number.  When disabled DBG() expands to an
+ * empty statement, so its arguments are not evaluated.
+ */
+#if 0
+#define DBG(fmt, ...) \
+		do { printf(" ** %s:%d ** "fmt "\n", \
+				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
+#else
+#define DBG(fmt, ...) do {} while (0)
+#endif
+
+/* An rnn based decoder, which can either be decoding current register
+ * values, or domain based decoding of a pm4 packet.
+ *
+ * sizedwords == 0 selects the register-state mode; otherwise values are
+ * taken from dwords[].  base has to remain the first member because
+ * to_rnndec() casts a struct rnn pointer straight back to this type.
+ */
+struct rnndec {
+	struct rnn base;
+
+	/* for pm4 packet decoding: */
+	uint32_t sizedwords;
+	uint32_t *dwords;
+};
+
+/* Recover the containing rnndec from a pointer to its embedded base. */
+static inline struct rnndec *to_rnndec(struct rnn *rnn)
+{
+	struct rnndec *dec = (struct rnndec *)rnn;
+
+	return dec;
+}
+
+/* Fetch the value to decode for regbase.  Without packet data this is the
+ * tracked register value; with packet data regbase is an index into the
+ * packet's dwords.  An index past the end of the packet yields -1
+ * converted to uint32_t.
+ */
+static uint32_t rnn_val(struct rnn *rnn, uint32_t regbase)
+{
+	struct rnndec *dec = to_rnndec(rnn);
+
+	if (dec->sizedwords == 0)
+		return reg_val(regbase);
+
+	if (regbase >= dec->sizedwords) {
+		// XXX throw an error
+		return -1;
+	}
+
+	return dec->dwords[regbase];
+}
+
+/* Report the string found at the top of the Lua stack through fmt (which
+ * is expected to contain a single %s) on stderr, then exit with status 1.
+ * Does not return.
+ */
+static void error(const char *fmt)
+{
+	const char *msg = lua_tostring(L, -1);
+
+	fprintf(stderr, fmt, msg);
+	exit(1);
+}
+
+/*
+ * An enum type that can be used as string or number:
+ */
+
+struct rnndenum {
+	const char *str;   /* symbolic name, or NULL if val has no name */
+	int val;           /* raw numeric value */
+};
+
+/* __tostring handler for enum userdata: pushes the symbolic name when
+ * there is one, otherwise the value formatted as an unsigned decimal.
+ * Returns 1 (one result on the Lua stack).
+ */
+static int l_meta_rnn_enum_tostring(lua_State *L)
+{
+	struct rnndenum *e = lua_touserdata(L, 1);
+	if (e->str) {
+		lua_pushstring(L, e->str);
+	} else {
+		char buf[32];
+		/* val is an int, so convert explicitly to match the %u
+		 * conversion, and keep the write bounded:
+		 */
+		snprintf(buf, sizeof(buf), "%u", (unsigned)e->val);
+		lua_pushstring(L, buf);
+	}
+	return 1;
+}
+
+/* __tonumber handler for enum userdata: pushes the raw value as an
+ * integer.  This metamethod doesn't actually seem to be implemented by lua
+ * yet, but hopefully some day lua comes to its senses.
+ */
+static int l_meta_rnn_enum_tonumber(lua_State *L)
+{
+	const struct rnndenum *self = lua_touserdata(L, 1);
+
+	lua_pushinteger(L, self->val);
+
+	return 1;
+}
+
+/* metamethods installed on the "rnnmetaenum" metatable: */
+static const struct luaL_Reg l_meta_rnn_enum[] = {
+	{"__tostring", l_meta_rnn_enum_tostring},
+	{"__tonumber", l_meta_rnn_enum_tonumber},
+	{NULL, NULL}  /* sentinel */
+};
+
+/* Push a new enum userdata for val onto the Lua stack.  The name of the
+ * first valid entry in info with a matching value is attached, if there is
+ * one.  The userdata gets the "rnnmetaenum" metatable.
+ */
+static void pushenum(struct lua_State *L, int val, struct rnnenum *info)
+{
+	struct rnndenum *e = lua_newuserdata(L, sizeof(*e));
+	int idx = 0;
+
+	e->str = NULL;
+	e->val = val;
+
+	while (idx < info->valsnum) {
+		struct rnnvalue *cand = info->vals[idx++];
+
+		if (!cand->valvalid || (cand->value != val))
+			continue;
+
+		e->str = cand->name;
+		break;
+	}
+
+	/* (re)populate the metatable, then drop it so that the userdata is
+	 * back on top of the stack:
+	 */
+	luaL_newmetatable(L, "rnnmetaenum");
+	luaL_setfuncs(L, l_meta_rnn_enum, 0);
+	lua_pop(L, 1);
+
+	luaL_setmetatable(L, "rnnmetaenum");
+}
+
+/* Expose rnn decode to script environment as "rnn" library:
+ */
+
+/* Userdata payload behind the array/struct/register objects handed to
+ * scripts: an element together with the offset resolved so far.
+ */
+struct rnndoff {
+	struct rnn *rnn;         /* decoder the element belongs to */
+	struct rnndelem *elem;   /* element being referenced */
+	uint64_t offset;         /* accumulated offset of elem */
+};
+
+/* Push a new rnndoff userdata holding the given triple onto the Lua
+ * stack.  No metatable is attached here; that is left to the caller.
+ */
+static void push_rnndoff(lua_State *L, struct rnn *rnn,
+		struct rnndelem *elem, uint64_t offset)
+{
+	struct rnndoff *ref = lua_newuserdata(L, sizeof(*ref));
+
+	*ref = (struct rnndoff) {
+		.rnn = rnn,
+		.elem = elem,
+		.offset = offset,
+	};
+}
+
+/* forward declarations: l_rnn_etype() dispatches to these, and they are
+ * defined further down, after the metatables they depend on.
+ */
+static int l_rnn_etype_array(lua_State *L, struct rnn *rnn,
+		struct rnndelem *elem, uint64_t offset);
+static int l_rnn_etype_reg(lua_State *L, struct rnn *rnn,
+		struct rnndelem *elem, uint64_t offset);
+
+/* Push regval onto the Lua stack in the representation matching its rnn
+ * type (enum object, integer, unsigned, number or boolean).  Returns the
+ * number of values pushed: 1, or 0 when the type cannot be represented.
+ */
+static int pushdecval(struct lua_State *L, struct rnn *rnn,
+		uint32_t regval, struct rnntypeinfo *info)
+{
+	union rnndecval val;
+	int npushed = 1;
+
+	switch (rnn_decodelem(rnn, info, regval, &val)) {
+	case RNN_TTYPE_INLINE_ENUM:
+	case RNN_TTYPE_ENUM:
+		pushenum(L, val.i, info->eenum);
+		break;
+	case RNN_TTYPE_INT:
+		lua_pushinteger(L, val.i);
+		break;
+	case RNN_TTYPE_HEX:
+	case RNN_TTYPE_UINT:
+		lua_pushunsigned(L, val.u);
+		break;
+	case RNN_TTYPE_FLOAT:
+		lua_pushnumber(L, val.f);
+		break;
+	case RNN_TTYPE_BOOLEAN:
+		lua_pushboolean(L, val.u);
+		break;
+	case RNN_TTYPE_INVALID:
+	default:
+		npushed = 0;
+		break;
+	}
+
+	return npushed;
+}
+
+/* Push the script-side representation of elem at offset.  Arrays become
+ * array objects.  Registers become a plain value when pushdecval() can
+ * represent their type, and a register object otherwise.  Other element
+ * types are reported on stdout and produce no result.  Returns the number
+ * of values pushed.
+ */
+static int l_rnn_etype(lua_State *L, struct rnn *rnn,
+		struct rnndelem *elem, uint64_t offset)
+{
+	DBG("elem=%p (%d), offset=%lu", elem, elem->type, offset);
+
+	if (elem->type == RNN_ETYPE_ARRAY)
+		return l_rnn_etype_array(L, rnn, elem, offset);
+
+	if (elem->type != RNN_ETYPE_REG) {
+		/* hmm.. */
+		printf("unhandled type: %d\n", elem->type);
+		return 0;
+	}
+
+	/* if a register has no bitfields, just return
+	 * the raw value:
+	 */
+	uint32_t raw = rnn_val(rnn, offset);
+	uint32_t shifted = raw << elem->typeinfo.shr;
+	int npushed = pushdecval(L, rnn, shifted, &elem->typeinfo);
+
+	if (npushed)
+		return npushed;
+
+	return l_rnn_etype_reg(L, rnn, elem, offset);
+}
+
+/*
+ * Struct Object:
+ * To implement stuff like 'RB_MRT[n].CONTROL' we need a struct-object
+ * to represent the current array index (ie. 'RB_MRT[n]')
+ */
+
+/* __index handler for struct objects: resolves the field name given as
+ * key against the sub-elements of the referenced element and pushes the
+ * matching sub-element, located at the accumulated offset plus its own
+ * offset.  Pushes nothing if no sub-element has that name.
+ */
+static int l_rnn_struct_meta_index(lua_State *L)
+{
+	struct rnndoff *self = lua_touserdata(L, 1);
+	const char *name = lua_tostring(L, 2);
+	struct rnndelem *parent = self->elem;
+
+	for (int idx = 0; idx < parent->subelemsnum; idx++) {
+		struct rnndelem *child = parent->subelems[idx];
+
+		if (strcmp(name, child->name) != 0)
+			continue;
+
+		return l_rnn_etype(L, self->rnn, child,
+				self->offset + child->offset);
+	}
+
+	return 0;
+}
+
+/* metamethods installed on the "rnnmetastruct" metatable: */
+static const struct luaL_Reg l_meta_rnn_struct[] = {
+	{"__index", l_rnn_struct_meta_index},
+	{NULL, NULL}  /* sentinel */
+};
+
+/* Push a struct object for elem at offset: an rnndoff userdata with the
+ * "rnnmetastruct" metatable, so that a following field lookup from the
+ * script ends up in l_rnn_struct_meta_index().  Returns 1 (one result on
+ * the Lua stack).
+ */
+static int l_rnn_etype_struct(lua_State *L, struct rnn *rnn,
+		struct rnndelem *elem, uint64_t offset)
+{
+	push_rnndoff(L, rnn, elem, offset);
+
+	/* make sure the metatable exists and has its handlers, then pop it
+	 * again so that the userdata is back on top of the stack:
+	 */
+	luaL_newmetatable(L, "rnnmetastruct");
+	luaL_setfuncs(L, l_meta_rnn_struct, 0);
+	lua_pop(L, 1);
+
+	luaL_setmetatable(L, "rnnmetastruct");
+
+	return 1;
+}
+
+/*
+ * Array Object:
+ */
+
+/* __index handler for array objects: the key is the array index, and the
+ * selected entry lives idx strides past the array's own offset.
+ */
+static int l_rnn_array_meta_index(lua_State *L)
+{
+	struct rnndoff *self = lua_touserdata(L, 1);
+	int idx = lua_tointeger(L, 2);
+	struct rnndelem *elem = self->elem;
+	uint64_t offset = self->offset + (elem->stride * idx);
+
+	DBG("rnndoff=%p, idx=%d, numsubelems=%d",
+			self, idx, self->elem->subelemsnum);
+
+	/* with more than one sub-element we need to accumulate the array
+	 * index while we wait for the register name within the array..
+	 */
+	if (elem->subelemsnum != 1)
+		return l_rnn_etype_struct(L, self->rnn, elem, offset);
+
+	/* ..whereas a single sub-element is directly a register: */
+	return l_rnn_etype(L, self->rnn, elem->subelems[0], offset);
+}
+
+/* metamethods installed on the "rnnmetaarray" metatable: */
+static const struct luaL_Reg l_meta_rnn_array[] = {
+	{"__index", l_rnn_array_meta_index},
+	{NULL, NULL}  /* sentinel */
+};
+
+/* Push an array object for elem at offset: an rnndoff userdata with the
+ * "rnnmetaarray" metatable, so that indexing it from the script ends up
+ * in l_rnn_array_meta_index().  Returns 1 (one result on the Lua stack).
+ */
+static int l_rnn_etype_array(lua_State *L, struct rnn *rnn,
+		struct rnndelem *elem, uint64_t offset)
+{
+	push_rnndoff(L, rnn, elem, offset);
+
+	/* make sure the metatable exists and has its handlers, then pop it
+	 * again so that the userdata is back on top of the stack:
+	 */
+	luaL_newmetatable(L, "rnnmetaarray");
+	luaL_setfuncs(L, l_meta_rnn_array, 0);
+	lua_pop(L, 1);
+
+	luaL_setmetatable(L, "rnnmetaarray");
+
+	return 1;
+}
+
+/*
+ * Register element:
+ */
+
+/* __index handler for register objects: the key names a bitfield of the
+ * register.  The field is extracted from the current register value
+ * (mask, shift down by low, shift up by shr) and pushed in the
+ * representation matching the field's type.  Registers that are not
+ * bitsets and unknown field names are reported on stdout and yield no
+ * result.
+ */
+static int l_rnn_reg_meta_index(lua_State *L)
+{
+	struct rnndoff *self = lua_touserdata(L, 1);
+	const char *name = lua_tostring(L, 2);
+	struct rnntypeinfo *info = &self->elem->typeinfo;
+	struct rnnbitfield **fields;
+	int nfields;
+
+	/* the field list is either shared via a named bitset or inline: */
+	if (info->type == RNN_TTYPE_BITSET) {
+		fields = info->ebitset->bitfields;
+		nfields = info->ebitset->bitfieldsnum;
+	} else if (info->type == RNN_TTYPE_INLINE_BITSET) {
+		fields = info->bitfields;
+		nfields = info->bitfieldsnum;
+	} else {
+		printf("invalid register type: %d\n", info->type);
+		return 0;
+	}
+
+	for (int idx = 0; idx < nfields; idx++) {
+		struct rnnbitfield *bf = fields[idx];
+		uint32_t regval;
+
+		if (strcmp(name, bf->name) != 0)
+			continue;
+
+		regval = rnn_val(self->rnn, self->offset);
+		regval &= typeinfo_mask(&bf->typeinfo);
+		regval >>= bf->typeinfo.low;
+		regval <<= bf->typeinfo.shr;
+
+		DBG("name=%s, info=%p, subelemsnum=%d, type=%d, regval=%x",
+				name, info, self->elem->subelemsnum,
+				bf->typeinfo.type, regval);
+
+		return pushdecval(L, self->rnn, regval, &bf->typeinfo);
+	}
+
+	printf("invalid member: %s\n", name);
+	return 0;
+}
+
+/* __tostring handler for register objects: pushes the decoded form of the
+ * current register value when type info is available for the offset, and
+ * the value as 8 hex digits otherwise.  Returns 1 (one result on the Lua
+ * stack); the result is nil if the fallback string cannot be allocated.
+ */
+static int l_rnn_reg_meta_tostring(lua_State *L)
+{
+	struct rnndoff *rnndoff = lua_touserdata(L, 1);
+	uint32_t regval = rnn_val(rnndoff->rnn, rnndoff->offset);
+	struct rnndecaddrinfo *info = rnn_reginfo(rnndoff->rnn, rnndoff->offset);
+	char *decoded = NULL;
+	if (info && info->typeinfo) {
+		decoded = rnndec_decodeval(rnndoff->rnn->vc,
+				info->typeinfo, regval);
+	} else if (asprintf(&decoded, "%08x", regval) < 0) {
+		/* asprintf leaves the pointer undefined on failure; reset it
+		 * so that the push and free below stay well defined:
+		 */
+		decoded = NULL;
+	}
+	lua_pushstring(L, decoded);
+	free(decoded);
+	if (info) {
+		free(info->name);
+		free(info);
+	}
+	return 1;
+}
+
+/* __tonumber handler for register objects: pushes the current register
+ * value, shifted left by the register type's shr, as a Lua number.
+ */
+static int l_rnn_reg_meta_tonumber(lua_State *L)
+{
+	struct rnndoff *self = lua_touserdata(L, 1);
+	uint32_t raw = rnn_val(self->rnn, self->offset);
+	uint32_t shifted = raw << self->elem->typeinfo.shr;
+
+	lua_pushnumber(L, shifted);
+
+	return 1;
+}
+
+/* metamethods installed on the "rnnmetareg" metatable: */
+static const struct luaL_Reg l_meta_rnn_reg[] = {
+	{"__index", l_rnn_reg_meta_index},
+	{"__tostring", l_rnn_reg_meta_tostring},
+	{"__tonumber", l_rnn_reg_meta_tonumber},
+	{NULL, NULL}  /* sentinel */
+};
+
+/* Push a register object for elem at offset: an rnndoff userdata with the
+ * "rnnmetareg" metatable, which provides bitfield lookup and string/number
+ * conversion.  Returns 1 (one result on the Lua stack).
+ */
+static int l_rnn_etype_reg(lua_State *L, struct rnn *rnn,
+		struct rnndelem *elem, uint64_t offset)
+{
+	push_rnndoff(L, rnn, elem, offset);
+
+	/* make sure the metatable exists and has its handlers, then pop it
+	 * again so that the userdata is back on top of the stack:
+	 */
+	luaL_newmetatable(L, "rnnmetareg");
+	luaL_setfuncs(L, l_meta_rnn_reg, 0);
+	lua_pop(L, 1);
+
+	luaL_setmetatable(L, "rnnmetareg");
+
+	return 1;
+}
+
+/*
+ *
+ */
+
+/* __index handler for the top level rnn object: the key is an element
+ * name, which is looked up and pushed at the element's own offset.
+ * Unknown names yield no result.
+ */
+static int l_rnn_meta_index(lua_State *L)
+{
+	struct rnn *rnn = lua_touserdata(L, 1);
+	struct rnndelem *elem = rnn_regelem(rnn, lua_tostring(L, 2));
+
+	return elem ? l_rnn_etype(L, rnn, elem, elem->offset) : 0;
+}
+
+/* __gc handler for the top level rnn object.  Currently a stub: nothing
+ * created by _rnn_init() is released here.
+ */
+static int l_rnn_meta_gc(lua_State *L)
+{
+	// TODO
+	//struct rnn *rnn = lua_touserdata(L, 1);
+	//rnn_deinit(rnn);
+	return 0;
+}
+
+/* metamethods installed on the "rnnmeta" metatable: */
+static const struct luaL_Reg l_meta_rnn[] = {
+	{"__index", l_rnn_meta_index},
+	{"__gc", l_rnn_meta_gc},
+	{NULL, NULL}  /* sentinel */
+};
+
+/* rnn.init(gpuname): create a decoder object for the named gpu.  The
+ * object is a userdata holding a struct rnndec in register-state mode
+ * (sizedwords == 0; dwords is left unset since it is only read when
+ * sizedwords is non-zero), tagged with the "rnnmeta" metatable.
+ */
+static int l_rnn_init(lua_State *L)
+{
+	const char *gpuname = lua_tostring(L, 1);
+	struct rnndec *dec = lua_newuserdata(L, sizeof(*dec));
+	struct rnn *base = &dec->base;
+
+	_rnn_init(base, 0);
+	rnn_load(base, gpuname);
+	dec->sizedwords = 0;
+
+	/* (re)populate the metatable, then drop it so that the userdata is
+	 * back on top of the stack:
+	 */
+	luaL_newmetatable(L, "rnnmeta");
+	luaL_setfuncs(L, l_meta_rnn, 0);
+	lua_pop(L, 1);
+
+	luaL_setmetatable(L, "rnnmeta");
+
+	return 1;
+}
+
+/* rnn.enumname(rnn, name, val): pushes the symbolic name of val in the
+ * enum called name, as returned by rnn_enumname().
+ */
+static int l_rnn_enumname(lua_State *L)
+{
+	struct rnn *rnn = lua_touserdata(L, 1);
+	const char *enum_name = lua_tostring(L, 2);
+	uint32_t value = (uint32_t)lua_tonumber(L, 3);
+	const char *label = rnn_enumname(rnn, enum_name, value);
+
+	lua_pushstring(L, label);
+
+	return 1;
+}
+
+/* rnn.regname(rnn, regbase): pushes the (colorized) register name for
+ * regbase, as returned by rnn_regname().
+ */
+static int l_rnn_regname(lua_State *L)
+{
+	struct rnn *rnn = lua_touserdata(L, 1);
+	uint32_t regbase = (uint32_t)lua_tonumber(L, 2);
+	const char *regname = rnn_regname(rnn, regbase, 1);
+
+	lua_pushstring(L, regname);
+
+	return 1;
+}
+
+/* rnn.regval(rnn, regbase, regval): pushes regval decoded according to
+ * the type of the register at regbase, or as 8 hex digits when no type
+ * info is available.  Returns 1 (one result on the Lua stack); the result
+ * is nil if the fallback string cannot be allocated.
+ */
+static int l_rnn_regval(lua_State *L)
+{
+	struct rnn *rnn = lua_touserdata(L, 1);
+	uint32_t regbase = (uint32_t)lua_tonumber(L, 2);
+	uint32_t regval = (uint32_t)lua_tonumber(L, 3);
+	struct rnndecaddrinfo *info = rnn_reginfo(rnn, regbase);
+	char *decoded = NULL;
+	if (info && info->typeinfo) {
+		decoded = rnndec_decodeval(rnn->vc, info->typeinfo, regval);
+	} else if (asprintf(&decoded, "%08x", regval) < 0) {
+		/* asprintf leaves the pointer undefined on failure; reset it
+		 * so that the push and free below stay well defined:
+		 */
+		decoded = NULL;
+	}
+	lua_pushstring(L, decoded);
+	free(decoded);
+	if (info) {
+		free(info->name);
+		free(info);
+	}
+	return 1;
+}
+
+/* functions of the "rnn" library table: */
+static const struct luaL_Reg l_rnn[] = {
+	{"init", l_rnn_init},
+	{"enumname", l_rnn_enumname},
+	{"regname", l_rnn_regname},
+	{"regval", l_rnn_regval},
+	{NULL, NULL}  /* sentinel */
+};
+
+
+
+/* Expose the register state to script environment as a "regs" library:
+ */
+
+/* regs.written(regbase): pushes the result of reg_written() as a number. */
+static int l_reg_written(lua_State *L)
+{
+	lua_pushnumber(L, reg_written((uint32_t)lua_tonumber(L, 1)));
+
+	return 1;
+}
+
+/* regs.lastval(regbase): pushes the result of reg_lastval() as a number. */
+static int l_reg_lastval(lua_State *L)
+{
+	lua_pushnumber(L, reg_lastval((uint32_t)lua_tonumber(L, 1)));
+
+	return 1;
+}
+
+/* regs.val(regbase): pushes the result of reg_val() as a number. */
+static int l_reg_val(lua_State *L)
+{
+	lua_pushnumber(L, reg_val((uint32_t)lua_tonumber(L, 1)));
+
+	return 1;
+}
+
+/* functions of the "regs" library table: */
+static const struct luaL_Reg l_regs[] = {
+	{"written", l_reg_written},
+	{"lastval", l_reg_lastval},
+	{"val",     l_reg_val},
+	{NULL, NULL}  /* sentinel */
+};
+
+/* Expose API to lookup snapshot buffers:
+ */
+
+/* These two are declared here instead of coming from a header, so the
+ * prototypes must be kept in sync with their definitions by hand.
+ */
+uint64_t gpubaseaddr(uint64_t gpuaddr);
+unsigned hostlen(uint64_t gpuaddr);
+
+/* bos.base(addr): given address, return base-address of buffer.  The
+ * address travels through a Lua number in both directions.
+ */
+static int l_bo_base(lua_State *L)
+{
+	lua_pushnumber(L, gpubaseaddr((uint64_t)lua_tonumber(L, 1)));
+
+	return 1;
+}
+
+/* bos.size(addr): given address, return the remaining size of the
+ * buffer, as reported by hostlen().
+ */
+static int l_bo_size(lua_State *L)
+{
+	lua_pushnumber(L, hostlen((uint64_t)lua_tonumber(L, 1)));
+
+	return 1;
+}
+
+/* functions registered (via openlib()) in the script-visible "bos" table: */
+static const struct luaL_Reg l_bos[] = {
+	{ .name = "base", .func = l_bo_base },
+	{ .name = "size", .func = l_bo_size },
+	{ .name = NULL,   .func = NULL },  /* sentinel */
+};
+
+/* Register the functions listed in reg as fields of a fresh table, and
+ * publish that table to the script as the global named lib.
+ */
+static void openlib(const char *lib, const luaL_Reg *reg)
+{
+	lua_createtable(L, 0, 0);   /* push the (empty) library table */
+	luaL_setfuncs(L, reg, 0);   /* fill it in; no upvalues */
+	lua_setglobal(L, lib);      /* pops the table */
+}
+
+/* called at start to load the script: creates the interpreter state,
+ * registers the "bos", "regs" and "rnn" tables, then loads and runs file.
+ * Always returns 0; load/run failures are reported through error().
+ */
+int script_load(const char *file)
+{
+	assert(!L);
+
+	L = luaL_newstate();
+	luaL_openlibs(L);
+
+	/* helper libraries made visible to the script as global tables: */
+	openlib("bos", l_bos);
+	openlib("regs", l_regs);
+	openlib("rnn", l_rnn);
+
+	/* compile the chunk.. */
+	if (luaL_loadfile(L, file))
+		error("%s\n");
+
+	/* ..and run its top-level code, so that the handlers it defines
+	 * can be found later with lua_getglobal():
+	 */
+	if (lua_pcall(L, 0, LUA_MULTRET, 0))
+		error("%s\n");
+
+	return 0;
+}
+
+
+/* called at start of each cmdstream file: passes the file name to the
+ * script's global start_cmdstream() function, if it defines one
+ */
+void script_start_cmdstream(const char *name)
+{
+	if (!L)
+		return;
+
+	lua_getglobal(L, "start_cmdstream");
+
+	if (lua_isfunction(L, -1)) {
+		lua_pushstring(L, name);
+
+		/* do the call (1 arguments, 0 result) */
+		if (lua_pcall(L, 1, 0, 0) != 0) {
+			error("error running function `f': %s\n");
+		}
+	} else {
+		/* no handler, just drop whatever the global was: */
+		lua_pop(L, 1);
+	}
+}
+
+/* called at each DRAW_INDX: hands the primitive type and index count to
+ * the script's global draw() function (if it defines one) so that it can
+ * process the current state
+ */
+void script_draw(const char *primtype, uint32_t nindx)
+{
+	if (!L)
+		return;
+
+	lua_getglobal(L, "draw");
+
+	if (lua_isfunction(L, -1)) {
+		lua_pushstring(L, primtype);
+		lua_pushnumber(L, nindx);
+
+		/* do the call (2 arguments, 0 result) */
+		if (lua_pcall(L, 2, 0, 0) != 0) {
+			error("error running function `f': %s\n");
+		}
+	} else {
+		/* no handler, just drop whatever the global was: */
+		lua_pop(L, 1);
+	}
+}
+
+
+/* __index for the packet wrapper object: pkt[n] looks up the domain element
+ * at dword offset n and returns what l_rnn_etype() pushes for it, or no
+ * value when rnn_regoff() finds nothing at that offset.
+ */
+static int l_rnn_meta_dom_index(lua_State *L)
+{
+	struct rnn *rnn = lua_touserdata(L, 1);
+	uint32_t dword = (uint32_t)lua_tonumber(L, 2);
+
+	/* TODO might be nicer if the arg isn't a number, to search the domain
+	 * for matching bitfields.. so that the script could do something like
+	 * 'pkt.WIDTH' instead of 'pkt[1].WIDTH', ie. not have to remember the
+	 * offset of the dword containing the bitfield..
+	 */
+
+	struct rnndelem *elem = rnn_regoff(rnn, dword);
+
+	if (elem)
+		return l_rnn_etype(L, rnn, elem, elem->offset);
+
+	return 0;
+}
+
+/*
+ * A wrapper object for rnndomain based decoding of an array of dwords
+ * (ie. for pm4 packet decoding).  Mostly re-uses the register-value
+ * decoding for the individual dwords and bitfields.
+ */
+
+/* __gc for the packet wrapper object.  Currently a no-op that returns no
+ * values; the intended cleanup is still a TODO (see below).
+ */
+static int l_rnn_meta_dom_gc(lua_State *L)
+{
+	// TODO
+	//struct rnn *rnn = lua_touserdata(L, 1);
+	//rnn_deinit(rnn);
+	return 0;
+}
+
+/* metamethods installed on the "rnnmetadom" metatable by script_packet(): */
+static const struct luaL_Reg l_meta_rnn_dom[] = {
+	{ .name = "__index", .func = l_rnn_meta_dom_index },
+	{ .name = "__gc",    .func = l_rnn_meta_dom_gc },
+	{ .name = NULL,      .func = NULL },  /* sentinel */
+};
+
+/* called for general pm4 packet decoding, such as texture/sampler state:
+ * if the script defines a global function named after the rnn domain, it is
+ * called with a packet wrapper object and the packet size in dwords
+ */
+void script_packet(uint32_t *dwords, uint32_t sizedwords,
+		struct rnn *rnn, struct rnndomain *dom)
+{
+	struct rnndec *pkt;
+
+	if (!L)
+		return;
+
+	lua_getglobal(L, dom->name);
+
+	/* if no handler for the packet, just ignore it: */
+	if (!lua_isfunction(L, -1)) {
+		lua_pop(L, 1);
+		return;
+	}
+
+	/* first argument: userdata holding the raw dwords plus a copy of
+	 * the rnn state whose domain list is narrowed to just 'dom'
+	 */
+	pkt = lua_newuserdata(L, sizeof(*pkt));
+	pkt->dwords = dwords;
+	pkt->sizedwords = sizedwords;
+	pkt->base = *rnn;
+	pkt->base.dom[0] = dom;
+	pkt->base.dom[1] = NULL;
+
+	/* make sure the metatable exists and carries its methods, then pop
+	 * it again so that the userdata is back on top of the stack:
+	 */
+	luaL_newmetatable(L, "rnnmetadom");
+	luaL_setfuncs(L, l_meta_rnn_dom, 0);
+	lua_pop(L, 1);
+
+	luaL_setmetatable(L, "rnnmetadom");
+
+	/* second argument: */
+	lua_pushnumber(L, sizedwords);
+
+	if (lua_pcall(L, 2, 0, 0) != 0) {
+		error("error running function `f': %s\n");
+	}
+}
+
+/* helper to call the script's global function 'name', which takes and
+ * returns void; silently does nothing when no script is loaded or the
+ * global is not a function
+ */
+static void simple_call(const char *name)
+{
+	if (!L)
+		return;
+
+	lua_getglobal(L, name);
+
+	if (lua_isfunction(L, -1)) {
+		/* do the call (0 arguments, 0 result) */
+		if (lua_pcall(L, 0, 0, 0) != 0) {
+			error("error running function `f': %s\n");
+		}
+	} else {
+		/* no handler, just drop whatever the global was: */
+		lua_pop(L, 1);
+	}
+}
+
+/* called at end of each cmdstream file: runs the script's global
+ * end_cmdstream() function, if there is one
+ */
+void script_end_cmdstream(void)
+{
+	simple_call("end_cmdstream");
+}
+
+/* called at start of submit/issueibcmds: runs the script's global
+ * start_submit() function, if there is one
+ */
+void script_start_submit(void)
+{
+	simple_call("start_submit");
+}
+
+/* called at end of submit/issueibcmds: runs the script's global
+ * end_submit() function, if there is one
+ */
+void script_end_submit(void)
+{
+	simple_call("end_submit");
+}
+
+/* called after last cmdstream file: gives the script's global finish()
+ * function a chance to run, then tears down the interpreter state
+ */
+void script_finish(void)
+{
+	if (L) {
+		simple_call("finish");
+
+		lua_close(L);
+		/* later script_*() calls see "no script loaded": */
+		L = NULL;
+	}
+}
diff --git a/src/freedreno/decode/script.h b/src/freedreno/decode/script.h
new file mode 100644
index 0000000..d14b69a
--- /dev/null
+++ b/src/freedreno/decode/script.h
@@ -0,0 +1,76 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef SCRIPT_H_
+#define SCRIPT_H_
+
+#include <stdint.h>
+
+
+// XXX make script support optional
+#define ENABLE_SCRIPTING 1
+
+#ifdef ENABLE_SCRIPTING
+
+/* called at start to load the script: */
+int script_load(const char *file);
+
+/* called at start of each cmdstream file: */
+void script_start_cmdstream(const char *name);
+
+/* called at each DRAW_INDX, calls the script's draw() function to process
+ * the current state
+ *
+ * NOTE(review): declared weak, presumably so that tools which do not link
+ * the scripting code can still reference it -- confirm against callers.
+ */
+__attribute__((weak))
+void script_draw(const char *primtype, uint32_t nindx);
+
+struct rnn;
+struct rnndomain;
+/* called for general pm4 packet decoding: calls the script function named
+ * after the rnn domain (dom), if any, to process the packet's dwords
+ */
+__attribute__((weak))
+void script_packet(uint32_t *dwords, uint32_t sizedwords,
+		struct rnn *rnn, struct rnndomain *dom);
+
+/* maybe at some point it is interesting to add additional script
+ * hooks for CP_EVENT_WRITE, etc?
+ */
+
+/* called at end of each cmdstream file: */
+void script_end_cmdstream(void);
+
+/* called at start and at end of each submit/issueibcmds: */
+void script_start_submit(void);
+void script_end_submit(void);
+
+/* called after last cmdstream file: */
+void script_finish(void);
+
+#else
+// TODO no-op stubs..
+#endif
+
+
+#endif /* SCRIPT_H_ */
diff --git a/src/freedreno/decode/scripts/analyze.lua b/src/freedreno/decode/scripts/analyze.lua
new file mode 100644
index 0000000..27e97ec
--- /dev/null
+++ b/src/freedreno/decode/scripts/analyze.lua
@@ -0,0 +1,178 @@
+-- A script that compares a set of equivalent cmdstream captures from
+-- various generations, looking for equivalencies between registers.
+--
+-- This would be run across a group of similar tests for various
+-- generations, for example:
+--
+--   cffdump --script scripts/analyze.lua a320/quad-flat-*.rd a420/quad-flat-*.rd
+--
+-- This is done by comparing unique register values.  Ie. for each
+-- generation, find the set of registers that have different values
+-- between equivalent draw calls.
+
+local posix = require "posix"
+
+io.write("Analyzing Data...\n")
+
+-- results - table structure:
+-- * [gpuname] - gpu
+--   * tests
+--     * [testname] - current test
+--       * draws
+--         * [1..n] - the draws
+--           * primtype - the primitive type
+--           * regs - table of values for draw
+--             * [regbase] - regval
+--   * regvals - table of unique values across all draws
+--     * [regbase]
+--       * [regval] - list of test names
+--         * [1..n] - testname "." didx
+-- everything collected so far, keyed by gpu name (layout described above)
+local results = {}
+
+-- state for the cmdstream file currently being parsed; set by
+-- start_cmdstream() and cleared again by end_cmdstream()
+local test = nil
+local gpuname = nil
+local testname = nil
+
+
+-- srsly, no sparse table size() op?
+-- Counts every key in tbl, whatever the key is (so it also works for tables
+-- that are indexed from 0 or have holes).
+function tblsz(tbl)
+  local count = 0
+  for _ in pairs(tbl) do
+    count = count + 1
+  end
+  return count
+end
+
+
+-- Called at the start of each cmdstream file.  The file's basename becomes
+-- the test name and the name of its parent directory the gpu name; a fresh
+-- (empty) test entry is then registered under that gpu.
+function start_cmdstream(name)
+  testname = posix.basename(name)
+  gpuname = posix.basename(posix.dirname(name))
+  --io.write("START: gpuname=" .. gpuname .. ", testname=" .. testname .. "\n");
+
+  -- first capture seen for this gpu: create its results entry
+  if results[gpuname] == nil then
+    results[gpuname] = { tests = {}, regvals = {} }
+  end
+
+  test = { draws = {} }
+  results[gpuname].tests[testname] = test
+end
+
+-- Called for every draw.  Snapshots all written registers into a new draw
+-- entry of the current test, and records for each (register, value) pair
+-- that this draw used it.
+function draw(primtype, nindx)
+  -- RECTLIST is only used internally.. we want to ignore it for
+  -- now, although it could potentially be interesting to track
+  -- these separately (separating clear/restore/resolve) just to
+  -- figure out which registers are used for which..
+  if primtype == "DI_PT_RECTLIST" then
+    return
+  end
+
+  -- draws are numbered from 0 in the order they are seen
+  local draws = test["draws"]
+  local didx = tblsz(draws)
+  local snapshot = {}
+
+  draws[didx] = { primtype = primtype, regs = snapshot }
+
+  -- populate current regs.  For now just consider ones that have
+  -- been written.. maybe we need to make that configurable in
+  -- case it filters out too many registers.
+  for regbase = 0, 0xffff do
+    if regs.written(regbase) ~= 0 then
+      local regval = regs.val(regbase)
+
+      -- track reg vals per draw:
+      snapshot[regbase] = regval
+
+      -- also track which reg vals appear in which tests:
+      local gpu_regvals = results[gpuname]["regvals"]
+      local by_value = gpu_regvals[regbase]
+      if by_value == nil then
+        by_value = {}
+        gpu_regvals[regbase] = by_value
+      end
+
+      local drawlist = by_value[regval]
+      if drawlist == nil then
+        drawlist = {}
+        by_value[regval] = drawlist
+      end
+
+      drawlist[#drawlist + 1] = testname .. "." .. didx
+    end
+  end
+
+  -- TODO maybe we want to whitelist a few well known regs, for the
+  -- convenience of the code that runs at the end to analyze the data?
+  -- TODO also would be useful to somehow capture CP_SET_BIN..
+
+end
+
+-- Called at the end of each cmdstream file: forget the per-file state.
+function end_cmdstream()
+  test, gpuname, testname = nil, nil, nil
+end
+
+-- Debugging helper: print each test of a gpu with its number of draws and
+-- the primitive type of every draw.
+function print_draws(gpuname, gpu)
+  io.write("  " .. gpuname .. "\n")
+  for testname,test in pairs(gpu["tests"]) do
+    local draws = test["draws"]
+
+    -- draw() numbers the draws from 0, so the '#' operator would miss
+    -- the first one; count the keys instead:
+    local ndraws = 0
+    for _ in pairs(draws) do
+      ndraws = ndraws + 1
+    end
+    io.write("    " .. testname .. ", draws=" .. ndraws .. "\n")
+
+    -- walk by index so the draws come out in order (pairs() does not
+    -- promise any particular order)
+    for didx = 0, ndraws - 1 do
+      io.write("      " .. didx .. ": " .. draws[didx]["primtype"] .. "\n")
+    end
+  end
+end
+
+-- sort and concat a list of draw names to form a key which can be
+-- compared to other drawlists to check for equality.  The input list is
+-- left untouched; nil is returned for an empty list.
+-- TODO maybe we instead want a scheme that allows for some fuzzyness
+-- in the matching??
+function drawlistname(drawlist)
+  -- work on a copy, so that sorting does not reorder the caller's list
+  local names = {}
+  for _,draw in pairs(drawlist) do
+    names[#names + 1] = draw
+  end
+
+  if #names == 0 then
+    return nil
+  end
+
+  -- sorted, so that the key does not depend on the order in which the
+  -- draws happened to be recorded
+  table.sort(names)
+  return table.concat(names, ":")
+end
+
+-- rnn database handles, one per gpu name, created on first use
+local rnntbl = {}
+
+-- For every gpu, print each register/value pair whose list of draws forms
+-- the key 'name' (see drawlistname()), with the register name and decoded
+-- value taken from that gpu's rnn database.
+function dumpmatches(name)
+  for gpuname,gpu in pairs(results) do
+    local r = rnntbl[gpuname]
+    if r == nil then
+      -- the gpu name goes before the newline, not after it:
+      io.write("loading rnn database: " .. gpuname .. "\n")
+      r = rnn.init(gpuname)
+      rnntbl[gpuname] = r
+    end
+    for regbase,regvals in pairs(gpu["regvals"]) do
+      for regval,drawlist in pairs(regvals) do
+        local name2 = drawlistname(drawlist)
+        if name == name2 then
+          io.write(string.format("  %s:%s:\t%08x  %s\n",
+                                 gpuname, rnn.regname(r, regbase),
+                                 regval, rnn.regval(r, regbase, regval)))
+        end
+      end
+    end
+  end
+end
+
+-- Called after the last cmdstream file.  Walks every recorded
+-- register/value pair and, once per distinct drawlist key, prints the key
+-- followed by all register values (across all gpus) that share it.
+function finish()
+  -- drawlistnames that we've already dumped:
+  local dumped = {}
+
+  for gpuname,gpu in pairs(results) do
+    -- print_draws(gpuname, gpu)
+    for _,regvals in pairs(gpu["regvals"]) do
+      for _,drawlist in pairs(regvals) do
+        local key = drawlistname(drawlist)
+        if not dumped[key] then
+          io.write("\n" .. key .. ":\n")
+          dumpmatches(key)
+          dumped[key] = 1
+        end
+      end
+    end
+  end
+end
+
diff --git a/src/freedreno/decode/scripts/parse-submits.lua b/src/freedreno/decode/scripts/parse-submits.lua
new file mode 100644
index 0000000..1d21716
--- /dev/null
+++ b/src/freedreno/decode/scripts/parse-submits.lua
@@ -0,0 +1,413 @@
+-- Parse cmdstream dump and analyse blits and batches
+
+--local posix = require "posix"
+
+-- C-style printf: format the arguments and write the result to the default
+-- output; returns whatever io.write() returns.
+function printf(fmt, ...)
+	local text = string.format(fmt, ...)
+	return io.write(text)
+end
+
+-- Debug trace hook; a no-op unless the printf() call below is uncommented.
+function dbg(fmt, ...)
+	--printf(fmt, ...)
+end
+
+printf("Analyzing Data...\n")
+
+-- handle used below to read register state as r.REGNAME[.FIELD]
+-- NOTE(review): rnn.init() is implemented on the C side; presumably it
+-- loads the register database for the named gpu -- confirm.
+local r = rnn.init("a630")
+
+-- Each submit, all draws will target the same N MRTs:
+local mrts = {}
+local allmrts = {}  -- includes historical render targets
+
+-- Record a render target under its base address, both for the current
+-- batch (mrts) and in the table that is kept across batches (allmrts).
+function push_mrt(fmt, w, h, samples, base, flag, gmem)
+	dbg("MRT: %s %ux%u 0x%x\n", fmt, w, h, base)
+
+	local mrt = {
+		format = fmt,
+		w = w,
+		h = h,
+		samples = samples,
+		base = base,
+		flag = flag,
+		gmem = gmem,
+	}
+
+	mrts[base] = mrt
+	allmrts[base] = mrt
+end
+
+-- And each draw will read from M sources/textures:
+local sources = {}
+
+-- Record a source/texture of the current batch under its base address.
+function push_source(fmt, w, h, samples, base, flag)
+	dbg("SRC: %s %ux%u 0x%x\n", fmt, w, h, base)
+
+	sources[base] = {
+		format = fmt,
+		w = w,
+		h = h,
+		samples = samples,
+		base = base,
+		flag = flag,
+	}
+end
+
+-- bin geometry, captured by draw() for RM6_GMEM draws:
+local binw
+local binh
+local nbins
+-- counters for the batch being accumulated (zeroed by reset()):
+local blits = 0
+local draws = 0
+-- mode of the batch being accumulated, or nil right after reset():
+local drawmode
+-- tables flagging buffers that were cleared/restored/resolved:
+local cleared
+local restored
+local resolved
+-- true from start_submit() until the first draw or blit event is seen:
+local nullbatch
+-- depth/stencil usage seen in the batch being accumulated:
+local depthtest
+local depthwrite
+local stenciltest
+local stencilwrite
+
+-- Called at the start of each cmdstream file; just announces the file.
+function start_cmdstream(name)
+	printf("Parsing %s\n", name)
+end
+
+-- Forget everything accumulated for the current batch, so that the next
+-- draw or blit starts a new one.  (allmrts is deliberately kept.)
+function reset()
+	dbg("reset\n")
+	mrts = {}
+	sources = {}
+	draws = 0
+	blits = 0
+	cleared = {}
+	restored = {}
+	resolved = {}
+	depthtest = false
+	depthwrite = false
+	stenciltest = false
+	stencilwrite = false
+	-- 'nil', not 'Nil': the latter is just an (undefined) global variable
+	drawmode = nil
+end
+
+-- Called at the start of each submit: start from a clean batch, and assume
+-- the submit is empty until a draw or blit event proves otherwise.
+function start_submit()
+	dbg("start_submit\n")
+	reset()
+	nullbatch = true
+end
+
+-- Print a summary of the batch (or blit) accumulated since the last
+-- reset(): draw count, mode, bin layout, depth/stencil usage, render
+-- targets and sources.  The per-batch state is reset afterwards, except
+-- when there was nothing to report.  Besides being a script hook of its
+-- own, this is called from end_submit(), handle_blit() and draw().
+function finish()
+	dbg("finish\n")
+
+	printf("\n")
+
+	-- TODO we get false-positives for 'NULL BATCH!' because we don't have
+	-- a really good way to differentiate between submits and cmds.  Ie.
+	-- with growable cmdstream, and a large # of tiles, IB1 can get split
+	-- across multiple buffers.  Since we ignore GMEM draws for window-
+	-- offset != 0,0, the later cmds will appear as null batches
+	if draws == 0 and blits == 0 then
+		if nullbatch then
+			printf("NULL BATCH!\n")
+		end
+		return
+	end
+
+	if draws > 0 then
+		printf("Batch:\n")
+		printf("-------\n")
+		printf("  # of draws: %u\n", draws)
+		printf("  mode: %s\n", drawmode)
+		if drawmode == "RM6_GMEM" then
+			printf("  bin size: %ux%u (%u bins)\n", binw, binh, nbins)
+		end
+		if depthtest or depthwrite then
+			printf("  ")
+			if depthtest then
+				printf("DEPTHTEST ")
+			end
+			if depthwrite then
+				printf("DEPTHWRITE")
+			end
+			printf("\n")
+		end
+		if stenciltest or stencilwrite then
+			printf("  ")
+			if stenciltest then
+				printf("STENCILTEST ")
+			end
+			if stencilwrite then
+				printf("STENCILWRITE")
+			end
+			printf("\n")
+		end
+	else
+		printf("Blit:\n")
+		printf("-----\n")
+	end
+
+	for base,mrt in pairs(mrts) do
+		printf("  MRT[0x%x:0x%x]:\t%ux%u\t\t%s (%s)", base, mrt.flag, mrt.w, mrt.h, mrt.format, mrt.samples)
+		if drawmode == "RM6_GMEM" then
+			-- in GMEM mode the events are tracked by GMEM address..
+			if cleared[mrt.gmem] then
+				printf("\tCLEARED")
+			end
+			if restored[mrt.gmem] then
+				printf("\tRESTORED")
+			end
+			if resolved[mrt.gmem] then
+				printf("\tRESOLVED")
+			end
+		else
+			-- ..otherwise clears are tracked by the buffer's base address
+			if cleared[mrt.base] then
+				printf("\tCLEARED")
+			end
+		end
+		printf("\n")
+	end
+
+	-- local, so that a global function is not (re)defined on every call:
+	local function print_source(source)
+		printf("  SRC[0x%x:0x%x]:\t%ux%u\t\t%s (%s)\n", source.base, source.flag, source.w, source.h, source.format, source.samples)
+	end
+
+	for base,source in pairs(sources) do
+		-- only show sources that have been previously rendered to, other
+		-- textures are less interesting.  Possibly this should be an
+		-- option somehow
+		-- (note that batches with fewer than 10 draws show all sources)
+		if draws < 10 then
+			print_source(source)
+		elseif allmrts[base] or draws == 0 then
+			print_source(source)
+		elseif source.flag and allmrts[source.flag] then
+			print_source(source)
+		end
+	end
+	reset()
+end
+
+-- Called at the end of each submit: report whatever batch is still pending.
+function end_submit()
+	dbg("end_submit\n")
+	finish()
+end
+
+-- Track the current mode:
+local mode = ""
+
+-- Packet handler: remember the marker carried in the packet's first dword.
+-- The value is stored as-is; users convert it with tostring() before
+-- comparing it against mode names.
+function CP_SET_MARKER(pkt, size)
+	mode = pkt[0].MARKER
+	dbg("mode: %s\n", mode)
+end
+
+-- Packet handler: only BLIT events are of interest.  In RM6_GMEM mode the
+-- blit is either a clear or a restore of the GMEM buffer at
+-- RB_BLIT_BASE_GMEM, and the matching render target is recorded; in
+-- RM6_RESOLVE mode the GMEM buffer is marked as resolved.
+function CP_EVENT_WRITE(pkt, size)
+	if tostring(pkt[0].EVENT) ~= "BLIT" then
+		return
+	end
+	nullbatch = false
+	local m = tostring(mode)
+	if m == "RM6_GMEM" then
+		-- either clear or restore:
+		if r.RB_BLIT_INFO.CLEAR_MASK == 0 then
+			restored[r.RB_BLIT_BASE_GMEM] = 1
+		else
+			cleared[r.RB_BLIT_BASE_GMEM] = 1
+		end
+		-- push_mrt() because we could have GMEM
+		-- passes with only a clear and no draws:
+		local flag = 0
+		local sysmem = 0;
+		-- try to match up the GMEM addr with the MRT/DEPTH state,
+		-- to avoid relying on RB_BLIT_DST also getting written:
+		for n = 0,r.RB_FS_OUTPUT_CNTL1.MRT-1 do
+			if r.RB_MRT[n].BASE_GMEM == r.RB_BLIT_BASE_GMEM then
+				-- 64b addresses are split over a LO/HI register pair
+				sysmem = r.RB_MRT[n].BASE_LO | (r.RB_MRT[n].BASE_HI << 32)
+				flag = r.RB_MRT_FLAG_BUFFER[n].ADDR_LO | (r.RB_MRT_FLAG_BUFFER[n].ADDR_HI << 32)
+				break
+			end
+		end
+		-- no color buffer matched, so see if it is the depth buffer:
+		if sysmem == 0 and r.RB_BLIT_BASE_GMEM == r.RB_DEPTH_BUFFER_BASE_GMEM then
+			sysmem = r.RB_DEPTH_BUFFER_BASE_LO | (r.RB_DEPTH_BUFFER_BASE_HI << 32)
+			flag = r.RB_DEPTH_FLAG_BUFFER_BASE_LO | (r.RB_DEPTH_FLAG_BUFFER_BASE_HI << 32)
+
+		end
+		--NOTE this can get confused by previous blits:
+		--if sysmem == 0 then
+		--	-- fallback:
+		--	sysmem = r.RB_BLIT_DST_LO | (r.RB_BLIT_DST_HI << 32)
+		--	flag = r.RB_BLIT_FLAG_DST_LO | (r.RB_BLIT_FLAG_DST_HI << 32)
+		--end
+		-- ignore the flag buffer address unless the blit dst uses flags
+		if not r.RB_BLIT_DST_INFO.FLAGS then
+			flag = 0
+		end
+		-- TODO maybe just emit RB_BLIT_DST_LO/HI for clears.. otherwise
+		-- we get confused by stale values in registers.. not sure
+		-- if this is a problem w/ blob
+		push_mrt(r.RB_BLIT_DST_INFO.COLOR_FORMAT,
+			r.RB_BLIT_SCISSOR_BR.X + 1,
+			r.RB_BLIT_SCISSOR_BR.Y + 1,
+			r.RB_BLIT_DST_INFO.SAMPLES,
+			sysmem,
+			flag,
+			r.RB_BLIT_BASE_GMEM)
+	elseif m == "RM6_RESOLVE" then
+		resolved[r.RB_BLIT_BASE_GMEM] = 1
+	else
+		printf("I am confused!!!\n")
+	end
+end
+
+-- Packet handler for texture state: record the texture as a source of the
+-- current batch.  The base and flag-buffer addresses are each assembled
+-- from a LO/HI pair of dwords.
+function A6XX_TEX_CONST(pkt, size)
+	local fmt = pkt[0].FMT
+	local width = pkt[1].WIDTH
+	local height = pkt[1].HEIGHT
+	local samples = pkt[0].SAMPLES
+	local base = pkt[4].BASE_LO | (pkt[5].BASE_HI << 32)
+	local flag = pkt[7].FLAG_LO | (pkt[8].FLAG_HI << 32)
+
+	push_source(fmt, width, height, samples, base, flag)
+end
+
+-- Handle a 2D blit.  A blit is reported as a pass of its own: a pending
+-- batch of draws is flushed first, then the blit's destination (and, unless
+-- it is a solid-color fill, its source) is recorded and reported.
+function handle_blit()
+	-- blob sometimes uses CP_BLIT for resolves, so filter those out:
+	-- TODO it would be nice to not hard-code GMEM addr:
+	-- TODO I guess the src can be an offset from GMEM addr..
+	if r.SP_PS_2D_SRC_LO == 0x100000 and not r.RB_2D_BLIT_CNTL.SOLID_COLOR then
+		resolved[0] = 1
+		return
+	end
+
+	if draws > 0 then
+		finish()
+	end
+	reset()
+	drawmode = "BLIT"
+
+	local dst = r.RB_2D_DST_LO | (r.RB_2D_DST_HI << 32)
+	local dst_flags = r.RB_2D_DST_FLAGS_LO | (r.RB_2D_DST_FLAGS_HI << 32)
+
+	-- This kinda assumes that we are doing full img blits, which is maybe
+	-- Not completely legit.  We could perhaps instead just track pitch and
+	-- size/pitch??  Or maybe the size doesn't matter much
+	push_mrt(r.RB_2D_DST_INFO.COLOR_FORMAT,
+		r.GRAS_2D_DST_BR.X + 1,
+		r.GRAS_2D_DST_BR.Y + 1,
+		"MSAA_ONE",
+		dst,
+		dst_flags,
+		-1)
+
+	if r.RB_2D_BLIT_CNTL.SOLID_COLOR then
+		-- a fill has no source, it just clears the destination
+		dbg("CLEAR=%x\n", dst)
+		cleared[dst] = 1
+	else
+		push_source(r.SP_2D_SRC_FORMAT.COLOR_FORMAT,
+			r.GRAS_2D_SRC_BR_X.X + 1,
+			r.GRAS_2D_SRC_BR_Y.Y + 1,
+			"MSAA_ONE",
+			r.SP_PS_2D_SRC_LO | (r.SP_PS_2D_SRC_HI << 32),
+			r.SP_PS_2D_SRC_FLAGS_LO | (r.SP_PS_2D_SRC_FLAGS_HI << 32))
+	end
+
+	blits = blits + 1
+	finish()
+end
+
+-- Returns true when going from curmode to newmode stays within the same
+-- render pass (BINNING -> GMEM, or GMEM -> RESOLVE), false otherwise.
+function valid_transition(curmode, newmode)
+	local binning_to_gmem = (curmode == "RM6_BINNING" and newmode == "RM6_GMEM")
+	local gmem_to_resolve = (curmode == "RM6_GMEM" and newmode == "RM6_RESOLVE")
+	return binning_to_gmem or gmem_to_resolve
+end
+
+-- Called for every draw (and draw-like event).  Blits are handed off to
+-- handle_blit(); for real draws the render targets, depth buffer and
+-- depth/stencil usage found in the current register state are folded into
+-- the batch being accumulated.
+function draw(primtype, nindx)
+	dbg("draw: %s (%s)\n", primtype, mode)
+	nullbatch = false
+
+	if primtype == "BLIT_OP_SCALE" then
+		handle_blit()
+		return
+	end
+	if primtype == "EVENT:BLIT" then
+		return
+	end
+
+	local m = tostring(mode)
+
+	-- detect changes in drawmode which indicate a different
+	-- pass..  BINNING->GMEM means same pass, but other
+	-- transitions mean different pass:
+	if drawmode and m ~= drawmode then
+		dbg("%s -> %s transition\n", drawmode, m)
+		if not valid_transition(drawmode, m) then
+			dbg("invalid transition, new render pass!\n")
+			finish()
+			reset()
+		end
+	end
+
+	-- binning pass draws only set the mode, they are not counted:
+	if m == "RM6_BINNING" then
+		drawmode = m
+		return
+	end
+
+	local is_gmem = (m == "RM6_GMEM")
+	if not is_gmem and m ~= "RM6_BYPASS" then
+		if m == "RM6_RESOLVE" and primtype == "EVENT:BLIT" then
+			return
+		end
+		printf("unknown MODE %s for primtype %s\n", m, primtype)
+		return
+	end
+
+	-- Only count the first tile for GMEM mode to avoid counting
+	-- each draw for each tile
+	if is_gmem and (r.RB_WINDOW_OFFSET.X ~= 0 or r.RB_WINDOW_OFFSET.Y ~= 0) then
+		return
+	end
+
+	drawmode = m
+
+	-- RB_RENDER_COMPONENTS has one field per render target, RT0..RT7:
+	local render_components = {}
+	for n = 0,7 do
+		render_components[n] = r.RB_RENDER_COMPONENTS["RT" .. n]
+	end
+
+	for n = 0,r.RB_FS_OUTPUT_CNTL1.MRT-1 do
+		if render_components[n] ~= 0 then
+			push_mrt(r.RB_MRT[n].BUF_INFO.COLOR_FORMAT,
+				r.GRAS_SC_SCREEN_SCISSOR[0].BR.X + 1,
+				r.GRAS_SC_SCREEN_SCISSOR[0].BR.Y + 1,
+				r.RB_MSAA_CNTL.SAMPLES,
+				r.RB_MRT[n].BASE_LO | (r.RB_MRT[n].BASE_HI << 32),
+				r.RB_MRT_FLAG_BUFFER[n].ADDR_LO | (r.RB_MRT_FLAG_BUFFER[n].ADDR_HI << 32),
+				r.RB_MRT[n].BASE_GMEM)
+		end
+	end
+
+	-- the depth buffer is tracked like any other render target:
+	local depthbase = r.RB_DEPTH_BUFFER_BASE_LO |
+			(r.RB_DEPTH_BUFFER_BASE_HI << 32)
+
+	if depthbase ~= 0 then
+		push_mrt(r.RB_DEPTH_BUFFER_INFO.DEPTH_FORMAT,
+			r.GRAS_SC_SCREEN_SCISSOR[0].BR.X + 1,
+			r.GRAS_SC_SCREEN_SCISSOR[0].BR.Y + 1,
+			r.RB_MSAA_CNTL.SAMPLES,
+			depthbase,
+			r.RB_DEPTH_FLAG_BUFFER_BASE_LO | (r.RB_DEPTH_FLAG_BUFFER_BASE_HI << 32),
+			r.RB_DEPTH_BUFFER_BASE_GMEM)
+	end
+
+	-- these flags are sticky for the whole batch: a single draw using
+	-- the feature is enough to set them
+	if r.RB_DEPTH_CNTL.Z_ENABLE then
+		depthtest = true
+	end
+
+	if r.RB_DEPTH_CNTL.Z_WRITE_ENABLE then
+		depthwrite = true
+	end
+
+	if r.RB_STENCIL_CONTROL.STENCIL_ENABLE then
+		stenciltest = true
+	end
+
+	-- clearly 0 != false.. :-/
+	if r.RB_STENCILWRMASK.WRMASK ~= 0 then
+		stencilwrite = true
+	end
+
+	-- TODO should also check for stencil buffer for z32+s8 case
+
+	if is_gmem then
+		binw = r.VSC_BIN_SIZE.WIDTH
+		binh = r.VSC_BIN_SIZE.HEIGHT
+		nbins = r.VSC_BIN_COUNT.NX * r.VSC_BIN_COUNT.NY
+	end
+
+	draws = draws + 1
+end
+
diff --git a/src/freedreno/decode/scripts/sanity-a6xx.lua b/src/freedreno/decode/scripts/sanity-a6xx.lua
new file mode 100644
index 0000000..68e4c73
--- /dev/null
+++ b/src/freedreno/decode/scripts/sanity-a6xx.lua
@@ -0,0 +1,76 @@
+-- Parse cmdstream dump and check for common errors
+--  1) Check for overflowing HLSQ_xS_CNTL.CONSTLEN
+--  2) Check for constant uploads that overwrite each other.  The
+--     range checking is reset on each draw, since it is a valid
+--     use-case to do partial constant upload.  But if we see two
+--     CP_LOAD_STATE* that overwrite the same range of constants
+--     within the same draw, that is almost certainly unintentional.
+--
+-- TODO add more checks
+-- TODO maybe some parts could be shared across
+--      different generations
+
+--local posix = require "posix"
+
+-- C-style printf: format the arguments and write the result to the default
+-- output; returns whatever io.write() returns.
+function printf(fmt, ...)
+	local text = string.format(fmt, ...)
+	return io.write(text)
+end
+
+-- Debug trace hook; a no-op unless the printf() call below is uncommented.
+function dbg(fmt, ...)
+	--printf(fmt, ...)
+end
+
+-- names of the shader stages that get their own table of uploaded
+-- constant ranges (see reset_constranges()):
+stages = {
+	"SB6_VS_SHADER",
+	"SB6_HS_SHADER",
+	"SB6_DS_SHADER",
+	"SB6_GS_SHADER",
+	"SB6_FS_SHADER",
+	"SB6_CS_SHADER",
+}
+
+-- maps shader stage to HLSQ_xS_CNTL register name, ie. the register whose
+-- CONSTLEN field the uploads for that stage are checked against:
+cntl_regs = {
+	["SB6_VS_SHADER"] = "HLSQ_VS_CNTL",
+	["SB6_HS_SHADER"] = "HLSQ_HS_CNTL",
+	["SB6_DS_SHADER"] = "HLSQ_DS_CNTL",
+	["SB6_GS_SHADER"] = "HLSQ_GS_CNTL",
+	["SB6_FS_SHADER"] = "HLSQ_FS_CNTL",
+	["SB6_CS_SHADER"] = "HLSQ_CS_CNTL",
+}
+
+-- initialize constant updated ranges:
+--   constranges[stagename] -> table of offsets that have been uploaded
+constranges = {}
+
+-- Give every stage listed in 'stages' a fresh, empty table of offsets.
+function reset_constranges()
+	for idx = 1, #stages do
+		constranges[stages[idx]] = {}
+	end
+end
+
+reset_constranges()
+
+printf("Checking cmdstream...\n")
+
+-- handle used below to read register state as r[REGNAME].FIELD
+-- NOTE(review): rnn.init() is implemented on the C side; presumably it
+-- loads the register database for the named gpu -- confirm.
+local r = rnn.init("a630")
+
+-- Called for every draw: announces the draw and starts a new set of
+-- uploaded constant ranges.
+function draw(primtype, nindx)
+	printf("draw!\n")
+	-- reset ranges of uploaded consts on each draw:
+	reset_constranges()
+end
+
+-- Packet handler: for constant uploads, check that the end of the uploaded
+-- range (DST_OFF + NUM_UNIT) does not exceed the CONSTLEN programmed in the
+-- stage's HLSQ_xS_CNTL register, and print an error if it does.
+function CP_LOAD_STATE6(pkt, size)
+	if tostring(pkt[0].STATE_TYPE) ~= "ST6_CONSTANTS" then
+		return
+	end
+	dbg("got CP_LOAD_STATE6\n")
+
+	-- locals, so that nothing leaks into the script's global namespace
+	local stage = tostring(pkt[0].STATE_BLOCK)
+	local maxoff = pkt[0].DST_OFF + pkt[0].NUM_UNIT
+	local cntl_reg = cntl_regs[stage]
+
+	if cntl_reg == nil then
+		-- not one of the stages in cntl_regs, so there is no CONSTLEN
+		-- to compare against (and r[nil] must not be looked up)
+		dbg("no CNTL register known for %s\n", stage)
+		return
+	end
+
+	local constlen = r[cntl_reg].CONSTLEN
+	dbg("looking for %s.. max=%d vs %d\n", cntl_reg, maxoff, constlen)
+	if maxoff > constlen then
+		printf("ERROR: invalid max constant offset for stage %s: %d vs %d\n", stage, maxoff, constlen)
+	end
+end
diff --git a/src/freedreno/decode/scripts/test.lua b/src/freedreno/decode/scripts/test.lua
new file mode 100644
index 0000000..e9d8db2
--- /dev/null
+++ b/src/freedreno/decode/scripts/test.lua
@@ -0,0 +1,31 @@
+-- runs once, when the script is loaded:
+io.write("HELLO WORLD\n")
+
+-- handle used below to read register state as r.REGNAME[.FIELD]
+r = rnn.init("a630")
+
+-- Called at the start of each cmdstream file, with the file's name.
+function start_cmdstream(name)
+  io.write("START: " .. name .. "\n")
+end
+
+-- Called for every draw.  Demonstrates the two ways of getting at register
+-- state: decoded fields via the rnn handle (r.REG[idx].FIELD), and raw
+-- values by register offset via the "regs" library.
+function draw(primtype, nindx)
+  io.write("DRAW: " .. primtype .. ", " .. nindx .. "\n")
+  -- io.write("GRAS_CL_VPORT_XOFFSET: " .. r.GRAS_CL_VPORT_XOFFSET .. "\n")
+  io.write("RB_MRT[0].CONTROL.ROP_CODE: " .. r.RB_MRT[0].CONTROL.ROP_CODE .. "\n")
+  io.write("SP_VS_OUT[0].A_COMPMASK: " .. r.SP_VS_OUT[0].A_COMPMASK .. "\n")
+  --io.write("RB_DEPTH_CONTROL.Z_ENABLE: " .. tostring(r.RB_DEPTH_CONTROL.Z_ENABLE) .. "\n")
+  io.write("0x2280: written=" .. regs.written(0x2280) .. ", lastval=" .. regs.lastval(0x2280) .. ", val=" .. regs.val(0x2280) .. "\n")
+end
+
+-- Packet handler example: pkt[n] gives access to the fields of the
+-- packet's n'th dword, size is the packet size in dwords.
+function A6XX_TEX_CONST(pkt, size)
+  io.write("\n-------- " .. size .. "\n")
+  io.write("-------- w=" .. pkt[1].WIDTH .. ", h=" .. pkt[1].HEIGHT .. "\n")
+  io.write("\n");
+end
+
+-- Called at the end of each cmdstream file.
+function end_cmdstream()
+  io.write("END\n")
+end
+
+-- Called once, after the last cmdstream file.
+function finish()
+  io.write("FINISH\n")
+end
+
diff --git a/src/freedreno/decode/scripts/tex3d-layout.lua b/src/freedreno/decode/scripts/tex3d-layout.lua
new file mode 100644
index 0000000..2d5069f
--- /dev/null
+++ b/src/freedreno/decode/scripts/tex3d-layout.lua
@@ -0,0 +1,137 @@
+-- Parse logs from test-quad-textured-3d.c to extract layer/level
+-- offsets
+--
+-- We figure out the offsets from blits, but there may be some
+-- unrelated blits.  So just save all of them until we find the
+-- texture state for the 3d texture.  This gives us the base
+-- address, and the miplevel #0 width/height/depth.  Then work
+-- backwards from there finding the blits to the same dst buffer
+-- and deducing the miplevel from the minified dimensions
+
+local posix = require "posix"
+
+io.write("Analyzing Data...\n")
+
+-- blits seen so far in the current cmdstream, indexed 0..nallblits-1:
+local allblits = {}
+local nallblits = 0
+-- handle used below to read register state as r.REGNAME[.FIELD]
+local r = rnn.init("a630")
+
+-- Size of a dimension 'lvls' mipmap levels down: halved per level, but
+-- never smaller than 1.
+function minify(val, lvls)
+  return math.max(val >> lvls, 1)
+end
+
+-- C-style printf: format the arguments and write the result to the default
+-- output; returns whatever io.write() returns.
+function printf(fmt, ...)
+  local text = string.format(fmt, ...)
+  return io.write(text)
+end
+
+-- Called at the start of each cmdstream file: announce the file and forget
+-- the blits collected from the previous one.
+function start_cmdstream(name)
+  io.write("Parsing " .. name .. "\n")
+  allblits, nallblits = {}, 0
+end
+
+-- Called for every draw; only BLIT_OP_SCALE blits whose destination starts
+-- at 0,0 are of interest.  Their size, pitch and destination address (plus
+-- the base address of the buffer containing it) are saved for later.
+function draw(primtype, nindx)
+  if primtype ~= "BLIT_OP_SCALE" then
+    return
+  end
+
+  -- Just in case, filter out anything that isn't starting
+  -- at 0,0
+  if r.GRAS_2D_DST_TL.X ~= 0 or r.GRAS_2D_DST_TL.Y ~= 0 then
+    return
+  end
+
+  local dst = r.RB_2D_DST_LO | (r.RB_2D_DST_HI << 32)
+
+  --printf("Found blit: 0x%x (0x%x)\n", dst, bos.base(dst))
+  allblits[nallblits] = {
+    width   = r.GRAS_2D_DST_BR.X + 1,
+    height  = r.GRAS_2D_DST_BR.Y + 1,
+    pitch   = r.RB_2D_DST_SIZE.PITCH,
+    addr    = dst,
+    base    = bos.base(dst),
+    endaddr = 0,  -- filled in later
+  }
+  nallblits = nallblits + 1
+end
+
+-- Packet handler for texture state.  For a 3d texture (DEPTH > 1) this
+-- collects the saved blits that landed in the texture's buffer at or after
+-- its base address, and prints one line per blit with the level/layer
+-- deduced from the blit dimensions.
+function A6XX_TEX_CONST(pkt, size)
+  local depth0 = pkt[5].DEPTH
+
+  -- ignore any texture state w/ DEPTH=1, these aren't the 3d tex state we
+  -- are looking for
+  if depth0 <= 1 then
+    return
+  end
+
+  local base    = pkt[4].BASE_LO | (pkt[5].BASE_HI << 32)
+  local width0  = pkt[1].WIDTH
+  local height0 = pkt[1].HEIGHT
+
+  printf("Found texture state: %ux%ux%u (MIN_LAYERSZ=0x%x)\n",
+         width0, height0, depth0, pkt[3].MIN_LAYERSZ)
+
+  -- Note that in some case the texture has some extra page or so
+  -- at the beginning:
+  local basebase = bos.base(base)
+  printf("base: 0x%x (0x%x)\n", base, basebase)
+
+  -- see if we can find the associated blits..  The blob always seems to
+  -- start from the lower (larger) mipmap levels and layers, so we don't
+  -- need to sort by dst address.  Also, while we are at it, fill in the
+  -- end-addr (at least for everything but the last blit)
+  local blits = {}
+  for n = 0, nallblits - 1 do
+    local blit = allblits[n]
+    --printf("blit addr: 0x%x (0x%x)\n", blit.addr, blit.base)
+    if blit.base == basebase and blit.addr >= base then
+      -- the previous match ends where this one starts
+      local prev = blits[#blits]
+      if prev then
+        prev.endaddr = blit.addr
+      end
+      blits[#blits + 1] = blit
+    end
+  end
+
+  -- now go thru the relevant blits and print out interesting details
+  local level, layer = 0, 0
+  -- track current width/height to detect changing mipmap level
+  local w, h = width0, height0
+  for idx, blit in ipairs(blits) do
+    --printf("%u: %ux%u, addr=%x\n", idx - 1, blit.width, blit.height, blit.addr)
+    if w ~= blit.width or h ~= blit.height then
+      -- dimensions changed, so this blit starts the next level
+      level = level + 1
+      layer = 0
+
+      if blit.width ~= minify(w, 1) or blit.height ~= minify(h, 1) then
+        printf("I am confused! %ux%u vs %ux%u\n", blit.width, blit.height, minify(w, 1), minify(h, 1))
+        printf("addr=%x\n", blit.addr)
+        --return
+      end
+
+      w = blit.width
+      h = blit.height
+    end
+
+    printf("level=%u, layer=%u, sz=%ux%u, pitch=%u, offset=0x%x, addr=%x",
+           level, layer, w, h, blit.pitch, blit.addr - base, blit.addr)
+    if blit.endaddr ~= 0 then
+      local layersz = blit.endaddr - blit.addr
+      printf(", layersz=0x%x, alignedheight=%f", layersz, layersz / blit.pitch)
+    end
+    printf("\n")
+
+    layer = layer + 1
+  end
+  printf("\n\n")
+end
+
diff --git a/src/freedreno/decode/scripts/texturator-to-unit-test-5xx.lua b/src/freedreno/decode/scripts/texturator-to-unit-test-5xx.lua
new file mode 100644
index 0000000..b0ac8cb
--- /dev/null
+++ b/src/freedreno/decode/scripts/texturator-to-unit-test-5xx.lua
@@ -0,0 +1,200 @@
+-- Parse logs from https://github.com/freedreno/freedreno/
+-- test-texturator.c to generate a src/freedreno/fdl/fd5_layout_test.c
+-- block.  We figure out the offsets from blits, but there may be some
+-- unrelated blits.  So just save all of them until we find the
+-- texture state.  This gives us the base address, and the miplevel #0
+-- width/height/depth.  Then work backwards from there finding the
+-- blits to the same dst buffer and deducing the miplevel from the
+-- minified dimensions
+
+local posix = require "posix"  -- NOTE(review): 'posix' is not referenced again in this script
+
+io.write("Analyzing Data...\n")
+
+local r = rnn.init("a530")  -- rnn handle; the handlers below read fields such as r.RB_BLIT_DST_LO from it
+local found_tex = 0  -- set to 1 by A5XX_TEX_CONST so only one texture state is processed
+
+local allblits = {}  -- every blit recorded for the current cmdstream, indexed from 0
+local nallblits = 0  -- number of entries in allblits
+
+-- Among the recorded blits matching base/width/height, return the one with
+-- the lowest address, or nil when nothing matches
+function get_first_blit(base, width, height)
+  local best = nil
+  for idx = 0, nallblits - 1 do
+    local cand = allblits[idx]
+    local same_target = cand.base == base and cand.width == width and cand.height == height
+    if same_target and (best == nil or cand.addr < best.addr) then
+      best = cand
+    end
+  end
+
+  return best
+end
+
+-- Dimension of a mip level: val shifted down by lvls, but never
+-- smaller than 1
+function minify(val, lvls)
+  local shifted = val >> lvls
+  if shifted >= 1 then return shifted end
+  return 1
+end
+
+function printf(fmt, ...)  -- C-style helper: format with string.format, emit with io.write
+  return io.write(string.format(fmt, ...))
+end
+
+function start_cmdstream(name)
+  io.write("Parsing " .. name .. "\n")
+  -- blits recorded for the previous cmdstream no longer apply
+  allblits, nallblits = {}, 0
+end
+
+-- Record texture upload blits done through CP_EVENT_WRITE
+function CP_EVENT_WRITE(pkt, size)
+  if tostring(pkt[0].EVENT) ~= "BLIT" then
+    return
+  end
+
+  -- dst and UBWC flag addresses each come from a LO/HI register pair
+  local addr = r.RB_BLIT_DST_LO | (r.RB_BLIT_DST_HI << 32)
+  local ubwc_addr = r.RB_BLIT_FLAG_DST_LO | (r.RB_BLIT_FLAG_DST_HI << 32)
+  local blit = {
+    width = r.RB_RESOLVE_CNTL_2.X + 1,
+    height = r.RB_RESOLVE_CNTL_2.Y + 1,
+    pitch = r.RB_BLIT_DST_PITCH,
+    addr = addr, base = bos.base(addr),
+    ubwc_addr = ubwc_addr, ubwc_base = bos.base(ubwc_addr),
+    ubwc_pitch = r.RB_BLIT_FLAG_DST_PITCH,
+    endaddr = 0,  -- filled in later
+  }
+  printf("Found event blit: 0x%x (0x%x) %dx%d UBWC 0x%x (0x%x) tiled %s\n", blit.addr, blit.base, blit.width, blit.height, blit.ubwc_addr, blit.ubwc_base, r.RB_RESOLVE_CNTL_3.TILED)
+  allblits[nallblits] = blit
+  nallblits = nallblits + 1
+end
+
+function CP_BLIT(pkt, size)
+  -- Just in case, skip any blit whose source rectangle doesn't start at 0,0
+  if not (pkt[1].SRC_X1 == 0 and pkt[1].SRC_Y1 == 0) then
+    return
+  end
+
+  -- dst and UBWC flag addresses each come from a LO/HI register pair
+  local addr = r.RB_2D_DST_LO | (r.RB_2D_DST_HI << 32)
+  local ubwc_addr = r.RB_2D_DST_FLAGS_LO | (r.RB_2D_DST_FLAGS_HI << 32)
+  local blit = {
+    width = pkt[2].SRC_X2 + 1,
+    height = pkt[2].SRC_Y2 + 1,
+    pitch = r.RB_2D_DST_SIZE.PITCH,
+    addr = addr, base = bos.base(addr),
+    ubwc_addr = ubwc_addr, ubwc_base = bos.base(ubwc_addr),
+    ubwc_pitch = r.RB_2D_DST_FLAGS_PITCH,
+    endaddr = 0,  -- filled in later
+  }
+  printf("Found cp blit: 0x%x (0x%x) %dx%d UBWC 0x%x (0x%x) %s\n", blit.addr, blit.base, blit.width, blit.height, blit.ubwc_addr, blit.ubwc_base, r.RB_2D_DST_INFO.TILE_MODE)
+
+  allblits[nallblits] = blit
+  nallblits = nallblits + 1
+end
+
+function A5XX_TEX_CONST(pkt, size)
+  -- Emit a layout-test block for the first texture state seen.  NOTE(review):
+  -- nothing below actually filters on DEPTH or on the texture being 3d
+
+  local base = pkt[4].BASE_LO | (pkt[5].BASE_HI << 32)
+  -- UBWC base on a5xx seems to be at the start of each miplevel, followed by pixels
+  -- somewhere past that.
+  local ubwc_base = base
+  local width0  = pkt[1].WIDTH
+  local height0 = pkt[1].HEIGHT
+  local depth0  = pkt[5].DEPTH
+
+  if (found_tex ~= 0) then  -- a texture state was already handled; ignore the rest
+    return
+  end
+  found_tex = 1
+
+  printf("Found texture state:\n  %ux%ux%u (%s, %s, UBWC=%s)\n",
+         width0, height0, depth0, pkt[0].FMT, pkt[0].TILE_MODE, tostring(pkt[3].FLAG))
+
+  -- Note that in some case the texture has some extra page or so
+  -- at the beginning:
+  local basebase = bos.base(base)
+  printf("base: 0x%x (0x%x)\n", base, basebase)
+  printf("ubwcbase: 0x%x (0x%x)\n", ubwc_base, bos.base(ubwc_base))
+
+  -- see if we can find the associated blits..  The blob always seems to
+  -- start from the lower (larger) mipmap levels and layers, so we don't
+  -- need to sort by dst address.  Also, while we are at it, fill in the
+  -- end-addr (at least for everything but the last blit)
+  local blits = {}  -- NOTE(review): blits/nblits are filled in but never read afterwards
+  local nblits = 0
+  local lastblit = nil
+  for n = 0,nallblits-1 do
+    local blit = allblits[n]
+    --printf("blit addr: 0x%x (0x%x)\n", blit.addr, blit.base)
+    if blit.base == basebase and blit.addr >= base then
+      blits[nblits] = blit
+      nblits = nblits + 1
+      if lastblit then
+        lastblit.endaddr = blit.addr  -- previous matching blit ends where this one starts
+      end
+      lastblit = blit
+    end
+  end
+
+  printf("	{\n")
+  printf("		.format = %s,\n", pkt[0].FMT)
+  if (tostring(pkt[2].TYPE) == "A5XX_TEX_3D") then
+    printf("		.is_3d = true,\n")
+  end
+
+  printf("		.layout = {\n")
+  printf("			.tile_mode = %s,\n", pkt[0].TILE_MODE)
+  printf("			.ubwc = %s,\n", tostring(pkt[3].FLAG))
+
+  if (tostring(pkt[2].TYPE) == "A5XX_TEX_3D") then
+    printf("			.width0 = %d, .height0 = %d, .depth0 = %d,\n", width0, height0, depth0)
+  else
+    printf("			.width0 = %d, .height0 = %d,\n", width0, height0)
+  end
+
+  printf("			.slices = {\n")
+  local w = 0  -- NOTE(review): shadowed by the loop-local w below, never read
+  local h = 0  -- NOTE(review): shadowed by the loop-local h below, never read
+  local level = 0
+  repeat  -- one pass per mip level; the 'until' test sees the w/h declared in the body
+    local w = minify(width0, level)
+    local h = minify(height0, level)
+    local blit = get_first_blit(basebase, w, h)
+    if blit then  -- levels with no matching blit produce no entry
+      printf("				{ .offset = %d, .pitch = %u },\n",
+          blit.addr - base,
+          blit.pitch);
+    end
+    level = level + 1
+  until w == 1 and h == 1
+  printf("			},\n")
+
+  if pkt[3].FLAG then
+    printf("			.ubwc_slices = {\n")
+    level = 0
+    repeat  -- same walk as above, but printing the UBWC address/pitch of each blit
+      local w = minify(width0, level)
+      local h = minify(height0, level)
+      local blit = get_first_blit(basebase, w, h)
+      if blit then
+        printf("				{ .offset = %d, .pitch = %u },\n",
+            blit.ubwc_addr - ubwc_base,
+            blit.ubwc_pitch);
+      end
+      level = level + 1
+    until w == 1 and h == 1
+    printf("			},\n")
+  end
+
+  printf("		},\n")
+  printf("	},\n")
+  printf("\n\n")
+end
+
diff --git a/src/freedreno/decode/scripts/texturator-to-unit-test.lua b/src/freedreno/decode/scripts/texturator-to-unit-test.lua
new file mode 100644
index 0000000..8836d59
--- /dev/null
+++ b/src/freedreno/decode/scripts/texturator-to-unit-test.lua
@@ -0,0 +1,179 @@
+-- Parse logs from https://github.com/freedreno/freedreno/
+-- test-texturator.c to generate a src/freedreno/fdl/fd6_layout_test.c
+-- block.  We figure out the offsets from blits, but there may be some
+-- unrelated blits.  So just save all of them until we find the
+-- texture state.  This gives us the base address, and the miplevel #0
+-- width/height/depth.  Then work backwards from there finding the
+-- blits to the same dst buffer and deducing the miplevel from the
+-- minified dimensions
+
+local posix = require "posix"  -- NOTE(review): 'posix' is not referenced again in this script
+
+io.write("Analyzing Data...\n")
+
+local r = rnn.init("a630")  -- rnn handle; draw() reads fields such as r.RB_2D_DST_LO from it
+local found_tex = 0  -- set to 1 by A6XX_TEX_CONST so only one texture state is processed
+
+local allblits = {}  -- every blit recorded for the current cmdstream, indexed from 0
+local nallblits = 0  -- number of entries in allblits
+
+-- Among the recorded blits matching base/width/height, return the one with
+-- the lowest address, or nil when nothing matches
+function get_first_blit(base, width, height)
+  local best = nil
+  for idx = 0, nallblits - 1 do
+    local cand = allblits[idx]
+    local same_target = cand.base == base and cand.width == width and cand.height == height
+    if same_target and (best == nil or cand.addr < best.addr) then
+      best = cand
+    end
+  end
+
+  return best
+end
+
+-- Dimension of a mip level: val shifted down by lvls, but never
+-- smaller than 1
+function minify(val, lvls)
+  local shifted = val >> lvls
+  if shifted >= 1 then return shifted end
+  return 1
+end
+
+function printf(fmt, ...)  -- C-style helper: format with string.format, emit with io.write
+  return io.write(string.format(fmt, ...))
+end
+
+function start_cmdstream(name)
+  io.write("Parsing " .. name .. "\n")
+  -- blits recorded for the previous cmdstream no longer apply
+  allblits, nallblits = {}, 0
+end
+
+-- Record every BLIT_OP_SCALE draw as a candidate blit for A6XX_TEX_CONST
+function draw(primtype, nindx)
+  if primtype ~= "BLIT_OP_SCALE" then
+    return
+  end
+
+  -- Just in case, filter out anything that isn't starting at 0,0
+  if r.GRAS_2D_DST_TL.X ~= 0 or r.GRAS_2D_DST_TL.Y ~= 0 then
+    return
+  end
+
+  local blit = {}
+
+  blit.width   = r.GRAS_2D_DST_BR.X + 1
+  blit.height  = r.GRAS_2D_DST_BR.Y + 1
+  blit.pitch   = r.RB_2D_DST_SIZE.PITCH
+  blit.addr    = r.RB_2D_DST_LO | (r.RB_2D_DST_HI << 32)
+  blit.base    = bos.base(blit.addr)
+  blit.ubwc_addr = r.RB_2D_DST_FLAGS_LO | (r.RB_2D_DST_FLAGS_HI << 32)
+  blit.ubwc_base = bos.base(blit.ubwc_addr)
+  blit.ubwc_pitch = r.RB_2D_DST_FLAGS_PITCH.PITCH
+  blit.endaddr = 0  -- filled in later
+  printf("Found blit: 0x%x (0x%x) %dx%d UBWC 0x%x (0x%x)\n", blit.addr, blit.base, blit.width, blit.height, blit.ubwc_addr, blit.ubwc_base)
+
+  allblits[nallblits] = blit
+  nallblits = nallblits + 1
+end
+
+function A6XX_TEX_CONST(pkt, size)
+  -- Emit a layout-test block for the first texture state seen; any later
+  -- texture state is ignored (see found_tex below)
+
+  local base = pkt[4].BASE_LO | (pkt[5].BASE_HI << 32)
+  local ubwc_base = pkt[7].FLAG_LO | (pkt[8].FLAG_HI << 32)
+  local width0  = pkt[1].WIDTH
+  local height0 = pkt[1].HEIGHT
+  local depth0  = pkt[5].DEPTH
+
+  if (found_tex ~= 0) then
+    return
+  end
+  found_tex = 1
+
+  printf("Found texture state:\n  %ux%ux%u (%s, %s, MIN_LAYERSZ=0x%x, TILE_ALL=%s, UBWC=%s FLAG_LOG2=%ux%u)\n",
+         width0, height0, depth0, pkt[0].FMT, pkt[0].TILE_MODE, pkt[3].MIN_LAYERSZ, tostring(pkt[3].TILE_ALL), tostring(pkt[3].FLAG), pkt[10].FLAG_BUFFER_LOGW, pkt[10].FLAG_BUFFER_LOGH)
+
+  -- Note that in some case the texture has some extra page or so
+  -- at the beginning:
+  local basebase = bos.base(base)
+  printf("base: 0x%x (0x%x)\n", base, basebase)
+  printf("ubwcbase: 0x%x (0x%x)\n", ubwc_base, bos.base(ubwc_base))
+
+  -- see if we can find the associated blits..  The blob always seems to
+  -- start from the lower (larger) mipmap levels and layers, so we don't
+  -- need to sort by dst address.  Also, while we are at it, fill in the
+  -- end-addr (at least for everything but the last blit)
+  local blits = {}
+  local nblits = 0
+  local lastblit = nil
+  for n = 0,nallblits-1 do
+    local blit = allblits[n]
+    --printf("blit addr: 0x%x (0x%x)\n", blit.addr, blit.base)
+    if blit.base == basebase and blit.addr >= base then
+      blits[nblits] = blit
+      nblits = nblits + 1
+      if lastblit then
+        lastblit.endaddr = blit.addr
+      end
+      lastblit = blit
+    end
+  end
+
+  printf("	{\n")
+  printf("		.format = %s,\n", pkt[0].FMT)
+  if (tostring(pkt[2].TYPE) == "A6XX_TEX_3D") then
+    printf("		.is_3d = true,\n")
+  end
+
+  printf("		.layout = {\n")
+  printf("			.tile_mode = %s,\n", pkt[0].TILE_MODE)
+  printf("			.ubwc = %s,\n", tostring(pkt[3].FLAG))
+
+  if (tostring(pkt[2].TYPE) == "A6XX_TEX_3D") then
+    printf("			.width0 = %d, .height0 = %d, .depth0 = %d,\n", width0, height0, depth0)
+  else
+    printf("			.width0 = %d, .height0 = %d,\n", width0, height0)
+  end
+
+  printf("			.slices = {\n")
+  -- one entry per mip level, from level 0 down to 1x1; levels without a
+  -- matching blit are skipped
+  local level = 0
+  repeat
+    local w = minify(width0, level)
+    local h = minify(height0, level)
+    local blit = get_first_blit(basebase, w, h)
+    if blit then
+      printf("				{ .offset = %d, .pitch = %u },\n",
+          blit.addr - base,
+          blit.pitch);
+    end
+    level = level + 1
+  until w == 1 and h == 1
+  printf("			},\n")
+
+  if pkt[3].FLAG then
+    printf("			.ubwc_slices = {\n")
+    level = 0
+    repeat
+      local w = minify(width0, level)
+      local h = minify(height0, level)
+      local blit = get_first_blit(basebase, w, h)
+      if blit then
+        printf("				{ .offset = %d, .pitch = %u },\n",
+            blit.ubwc_addr - ubwc_base,
+            blit.ubwc_pitch);
+      end
+      level = level + 1
+    until w == 1 and h == 1
+    printf("			},\n")
+  end
+
+  printf("		},\n")
+  printf("	},\n")
+  printf("\n\n")
+end
+
diff --git a/src/freedreno/decode/util.h b/src/freedreno/decode/util.h
new file mode 100644
index 0000000..1ec0202
--- /dev/null
+++ b/src/freedreno/decode/util.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2012-2018 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __UTIL_H__
+#define __UTIL_H__
+
+#include <ctype.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* old-style program binary XOR'd ascii w/ 0xff */
+#ifndef ASCII_XOR
+#  define ASCII_XOR 0
+#endif
+
+static inline const char *tab(int lvl)
+{
+	static const char TAB[] = "\t\t\t\t\t\t\t\t"; /* 8 tabs max; lvl is clamped below */
+	return &TAB[(int)sizeof(TAB) - 1 - (lvl < 0 ? 0 : lvl > 8 ? 8 : lvl)];
+}
+
+/* convert dword to float: reinterpret the 32 bits of 'd' as a float */
+static inline float d2f(uint32_t d)
+{
+	/* pun through a union rather than a pointer cast (strict aliasing) */
+	union {
+		uint32_t bits;
+		float value;
+	} pun;
+	pun.bits = d;
+	return pun.value;
+}
+
+/* Dump 'sz' bytes at 'buf' as little-endian hex dwords, eight per line. */
+static inline void dump_hex(const void *buf, int sz)
+{
+	const uint8_t *ptr = (const uint8_t *)buf;
+	const uint8_t *end = ptr + sz;
+	int i = 0;
+
+	while (ptr < end) {
+		uint32_t d = 0;
+
+		printf((i % 8) ? " " : "\t");
+
+		/* Never read past 'end': a short tail is zero-padded.  The cast
+		 * keeps the shift unsigned (a promoted int << 24 could overflow). */
+		for (int shift = 0; shift < 32 && ptr < end; shift += 8)
+			d |= (uint32_t)*(ptr++) << shift;
+
+		printf("%08x", d);
+
+		if ((i % 8) == 7) {
+			printf("\n");
+		}
+		i++;
+	}
+
+	if (i % 8) {
+		printf("\n");
+	}
+}
+
+/* Print each complete dword in 'buf' (sz bytes) as a float, eight per line;
+ * a trailing partial dword is skipped. */
+static inline void
+dump_float(const void *buf, int sz)
+{
+	const uint8_t *ptr = (const uint8_t *)buf;
+	int i = 0;
+
+	/* Count whole dwords instead of computing 'ptr + sz - 3', which would
+	 * point before the buffer when sz < 3. */
+	for (int left = sz; left >= 4; left -= 4, i++) {
+		uint32_t d = 0;
+
+		printf((i % 8) ? " " : "\t");
+
+		/* little-endian; cast first so the shift is done unsigned */
+		for (int shift = 0; shift < 32; shift += 8)
+			d |= (uint32_t)*(ptr++) << shift;
+
+		printf("%8f", d2f(d));
+
+		if ((i % 8) == 7) {
+			printf("\n");
+		}
+	}
+
+	if (i % 8) {
+		printf("\n");
+	}
+}
+
+#define is_ok_ascii(c) /* 7-bit ASCII that is a tab or not a control char; evaluates c repeatedly */ \
+	(isascii(c) && (((c) == '\t') || !iscntrl(c)))
+
+static inline void
+clean_ascii(char *buf, int sz)
+{
+	/* XOR every byte with ASCII_XOR, in place */
+	uint8_t *bytes = (uint8_t *)buf;
+	for (int n = 0; n < sz; n++) {
+		bytes[n] ^= ASCII_XOR;
+	}
+}
+
+/* Print sz bytes as text (after ASCII_XOR), keeping every output line tab-indented. */
+static inline void dump_ascii(const void *buf, int sz)
+{
+	const uint8_t *bytes = (const uint8_t *)buf;
+	printf("\t");
+	for (int n = 0; n < sz; n++) {
+		uint8_t c = bytes[n] ^ ASCII_XOR;
+		switch (c) {
+		case '\n':	/* keep continuation lines indented */
+			printf("\n\t");
+			break;
+		case '\0':	/* a NUL byte gets a divider line */
+			printf("\n\t-----------------------------------\n\t");
+			break;
+		default:	/* bytes failing is_ok_ascii() are shown as '?' */
+			putchar(is_ok_ascii(c) ? c : '?');
+		}
+	}
+	printf("\n");
+}
+
+/* Hex + ASCII dump of 'sz' bytes, 16 per row, each row prefixed by tab(level)
+ * and the byte offset of the row. */
+static inline void
+dump_hex_ascii(const void *buf, int sz, int level)
+{
+	const uint8_t *ptr = (const uint8_t *)buf;
+	const uint8_t *end = ptr + sz;
+	const uint8_t *ascii = ptr;
+	int i = 0;
+
+	printf("%s-----------------------------------------------\n", tab(level));
+	printf("%s%d (0x%x) bytes\n", tab(level), sz, sz);
+
+	while (ptr < end) {
+		uint32_t d = 0;
+
+		if (i % 4) {
+			printf(" ");
+		} else {
+			printf("%s%06x: ", tab(level), (uint32_t)(ptr - (const uint8_t *)buf));
+		}
+
+		/* Stop at 'end' so a size that is not a multiple of 4 is
+		 * zero-padded rather than read out of bounds. */
+		for (int shift = 0; shift < 32 && ptr < end; shift += 8)
+			d |= (uint32_t)*(ptr++) << shift;
+
+		printf("%08x", d);
+
+		if ((i % 4) == 3) {
+			printf("\t|");
+			/* 'ptr' never passes 'end', so at most 16 chars are shown */
+			while (ascii < ptr) {
+				uint8_t c = *(ascii++) ^ ASCII_XOR;
+				printf("%c", (isascii(c) && !iscntrl(c)) ? c : '.');
+			}
+			printf("|\n");
+		}
+
+		i++;
+	}
+
+	if (i % 4) {
+		for (int j = 4 - (i % 4); j > 0; j--) {
+			printf("         ");
+		}
+		printf("\t|");
+		while (ascii < end) {
+			uint8_t c = *(ascii++) ^ ASCII_XOR;
+			printf("%c", (isascii(c) && !iscntrl(c)) ? c : '.');
+		}
+		printf("|\n");
+	}
+}
+
+#endif /* __UTIL_H__ */
diff --git a/src/freedreno/meson.build b/src/freedreno/meson.build
index 7b6ab53..6405a7d 100644
--- a/src/freedreno/meson.build
+++ b/src/freedreno/meson.build
@@ -19,6 +19,7 @@
 # SOFTWARE.
 
 inc_freedreno = include_directories(['.', './registers'])
+inc_freedreno_rnn = include_directories('rnn')
 
 subdir('common')
 subdir('registers')
@@ -33,6 +34,7 @@
 # Everything that depends on rnn requires (indirectly) libxml2:
 if dep_libxml2.found()
   subdir('rnn')
+  subdir('decode')
 endif
 
 if with_tools.contains('drm-shim')