x86 oprofile enablement
diff --git a/Android.mk b/Android.mk
index 582ddc9..5053e7d 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,3 +1 @@
-ifeq ($(TARGET_ARCH),arm)
 include $(call all-subdir-makefiles)
-endif
diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h
index 35080ad..b1e3b4c 100644
--- a/libop/op_hw_specific.h
+++ b/libop/op_hw_specific.h
@@ -20,7 +20,8 @@
 		char v[12];
 	} v;
 	unsigned eax;
-	asm("cpuid" : "=a" (eax), "=b" (v.b), "=c" (v.c), "=d" (v.d) : "0" (0));
+	asm volatile(	"pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx"
+			: "=a" (eax), "=S" (v.b), "=c" (v.c), "=d" (v.d) : "0" (0));
 	return !strncmp(v.v, vnd, 12);
 }
 
@@ -46,7 +47,8 @@
 
 	if (!cpuid_vendor("GenuineIntel"))
 		return;
-	asm("cpuid" : "=a" (v.eax) : "0" (1) : "ecx","ebx","edx");
+	asm volatile(	"pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx"
+			: "=a" (v.eax) : "0" (1) : "ecx","edx");
 	model = (v.ext_model << 4) + v.model;
 	if (v.family != 6 || model != 26 || v.stepping > 4)
 		return;
@@ -57,7 +59,8 @@
 {
 	if (cpu_type == CPU_ARCH_PERFMON) { 
 		unsigned ebx, eax;
-		asm("cpuid" : "=a" (eax), "=b" (ebx) : "0" (0xa) : "ecx","edx");
+		asm volatile(	"pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx"
+				: "=a" (eax), "=S" (ebx) : "0" (0xa) : "ecx","edx");
 		workaround_nehalem_aaj79(&ebx);
 		return ebx & num_to_mask(eax >> 24);
 	}
@@ -68,7 +71,8 @@
 {
 	if (cpu_type == CPU_ARCH_PERFMON) {
 		unsigned v;
-		asm("cpuid" : "=a" (v) : "0" (0xa) : "ebx","ecx","edx");
+		asm volatile(	"pushl %%ebx; cpuid; movl %%eax, %1; popl %%ebx"
+				: "=a" (v) : "0" (0xa) : "ecx","edx");
 		return (v >> 8) & 0xff;
 	} 
 	return -1;
@@ -77,7 +81,8 @@
 static inline unsigned arch_get_counter_mask(void)
 {
 	unsigned v;
-	asm("cpuid" : "=a" (v) : "0" (0xa) : "ebx","ecx","edx");
+	asm volatile(	"pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx"
+			: "=a" (v) : "0" (0xa) : "ecx","edx");
 	return num_to_mask((v >> 8) & 0xff);	
 }
 
diff --git a/opcontrol/opcontrol.cpp b/opcontrol/opcontrol.cpp
index 2558760..4cfc595 100644
--- a/opcontrol/opcontrol.cpp
+++ b/opcontrol/opcontrol.cpp
@@ -30,17 +30,16 @@
 
 #include "op_config.h"
 
-#if 0
-#define verbose(fmt...) printf(fmt)
-#else
-#define verbose(fmt...)
-#endif
+#define verbose(fmt...) if (verbose_print) printf(fmt)
 
 /* Experiments found that using a small interval may hang the device, and the
  * more events tracked simultaneously, the longer the interval has to be.
  */
 
-#if !defined(WITH_ARM_V7_A)
+#if defined(__i386__) || defined(__x86_64__)
+#define MAX_EVENTS 2
+int min_count[MAX_EVENTS] = {60000, 100000};
+#elif !defined(WITH_ARM_V7_A)
 #define MAX_EVENTS 3
 int min_count[MAX_EVENTS] = {150000, 200000, 250000};
 #else
@@ -48,6 +47,7 @@
 int min_count[MAX_EVENTS] = {150000, 200000, 250000, 300000, 350000};
 #endif
 
+int verbose_print;
 int list_events; 
 int show_usage;
 int setup;
@@ -60,6 +60,7 @@
 int selected_events[MAX_EVENTS];
 int selected_counts[MAX_EVENTS];
 
+char callgraph[8];
 char kernel_range[512];
 char vmlinux[512];
 
@@ -69,84 +70,133 @@
     {"reset", 0, &reset, 1},
     {"setup", 0, &setup, 1},
     {"quick", 0, &quick, 1},
+    {"callgraph", 1, 0, 'c'},
     {"event", 1, 0, 'e'},
     {"vmlinux", 1, 0, 'v'},
     {"kernel-range", 1, 0, 'r'},
     {"start", 0, &start, 1},
     {"stop", 0, &stop, 1},
+    {"dump", 0, 0, 'd'},
     {"shutdown", 0, 0, 'h'},
     {"status", 0, 0, 't'},
+    {"verbose", 0, 0, 'V'},
     {0, 0, 0, 0},
 };
 
 struct event_info {
     int id;
+    int um;
     const char *name;
     const char *explanation;
 } event_info[] = {
-#if !defined(WITH_ARM_V7_A)
+#if defined(__i386__) || defined(__x86_64__)
+    /* INTEL_ARCH_PERFMON events */
+
+    /* 0x3c counters:cpuid um:zero minimum:6000 filter:0 name:CPU_CLK_UNHALTED :
+     * Clock cycles when not halted
+     */
+    {0x3c, 0, "CPU_CLK_UNHALTED",
+     "Clock cycles when not halted" },
+
+    /* event:0x3c counters:cpuid um:one minimum:6000 filter:2 name:UNHALTED_REFERENCE_CYCLES :
+     * Unhalted reference cycles
+     */
+    {0x3c, 1, "UNHALTED_REFERENCE_CYCLES",
+      "Unhalted reference cycles" },
+
+    /* event:0xc0 counters:cpuid um:zero minimum:6000 filter:1 name:INST_RETIRED :
+     * number of instructions retired
+     */
+     {0xc0, 0, "INST_RETIRED",
+       "number of instructions retired"},
+
+    /* event:0x2e counters:cpuid um:x41 minimum:6000 filter:5 name:LLC_MISSES :
+     * Last level cache demand requests from this core that missed the LLC
+     */
+     {0x2e, 0x41, "LLC_MISSES",
+       "Last level cache demand requests from this core that missed the LLC"},
+
+    /* event:0x2e counters:cpuid um:x4f minimum:6000 filter:4 name:LLC_REFS :
+     * Last level cache demand requests from this core
+     */
+     {0x2e, 0x4f, "LLC_REFS",
+      "Last level cache demand requests from this core"},
+
+    /* event:0xc4 counters:cpuid um:zero minimum:500 filter:6 name:BR_INST_RETIRED :
+     * number of branch instructions retired
+     */
+     {0xc4, 0, "BR_INST_RETIRED",
+       "number of branch instructions retired"},
+
+    /* event:0xc5 counters:cpuid um:zero minimum:500 filter:7 name:BR_MISS_PRED_RETIRED :
+     * number of mispredicted branches retired (precise)
+     */
+     {0xc5, 0, "BR_MISS_PRED_RETIRED",
+       "number of mispredicted branches retired (precise)"},
+
+#elif !defined(WITH_ARM_V7_A)
     /* ARM V6 events */
-    {0x00, "IFU_IFETCH_MISS", 
+    {0x00, 0, "IFU_IFETCH_MISS", 
      "number of instruction fetch misses"},
-    {0x01, "CYCLES_IFU_MEM_STALL", 
+    {0x01, 0, "CYCLES_IFU_MEM_STALL", 
      "cycles instruction fetch pipe is stalled"},
-    {0x02, "CYCLES_DATA_STALL", 
+    {0x02, 0, "CYCLES_DATA_STALL", 
      "cycles stall occurs for due to data dependency"},
-    {0x03, "ITLB_MISS", 
+    {0x03, 0, "ITLB_MISS", 
      "number of Instruction MicroTLB misses"},
-    {0x04, "DTLB_MISS", 
+    {0x04, 0, "DTLB_MISS", 
      "number of Data MicroTLB misses"},
-    {0x05, "BR_INST_EXECUTED", 
+    {0x05, 0, "BR_INST_EXECUTED", 
      "branch instruction executed w/ or w/o program flow change"},
-    {0x06, "BR_INST_MISS_PRED", 
+    {0x06, 0, "BR_INST_MISS_PRED", 
      "branch mispredicted"},
-    {0x07, "INSN_EXECUTED", 
+    {0x07, 0, "INSN_EXECUTED", 
      "instructions executed"},
-    {0x09, "DCACHE_ACCESS", 
+    {0x09, 0, "DCACHE_ACCESS", 
      "data cache access, cacheable locations"},
-    {0x0a, "DCACHE_ACCESS_ALL", 
+    {0x0a, 0, "DCACHE_ACCESS_ALL", 
      "data cache access, all locations"},
-    {0x0b, "DCACHE_MISS", 
+    {0x0b, 0, "DCACHE_MISS", 
      "data cache miss"},
-    {0x0c, "DCACHE_WB", 
+    {0x0c, 0, "DCACHE_WB", 
      "data cache writeback, 1 event for every half cacheline"},
-    {0x0d, "PC_CHANGE", 
+    {0x0d, 0, "PC_CHANGE", 
      "number of times the program counter was changed without a mode switch"},
-    {0x0f, "TLB_MISS", 
+    {0x0f, 0, "TLB_MISS", 
      "Main TLB miss"},
-    {0x10, "EXP_EXTERNAL", 
+    {0x10, 0, "EXP_EXTERNAL", 
      "Explicit external data access"},
-    {0x11, "LSU_STALL", 
+    {0x11, 0, "LSU_STALL", 
      "cycles stalled because Load Store request queue is full"},
-    {0x12, "WRITE_DRAIN", 
+    {0x12, 0, "WRITE_DRAIN", 
      "Times write buffer was drained"},
-    {0xff, "CPU_CYCLES", 
+    {0xff, 0, "CPU_CYCLES", 
      "clock cycles counter"}, 
 #else
     /* ARM V7 events */
-    {0x00, "PMNC_SW_INCR",
+    {0x00, 0, "PMNC_SW_INCR",
      "Software increment of PMNC registers"},
-    {0x01, "IFETCH_MISS",
+    {0x01, 0, "IFETCH_MISS",
      "Instruction fetch misses from cache or normal cacheable memory"},
-    {0x02, "ITLB_MISS",
+    {0x02, 0, "ITLB_MISS",
      "Instruction fetch misses from TLB"},
-    {0x03, "DCACHE_REFILL",
+    {0x03, 0, "DCACHE_REFILL",
      "Data R/W operation that causes a refill from cache or normal cacheable"
      "memory"},
-    {0x04, "DCACHE_ACCESS",
+    {0x04, 0, "DCACHE_ACCESS",
      "Data R/W from cache"},
-    {0x05, "DTLB_REFILL",
+    {0x05, 0, "DTLB_REFILL",
      "Data R/W that causes a TLB refill"},
-    {0x06, "DREAD",
+    {0x06, 0, "DREAD",
      "Data read architecturally executed (note: architecturally executed = for"
      "instructions that are unconditional or that pass the condition code)"},
-    {0x07, "DWRITE",
+    {0x07, 0, "DWRITE",
      "Data write architecturally executed"},
-    {0x08, "INSTR_EXECUTED",
+    {0x08, 0, "INSTR_EXECUTED",
      "All executed instructions"},
-    {0x09, "EXC_TAKEN",
+    {0x09, 0, "EXC_TAKEN",
      "Exception taken"},
-    {0x0A, "EXC_EXECUTED",
+    {0x0A, 0, "EXC_EXECUTED",
      "Exception return architecturally executed"},
     {0x0B, "CID_WRITE",
      "Instruction that writes to the Context ID Register architecturally"
@@ -238,13 +288,19 @@
     printf("\nopcontrol: usage:\n"
            "   --list-events    list event types\n"
            "   --help           this message\n"
+           "   --verbose        show extra status\n"
            "   --setup          setup directories\n"
+#if defined(__i386__) || defined(__x86_64__)
+           "   --quick          setup and select CPU_CLK_UNHALTED:60000\n"
+#else
            "   --quick          setup and select CPU_CYCLES:150000\n"
+#endif
            "   --status         show configuration\n"
            "   --start          start data collection\n"
            "   --stop           stop data collection\n"
            "   --reset          clears out data from current session\n"
            "   --shutdown       kill the oprofile daeman\n"
+           "   --callgraph=depth callgraph depth\n"
            "   --event=eventspec\n"
            "      Choose an event. May be specified multiple times.\n"
            "      eventspec is in the form of name[:count], where :\n"
@@ -459,8 +515,9 @@
             printf("  %9u samples received\n", num);
             num = read_num(OP_DRIVER_BASE"/stats/cpu0/sample_lost_overflow");
             printf("  %9u samples lost overflow\n", num);
-#if 0
-            /* FIXME - backtrace seems broken */
+
+#if defined(__i386__) || defined(__x86_64__)
+            /* FIXME on ARM - backtrace seems broken there */
             num = read_num(OP_DRIVER_BASE"/stats/cpu0/backtrace_aborted");
             printf("  %9u backtrace aborted\n", num);
             num = read_num(OP_DRIVER_BASE"/backtrace_depth");
@@ -495,13 +552,17 @@
     strcpy(kernel_range, "");
 
     while (1) {
-        int c = getopt_long(argc, argv, "", long_options, &option_index);
+        int c = getopt_long(argc, argv, "c:e:v:r:dhVt", long_options, &option_index);
         if (c == -1) {
             break;
         }
         switch (c) {
             case 0:
                 break;
+            /* --callgraph */
+            case 'c':
+		strncpy(callgraph, optarg, sizeof(callgraph));
+                break;
             /* --event */
             case 'e':   
                 if (num_events == MAX_EVENTS) {
@@ -521,15 +582,33 @@
             case 'r':
                 sprintf(kernel_range, "-r %s", optarg);
                 break;
+            case 'd':
+            /* --dump */ {
+                int pid = read_num(OP_DATA_DIR"/lock");
+                echo_dev("1", 0, "dump", -1);
+                if (pid >= 0) {
+                    sleep(1);
+                    kill(pid, SIGHUP);
+                }
+                break;
+            }
             /* --shutdown */
             case 'h': {
                 int pid = read_num(OP_DATA_DIR"/lock");
                 if (pid >= 0) {
+                    kill(pid, SIGHUP); /* Politely ask the daemon to close files */
+                    sleep(1);
+                    kill(pid, SIGTERM);/* Politely ask the daemon to die */
+                    sleep(1);
                     kill(pid, SIGKILL);
                 }   
                 setup_session_dir();
                 break;
             }
+            /* --verbose */
+            case 'V':
+                verbose_print++;
+                break;
             /* --status */
             case 't':
                 do_status();
@@ -547,7 +626,11 @@
     }
 
     if (quick) {
+#if defined(__i386__) || defined(__x86_64__)
+        process_event("CPU_CLK_UNHALTED");
+#else
         process_event("CPU_CYCLES");
+#endif
         setup = 1;
     }
 
@@ -566,12 +649,18 @@
         }
     }
 
+    if (strlen(callgraph)) {
+        echo_dev(callgraph, 0, "backtrace_depth", -1);
+    }
+
     if (num_events != 0) {
         int i;
 
         strcpy(command, "oprofiled --session-dir="OP_DATA_DIR);
 
-#if !defined(WITH_ARM_V7_A)
+#if defined(__i386__) || defined(__x86_64__)
+        /* Nothing */
+#elif !defined(WITH_ARM_V7_A)
         /* Since counter #3 can only handle CPU_CYCLES, check and shuffle the 
          * order a bit so that the maximal number of events can be profiled
          * simultaneously
@@ -624,15 +713,16 @@
              * --events=CYCLES_DATA_STALL:2:0:200000:0:1:1,....
              */
             snprintf(command+strlen(command), 1024 - strlen(command), 
-                     "%s:%d:%d:%d:0:1:1",
+                     "%s:%d:%d:%d:%d:1:1",
                      event_info[event_idx].name,
                      event_info[event_idx].id,
                      i,
-                     selected_counts[i]);
+                     selected_counts[i],
+                     event_info[event_idx].um);
 
             setup_result |= echo_dev("1", 0, "user", i);
             setup_result |= echo_dev("1", 0, "kernel", i);
-            setup_result |= echo_dev("0", 0, "unit_mask", i);
+            setup_result |= echo_dev(NULL, event_info[event_idx].um, "unit_mask", i);
             setup_result |= echo_dev("1", 0, "enabled", i);
             setup_result |= echo_dev(NULL, selected_counts[i], "count", i);
             setup_result |= echo_dev(NULL, event_info[event_idx].id,