Support different watchdog timeouts for different entities

We need to be able to perform very lengthy operations on some threads
(e.g. the I/O thread responsible for installing multi-gigabyte APKs) but
still have long-run deadlock/hang detection applied to those threads.
Previously the watchdog mechanism applied the same policy to all
monitored threads: unresponsive after 60 seconds => restart the system.

Now, each monitored entity can have its own independent timeout after
which the watchdog declares deadlock and restarts the runtime.  The
halfway-finished intermediate thread stacks are dumped based on the
specific entity's declared timeout, not the global 30 second checking
interval.

With that new mechanism in place, the Package Manager's lengthy-I/O
thread watchdog timeout is raised to 10 minutes.

Bug 11278188

Change-Id: I512599260009c31416b2385f778681e5b9597f05
diff --git a/services/java/com/android/server/Watchdog.java b/services/java/com/android/server/Watchdog.java
index 616090e..8054788 100644
--- a/services/java/com/android/server/Watchdog.java
+++ b/services/java/com/android/server/Watchdog.java
@@ -58,7 +58,14 @@
     // Set this to true to have the watchdog record kernel thread stacks when it fires
     static final boolean RECORD_KERNEL_THREADS = true;
 
-    static final int TIME_TO_WAIT = DB ? 5*1000 : 30*1000;
+    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
+    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
+
+    // These are temporally ordered: larger values as lateness increases
+    static final int COMPLETED = 0;
+    static final int WAITING = 1;
+    static final int WAITED_HALF = 2;
+    static final int OVERDUE = 3;
 
     static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
         "/system/bin/mediaserver",
@@ -87,13 +94,17 @@
     public final class HandlerChecker implements Runnable {
         private final Handler mHandler;
         private final String mName;
+        private final long mWaitMax;
         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
         private boolean mCompleted;
         private Monitor mCurrentMonitor;
+        private long mStartTime;
 
-        HandlerChecker(Handler handler, String name) {
+        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
             mHandler = handler;
             mName = name;
+            mWaitMax = waitMaxMillis;
+            mCompleted = true;
         }
 
         public void addMonitor(Monitor monitor) {
@@ -111,13 +122,34 @@
                 mCompleted = true;
                 return;
             }
+
+            if (!mCompleted) {
+                // we already have a check in flight, so no need
+                return;
+            }
+
             mCompleted = false;
             mCurrentMonitor = null;
+            mStartTime = SystemClock.uptimeMillis();
             mHandler.postAtFrontOfQueue(this);
         }
 
-        public boolean isCompletedLocked() {
-            return mCompleted;
+        public boolean isOverdueLocked() {
+            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
+        }
+
+        public int getCompletionStateLocked() {
+            if (mCompleted) {
+                return COMPLETED;
+            } else {
+                long latency = SystemClock.uptimeMillis() - mStartTime;
+                if (latency < mWaitMax/2) {
+                    return WAITING;
+                } else if (latency < mWaitMax) {
+                    return WAITED_HALF;
+                }
+            }
+            return OVERDUE;
         }
 
         public Thread getThread() {
@@ -186,16 +218,19 @@
 
         // The shared foreground thread is the main checker.  It is where we
         // will also dispatch monitor checks and do other work.
-        mMonitorChecker = new HandlerChecker(FgThread.getHandler(), "foreground thread");
+        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
+                "foreground thread", DEFAULT_TIMEOUT);
         mHandlerCheckers.add(mMonitorChecker);
         // Add checker for main thread.  We only do a quick check since there
         // can be UI running on the thread.
         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
-                "main thread"));
+                "main thread", DEFAULT_TIMEOUT));
         // Add checker for shared UI thread.
-        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), "ui thread"));
+        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
+                "ui thread", DEFAULT_TIMEOUT));
         // And also check IO thread.
-        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), "i/o thread"));
+        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
+                "i/o thread", DEFAULT_TIMEOUT));
     }
 
     public void init(Context context, BatteryService battery,
@@ -242,11 +277,15 @@
     }
 
     public void addThread(Handler thread, String name) {
+        addThread(thread, name, DEFAULT_TIMEOUT);
+    }
+
+    public void addThread(Handler thread, String name, long timeoutMillis) {
         synchronized (this) {
             if (isAlive()) {
                 throw new RuntimeException("Threads can't be added once the Watchdog is running");
             }
-            mHandlerCheckers.add(new HandlerChecker(thread, name));
+            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
         }
     }
 
@@ -259,21 +298,20 @@
         pms.reboot(false, reason, false);
     }
 
-    private boolean haveAllCheckersCompletedLocked() {
+    private int evaluateCheckerCompletionLocked() {
+        int state = COMPLETED;
         for (int i=0; i<mHandlerCheckers.size(); i++) {
             HandlerChecker hc = mHandlerCheckers.get(i);
-            if (!hc.isCompletedLocked()) {
-                return false;
-            }
+            state = Math.max(state, hc.getCompletionStateLocked());
         }
-        return true;
+        return state;
     }
 
     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
         for (int i=0; i<mHandlerCheckers.size(); i++) {
             HandlerChecker hc = mHandlerCheckers.get(i);
-            if (!hc.isCompletedLocked()) {
+            if (hc.isOverdueLocked()) {
                 checkers.add(hc);
             }
         }
@@ -299,14 +337,12 @@
             final String subject;
             final boolean allowRestart;
             synchronized (this) {
-                long timeout = TIME_TO_WAIT;
-                if (!waitedHalf) {
-                    // If we are not at the half-point of waiting, perform a
-                    // new set of checks.  Otherwise we are still waiting for a previous set.
-                    for (int i=0; i<mHandlerCheckers.size(); i++) {
-                        HandlerChecker hc = mHandlerCheckers.get(i);
-                        hc.scheduleCheckLocked();
-                    }
+                long timeout = CHECK_INTERVAL;
+                // Make sure we (re)spin the checkers that have become idle within
+                // this wait-and-check interval
+                for (int i=0; i<mHandlerCheckers.size(); i++) {
+                    HandlerChecker hc = mHandlerCheckers.get(i);
+                    hc.scheduleCheckLocked();
                 }
 
                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
@@ -320,26 +356,31 @@
                     } catch (InterruptedException e) {
                         Log.wtf(TAG, e);
                     }
-                    timeout = TIME_TO_WAIT - (SystemClock.uptimeMillis() - start);
+                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
                 }
 
-                if (haveAllCheckersCompletedLocked()) {
-                    // The monitors have returned.
+                final int waitState = evaluateCheckerCompletionLocked();
+                if (waitState == COMPLETED) {
+                    // The monitors have returned; reset
                     waitedHalf = false;
                     continue;
-                }
-
-                if (!waitedHalf) {
-                    // We've waited half the deadlock-detection interval.  Pull a stack
-                    // trace and wait another half.
-                    ArrayList<Integer> pids = new ArrayList<Integer>();
-                    pids.add(Process.myPid());
-                    ActivityManagerService.dumpStackTraces(true, pids, null, null,
-                            NATIVE_STACKS_OF_INTEREST);
-                    waitedHalf = true;
+                } else if (waitState == WAITING) {
+                    // still waiting but within their configured intervals; back off and recheck
+                    continue;
+                } else if (waitState == WAITED_HALF) {
+                    if (!waitedHalf) {
+                        // We've waited half the deadlock-detection interval.  Pull a stack
+                        // trace and wait another half.
+                        ArrayList<Integer> pids = new ArrayList<Integer>();
+                        pids.add(Process.myPid());
+                        ActivityManagerService.dumpStackTraces(true, pids, null, null,
+                                NATIVE_STACKS_OF_INTEREST);
+                        waitedHalf = true;
+                    }
                     continue;
                 }
 
+                // something is overdue!
                 blockedCheckers = getBlockedCheckersLocked();
                 subject = describeCheckersLocked(blockedCheckers);
                 allowRestart = mAllowRestart;
diff --git a/services/java/com/android/server/pm/PackageManagerService.java b/services/java/com/android/server/pm/PackageManagerService.java
index e075862..a781d5f 100755
--- a/services/java/com/android/server/pm/PackageManagerService.java
+++ b/services/java/com/android/server/pm/PackageManagerService.java
@@ -221,6 +221,14 @@
     static final int REMOVE_CHATTY = 1<<16;
 
     /**
+     * Timeout (in milliseconds) after which the watchdog should declare that
+     * our handler thread is wedged.  The usual default for such things is one
+     * minute but we sometimes do very lengthy I/O operations on this thread,
+     * such as installing multi-gigabyte applications, so ours needs to be longer.
+     */
+    private static final long WATCHDOG_TIMEOUT = 1000*60*10;     // ten minutes
+
+    /**
      * Whether verification is enabled by default.
      */
     private static final boolean DEFAULT_VERIFY_ENABLE = true;
@@ -1115,7 +1123,8 @@
         synchronized (mPackages) {
             mHandlerThread.start();
             mHandler = new PackageHandler(mHandlerThread.getLooper());
-            Watchdog.getInstance().addThread(mHandler, mHandlerThread.getName());
+            Watchdog.getInstance().addThread(mHandler, mHandlerThread.getName(),
+                    WATCHDOG_TIMEOUT);
 
             File dataDir = Environment.getDataDirectory();
             mAppDataDir = new File(dataDir, "data");