Work-around for system property failures Bug: 236932163 This change adds a work-around to PropertyInvalidatedCache for failures to set a system property. Such failures cause fatal crashes in Android but it is believed that a simple retry will succeed. (Note: the failures have only been reported on partner devices and cannot be reproduced on Pixel phones, so no root-cause is available yet.) RuntimeExceptions (thrown from android_os_SystemProperties.cpp) are caught and retried. The retry limit is 5 times with a 200ms delay between attempts. This means that the maximum delay is 1s, whcih should avoid triggering an ANR. In addition to automated testing, a manual test was created with ambush code in the system property component of libc. The ambush code injected an error into the set-property logic for cache keys. It was observed that the system recovered properly. Test: atest: * FrameworksCoreTests:PropertyInvalidatedCacheTests * FrameworksCoreTests:IpcDataCacheTest Change-Id: I3a124b185c7499a45b27df7cbf889ae6d1d33377 (cherry picked from commit 9b0f81025ca809b0b0cac29fb75f02c568b98a23)
diff --git a/core/java/android/app/PropertyInvalidatedCache.java b/core/java/android/app/PropertyInvalidatedCache.java index 58ddd49..13934e5 100644 --- a/core/java/android/app/PropertyInvalidatedCache.java +++ b/core/java/android/app/PropertyInvalidatedCache.java
@@ -310,6 +310,21 @@ public static final String MODULE_TELEPHONY = "telephony"; /** + * Constants that affect retries when the process is unable to write the property. + * The first constant is the number of times the process will attempt to set the + * property. The second constant is the delay between attempts. + */ + + /** + * Wait 200ms between retry attempts and the retry limit is 5. That gives a total possible + * delay of 1s, which should be less than ANR timeouts. The goal is to have the system crash + * because the property could not be set (which is a condition that is easily recognized) and + * not crash because of an ANR (which can be confusing to debug). + */ + private static final int PROPERTY_FAILURE_RETRY_DELAY_MILLIS = 200; + private static final int PROPERTY_FAILURE_RETRY_LIMIT = 5; + + /** * Construct a system property that matches the rules described above. The module is * one of the permitted values above. The API is a string that is a legal Java simple * identifier. The api is modified to conform to the system property style guide by @@ -670,7 +685,33 @@ } } } - SystemProperties.set(name, Long.toString(val)); + RuntimeException failure = null; + for (int attempt = 0; attempt < PROPERTY_FAILURE_RETRY_LIMIT; attempt++) { + try { + SystemProperties.set(name, Long.toString(val)); + if (attempt > 0) { + // This log is not guarded. Based on known bug reports, it should + // occur once a week or less. The purpose of the log message is to + // identify the retries as a source of delay that might be otherwise + // be attributed to the cache itself. + Log.w(TAG, "Nonce set after " + attempt + " tries"); + } + return; + } catch (RuntimeException e) { + if (failure == null) { + failure = e; + } + try { + Thread.sleep(PROPERTY_FAILURE_RETRY_DELAY_MILLIS); + } catch (InterruptedException x) { + // Ignore this exception. The desired delay is only approximate and + // there is no issue if the sleep sometimes terminates early. + } + } + } + // This point is reached only if SystemProperties.set() fails at least once. + // Rethrow the first exception that was received. + throw failure; } // Set the nonce in a static context. No handle is available.