[autotest] Added a apache_access_log_metrics daemon

BUG=chromium:723907
TEST=unit tests; ran the daemon.

Change-Id: Ibeafee9edfae5bf52c60ea0afcffebc9b371c950
Reviewed-on: https://chromium-review.googlesource.com/509934
Commit-Ready: Paul Hobbs <phobbs@google.com>
Tested-by: Paul Hobbs <phobbs@google.com>
Reviewed-by: Paul Hobbs <phobbs@google.com>
diff --git a/site_utils/stats/apache_access_log_metrics.py b/site_utils/stats/apache_access_log_metrics.py
new file mode 100755
index 0000000..9da6e92
--- /dev/null
+++ b/site_utils/stats/apache_access_log_metrics.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python2
+
+# Copyright 2017 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Script to upload metrics from apache access logs to Monarch."""
+
+from __future__ import print_function
+
+import argparse
+import re
+import sys
+
+import common
+
+from chromite.lib import ts_mon_config
+from chromite.lib import metrics
+
+from autotest_lib.site_utils.stats import log_daemon_common
+
+
+"""
+The log format is set to:
+  %v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %T
+
+These are documented as follows:
+  (from https://httpd.apache.org/docs/current/mod/mod_log_config.html)
+
+%h: Remote host
+%l: Remote logname (from identd, if supplied)
+%O: Bytes sent, including headers. May be zero in rare cases such as when a
+    request is aborted before a response is sent. You need to enable mod_logio
+    to use this.
+%p: The canonical Port of the server serving the request
+%r: First line of request
+%s: Status.  For requests that got internally redirected, this is
+    the status of the *original* request --- %...>s for the last.
+%t: Time, in common log format time format (standard english format)
+%T: The time taken to serve the request, in seconds.
+%u: Remote user (from auth; may be bogus if return status (%s) is 401)
+%v: The canonical ServerName of the server serving the request.
+"""
+
+# Lemma: a regex to match sections delimited be double-quotes ("), which
+# possible contained escaped quotes (\").
+# This works by matching non-quotes or the string r'\"' repeatedly; then it ends
+# when finding a quote (") preceeded by a character which is not a backslash.
+MATCH_UNTIL_QUOTE = r'([^"]|\\")*[^\\]'
+
+ACCESS_MATCHER = re.compile(
+    r'^'
+    r'\S+ \S+ \S+ \S+ '               # Ignore %v:%p %h %l %u
+    r'\[[^]]+\] '                     # Ignore %t
+    r'"('                             # Begin %r
+    r'(?P<request_method>\S+) '       # e.g. POST
+    r'(?P<endpoint>\S+)'              # e.g. /afe/server/noauth/rpc/
+    + MATCH_UNTIL_QUOTE +             # Ignore protocol (e.g. HTTP/1.1)
+    r'|-'                             # The request data might just be "-"
+    r')" '                            # End %r
+    r'(?P<response_code>\d+) '        # %>s (e.g. 200)
+    r'(?P<bytes_sent>\d+)'            # %O
+    r' "' + MATCH_UNTIL_QUOTE + '"'   # Ignore Referer
+    r' "' + MATCH_UNTIL_QUOTE + '"'   # Ignore User-Agent
+    r' ?(?P<response_seconds>\d+?)'   # The server time in seconds
+    r'.*'                             # Allow adding extra stuff afterward.
+)
+
+ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/response_seconds'
+ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/response_bytes'
+
+
+# TODO(phobbs) use something more systematic than a whitelist.
+WHITELISTED_ENDPOINTS = frozenset((
+    '/',
+    '/afe/clear.cache.gif',
+    '/afe/Open+Sans:300.woff',
+    '/embedded_spreadsheet/autotest.EmbeddedSpreadsheetClient.nocache.js',
+    '/afe/afeclient.css',
+    '/afe/common.css',
+    '/afe/header.png',
+    '/afe/spinner.gif',
+    '/afe/standard.css',
+    '/afe/2371F6F3D4E42171A3563D94B7BF42BF.cache.html',
+    '/afe/autotest.AfeClient.nocache.js',
+    '/afe/',
+    '/new_tko/server/rpc/',
+    '/afe/server/rpc/',
+    '/___rPc_sWiTcH___',
+    '*',
+    '/afe/server/noauth/rpc/',
+))
+
+
+def EmitRequestMetrics(m):
+    """Emits metrics for each line in the access log.
+
+    @param m: A regex match object
+    """
+    # TODO(phobbs) use a memory-efficient structure to detect non-unique paths.
+    # We can't just include the endpoint because it will cause a cardinality
+    # explosion.
+    endpoint = SanitizeEndpoint(m.group('endpoint'))
+    fields = {
+        'request_method': m.groupdict().get('request_method', ''),
+        'endpoint': endpoint,
+        'response_code': int(m.group('response_code')),
+    }
+
+    # Request seconds and bytes sent are both extremely high cardinality, so
+    # they must be the VAL of a metric, not a metric field.
+    if m.group('response_seconds'):
+      response_seconds = int(m.group('response_seconds'))
+      metrics.SecondsDistribution(ACCESS_TIME_METRIC).add(
+          response_seconds, fields=fields)
+
+    bytes_sent = int(m.group('bytes_sent'))
+    metrics.CumulativeDistribution(ACCESS_BYTES_METRIC).add(
+        bytes_sent, fields=fields)
+
+
+def SanitizeEndpoint(endpoint):
+    """Returns empty string if endpoint is not whitelisted.
+
+    @param endpoint: The endpoint to sanitize.
+    """
+    if endpoint in WHITELISTED_ENDPOINTS:
+        return endpoint
+    else:
+        return ''
+
+
+MATCHERS = [
+    (ACCESS_MATCHER, EmitRequestMetrics),
+]
+
+
+def ParseArgs():
+    """Parses the command line arguments."""
+    p = argparse.ArgumentParser(
+        description='Parses apache logs and emits metrics to Monarch')
+    p.add_argument('--output-logfile')
+    p.add_argument('--debug-metrics-file',
+                   help='Output metrics to the given file instead of sending '
+                   'them to production.')
+    return p.parse_args()
+
+
+def Main():
+    """Sets up logging and runs matchers against stdin."""
+    args = ParseArgs()
+    log_daemon_common.SetupLogging(args)
+
+    # Set up metrics sending and go.
+    ts_mon_args = {}
+    if args.debug_metrics_file:
+        ts_mon_args['debug_file'] = args.debug_metrics_file
+
+    with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics',
+                                             **ts_mon_args):
+      log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
+
+
+if __name__ == '__main__':
+    Main()
diff --git a/site_utils/stats/apache_access_log_metrics_unittest.py b/site_utils/stats/apache_access_log_metrics_unittest.py
new file mode 100755
index 0000000..2c45d96
--- /dev/null
+++ b/site_utils/stats/apache_access_log_metrics_unittest.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python2
+
+# Copyright 2016 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Unit tests for apache_access_log_metrics.py"""
+
+from __future__ import print_function
+
+import os
+import mock
+import re
+import subprocess
+import tempfile
+import unittest
+
+import apache_access_log_metrics
+
+
+SCRIPT_PATH = os.path.abspath(
+    os.path.join(os.path.dirname(__file__),
+                 'apache_access_log_metrics.py'))
+
+
+EXAMPLE_REQUEST_LINE = (
+    r'chromeos-server2.mtv.corp.google.com:80 100.108.96.5 - - '
+    r'[19/May/2017:11:47:03 -0700] '
+    r'"POST /afe/server/noauth/rpc/ HTTP/1.1\"" '
+    r'200 354 "-" "Python-urllib/2.7" 5'
+)
+
+
+class TestParsers(unittest.TestCase):
+    """Tests the parsing functions in apache_access_log_metrics."""
+
+    def testParseResponse(self):
+        """Tests that the regex matches the example log line."""
+        match = apache_access_log_metrics.ACCESS_MATCHER.match(
+            EXAMPLE_REQUEST_LINE)
+        self.assertTrue(match)
+
+        self.assertEqual(match.group('bytes_sent'), '354')
+        self.assertEqual(match.group('response_seconds'), '5')
+
+
+class TestEmitters(unittest.TestCase):
+    """Tests the emitter functions in apache_access_log_metrics."""
+
+    def testEmitResponse(self):
+        """Tests that the matcher function doesn't throw an Exception."""
+        match = apache_access_log_metrics.ACCESS_MATCHER.match(
+            EXAMPLE_REQUEST_LINE)
+        # Calling the emitter should not raise any exceptions (for example, by
+        # referencing regex match groups that don't exist.
+        with mock.patch.object(apache_access_log_metrics, 'metrics'):
+            apache_access_log_metrics.EmitRequestMetrics(match)
+
+
+class TestScript(unittest.TestCase):
+    """Tests the script end-to-end."""
+    def testApachAccessLogScriptWithMatchingLine(self):
+        """Try shelling out the the script with --debug-file.
+
+        Sending it a line which matches the first-line regex should result in
+        output from ACCESS_TIME_METRIC.
+        """
+        with tempfile.NamedTemporaryFile() as temp_file:
+            p = subprocess.Popen([SCRIPT_PATH,
+                                  '--debug-metrics-file', temp_file.name],
+                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+            p.communicate(EXAMPLE_REQUEST_LINE)
+
+            with open(temp_file.name) as fh:
+                contents = fh.read()
+
+            self.assertTrue(re.search(
+                apache_access_log_metrics.ACCESS_TIME_METRIC[1:] + r'\b',
+                contents))
+            self.assertTrue(re.search(
+                apache_access_log_metrics.ACCESS_BYTES_METRIC[1:] + r'\b',
+
+                contents))
+
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/site_utils/stats/apache_error_log_metrics.py b/site_utils/stats/apache_error_log_metrics.py
index 729b3b2..de9524a 100755
--- a/site_utils/stats/apache_error_log_metrics.py
+++ b/site_utils/stats/apache_error_log_metrics.py
@@ -11,19 +11,16 @@
 """
 from __future__ import print_function
 
-from logging import handlers
 import argparse
-import contextlib
 import re
 import sys
 
 import common
 
-from chromite.lib import cros_logging as logging
 from chromite.lib import metrics
 from chromite.lib import ts_mon_config
 
-from infra_libs import ts_mon
+from autotest_lib.site_utils.stats import log_daemon_common
 
 
 LOOP_INTERVAL = 60
@@ -70,25 +67,6 @@
 ]
 
 
-def RunMatchers(stream, matchers):
-    """Parses lines of |stream| using patterns and emitters from |matchers|
-
-    @param stream: A file object to read from.
-    @param matchers: A list of pairs of (matcher, emitter), where matcher is a
-        regex and emitter is a function called when the regex matches.
-    """
-    # The input might terminate if the log gets rotated. Make sure that Monarch
-    # flushes any pending metrics before quitting.
-    with contextlib.closing(ts_mon):
-        for line in iter(stream.readline, ''):
-            for matcher, emitter in matchers:
-                m = matcher.match(line)
-                if m:
-                    logging.debug('Emitting %s for input "%s"',
-                                  emitter.__name__, line.strip())
-                    emitter(m)
-
-
 def ParseArgs():
     """Parses the command line arguments."""
     p = argparse.ArgumentParser(
@@ -103,16 +81,7 @@
 def Main():
     """Sets up logging and runs matchers against stdin"""
     args = ParseArgs()
-
-    # Set up logging.
-    root = logging.getLogger()
-    if args.output_logfile:
-        handler = handlers.RotatingFileHandler(
-            args.output_logfile, maxBytes=10**6, backupCount=5)
-        root.addHandler(handler)
-    else:
-        root.addHandler(logging.StreamHandler(sys.stdout))
-    root.setLevel(logging.DEBUG)
+    log_daemon_common.SetupLogging(args)
 
     # Set up metrics sending and go.
     ts_mon_args = {}
@@ -121,7 +90,7 @@
 
     with ts_mon_config.SetupTsMonGlobalState('apache_error_log_metrics',
                                              **ts_mon_args):
-      RunMatchers(sys.stdin, MATCHERS)
+      log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
       metrics.Flush()
 
 
diff --git a/site_utils/stats/log_daemon_common.py b/site_utils/stats/log_daemon_common.py
new file mode 100644
index 0000000..f13cbde
--- /dev/null
+++ b/site_utils/stats/log_daemon_common.py
@@ -0,0 +1,44 @@
+from __future__ import print_function
+
+import sys
+from logging import handlers
+
+import common
+
+from chromite.lib import cros_logging as logging
+from infra_libs import ts_mon
+
+
+def RunMatchers(stream, matchers):
+    """Parses lines of |stream| using patterns and emitters from |matchers|
+
+    @param stream: A file object to read from.
+    @param matchers: A list of pairs of (matcher, emitter), where matcher is a
+        regex and emitter is a function called when the regex matches.
+    """
+    # The input might terminate if the log gets rotated. Make sure that Monarch
+    # flushes any pending metrics before quitting.
+    try:
+        for line in iter(stream.readline, ''):
+            for matcher, emitter in matchers:
+                m = matcher.match(line)
+                if m:
+                    logging.debug('Emitting %s for input "%s"',
+                                  emitter.__name__, line.strip())
+                    emitter(m)
+    finally:
+        ts_mon.close()
+        ts_mon.flush()
+
+
+def SetupLogging(args):
+    """Sets up logging based on the parsed arguments."""
+    # Set up logging.
+    root = logging.getLogger()
+    if args.output_logfile:
+        handler = handlers.RotatingFileHandler(
+            args.output_logfile, maxBytes=10**6, backupCount=5)
+        root.addHandler(handler)
+    else:
+        root.addHandler(logging.StreamHandler(sys.stdout))
+    root.setLevel(logging.DEBUG)