[autotest] Added a apache_access_log_metrics daemon
BUG=chromium:723907
TEST=unit tests; ran the daemon.
Change-Id: Ibeafee9edfae5bf52c60ea0afcffebc9b371c950
Reviewed-on: https://chromium-review.googlesource.com/509934
Commit-Ready: Paul Hobbs <phobbs@google.com>
Tested-by: Paul Hobbs <phobbs@google.com>
Reviewed-by: Paul Hobbs <phobbs@google.com>
diff --git a/site_utils/stats/apache_access_log_metrics.py b/site_utils/stats/apache_access_log_metrics.py
new file mode 100755
index 0000000..9da6e92
--- /dev/null
+++ b/site_utils/stats/apache_access_log_metrics.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python2
+
+# Copyright 2017 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Script to upload metrics from apache access logs to Monarch."""
+
+from __future__ import print_function
+
+import argparse
+import re
+import sys
+
+import common
+
+from chromite.lib import ts_mon_config
+from chromite.lib import metrics
+
+from autotest_lib.site_utils.stats import log_daemon_common
+
+
+"""
+The log format is set to:
+ %v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %T
+
+These are documented as follows:
+ (from https://httpd.apache.org/docs/current/mod/mod_log_config.html)
+
+%h: Remote host
+%l: Remote logname (from identd, if supplied)
+%O: Bytes sent, including headers. May be zero in rare cases such as when a
+ request is aborted before a response is sent. You need to enable mod_logio
+ to use this.
+%p: The canonical Port of the server serving the request
+%r: First line of request
+%s: Status. For requests that got internally redirected, this is
+ the status of the *original* request --- %...>s for the last.
+%t: Time, in common log format time format (standard english format)
+%T: The time taken to serve the request, in seconds.
+%u: Remote user (from auth; may be bogus if return status (%s) is 401)
+%v: The canonical ServerName of the server serving the request.
+"""
+
+# Lemma: a regex to match sections delimited be double-quotes ("), which
+# possible contained escaped quotes (\").
+# This works by matching non-quotes or the string r'\"' repeatedly; then it ends
+# when finding a quote (") preceeded by a character which is not a backslash.
+MATCH_UNTIL_QUOTE = r'([^"]|\\")*[^\\]'
+
+ACCESS_MATCHER = re.compile(
+ r'^'
+ r'\S+ \S+ \S+ \S+ ' # Ignore %v:%p %h %l %u
+ r'\[[^]]+\] ' # Ignore %t
+ r'"(' # Begin %r
+ r'(?P<request_method>\S+) ' # e.g. POST
+ r'(?P<endpoint>\S+)' # e.g. /afe/server/noauth/rpc/
+ + MATCH_UNTIL_QUOTE + # Ignore protocol (e.g. HTTP/1.1)
+ r'|-' # The request data might just be "-"
+ r')" ' # End %r
+ r'(?P<response_code>\d+) ' # %>s (e.g. 200)
+ r'(?P<bytes_sent>\d+)' # %O
+ r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore Referer
+ r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore User-Agent
+ r' ?(?P<response_seconds>\d+?)' # The server time in seconds
+ r'.*' # Allow adding extra stuff afterward.
+)
+
+ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/response_seconds'
+ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/response_bytes'
+
+
+# TODO(phobbs) use something more systematic than a whitelist.
+WHITELISTED_ENDPOINTS = frozenset((
+ '/',
+ '/afe/clear.cache.gif',
+ '/afe/Open+Sans:300.woff',
+ '/embedded_spreadsheet/autotest.EmbeddedSpreadsheetClient.nocache.js',
+ '/afe/afeclient.css',
+ '/afe/common.css',
+ '/afe/header.png',
+ '/afe/spinner.gif',
+ '/afe/standard.css',
+ '/afe/2371F6F3D4E42171A3563D94B7BF42BF.cache.html',
+ '/afe/autotest.AfeClient.nocache.js',
+ '/afe/',
+ '/new_tko/server/rpc/',
+ '/afe/server/rpc/',
+ '/___rPc_sWiTcH___',
+ '*',
+ '/afe/server/noauth/rpc/',
+))
+
+
+def EmitRequestMetrics(m):
+ """Emits metrics for each line in the access log.
+
+ @param m: A regex match object
+ """
+ # TODO(phobbs) use a memory-efficient structure to detect non-unique paths.
+ # We can't just include the endpoint because it will cause a cardinality
+ # explosion.
+ endpoint = SanitizeEndpoint(m.group('endpoint'))
+ fields = {
+ 'request_method': m.groupdict().get('request_method', ''),
+ 'endpoint': endpoint,
+ 'response_code': int(m.group('response_code')),
+ }
+
+ # Request seconds and bytes sent are both extremely high cardinality, so
+ # they must be the VAL of a metric, not a metric field.
+ if m.group('response_seconds'):
+ response_seconds = int(m.group('response_seconds'))
+ metrics.SecondsDistribution(ACCESS_TIME_METRIC).add(
+ response_seconds, fields=fields)
+
+ bytes_sent = int(m.group('bytes_sent'))
+ metrics.CumulativeDistribution(ACCESS_BYTES_METRIC).add(
+ bytes_sent, fields=fields)
+
+
+def SanitizeEndpoint(endpoint):
+ """Returns empty string if endpoint is not whitelisted.
+
+ @param endpoint: The endpoint to sanitize.
+ """
+ if endpoint in WHITELISTED_ENDPOINTS:
+ return endpoint
+ else:
+ return ''
+
+
+MATCHERS = [
+ (ACCESS_MATCHER, EmitRequestMetrics),
+]
+
+
+def ParseArgs():
+ """Parses the command line arguments."""
+ p = argparse.ArgumentParser(
+ description='Parses apache logs and emits metrics to Monarch')
+ p.add_argument('--output-logfile')
+ p.add_argument('--debug-metrics-file',
+ help='Output metrics to the given file instead of sending '
+ 'them to production.')
+ return p.parse_args()
+
+
+def Main():
+ """Sets up logging and runs matchers against stdin."""
+ args = ParseArgs()
+ log_daemon_common.SetupLogging(args)
+
+ # Set up metrics sending and go.
+ ts_mon_args = {}
+ if args.debug_metrics_file:
+ ts_mon_args['debug_file'] = args.debug_metrics_file
+
+ with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics',
+ **ts_mon_args):
+ log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
+
+
+if __name__ == '__main__':
+ Main()
diff --git a/site_utils/stats/apache_access_log_metrics_unittest.py b/site_utils/stats/apache_access_log_metrics_unittest.py
new file mode 100755
index 0000000..2c45d96
--- /dev/null
+++ b/site_utils/stats/apache_access_log_metrics_unittest.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python2
+
+# Copyright 2016 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Unit tests for apache_access_log_metrics.py"""
+
+from __future__ import print_function
+
+import os
+import mock
+import re
+import subprocess
+import tempfile
+import unittest
+
+import apache_access_log_metrics
+
+
+SCRIPT_PATH = os.path.abspath(
+ os.path.join(os.path.dirname(__file__),
+ 'apache_access_log_metrics.py'))
+
+
+EXAMPLE_REQUEST_LINE = (
+ r'chromeos-server2.mtv.corp.google.com:80 100.108.96.5 - - '
+ r'[19/May/2017:11:47:03 -0700] '
+ r'"POST /afe/server/noauth/rpc/ HTTP/1.1\"" '
+ r'200 354 "-" "Python-urllib/2.7" 5'
+)
+
+
+class TestParsers(unittest.TestCase):
+ """Tests the parsing functions in apache_access_log_metrics."""
+
+ def testParseResponse(self):
+ """Tests that the regex matches the example log line."""
+ match = apache_access_log_metrics.ACCESS_MATCHER.match(
+ EXAMPLE_REQUEST_LINE)
+ self.assertTrue(match)
+
+ self.assertEqual(match.group('bytes_sent'), '354')
+ self.assertEqual(match.group('response_seconds'), '5')
+
+
+class TestEmitters(unittest.TestCase):
+ """Tests the emitter functions in apache_access_log_metrics."""
+
+ def testEmitResponse(self):
+ """Tests that the matcher function doesn't throw an Exception."""
+ match = apache_access_log_metrics.ACCESS_MATCHER.match(
+ EXAMPLE_REQUEST_LINE)
+ # Calling the emitter should not raise any exceptions (for example, by
+ # referencing regex match groups that don't exist.
+ with mock.patch.object(apache_access_log_metrics, 'metrics'):
+ apache_access_log_metrics.EmitRequestMetrics(match)
+
+
+class TestScript(unittest.TestCase):
+ """Tests the script end-to-end."""
+ def testApachAccessLogScriptWithMatchingLine(self):
+ """Try shelling out the the script with --debug-file.
+
+ Sending it a line which matches the first-line regex should result in
+ output from ACCESS_TIME_METRIC.
+ """
+ with tempfile.NamedTemporaryFile() as temp_file:
+ p = subprocess.Popen([SCRIPT_PATH,
+ '--debug-metrics-file', temp_file.name],
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ p.communicate(EXAMPLE_REQUEST_LINE)
+
+ with open(temp_file.name) as fh:
+ contents = fh.read()
+
+ self.assertTrue(re.search(
+ apache_access_log_metrics.ACCESS_TIME_METRIC[1:] + r'\b',
+ contents))
+ self.assertTrue(re.search(
+ apache_access_log_metrics.ACCESS_BYTES_METRIC[1:] + r'\b',
+
+ contents))
+
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/site_utils/stats/apache_error_log_metrics.py b/site_utils/stats/apache_error_log_metrics.py
index 729b3b2..de9524a 100755
--- a/site_utils/stats/apache_error_log_metrics.py
+++ b/site_utils/stats/apache_error_log_metrics.py
@@ -11,19 +11,16 @@
"""
from __future__ import print_function
-from logging import handlers
import argparse
-import contextlib
import re
import sys
import common
-from chromite.lib import cros_logging as logging
from chromite.lib import metrics
from chromite.lib import ts_mon_config
-from infra_libs import ts_mon
+from autotest_lib.site_utils.stats import log_daemon_common
LOOP_INTERVAL = 60
@@ -70,25 +67,6 @@
]
-def RunMatchers(stream, matchers):
- """Parses lines of |stream| using patterns and emitters from |matchers|
-
- @param stream: A file object to read from.
- @param matchers: A list of pairs of (matcher, emitter), where matcher is a
- regex and emitter is a function called when the regex matches.
- """
- # The input might terminate if the log gets rotated. Make sure that Monarch
- # flushes any pending metrics before quitting.
- with contextlib.closing(ts_mon):
- for line in iter(stream.readline, ''):
- for matcher, emitter in matchers:
- m = matcher.match(line)
- if m:
- logging.debug('Emitting %s for input "%s"',
- emitter.__name__, line.strip())
- emitter(m)
-
-
def ParseArgs():
"""Parses the command line arguments."""
p = argparse.ArgumentParser(
@@ -103,16 +81,7 @@
def Main():
"""Sets up logging and runs matchers against stdin"""
args = ParseArgs()
-
- # Set up logging.
- root = logging.getLogger()
- if args.output_logfile:
- handler = handlers.RotatingFileHandler(
- args.output_logfile, maxBytes=10**6, backupCount=5)
- root.addHandler(handler)
- else:
- root.addHandler(logging.StreamHandler(sys.stdout))
- root.setLevel(logging.DEBUG)
+ log_daemon_common.SetupLogging(args)
# Set up metrics sending and go.
ts_mon_args = {}
@@ -121,7 +90,7 @@
with ts_mon_config.SetupTsMonGlobalState('apache_error_log_metrics',
**ts_mon_args):
- RunMatchers(sys.stdin, MATCHERS)
+ log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
metrics.Flush()
diff --git a/site_utils/stats/log_daemon_common.py b/site_utils/stats/log_daemon_common.py
new file mode 100644
index 0000000..f13cbde
--- /dev/null
+++ b/site_utils/stats/log_daemon_common.py
@@ -0,0 +1,44 @@
+from __future__ import print_function
+
+import sys
+from logging import handlers
+
+import common
+
+from chromite.lib import cros_logging as logging
+from infra_libs import ts_mon
+
+
+def RunMatchers(stream, matchers):
+ """Parses lines of |stream| using patterns and emitters from |matchers|
+
+ @param stream: A file object to read from.
+ @param matchers: A list of pairs of (matcher, emitter), where matcher is a
+ regex and emitter is a function called when the regex matches.
+ """
+ # The input might terminate if the log gets rotated. Make sure that Monarch
+ # flushes any pending metrics before quitting.
+ try:
+ for line in iter(stream.readline, ''):
+ for matcher, emitter in matchers:
+ m = matcher.match(line)
+ if m:
+ logging.debug('Emitting %s for input "%s"',
+ emitter.__name__, line.strip())
+ emitter(m)
+ finally:
+ ts_mon.close()
+ ts_mon.flush()
+
+
+def SetupLogging(args):
+ """Sets up logging based on the parsed arguments."""
+ # Set up logging.
+ root = logging.getLogger()
+ if args.output_logfile:
+ handler = handlers.RotatingFileHandler(
+ args.output_logfile, maxBytes=10**6, backupCount=5)
+ root.addHandler(handler)
+ else:
+ root.addHandler(logging.StreamHandler(sys.stdout))
+ root.setLevel(logging.DEBUG)