#!/usr/bin/env python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Used for Google-internal artifact size tracking.
See go/tf-devinfra/sizetrack.

INVOCATION: The following flags are required:

  sizetrack_helper.py \
      --artifact=ARTIFACT (or --manual_bytes=MANUAL_BYTES) \
      --artifact_id=ARTIFACT_ID \
      --team=TEAM \
      ... other optional args ...

On Windows you might need something like:

  C:\Python38\python.exe C:\path\to\sizetrack_helper.py ...
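
EXAMPLE (hypothetical values, for illustration only):

  sizetrack_helper.py \
      --team=tf-lite-team \
      --artifact_id=libtensorflow.so \
      --artifact=/tmp/libtensorflow.so
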
PREREQUISITES:

  1. Your current activated GCP user must have access scopes and IAM
     permissions to do the following:
     1. Query and load data into BigQuery
     2. Upload files to GCS
  2. Your environment must match the following criteria:
     1. Current directory is a git repository
     2. CL-based commits have a PiperOrigin-RevId trailer. This is the case
        for any use of Copybara Single-source-of-truth, e.g. TensorFlow.
        Only these commits are considered when running commands.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import csv
import datetime
import os
import os.path
import pathlib
import platform
import subprocess
import sys

parser = argparse.ArgumentParser(
usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"--project",
type=str,
default="tensorflow-testing",
help="GCP project you can access.")
parser.add_argument(
"--dataset",
type=str,
default="sizetracker",
help="BigQuery dataset containing --table")
parser.add_argument(
"--table", type=str, default="tensorflow_devinfra", help="BigQuery table.")
parser.add_argument(
"--upload",
action="store_true",
help="Upload the artifact to --bucket for analysis.")
parser.add_argument(
"--bucket",
type=str,
default="gs://tf-sizetracker-artifacts",
help="GCS bucket for artifacts.")
parser.add_argument(
"--team",
type=str,
help="For grouping in the dashboard and buckets; e.g. tf-lite-team.")
parser.add_argument(
"--artifact_id",
type=str,
help="Unique ID for your artifact, used for sorting dashboards.")
parser.add_argument(
"-n",
"--dry_run",
action="store_true",
help="Dry run: do not load to BigQuery or upload to GCS.")
parser.add_argument(
"--job",
type=str,
help="Name of job calling this script. Default: $KOKORO_JOB_NAME.")
parser.add_argument(
"--build_id",
type=str,
help="UUID of build calling this script. Default: $KOKORO_BUILD_ID.")
parser.add_argument(
"--print_schema",
action="store_true",
help="Print the table schema and don't do anything else.")
size = parser.add_mutually_exclusive_group()
size.add_argument(
"--artifact",
type=argparse.FileType("r"),
help="Local file you are measuring.")
size.add_argument(
"--manual_bytes",
type=int,
help="Manually set the recorded size instead of providing an artifact.")

FLAGS = parser.parse_args()
NOW = datetime.datetime.now(
datetime.timezone.utc).replace(microsecond=0).isoformat()
TABLE_NAME = "{}.{}".format(FLAGS.dataset, FLAGS.table)
PROJECT_LEVEL_TABLE_NAME = "{}:{}".format(FLAGS.project, TABLE_NAME)
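# "bq load" takes the project-qualified "project:dataset.table" form, while
# SQL queries (run with --project_id) reference the bare "dataset.table".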
CL_TRAILER = "PiperOrigin-RevId"
PRETTY_COMMIT_DATE = "%cI"
PRETTY_CL = "%(trailers:key={},valueonly)".format(CL_TRAILER)
PRETTY_HEAD_INFO = "%h\t{cl}\t%s\t%ae\t%aI\t%ce\t%cI".format(cl=PRETTY_CL)
PRETTY_EARLY = "{cl}\t%aI\t%cI".format(cl=PRETTY_CL)
PRETTY_COMMIT = "%h"
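# PRETTY_HEAD_INFO yields one tab-separated record per commit, e.g.
# (hypothetical): "1a2b3c4\t355555555\tFix the build\twho@example.com\t..."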

# This is a BigQuery table schema, defined as CSV.
# See https://cloud.google.com/bigquery/docs/schemas
SCHEMA = ",".join([
"id:string",
"filename:string",
    # These next 7 fields come from git's format=pretty:
    # %h $PRETTY_CL %s %ae %aI %ce %cI
"commit:string",
"cl:int64",
"description:string",
"author:string",
"author_date:timestamp",
"committer:string",
"commit_date:timestamp",
# Done with format=pretty
"earliest_commit:string",
"earliest_cl:int64",
"earliest_author_date:timestamp",
"earliest_commit_date:timestamp",
"all_commits:string",
"all_cls:string",
"bytes:int64",
"team:string",
"logged_date:timestamp",
"uploaded_to:string",
"job:string",
"build_id:string",
])
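# build_row() below must produce values in exactly this column order.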

# Select the most recently recorded commit made before the current one for the
# same artifact and team; it becomes the earliest commit included in this
# invocation's tested range. Returns an empty string if there are no earlier
# records.
BQ_GET_EARLIEST_INCLUDED_COMMIT = """
SELECT
commit
FROM {table} WHERE
commit_date < '{earlier_than_this_date}'
AND id = '{artifact_id}'
AND team = '{team}'
ORDER BY commit_date DESC LIMIT 1
"""


def git_pretty(commit_range, pretty_format, n=None):
r"""Run git log and return the cleaned results.
Git is assumed to be available in the PATH.
The PiperOrigin-RevId trailer always picks up an extra newline, so this splits
entries on a null byte (\0, or %x00 for git log) and removes newlines.
Args:
commit_range: Standard range given to git log, e.g. HEAD~1..HEAD
pretty_format: See https://git-scm.com/docs/pretty-formats
n: Number of commits to get. By default, get all within commit_range.
Returns:
List of strings of whatever the format string was.
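
  Example (hypothetical output):
    git_pretty("HEAD~2..HEAD", PRETTY_COMMIT)  # -> ["1a2b3c4", "5d6e7f8"]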
"""
  n = [] if n is None else ["-n", str(n)]
try:
ret = subprocess.run([
"git", "log", *n, "--date", "iso", "--grep", CL_TRAILER, commit_range,
"--pretty=format:" + pretty_format + "%x00"
],
check=True,
universal_newlines=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
except subprocess.CalledProcessError as e:
print(e.stderr)
print(e.stdout)
raise e
out = ret.stdout.replace("\n", "")
  # Split on \0, strip whitespace, and drop empty entries.
return list(filter(None, map(str.strip, out.split("\0"))))


def gcloud(tool, args, stdin=None):
  r"""Run a Google Cloud utility.

  On Linux and macOS, utilities are assumed to be in the PATH.
  On Windows, utilities are assumed to be available as
  C:\Program Files (x86)\Google\Cloud SDK\google-cloud-sdk\bin\{tool}.cmd

  Args:
    tool: CLI tool, e.g. bq, gcloud, gsutil
    args: List of arguments, in the same format as subprocess.run
    stdin: String to send to stdin

  Returns:
    String, the stdout of the tool
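
  Example (hypothetical invocation):
    gcloud("gsutil", ["ls", "gs://tf-sizetracker-artifacts"])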
"""
if platform.system() == "Windows":
tool = (r"C:\Program Files (x86)\Google\Cloud "
r"SDK\google-cloud-sdk\bin\{}.cmd").format(tool)
try:
ret = subprocess.run([tool, *args],
check=True,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
input=stdin)
except subprocess.CalledProcessError as e:
print(e.stderr)
print(e.stdout)
raise e
return ret.stdout.strip()


def bq(args, stdin=None):
"""Helper for running bq, the BigQuery tool."""
# bq prints extra messages to stdout if ~/.bigqueryrc doesn't exist
  (pathlib.Path.home() / ".bigqueryrc").touch()
return gcloud(
"bq", ["--project_id", FLAGS.project, "--headless", *args],
stdin=stdin)


def get_all_tested_commits():
"""Get details about the full commit range tested by this invocation."""
head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
_, _, _, _, _, _, current_commit_date = head_info[0].split("\t")
query_earliest_included_commit = BQ_GET_EARLIEST_INCLUDED_COMMIT.format(
table=TABLE_NAME,
earlier_than_this_date=current_commit_date,
artifact_id=FLAGS.artifact_id,
team=FLAGS.team)
# --format=csv returns an empty string if no results, or else two lines:
# commit
# COMMIT_HASH
earliest_commit = bq(["query", "--format", "csv", "--nouse_legacy_sql"],
stdin=query_earliest_included_commit)
# Compute the commit/CL range since the last test
if earliest_commit:
earliest_commit = earliest_commit.splitlines()[-1] # Ignore CSV header
early_cl, early_author_date, early_commit_date = git_pretty(
earliest_commit, PRETTY_EARLY, n=1)[0].split("\t")
all_range = "{commit}..HEAD".format(commit=earliest_commit)
all_commits = ",".join(git_pretty(all_range, PRETTY_COMMIT))
all_changelists = ",".join(git_pretty(all_range, PRETTY_CL))
return [
earliest_commit, early_cl, early_author_date, early_commit_date,
all_commits, all_changelists
]
  # If the artifact has never been tracked before this commit, return empty
  # strings; empty cells in CSV loads are loaded as NULL values.
else:
return [""] * 6


def get_upload_path():
"""Generate URL for 'gsutil cp'."""
if FLAGS.upload and FLAGS.artifact:
artifact_filename = os.path.basename(FLAGS.artifact.name)
    # Note: not os.path.join here, because gsutil paths are always
    # Linux-style. Including a timestamp in the path prevents duplicate
    # entries.
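    # Example result (hypothetical flag values):
    #   gs://tf-sizetracker-artifacts/tf-lite-team/libtensorflow.so/2020-01-01T00:00:00+00:00.libtensorflow.so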
path = "{bucket}/{team}/{artifact_id}/{now}.{artifact_filename}".format(
bucket=FLAGS.bucket,
team=FLAGS.team,
artifact_id=FLAGS.artifact_id,
now=NOW,
artifact_filename=artifact_filename)
return path
else:
return ""


def build_row():
"""Assemble one row of data about this artifact."""
(earliest_commit, early_cl, early_author_date, early_commit_date, all_commits,
all_changelists) = get_all_tested_commits()
# Use UTC to make sure machines in different timezones load consistent data
current_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
artifact_filename = ("NO_FILE" if not FLAGS.artifact else os.path.basename(
FLAGS.artifact.name))
  # Check against None explicitly so --manual_bytes=0 is still honored.
  size_bytes = (FLAGS.manual_bytes if FLAGS.manual_bytes is not None else
                os.path.getsize(FLAGS.artifact.name))
head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
all_head_info_items = head_info[0].split("\t")
return [
FLAGS.artifact_id,
artifact_filename,
*all_head_info_items,
earliest_commit,
early_cl,
early_author_date,
early_commit_date,
all_commits,
all_changelists,
size_bytes,
FLAGS.team,
current_time,
get_upload_path(),
FLAGS.job,
FLAGS.build_id,
]


def main():
# Validate flags
if FLAGS.print_schema:
print(SCHEMA)
    sys.exit(0)
elif not FLAGS.team or not FLAGS.artifact_id or not (FLAGS.artifact or
FLAGS.manual_bytes):
print(
"--team and --artifact_id are required if --print_schema is not "
"specified.\nYou must also specify one of --artifact or --manual_bytes."
"\nPass -h or --help for usage.")
    sys.exit(1)
if not FLAGS.job:
FLAGS.job = os.environ.get("KOKORO_JOB_NAME", "NO_JOB")
if not FLAGS.build_id:
FLAGS.build_id = os.environ.get("KOKORO_BUILD_ID", "NO_BUILD")
  # Generate one row of data about this artifact (written as TSV below)
next_tsv_row = build_row()
# Upload artifact into GCS if it exists
if FLAGS.upload and FLAGS.artifact:
upload_path = get_upload_path()
if FLAGS.dry_run:
print("DRY RUN: Would gsutil cp to:\n{}".format(upload_path))
else:
gcloud("gsutil", ["cp", FLAGS.artifact.name, upload_path])
# Load into BigQuery
if FLAGS.dry_run:
print("DRY RUN: Generated this TSV row:")
print("\t".join(map(str, next_tsv_row)))
else:
with open("data.tsv", "w", newline="") as tsvfile:
writer = csv.writer(tsvfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL,
lineterminator=os.linesep)
writer.writerow(next_tsv_row)
bq([
"load", "--source_format", "CSV", "--field_delimiter", "tab",
PROJECT_LEVEL_TABLE_NAME, "data.tsv", SCHEMA
])


if __name__ == "__main__":
main()