development/build_log_simplifier/build_log_simplifier.py - platform/frameworks/support - Git at Google

 #!/usr/bin/python3
 #
 # Copyright (C) 2016 The Android Open Source Project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse, collections, os, re, sys

 # Directory that contains this script, with symlinks resolved
 dir_of_this_script = os.path.dirname(os.path.realpath(__file__))

 # Command-line interface: three optional mode flags plus one or more log paths
 parser = argparse.ArgumentParser(
     description="""USAGE:
     Simplifies a build.log from hundreds of megabytes to <100 lines. Prints output to terminal.
     Pass this script a filepath to parse. You should be able to type "python3 build_log_simplifier.py"
     And then drag-and-drop a log file onto the terminal window to get its path.

     Sample usage: python3 development/build_log_simplifier.py Users/owengray/Desktop/build.log
     """)
 parser.add_argument(
     "--validate",
     help="Validate that no unrecognized messages exist in the given log",
     action="store_true",
 )
 parser.add_argument(
     "--update",
     help="Update our list of recognized messages to include all messages from the given log",
     action="store_true",
 )
 parser.add_argument(
     "--gc",
     help="When generating a new exemptions file, exclude any exemptions that were not found in the given log. Only relevant with --update or --validate",
     action="store_true",
 )
 parser.add_argument(
     "log_path",
     nargs="+",
     help="Filepath of log(s) to process",
 )

 # A regexes_matcher quickly identifies which regexes, out of a list, match a given text.
 # Each matcher can test its whole list at once through one composite regex, and lazily
 # splits the list among child matchers when it must find out which individual regexes match.
 class regexes_matcher(object):
     def __init__(self, regexes):
         self.regex_texts = regexes
         # both of these are created on first use
         self.children = None
         self.matcher = None

     # Returns the list of regexes that fully match <text>.
     # If <expect_match> is set and we hold several regexes, the composite check is skipped
     # and the children are queried directly.
     def get_matching_regexes(self, text, expect_match=True):
         holds_several = len(self.regex_texts) > 1
         if holds_several and expect_match:
             # The caller expects a match, so the composite check would tell us nothing new
             return self.query_children_for_matching_regexes(text)
         # Testing one composite regex is cheaper than testing lots of separate regexes,
         # so the composite regex is used to rule out this whole group first
         if not self.matches(text):
             return []
         if holds_several:
             # Something in this group matches; the children determine what
             return self.query_children_for_matching_regexes(text)
         return self.regex_texts

     # Asks each child for its regexes matching <text> and concatenates the answers
     def query_children_for_matching_regexes(self, text):
         self.ensure_split()
         found = []
         for child in self.children:
             found.extend(child.get_matching_regexes(text, False))
         return found

     # Returns the index of the first regex matching <text>, or None if there is none
     def index_first_matching_regex(self, text):
         total = len(self.regex_texts)
         if total == 0:
             return None
         if not self.matches(text):
             return None
         if total == 1:
             return 0
         self.ensure_split()
         # number of regexes held by the children already visited
         offset = 0
         for child in self.children:
             position = child.index_first_matching_regex(text)
             if position is not None:
                 return offset + position
             offset += len(child.regex_texts)
         return None

     # Creates the child matchers unless they already exist
     def ensure_split(self):
         if self.children is not None:
             return
         # Longer regexes take longer to compile, but lots of small regexes take longer
         # to test. In practice, this number of children seems to result in fast execution
         total = len(self.regex_texts)
         num_children = min(total, 32)
         boundaries = [int(total * (i + 1) / num_children) for i in range(num_children)]
         self.children = []
         start = 0
         for end in boundaries:
             self.children.append(regexes_matcher(self.regex_texts[start:end]))
             start = end

     # Tells whether any of our regexes fully matches <text>, compiling the composite
     # regex on first use. Returns the match object, or None when nothing matches.
     def matches(self, text):
         if self.matcher is None:
             composite_text = "(?:" + ")|(?:".join(self.regex_texts) + ")"
             self.matcher = re.compile(composite_text)
         return self.matcher.fullmatch(text)


 # Prints the names of the tasks that the log reports as having failed.
 # A failure is recognized by a line starting with "Execution failed for task"; the task
 # name is expected to follow in single quotes, e.g. "Execution failed for task ':a:b'."
 def print_failing_task_names(lines):
     failure_prefix = "Execution failed for task"
     quoted_name = re.compile(r"task '([^']*)'")
     tasks_of_interest = []
     # first, find tasks of interest
     for line in lines:
         if not line.startswith(failure_prefix):
             continue
         match = quoted_name.search(line)
         if match is not None:
             # Take exactly what is between the quotes, so the result doesn't depend on
             # which characters (trailing period, newline) happen to end the line
             tasks_of_interest.append(match.group(1))
         else:
             # No quoted name on this line: report whatever follows the prefix
             tasks_of_interest.append(line[len(failure_prefix):].strip())

     print("Detected these failing tasks: " + str(tasks_of_interest))

 # Replaces each run of consecutive stack frames from org.gradle and/or java.base with a
 # single placeholder line, named after the first frame of the run
 def shorten_uninteresting_stack_frames(lines):
     # (prefix of a boring frame, placeholder emitted in its place)
     placeholders = (
         ("\tat org.gradle", "\tat org.gradle...\n"),
         ("\tat java.base", "\tat java.base...\n"),
     )
     shortened = []
     in_boring_run = False
     for line in lines:
         placeholder = None
         for prefix, replacement in placeholders:
             if line.startswith(prefix):
                 placeholder = replacement
                 break
         if placeholder is None:
             # an interesting line: keep it, and end any boring run
             shortened.append(line)
             in_boring_run = False
             continue
         if not in_boring_run:
             shortened.append(placeholder)
         in_boring_run = True
     return shortened

 # Path of the config file listing the exemptions for deterministic/consistent output.
 # The `--gc` argument can garbage collect these exemptions
 def get_deterministic_exemptions_path():
     config_filename = "messages.ignore"
     return os.path.join(dir_of_this_script, config_filename)

 # Path of the config file listing the exemptions for nondeterministic/flaky output.
 # The `--gc` argument does not garbage collect these exemptions
 def get_flake_exemptions_path():
     config_filename = "message-flakes.ignore"
     return os.path.join(dir_of_this_script, config_filename)

 # Builds a regexes_matcher for the exemptions described by our config file.
 # Comments and blank lines are skipped, and the regexes are sorted, so the ordering of
 # the config file has no effect.
 # Raises an Exception if an exemption contains ANSI control characters.
 def build_exemptions_matcher(config_lines):
     regexes = []
     for raw_line in config_lines:
         candidate = raw_line.replace("\n", "").strip()
         if candidate == "" or candidate.startswith("#"):
             # neither blank lines nor comments are exemptions
             continue
         if remove_control_characters(candidate) != candidate:
             raise Exception("Unexpected control characters found in configuration line:\n\n " +
                 "'" + candidate + "'\n\n. This line is unexpected to match anything. Is this a copying mistake?")
         regexes.append(candidate)
     regexes.sort()
     return regexes_matcher(regexes)

 # Builds a regexes_matcher over the text of the config file itself:
 # comments are kept (so they can be matched too), blank lines are dropped,
 # and the ordering of the config file is preserved.
 # This is used for editing the config file.
 def build_exemptions_code_matcher(config_lines):
     stripped_lines = (line.strip() for line in config_lines)
     return regexes_matcher([line for line in stripped_lines if line != ""])

 # Returns the lines of <lines> that no exemption in <config_lines> matches.
 # Each line is stripped before being compared to the exemptions, but surviving lines are
 # returned unmodified.
 # If <validate_no_duplicates> is set and several exemptions match the same line, this
 # reports the offending line and exemptions and then exits the process with status 1.
 def remove_by_regexes(lines, config_lines, validate_no_duplicates):
     fast_matcher = build_exemptions_matcher(config_lines)
     result = []
     for line in lines:
         stripped = line.strip()
         matching_exemptions = fast_matcher.get_matching_regexes(stripped, expect_match=True)
         if validate_no_duplicates and len(matching_exemptions) > 1:
             print("")
             print("build_log_simplifier.py: Invalid configuration: multiple message exemptions match the same message. Are some exemptions too broad?")
             print("")
             print("Line: '" + stripped + "'")
             print("")
             print(str(len(matching_exemptions)) + " Matching exemptions:")
             for exemption_text in matching_exemptions:
                 print("'" + exemption_text + "'")
             # sys.exit rather than the builtin exit(), which the site module only
             # provides for interactive use
             sys.exit(1)
         if not matching_exemptions:
             result.append(line)
     return result

 # Keeps only the first line of each run of blank lines, and drops blank lines that
 # appear before the first nonblank line
 def collapse_consecutive_blank_lines(lines):
     kept = []
     # starts out True so that leading blank lines are dropped
     previous_was_blank = True
     for line in lines:
         is_blank = line.strip() == ""
         if not (is_blank and previous_was_blank):
             kept.append(line)
         previous_was_blank = is_blank
     return kept

 # If <line> starts with "> Task ", returns the rest of the line with surrounding
 # whitespace removed. Otherwise returns None
 def extract_task_name(line):
     prefix = "> Task "
     if not line.startswith(prefix):
         return None
     _, _, remainder = line.partition(prefix)
     return remainder.strip()

 # Tells whether <line> is a line from which extract_task_name can extract a task name
 def is_task_line(line):
     task_name = extract_task_name(line)
     return task_name is not None

 # Returns the distinct task names found in <lines>, ordered by first appearance
 def extract_task_names(lines):
     candidates = (extract_task_name(line) for line in lines)
     # the keys of an OrderedDict are unique and remember their insertion order
     unique_names = collections.OrderedDict.fromkeys(
         name for name in candidates if name is not None
     )
     return list(unique_names)

 # Removes every section header that is followed by no output (or only blank output),
 # together with that blank output. A section header is a task line, a
 # "> Configure project " line, or the "FAILURE: Build failed with an exception." line.
 # For example, turns this:
 #  > Task :a
 #  > Task :b
 #  some message
 #
 # into this:
 #
 #  > Task :b
 #  some message
 def collapse_tasks_having_no_output(lines):
     other_section_prefixes = ("> Configure project ", "FAILURE: Build failed with an exception.")
     emitted = []
     # The most recent section header, held back until we know that it has some output.
     # None if no header is being held back
     held_header = None
     # Blank lines seen since the held header, held back along with it
     held_blanks = []
     for line in lines:
         if is_task_line(line) or line.startswith(other_section_prefixes):
             # A new section starts; whatever was held for the previous one is dropped
             held_header = line
             held_blanks = []
             continue
         if line.strip() == "":
             if held_header is None:
                 emitted.append(line)
             else:
                 # only emitted if some nonempty output follows
                 held_blanks.append(line)
             continue
         # Nonempty output: the held header (if any) has earned its place
         if held_header is not None:
             emitted.append(held_header)
             emitted.extend(held_blanks)
             held_header = None
             held_blanks = []
         emitted.append(line)
     return emitted

 # Matches one ANSI escape sequence (such as a color code): an Escape character followed
 # either by a single byte in one of the ranges @-Z or \-_ , or by "[" and a control sequence
 control_character_regex = re.compile(r"""
         \x1B  # Escape
         (?:   # 7-bit C1 Fe (except CSI)
             [@-Z\\-_]
         |     # or [ for CSI, followed by a control sequence
             \[
             [0-?]*  # Parameters
             [ -/]*  # Intermediate bytes
             [@-~]   # End
         )
         """, re.VERBOSE)

 # Returns <line> with every ANSI escape sequence deleted
 def remove_control_characters(line):
     return re.sub(control_character_regex, "", line)

 # Normalizes some filepaths to more easily simplify/skip some messages.
 # Looks in <lines> for lines starting with "OUT_DIR=", "DIST_DIR=", "CHECKOUT=" and
 # "GRADLE_USER_HOME=", and returns a copy of <lines> in which every mention of those
 # directories is replaced by $OUT_DIR, $DIST_DIR, $CHECKOUT (or $SUPPORT for
 # <checkout>/frameworks/support) and $GRADLE_USER_HOME respectively.
 # A directory whose declared value is empty is ignored.
 def normalize_paths(lines):
     # get OUT_DIR, DIST_DIR, and the path of the root of the checkout
     out_dir = None
     dist_dir = None
     checkout_dir = None
     gradle_user_home = None
     # These values are read from the log itself because the build might have been run
     # in another location, such as on a build server
     out_marker = "OUT_DIR="
     dist_marker = "DIST_DIR="
     checkout_marker = "CHECKOUT="
     gradle_user_home_marker = "GRADLE_USER_HOME="
     for line in lines:
         if line.startswith(out_marker):
             out_dir = line.split(out_marker)[1].strip()
             continue
         if line.startswith(dist_marker):
             dist_dir = line.split(dist_marker)[1].strip()
             continue
         if line.startswith(checkout_marker):
             checkout_dir = line.split(checkout_marker)[1].strip()
             continue
         if line.startswith(gradle_user_home_marker):
             gradle_user_home = line.split(gradle_user_home_marker)[1].strip()
             continue
         if out_dir is not None and dist_dir is not None and checkout_dir is not None and gradle_user_home is not None:
             # every marker has been seen; no need to scan the rest of the log
             break

     # Remove any mentions of these paths, and replace them with consistent values
     # Make sure to put these paths in the correct order so that more-specific paths will
     # be matched first
     # A path that is None (never declared) or empty (declared as e.g. "DIST_DIR=") is
     # skipped: an empty string is contained in every line, and replacing it would insert
     # the placeholder between all of the characters of every line
     remove_paths = collections.OrderedDict()
     if gradle_user_home:
         remove_paths[gradle_user_home] = "$GRADLE_USER_HOME"
     if dist_dir:
         remove_paths[dist_dir] = "$DIST_DIR"
     if out_dir:
         remove_paths[out_dir] = "$OUT_DIR"
     if checkout_dir:
         remove_paths[checkout_dir + "/frameworks/support"] = "$SUPPORT"
         remove_paths[checkout_dir] = "$CHECKOUT"
     result = []
     for line in lines:
         for path, replacement in remove_paths.items():
             if path in line:
                 line = line.replace(path + "/", replacement + "/")
                 line = line.replace(path, replacement)
         result.append(line)
     return result

 # Replaces every run of 32 lowercase hexadecimal digits in <message> with a regex
 # matching any such run. For example, turns
 # ".gradle/caches/transforms-2/files-2.1/73f631f487bd87cfd8cb2aabafbac6a8" into
 # ".gradle/caches/transforms-2/files-2.1/[0-9a-f]{32}"
 def generalize_hashes(message):
     hash_matcher = "[0-9a-f]{32}"
     # the same text serves as the pattern to look for and as its literal replacement
     return re.compile(hash_matcher).sub(hash_matcher, message)

 # Replaces every run of digits in <message> with a regex matching any run of digits.
 # For example, turns ".gradle/caches/transforms-2/files-2.1/" into
 # ".gradle/caches/transforms-[0-9]+/files-[0-9]+.[0-9]+/".
 # A hash matcher of the form "[0-9a-f]{32}" is left as it is
 def generalize_numbers(message):
     number_matcher = "[0-9]+"
     # what the replacement below turns "[0-9a-f]{32}" into
     corrupted_hash_matcher = "[[0-9]+-[0-9]+a-f]{[0-9]+}"
     generalized = re.compile(number_matcher).sub(number_matcher, message)
     repaired = generalized.replace(corrupted_hash_matcher, "[0-9a-f]{32}")
     return repaired

 # Generates a new list of exemption lines from a list of output messages and the list
 # of existing exemption lines: missing exemptions are added and, if
 # <remove_unmatched_lines> is set, exemptions matching none of the messages are removed
 def generate_suggested_exemptions(messages, config_lines, remove_unmatched_lines):
     augmented = suggest_missing_exemptions(messages, config_lines)
     if not remove_unmatched_lines:
         return augmented
     return remove_unmatched_exemptions(messages, augmented)

 # Given a list of output messages and a list of existing exemption lines,
 # generates an augmented list of exemptions containing any necessary new exemptions.
 # The existing lines keep their order (comments are kept, blank lines are dropped).
 # Each new exemption is a regex-escaped, generalized copy of a message, and is inserted
 # either after the most recently recognized existing line, or, when the message follows
 # a task/project header that the config doesn't mention yet, in a group for that header
 # at the end.
 def suggest_missing_exemptions(messages, config_lines):
     # given a message, finds the index of the existing exemption for that message, if any
     existing_matcher = build_exemptions_code_matcher(config_lines)
     # the index of the previously matched exemption
     previous_found_index = -1
     # map from line index to list of lines to insert there
     insertions_by_position = collections.defaultdict(list)
     insertions_by_task_name = collections.OrderedDict()
     # current task generating any subsequent output
     pending_task_line = None
     # new, suggested exemptions
     new_suggestions = set()
     # generate new suggestions
     for line in messages:
         line = line.strip()
         if line == "":
             continue
         # save task name
         is_section = False
         if is_task_line(line) or line.startswith("> Configure project "):
             # If a task creates output, we record its name
             # In the config, the name of a task is written as a comment
             line = "# " + line
             pending_task_line = line
             is_section = True
         # determine where to put task name
         current_found_index = existing_matcher.index_first_matching_regex(line)
         if current_found_index is not None:
             # We already have a mention of this line
             # We don't need to exempt it again, but this informs where to insert our next exemption
             previous_found_index = current_found_index
             pending_task_line = None
             continue
         # skip outputting task names for tasks that don't output anything
         if is_section:
             continue

         # escape message
         escaped = re.escape(line)
         # spaces don't need to be escaped
         # (the backslash is doubled because "\ " is not a valid escape sequence)
         escaped = escaped.replace("\\ ", " ")
         escaped = generalize_hashes(escaped)
         escaped = generalize_numbers(escaped)
         # confirm that we haven't already inserted this message
         if escaped in new_suggestions:
             continue
         # insert this regex into an appropriate position
         if pending_task_line is not None:
             # We know which task this line came from, and it's a task that didn't previously make output
             if pending_task_line not in insertions_by_task_name:
                 insertions_by_task_name[pending_task_line] = []
             insertions_by_task_name[pending_task_line].append(escaped)
         else:
             # This line of output didn't come from a new task
             # So we append it after the previous line that we found
             insertions_by_position[previous_found_index].append(escaped)
         new_suggestions.add(escaped)

     # for each regex for which we chose a position in the file, insert it there
     exemption_lines = []
     for i, existing_line in enumerate(existing_matcher.regex_texts):
         exemption_lines.append(existing_line)
         # membership is tested first so that the lookup doesn't create an empty entry
         if i in insertions_by_position:
             exemption_lines += insertions_by_position[i]
     # for regexes that appeared before any recognized line, insert them next
     if -1 in insertions_by_position:
         exemption_lines += insertions_by_position[-1]
     # for regexes that were simply assigned to certain task names, insert them last, grouped by task
     for task_name in insertions_by_task_name:
         exemption_lines.append(task_name)
         exemption_lines += insertions_by_task_name[task_name]
     return exemption_lines

 # Finds the lines of <config_lines> that match no line of <messages>, and returns a new
 # list of config lines without them. A comment is removed too if it is followed by a
 # removed line, and by no kept line, before the next comment (or the end of the list)
 def remove_unmatched_exemptions(messages, config_lines):
     existing_matcher = build_exemptions_matcher(config_lines)
     # collect every regex that matches at least one message
     used_regexes = set()
     for message in messages:
         message = message.strip()
         if message.startswith("#"):
             continue
         used_regexes.update(existing_matcher.get_matching_regexes(message))
     # build the new list of config lines
     kept = []
     # comments seen since the last kept line, whose fate isn't decided yet
     held_comments = []
     # whether a config line has been dropped since the most recent comment
     dropped_since_comment = False
     for config_line in config_lines:
         if config_line.startswith("#"):
             if dropped_since_comment:
                 # The held comments were followed by a dropped line and by no kept line.
                 # Presumably they described what was dropped, so they are dropped as well
                 held_comments = []
             held_comments.append(config_line)
             dropped_since_comment = False
         elif config_line in used_regexes:
             # A config line that is in use is kept, along with the comments above it
             kept.extend(held_comments)
             held_comments = []
             kept.append(config_line)
         else:
             dropped_since_comment = True
     # Comments at the bottom of the file are kept unless a dropped line follows them
     if not dropped_since_comment:
         kept.extend(held_comments)
     return kept

 # Opens the file at <path> and returns a list of its lines, each including its line ending.
 # The file is closed again even if reading it fails
 def readlines(path):
     with open(path) as infile:
         return infile.readlines()

 # Overwrites the file at <path> with <lines>, separated by newlines.
 # No newline is written after the last line.
 # The file is closed again even if writing to it fails
 def writelines(path, lines):
     with open(path, 'w') as destfile:
         destfile.write("\n".join(lines))

 # Entry point. Reads the given log(s), removes control characters, normalizes paths and
 # removes the messages exempted by our config files. Then, depending on the arguments:
 #  --update or --gc: rewrites the deterministic exemptions file to cover the log
 #  --validate: if any unrecognized message remains, reports it, writes a suggested
 #              exemptions file next to the first log, and exits with status 1
 #  otherwise: prints the failing task names and the simplified log
 def main():
     arguments = parser.parse_args()

     # read each file
     log_paths = arguments.log_path
     all_lines = []
     for log_path in log_paths:
         lines = readlines(log_path)
         lines = [remove_control_characters(line) for line in lines]
         lines = normalize_paths(lines)
         all_lines += lines
     # load configuration
     flake_exemption_regexes = readlines(get_flake_exemptions_path())
     deterministic_exemption_regexes = readlines(get_deterministic_exemptions_path())
     exemption_regexes = flake_exemption_regexes + deterministic_exemption_regexes
     # remove lines we're not interested in
     # --gc implies --update, and --update implies --validate
     update = arguments.update or arguments.gc
     validate = update or arguments.validate
     interesting_lines = all_lines
     if not validate:
         print_failing_task_names(interesting_lines)
     interesting_lines = remove_by_regexes(interesting_lines, exemption_regexes, validate)
     interesting_lines = collapse_tasks_having_no_output(interesting_lines)
     interesting_lines = collapse_consecutive_blank_lines(interesting_lines)

     # process results
     if update:
         if arguments.gc or len(interesting_lines) != 0:
             update_path = get_deterministic_exemptions_path()
             # filter out any inconsistently observed messages so we don't try to exempt them twice
             all_lines = remove_by_regexes(all_lines, flake_exemption_regexes, validate)
             # update the deterministic exemptions file based on the result
             suggested = generate_suggested_exemptions(all_lines, deterministic_exemption_regexes, arguments.gc)
             writelines(update_path, suggested)
             print("build_log_simplifier.py updated exemptions " + update_path)
     elif validate:
         if len(interesting_lines) != 0:
             print("")
             print("=" * 80)
             print("build_log_simplifier.py: Error: Found " + str(len(interesting_lines)) + " new lines of warning output!")
             print("")
             print("The new output:")
             print("  " + "  ".join(interesting_lines))
             print("")
             print("To reproduce this failure:")
             print("  Try $ ./gradlew -Pandroidx.validateNoUnrecognizedMessages --rerun-tasks " + " ".join(extract_task_names(interesting_lines)))
             print("")
             print("Instructions:")
             print("  If you can fix these messages, do so.")
             print("  If you cannot fix these messages, you may suppress them.")
             print("    To automatically suppress new output from build server builds, run development/build_log_simplifier/update.sh")
             print("  See also https://android.googlesource.com/platform/frameworks/support/+/androidx-main/development/build_log_simplifier/VALIDATION_FAILURE.md")
             print("")
             new_exemptions_path = log_paths[0] + ".ignore"
             # filter out any inconsistently observed messages so we don't try to exempt them twice
             all_lines = remove_by_regexes(all_lines, flake_exemption_regexes, validate)
             # update deterministic exemptions file based on the result
             suggested = generate_suggested_exemptions(all_lines, deterministic_exemption_regexes, arguments.gc)
             writelines(new_exemptions_path, suggested)
             print("Files:")
             print("  Full Log                   : " + ",".join(log_paths))
             print("  Baseline                   : " + get_deterministic_exemptions_path())
             print("  Autogenerated new baseline : " + new_exemptions_path)
             # sys.exit rather than the builtin exit(), which the site module only
             # provides for interactive use
             sys.exit(1)
     else:
         interesting_lines = shorten_uninteresting_stack_frames(interesting_lines)
         print("".join(interesting_lines))

 # Only run main() when this file is executed as a script, not when it is imported
 if __name__ == "__main__":
     main()