"""Diffs one repo source tree against an upstream repo source tree.

Matches the projects from a Gerrit repo workspace to the projects
of an upstream workspace. After identifying the projects that exist both
in the downstream and the upstream workspace it then diffs each project.

Finally, the results of the project matching and diffing are reported.

"""
| |
| from __future__ import absolute_import |
| from __future__ import division |
| from __future__ import print_function |
| import argparse |
| import csv |
| import datetime |
| import multiprocessing |
| import multiprocessing.pool |
| import os |
| import re |
| import subprocess |
| import xml.etree.ElementTree as et |
| import git_commits_not_upstreamed |
| |
| |
def get_projects(source_tree):
    """Collect the checked-out default projects listed in a repo manifest.

    Parses ``<source_tree>/.repo/manifest.xml`` and inspects every direct
    ``project`` child of the manifest root.

    Args:
        source_tree: A path to the source tree.

    Returns:
        A dict mapping each project name to the absolute path of its
        checkout. Projects whose ``groups`` attribute contains
        ``notdefault`` and projects whose path does not exist on disk
        are left out.
    """
    manifest_root = et.parse(source_tree + '/.repo/manifest.xml').getroot()

    found = {}
    for element in manifest_root.findall('project'):
        # Projects in the "notdefault" group are not synced by default.
        if 'notdefault' in element.get('groups', ''):
            continue

        project_name = element.get('name')
        # A project without an explicit path lives at its own name.
        relative_path = element.get('path', project_name)
        checkout = os.path.abspath(os.path.join(source_tree, relative_path))

        # Only report projects whose files are actually present.
        if os.path.exists(checkout):
            found[project_name] = checkout

    return found
| |
| |
def git(args):
    """Run a git command and return its standard output as text.

    Args:
        args: A list of arguments to be sent to the git command.

    Returns:
        The standard output of the git command as a ``str``.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
        OSError: If the git executable cannot be started.
    """
    command = ['git'] + list(args)
    # Anything git writes to stderr is discarded.
    with open(os.devnull, 'w') as devnull:
        output = subprocess.check_output(command, stderr=devnull)
    # On Python 3 check_output returns bytes, but callers split and
    # concatenate the result as text. On Python 2 the output is already
    # a str and is returned untouched.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')
    return output
| |
| |
def get_revision_diff_stats(directory, rev_a, rev_b):
    """Retrieve stats of the diff between two git revisions.

    Runs ``git diff --shortstat rev_a rev_b`` inside ``directory`` and
    parses the comma separated summary it prints.

    Args:
        directory: A path to the git directory to diff.
        rev_a: A git revision to diff.
        rev_b: A git revision to diff.

    Returns:
        A dict with the count of files modified ('file'), lines added
        ('insertion') and lines removed ('deletion'). Counts that do not
        appear in the summary stay at 0.
    """
    stats = {
        'file': 0,
        'insertion': 0,
        'deletion': 0,
    }

    git_diffstat = git(
        ['-C', directory, 'diff', '--shortstat', rev_a, rev_b])
    # The raw subprocess output is bytes on Python 3; splitting bytes on a
    # str separator raises TypeError, so normalise to text first.
    if not isinstance(git_diffstat, str):
        git_diffstat = git_diffstat.decode('utf-8', 'replace')

    for element in git_diffstat.split(','):
        fields = element.split()
        if not fields:
            # Empty output (no differences) yields one blank element.
            continue
        for key in stats:
            # Each element looks like "<count> <key>s ..."; the count is
            # the first whitespace separated token.
            if key in element:
                stats[key] = int(fields[0])

    return stats
| |
| |
def get_project_stats(upstream_dir, downstream_dir):
    """Compute diff stats between a downstream and an upstream project.

    The upstream directory is fetched into the downstream git directory,
    and the fetched head (FETCH_HEAD) is then diffed against the
    downstream HEAD. Lines that exist only in the downstream directory
    are considered insertions; lines that exist only in the upstream
    directory are considered deletions.

    Args:
        upstream_dir: A path to the upstream directory to compare.
        downstream_dir: A path to the downstream directory to compare.

    Returns:
        A dict with the count of files modified, lines added and lines
        removed. All counts are 0 when either directory is missing.
    """
    # Without both sides there is nothing to compare.
    if not (upstream_dir and downstream_dir):
        return {'file': 0, 'insertion': 0, 'deletion': 0}

    print('Diffing %s vs %s' % (downstream_dir, upstream_dir))
    git(['-C', downstream_dir, 'fetch', '--update-shallow', upstream_dir])
    return get_revision_diff_stats(downstream_dir, 'FETCH_HEAD', 'HEAD')
| |
| |
def match_project_by_root_commits(
        downstream_project_name, downstream_project_path,
        upstream_root_commits):
    """Match a downstream project to an upstream project by root commit.

    Walks the root commits of the downstream project and stops at the
    first one that is also a root commit of some upstream project.

    Args:
        downstream_project_name: A string with the downstream project name.
        downstream_project_path: A string with the downstream project path.
        upstream_root_commits: A dict of root commits and the list of
            upstream projects that contain each of them.

    Returns:
        A string with the matched upstream project name, or None when no
        root commit is shared or when the first shared root commit belongs
        to more than one upstream project.
    """
    for commit in find_root_commits_in_path(downstream_project_path):
        if commit not in upstream_root_commits:
            continue

        # The first shared root commit decides the outcome; later root
        # commits are never examined.
        candidates = upstream_root_commits[commit]
        if len(candidates) <= 1:
            return candidates[0]['name']

        print('Warning: ' + downstream_project_name +
              ' matches multiple projects')
        print(candidates)
        return None

    return None
| |
| |
def match_projects(upstream_projects, downstream_projects):
    """Match downstream projects to upstream projects.

    Every downstream project is matched first by name and, failing that,
    by a shared root commit. Upstream projects that no downstream project
    matched are reported with a downstream of None.

    Args:
        upstream_projects: A dict of upstream project paths keyed by name.
        downstream_projects: A dict of downstream project paths keyed by
            name.

    Returns:
        A list of dicts with 'upstream' and 'downstream' project names.
        Either value may be None when the project has no counterpart.
    """
    project_matches = []

    # Keep track of upstream projects that have not been matched.
    unmatched_upstream_projects = set(upstream_projects)

    upstream_root_commits = find_root_commits_in_projects(upstream_projects)

    # items() works on both Python 2 and Python 3 (iteritems() does not).
    for downstream_name, downstream_path in downstream_projects.items():
        if downstream_name in upstream_projects:
            # First try to match projects by name.
            upstream_match = downstream_name
        else:
            # No name match, so fall back to matching by root commit.
            upstream_match = match_project_by_root_commits(
                downstream_name, downstream_path, upstream_root_commits)

        project_matches.append({
            'upstream': upstream_match,
            'downstream': downstream_name,
        })
        unmatched_upstream_projects.discard(upstream_match)

    # Add all upstream projects that have not been matched; sorted so the
    # report order does not depend on set iteration order.
    project_matches.extend(
        {'upstream': project, 'downstream': None}
        for project in sorted(unmatched_upstream_projects))

    return project_matches
| |
| |
def filter_exclusion_list(projects, exclusion_file):
    """Remove all projects whose names match the exclusion patterns.

    Args:
        projects: A dict of project paths keyed by project names.
        exclusion_file: Path to a file holding one regular expression per
            line. A falsy value disables filtering.

    Returns:
        A dict with the projects whose names do not match any pattern.
        The ``projects`` dict itself is returned when there is nothing to
        filter with.
    """
    if not exclusion_file:
        return projects

    with open(exclusion_file) as f:
        stripped = (line.strip() for line in f)
        # Blank lines must be dropped: an empty alternative in the joined
        # pattern matches every name and would exclude all projects.
        patterns = [pattern for pattern in stripped if pattern]

    if not patterns:
        return projects

    exclusion_regex = re.compile('|'.join(patterns))

    filtered = {}
    # items() works on both Python 2 and Python 3 (iteritems() does not).
    for name, path in projects.items():
        # match() anchors each pattern at the start of the project name.
        if exclusion_regex.match(name):
            print('Excluding ' + name)
        else:
            filtered[name] = path

    return filtered
| |
| |
def get_all_projects_stats(upstream_source_tree,
                           downstream_source_tree,
                           exclusion_file):
    """Find the stats of all projects in a source tree.

    Args:
        upstream_source_tree: A string with the path to the upstream gerrit
            source tree.
        downstream_source_tree: A string with the path to the downstream
            gerrit source tree.
        exclusion_file: A string with the path to the exclusion file.

    Returns:
        A list of dicts of matching upstream and downstream projects
        including stats for projects that match.
    """
    upstream_projects = get_projects_with_filter(
        upstream_source_tree, exclusion_file)
    downstream_projects = get_projects_with_filter(
        downstream_source_tree, exclusion_file)
    matches = match_projects(upstream_projects, downstream_projects)

    def stats_for(match):
        return stats_from_match(upstream_projects, downstream_projects, match)

    # One worker thread per CPU; each match is processed independently.
    pool = multiprocessing.pool.ThreadPool(
        processes=multiprocessing.cpu_count())
    try:
        return pool.map(stats_for, matches)
    finally:
        # Release the worker threads instead of leaking the pool.
        pool.close()
        pool.join()
| |
| |
def stats_from_match(upstream_projects, downstream_projects, match):
    """Find the stats of a single match of two projects.

    Args:
        upstream_projects: A dict of upstream project paths keyed by
            project name.
        downstream_projects: A dict of downstream project paths keyed by
            project name.
        match: A single match dict obtained from match_projects, holding
            the 'upstream' and 'downstream' project names.

    Returns:
        The diff stats dict for this match, extended with a 'status'
        label, the 'downstream_path' and the entries of ``match``.
    """
    upstream_name = match['upstream']
    downstream_name = match['downstream']
    downstream_path = downstream_projects.get(downstream_name)

    result = get_project_stats(
        upstream_projects.get(upstream_name),
        downstream_path,
    )

    # Classify the match; the first applicable label wins.
    if not upstream_name:
        status = 'Downstream Only Projects'
    elif not downstream_name:
        status = 'Upstream Only Projects'
    elif result['file'] == 0:
        status = 'Intact Projects'
    elif upstream_name == downstream_name:
        status = 'Modified Projects'
    else:
        status = 'Forked Projects'

    result['status'] = status
    result['downstream_path'] = downstream_path
    result.update(match)
    return result
| |
| |
def get_projects_with_filter(source_tree, exclusion_file):
    """Return the projects of a source tree minus the excluded ones.

    Args:
        source_tree: A path to the source tree.
        exclusion_file: Path to the exclusion file handed to
            filter_exclusion_list.

    Returns:
        The result of filter_exclusion_list applied to the projects
        found by get_projects.
    """
    all_projects = get_projects(source_tree)
    return filter_exclusion_list(all_projects, exclusion_file)
| |
| |
def find_root_commits_in_path(path):
    """Return the list of root (parentless) commits reachable from HEAD.

    Args:
        path: Path of the git project to inspect.

    Returns:
        The lines printed by ``git rev-list --max-parents=0 HEAD``, one
        entry per root commit.
    """
    print('Analyzing history of ' + path)
    rev_list_args = ['-C', path, 'rev-list', '--max-parents=0', 'HEAD']
    return git(rev_list_args).splitlines()
| |
| |
def find_root_commits_in_projects(projects):
    """Index projects by their root commits.

    Args:
        projects: A dict of project paths keyed by project names.

    Returns:
        A dict keyed by root commit. Each value is the list of
        ``{'name': ..., 'path': ...}`` dicts of all projects that contain
        that root commit.
    """
    root_commits = {}
    # items() works on both Python 2 and Python 3 (iteritems() does not).
    for name, path in projects.items():
        for root in find_root_commits_in_path(path):
            root_commits.setdefault(root, []).append({
                'name': name,
                'path': path,
            })
    return root_commits
| |
| |
def get_commit_stats_in_project(project):
    """Extract commits that have not been upstreamed in a specific project.

    Args:
        project: A dict with the project 'name' and its 'downstream_path'.

    Returns:
        A dict with the project 'name' and 'stats', a list holding the
        'commit', 'author' and 'subject' of every commit reported by
        git_commits_not_upstreamed.find for FETCH_HEAD versus HEAD.
    """
    name = project['name']
    path = project['downstream_path']

    def show_field(commit, placeholder):
        # Ask git for a single formatted field of one commit.
        shown = git(['-C', path, 'show', '--no-patch',
                     '--format=' + placeholder, commit])
        return shown.strip()

    print('Finding commits not upstreamed in ' + name)
    commits = git_commits_not_upstreamed.find('FETCH_HEAD', 'HEAD', path)
    print('Found commits not upstreamed in ' + name)

    stats = [
        {
            'commit': commit,
            'author': show_field(commit, '%ae'),
            'subject': show_field(commit, '%s'),
        }
        for commit in commits
    ]

    return {'name': name, 'stats': stats}
| |
| |
def get_all_commits_stats(project_stats):
    """Extract commits that have not been upstreamed in all projects.

    Only projects whose status starts with 'Modified' are analyzed.

    Args:
        project_stats: A list of dicts of matching upstream and downstream
            projects including stats for projects that match. The dicts of
            modified projects gain a 'name' key as a side effect.

    Returns:
        A dict keyed by downstream project name whose values are the lists
        of commits not upstreamed. Empty when no project is modified.
    """
    downstream_stats = {match['downstream']: match for match in project_stats}

    # Only analyze modified projects.
    modified_projects = []
    # items() works on both Python 2 and Python 3 (iteritems() does not).
    for name, stats in downstream_stats.items():
        if stats['status'].startswith('Modified'):
            stats['name'] = name
            modified_projects.append(stats)

    # Do not spawn worker processes when there is nothing to analyze.
    if not modified_projects:
        return {}

    pool = multiprocessing.Pool()
    try:
        results = pool.map(get_commit_stats_in_project, modified_projects)
    finally:
        # Release the worker processes instead of leaking the pool.
        pool.close()
        pool.join()

    return {result['name']: result['stats'] for result in results}
| |
| |
def write_commit_csv(commit_stats, commit_output_file):
    """Write the commits that have not been upstreamed to a CSV file.

    One row is written per commit, stamped with today's date.

    Args:
        commit_stats: A dict keyed by downstream project name whose values
            are lists of dicts with 'commit', 'author' and 'subject'.
        commit_output_file: Path to the output file.
    """
    fieldnames = [
        'Date',
        'Commit',
        'Downstream Project',
        'Author',
        'Subject',
    ]
    today = datetime.datetime.today().strftime('%Y/%m/%d')
    with open(commit_output_file, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        # items() works on both Python 2 and Python 3 (iteritems() does not).
        for project, stats in commit_stats.items():
            for stat in stats:
                writer.writerow({
                    'Date': today,
                    'Commit': stat['commit'],
                    'Downstream Project': project,
                    'Author': stat['author'],
                    'Subject': stat['subject'],
                })
    print('Wrote commit stats to ' + commit_output_file)
| |
| |
def write_project_csv(project_stats, commit_stats, project_output_file):
    """Write the per-project comparison report to a CSV file.

    One row is written per entry of ``project_stats``, stamped with
    today's date.

    Args:
        project_stats: An iterable of per-project stats dicts with the keys
            'downstream', 'upstream', 'status', 'file', 'insertion' and
            'deletion'.
        commit_stats: A dict keyed by downstream project name whose values
            are the lists of commits not upstreamed.
        project_output_file: Path to the output file.
    """
    columns = [
        'Date',
        'Downstream Project',
        'Upstream Project',
        'Diff Status',
        'Files Changed',
        'Line Insertions',
        'Line Deletions',
        'Line Changes',
        'Commits Not Upstreamed',
    ]
    with open(project_output_file, 'w') as out:
        report = csv.DictWriter(out, fieldnames=columns)
        report.writeheader()
        date_stamp = datetime.datetime.today().strftime('%Y/%m/%d')
        for entry in project_stats:
            downstream = entry['downstream']
            # Projects without an entry in commit_stats count as zero.
            if downstream in commit_stats:
                pending = len(commit_stats[downstream])
            else:
                pending = 0
            added = entry['insertion']
            removed = entry['deletion']
            report.writerow({
                'Date': date_stamp,
                'Downstream Project': downstream,
                'Upstream Project': entry['upstream'],
                'Diff Status': entry['status'],
                'Files Changed': entry['file'],
                'Line Insertions': added,
                'Line Deletions': removed,
                'Line Changes': added + removed,
                'Commits Not Upstreamed': pending,
            })
    print('Wrote project stats to ' + project_output_file)
| |
| |
def diff(upstream_source_tree, downstream_source_tree, project_output_file,
         commit_output_file, exclusions_file):
    """Diff one repo source tree against another and write the reports.

    Gathers the per-project stats, then the commits not upstreamed, and
    writes the commit CSV followed by the project CSV.

    Args:
        upstream_source_tree: A string with the path to a gerrit source
            tree.
        downstream_source_tree: A string with the path to a gerrit source
            tree.
        project_output_file: Path to the project output file.
        commit_output_file: Path to the commit output file.
        exclusions_file: Path to exclusions file.
    """
    per_project = get_all_projects_stats(
        upstream_source_tree, downstream_source_tree, exclusions_file)
    per_commit = get_all_commits_stats(per_project)

    write_commit_csv(per_commit, commit_output_file)
    write_project_csv(per_project, per_commit, project_output_file)
| |
| |
def main():
    """Parse the command line and run the source tree diff.

    All paths given on the command line are converted to absolute paths
    before being handed to diff(). An empty exclusions file argument is
    passed through as an empty string.
    """
    parser = argparse.ArgumentParser(
        description='Diff a repo source tree against an upstream source tree.')
    parser.add_argument(
        'upstream_path', help='Path to an upstream source tree.')
    parser.add_argument(
        'downstream_path', help='Path to a downstream source tree.')
    parser.add_argument(
        '-p',
        '--project_output_file',
        help='Path to write the project output file',
        default='project.csv',
    )
    parser.add_argument(
        '-c',
        '--commit_output_file',
        help='Path to write the commit output file',
        default='commit.csv',
    )
    parser.add_argument(
        '-e',
        '--exclusions_file',
        # Adjacent literals are concatenated verbatim, so each fragment
        # needs its own trailing space.
        help='Path to file with a list of project names to be excluded from '
        'the diff. You may use a regular expression to match project names as '
        'described in https://docs.python.org/2/howto/regex.html',
        default='',
    )
    args = parser.parse_args()

    upstream_source_tree = os.path.abspath(args.upstream_path)
    downstream_source_tree = os.path.abspath(args.downstream_path)
    project_output_file = os.path.abspath(args.project_output_file)
    commit_output_file = os.path.abspath(args.commit_output_file)
    # Keep "no exclusions" as an empty string rather than the absolute
    # path of the current directory.
    exclusions_file = ''
    if args.exclusions_file:
        exclusions_file = os.path.abspath(args.exclusions_file)

    diff(upstream_source_tree, downstream_source_tree, project_output_file,
         commit_output_file, exclusions_file)


if __name__ == '__main__':
    main()