| #!/usr/bin/python3 |
| # |
| # Copyright (C) 2021 The Android Open Source Project |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| """Utilities for comparing two versions of a codebase.""" |
| |
| import argparse |
| import difflib |
| import filecmp |
| import os |
| import pathlib |
| import re |
| |
| |
class FileStat:
  """Size and diff counters for one version of a file.

  Attributes:
    file_name: The file path as a string, or '' for the empty placeholder.
    size: The file size in bytes as reported by stat(), 0 for the placeholder.
    line_cnt: Number of lines; starts at 0 and is set by code that reads the
      file.
    group_cnt: Number of diff groups counted for this version.
    add_line_cnt: Number of added lines counted for this version.
    remove_line_cnt: Number of removed lines counted for this version.
    replace_line_cnt: Number of replaced lines counted for this version.
  """

  # Values for the "is this file text" classification.
  NON_TEXT = 0
  TEXT = 1

  # Column names, in the exact order get_csv_str() emits the values.
  _CSV_COLUMNS = ('file', 'size', 'line', 'group', 'add', 'remove', 'replace')

  def __init__(self, file_path):
    """Initializes from a file path.

    Args:
      file_path: A path string or pathlib.Path of an existing file. A falsy
        value (None or '') creates an empty placeholder with no name and a
        size of 0.

    Raises:
      OSError: If the file cannot be stat'ed.
    """
    if file_path:
      self.file_name = str(file_path)
      # Coerce so that plain strings work as well as pathlib.Path objects.
      self.size = pathlib.Path(file_path).stat().st_size
    else:
      self.file_name = ''
      self.size = 0

    self.line_cnt = 0
    self.group_cnt = 0
    self.add_line_cnt = 0
    self.remove_line_cnt = 0
    self.replace_line_cnt = 0

  @staticmethod
  def get_csv_header(prefix=None):
    """Returns the CSV header string.

    Args:
      prefix: Optional string; when given every column is named
        '<prefix>_<column>'.

    Returns:
      The comma-joined column names.
    """
    if prefix:
      return ','.join(
          '{0}_{1}'.format(prefix, c) for c in FileStat._CSV_COLUMNS)
    return ','.join(FileStat._CSV_COLUMNS)

  def get_csv_str(self, strip_dir_len=0):
    """Returns the file statistic CSV string.

    Args:
      strip_dir_len: Number of leading characters to drop from file_name.

    Returns:
      The comma-joined values, in the column order of get_csv_header().
    """
    name = self.file_name[strip_dir_len:]
    csv = [
        FileStat.no_comma(name), self.size, self.line_cnt, self.group_cnt,
        self.add_line_cnt, self.remove_line_cnt, self.replace_line_cnt
    ]
    return ','.join(str(i) for i in csv)

  @staticmethod
  def no_comma(astr):
    """Replaces , with _ so the value cannot split a CSV field."""
    return astr.replace(',', '_')
| |
| |
class DiffStat:
  """Diff statistic class for 2 versions of a file.

  Attributes:
    old_file_stat: Statistics object of the old version.
    new_file_stat: Statistics object of the new version.
    name: The file name common to both versions.
    ext: The extension of name without the leading dot.
    state: One of SAME, NEW, REMOVED, MODIFIED or INCOMPARABLE.
    file_type: FileStat.TEXT or FileStat.NON_TEXT; starts as NON_TEXT.
  """

  SAME = 0
  NEW = 1
  REMOVED = 2
  MODIFIED = 3
  INCOMPARABLE = 4

  # These align with https://github.com/python/cpython/blob/3.9/Lib/difflib.py
  # and are compiled once instead of on every add_diff_stat() call.
  _OLD_FILE_HEADER = re.compile(r'\*{3} (.*)')
  _NEW_FILE_HEADER = re.compile(r'-{3} (.*)')
  _GROUP_SEPARATOR = '***************'
  # difflib writes a range covering at most one line as a bare number
  # ('*** 1 ****'), so the ',<end>' part has to be optional.
  _OLD_GROUP_HEADER = re.compile(r'\*{3} \d+(?:,\d+)? \*{4}')
  _NEW_GROUP_HEADER = re.compile(r'-{3} \d+(?:,\d+)? -{4}')

  def __init__(self, common_name, old_file_stat, new_file_stat, state):
    """Initializes with the common name, both file stats and the state."""
    self.old_file_stat = old_file_stat
    self.new_file_stat = new_file_stat
    self.name = common_name
    self.ext = os.path.splitext(self.name)[1].lstrip('.')
    self.state = state
    self.file_type = FileStat.NON_TEXT

  def add_diff_stat(self, diff_lines):
    """Adds the statistic by the diff lines.

    Group, add, remove and replace counters are incremented on old_file_stat
    for lines in the old ('***') section of a group and on new_file_stat for
    lines in the new ('---') section.

    Args:
      diff_lines: List of lines in difflib.context_diff() format, starting
        with the two file header lines.
    """
    if len(diff_lines) < 2:
      print('ERROR: incomplete diff header: %s' % diff_lines)
      return

    if not DiffStat._OLD_FILE_HEADER.match(diff_lines[0]):
      print('ERROR: wrong diff header line 1: %s' % diff_lines[0])
      return

    if not DiffStat._NEW_FILE_HEADER.match(diff_lines[1]):
      print('ERROR: wrong diff header line 2: %s' % diff_lines[1])
      return

    # The stat object of the section being read; None until a group header
    # has been seen.
    current = None
    in_group = False

    for line in diff_lines[2:]:
      if in_group:
        # Content lines are tested first: they carry a two character prefix
        # that none of the header lines start with.
        if line.startswith(' '):
          # equal
          continue
        elif line.startswith('! '):
          # replace
          if current is not None:
            current.replace_line_cnt += 1
          continue
        elif line.startswith('+ '):
          # add
          if current is not None:
            current.add_line_cnt += 1
          continue
        elif line.startswith('- '):
          # removed
          if current is not None:
            current.remove_line_cnt += 1
          continue

      if DiffStat._OLD_GROUP_HEADER.match(line):
        current = self.old_file_stat
        current.group_cnt += 1
        continue

      if DiffStat._NEW_GROUP_HEADER.match(line):
        current = self.new_file_stat
        current.group_cnt += 1
        continue

      if line.startswith(DiffStat._GROUP_SEPARATOR):
        in_group = True
        continue
| |
| |
class ChangeReport:
  """Change report class for the diff statistics on 2 versions of a codebase.

  Attributes:
    old_dir: The absolute old codebase dir path string.
    new_dir: The absolute new codebase dir path string.
    dircmp: The filecmp.dircmp object comparing old_dir against new_dir.
  """

  def __init__(self, old_dir, new_dir, ignores=None, state_filter=None):
    """Initializes with old & new dir path strings.

    Args:
      old_dir: Path of the old codebase dir.
      new_dir: Path of the new codebase dir.
      ignores: Optional comma-separated names to skip, in addition to
        filecmp.DEFAULT_IGNORES.
      state_filter: Optional comma-separated DiffStat state numbers to
        process; all five states are processed when omitted.

    Raises:
      ValueError: If state_filter contains an item that is not an integer.
    """
    self.old_dir = os.path.abspath(old_dir)
    # +1 also drops the path separator that follows the dir prefix.
    self._old_dir_prefix_len = len(self.old_dir) + 1
    self.new_dir = os.path.abspath(new_dir)
    self._new_dir_prefix_len = len(self.new_dir) + 1

    # Always build a new list so the module-level filecmp.DEFAULT_IGNORES is
    # never shared with (and possibly mutated through) this object.
    self._ignores = ignores.split(',') if ignores else []
    self._ignores.extend(filecmp.DEFAULT_IGNORES)

    if state_filter:
      self._state_filter = [int(s) for s in state_filter.split(',')]
    else:
      self._state_filter = [0, 1, 2, 3, 4]

    self._do_same = DiffStat.SAME in self._state_filter
    self._do_new = DiffStat.NEW in self._state_filter
    self._do_removed = DiffStat.REMOVED in self._state_filter
    self._do_modified = DiffStat.MODIFIED in self._state_filter
    self._do_incomparable = DiffStat.INCOMPARABLE in self._state_filter

    self.dircmp = filecmp.dircmp(
        self.old_dir, self.new_dir, ignore=self._ignores)
    self._diff_stats = []
    self._diff_stat_lines = []
    self._diff_lines = []
    self._processed_cnt = 0
    self._common_dir_len = ChangeReport.get_common_path_len(
        self.old_dir, self.new_dir)

  @staticmethod
  def get_common_path_len(dir1, dir2):
    """Gets the length of the common path of old & new folders.

    Args:
      dir1: A dir path string.
      dir2: Another dir path string.

    Returns:
      The index just after the last path separator inside the leading run of
      characters the two strings share; 1 when that run has no separator.
    """
    sep = os.path.sep
    last_sep_pos = 0
    # zip() stops at the shorter string, so neither one is indexed past its
    # end. The mismatch test comes first so that a separator present in only
    # one of the paths is not taken as part of the common prefix.
    for i, (c1, c2) in enumerate(zip(dir1, dir2)):
      if c1 != c2:
        break
      if c1 == sep:
        last_sep_pos = i
    return last_sep_pos + 1

  @staticmethod
  def get_diff_stat_header():
    """Gets the diff statistic CSV header."""
    return 'file,ext,text,state,{0},{1}\n'.format(
        FileStat.get_csv_header('new'), FileStat.get_csv_header('old'))

  def _ensure_processed(self):
    """Walks the dir comparison once; later calls are no-ops."""
    if self._processed_cnt < 1:
      self._process_dircmp(self.dircmp)
      self._processed_cnt += 1

  def get_diff_stat_lines(self):
    """Gets the diff statistic CSV lines.

    Returns:
      A list with one newline-terminated CSV row per collected file, with the
      new version's columns before the old version's, as in
      get_diff_stat_header().
    """
    self._ensure_processed()

    self._diff_stat_lines = []
    for diff_stat in self._diff_stats:
      self._diff_stat_lines.append('{0},{1},{2},{3},{4},{5}\n'.format(
          FileStat.no_comma(diff_stat.name), diff_stat.ext,
          diff_stat.file_type, diff_stat.state,
          diff_stat.new_file_stat.get_csv_str(self._common_dir_len),
          diff_stat.old_file_stat.get_csv_str(self._common_dir_len)))

    return self._diff_stat_lines

  def get_diff_lines(self):
    """Gets the diff output lines of all modified text files."""
    self._ensure_processed()
    return self._diff_lines

  def _process_dircmp(self, dircmp):
    """Compares all files in a dircmp object for diff statistics & output."""
    if self._do_modified:
      self._process_diff_files(dircmp)

    for subdir_dircmp in dircmp.subdirs.values():
      rp = pathlib.Path(subdir_dircmp.right)
      lp = pathlib.Path(subdir_dircmp.left)
      if rp.is_symlink() or lp.is_symlink():
        print('SKIP: symlink: {0} or {1}'.format(subdir_dircmp.right,
                                                 subdir_dircmp.left))
        continue
      self._process_dircmp(subdir_dircmp)

    if self._do_new:
      self._process_others(dircmp.right_only, dircmp.right,
                           self._new_dir_prefix_len, DiffStat.NEW)
    if self._do_same:
      self._process_others(dircmp.same_files, dircmp.right,
                           self._new_dir_prefix_len, DiffStat.SAME)
    if self._do_incomparable:
      self._process_others(dircmp.funny_files, dircmp.right,
                           self._new_dir_prefix_len, DiffStat.INCOMPARABLE)
    if self._do_removed:
      self._process_others(dircmp.left_only, dircmp.left,
                           self._old_dir_prefix_len, DiffStat.REMOVED)

  def _process_others(self, files, adir, prefix_len, state):
    """Processes files that are not modified.

    Args:
      files: Names or paths of the entries to process.
      adir: The dir the entries are joined onto.
      prefix_len: Number of leading characters to drop from a file path to
        get its common name.
      state: The DiffStat state recorded for every file found.
    """
    empty_stat = FileStat(None)
    for file in files:
      file_path = pathlib.Path(adir, file)
      if file_path.is_symlink():
        print('SKIP: symlink: {0}, {1}'.format(state, file_path))
        continue
      elif file_path.is_dir():
        # A dir that exists on one side only: record every file below it.
        flist = self._get_filtered_files(file_path)
        self._process_others(flist, adir, prefix_len, state)
      else:
        file_stat = FileStat(file_path)
        common_name = str(file_path)[prefix_len:]
        if state == DiffStat.REMOVED:
          # A removed file only has an old version.
          diff_stat = DiffStat(common_name, file_stat, empty_stat, state)
        else:
          diff_stat = DiffStat(common_name, empty_stat, file_stat, state)
        try:
          with open(file_path, encoding='utf-8') as f:
            # Count lines without keeping the whole file in memory.
            file_stat.line_cnt = sum(1 for _ in f)
          file_type = FileStat.TEXT
        except UnicodeDecodeError:
          # Content that is not valid UTF-8 is reported as non-text.
          file_type = FileStat.NON_TEXT

        diff_stat.file_type = file_type
        self._diff_stats.append(diff_stat)

  def _process_diff_files(self, dircmp):
    """Processes files that are modified."""
    for file in dircmp.diff_files:
      old_file_path = pathlib.Path(dircmp.left, file)
      new_file_path = pathlib.Path(dircmp.right, file)
      self._diff_files(old_file_path, new_file_path)

  def _diff_files(self, old_file_path, new_file_path):
    """Diffs old & new files and records the statistics & diff output."""
    old_file_stat = FileStat(old_file_path)
    new_file_stat = FileStat(new_file_path)
    common_name = str(new_file_path)[self._new_dir_prefix_len:]
    diff_stat = DiffStat(common_name, old_file_stat, new_file_stat,
                         DiffStat.MODIFIED)

    try:
      with open(old_file_path, encoding='utf-8') as f1:
        old_lines = f1.readlines()
        old_file_stat.line_cnt = len(old_lines)
      with open(new_file_path, encoding='utf-8') as f2:
        new_lines = f2.readlines()
        new_file_stat.line_cnt = len(new_lines)
      diff_lines = list(
          difflib.context_diff(old_lines, new_lines, old_file_path.name,
                               new_file_path.name))
      file_type = FileStat.TEXT
      if diff_lines:
        self._diff_lines.extend(diff_lines)
        diff_stat.add_diff_stat(diff_lines)
      else:
        print('WARNING: no diff lines on {0} {1}'.format(
            old_file_path, new_file_path))

    except UnicodeDecodeError:
      # Either version is not valid UTF-8: record the pair as non-text and
      # produce no diff output for it.
      file_type = FileStat.NON_TEXT

    diff_stat.file_type = file_type
    self._diff_stats.append(diff_stat)

  def _get_filtered_files(self, dir_path):
    """Returns the entries of dir_path that are not ignored or symlinks."""
    flist = []
    for f in dir_path.glob('*'):
      if f.name not in self._ignores:
        if f.is_symlink():
          print('SKIP: symlink: %s' % f)
          continue
        else:
          flist.append(f)
    return flist
| |
| |
def write_file(file, lines, header=None):
  """Writes an optional header followed by lines into a UTF-8 text file.

  Args:
    file: Path of the output file; it is created or truncated.
    lines: List of strings written as they are; no line ending is added.
    header: Optional string written before the lines.
  """
  # The compared files are decoded as UTF-8, so the output is encoded the
  # same way instead of with the locale default, which may not be able to
  # represent every character that was read.
  with open(file, 'w', encoding='utf-8') as f:
    if header:
      f.write(header)

    f.writelines(lines)
  print('OUTPUT: {0}, {1} lines'.format(file, len(lines)))
| |
| |
def main():
  """Parses the command line and writes the requested report files.

  Raises:
    SystemExit: With status 1 when --old_dir or --new_dir is not an existing
      dir; argparse exits by itself on invalid or missing arguments.
  """
  # The text must go in as description=; the first positional parameter of
  # ArgumentParser is prog, the program name shown in the usage line.
  parser = argparse.ArgumentParser(
      description='Generate a diff stat csv file for 2 versions of a codebase')
  # Both dirs are mandatory: nothing can be compared without them.
  parser.add_argument(
      '--old_dir', required=True, help='the old version codebase dir')
  parser.add_argument(
      '--new_dir', required=True, help='the new version codebase dir')
  parser.add_argument(
      '--csv_file', required=False, help='the diff stat csv file if to create')
  parser.add_argument(
      '--diff_output_file',
      required=False,
      help='the diff output file if to create')
  parser.add_argument(
      '--ignores',
      required=False,
      default='.repo,.git,.github,.idea,__MACOSX,.prebuilt_info',
      help='names to ignore')
  parser.add_argument(
      '--state_filter',
      required=False,
      default='1,2,3',
      help='csv diff states to process, 0:SAME, 1:NEW, 2:REMOVED, 3:MODIFIED, '
      '4:INCOMPARABLE')

  args = parser.parse_args()

  # SystemExit is raised directly: the exit() builtin is only installed by
  # the site module, and it would report success (status 0) for a failure.
  if not os.path.isdir(args.old_dir):
    print('ERROR: %s does not exist.' % args.old_dir)
    raise SystemExit(1)

  if not os.path.isdir(args.new_dir):
    print('ERROR: %s does not exist.' % args.new_dir)
    raise SystemExit(1)

  change_report = ChangeReport(args.old_dir, args.new_dir, args.ignores,
                               args.state_filter)
  if args.csv_file:
    write_file(
        args.csv_file,
        change_report.get_diff_stat_lines(),
        header=ChangeReport.get_diff_stat_header())

  if args.diff_output_file:
    write_file(args.diff_output_file, change_report.get_diff_lines())


if __name__ == '__main__':
  main()