blob: 7e4ef0c31db52fc68692c06c09727ec9f1b29c54 [file] [log] [blame]
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utilities for scanning source files to determine code authorship.
"""
import itertools
def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Automatically keeps only source code files (selected by file extension) and
  leaves out everything located inside an excluded directory.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip. An entry matches
      wherever it appears as a complete run of path components, including
      directly under |root_dir|.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  path_join = input_api.os_path.join
  # Excluded directories are looked up as '/dir/' fragments so that only whole
  # path components can match.
  excluded_fragments = ['/' + d + '/' for d in excluded_dirs_list]

  def IsExcluded(rel_path, is_dir):
    # Relative paths carry no leading '/', and directory paths no trailing
    # '/'. Add them before matching; otherwise a directory located directly
    # under the root, or the excluded directory itself, could never match a
    # '/dir/' fragment.
    probe = rel_path
    if not probe.startswith('/'):
      probe = '/' + probe
    if is_dir and not probe.endswith('/'):
      probe += '/'
    return any(fragment in probe for fragment in excluded_fragments)

  files_whitelist_re = input_api.re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      r'|tex|mli?)$')
  files = []
  # Number of leading characters ('<root_dir>/') to cut from a full path in
  # order to make it relative to |root_dir|.
  prefix_len = len(root_dir) + 1
  for path in start_paths_list:
    full_path = path_join(root_dir, path)
    if input_api.os_path.isfile(full_path):
      # Explicitly listed files must honor the excluded directories as well.
      if files_whitelist_re.search(path) and not IsExcluded(path, False):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in input_api.os_walk(full_path):
      # Remove excluded subdirs for faster scanning. The list is updated in
      # place, because that is how the walker learns what to skip.
      dirnames[:] = [
          name for name in dirnames
          if not IsExcluded(path_join(dirpath, name)[prefix_len:], True)]
      for filename in filenames:
        filepath = path_join(dirpath, filename)[prefix_len:]
        if files_whitelist_re.search(filepath) and \
            not IsExcluded(filepath, False):
          files.append(filepath)
  return files
class _GeneratedFilesDetector(object):
  """Decides, by looking at a file header, whether the file is generated."""

  # Marker strings that FindCopyrights() stores in its result in place of
  # real copyright strings.
  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    """Compiles the regular expressions used by IsGeneratedFile().

    Args:
      input_api: InputAPI, as in presubmit scripts. Only its |re| module is
        used here.
    """
    # These two match a Python triple-quoted string, closed or not.
    self.python_multiline_string_double_re = input_api.re.compile(
        r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
    self.python_multiline_string_single_re = input_api.re.compile(
        r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
    # Every fragment is a raw string, so that regexp escapes such as \W are
    # not treated as (invalid) string escapes by Python.
    self.automatically_generated_re = input_api.re.compile(
        r'(All changes made in this file will be lost'
        r'|DO NOT (EDIT|delete this file)'
        r'|Generated (at|automatically|data)'
        r'|Automatically generated'
        r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

  def IsGeneratedFile(self, header):
    """Checks whether the header carries an 'automatically generated' notice.

    Args:
      header: The text of the first lines of a file.
    Returns:
      True if a generated file notice has been found, False otherwise.
    """
    header = header.upper()
    # Text inside Python triple-quoted strings is dropped before matching.
    if '"""' in header:
      header = self.python_multiline_string_double_re.sub('', header)
    if "'''" in header:
      header = self.python_multiline_string_single_re.sub('', header)
    # First do simple strings lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      # Always answer with a bool, like the other return paths do.
      return self.automatically_generated_re.search(header) is not None
    return False
class _CopyrightsScanner(object):
  """Extracts copyright statements from source text, one line at a time.

  An instance remembers where the latest '(a)' and '(b)' list items were seen
  in C++ comments, so the lines of a file must be passed to MatchLine() in
  order. StaticInit() must be called before the first MatchLine() call.
  """

  @staticmethod
  def StaticInit(input_api):
    """Compiles the regular expressions shared by all scanner instances.

    Args:
      input_api: InputAPI, as in presubmit scripts. Only its |re| module is
        used here.
    """
    # Despite the name, this matches double-quoted string literals (with
    # backslash escapes); MatchLine() removes them before looking for
    # copyrights.
    _CopyrightsScanner._c_comment_re = \
        input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    # '\xc2\xa9' is the UTF-8 encoding of the copyright sign.
    _CopyrightsScanner._copyright_indicator = \
        r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
        r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator +
        r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
    # Words that, when following the indicator, tell that the line only talks
    # about copyright (e.g. 'copyright notice') and names no holder.
    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
        r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

  def __init__(self, input_api):
    """Creates a scanner with no list items seen so far.

    Args:
      input_api: InputAPI, as in presubmit scripts. Only its |re| module is
        used here.
    """
    self.max_line_numbers_proximity = 3
    # Initial values are chosen to be far from any real line number and from
    # each other, so they never pass the proximity check.
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    """Tells whether line |a| is at most a few lines after line |b|."""
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    """Looks for a copyright statement in the given line.

    Args:
      line_number: The number of the line in the file being scanned.
      line: The text of the line.
    Returns:
      The prettified authorship string, or None if the line contains no
      copyright statement.
    """
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurrences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if 'COPYRIGHT' not in upcase_line and 'COPR.' not in upcase_line \
        and '\xc2\xa9' not in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      # A '(c)' that closely follows '(a)' and '(b)' is treated as a list
      # item rather than a copyright sign.
      if c_item_index > cpp_comment_idx and \
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and \
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      # Drop a trailing comma or period together with trailing whitespace.
      # (The pattern used to end with '$/', which can never match.)
      copyr = self.re.sub(r'[,.]?\s*$', '', copyr)
      # Remove the copyright indicators themselves.
      copyr = self.re.sub(
          _CopyrightsScanner._copyright_indicator, '', copyr,
          flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      # Unescape '\@'.
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr
def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of copyrights associated with each of the files given, in the
    same order. If a file is generated, its list consists of a single entry,
    _GeneratedFilesDetector.GENERATED_FILE. If a file has no copyright info,
    its list consists of a single entry, _GeneratedFilesDetector.NO_COPYRIGHT.
  """
  _CopyrightsScanner.StaticInit(input_api)
  detector = _GeneratedFilesDetector(input_api)
  results = []
  for file_name in files_to_scan:
    # The scanner keeps per-file state, hence a fresh one for every file.
    scanner = _CopyrightsScanner(input_api)
    text = input_api.ReadFile(
        input_api.os_path.join(root_dir, file_name), 'r')
    lines = text.split('\n')
    found = []
    for number, line in enumerate(lines, 1):
      authorship = scanner.MatchLine(number, line)
      if authorship:
        found.append(authorship)
    # Only the first 25 lines are checked for a 'generated file' notice.
    if detector.IsGeneratedFile('\n'.join(lines[:25])):
      entry = [_GeneratedFilesDetector.GENERATED_FILE]
    elif found:
      entry = found
    else:
      entry = [_GeneratedFilesDetector.NO_COPYRIGHT]
    results.append(entry)
  return results
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Generated files and files with no copyright info are not reported.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      r'All rights reserved.*)$')
  skipped_markers = (_GeneratedFilesDetector.GENERATED_FILE,
                     _GeneratedFilesDetector.NO_COPYRIGHT)
  # The builtin zip() is used instead of itertools.izip(), as the latter
  # does not exist in Python 3.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] in skipped_markers:
      continue
    # A single foreign copyright is enough to report the file.
    if any(not allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(input_api.os_path.normpath(f))
  return offending_files