android_webview/tools/copyright_scanner.py - platform/external/chromium_org - Git at Google

 # Copyright 2014 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Utilities for scanning source files to determine code authorship.
 """

 import itertools


 def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
   """Searches for source code files, similar to the UNIX utility find(1).

   Only files whose extension marks them as source code are kept, and
   everything located inside an excluded directory is left out.

   Args:
     input_api: InputAPI, as in presubmit scripts. Only its |re|, |os_path|
       and |os_walk| members are used here.
     root_dir: The root directory, to which all other paths are relative.
     start_paths_list: The list of paths to start search from. Each path can
       be a file or a directory.
     excluded_dirs_list: The list of directories to skip, written with '/'
       separators. An entry is matched at any depth below |root_dir|.
   Returns:
     The list of source code files found, relative to |root_dir|.
   """
   # Directory names are matched as whole path components, hence the
   # surrounding separators.
   excluded_components = ['/' + d + '/' for d in excluded_dirs_list]

   def IsInExcludedDir(rel_path):
     # The leading separator lets an excluded directory located directly
     # under |root_dir| match: 'skip/a.c' alone never contains '/skip/'.
     anchored = '/' + rel_path
     return any(item in anchored for item in excluded_components)

   def IsExcludedDir(rel_dir):
     # A directory path has no trailing separator, so add one to make the
     # directory itself (and not only its contents) match.
     return IsInExcludedDir(rel_dir + '/')

   source_file_re = input_api.re.compile(
     r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
     r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
     r'|tex|mli?)$')
   path_join = input_api.os_path.join
   # Number of leading characters to drop from a joined path to make it
   # relative to |root_dir|. join() does not duplicate a trailing separator
   # of |root_dir|, and an empty |root_dir| adds no prefix at all.
   prefix_len = len(root_dir.rstrip('/')) + 1 if root_dir else 0

   files = []
   for path in start_paths_list:
     full_path = path_join(root_dir, path)
     if input_api.os_path.isfile(full_path):
       # Explicitly listed files obey the exclusions too.
       if source_file_re.search(path) and not IsInExcludedDir(path):
         files.append(path)
       continue
     for dirpath, dirnames, filenames in input_api.os_walk(full_path):
       # Prune excluded subdirs in place, so that the walk never descends
       # into them.
       dirnames[:] = [
         name for name in dirnames
         if not IsExcludedDir(path_join(dirpath, name)[prefix_len:])]
       for filename in filenames:
         filepath = path_join(dirpath, filename)[prefix_len:]
         if source_file_re.search(filepath) and \
             not IsInExcludedDir(filepath):
           files.append(filepath)
   return files


 class _GeneratedFilesDetector(object):
   """Decides from a file header whether the file was machine-generated."""

   # Markers reported in place of a real list of copyrights.
   GENERATED_FILE = 'GENERATED FILE'
   NO_COPYRIGHT = '*No copyright*'

   def __init__(self, input_api):
     """Compiles the regular expressions used by IsGeneratedFile().

     Args:
       input_api: InputAPI, as in presubmit scripts. Only its |re| member is
         used.
     """
     # Python triple-quoted strings are cut out of the header before it is
     # searched for markers; an unterminated one is cut up to a line end.
     self.python_multiline_string_double_re = input_api.re.compile(
       r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
     self.python_multiline_string_single_re = input_api.re.compile(
       r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
     # Every fragment is a raw string, so that the regexp escapes are never
     # treated as (invalid) string escapes.
     self.automatically_generated_re = input_api.re.compile(
       r'(All changes made in this file will be lost'
       r'|DO NOT (EDIT|delete this file)'
       r'|Generated (at|automatically|data)'
       r'|Automatically generated'
       r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

   def IsGeneratedFile(self, header):
     """Tells whether |header| carries a "generated file" marker.

     Args:
       header: The leading lines of a file, as a single string.
     Returns:
       True if a marker is found outside of Python multiline strings,
       False otherwise.
     """
     header = header.upper()
     if '"""' in header:
       header = self.python_multiline_string_double_re.sub('', header)
     if "'''" in header:
       header = self.python_multiline_string_single_re.sub('', header)
     # First do simple strings lookup to save time.
     if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
       return True
     if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
         'GENERATED' in header:
       # Hand back a real boolean on every path, not a match object or None.
       return bool(self.automatically_generated_re.search(header))
     return False


 class _CopyrightsScanner(object):
   """Extracts copyright statements from a file, one line at a time.

   StaticInit() has to be called once before MatchLine() is used, since it
   compiles the regular expressions shared by all instances.
   """

   @staticmethod
   def StaticInit(input_api):
     """Compiles the class-level regular expressions.

     Args:
       input_api: InputAPI, as in presubmit scripts. Only its |re| member is
         used.
     """
     # Despite its name, this matches a double-quoted string literal with
     # backslash escapes; such literals are dropped from a line before the
     # line is searched.
     _CopyrightsScanner._c_comment_re = \
       input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
     # '\xc2\xa9' is the UTF-8 encoding of the copyright sign.
     _CopyrightsScanner._copyright_indicator = \
       r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
     # Group 1 captures the text following the indicator up to the line end.
     _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
       r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
       r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
     # Words that, right after the indicator, show that the line merely
     # talks about copyrights (e.g. "copyright notice").
     _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
       r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

   def __init__(self, input_api):
     """Creates a scanner for a single file.

     Args:
       input_api: InputAPI, as in presubmit scripts. Only its |re| member is
         used.
     """
     self.max_line_numbers_proximity = 3
     # The initial values are far apart from each other and from any real
     # line number, so that they never count as "close".
     self.last_a_item_line_number = -200
     self.last_b_item_line_number = -100
     self.re = input_api.re

   def _CloseLineNumbers(self, a, b):
     """Tells whether line |a| is at most a few lines after line |b|."""
     return 0 <= a - b <= self.max_line_numbers_proximity

   def MatchLine(self, line_number, line):
     """Looks for a copyright statement in a single line.

     Lines have to be passed in order, as the scanner remembers where the
     list items '(a)' and '(b)' were last seen in C++ comments.

     Args:
       line_number: The number of the line within the file.
       line: The text of the line.
     Returns:
       The prettified authorship string, or None if the line holds no
       copyright statement.
     """
     if '"' in line:
       line = _CopyrightsScanner._c_comment_re.sub('', line)
     upcase_line = line.upper()
     # Record '(a)' and '(b)' last occurrences in C++ comments.
     # This is to filter out '(c)' used as a list item inside C++ comments.
     # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
     cpp_comment_idx = upcase_line.find('//')
     if cpp_comment_idx != -1:
       if upcase_line.find('(A)') > cpp_comment_idx:
         self.last_a_item_line_number = line_number
       if upcase_line.find('(B)') > cpp_comment_idx:
         self.last_b_item_line_number = line_number
     # Fast bailout, uses the same patterns as _copyright_indicator regexp.
     if 'COPYRIGHT' not in upcase_line and 'COPR.' not in upcase_line \
         and '\xc2\xa9' not in upcase_line:
       c_item_index = upcase_line.find('(C)')
       if c_item_index == -1:
         return None
       # '(c)' closely following '(b)', which closely follows '(a)', is a
       # list item.
       if c_item_index > cpp_comment_idx and \
           self._CloseLineNumbers(line_number,
                                  self.last_b_item_line_number) and \
           self._CloseLineNumbers(self.last_b_item_line_number,
                                  self.last_a_item_line_number):
         return None
     m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
     if not m or \
         _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
       return None
     copyr = m.group(0)
     # Prettify the authorship string. First drop the trailing comma or
     # period and whitespace. The pattern used to end with a stray '/' after
     # '$', which prevented it from ever matching.
     copyr = self.re.sub(r'[,.]?\s*$', '', copyr)
     copyr = self.re.sub(
       _CopyrightsScanner._copyright_indicator, '', copyr,
       flags=self.re.IGNORECASE)
     copyr = self.re.sub(r'^\s+', '', copyr)
     copyr = self.re.sub(r'\s{2,}', ' ', copyr)
     copyr = self.re.sub(r'\\@', '@', copyr)
     return copyr


 def FindCopyrights(input_api, root_dir, files_to_scan):
   """Collects the copyright statements of every given file.

   Args:
     input_api: InputAPI, as in presubmit scripts.
     root_dir: The root directory, to which all other paths are relative.
     files_to_scan: The list of file names to scan.
   Returns:
     A list with one entry per scanned file, in the order of |files_to_scan|.
     Each entry is a list of strings: the copyrights found in the file, or
     the single marker _GeneratedFilesDetector.GENERATED_FILE for a generated
     file, or the single marker _GeneratedFilesDetector.NO_COPYRIGHT when
     no copyright was found.
   """
   # Only this many leading lines are checked for "generated file" markers.
   header_lines_limit = 25
   detector = _GeneratedFilesDetector(input_api)
   _CopyrightsScanner.StaticInit(input_api)
   result = []
   for file_name in files_to_scan:
     # A fresh scanner per file, since it keeps per-file line state.
     scanner = _CopyrightsScanner(input_api)
     full_path = input_api.os_path.join(root_dir, file_name)
     lines = input_api.ReadFile(full_path, 'r').split('\n')
     found = []
     for index, text in enumerate(lines):
       statement = scanner.MatchLine(index + 1, text)
       if statement:
         found.append(statement)
     # The "generated" verdict takes precedence over any copyrights found.
     if detector.IsGeneratedFile('\n'.join(lines[:header_lines_limit])):
       entry = [_GeneratedFilesDetector.GENERATED_FILE]
     elif found:
       entry = found
     else:
       entry = [_GeneratedFilesDetector.NO_COPYRIGHT]
     result.append(entry)
   return result


 def FindCopyrightViolations(input_api, root_dir, files_to_scan):
   """Looks for files that do not belong exclusively to the Chromium Authors.

   Generated files and files without any copyright are not reported.

   Args:
     input_api: InputAPI, as in presubmit scripts.
     root_dir: The root directory, to which all other paths are relative.
     files_to_scan: The list of file names to scan.
   Returns:
     The list of file names that contain non-Chromium copyrights.
   """
   copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
   allowed_copyrights_re = input_api.re.compile(
     r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
     r'All rights reserved.*)$')
   skipped_markers = (_GeneratedFilesDetector.GENERATED_FILE,
                      _GeneratedFilesDetector.NO_COPYRIGHT)
   offending_files = []
   # The built-in zip() is used instead of itertools.izip(), which does not
   # exist in Python 3; both pair each file with its copyrights.
   for f, cs in zip(files_to_scan, copyrights):
     if cs[0] in skipped_markers:
       continue
     # A single foreign copyright is enough to report the file.
     if not all(allowed_copyrights_re.match(c) for c in cs):
       offending_files.append(input_api.os_path.normpath(f))
   return offending_files
	# Copyright 2014 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Utilities for scanning source files to determine code authorship.
	"""

	import itertools


	def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
	  """Searches for source code files, similar to the UNIX utility find(1).

	  Only files whose extension marks them as source code are kept, and
	  everything located inside an excluded directory is left out.

	  Args:
	    input_api: InputAPI, as in presubmit scripts. Only its |re|, |os_path|
	      and |os_walk| members are used here.
	    root_dir: The root directory, to which all other paths are relative.
	    start_paths_list: The list of paths to start search from. Each path can
	      be a file or a directory.
	    excluded_dirs_list: The list of directories to skip, written with '/'
	      separators. An entry is matched at any depth below |root_dir|.
	  Returns:
	    The list of source code files found, relative to |root_dir|.
	  """
	  # Directory names are matched as whole path components, hence the
	  # surrounding separators.
	  excluded_components = ['/' + d + '/' for d in excluded_dirs_list]

	  def IsInExcludedDir(rel_path):
	    # The leading separator lets an excluded directory located directly
	    # under |root_dir| match: 'skip/a.c' alone never contains '/skip/'.
	    anchored = '/' + rel_path
	    return any(item in anchored for item in excluded_components)

	  def IsExcludedDir(rel_dir):
	    # A directory path has no trailing separator, so add one to make the
	    # directory itself (and not only its contents) match.
	    return IsInExcludedDir(rel_dir + '/')

	  source_file_re = input_api.re.compile(
	    r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
	    r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
	    r'|tex|mli?)$')
	  path_join = input_api.os_path.join
	  # Number of leading characters to drop from a joined path to make it
	  # relative to |root_dir|. join() does not duplicate a trailing separator
	  # of |root_dir|, and an empty |root_dir| adds no prefix at all.
	  prefix_len = len(root_dir.rstrip('/')) + 1 if root_dir else 0

	  files = []
	  for path in start_paths_list:
	    full_path = path_join(root_dir, path)
	    if input_api.os_path.isfile(full_path):
	      # Explicitly listed files obey the exclusions too.
	      if source_file_re.search(path) and not IsInExcludedDir(path):
	        files.append(path)
	      continue
	    for dirpath, dirnames, filenames in input_api.os_walk(full_path):
	      # Prune excluded subdirs in place, so that the walk never descends
	      # into them.
	      dirnames[:] = [
	        name for name in dirnames
	        if not IsExcludedDir(path_join(dirpath, name)[prefix_len:])]
	      for filename in filenames:
	        filepath = path_join(dirpath, filename)[prefix_len:]
	        if source_file_re.search(filepath) and \
	            not IsInExcludedDir(filepath):
	          files.append(filepath)
	  return files


	class _GeneratedFilesDetector(object):
	  """Decides from a file header whether the file was machine-generated."""

	  # Markers reported in place of a real list of copyrights.
	  GENERATED_FILE = 'GENERATED FILE'
	  NO_COPYRIGHT = '*No copyright*'

	  def __init__(self, input_api):
	    """Compiles the regular expressions used by IsGeneratedFile().

	    Args:
	      input_api: InputAPI, as in presubmit scripts. Only its |re| member is
	        used.
	    """
	    # Python triple-quoted strings are cut out of the header before it is
	    # searched for markers; an unterminated one is cut up to a line end.
	    self.python_multiline_string_double_re = input_api.re.compile(
	      r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
	    self.python_multiline_string_single_re = input_api.re.compile(
	      r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
	    # Every fragment is a raw string, so that the regexp escapes are never
	    # treated as (invalid) string escapes.
	    self.automatically_generated_re = input_api.re.compile(
	      r'(All changes made in this file will be lost'
	      r'|DO NOT (EDIT|delete this file)'
	      r'|Generated (at|automatically|data)'
	      r'|Automatically generated'
	      r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

	  def IsGeneratedFile(self, header):
	    """Tells whether |header| carries a "generated file" marker.

	    Args:
	      header: The leading lines of a file, as a single string.
	    Returns:
	      True if a marker is found outside of Python multiline strings,
	      False otherwise.
	    """
	    header = header.upper()
	    if '"""' in header:
	      header = self.python_multiline_string_double_re.sub('', header)
	    if "'''" in header:
	      header = self.python_multiline_string_single_re.sub('', header)
	    # First do simple strings lookup to save time.
	    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
	      return True
	    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
	        'GENERATED' in header:
	      # Hand back a real boolean on every path, not a match object or None.
	      return bool(self.automatically_generated_re.search(header))
	    return False


	class _CopyrightsScanner(object):
	  """Extracts copyright statements from a file, one line at a time.

	  StaticInit() has to be called once before MatchLine() is used, since it
	  compiles the regular expressions shared by all instances.
	  """

	  @staticmethod
	  def StaticInit(input_api):
	    """Compiles the class-level regular expressions.

	    Args:
	      input_api: InputAPI, as in presubmit scripts. Only its |re| member is
	        used.
	    """
	    # Despite its name, this matches a double-quoted string literal with
	    # backslash escapes; such literals are dropped from a line before the
	    # line is searched.
	    _CopyrightsScanner._c_comment_re = \
	      input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
	    # '\xc2\xa9' is the UTF-8 encoding of the copyright sign.
	    _CopyrightsScanner._copyright_indicator = \
	      r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
	    # Group 1 captures the text following the indicator up to the line end.
	    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
	      r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
	      r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
	    # Words that, right after the indicator, show that the line merely
	    # talks about copyrights (e.g. "copyright notice").
	    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
	      r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

	  def __init__(self, input_api):
	    """Creates a scanner for a single file.

	    Args:
	      input_api: InputAPI, as in presubmit scripts. Only its |re| member is
	        used.
	    """
	    self.max_line_numbers_proximity = 3
	    # The initial values are far apart from each other and from any real
	    # line number, so that they never count as "close".
	    self.last_a_item_line_number = -200
	    self.last_b_item_line_number = -100
	    self.re = input_api.re

	  def _CloseLineNumbers(self, a, b):
	    """Tells whether line |a| is at most a few lines after line |b|."""
	    return 0 <= a - b <= self.max_line_numbers_proximity

	  def MatchLine(self, line_number, line):
	    """Looks for a copyright statement in a single line.

	    Lines have to be passed in order, as the scanner remembers where the
	    list items '(a)' and '(b)' were last seen in C++ comments.

	    Args:
	      line_number: The number of the line within the file.
	      line: The text of the line.
	    Returns:
	      The prettified authorship string, or None if the line holds no
	      copyright statement.
	    """
	    if '"' in line:
	      line = _CopyrightsScanner._c_comment_re.sub('', line)
	    upcase_line = line.upper()
	    # Record '(a)' and '(b)' last occurrences in C++ comments.
	    # This is to filter out '(c)' used as a list item inside C++ comments.
	    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
	    cpp_comment_idx = upcase_line.find('//')
	    if cpp_comment_idx != -1:
	      if upcase_line.find('(A)') > cpp_comment_idx:
	        self.last_a_item_line_number = line_number
	      if upcase_line.find('(B)') > cpp_comment_idx:
	        self.last_b_item_line_number = line_number
	    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
	    if 'COPYRIGHT' not in upcase_line and 'COPR.' not in upcase_line \
	        and '\xc2\xa9' not in upcase_line:
	      c_item_index = upcase_line.find('(C)')
	      if c_item_index == -1:
	        return None
	      # '(c)' closely following '(b)', which closely follows '(a)', is a
	      # list item.
	      if c_item_index > cpp_comment_idx and \
	          self._CloseLineNumbers(line_number,
	                                 self.last_b_item_line_number) and \
	          self._CloseLineNumbers(self.last_b_item_line_number,
	                                 self.last_a_item_line_number):
	        return None
	    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
	    if not m or \
	        _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
	      return None
	    copyr = m.group(0)
	    # Prettify the authorship string. First drop the trailing comma or
	    # period and whitespace. The pattern used to end with a stray '/' after
	    # '$', which prevented it from ever matching.
	    copyr = self.re.sub(r'[,.]?\s*$', '', copyr)
	    copyr = self.re.sub(
	      _CopyrightsScanner._copyright_indicator, '', copyr,
	      flags=self.re.IGNORECASE)
	    copyr = self.re.sub(r'^\s+', '', copyr)
	    copyr = self.re.sub(r'\s{2,}', ' ', copyr)
	    copyr = self.re.sub(r'\\@', '@', copyr)
	    return copyr


	def FindCopyrights(input_api, root_dir, files_to_scan):
	  """Collects the copyright statements of every given file.

	  Args:
	    input_api: InputAPI, as in presubmit scripts.
	    root_dir: The root directory, to which all other paths are relative.
	    files_to_scan: The list of file names to scan.
	  Returns:
	    A list with one entry per scanned file, in the order of |files_to_scan|.
	    Each entry is a list of strings: the copyrights found in the file, or
	    the single marker _GeneratedFilesDetector.GENERATED_FILE for a generated
	    file, or the single marker _GeneratedFilesDetector.NO_COPYRIGHT when
	    no copyright was found.
	  """
	  # Only this many leading lines are checked for "generated file" markers.
	  header_lines_limit = 25
	  detector = _GeneratedFilesDetector(input_api)
	  _CopyrightsScanner.StaticInit(input_api)
	  result = []
	  for file_name in files_to_scan:
	    # A fresh scanner per file, since it keeps per-file line state.
	    scanner = _CopyrightsScanner(input_api)
	    full_path = input_api.os_path.join(root_dir, file_name)
	    lines = input_api.ReadFile(full_path, 'r').split('\n')
	    found = []
	    for index, text in enumerate(lines):
	      statement = scanner.MatchLine(index + 1, text)
	      if statement:
	        found.append(statement)
	    # The "generated" verdict takes precedence over any copyrights found.
	    if detector.IsGeneratedFile('\n'.join(lines[:header_lines_limit])):
	      entry = [_GeneratedFilesDetector.GENERATED_FILE]
	    elif found:
	      entry = found
	    else:
	      entry = [_GeneratedFilesDetector.NO_COPYRIGHT]
	    result.append(entry)
	  return result


	def FindCopyrightViolations(input_api, root_dir, files_to_scan):
	  """Looks for files that do not belong exclusively to the Chromium Authors.

	  Generated files and files without any copyright are not reported.

	  Args:
	    input_api: InputAPI, as in presubmit scripts.
	    root_dir: The root directory, to which all other paths are relative.
	    files_to_scan: The list of file names to scan.
	  Returns:
	    The list of file names that contain non-Chromium copyrights.
	  """
	  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
	  allowed_copyrights_re = input_api.re.compile(
	    r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
	    r'All rights reserved.*)$')
	  skipped_markers = (_GeneratedFilesDetector.GENERATED_FILE,
	                     _GeneratedFilesDetector.NO_COPYRIGHT)
	  offending_files = []
	  # The built-in zip() is used instead of itertools.izip(), which does not
	  # exist in Python 3; both pair each file with its copyrights.
	  for f, cs in zip(files_to_scan, copyrights):
	    if cs[0] in skipped_markers:
	      continue
	    # A single foreign copyright is enough to report the file.
	    if not all(allowed_copyrights_re.match(c) for c in cs):
	      offending_files.append(input_api.os_path.normpath(f))
	  return offending_files