Simple script to check for inclusive language. (#897)

This CL adds a simple script to check for inclusive language. All files in the repo are scanned and any matches are output.

commit: 88f78401e9af26f1249944b942ddf5dd706572e8 [log] [tgz]
author: dan sinclair <dsinclair@google.com> Thu Aug 13 09:48:28 2020 -0400
committer: GitHub <noreply@github.com> Thu Aug 13 09:48:28 2020 -0400
tree: 8c1346ef55a3ef92aa57cba44165bcd014357109
parent: e5717280728970d6317ed896d1ae14acf123ebfc [diff]
diff --git a/.gitignore b/.gitignore
index 55f5cce..d5f3eb0 100644
--- a/.gitignore
+++ b/.gitignore

@@ -19,6 +19,8 @@
 third_party/vulkan-validationlayers/
 .vs
 
+*.pyc
+
 # Vim swap files
 [._]*.s[a-w][a-z]
 

diff --git a/tools/check_language.py b/tools/check_language.py
new file mode 100755
index 0000000..b7ca528
--- /dev/null
+++ b/tools/check_language.py

@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+
+# Copyright 2020 The Amber Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#	http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Script to check files for inclusive language. The script will scan all files
+and flag non-inclusive terminology which is identified.
+
+Usage: run the script from a folder and it will recursively scan every file
+under that folder.
+"""
+
+import fnmatch
+import os
+import re
+import sys
+
+# Patterns for the terminology this script flags. Every pattern carries its
+# own (?i) flag, so matching is case-insensitive. The order of the entries is
+# the order in which matches are reported for a file.
+REGEXES = [
+	# List and hat colour terminology, written together or joined by - or _.
+	r"(?i)black[-_]?list",
+	r"(?i)white[-_]?list",
+	r"(?i)gr[ea]y[-_]?list",
+	r"(?i)(first class citizen)",
+	r"(?i)black[-_]?hat",
+	r"(?i)white[-_]?hat",
+	r"(?i)gr[ea]y[-_]?hat",
+	# Matched anywhere, including inside longer identifiers.
+	r"(?i)master",
+	r"(?i)slave",
+	# Gendered words, matched as whole words only.
+	r"(?i)\bhim\b",
+	r"(?i)\bhis\b",
+	r"(?i)\bshe\b",
+	r"(?i)\bher\b",
+	r"(?i)\bhers\b",
+	r"(?i)\bman\b",
+	r"(?i)\bwoman\b",
+	# "he" has to be delimited by whitespace or by the start/end of the
+	# scanned text.
+	r"(?i)\she\s",
+	r"(?i)\she$",
+	r"(?i)^he\s",
+	r"(?i)^he$",
+	# Contractions of "he". The character class accepts the ASCII apostrophe
+	# and U+2019; it must not contain "|", which is a literal inside [...].
+	r"(?i)\she['\u2019]d\s",
+	r"(?i)\she['\u2019]d$",
+	r"(?i)^he['\u2019]d\s",
+	r"(?i)^he['\u2019]d$",
+	r"(?i)\she['\u2019]s\s",
+	r"(?i)\she['\u2019]s$",
+	r"(?i)^he['\u2019]s\s",
+	r"(?i)^he['\u2019]s$",
+	r"(?i)\she['\u2019]ll\s",
+	r"(?i)\she['\u2019]ll$",
+	r"(?i)^he['\u2019]ll\s",
+	r"(?i)^he['\u2019]ll$",
+	r"(?i)grandfather",
+	# Remaining terms, matched as whole words or whole phrases.
+	r"(?i)\bmitm\b",
+	r"(?i)\bcrazy\b",
+	r"(?i)\binsane\b",
+	r"(?i)\bblind\sto\b",
+	r"(?i)\bflying\sblind\b",
+	r"(?i)\bblind\seye\b",
+	r"(?i)\bcripple\b",
+	r"(?i)\bcrippled\b",
+	r"(?i)\bdumb\b",
+	r"(?i)\bdummy\b",
+	r"(?i)\bparanoid\b",
+	r"(?i)\bsane\b",
+	r"(?i)\bsanity\b",
+	r"(?i)red[-_]?line",
+]
+
+# Patterns for text that contains a flagged term but is acceptable. Matching
+# is case-insensitive because every pattern carries its own (?i) flag.
+SUPPRESSIONS = [
+	# The MS_SLAVE identifier.
+	r"(?i)MS_SLAVE",
+	# "man page", "man_page", "man-page" or "manpage". The hyphen is placed
+	# last in the class so that it is a literal instead of forming a range.
+	r"(?i)man[ _-]?page",
+]
+
+
+# Compile every pattern once, when the module is loaded.
+REGEX_LIST = [re.compile(pattern) for pattern in REGEXES]
+
+SUPPRESSION_LIST = [re.compile(pattern) for pattern in SUPPRESSIONS]
+
+def find(top, filename_glob, skip_glob_list):
+	"""Collects the paths of files under top whose names match filename_glob.
+
+	Directories whose names match any glob in skip_glob_list are pruned from
+	the walk, so nothing beneath them is returned. Files that share this
+	script's own file name are left out as well.
+	"""
+	own_name = os.path.basename(__file__)
+	found = []
+	for path, dirs, files in os.walk(top):
+		# os.walk only descends into what is left in dirs, so rewriting the
+		# list in place prunes the skipped directories.
+		dirs[:] = [
+			name for name in dirs
+			if not any(fnmatch.fnmatch(name, skip) for skip in skip_glob_list)
+		]
+		found.extend(
+			os.path.join(path, name)
+			for name in fnmatch.filter(files, filename_glob)
+			if name != own_name)
+	return found
+
+
+def filtered_descendants(glob):
+	"""Lists the files below the current directory whose names match glob.
+
+	Directories matching one of the skip globs below are not searched.
+	"""
+	skipped_dirs = ['third_party', 'external', 'build*', 'out*',
+			'CompilerIdCXX', '.git']
+	return find('.', glob, skipped_dirs)
+
+def check_match(filename, contents):
+	"""Reports non-inclusive terms found in contents.
+
+	For every pattern in REGEX_LIST, the first occurrence that is not covered
+	by a suppression is printed, prefixed with filename.
+
+	Args:
+		filename: name used as the prefix of each printed message.
+		contents: the text to scan.
+
+	Returns:
+		True if at least one unsuppressed occurrence was printed, else False.
+	"""
+	# Spans of all suppressed text, computed once per call. A hit that lies
+	# entirely inside one of these spans is ignored, no matter what the
+	# suppression adds in front of or behind the flagged term (for example
+	# the "MS_" in MS_SLAVE).
+	suppressed_spans = [
+		found.span()
+		for supp in SUPPRESSION_LIST
+		for found in supp.finditer(contents)
+	]
+
+	ret = False
+	for reg in REGEX_LIST:
+		# Walk every occurrence: a suppressed first hit must not hide a later
+		# occurrence that is a real problem.
+		for match in reg.finditer(contents):
+			if any(start <= match.start() and match.end() <= end
+					for start, end in suppressed_spans):
+				continue
+
+			print("{}: found non-inclusive language: {}".format(
+					filename, match.group(0)))
+			ret = True
+			# One report per pattern and file is enough.
+			break
+
+	return ret
+
+
+def alert_if_lang_matches(glob, verbose=False):
+	"""Scans glob-matching files for non-inclusive language.
+
+	Every file returned by filtered_descendants(glob) is read as UTF-8 and
+	handed to check_match, which prints the terms it finds.
+
+	Args:
+		glob: file name pattern selecting the files to scan.
+		verbose: when True, print the name of every file that is skipped.
+
+	Returns:
+		The number of files for which check_match reported a match.
+	"""
+	printed_count = 0
+	for path in filtered_descendants(glob):
+		try:
+			with open(path, 'r', encoding='utf8') as handle:
+				if check_match(path, handle.read()):
+					printed_count += 1
+		except (OSError, UnicodeError):
+			# Files that cannot be read or are not valid UTF-8 (binaries, for
+			# instance) are skipped on purpose. Any other exception is a real
+			# error and is allowed to propagate instead of being hidden.
+			if verbose:
+				print("skipping {}".format(path))
+
+	return printed_count
+
+
+def main():
+	"""Scans all files below the current directory.
+
+	Exits with a non-zero status when at least one file contains
+	non-inclusive language and with zero otherwise.
+	"""
+	count = sum(alert_if_lang_matches(glob) for glob in ['*'])
+	sys.exit(count > 0)
+
+if __name__ == '__main__':
+	main()

diff --git a/tools/check_language_test.py b/tools/check_language_test.py
new file mode 100755
index 0000000..8f20791
--- /dev/null
+++ b/tools/check_language_test.py

@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+# Copyright 2020 The Amber Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for check_language.py."""
+
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import check_language
+
+class TestCheckLanguage(unittest.TestCase):
+	"""Tests for check_language.check_match."""
+
+	def testMatches(self):
+		"""Every flagged term is reported when it appears between other words."""
+		tests = ["blacklist", "black-list", "black_list", "whitelist",
+			"white-list", "white_list", "greylist", "grey-list", "grey_list",
+			"graylist", "gray-list", "gray_list", "first class citizen",
+			"blackhat", "black-hat", "black_hat", "whitehat", "white-hat",
+			"white_hat", "greyhat", "grey-hat", "grey_hat", "grayhat",
+			"gray-hat", "gray_hat", "master", "slave", "him", "his", "she",
+			"her", "hers", "man", "woman", "he", "he'd", "he's", "he'll",
+			"he\u2019d", "he\u2019s", "he\u2019ll",
+			"grandfather", "mitm", "crazy", "insane", "blind to",
+			"flying blind", "blind eye", "cripple", "crippled", "dumb",
+			"dummy", "paranoid", "sane", "sanity", "redline", "red-line",
+			"red_line"]
+
+		for word in tests:
+			# subTest keeps the loop going after a failure, so a single run
+			# reports every term that is no longer detected.
+			with self.subTest(word=word):
+				self.assertTrue(
+					check_language.check_match(
+						"", "this is a " + word + " attempt"), word)
+
+
+	def testSuppression(self):
+		"""Text covered by a suppression is not reported."""
+		self.assertFalse(check_language.check_match("", "in the man-pages"))
+		self.assertFalse(check_language.check_match("", "the MS_SLAVE test"))
+
+
+	def testMatchStartofFileWhenRequireSpace(self):
+		"""A term at the very start of the text is found without leading space."""
+		self.assertTrue(check_language.check_match("", "he said"))
+
+
+	def testMatchOverNewline(self):
+		"""A phrase whose words are separated by a newline is still found."""
+		self.assertTrue(check_language.check_match("", "flying\nblind"))
+
+
+if __name__ == '__main__':
+	unittest.main()
commit	88f78401e9af26f1249944b942ddf5dd706572e8	[log] [tgz]
author	dan sinclair <dsinclair@google.com>	Thu Aug 13 09:48:28 2020 -0400
committer	GitHub <noreply@github.com>	Thu Aug 13 09:48:28 2020 -0400
tree	8c1346ef55a3ef92aa57cba44165bcd014357109
parent	e5717280728970d6317ed896d1ae14acf123ebfc [diff]