test/scripts/run_cuda_memcheck.py - platform/external/pytorch - Git at Google

 #!/usr/bin/env python3

 """This script runs cuda-memcheck on the specified unit test. Each test case
 is run in its isolated process with a timeout so that:
 1) different test cases won't influence each other, and
 2) in case of hang, the script would still finish in a finite amount of time.
 The output will be written to a log file result.log

 Example usage:
     python run_cuda_memcheck.py ../test_torch.py 600

 Note that running cuda-memcheck could be very slow.
 """

 import argparse
 import asyncio
 import multiprocessing
 import os
 import subprocess
 import sys

 import cuda_memcheck_common as cmc
 import tqdm

 import torch

 # Names of the tests to run; filled in by the discovery step further down.
 ALL_TESTS = []
 # Number of CUDA devices reported by torch; used to spread workers over GPUs.
 GPUS = torch.cuda.device_count()

 # parse arguments
 # NOTE: adjacent string literals are concatenated verbatim, so every piece of
 # a multi-line help text must carry its own separating space.
 parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
 parser.add_argument(
     "filename", help="the python file for a test, such as test_torch.py"
 )
 parser.add_argument(
     "timeout",
     type=int,
     help="kill the test if it does not terminate in a certain amount of seconds",
 )
 parser.add_argument(
     "--strict",
     action="store_true",
     help="Whether to show cublas/cudnn errors. These errors are ignored by default because "
     "cublas/cudnn does not run error-free under cuda-memcheck.",
 )
 parser.add_argument(
     "--nproc",
     type=int,
     default=multiprocessing.cpu_count(),
     help="Number of processes running tests, default to number of cores in the system",
 )
 parser.add_argument(
     "--gpus",
     default="all",
     help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"',
 )
 parser.add_argument(
     "--ci",
     action="store_true",
     help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
     "an error is detected. Also, it will not show tqdm progress bar, but directly print the error "
     "to stdout instead.",
 )
 parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
 parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
 parser.add_argument(
     "--rank", type=int, default=0, help="Which piece this process should pick"
 )
 args = parser.parse_args()


 # Filters that ignores cublas/cudnn errors
 # TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
 def is_ignored_only(output):
     """Tell whether a cuda-memcheck report contains nothing but ignorable errors.

     An error counts as ignorable when its joined stack mentions libcublas,
     libcudnn or libcufft.

     Args:
         output: Text handed to ``cmc.parse``.

     Returns:
         True when the number of ignorable errors equals ``report.num_errors``;
         False otherwise, including when the output cannot be parsed.
     """
     try:
         parsed = cmc.parse(output)
     except cmc.ParseError:
         # An unparsable report is never treated as ignorable.
         return False

     ignorable_libraries = ("libcublas", "libcudnn", "libcufft")
     ignorable = 0
     for error in parsed.errors:
         frames = "".join(error.stack)
         if any(library in frames for library in ignorable_libraries):
             ignorable += 1
     return ignorable == parsed.num_errors


 # Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
 os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"

 # Discover tests:
 # To get a list of tests, run:
 # pytest --setup-only test/test_torch.py
 # and then parse the output
 proc = subprocess.Popen(
     ["pytest", "--setup-only", args.filename],
     stdout=subprocess.PIPE,
     stderr=subprocess.PIPE,
 )
 stdout, stderr = proc.communicate()
 lines = stdout.decode().strip().splitlines()
 for line in lines:
     if "(fixtures used:" in line:
         line = line.strip().split()[0]
         line = line[line.find("::") + 2 :]
         line = line.replace("::", ".")
         ALL_TESTS.append(line)


 # Do a simple filtering:
 # if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
 def is_cpu_only(name):
     """Return True when ``name`` mentions "cpu" but not "cuda" (case-insensitive)."""
     lowered = name.lower()
     if "cuda" in lowered:
         return False
     return "cpu" in lowered


 # Drop the tests whose names mark them as CPU-only, then order the rest so
 # that every shard sees the same sequence.
 ALL_TESTS = sorted(test for test in ALL_TESTS if not is_cpu_only(test))

 # Split all tests into args.split chunks (rounding the chunk size up) and
 # keep only the chunk selected by args.rank.
 chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
 first = chunk_size * args.rank
 ALL_TESTS = ALL_TESTS[first : first + chunk_size]

 # Run tests:
 # Running cuda-memcheck on PyTorch unit tests is very slow, so the tests are run in parallel.
 # A number of coroutines are created; each of them creates subprocesses and awaits their completion.
 # The number of coroutines can be specified by the user and defaults to the number of CPUs in
 # the machine.
 # The subprocesses are balanced across the GPUs of the system by assigning one device per
 # coroutine, or as specified by the user.
 progress = 0  # index into ALL_TESTS of the next test to hand out
 if args.ci:
     # In CI the results go straight to stdout and no progress bar is drawn.
     class ProgressbarStub:
         """Stand-in for the tqdm progress bar whose update() does nothing."""

         def update(self, *args):
             return

     logfile = sys.stdout
     progressbar = ProgressbarStub()
 else:
     logfile = open("result.log", "w")
     progressbar = tqdm.tqdm(total=len(ALL_TESTS))


 async def run1(coroutine_id):
     """Worker coroutine: run tests under cuda-memcheck until none are left.

     The worker repeatedly claims the next entry of ``ALL_TESTS`` through the
     shared ``progress`` counter, runs it in a subprocess with
     ``CUDA_VISIBLE_DEVICES`` set to this worker's GPU assignment, and writes a
     ``Success:``, ``Fail:``, ``Ignored:`` or ``Timeout:`` line to ``logfile``.

     Args:
         coroutine_id: Index of this worker; ``main`` passes the values of
             ``range(args.nproc)``. It selects the GPU assignment.

     Raises:
         ValueError: If ``--gpus`` is not "all" and does not hold exactly one
             ":"-separated entry per worker.
         SystemExit: With ``--ci``, on the first displayed failure, or on a
             timeout unless ``--nohang`` is given.
     """
     global progress

     if args.gpus == "all":
         gpuid = coroutine_id % GPUS
     else:
         gpu_assignments = args.gpus.split(":")
         # Explicit check instead of assert: asserts vanish under "python -O".
         if args.nproc != len(gpu_assignments):
             raise ValueError(
                 "Please specify GPU assignment for each process, separated by :"
             )
         gpuid = gpu_assignments[coroutine_id]

     # The GPU assignment is fixed for the lifetime of the worker.
     env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpuid))

     while progress < len(ALL_TESTS):
         # There is no await between reading and incrementing the counter, so
         # two workers never claim the same test.
         test = ALL_TESTS[progress]
         progress += 1
         # Use an argument list rather than a shell string: the file and test
         # names need no quoting, and kill() below signals cuda-memcheck
         # itself instead of an intermediate shell.
         proc = await asyncio.create_subprocess_exec(
             "cuda-memcheck",
             "--error-exitcode",
             "1",
             "python",
             args.filename,
             test,
             stdout=asyncio.subprocess.PIPE,
             stderr=asyncio.subprocess.PIPE,
             env=env,
         )
         try:
             stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
         except asyncio.TimeoutError:
             print("Timeout:", test, file=logfile)
             proc.kill()
             if args.ci and not args.nohang:
                 sys.exit("Hang detected on cuda-memcheck")
         else:
             if proc.returncode == 0:
                 print("Success:", test, file=logfile)
             else:
                 # Test output is arbitrary bytes; never let an undecodable
                 # byte abort the whole run.
                 stdout = stdout.decode(errors="replace")
                 stderr = stderr.decode(errors="replace")
                 should_display = args.strict or not is_ignored_only(stdout)
                 if should_display:
                     print("Fail:", test, file=logfile)
                     print(stdout, file=logfile)
                     print(stderr, file=logfile)
                     if args.ci:
                         sys.exit("Failure detected on cuda-memcheck")
                 else:
                     print("Ignored:", test, file=logfile)
         progressbar.update(1)


 async def main():
     """Start ``args.nproc`` ``run1`` workers and wait for each of them in turn."""
     workers = []
     for worker_id in range(args.nproc):
         workers.append(asyncio.ensure_future(run1(worker_id)))
     for worker in workers:
         await worker


 if __name__ == "__main__":
     # asyncio.run() creates a fresh event loop, runs main() to completion and
     # closes the loop afterwards. Calling asyncio.get_event_loop() without a
     # running loop is deprecated in recent Python versions.
     asyncio.run(main())
	#!/usr/bin/env python3

	"""This script runs cuda-memcheck on the specified unit test. Each test case
	is run in its isolated process with a timeout so that:
	1) different test cases won't influence each other, and
	2) in case of hang, the script would still finish in a finite amount of time.
	The output will be written to a log file result.log

	Example usage:
	python run_cuda_memcheck.py ../test_torch.py 600

	Note that running cuda-memcheck could be very slow.
	"""

	import argparse
	import asyncio
	import multiprocessing
	import os
	import subprocess
	import sys

	import cuda_memcheck_common as cmc
	import tqdm

	import torch

	# Names of the tests to run; filled in by the discovery step further down.
	ALL_TESTS = []
	# Number of CUDA devices reported by torch; used to spread workers over GPUs.
	GPUS = torch.cuda.device_count()

	# parse arguments
	# NOTE: adjacent string literals are concatenated verbatim, so every piece of
	# a multi-line help text must carry its own separating space.
	parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
	parser.add_argument(
	    "filename", help="the python file for a test, such as test_torch.py"
	)
	parser.add_argument(
	    "timeout",
	    type=int,
	    help="kill the test if it does not terminate in a certain amount of seconds",
	)
	parser.add_argument(
	    "--strict",
	    action="store_true",
	    help="Whether to show cublas/cudnn errors. These errors are ignored by default because "
	    "cublas/cudnn does not run error-free under cuda-memcheck.",
	)
	parser.add_argument(
	    "--nproc",
	    type=int,
	    default=multiprocessing.cpu_count(),
	    help="Number of processes running tests, default to number of cores in the system",
	)
	parser.add_argument(
	    "--gpus",
	    default="all",
	    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"',
	)
	parser.add_argument(
	    "--ci",
	    action="store_true",
	    help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
	    "an error is detected. Also, it will not show tqdm progress bar, but directly print the error "
	    "to stdout instead.",
	)
	parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
	parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
	parser.add_argument(
	    "--rank", type=int, default=0, help="Which piece this process should pick"
	)
	args = parser.parse_args()


	# Filters that ignores cublas/cudnn errors
	# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
	def is_ignored_only(output):
	    """Tell whether a cuda-memcheck report contains nothing but ignorable errors.

	    An error counts as ignorable when its joined stack mentions libcublas,
	    libcudnn or libcufft.

	    Args:
	        output: Text handed to ``cmc.parse``.

	    Returns:
	        True when the number of ignorable errors equals ``report.num_errors``;
	        False otherwise, including when the output cannot be parsed.
	    """
	    try:
	        report = cmc.parse(output)
	    except cmc.ParseError:
	        # in case the simple parser fails parsing the output of cuda memcheck
	        # then this error is never ignored.
	        return False

	    ignorable_libraries = ("libcublas", "libcudnn", "libcufft")
	    count_ignored_errors = 0
	    for e in report.errors:
	        # Join the stack once instead of once per library name.
	        frames = "".join(e.stack)
	        if any(library in frames for library in ignorable_libraries):
	            count_ignored_errors += 1
	    return count_ignored_errors == report.num_errors


	# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
	os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"

	# Discover tests:
	# To get a list of tests, run:
	# pytest --setup-only test/test_torch.py
	# and then parse the output
	proc = subprocess.Popen(
	["pytest", "--setup-only", args.filename],
	stdout=subprocess.PIPE,
	stderr=subprocess.PIPE,
	)
	stdout, stderr = proc.communicate()
	lines = stdout.decode().strip().splitlines()
	for line in lines:
	if "(fixtures used:" in line:
	line = line.strip().split()[0]
	line = line[line.find("::") + 2 :]
	line = line.replace("::", ".")
	ALL_TESTS.append(line)


	# Do a simple filtering:
	# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
	def is_cpu_only(name):
	    """Return True when ``name`` mentions "cpu" but not "cuda" (case-insensitive)."""
	    name = name.lower()
	    return ("cpu" in name) and "cuda" not in name


	# Drop the tests whose names mark them as CPU-only.
	ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]

	# Split all tests into args.split chunks (rounding the chunk size up) and
	# keep only the chunk selected by args.rank. Sorting first gives every
	# shard the same ordering.
	ALL_TESTS.sort()
	chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
	start = chunk_size * args.rank
	end = chunk_size * (args.rank + 1)
	ALL_TESTS = ALL_TESTS[start:end]

	# Run tests:
	# Running cuda-memcheck on PyTorch unit tests is very slow, so the tests are run in parallel.
	# A number of coroutines are created; each of them creates subprocesses and awaits their completion.
	# The number of coroutines can be specified by the user and defaults to the number of CPUs in
	# the machine.
	# The subprocesses are balanced across the GPUs of the system by assigning one device per
	# coroutine, or as specified by the user.
	progress = 0  # index into ALL_TESTS of the next test to hand out
	if not args.ci:
	    logfile = open("result.log", "w")
	    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
	else:
	    logfile = sys.stdout

	    # create a fake progress bar that does not display anything
	    class ProgressbarStub:
	        def update(self, *args):
	            return

	    progressbar = ProgressbarStub()


	async def run1(coroutine_id):
	    """Worker coroutine: run tests under cuda-memcheck until none are left.

	    The worker repeatedly claims the next entry of ``ALL_TESTS`` through the
	    shared ``progress`` counter, runs it in a subprocess with
	    ``CUDA_VISIBLE_DEVICES`` set to this worker's GPU assignment, and writes a
	    ``Success:``, ``Fail:``, ``Ignored:`` or ``Timeout:`` line to ``logfile``.

	    Args:
	        coroutine_id: Index of this worker; ``main`` passes the values of
	            ``range(args.nproc)``. It selects the GPU assignment.

	    Raises:
	        ValueError: If ``--gpus`` is not "all" and does not hold exactly one
	            ":"-separated entry per worker.
	        SystemExit: With ``--ci``, on the first displayed failure, or on a
	            timeout unless ``--nohang`` is given.
	    """
	    global progress

	    if args.gpus == "all":
	        gpuid = coroutine_id % GPUS
	    else:
	        gpu_assignments = args.gpus.split(":")
	        # Explicit check instead of assert: asserts vanish under "python -O".
	        if args.nproc != len(gpu_assignments):
	            raise ValueError(
	                "Please specify GPU assignment for each process, separated by :"
	            )
	        gpuid = gpu_assignments[coroutine_id]

	    # The GPU assignment is fixed for the lifetime of the worker.
	    env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpuid))

	    while progress < len(ALL_TESTS):
	        # There is no await between reading and incrementing the counter, so
	        # two workers never claim the same test.
	        test = ALL_TESTS[progress]
	        progress += 1
	        # Use an argument list rather than a shell string: the file and test
	        # names need no quoting, and kill() below signals cuda-memcheck
	        # itself instead of an intermediate shell.
	        proc = await asyncio.create_subprocess_exec(
	            "cuda-memcheck",
	            "--error-exitcode",
	            "1",
	            "python",
	            args.filename,
	            test,
	            stdout=asyncio.subprocess.PIPE,
	            stderr=asyncio.subprocess.PIPE,
	            env=env,
	        )
	        try:
	            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
	        except asyncio.TimeoutError:
	            print("Timeout:", test, file=logfile)
	            proc.kill()
	            if args.ci and not args.nohang:
	                sys.exit("Hang detected on cuda-memcheck")
	        else:
	            if proc.returncode == 0:
	                print("Success:", test, file=logfile)
	            else:
	                # Test output is arbitrary bytes; never let an undecodable
	                # byte abort the whole run.
	                stdout = stdout.decode(errors="replace")
	                stderr = stderr.decode(errors="replace")
	                should_display = args.strict or not is_ignored_only(stdout)
	                if should_display:
	                    print("Fail:", test, file=logfile)
	                    print(stdout, file=logfile)
	                    print(stderr, file=logfile)
	                    if args.ci:
	                        sys.exit("Failure detected on cuda-memcheck")
	                else:
	                    print("Ignored:", test, file=logfile)
	        progressbar.update(1)


	async def main():
	    """Start ``args.nproc`` ``run1`` workers and wait for each of them in turn."""
	    tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)]
	    for t in tasks:
	        await t


	if __name__ == "__main__":
	    # asyncio.run() creates a fresh event loop, runs main() to completion and
	    # closes the loop afterwards. Calling asyncio.get_event_loop() without a
	    # running loop is deprecated in recent Python versions.
	    asyncio.run(main())