test/scripts/run_cuda_memcheck.py - platform/external/pytorch - Git at Google

 #!/usr/bin/env python

 """This script runs cuda-memcheck on the specified unit test. Each test case
 is run in its isolated process with a timeout so that:
 1) different test cases won't influence each other, and
 2) in case of hang, the script would still finish in a finite amount of time.
 The output will be written to a log file result.log

 Example usage:
     python run_cuda_memcheck.py ../test_torch.py 600

 Note that running cuda-memcheck could be very slow.
 """

 import asyncio
 import torch
 import multiprocessing
 import argparse
 import subprocess
 import tqdm
 import os
 import sys
 import cuda_memcheck_common as cmc

 # Names of the tests to run; filled in by the discovery step further down.
 ALL_TESTS = []
 # Device count reported by torch; worker ids are mapped onto GPUs modulo this.
 GPUS = torch.cuda.device_count()

 # parse arguments
 # NOTE: adjacent string literals are joined with nothing in between, so every
 # help fragment that continues on the next line must end with a space.
 parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
 parser.add_argument('filename', help="the python file for a test, such as test_torch.py")
 parser.add_argument('timeout', type=int, help='kill the test if it does not terminate in a certain amount of seconds')
 parser.add_argument('--strict', action='store_true',
                     help='Whether to show cublas/cudnn errors. These errors are ignored by default because '
                          'cublas/cudnn does not run error-free under cuda-memcheck.')
 parser.add_argument('--nproc', type=int, default=multiprocessing.cpu_count(),
                     help='Number of processes running tests, default to number of cores in the system')
 parser.add_argument('--gpus', default='all',
                     help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"')
 parser.add_argument('--ci', action='store_true',
                     help='Whether this script is executed in CI. When executed inside a CI, this script fails when '
                          'an error is detected. Also, it will not show tqdm progress bar, but directly print the error '
                          'to stdout instead.')
 parser.add_argument('--nohang', action='store_true', help='Treat timeout as success')
 parser.add_argument('--split', type=int, default=1, help='Split the job into pieces')
 parser.add_argument('--rank', type=int, default=0, help='Which piece this process should pick')
 args = parser.parse_args()

 # Filter that ignores cublas/cudnn/cufft errors
 # TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
 def is_ignored_only(output):
     """Tell whether every error in a cuda-memcheck report may be ignored.

     An error counts as ignorable when the concatenation of its stack entries
     mentions libcublas, libcudnn or libcufft.

     Args:
         output: text captured from a cuda-memcheck run; handed to ``cmc.parse``.

     Returns:
         True when the number of ignorable errors equals ``report.num_errors``.
         False otherwise, and also when ``cmc.parse`` raises ``cmc.ParseError``.
     """
     ignorable_libs = ('libcublas', 'libcudnn', 'libcufft')

     def mentions_ignorable_lib(error):
         # join once and probe the same text for each library name
         stack_text = ''.join(error.stack)
         return any(lib in stack_text for lib in ignorable_libs)

     try:
         report = cmc.parse(output)
     except cmc.ParseError:
         # A report the simple parser cannot read is never treated as ignorable.
         return False
     ignored = sum(1 for error in report.errors if mentions_ignorable_lib(error))
     return ignored == report.num_errors

 # Export PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
 os.environ['PYTORCH_CUDA_MEMCHECK'] = '1'

 # Discover tests:
 # `pytest --setup-only <file>` is run and its stdout is scanned for lines
 # containing "(fixtures used:". The first whitespace-separated token of such a
 # line is taken (presumably a node id of the form path::Class::test), the part
 # up to and including the first '::' is dropped, and the remaining '::' are
 # turned into '.'.
 proc = subprocess.Popen(['pytest', '--setup-only', args.filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 stdout, stderr = proc.communicate()
 for line in stdout.decode().strip().splitlines():
     if '(fixtures used:' not in line:
         continue
     node_id = line.split()[0]
     qualified = node_id[node_id.find('::') + 2:]
     ALL_TESTS.append(qualified.replace('::', '.'))

 # Do a simple filtering:
 # if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
 def is_cpu_only(name):
     """Return True when *name* contains "cpu" but not "cuda", ignoring case."""
     lowered = name.lower()
     if 'cuda' in lowered:
         return False
     return 'cpu' in lowered

 # Keep every test except the ones whose name marks them as CPU-only.
 ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]

 # Split all tests into --split chunks and only run the chunk selected by --rank.
 # A non-positive --split would divide by zero (or give a meaningless chunk
 # size), and a --rank outside [0, split) would silently select no tests or the
 # wrong ones, so both are rejected up front with a usage error.
 if args.split < 1:
     parser.error('--split must be a positive integer')
 if not 0 <= args.rank < args.split:
     parser.error('--rank must satisfy 0 <= rank < split')
 # Sort first so that the same list always yields the same chunks.
 ALL_TESTS.sort()
 chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split  # ceiling division
 start = chunk_size * args.rank
 end = chunk_size * (args.rank + 1)
 ALL_TESTS = ALL_TESTS[start:end]

 # Run tests:
 # Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
 # This is done with coroutines: a number of coroutines are created, each of which creates
 # subprocesses and awaits them. The number of coroutines can be specified by the user and
 # defaults to the number of CPUs in the machine.
 # The subprocesses are balanced across the GPUs of the system by assigning one device per
 # coroutine, or as specified by the user.
 progress = 0  # index of the next entry of ALL_TESTS that has not been picked up yet
 if args.ci:
     # In CI everything is reported straight to stdout and nothing is drawn.
     logfile = sys.stdout

     class ProgressbarStub:
         """Progress bar stand-in whose update() does nothing."""
         def update(*_ignored):
             return None
     progressbar = ProgressbarStub()
 else:
     logfile = open('result.log', 'w')
     progressbar = tqdm.tqdm(total=len(ALL_TESTS))

 async def run1(coroutine_id):
     """Worker coroutine: keep claiming the next test and run it under cuda-memcheck.

     All workers share the module-level ``progress`` counter, the index of the
     next unclaimed entry of ``ALL_TESTS``. There is no ``await`` between
     reading and incrementing it, so two workers never claim the same test.

     Args:
         coroutine_id: index of this worker in ``range(args.nproc)``; it selects
             the value exported as CUDA_VISIBLE_DEVICES to the subprocesses.

     One line per test ("Success:", "Ignored:", "Fail:" or "Timeout:") is
     printed to ``logfile``. With ``--ci`` the script exits on the first
     displayed failure, and on the first timeout unless ``--nohang`` is given.
     """
     global progress

     if args.gpus == 'all':
         gpuid = coroutine_id % GPUS
     else:
         gpu_assignments = args.gpus.split(':')
         assert args.nproc == len(gpu_assignments), 'Please specify GPU assignmnent for each process, separated by :'
         gpuid = gpu_assignments[coroutine_id]

     # Same environment as this script, except for the device selection.
     env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpuid))

     while progress < len(ALL_TESTS):
         test = ALL_TESTS[progress]
         progress += 1
         # Start cuda-memcheck from an argument vector instead of a shell string:
         # the file name and the test name reach the child verbatim even when
         # they contain spaces or shell metacharacters, and kill() below hits
         # cuda-memcheck itself rather than an intermediate shell.
         proc = await asyncio.create_subprocess_exec(
             'cuda-memcheck', '--error-exitcode', '1', 'python', args.filename, test,
             env=env, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
         try:
             stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
         except asyncio.TimeoutError:
             print('Timeout:', test, file=logfile)
             try:
                 proc.kill()
             except ProcessLookupError:
                 # the process finished on its own right after the timeout fired
                 pass
             if args.ci and not args.nohang:
                 sys.exit("Hang detected on cuda-memcheck")
         else:
             if proc.returncode == 0:
                 print('Success:', test, file=logfile)
             else:
                 stdout = stdout.decode()
                 stderr = stderr.decode()
                 # without --strict, a report made only of ignorable errors is not shown
                 should_display = args.strict or not is_ignored_only(stdout)
                 if should_display:
                     print('Fail:', test, file=logfile)
                     print(stdout, file=logfile)
                     print(stderr, file=logfile)
                     if args.ci:
                         sys.exit("Failure detected on cuda-memcheck")
                 else:
                     print('Ignored:', test, file=logfile)
         del proc
         progressbar.update(1)

 async def main():
     """Start ``args.nproc`` workers and wait for each of them in turn."""
     workers = []
     for worker_id in range(args.nproc):
         workers.append(asyncio.ensure_future(run1(worker_id)))
     for worker in workers:
         await worker

 if __name__ == '__main__':
     # asyncio.run() creates a fresh event loop, runs main() to completion and
     # closes the loop afterwards. Calling asyncio.get_event_loop() without a
     # running loop is deprecated in recent Python versions.
     asyncio.run(main())
	#!/usr/bin/env python

	"""This script runs cuda-memcheck on the specified unit test. Each test case
	is run in its isolated process with a timeout so that:
	1) different test cases won't influence each other, and
	2) in case of hang, the script would still finish in a finite amount of time.
	The output will be written to a log file result.log

	Example usage:
	python run_cuda_memcheck.py ../test_torch.py 600

	Note that running cuda-memcheck could be very slow.
	"""

	import asyncio
	import torch
	import multiprocessing
	import argparse
	import subprocess
	import tqdm
	import os
	import sys
	import cuda_memcheck_common as cmc

	# Names of the tests to run; filled in by the discovery step further down.
	ALL_TESTS = []
	# Device count reported by torch; worker ids are mapped onto GPUs modulo this.
	GPUS = torch.cuda.device_count()

	# parse arguments
	# NOTE: adjacent string literals are joined with nothing in between, so every
	# help fragment that continues on the next line must end with a space.
	parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
	parser.add_argument('filename', help="the python file for a test, such as test_torch.py")
	parser.add_argument('timeout', type=int, help='kill the test if it does not terminate in a certain amount of seconds')
	parser.add_argument('--strict', action='store_true',
	                    help='Whether to show cublas/cudnn errors. These errors are ignored by default because '
	                         'cublas/cudnn does not run error-free under cuda-memcheck.')
	parser.add_argument('--nproc', type=int, default=multiprocessing.cpu_count(),
	                    help='Number of processes running tests, default to number of cores in the system')
	parser.add_argument('--gpus', default='all',
	                    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"')
	parser.add_argument('--ci', action='store_true',
	                    help='Whether this script is executed in CI. When executed inside a CI, this script fails when '
	                         'an error is detected. Also, it will not show tqdm progress bar, but directly print the error '
	                         'to stdout instead.')
	parser.add_argument('--nohang', action='store_true', help='Treat timeout as success')
	parser.add_argument('--split', type=int, default=1, help='Split the job into pieces')
	parser.add_argument('--rank', type=int, default=0, help='Which piece this process should pick')
	args = parser.parse_args()

	# Filter that ignores cublas/cudnn/cufft errors
	# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
	def is_ignored_only(output):
	    """Tell whether every error in a cuda-memcheck report may be ignored.

	    An error counts as ignorable when the concatenation of its stack entries
	    mentions libcublas, libcudnn or libcufft.

	    Args:
	        output: text captured from a cuda-memcheck run; handed to ``cmc.parse``.

	    Returns:
	        True when the number of ignorable errors equals ``report.num_errors``.
	        False otherwise, and also when ``cmc.parse`` raises ``cmc.ParseError``.
	    """
	    ignorable_libs = ('libcublas', 'libcudnn', 'libcufft')
	    try:
	        report = cmc.parse(output)
	    except cmc.ParseError:
	        # A report the simple parser cannot read is never treated as ignorable.
	        return False
	    count_ignored_errors = 0
	    for e in report.errors:
	        # join once and probe the same text for each library name
	        stack_text = ''.join(e.stack)
	        if any(lib in stack_text for lib in ignorable_libs):
	            count_ignored_errors += 1
	    return count_ignored_errors == report.num_errors

	# Export PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
	os.environ['PYTORCH_CUDA_MEMCHECK'] = '1'

	# Discover tests:
	# `pytest --setup-only <file>` is run and its stdout is scanned for lines
	# containing "(fixtures used:". The first whitespace-separated token of such a
	# line is taken (presumably a node id of the form path::Class::test), the part
	# up to and including the first '::' is dropped, and the remaining '::' are
	# turned into '.'.
	proc = subprocess.Popen(['pytest', '--setup-only', args.filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	stdout, stderr = proc.communicate()
	lines = stdout.decode().strip().splitlines()
	for line in lines:
	    if '(fixtures used:' in line:
	        line = line.strip().split()[0]
	        line = line[line.find('::') + 2:]
	        line = line.replace('::', '.')
	        ALL_TESTS.append(line)

	# Do a simple filtering:
	# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
	def is_cpu_only(name):
	    """Return True when *name* contains "cpu" but not "cuda", ignoring case."""
	    name = name.lower()
	    return ('cpu' in name) and not ('cuda' in name)

	# Keep every test except the ones whose name marks them as CPU-only.
	ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]

	# Split all tests into --split chunks and only run the chunk selected by --rank.
	# A non-positive --split would divide by zero (or give a meaningless chunk
	# size), and a --rank outside [0, split) would silently select no tests or the
	# wrong ones, so both are rejected up front with a usage error.
	if args.split < 1:
	    parser.error('--split must be a positive integer')
	if not 0 <= args.rank < args.split:
	    parser.error('--rank must satisfy 0 <= rank < split')
	# Sort first so that the same list always yields the same chunks.
	ALL_TESTS.sort()
	chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split  # ceiling division
	start = chunk_size * args.rank
	end = chunk_size * (args.rank + 1)
	ALL_TESTS = ALL_TESTS[start:end]

	# Run tests:
	# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
	# This is done with coroutines: a number of coroutines are created, each of which creates
	# subprocesses and awaits them. The number of coroutines can be specified by the user and
	# defaults to the number of CPUs in the machine.
	# The subprocesses are balanced across the GPUs of the system by assigning one device per
	# coroutine, or as specified by the user.
	progress = 0  # index of the next entry of ALL_TESTS that has not been picked up yet
	if not args.ci:
	    logfile = open('result.log', 'w')
	    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
	else:
	    # In CI everything is reported straight to stdout and nothing is drawn.
	    logfile = sys.stdout

	    # create a fake progress bar that does not display anything
	    class ProgressbarStub:
	        def update(*args):
	            return
	    progressbar = ProgressbarStub()

	async def run1(coroutine_id):
	    """Worker coroutine: keep claiming the next test and run it under cuda-memcheck.

	    All workers share the module-level ``progress`` counter, the index of the
	    next unclaimed entry of ``ALL_TESTS``. There is no ``await`` between
	    reading and incrementing it, so two workers never claim the same test.

	    Args:
	        coroutine_id: index of this worker in ``range(args.nproc)``; it selects
	            the value exported as CUDA_VISIBLE_DEVICES to the subprocesses.

	    One line per test ("Success:", "Ignored:", "Fail:" or "Timeout:") is
	    printed to ``logfile``. With ``--ci`` the script exits on the first
	    displayed failure, and on the first timeout unless ``--nohang`` is given.
	    """
	    global progress

	    if args.gpus == 'all':
	        gpuid = coroutine_id % GPUS
	    else:
	        gpu_assignments = args.gpus.split(':')
	        assert args.nproc == len(gpu_assignments), 'Please specify GPU assignmnent for each process, separated by :'
	        gpuid = gpu_assignments[coroutine_id]

	    # Same environment as this script, except for the device selection.
	    env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpuid))

	    while progress < len(ALL_TESTS):
	        test = ALL_TESTS[progress]
	        progress += 1
	        # Start cuda-memcheck from an argument vector instead of a shell string:
	        # the file name and the test name reach the child verbatim even when
	        # they contain spaces or shell metacharacters, and kill() below hits
	        # cuda-memcheck itself rather than an intermediate shell.
	        proc = await asyncio.create_subprocess_exec(
	            'cuda-memcheck', '--error-exitcode', '1', 'python', args.filename, test,
	            env=env, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
	        try:
	            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
	        except asyncio.TimeoutError:
	            print('Timeout:', test, file=logfile)
	            try:
	                proc.kill()
	            except ProcessLookupError:
	                # the process finished on its own right after the timeout fired
	                pass
	            if args.ci and not args.nohang:
	                sys.exit("Hang detected on cuda-memcheck")
	        else:
	            if proc.returncode == 0:
	                print('Success:', test, file=logfile)
	            else:
	                stdout = stdout.decode()
	                stderr = stderr.decode()
	                # without --strict, a report made only of ignorable errors is not shown
	                should_display = args.strict or not is_ignored_only(stdout)
	                if should_display:
	                    print('Fail:', test, file=logfile)
	                    print(stdout, file=logfile)
	                    print(stderr, file=logfile)
	                    if args.ci:
	                        sys.exit("Failure detected on cuda-memcheck")
	                else:
	                    print('Ignored:', test, file=logfile)
	        del proc
	        progressbar.update(1)

	async def main():
	    """Start ``args.nproc`` workers and wait for each of them in turn."""
	    tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)]
	    for t in tasks:
	        await t

	if __name__ == '__main__':
	    # asyncio.run() creates a fresh event loop, runs main() to completion and
	    # closes the loop afterwards. Calling asyncio.get_event_loop() without a
	    # running loop is deprecated in recent Python versions.
	    asyncio.run(main())