#!/usr/bin/env python

"""This script runs cuda-memcheck on the specified unit test. Each test case
is run in its own isolated process with a timeout so that:
1) different test cases won't influence each other, and
2) in case of a hang, the script will still finish in a finite amount of time.
The output is written to the log file result.log, or to stdout when --ci is given.

Example usage:
    python run_cuda_memcheck.py ../test_torch.py 600

Note that running cuda-memcheck could be very slow.
"""
| |
| import asyncio |
| import torch |
| import multiprocessing |
| import argparse |
| import subprocess |
| import tqdm |
| import os |
| import sys |
| import cuda_memcheck_common as cmc |
| |
# Names of the tests to run; populated by the discovery step further down in this file.
ALL_TESTS = []
# Number of CUDA devices reported by torch, used to spread worker processes over GPUs.
GPUS = torch.cuda.device_count()

# parse arguments
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument('filename', help="the python file for a test, such as test_torch.py")
parser.add_argument('timeout', type=int, help='kill the test if it does not terminate in a certain amount of seconds')
parser.add_argument('--strict', action='store_true',
                    help='Whether to show cublas/cudnn/cufft errors. These errors are ignored by default '
                         'because these libraries do not run error-free under cuda-memcheck.')
parser.add_argument('--nproc', type=int, default=multiprocessing.cpu_count(),
                    help='Number of processes running tests, default to number of cores in the system')
parser.add_argument('--gpus', default='all',
                    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"')
parser.add_argument('--ci', action='store_true',
                    help='Whether this script is executed in CI. When executed inside a CI, this script fails when '
                         'an error is detected. Also, it will not show tqdm progress bar, but directly print the '
                         'error to stdout instead.')
parser.add_argument('--nohang', action='store_true', help='Treat timeout as success')
parser.add_argument('--split', type=int, default=1, help='Split the job into pieces')
parser.add_argument('--rank', type=int, default=0, help='Which piece this process should pick')
args = parser.parse_args()

# Reject values that would otherwise crash later (division by zero when
# chunking) or silently select no tests / start no workers.
if args.split < 1:
    parser.error('--split must be at least 1')
if not 0 <= args.rank < args.split:
    parser.error('--rank must satisfy 0 <= rank < split')
if args.nproc < 1:
    parser.error('--nproc must be at least 1')
| |
# Filter that ignores cublas/cudnn/cufft errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    """Tell whether every error in a cuda-memcheck output may be ignored.

    An error counts as ignorable when the concatenation of its stack entries
    mentions libcublas, libcudnn or libcufft.

    Args:
        output: text printed by cuda-memcheck, handed to ``cmc.parse``.

    Returns:
        True when the number of ignorable errors equals ``report.num_errors``;
        False otherwise, including when the output cannot be parsed.
    """
    ignored_libraries = ('libcublas', 'libcudnn', 'libcufft')
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # If the simple parser cannot understand the output, never ignore the error.
        return False

    ignorable = 0
    for error in report.errors:
        flattened_stack = ''.join(error.stack)
        if any(library in flattened_stack for library in ignored_libraries):
            ignorable += 1
    return ignorable == report.num_errors
| |
# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
os.environ['PYTORCH_CUDA_MEMCHECK'] = '1'

# Discover tests:
# To get a list of tests, run:
# pytest --setup-only test/test_torch.py
# and then parse the output
proc = subprocess.Popen(['pytest', '--setup-only', args.filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
lines = stdout.decode(errors='replace').strip().splitlines()
for line in lines:
    if '(fixtures used:' not in line:
        continue
    # The first whitespace-separated token is the node id, e.g. "file.py::Class::test".
    node_id = line.split()[0]
    _, separator, test_path = node_id.partition('::')
    if not separator:
        # Not a "file::test" node id, so there is nothing runnable to record.
        continue
    # Drop the file part and turn "Class::test" into "Class.test".
    ALL_TESTS.append(test_path.replace('::', '.'))

# Do not swallow a failed discovery silently: an empty or partial test list
# would otherwise look like a clean run.
if proc.returncode != 0:
    print(f'Warning: test discovery (pytest --setup-only) exited with code {proc.returncode}', file=sys.stderr)
    print(stderr.decode(errors='replace'), file=sys.stderr)
    if not ALL_TESTS:
        # With nothing discovered, pytest's stdout is the only remaining diagnostic output.
        print(stdout.decode(errors='replace'), file=sys.stderr)
| |
# Do a simple filtering:
# a test is skipped when its name mentions the CPU but not CUDA (case-insensitive)
def is_cpu_only(name):
    """Return True if *name* contains 'cpu' and does not contain 'cuda', ignoring case."""
    lowered = name.lower()
    if 'cuda' in lowered:
        return False
    return 'cpu' in lowered
| |
# Drop the CPU-only tests and keep the remaining names in sorted order
ALL_TESTS = sorted(name for name in ALL_TESTS if not is_cpu_only(name))

# Split all tests into chunks, and only run the selected chunk
chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split  # chunk length, rounded up
start = args.rank * chunk_size
end = start + chunk_size
ALL_TESTS = ALL_TESTS[start:end]
| |
# Run tests:
# Running cuda-memcheck on PyTorch unit tests is very slow, so the tests are run in parallel.
# This is done with coroutines: a number of coroutines are created, each of which creates
# subprocesses and awaits their completion. The number of coroutines can be specified by the
# user and defaults to the number of CPUs in the machine.
# The subprocesses are balanced across the GPUs of the system by assigning one device per
# process, or as specified by the user.
progress = 0  # index into ALL_TESTS of the next test to start
if args.ci:
    logfile = sys.stdout

    class ProgressbarStub:
        """Fake progress bar whose update() accepts any positional arguments and displays nothing."""

        def update(*_ignored):
            return None

    progressbar = ProgressbarStub()
else:
    logfile = open('result.log', 'w')
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
| |
def _gpu_for_worker(coroutine_id):
    """Return the CUDA_VISIBLE_DEVICES value for the worker with index *coroutine_id*."""
    if args.gpus == 'all':
        if GPUS < 1:
            raise RuntimeError('No CUDA device found, cannot assign a GPU to the test processes')
        # Round-robin the workers over the available devices.
        return str(coroutine_id % GPUS)
    gpu_assignments = args.gpus.split(':')
    if args.nproc != len(gpu_assignments):
        raise ValueError('Please specify GPU assignment for each process, separated by :')
    return gpu_assignments[coroutine_id]


async def run1(coroutine_id):
    """Worker coroutine: run tests from ALL_TESTS under cuda-memcheck, one at a time.

    Each worker repeatedly claims the next unclaimed test through the shared
    ``progress`` counter, runs it in a subprocess limited to this worker's
    GPU(s), and writes a ``Timeout:``/``Success:``/``Fail:``/``Ignored:`` line
    to ``logfile``.

    Args:
        coroutine_id: index of this worker, used to select its GPU assignment.

    Raises:
        RuntimeError: ``--gpus all`` was requested but no CUDA device exists.
        ValueError: the ``--gpus`` list does not contain one entry per process.
        SystemExit: in CI mode, on a failure or (unless ``--nohang``) a hang.
    """
    global progress

    # Pass the GPU selection through the environment instead of a shell prefix,
    # so that no shell sits between this script and cuda-memcheck.
    env = dict(os.environ, CUDA_VISIBLE_DEVICES=_gpu_for_worker(coroutine_id))

    while progress < len(ALL_TESTS):
        # No await happens between reading and incrementing the counter, so two
        # workers never pick the same test.
        test = ALL_TESTS[progress]
        progress += 1
        proc = await asyncio.create_subprocess_exec(
            'cuda-memcheck', '--error-exitcode', '1', 'python', args.filename, test,
            stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, env=env)
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print('Timeout:', test, file=logfile)
            try:
                proc.kill()
            except ProcessLookupError:
                # The process exited on its own right after the timeout fired.
                pass
            # Reap the killed process so it does not linger as a zombie.
            await proc.wait()
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print('Success:', test, file=logfile)
            else:
                # Tool output is not guaranteed to be valid UTF-8; never crash on it.
                stdout = stdout.decode(errors='replace')
                stderr = stderr.decode(errors='replace')
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print('Fail:', test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print('Ignored:', test, file=logfile)
        progressbar.update(1)
| |
async def main():
    """Start ``args.nproc`` run1 workers and wait for each of them in creation order."""
    workers = []
    for worker_id in range(args.nproc):
        workers.append(asyncio.ensure_future(run1(worker_id)))
    for worker in workers:
        await worker
| |
if __name__ == '__main__':
    # Create the event loop explicitly: asyncio.get_event_loop() is deprecated
    # when no loop is running and no longer creates one on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    finally:
        loop.close()