blob: 07a712c9baaee75cea97870955d0c374935d833d [file] [log] [blame]
# Sparse benchmarks
#
# This benchmark measures sparse matmul performance. It compares the sparse
# matrix routines `sparse @ vector`, `sparse @ sparse` and `sparse @ dense`
# across backends (CPU/CUDA) and against other frameworks such as scipy.
import argparse
import os
import sys
from scipy.sparse import isspmatrix
import torch
import torch.utils.benchmark as benchmark_utils
from .utils import load_dlmc_dataset
def scipy_matmul(mat1, mat2):
    """Multiply two scipy matrices for the scipy comparison path.

    When both operands are sparse the product is converted to COO so the
    result format matches what the torch sparse path produces; otherwise the
    plain product is returned unchanged.
    """
    both_sparse = isspmatrix(mat1) and isspmatrix(mat2)
    product = mat1.dot(mat2)
    return product.tocoo() if both_sparse else product
def matmul_backward(a_dense, b_dense, grad_output):
    """Benchmark statement: dense matmul followed by a backward pass.

    Gradients accumulate on whichever inputs have ``requires_grad`` set;
    nothing is returned.
    """
    product = torch.matmul(a_dense, b_dense)
    product.backward(grad_output)
def sparse_matmul_backward(a, b, grad_output):
    """Benchmark statement: sparse matmul via torch.sparse.mm plus backward.

    Gradients accumulate on whichever inputs have ``requires_grad`` set;
    nothing is returned.
    """
    result = torch.sparse.mm(a, b)
    result.backward(grad_output)
# Maps each benchmark operation name to the torch callable used in the timed
# statement for that operation (the scipy comparison path uses scipy_matmul
# directly instead of this table).
OPS_MAP = {
    "sparse@sparse": "torch.sparse.mm",
    "sparse@dense": "torch.matmul",
    "sparse@vector": "torch.matmul",
}
def parse_args():
    """Build the CLI argument parser for the sparse matmul benchmark.

    Returns:
        argparse.ArgumentParser: parser with dataset path/name, hidden size,
        operation selection, backward/CUDA toggles, and the timer's minimum
        run time.
    """
    parser = argparse.ArgumentParser(description="matmul benchmark")
    # required: without it, args.path is None and the os.path.join in the
    # __main__ block raises an opaque TypeError instead of a usage error.
    parser.add_argument("--path", type=str, required=True, help="DLMC dataset path")
    parser.add_argument("--dataset", type=str, default="magnitude_pruning")
    parser.add_argument("--hidden-size", "--hidden_size", default=2048, type=int)
    parser.add_argument("--backward-test", "--backward_test", action="store_true")
    parser.add_argument(
        "--operation",
        type=str,
        # choices makes an unknown operation fail at parse time rather than
        # as a KeyError inside get_tasks.
        choices=list(OPS_MAP),
        help="|".join(OPS_MAP.keys()),
        default=next(iter(OPS_MAP)),
    )
    parser.add_argument("--with-cuda", "--with_cuda", action="store_true")
    parser.add_argument(
        "--timer-min-run-time", "--timer_min_run_time", default=1, type=float
    )
    return parser
def get_tasks(op, backward_test, device):
    """Return the benchmark task tuples for one operation.

    Args:
        op: one of the OPS_MAP keys ("sparse@sparse", "sparse@dense",
            "sparse@vector").
        backward_test: if True, produce backward-pass tasks; otherwise
            forward-pass tasks.
        device: "cpu" or "cuda"; the scipy comparison task is only emitted
            on CPU.

    Returns:
        list of (label, device, sub_label, stmt) tuples consumed by the
        Timer construction in __main__.

    Raises:
        KeyError: if ``op`` is not a known operation (matches the original
        dict-lookup behavior).
    """
    # Validate up front instead of building every operation's task list and
    # indexing one (the original computed all three).
    if op not in OPS_MAP:
        raise KeyError(op)

    dense_op = "torch:" + op.replace("sparse", "dense")
    sparse_op = "torch:" + op

    if backward_test:
        test_name = device + ":matmul-backward"
        return [
            (test_name, device, dense_op, "matmul_backward(dx, dy, grad_output)"),
            (test_name, device, sparse_op, "sparse_matmul_backward(x, y, sparse_grad_output)"),
        ]

    test_name = device + ":matmul-forward"
    tasks = [
        (test_name, device, dense_op, f"{OPS_MAP[op]}(dx, dy)"),
        (test_name, device, sparse_op, f"{OPS_MAP[op]}(x, y)"),
    ]
    # scipy has no CUDA backend, so the scipy comparison only runs on CPU.
    if device == "cpu":
        tasks.append((test_name, device, "scipy:" + op, "scipy_matmul(sx, sy)"))
    return tasks
if __name__ == "__main__":
    parser = parse_args()
    args = parser.parse_args()
    # Fail early rather than erroring deep inside tensor creation.
    if args.with_cuda and not torch.cuda.is_available():
        raise RuntimeError("No CUDA available")
    dataset_path = args.path
    dataset_name = args.dataset
    # Full dataset location is <path>/<dataset name>.
    dataset_path = os.path.join(dataset_path, dataset_name)
    device = "cuda" if args.with_cuda else "cpu"
    tasks = get_tasks(args.operation, args.backward_test, device)
    # Each timer is run `repeats` times (interleaved, see the loop below).
    repeats = 3
    # One Timer per (sparsity level, task, dataset sample). The dataset
    # loader yields a dict of variables (x, y, dx, dy, ...) that the timed
    # statement references by name through `globals`.
    timers = [
        benchmark_utils.Timer(
            stmt=stmt,
            globals={
                "scipy_matmul": scipy_matmul,
                "matmul_backward": matmul_backward,
                "sparse_matmul_backward": sparse_matmul_backward,
                **variables,
            },
            label=label,
            sub_label=sub_label,
            description=f"{sparsity}",
            env=device,
        )
        for sparsity in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]
        for label, device, sub_label, stmt in tasks
        for variables in load_dlmc_dataset(
            dataset_path,
            args.operation,
            args.hidden_size,
            sparsity,
            device,
            args.backward_test,
        )
    ]
    measurements = []
    # `timers * repeats` interleaves the repeats across all timers instead of
    # running the same timer back-to-back.
    for i, timer in enumerate(timers * repeats):
        m = timer.blocked_autorange(min_run_time=args.timer_min_run_time)
        # Tag each measurement with its device so Compare can group by it.
        m.metadata = {"device": "cuda" if m.task_spec.env.find("cuda") >= 0 else "cpu"}
        measurements.append(m)
        # Simple in-place progress counter.
        print(f"\r{i + 1} / {len(timers) * repeats}", end="")
        sys.stdout.flush()
    print()
    comparison = benchmark_utils.Compare(measurements)
    print("== Results " + "=" * 80 + "\n" + "/" * 95 + "\n")
    comparison.print()