| # Copyright (c) Qualcomm Innovation Center, Inc. |
| # All rights reserved |
| # |
| # This source code is licensed under the BSD-style license found in the |
| # LICENSE file in the root directory of this source tree. |
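
# Example usage (the -b/-m/-s flags are assumed to come from
# setup_common_args_and_variables; adjust the build folder, SoC model, and
# device serial to your setup):
#   python mobilebert_fine_tune.py -b build-android -m SM8550 -s <device_serial>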
| |
| import json |
| import os |
| from multiprocessing.connection import Client |
| |
| import numpy as np |
| |
| import torch |
| from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype |
| from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( |
| QcomChipset, |
| ) |
| from executorch.backends.qualcomm.utils.utils import ( |
| generate_htp_compiler_spec, |
| generate_qnn_executorch_compiler_spec, |
| skip_annotation, |
| ) |
| from executorch.examples.qualcomm.utils import ( |
| build_executorch_binary, |
| make_output_dir, |
| make_quantizer, |
| parse_skip_delegation_node, |
| QnnPartitioner, |
| setup_common_args_and_variables, |
| SimpleADB, |
| ) |
| from executorch.exir import to_edge |
| from transformers import BertTokenizer, MobileBertForSequenceClassification |
| |
| |
| def evaluate(model, data_val): |
| predictions, true_vals = [], [] |
| for data in data_val: |
| inputs = { |
| "input_ids": data[0].to(torch.long), |
| "attention_mask": data[1].to(torch.long), |
| "labels": data[2].to(torch.long), |
| } |
| logits = model(**inputs)[1].detach().numpy() |
| label_ids = inputs["labels"].numpy() |
| predictions.append(logits) |
| true_vals.append(label_ids) |
| |
| return ( |
| np.concatenate(predictions, axis=0), |
| np.concatenate(true_vals, axis=0), |
| ) |
| |
| |
| def accuracy_per_class(preds, goldens, labels): |
| labels_inverse = {v: k for k, v in labels.items()} |
| preds_flat = np.argmax(preds, axis=1).flatten() |
| goldens_flat = goldens.flatten() |
| |
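    # tally [num_correct, num_total] per class, keyed by the original class name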
| result = {} |
| for golden in np.unique(goldens_flat): |
| pred = preds_flat[goldens_flat == golden] |
| true = goldens_flat[goldens_flat == golden] |
| result.update({labels_inverse[golden]: [len(pred[pred == golden]), len(true)]}) |
| |
| return result |
| |
| |
| def get_dataset(data_val): |
| # prepare input data |
| inputs, input_list = [], "" |
| # max_position_embeddings defaults to 512 |
| position_ids = torch.arange(512).expand((1, -1)).to(torch.int32) |
| for index, data in enumerate(data_val): |
| data = [d.to(torch.int32) for d in data] |
| # input_ids, attention_mask, token_type_ids, position_ids |
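        # token_type_ids are all zeros since each sample is a single sequence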
| inputs.append( |
| ( |
| *data[:2], |
| torch.zeros(data[0].size(), dtype=torch.int32), |
| position_ids[:, : data[0].shape[1]], |
| ) |
| ) |
| input_text = " ".join( |
| [f"input_{index}_{i}.raw" for i in range(len(inputs[-1]))] |
| ) |
| input_list += f"{input_text}\n" |
| |
| return inputs, input_list |
| |
| |
| def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): |
| from io import BytesIO |
| |
| import pandas as pd |
| import requests |
| from sklearn.model_selection import train_test_split |
| from torch.utils.data import ( |
| DataLoader, |
| RandomSampler, |
| SequentialSampler, |
| TensorDataset, |
| ) |
| from tqdm import tqdm |
| from transformers import get_linear_schedule_with_warmup |
| |
| # grab dataset |
| url = ( |
| "https://raw.githubusercontent.com/susanli2016/NLP-with-Python" |
| "/master/data/title_conference.csv" |
| ) |
| content = requests.get(url, allow_redirects=True).content |
| data = pd.read_csv(BytesIO(content)) |
| |
| # get training / validation data |
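    # map each conference name to an integer class id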
| labels = {key: index for index, key in enumerate(data.Conference.unique())} |
| data["label"] = data.Conference.replace(labels) |
| |
| train, val, _, _ = train_test_split( |
| data.index.values, |
| data.label.values, |
| test_size=0.15, |
| random_state=42, |
| stratify=data.label.values, |
| ) |
| |
| data["data_type"] = ["not_set"] * data.shape[0] |
| data.loc[train, "data_type"] = "train" |
| data.loc[val, "data_type"] = "val" |
| |
| # get pre-trained mobilebert |
| tokenizer = BertTokenizer.from_pretrained( |
| "bert-base-uncased", |
| do_lower_case=True, |
| ) |
| model = MobileBertForSequenceClassification.from_pretrained( |
| "google/mobilebert-uncased", |
| num_labels=len(labels), |
| return_dict=False, |
| ) |
| |
| # tokenize dataset |
| encoded_data_train = tokenizer.batch_encode_plus( |
| data[data.data_type == "train"].Title.values, |
| add_special_tokens=True, |
| return_attention_mask=True, |
| max_length=256, |
| padding="max_length", |
| truncation=True, |
| return_tensors="pt", |
| ) |
| encoded_data_val = tokenizer.batch_encode_plus( |
| data[data.data_type == "val"].Title.values, |
| add_special_tokens=True, |
| return_attention_mask=True, |
| max_length=256, |
| padding="max_length", |
| truncation=True, |
| return_tensors="pt", |
| ) |
| |
| input_ids_train = encoded_data_train["input_ids"] |
| attention_masks_train = encoded_data_train["attention_mask"] |
| labels_train = torch.tensor(data[data.data_type == "train"].label.values) |
| |
| input_ids_val = encoded_data_val["input_ids"] |
| attention_masks_val = encoded_data_val["attention_mask"] |
| labels_val = torch.tensor(data[data.data_type == "val"].label.values) |
| |
| dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train) |
| dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val) |
| |
| epochs = 5 |
| dataloader_train = DataLoader( |
| dataset_train, |
| sampler=RandomSampler(dataset_train), |
| batch_size=batch_size, |
| ) |
| dataloader_val = DataLoader( |
| dataset_val, |
| sampler=SequentialSampler(dataset_val), |
| batch_size=batch_size, |
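        # drop the tail batch so every batch is exactly batch_size (matches on-device reshaping)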
| drop_last=True, |
| ) |
| optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) |
| scheduler = get_linear_schedule_with_warmup( |
| optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs |
| ) |
| |
| # start training |
| if not pretrained_weight: |
| for epoch in range(1, epochs + 1): |
| loss_train_total = 0 |
| print(f"epoch {epoch}") |
| |
| for batch in tqdm(dataloader_train): |
| model.zero_grad() |
| inputs = { |
| "input_ids": batch[0], |
| "attention_mask": batch[1], |
| "labels": batch[2], |
| } |
| loss = model(**inputs)[0] |
| loss_train_total += loss.item() |
| loss.backward() |
| torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) |
| optimizer.step() |
| scheduler.step() |
| |
| torch.save( |
| model.state_dict(), |
| f"{artifacts_dir}/finetuned_mobilebert_epoch_{epoch}.model", |
| ) |
| |
| model.load_state_dict( |
| torch.load( |
| ( |
| f"{artifacts_dir}/finetuned_mobilebert_epoch_{epochs}.model" |
| if pretrained_weight is None |
| else pretrained_weight |
| ), |
| map_location=torch.device("cpu"), |
| weights_only=True, |
| ), |
| ) |
| |
| return model.eval(), dataloader_val, labels |
| |
| |
| def main(args): |
| skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) |
| |
    # ensure the working directory exists.
| os.makedirs(args.artifact, exist_ok=True) |
| |
| if not args.compile_only and args.device is None: |
        raise RuntimeError(
            "A device serial is required unless compiling only. "
            "Please specify one with the -s/--device argument."
        )
| |
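    # the exported graph uses a fixed batch size of 1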
| batch_size, pte_filename = 1, "ptq_mb_qnn" |
| model, data_val, labels = get_fine_tuned_mobilebert( |
| args.artifact, args.pretrained_weight, batch_size |
| ) |
| inputs, input_list = get_dataset(data_val) |
| |
    try:
        quant_dtype = getattr(QuantDtype, f"use_{args.ptq}")
    except AttributeError:
        raise AssertionError(
            f"No support for quant type {args.ptq}. Supported types: 8a8w, 16a16w, 16a4w."
        )
| |
| if args.use_fp16: |
| quant_dtype = None |
| pte_filename = "mb_qnn" |
| build_executorch_binary( |
| model, |
| inputs[0], |
| args.model, |
| f"{args.artifact}/{pte_filename}", |
| inputs, |
| skip_node_id_set=skip_node_id_set, |
| skip_node_op_set=skip_node_op_set, |
| quant_dtype=quant_dtype, |
| shared_buffer=args.shared_buffer, |
| ) |
| else: |
| |
        def calibrator(gm):
            # run every sample through the prepared module to collect
            # calibration statistics
            for sample in inputs:
                gm(*sample)
| |
| quantizer = make_quantizer(quant_dtype=quant_dtype) |
| backend_options = generate_htp_compiler_spec(quant_dtype is not None) |
| partitioner = QnnPartitioner( |
| generate_qnn_executorch_compiler_spec( |
| soc_model=getattr(QcomChipset, args.model), |
| backend_options=backend_options, |
| ), |
| skip_node_id_set=skip_node_id_set, |
| skip_node_op_set=skip_node_op_set, |
| ) |
        # skip the embedding layer since it is quantization-sensitive
| graph_module, _ = skip_annotation( |
| nn_module=model, |
| quantizer=quantizer, |
| partitioner=partitioner, |
| sample_input=inputs[0], |
| calibration_cb=calibrator, |
| fp_node_op_set={torch.ops.aten.embedding.default}, |
| ) |
        # lower the whole graph again; the skipped operators stay on CPU
| exec_prog = to_edge( |
| torch.export.export(graph_module, inputs[0]), |
| ).to_executorch() |
| |
| with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: |
| file.write(exec_prog.buffer) |
| |
| if args.compile_only: |
| return |
| |
| adb = SimpleADB( |
| qnn_sdk=os.getenv("QNN_SDK_ROOT"), |
| build_path=f"{args.build_folder}", |
| pte_path=f"{args.artifact}/{pte_filename}.pte", |
| workspace=f"/data/local/tmp/executorch/{pte_filename}", |
| device_id=args.device, |
| host_id=args.host, |
| soc_model=args.model, |
| shared_buffer=args.shared_buffer, |
| ) |
| adb.push(inputs=inputs, input_list=input_list) |
| adb.execute() |
| |
| # collect output data |
| output_data_folder = f"{args.artifact}/outputs" |
| make_output_dir(output_data_folder) |
| |
| adb.pull(output_path=args.artifact) |
| |
| # get torch cpu result |
| cpu_preds, true_vals = evaluate(model, data_val) |
| cpu_result = accuracy_per_class(cpu_preds, true_vals, labels) |
| |
| # get QNN HTP result |
| htp_preds = [] |
| for i in range(len(data_val)): |
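        # each output_{i}_0.raw file holds the fp32 logits for one inference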
| result = np.fromfile( |
| os.path.join(output_data_folder, f"output_{i}_0.raw"), |
| dtype=np.float32, |
| ) |
| htp_preds.append(result.reshape(batch_size, -1)) |
| |
| htp_result = accuracy_per_class( |
| np.concatenate(htp_preds, axis=0), true_vals, labels |
| ) |
| |
| if args.ip and args.port != -1: |
| with Client((args.ip, args.port)) as conn: |
| conn.send(json.dumps({"CPU": cpu_result, "HTP": htp_result})) |
| else: |
        for target, result in zip(["CPU", "HTP"], [cpu_result, htp_result]):
            print(f"\n[{target}]")
            for k, v in result.items():
                print(f"{k}: {v[0]}/{v[1]}")
| |
| |
| if __name__ == "__main__": |
| parser = setup_common_args_and_variables() |
| |
| parser.add_argument( |
| "-a", |
| "--artifact", |
| help="path for storing generated artifacts by this example. Default ./mobilebert_fine_tune", |
| default="./mobilebert_fine_tune", |
| type=str, |
| ) |
| |
| parser.add_argument( |
| "-p", |
| "--pretrained_weight", |
| help="Location of pretrained weight", |
| default=None, |
| type=str, |
| ) |
| |
| parser.add_argument( |
| "-F", |
| "--use_fp16", |
| help="If specified, will run in fp16 precision and discard ptq setting", |
| action="store_true", |
| default=False, |
| ) |
| |
| parser.add_argument( |
| "-P", |
| "--ptq", |
| help="If specified, will do PTQ quantization. default is 8bits activation and 8bits weight. Support 8a8w, 16a16w and 16a4w.", |
| default="8a8w", |
| ) |
| |
| args = parser.parse_args() |
| try: |
| main(args) |
| except Exception as e: |
| if args.ip and args.port != -1: |
| with Client((args.ip, args.port)) as conn: |
| conn.send(json.dumps({"Error": str(e)})) |
| else: |
            raise