import argparse
from pathlib import Path
import torch
import torchtext
from torchtext.functional import to_tensor
import torch.nn as nn
import torch.nn.functional as F
from typing import List, Dict
import pandas as pd
from dataclasses import dataclass
import math
import pickle
import random
from tqdm import tqdm
from itertools import chain
import common
XLMR_BASE = torchtext.models.XLMR_BASE_ENCODER
# This should not be here but it works for now
device = "cuda" if torch.cuda.is_available() else "cpu"
HAS_IMBLEARN = False
try:
    import imblearn
    HAS_IMBLEARN = True
except ImportError:
    HAS_IMBLEARN = False
# 94% of all files are captured at a truncation length of 5; this is a good hyperparameter to play around with.
MAX_LEN_FILE = 6
UNKNOWN_TOKEN = "<Unknown>"
# Utilities for working with a truncated file graph
def truncate_file(file: Path, max_len: int = 5):
    return '/'.join(file.parts[:max_len])
def build_file_set(all_files: List[Path], max_len: int):
    truncated_files = [truncate_file(file, max_len) for file in all_files]
    return set(truncated_files)
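# Model inputs are parallel lists, one entry per commit: the commit title, a
# whitespace-separated string of changed file paths, and the commit author.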
@dataclass
class CommitClassifierInputs:
    title: List[str]
    files: List[str]
    author: List[str]
@dataclass
class CategoryConfig:
    categories: List[str]
    input_dim: int = 768
    inner_dim: int = 128
    dropout: float = 0.1
    activation = nn.ReLU
    embedding_dim: int = 8
    file_embedding_dim: int = 32
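# Three-branch classifier: a frozen XLM-R encoder for the commit title, an
# EmbeddingBag over truncated file paths, and an author embedding. Each branch
# is projected to per-category logits and the three sets of logits are summed.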
class CommitClassifier(nn.Module):
    def __init__(self, encoder_base: torchtext.models.XLMR_BASE_ENCODER, author_map: Dict[str, int], file_map: Dict[str, int], config: CategoryConfig):
        super().__init__()
        self.encoder = encoder_base.get_model().requires_grad_(False)
        self.transform = encoder_base.transform()
        self.author_map = author_map
        self.file_map = file_map
        self.categories = config.categories
        self.num_authors = len(author_map)
        self.num_files = len(file_map)
        self.embedding_table = nn.Embedding(self.num_authors, config.embedding_dim)
        self.file_embedding_bag = nn.EmbeddingBag(self.num_files, config.file_embedding_dim, mode='sum')
        self.dense_title = nn.Linear(config.input_dim, config.inner_dim)
        self.dense_files = nn.Linear(config.file_embedding_dim, config.inner_dim)
        self.dense_author = nn.Linear(config.embedding_dim, config.inner_dim)
        self.dropout = nn.Dropout(config.dropout)
        self.out_proj_title = nn.Linear(config.inner_dim, len(self.categories))
        self.out_proj_files = nn.Linear(config.inner_dim, len(self.categories))
        self.out_proj_author = nn.Linear(config.inner_dim, len(self.categories))
        self.activation_fn = config.activation()
    def forward(self, input_batch: CommitClassifierInputs):
        # Encode input title; use the representation of the first (<s>) token
        title: List[str] = input_batch.title
        model_input = to_tensor(self.transform(title), padding_value=1).to(device)
        title_features = self.encoder(model_input)
        title_embed = title_features[:, 0, :]
        title_embed = self.dropout(title_embed)
        title_embed = self.dense_title(title_embed)
        title_embed = self.activation_fn(title_embed)
        title_embed = self.dropout(title_embed)
        title_embed = self.out_proj_title(title_embed)
        # Embed the changed files with an EmbeddingBag over truncated paths
        files: List[str] = input_batch.files
        batch_file_indexes = []
        for file in files:
            paths = [truncate_file(Path(file_part), MAX_LEN_FILE) for file_part in file.split(" ")]
            batch_file_indexes.append([self.file_map.get(path, self.file_map[UNKNOWN_TOKEN]) for path in paths])
        flat_indexes = torch.tensor(list(chain.from_iterable(batch_file_indexes)), dtype=torch.long, device=device)
        offsets = [0]
        offsets.extend(len(idxs) for idxs in batch_file_indexes[:-1])
        offsets = torch.tensor(offsets, dtype=torch.long, device=device)
        offsets = offsets.cumsum(dim=0)
        files_embed = self.file_embedding_bag(flat_indexes, offsets)
        files_embed = self.dense_files(files_embed)
        files_embed = self.activation_fn(files_embed)
        files_embed = self.dropout(files_embed)
        files_embed = self.out_proj_files(files_embed)
        # Add author embedding
        authors: List[str] = input_batch.author
        author_ids = [self.author_map.get(author, self.author_map[UNKNOWN_TOKEN]) for author in authors]
        author_ids = torch.tensor(author_ids).to(device)
        author_embed = self.embedding_table(author_ids)
        author_embed = self.dense_author(author_embed)
        author_embed = self.activation_fn(author_embed)
        author_embed = self.dropout(author_embed)
        author_embed = self.out_proj_author(author_embed)
        # Sum the per-branch logits to get the final category logits
        return title_embed + files_embed + author_embed
    def convert_index_to_category_name(self, most_likely_index):
        if isinstance(most_likely_index, int):
            return self.categories[most_likely_index]
        elif isinstance(most_likely_index, torch.Tensor):
            return [self.categories[i] for i in most_likely_index]
    def get_most_likely_category_name(self, inpt):
        # Input is a CommitClassifierInputs batch with title, files, and author fields
        logits = self.forward(inpt)
        most_likely_index = torch.argmax(logits, dim=1)
        return self.convert_index_to_category_name(most_likely_index)
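# Load the cached train/val split if it exists; otherwise split commitlist.csv,
# holding out 'Uncategorized' commits as the test set and dropping 'skip' from training.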
def get_train_val_data(data_folder: Path, regen_data: bool, train_percentage=0.95):
    if not regen_data and Path(data_folder / "train_df.csv").exists() and Path(data_folder / "val_df.csv").exists():
        train_data = pd.read_csv(data_folder / "train_df.csv")
        val_data = pd.read_csv(data_folder / "val_df.csv")
        return train_data, val_data
    else:
        print("Train, Val, Test split not found, generating from scratch.")
        commit_list_df = pd.read_csv(data_folder / "commitlist.csv")
        test_df = commit_list_df[commit_list_df['category'] == 'Uncategorized']
        all_train_df = commit_list_df[commit_list_df['category'] != 'Uncategorized']
        # We are going to drop skip from the training set since it is so imbalanced
        print("We are removing the skip category. YOU MIGHT WANT TO CHANGE THIS, BUT THIS MAKES THE CLASSIFIER MORE HELPFUL FOR LABELING.")
        all_train_df = all_train_df[all_train_df['category'] != 'skip']
        all_train_df = all_train_df.sample(frac=1).reset_index(drop=True)
        split_index = math.floor(train_percentage * len(all_train_df))
        train_df = all_train_df[:split_index]
        val_df = all_train_df[split_index:]
        print("Train data size: ", len(train_df))
        print("Val data size: ", len(val_df))
        test_df.to_csv(data_folder / "test_df.csv", index=False)
        train_df.to_csv(data_folder / "train_df.csv", index=False)
        val_df.to_csv(data_folder / "val_df.csv", index=False)
        return train_df, val_df
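# Map every author in commitlist.csv to an integer id (plus an <Unknown> entry),
# caching the result as a pickle in the data folder.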
def get_author_map(data_folder: Path, regen_data, assert_stored=False):
    if not regen_data and Path(data_folder / "author_map.pkl").exists():
        with open(data_folder / "author_map.pkl", 'rb') as f:
            return pickle.load(f)
    else:
        if assert_stored:
            raise FileNotFoundError(
                "Author map not found. You are loading for inference, so a stored author map is required!")
        print("Regenerating Author Map")
        all_data = pd.read_csv(data_folder / "commitlist.csv")
        authors = all_data.author.unique().tolist()
        authors.append(UNKNOWN_TOKEN)
        author_map = {author: i for i, author in enumerate(authors)}
        with open(data_folder / "author_map.pkl", 'wb') as f:
            pickle.dump(author_map, f)
        return author_map
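# Map every truncated file path in commitlist.csv to an integer id (plus an
# <Unknown> entry), caching the result as a pickle in the data folder.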
def get_file_map(data_folder: Path, regen_data, assert_stored=False):
    if not regen_data and Path(data_folder / "file_map.pkl").exists():
        with open(data_folder / "file_map.pkl", 'rb') as f:
            return pickle.load(f)
    else:
        if assert_stored:
            raise FileNotFoundError("File map not found. You are loading for inference, so a stored file map is required!")
        print("Regenerating File Map")
        all_data = pd.read_csv(data_folder / "commitlist.csv")
        # Collect every individual file path across all commits
        files = all_data.files_changed.to_list()
        all_files = []
        for file in files:
            paths = [Path(file_part) for file_part in file.split(" ")]
            all_files.extend(paths)
        all_files.append(Path(UNKNOWN_TOKEN))
        file_set = build_file_set(all_files, MAX_LEN_FILE)
        file_map = {file: i for i, file in enumerate(file_set)}
        with open(data_folder / "file_map.pkl", 'wb') as f:
            pickle.dump(file_map, f)
        return file_map
# Generate a dataset for training
def get_title_files_author_categories_zip_list(dataframe: pd.DataFrame):
    title = dataframe.title.to_list()
    files_str = dataframe.files_changed.to_list()
    author = dataframe.author.fillna(UNKNOWN_TOKEN).to_list()
    category = dataframe.category.to_list()
    return list(zip(title, files_str, author, category))
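# Collate a list of (title, files, author, category) tuples into
# CommitClassifierInputs and a tensor of category targets.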
def generate_batch(batch):
    title, files, author, category = zip(*batch)
    title = list(title)
    files = list(files)
    author = list(author)
    category = list(category)
    targets = torch.tensor([common.categories.index(cat) for cat in category]).to(device)
    return CommitClassifierInputs(title, files, author), targets
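# One optimization step: forward pass, loss, backward pass, parameter update.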
def train_step(batch, model, optimizer, loss):
    inpt, targets = batch
    optimizer.zero_grad()
    output = model(inpt)
    l = loss(output, targets)
    l.backward()
    optimizer.step()
    return l
@torch.no_grad()
def eval_step(batch, model, loss):
    inpt, targets = batch
    output = model(inpt)
    l = loss(output, targets)
    return l
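# Oversample minority categories with imblearn's RandomOverSampler, then randomly
# subsample back down to twice the original dataset size. Returns the dataset
# unchanged if imblearn is not installed.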
def balance_dataset(dataset: List):
    if not HAS_IMBLEARN:
        return dataset
    title, files, author, category = zip(*dataset)
    category = [common.categories.index(cat) for cat in category]
    inpt_data = list(zip(title, files, author))
    from imblearn.over_sampling import RandomOverSampler
    # from imblearn.under_sampling import RandomUnderSampler
    ros = RandomOverSampler(random_state=42)
    X, y = ros.fit_resample(inpt_data, category)
    merged = list(zip(X, y))
    merged = random.sample(merged, k=2 * len(dataset))
    X, y = zip(*merged)
    rebuilt_dataset = []
    for i in range(len(X)):
        rebuilt_dataset.append((*X[i], common.categories[y[i]]))
    return rebuilt_dataset
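# Inverse-frequency class weights, with per-class counts clamped between smoothed
# top and bottom averages so extreme class imbalance does not dominate the loss.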
def gen_class_weights(dataset: List):
    from collections import Counter
    epsilon = 1e-1
    title, files, author, category = zip(*dataset)
    category = [common.categories.index(cat) for cat in category]
    counter = Counter(category)
    percentile_33 = len(category) // 3
    most_common = counter.most_common(percentile_33)
    least_common = counter.most_common()[-percentile_33:]
    smoothed_top = sum(i[1] + epsilon for i in most_common) / len(most_common)
    smoothed_bottom = sum(i[1] + epsilon for i in least_common) / len(least_common) // 3
    class_weights = torch.tensor([1.0 / (min(max(counter[i], smoothed_bottom), smoothed_top) + epsilon)
                                  for i in range(len(common.categories))], device=device)
    return class_weights
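# Full training loop: weighted cross-entropy with Adam, shuffled mini-batches each
# epoch, and the entire validation set evaluated as a single batch.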
def train(save_path: Path, data_folder: Path, regen_data: bool, resample: bool):
    train_data, val_data = get_train_val_data(data_folder, regen_data)
    train_zip_list = get_title_files_author_categories_zip_list(train_data)
    val_zip_list = get_title_files_author_categories_zip_list(val_data)
    classifier_config = CategoryConfig(common.categories)
    author_map = get_author_map(data_folder, regen_data)
    file_map = get_file_map(data_folder, regen_data)
    commit_classifier = CommitClassifier(XLMR_BASE, author_map, file_map, classifier_config).to(device)
    # Let's train this bag of bits
    class_weights = gen_class_weights(train_zip_list)
    loss = torch.nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(commit_classifier.parameters(), lr=3e-3)
    num_epochs = 25
    batch_size = 256
    if resample:
        # Let's not use this by default (only runs with --resample)
        train_zip_list = balance_dataset(train_zip_list)
    data_size = len(train_zip_list)
    print(f"Training on {data_size} examples.")
    # We can fit all of val into one batch
    val_batch = generate_batch(val_zip_list)
    for i in tqdm(range(num_epochs), desc="Epochs"):
        start = 0
        random.shuffle(train_zip_list)
        while start < data_size:
            end = start + batch_size
            # clamp the last batch to the end of the dataset
            if end > data_size:
                end = data_size
            train_batch = train_zip_list[start:end]
            train_batch = generate_batch(train_batch)
            l = train_step(train_batch, commit_classifier, optimizer, loss)
            start = end
        val_l = eval_step(val_batch, commit_classifier, loss)
        tqdm.write(f"Finished epoch {i} with a train loss of: {l.item()} and a val_loss of: {val_l.item()}")
    with torch.no_grad():
        commit_classifier.eval()
        val_inpts, val_targets = val_batch
        val_output = commit_classifier(val_inpts)
        val_preds = torch.argmax(val_output, dim=1)
        val_acc = torch.sum(val_preds == val_targets).item() / len(val_preds)
        print(f"Final Validation accuracy is {val_acc}")
    print(f"Jobs done! Saving to {save_path}")
    torch.save(commit_classifier.state_dict(), save_path)
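# Command-line entry point; training is currently the only supported action.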
def main():
    parser = argparse.ArgumentParser(description='Tool to create a classifier that helps categorize commits')
    parser.add_argument('--train', action='store_true', help='Train a new classifier')
    parser.add_argument("--commit_data_folder", default="results/classifier/")
    parser.add_argument('--save_path', default='results/classifier/commit_classifier.pt')
    parser.add_argument('--regen_data', action='store_true',
                        help="Regenerate the training data; helps if you have labeled more examples and want to re-train.")
    parser.add_argument('--resample', action='store_true',
                        help="Resample the training data to be balanced. (Only works if imblearn is installed.)")
    args = parser.parse_args()
    if args.train:
        train(Path(args.save_path), Path(args.commit_data_folder), args.regen_data, args.resample)
        return
    print("Currently this file only trains a new classifier; please pass --train to train a new classifier.")
if __name__ == "__main__":
    main()