blob: 0a892367eec31f0e818792902e06dc26e7711b8c [file] [log] [blame]
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torch.nn.functional as F
from torch import nn
class ASRJoiner(nn.Module):
    """
    ASR joiner implementation following the code in https://fburl.com/code/ierfau7c

    Having a local implementation has the benefit that we don't need to pull in
    the heavy dependencies and wait for a few minutes to run tests.
    """

    def __init__(self, B=1, H=10, T=1, U=1, D=768, output_dim=4096) -> None:
        """
        B: source batch size
        H: number of hypotheses for beam search
        T: source sequence length
        U: target sequence length
        D: encoding (some sort of embedding?) dimension
        output_dim: size of the joiner's output projection; defaults to 4096
            to match the original module being mirrored.
        """
        super().__init__()
        self.B, self.H, self.T, self.U, self.D = B, H, T, U, D
        # The mirrored module looks like:
        # SequentialContainer(
        #   (module_list): ModuleList(
        #     (0): ReLULayer(inplace=False)
        #     (1): LinearLayer(input_dim=768, output_dim=4096, bias=True,
        #          context_dim=0, pruning_aware_training=False,
        #          parameter_noise=0.1, qat_qconfig=None, freeze_rex_pattern=None)
        #   )
        # )
        self.module = nn.Sequential(
            nn.ReLU(),
            nn.Linear(D, output_dim),
        )

    def forward(self, src_encodings, src_lengths, tgt_encodings, tgt_lengths):
        """
        Combine source and target encodings into joint log-probabilities.

        One simplification we make here is we assume src_encodings and
        tgt_encodings are not None. In the original implementation, either
        can be None.

        src_encodings: (B, T, D)
        src_lengths: (B,)
        tgt_encodings: (B*H, U, D) — H hypotheses per source utterance
        tgt_lengths: passed through unchanged

        Returns:
            output: (B*H, T, U, output_dim) log-probabilities
            src_lengths: (B*H,) — each source length repeated H times
            tgt_lengths: unchanged
        """
        B = src_encodings.shape[0]
        # Hypotheses per utterance; assumes the target batch is an exact
        # multiple of the source batch (integer division truncates otherwise).
        H = tgt_encodings.shape[0] // B
        # Index [0,...,0, 1,...,1, ...] (each source row repeated H times) so
        # that source row i lines up with its H beam hypotheses in the target.
        new_order = (
            (torch.arange(B).view(-1, 1).repeat(1, H).view(-1))
            .long()
            .to(device=src_encodings.device)
        )
        # src_encodings: (B, T, D) -> (B*H, T, D)
        src_encodings = torch.index_select(
            src_encodings, dim=0, index=new_order
        ).contiguous()
        # src_lengths: (B,) -> (B*H,)
        src_lengths = torch.index_select(
            src_lengths, dim=0, index=new_order.to(device=src_lengths.device)
        )
        # src_encodings: (B*H, T, D) -> (B*H, T, 1, D)
        src_encodings = src_encodings.unsqueeze(dim=2).contiguous()
        # tgt_encodings: (B*H, U, D) -> (B*H, 1, U, D)
        tgt_encodings = tgt_encodings.unsqueeze(dim=1).contiguous()
        # Broadcast-add: joint_encodings: (B*H, T, U, D)
        joint_encodings = src_encodings + tgt_encodings
        # Project to output_dim and normalize over the vocabulary axis.
        output = F.log_softmax(self.module(joint_encodings), dim=-1)
        return output, src_lengths, tgt_lengths

    def get_random_inputs(self):
        """Build random tensors shaped to match forward()'s expectations."""
        # NOTE(review): tgt_lengths here is (B,) while tgt_encodings has batch
        # B*H; forward() never reads tgt_lengths, so this is harmless, but the
        # shape looks inconsistent — confirm against the original code.
        return (
            torch.rand(self.B, self.T, self.D),
            torch.randint(0, 10, (self.B,)),
            torch.rand(self.B * self.H, self.U, self.D),
            torch.randint(0, 10, (self.B,)),
        )