# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Example script for exporting a ConvEmformer-based encoder model to an
# ExecuTorch flatbuffer via the Cadence AoT flow.

import logging
from typing import List, Optional, Tuple

import torch

# Importing ops_registrations registers the Cadence custom operators with
# PyTorch so they are available during export.
from executorch.backends.cadence.aot.ops_registrations import *  # noqa
from executorch.backends.cadence.aot.export_example import export_model
from torchaudio.prototype.models import ConvEmformer


FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)


if __name__ == "__main__":

    class _TimeReduction(torch.nn.Module):
        """Stacks every `stride` consecutive frames into one frame along the
        feature dimension, reducing the sequence length by `stride`."""

        def __init__(self, stride: int) -> None:
            super().__init__()
            self.stride = stride

        def forward(
            self, input: torch.Tensor, lengths: torch.Tensor
        ) -> Tuple[torch.Tensor, torch.Tensor]:
            B, T, D = input.shape
            # Drop trailing frames so T is divisible by the stride.
            num_frames = T - (T % self.stride)
            input = input[:, :num_frames, :]
            lengths = lengths.div(self.stride, rounding_mode="trunc")
            T_max = num_frames // self.stride

            # (B, T, D) -> (B, T // stride, D * stride)
            output = input.reshape(B, T_max, D * self.stride)
            output = output.contiguous()
            return output, lengths

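    # Quick shape check for _TimeReduction (an illustrative sanity check
    # added here; not part of the original export flow). With stride=4, a
    # (1, 10, 80) input is truncated to 8 frames and stacked to (1, 2, 320),
    # and lengths are floor-divided by the stride.
    _tr_out, _tr_lens = _TimeReduction(stride=4)(
        torch.rand(1, 10, 80), torch.tensor([10])
    )
    assert _tr_out.shape == (1, 2, 320)
    assert _tr_lens.item() == 2
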
    class ConvEmformerEncoder(torch.nn.Module):
        def __init__(
            self,
            *,
            input_dim: int,
            output_dim: int,
            segment_length: int,
            kernel_size: int,
            right_context_length: int,
            time_reduction_stride: int,
            transformer_input_dim: int,
            transformer_num_heads: int,
            transformer_ffn_dim: int,
            transformer_num_layers: int,
            transformer_left_context_length: int,
            transformer_dropout: float = 0.0,
            transformer_activation: str = "relu",
            transformer_max_memory_size: int = 0,
            transformer_weight_init_scale_strategy: str = "depthwise",
            transformer_tanh_on_mem: bool = False,
        ) -> None:
            super().__init__()
            self.time_reduction = _TimeReduction(time_reduction_stride)
            self.input_linear = torch.nn.Linear(
                input_dim * time_reduction_stride,
                transformer_input_dim,
                bias=False,
            )
            # Segment and right-context lengths are given in input frames;
            # convert them to post-time-reduction frames for the transformer.
            self.transformer = ConvEmformer(
                transformer_input_dim,
                transformer_num_heads,
                transformer_ffn_dim,
                transformer_num_layers,
                segment_length // time_reduction_stride,
                kernel_size=kernel_size,
                dropout=transformer_dropout,
                ffn_activation=transformer_activation,
                left_context_length=transformer_left_context_length,
                right_context_length=right_context_length // time_reduction_stride,
                max_memory_size=transformer_max_memory_size,
                weight_init_scale_strategy=transformer_weight_init_scale_strategy,
                tanh_on_mem=transformer_tanh_on_mem,
                conv_activation="silu",
            )
            self.output_linear = torch.nn.Linear(transformer_input_dim, output_dim)
            self.layer_norm = torch.nn.LayerNorm(output_dim)

        def forward(
            self, input: torch.Tensor, lengths: torch.Tensor
        ) -> Tuple[torch.Tensor, torch.Tensor]:
            time_reduction_out, time_reduction_lengths = self.time_reduction(
                input, lengths
            )
            input_linear_out = self.input_linear(time_reduction_out)
            transformer_out, transformer_lengths = self.transformer(
                input_linear_out, time_reduction_lengths
            )
            output_linear_out = self.output_linear(transformer_out)
            layer_norm_out = self.layer_norm(output_linear_out)
            return layer_norm_out, transformer_lengths

        @torch.jit.export
        def infer(
            self,
            input: torch.Tensor,
            lengths: torch.Tensor,
            states: Optional[List[List[torch.Tensor]]],
        ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
            time_reduction_out, time_reduction_lengths = self.time_reduction(
                input, lengths
            )
            input_linear_out = self.input_linear(time_reduction_out)
            (
                transformer_out,
                transformer_lengths,
                transformer_states,
            ) = self.transformer.infer(input_linear_out, time_reduction_lengths, states)
            output_linear_out = self.output_linear(transformer_out)
            layer_norm_out = self.layer_norm(output_linear_out)
            return layer_norm_out, transformer_lengths, transformer_states

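    # Streaming usage sketch (a hedged illustration only; this script exports
    # the non-streaming `forward`). `infer` consumes one segment plus right
    # context per call and threads the Emformer `states` between calls:
    #
    #   states = None
    #   for chunk, chunk_lengths in stream:  # hypothetical input iterator
    #       out, out_lengths, states = encoder.infer(chunk, chunk_lengths, states)
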
    # Instantiate model
    time_reduction_stride = 4
    encoder = ConvEmformerEncoder(
        input_dim=80,
        output_dim=256,
        segment_length=4 * time_reduction_stride,
        kernel_size=7,
        right_context_length=1 * time_reduction_stride,
        time_reduction_stride=time_reduction_stride,
        transformer_input_dim=128,
        transformer_num_heads=4,
        transformer_ffn_dim=512,
        transformer_num_layers=1,
        transformer_left_context_length=10,
        transformer_tanh_on_mem=True,
    )

    # Batch size
    batch_size = 1

    max_input_length = 100
    input_dim = 80
    # Must match the encoder's right_context_length (in input frames).
    right_context_length = 4

    # Dummy inputs: the input carries right-context frames beyond the
    # utterance length, as the Emformer forward pass expects.
    transcriber_input = torch.rand(
        batch_size, max_input_length + right_context_length, input_dim
    )
    transcriber_lengths = torch.randint(1, max_input_length + 1, (batch_size,))

    example_inputs = (
        transcriber_input,
        transcriber_lengths,
    )
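
    # Run the eager model once before export (a hedged sanity check added
    # for illustration; export_model does not require it) to confirm the
    # output shapes look right.
    with torch.no_grad():
        eager_out, eager_lengths = encoder(*example_inputs)
    logging.info(
        "Eager output shape: %s, lengths: %s", eager_out.shape, eager_lengths
    )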

    export_model(encoder, example_inputs)