|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | from caffe2.python import schema | 
|  | from caffe2.python.layers.layers import ModelLayer | 
|  |  | 
|  | import numpy as np | 
|  |  | 
|  |  | 
class LayerNormalization(ModelLayer):
    """Layer-normalize a scalar record, then apply a learned scale and bias.

    Two graph constructions are available, selected by ``use_layer_norm_op``:

    * the fused path, which emits a single ``LayerNorm`` operator followed by
      legacy-broadcast ``Mul`` / ``Add``;
    * the alternative path, which assembles the normalization out of
      primitive operators and does not use legacy broadcast.

    Args:
        model: Model helper owning this layer. Forwarded to ``ModelLayer`` and
            used to register the epsilon global constant on the alternative
            path.
        input_record: Must be a ``schema.Scalar``; its field type shape is
            stored as ``self.input_shape``.
        name: Layer name forwarded to ``ModelLayer``.
        scale_optim: Optimizer passed to ``create_param`` for ``scale``.
        bias_optim: Optimizer passed to ``create_param`` for ``bias``.
        epsilon: On the fused path, passed to ``LayerNorm`` as ``epsilon``.
            On the alternative path, converted to ``float``, registered as a
            global constant and added to the mean of squared deviations.
        axis: ``axis`` argument given to ``LayerNorm``, ``Mul`` and ``Add`` on
            the fused path. The alternative path does not read it.
        use_layer_norm_op: Chooses the fused path when truthy.
        scale_init_value: Constant used to initialize ``scale``.
        **kwargs: Forwarded unchanged to ``ModelLayer.__init__``.

    Raises:
        AssertionError: If ``input_record`` is not a ``schema.Scalar``, if the
            record shape is empty, or if the alternative path is requested
            for a record shape whose length is not exactly 1.
    """

    def __init__(
        self,
        model,
        input_record,
        name='layer_normalization',
        scale_optim=None,
        bias_optim=None,
        epsilon=1e-4,
        axis=1,
        use_layer_norm_op=True,
        scale_init_value=1.0,
        **kwargs
    ):
        super(LayerNormalization, self).__init__(
            model, name, input_record, **kwargs)

        assert isinstance(input_record, schema.Scalar), (
            "Incorrect input type: {}".format(input_record))

        self.input_shape = input_record.field_type().shape
        self.axis = axis

        # NOTE(review): the messages below speak of "2D" tensors while the
        # checks are against length 1, which suggests the record shape omits
        # the leading batch dimension -- confirm against the schema docs.
        assert len(self.input_shape) >= 1, (
            "This layer supports only >= 2D tensors")
        leading_dim = self.input_shape[0]

        # The output has the same shape as the input and is always float32.
        output_blob = self.get_next_blob_reference('output')
        self.output_schema = schema.Scalar(
            (np.float32, self.input_shape), output_blob)

        # Both learned parameters are 1-D, sized by the first entry of the
        # record shape.
        self.scale = self.create_param(
            param_name='scale',
            shape=[leading_dim],
            initializer=('ConstantFill', {'value': scale_init_value}),
            optimizer=scale_optim,
        )
        self.bias = self.create_param(
            param_name='bias',
            shape=[leading_dim],
            initializer=('ConstantFill', {'value': 0.0}),
            optimizer=bias_optim,
        )
        self.use_layer_norm_op = use_layer_norm_op

        if not self.use_layer_norm_op:
            assert len(self.input_shape) == 1, (
                "When using alternative implementation, "
                "input data can only be 2D"
            )
            # The primitive-op path consumes epsilon as a blob (input to Add),
            # so it is registered as a global constant.
            self.epsilon = model.maybe_add_global_constant(
                "%s_epsilon" % self.name, float(epsilon)
            )
        else:
            # The fused operator takes epsilon as a plain argument.
            self.epsilon = epsilon

    def add_ops_with_layer_norm_op(self, net):
        """Emit the fused ``LayerNorm`` op, then scale and shift its output.

        Args:
            net: Net to which the operators are added.
        """
        source_blobs = self.input_record.field_blobs()
        final_blobs = self.output_schema.field_blobs()

        # LayerNorm is given three outputs; only the first is used below.
        layer_norm_outputs = [
            net.NextScopedBlob('ln_output'),
            net.NextScopedBlob('ln_mean'),
            net.NextScopedBlob('ln_stdev'),
        ]
        normalized, _mean, _stdev = net.LayerNorm(
            source_blobs,
            layer_norm_outputs,
            axis=self.axis,
            epsilon=self.epsilon,
        )

        scaled_blob = net.NextScopedBlob('ln_scaled')
        scaled = net.Mul(
            [normalized, self.scale],
            [scaled_blob],
            broadcast=1,
            axis=self.axis,
        )

        net.Add(
            [scaled, self.bias],
            final_blobs,
            broadcast=1,
            axis=self.axis,
        )

    def add_ops_without_layer_norm_op(self, net):
        """Emit layer normalization built from primitive operators.

        This path differs from the fused one in two respects:
          1. several ops are chained to replace the ``LayerNorm`` operator;
          2. legacy broadcast (``broadcast=1``) is not used.

        Args:
            net: Net to which the operators are added.
        """
        normalized = net.NextScopedBlob("ln_output")
        mean = net.NextScopedBlob("ln_mean")
        stdev = net.NextScopedBlob("ln_stdev")
        mean_reduced = net.NextScopedBlob("ln_mean_arr")
        source_blobs = self.input_record.field_blobs()

        # Mean of the input, with a dimension re-inserted at index 1 before
        # it is subtracted from the input.
        net.ReduceBackMean(source_blobs, [mean_reduced])
        net.ExpandDims([mean_reduced], [mean], dims=[1])
        centered = net.NextScopedBlob("ln_centered")
        net.Sub(source_blobs + [mean], [centered])

        # Mean of squared deviations, plus the epsilon constant blob.
        squared = net.NextScopedBlob("ln_sqr")
        net.Sqr([centered], [squared])
        squared_mean = net.NextScopedBlob("ln_sqr_mean")
        net.ReduceBackMean([squared], [squared_mean])
        variance = net.NextScopedBlob("ln_var")
        net.Add([squared_mean, self.epsilon], variance)

        # Square root of the above, expanded the same way as the mean.
        stdev_reduced = net.NextScopedBlob("ln_std_arr")
        net.Pow([variance], [stdev_reduced], exponent=0.5)
        net.ExpandDims([stdev_reduced], [stdev], dims=[1])

        # Normalize, then apply the learned scale and bias.
        net.Div([centered, stdev], [normalized])
        scaled = net.NextScopedBlob("ln_scaled")
        net.Mul([normalized, self.scale], [scaled])
        net.Add([scaled, self.bias], self.output_schema.field_blobs())

    def add_ops(self, net):
        """Add this layer's operators to ``net`` using the configured path.

        Args:
            net: Net to which the operators are added.
        """
        if not self.use_layer_norm_op:
            self.add_ops_without_layer_norm_op(net)
        else:
            self.add_ops_with_layer_norm_op(net)