| """ |
| This file implements Batch Normalization as described in the paper: |
| "Batch Normalization: Accelerating Deep Network Training |
| by Reducing Internal Covariate Shift" |
| by Sergey Ioffe, Christian Szegedy |
| |
| This implementation is useful for inputs NOT coming from convolution layers. |
| For convolution layers, use nn.SpatialBatchNormalization. |
| |
| The operation implemented is: |
         ( x - mean(x) )
y = -------------------------- * gamma + beta
      standard-deviation(x)
| where gamma and beta are learnable parameters. |
| |
| The learning of gamma and beta is optional. |
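
A rough torch-expression sketch of the same formula, for a 2D input x of shape
(batchSize, N) (illustrative only; the real computation is done by the backend):

    y = (x - x.mean(0)) / (x.std(0) + eps) * gamma + beta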
| |
| Usage: |
| with learnable parameters: nn.BatchNormalization(N [, eps] [, momentum]) |
| where N = dimensionality of input |
| without learnable parameters: nn.BatchNormalization(N [, eps] [, momentum], False) |
| |
| eps is a small value added to the standard-deviation to avoid divide-by-zero. |
| Defaults to 1e-5 |
| |
During training, this layer keeps a running estimate of its computed mean and std.
The running estimates are updated with a default momentum of 0.1 (unless overridden).
At test time, this running mean/std is used to normalize.
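
Example (a sketch, assuming the usual legacy nn Module interface with forward() / evaluate()):

    >>> bn = nn.BatchNormalization(10)        # 10 input features, learnable gamma/beta
    >>> y = bn.forward(torch.randn(32, 10))   # training mode: uses batch statistics
    >>> bn.evaluate()                         # switch to test mode
    >>> y = bn.forward(torch.randn(32, 10))   # now uses the running mean/var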
| """ |
| |
| import torch |
| from .Module import Module |
| from .utils import clear |
| |
| |
| class BatchNormalization(Module): |
| # expected dimension of input |
| nDim = 2 |
| |
| def __init__(self, nOutput, eps=1e-5, momentum=0.1, affine=True): |
| super(BatchNormalization, self).__init__() |
| assert nOutput != 0 |
| |
| self.affine = affine |
| self.eps = eps |
| self.train = True |
| self.momentum = momentum |
| self.running_mean = torch.zeros(nOutput) |
| self.running_var = torch.ones(nOutput) |
| |
        self.save_mean = None
        self.save_std = None
        self._input = None
        self._gradOutput = None
| |
| if self.affine: |
| self.weight = torch.Tensor(nOutput) |
| self.bias = torch.Tensor(nOutput) |
| self.gradWeight = torch.Tensor(nOutput) |
| self.gradBias = torch.Tensor(nOutput) |
| self.reset() |
| else: |
| self.weight = None |
| self.bias = None |
| self.gradWeight = None |
| self.gradBias = None |
| |
| def reset(self): |
| if self.weight is not None: |
| self.weight.uniform_() |
| |
| if self.bias is not None: |
| self.bias.zero_() |
| |
| self.running_mean.zero_() |
| self.running_var.fill_(1) |
| |
| def _checkInputDim(self, input): |
| if input.dim() != self.nDim: |
| raise RuntimeError( |
| 'only mini-batch supported ({}D tensor), got {}D tensor instead'.format(self.nDim, input.dim())) |
| if input.size(1) != self.running_mean.nelement(): |
| raise RuntimeError('got {}-feature tensor, expected {}'.format(input.size(1), self.running_mean.nelement())) |
| |
| def _makeContiguous(self, input, gradOutput=None): |
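        # The backend expects contiguous tensors; copy into cached buffers only when
        # the given tensors are non-contiguous, so the common case allocates nothing.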
| if not input.is_contiguous(): |
| if self._input is None: |
| self._input = input.new() |
| self._input.resize_as_(input).copy_(input) |
| input = self._input |
| |
| if gradOutput is not None: |
| if not gradOutput.is_contiguous(): |
| if self._gradOutput is None: |
| self._gradOutput = gradOutput.new() |
| self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) |
| gradOutput = self._gradOutput |
| |
| return input, gradOutput |
| |
| def updateOutput(self, input): |
| self._checkInputDim(input) |
| |
| input = self._makeContiguous(input)[0] |
| |
| self.output.resize_as_(input) |
| if self.save_mean is None: |
| self.save_mean = input.new() |
| self.save_mean.resize_as_(self.running_mean) |
| if self.save_std is None: |
| self.save_std = input.new() |
| self.save_std.resize_as_(self.running_var) |
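        # The backend kernel does the actual work: in training mode it normalizes with
        # per-feature batch statistics (saved into save_mean / save_std) and updates
        # running_mean / running_var using `momentum`; in evaluation mode it normalizes
        # with the stored running statistics instead.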
| |
| self._backend.BatchNormalization_updateOutput( |
| self._backend.library_state, |
| input, |
| self.output, |
| self.weight, |
| self.bias, |
| self.running_mean, |
| self.running_var, |
| self.save_mean, |
| self.save_std, |
| self.train, |
| self.momentum, |
| self.eps |
| ) |
| |
| return self.output |
| |
| def _backward(self, input, gradOutput, scale, gradInput=None, gradWeight=None, gradBias=None): |
| self._checkInputDim(input) |
| self._checkInputDim(gradOutput) |
        if getattr(self, 'save_mean', None) is None or getattr(self, 'save_std', None) is None:
| raise RuntimeError('you have to call updateOutput() at least once before backward()') |
| |
| input, gradOutput = self._makeContiguous(input, gradOutput) |
| |
| scale = scale or 1. |
| if gradInput is not None: |
| gradInput.resize_as_(gradOutput) |
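        # A single backend kernel computes any of gradInput / gradWeight / gradBias,
        # depending on which buffers are passed in; backward() below requests all three,
        # while updateGradInput() and accGradParameters() request only their part.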
| |
| self._backend.BatchNormalization_backward( |
| self._backend.library_state, |
| input, |
| gradOutput, |
| gradInput, |
| gradWeight, |
| gradBias, |
| self.weight, |
| self.running_mean, |
| self.running_var, |
| self.save_mean, |
| self.save_std, |
| self.train, |
| scale, |
| self.eps |
| ) |
| |
| return self.gradInput |
| |
| def backward(self, input, gradOutput, scale=1.): |
| return self._backward(input, gradOutput, scale, self.gradInput, self.gradWeight, self.gradBias) |
| |
| def updateGradInput(self, input, gradOutput): |
| return self._backward(input, gradOutput, 1., self.gradInput) |
| |
| def accGradParameters(self, input, gradOutput, scale=1.): |
| return self._backward(input, gradOutput, scale, None, self.gradWeight, self.gradBias) |
| |
    def read(self, file, version):
        super(BatchNormalization, self).read(file)
        if version < 2:
            # older checkpoints stored running_std (= 1 / sqrt(var + eps)); recover running_var
            if getattr(self, 'running_std', None) is not None:
                self.running_var = self.running_std.pow_(-2).add_(-self.eps)
                self.running_std = None
| |
| def clearState(self): |
| # first 5 buffers are not present in the current implementation, |
| # but we keep them for cleaning old saved models |
| clear(self, [ |
| 'buffer', |
| 'buffer2', |
| 'centered', |
| 'std', |
| 'normalized', |
| '_input', |
| '_gradOutput', |
| 'save_mean', |
| 'save_std', |
| ]) |
| return super(BatchNormalization, self).clearState() |
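

# For reference only: a rough, self-contained sketch of the training-mode computation
# that the backend performs in updateOutput (it is NOT used by this module). The
# helper name and the exact blending rule for the running statistics are illustrative
# assumptions, not the authoritative backend implementation.
def _batch_norm_training_sketch(x, gamma, beta, running_mean, running_var,
                                momentum=0.1, eps=1e-5):
    # per-feature statistics over the mini-batch dimension (x has shape [batchSize, N])
    mean = x.mean(0)
    var = ((x - mean) ** 2).mean(0)           # biased (population) variance
    x_hat = (x - mean) / (var + eps).sqrt()   # normalize each feature
    # blend the batch statistics into the running estimates (assumed convention:
    # new = (1 - momentum) * old + momentum * batch)
    running_mean.mul_(1 - momentum).add_(momentum * mean)
    running_var.mul_(1 - momentum).add_(momentum * var)
    return x_hat * gamma + beta               # learnable affine transform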