## @package crf
# Module caffe2.python.crf


import numpy as np
from caffe2.python import brew, core, model_helper, recurrent


"""
Due to a limitation in RecurrentNetworkOp, this layer only supports
batch_size=1. In order to support batch_size > 1, we would have to implement
the CRFUnit and its gradient in C++ and handle the different batches there.
"""

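# Usage sketch (illustrative only): `tagger`, `predictions`, and `labels` are
# hypothetical names; `predictions` is assumed to be a T x num_classes blob of
# per-position logits, with batch_size=1 as noted above.
#
#   model = model_helper.ModelHelper(name="tagger")
#   crf = CRFWithLoss(model, num_classes=10)
#   loss = crf.crf_loss(predictions, labels)
#   decoded = crf.update_predictions(predictions)
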
class CRFWithLoss(object):
    def __init__(self, model, num_classes, transitions_blob=None):
        self.model = model
        self.num_classes = num_classes
        self.num_classes_padded = num_classes + 2  # After adding BOS and EOS
        if not transitions_blob:
            transitions_blob = self.model.param_init_net.UniformFill(
                [],
                [core.ScopedBlobReference("crf_transitions")],
                shape=[self.num_classes_padded, self.num_classes_padded],
                min=-1.0,
                max=1.0,
            )
        self.transitions = transitions_blob
        self.model.params.append(self.transitions)

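    # The CRF loss is the negative log-likelihood of the gold label path:
    # loss = log Z - score(gold path), where log Z is the log-sum-exp score
    # of all possible paths and score(gold path) is the sum of its unary
    # (emission) and binary (transition) scores.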
    def crf_loss(self, predictions, labels, seq_lengths=None):
        # Since the transitions matrix is a shared parameter, we need to
        # take a snapshot of it at the beginning, since it can be updated
        # in between the operators that use it when doing parallel updates
        transitions_snapshot = self.model.net.Copy(
            self.transitions, core.ScopedBlobReference("transitions_snapshot")
        )
        # Compute the best path unary score from the logits
        path_unary_score = self._gather_entries_sum(
            predictions, labels, self.num_classes
        )
        # Append BOS and EOS entries to the predictions and labels
        predictions = CRFWithLoss.pad_predictions(
            predictions, self.model.param_init_net, self.model.net, self.num_classes
        )
        labels = CRFWithLoss.pad_labels(
            labels, self.model.param_init_net, self.model.net, self.num_classes
        )
        # Compute the best path binary scores from the transitions matrix
        path_binary_score = self._path_binary_scores(
            labels, transitions_snapshot, seq_lengths
        )
        path_total_score = self.model.net.Add(
            [path_binary_score, path_unary_score],
            core.ScopedBlobReference("path_total"),
        )
        # Compute the score of all paths
        zero_index = self.model.param_init_net.ConstantFill([], shape=[1], value=0)
        initial_state = self.model.net.Gather(
            [predictions, zero_index],
            core.ScopedBlobReference("rnn_initial"),
            dense_gradient=True,
        )
        input_data, _ = self.model.net.RemovePadding(
            [predictions], padding_width=1, end_padding_width=0, outputs=2
        )
        input_data = self.model.net.ExpandDims(
            [input_data], core.ScopedBlobReference("rnn_input_data"), dims=[1]
        )
        # Due to a bug in RecurrentNetworkGradientOp, we need to copy the
        # transitions blob before sending it to the recurrent network
        transitions_copy = self.model.net.Copy(
            transitions_snapshot, core.ScopedBlobReference("transitions_copy")
        )
        all_paths_scores = self._crf_forward(
            input_data, initial_state, transitions_copy
        )
        loss = self.model.net.Sub(
            [all_paths_scores, path_total_score], core.ScopedBlobReference("crf_loss")
        )
        return loss

    def _path_binary_scores(self, labels, transitions, seq_lengths=None):
        column_ids, _ = self.model.net.RemovePadding(
            [labels], outputs=2, padding_width=1, end_padding_width=0
        )
        row_ids, _ = self.model.net.RemovePadding(
            [labels], outputs=2, padding_width=0, end_padding_width=1
        )
        # Since there is no multi-dimensional gather, flatten the matrix to
        # a 1-d vector, transform the ids to (row_ids * num_columns +
        # column_ids), and gather in 1-d
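        # For example, with num_classes_padded = 5, the transition score from
        # tag 2 to tag 3 sits at flat index 2 * 5 + 3 = 13.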
        num_columns_blob = self.model.net.ConstantFill(
            [row_ids], value=self.num_classes_padded
        )
        flattened_ids = self.model.net.Mul([row_ids, num_columns_blob])
        flattened_ids = self.model.net.Add([flattened_ids, column_ids])
        flattened_transitions = self.model.net.FlattenToVec([transitions])
        entries = self.model.net.Gather(
            [flattened_transitions, flattened_ids], dense_gradient=True
        )
        return self.model.ReduceFrontSum(entries)

    def _gather_entries_sum(self, in_data, indices, index_size):
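        # Computes sum_i in_data[i, indices[i]] by flattening a one-hot
        # encoding of the indices and dot-multiplying it with the flattened
        # data; assuming in_data has index_size columns (as at the call site),
        # this matches np.sum(in_data[np.arange(len(indices)), indices]).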
        indices = self.model.net.Cast([indices], to="int64")
        index_size_blob = self.model.param_init_net.ConstantFill(
            [], shape=[1], value=index_size
        )
        query_one_hot = self.model.net.OneHot([indices, index_size_blob])
        flattened_query = self.model.net.FlattenToVec(query_one_hot)
        flattened_data = self.model.net.FlattenToVec(in_data)
        query_scores = self.model.net.DotProduct([flattened_query, flattened_data])
        final_sum = self.model.net.ReduceFrontSum([query_scores])
        return final_sum

    def _crf_forward(
        self, input_blob, initial_state, transitions_copy, seq_lengths=None
    ):
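        # Forward algorithm in log space: alpha_0 = x_0 and
        # alpha_t(j) = x_t(j) + logsumexp_i(alpha_{t-1}(i) + A(i, j)),
        # so the returned score is log Z = logsumexp_j(alpha_T(j)).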
        # Build the RNN net and get the last timestep output
        out_last = self.build_crf_net(input_blob, initial_state, transitions_copy)
        out_last, _ = self.model.net.Reshape(
            [out_last], outputs=2, shape=(self.num_classes_padded,)
        )
        zero_segment_id = self.model.param_init_net.ConstantFill(
            [], value=0, shape=[self.num_classes_padded], dtype=core.DataType.INT32
        )

        # Compute the accumulated total score of all the paths
        accum_score = self.model.net.SortedSegmentRangeLogSumExp(
            [out_last, zero_segment_id]
        )
        accum_score, _ = self.model.net.Reshape(accum_score, outputs=2, shape=())
        return accum_score

    def build_crf_net(self, input_blob, initial_state, transitions):
        """
        Adds the crf_net recurrent operator to self.model.

        input_blob: the input sequence in a T x N x D format,
        where T is the sequence length, N the batch size, and D the input
        dimension ##Only supports batch-size 1##

        initial_state: the CRF scores of the first (BOS) timestep, used as
        the initial cell state

        transitions: the transitions matrix blob (a copy; see crf_loss)
        """

        scope = "crf_net"

        def s(name):
            # We have to manually scope due to our internal/external blob
            # relationships.
            return "{}/{}".format(str(scope), str(name))
        step_model = model_helper.ModelHelper(name="crf_step", param_model=self.model)
        input_t, cell_t_prev, _ = step_model.net.AddExternalInputs(
            core.ScopedBlobReference("input_t"),
            core.ScopedBlobReference("cell_t_prev"),
            transitions,
        )
        zero_segment_id = step_model.param_init_net.ConstantFill(
            [],
            [s("zero_segment_id")],
            value=0,
            shape=[self.num_classes_padded],
            dtype=core.DataType.INT32,
        )

        # A hack to bypass model cloning for test
        step_model.param_init_net.AddExternalOutput(zero_segment_id)
        """ the CRF step """
        # Do tile
        prev_transpose = brew.transpose(
            step_model, cell_t_prev, [s("prev_transpose")], axes=(0, 2, 1)
        )
        prev_tiled = step_model.net.Tile(
            prev_transpose, [s("prev_tiled")], tiles=self.num_classes_padded, axis=2
        )
        input_t_tiled = step_model.net.Tile(
            input_t, [s("input_t_tiled")], tiles=self.num_classes_padded, axis=1
        )
        input_with_prev = step_model.net.Add(
            [prev_tiled, input_t_tiled], [s("input_with_prev")]
        )
        all_with_transitions = step_model.net.Add(
            [input_with_prev, transitions],
            [s("prev_with_transitions")],
            broadcast=1,
            use_grad_hack=1,
        )
        all_with_transitions_reshaped, _ = step_model.net.Reshape(
            all_with_transitions,
            [s("all_with_transitions_reshaped"), s("all_with_transitions_orig")],
            shape=(self.num_classes_padded, self.num_classes_padded),
        )
        cell_t = step_model.net.SortedSegmentRangeLogSumExp(
            [all_with_transitions_reshaped, zero_segment_id], [s("cell_t")]
        )
        step_model.net.AddExternalOutputs(cell_t)
|  | """ recurrent network """ | 
|  | cell_input_blob = initial_state | 
|  | out_all, out_last = recurrent.recurrent_net( | 
|  | net=self.model.net, | 
|  | cell_net=step_model.net, | 
|  | inputs=[(input_t, input_blob)], | 
|  | initial_cell_inputs=[(cell_t_prev, cell_input_blob)], | 
|  | links={cell_t_prev: cell_t}, | 
|  | scope=scope, | 
|  | outputs_with_grads=(1,), | 
|  | ) | 
|  | return out_last | 
|  |  | 
    def update_predictions(self, classes):
        def crf_update_predictions_op(inputs, outputs):
            # This operator computes the best path of classes by performing
            # Viterbi decoding and then updates the predictions so that the
            # tag on the best path has the highest score among all tags
            predictions = inputs[0].data
            transitions = inputs[1].data
            predictions_shape = inputs[0].shape
            outputs[0].reshape(predictions_shape)

            trellis = np.zeros(predictions_shape)
            backpointers = np.zeros(predictions_shape, dtype=np.int32)
            trellis[0] = predictions[0]

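            # Viterbi recurrence: trellis[t, j] = predictions[t, j] +
            # max_i (trellis[t - 1, i] + transitions[i, j]); backpointers
            # record the argmax predecessor of each tag.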
            for t in range(1, predictions_shape[0]):
                v = np.expand_dims(trellis[t - 1], 1) + transitions
                trellis[t] = predictions[t] + np.max(v, 0)
                backpointers[t] = np.argmax(v, 0)

            viterbi = [np.argmax(trellis[-1])]
            for bp in reversed(backpointers[1:]):
                viterbi.append(bp[viterbi[-1]])
            viterbi.reverse()

            new_predictions = np.zeros(predictions_shape)
            old_bests = []
            for i, w_predictions in enumerate(predictions):
                # Get the current tag with the maximum score
                old_best = np.argmax(w_predictions)
                old_bests.append(old_best)
                # Swap the scores of the current best tag and the tag on the
                # Viterbi path
                w_predictions[viterbi[i]], w_predictions[old_best] = (
                    w_predictions[old_best],
                    w_predictions[viterbi[i]],
                )
                new_predictions[i] = w_predictions
            # Remove the BOS and EOS entries from the predictions matrix
            orig_predictions = new_predictions[1:-1, 0:-2]
            outputs[0].reshape(orig_predictions.shape)
            outputs[0].data[...] = orig_predictions

        padded_classes = CRFWithLoss.pad_predictions(
            classes, self.model.param_init_net, self.model.net, self.num_classes
        )
        new_classes = self.model.net.Python(crf_update_predictions_op)(
            [padded_classes, self.transitions],
            core.ScopedBlobReference("post_crf_classes"),
        )
        return new_classes

    @staticmethod
    def pad_labels(labels, init_net, net, num_classes):
        bos_i = num_classes
        eos_i = num_classes + 1
        bos_i_b = init_net.ConstantFill([], shape=[1], value=bos_i)
        eos_i_b = init_net.ConstantFill([], shape=[1], value=eos_i)
        labels = net.Cast([labels], to="int64")
        padded_labels, _ = net.Concat([bos_i_b, labels, eos_i_b], axis=0, outputs=2)
        return padded_labels

    @staticmethod
    def pad_predictions(predictions, init_net, net, num_classes):
        # This function introduces two labels for the beginning of sequence
        # and the end of sequence; it makes the necessary updates to the
        # predictions blob

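        # For example, with num_classes = 3 and a sequence of length T, the
        # T x 3 predictions blob becomes (T + 2) x 5: two low-score columns
        # are appended for the BOS/EOS tags, then a BOS row (score 0 only at
        # the BOS tag) is prepended and an EOS row (score 0 only at the EOS
        # tag) is appended.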
        low_score = -1000.0  # An arbitrary, very low number
        b_scores = np.array([[low_score] * num_classes + [0, low_score]]).astype(
            np.float32
        )

        e_scores = np.array([[low_score] * num_classes + [low_score, 0]]).astype(
            np.float32
        )

        b_scores = init_net.GivenTensorFill(
            [], "b_scores", shape=[1, num_classes + 2], values=b_scores
        )
        e_scores = init_net.GivenTensorFill(
            [], "e_scores", shape=[1, num_classes + 2], values=e_scores
        )

        zero_index = net.ConstantFill([], shape=[1], value=0)
        length = net.Gather([net.Shape([predictions]), zero_index])
        length = net.Cast(length, to="int32")
        t_range = net.LengthsRangeFill(length)
        padding = net.ConstantFill([t_range], value=low_score)
        padding = net.ExpandDims(padding, dims=[1])
        padded_predictions, _ = net.Concat(
            [predictions, padding, padding], outputs=2, axis=1
        )
        padded_predictions_concat, _ = net.Concat(
            [b_scores, padded_predictions, e_scores], outputs=2, axis=0
        )
        return padded_predictions_concat