Added versioning to ADD/SUB and reworked the existing 16-bit code.

The 16-bit ADD/SUB kernels now distinguish general (non-power-of-two) rescaling
from the existing POT-with-zero-offset path. The new general path is gated
behind new op versions (ADD 4, SUB 5), and the version inference in
tools/versioning is updated to pick the right one.
diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index 731c2fb..eb53b7c 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -100,19 +100,26 @@
 
   // 8bit -> 8bit general quantized path, with general rescalings
   // as well as, 16bit -> 16bit with general rescalings
-  bool pot_scale_16bit = false;
+  bool pot_scale_16bit = true;
 
   bool input1_scale_is_pot = false;
   bool input2_scale_is_pot = false;
   bool output_scale_is_pot = false;
 
-  int input1_scale_log2_rounded;
-  int input2_scale_log2_rounded;
-  int output_scale_log2_rounded;
+  int input1_scale_log2_rounded{0};
+  int input2_scale_log2_rounded{0};
+  int output_scale_log2_rounded{0};
 
   if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
       output->type == kTfLiteInt16) {
-    // Check that param scale is POT
+    // In the case of 16-bit, there are two implementations:
+    // one where the scale parameter is a general number, and
+    // one where the scale parameter is a power of two (POT) and
+    // the zero_point is zero for the inputs and output.
+    pot_scale_16bit = (input1->params.zero_point == 0) &&
+                      (input2->params.zero_point == 0) &&
+                      (output->params.zero_point == 0);
+
     input1_scale_is_pot =
         CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
 
@@ -122,14 +129,14 @@
     output_scale_is_pot =
         CheckedLog2(output->params.scale, &output_scale_log2_rounded);
 
-    pot_scale_16bit = input1_scale_log2_rounded && input2_scale_log2_rounded &&
-                      output_scale_log2_rounded;
+    pot_scale_16bit &=
+        input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
   }
 
   data->pot_scale_16bit = pot_scale_16bit;
 
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      pot_scale_16bit) {
+      !pot_scale_16bit) {
     // 8bit -> 8bit general quantized path, with general rescalings
     // as well as, 16bit -> 16bit with general rescalings
     data->input1_offset = -input1->params.zero_point;
@@ -139,7 +146,7 @@
     // The shift is set to 15 for 16-bit and 20 in case of 8-bit, accordingly.
     // In case of 16-bit we have 65535 << 15 which is less than 1 << 31,
     // therefore the addition will still fit in a 32 bit accumulator.
-    data->left_shift = pot_scale_16bit ? 15 : 20;
+    data->left_shift = !pot_scale_16bit ? 15 : 20;
     const double twice_max_input_scale =
         2 * std::max(input1->params.scale, input2->params.scale);
     const double real_input1_multiplier =
@@ -252,7 +259,7 @@
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      data->pot_scale_16bit) {
+      !data->pot_scale_16bit) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
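The overflow argument in the `left_shift` comment above can be checked at compile time. A standalone sketch, not part of the patch (the 16-bit bound is the one stated in the comment; the 8-bit bound is mine by analogy):

```cpp
#include <cstdint>

// The general quantized path shifts both inputs left before rescaling.
// With shift 15, the widest 16-bit magnitude still fits in an int32
// accumulator; with shift 20, the same holds for 8-bit data.
static_assert(static_cast<int64_t>(65535) << 15 < (int64_t{1} << 31),
              "16-bit operands shifted by 15 fit in a 32-bit accumulator");
static_assert(static_cast<int64_t>(255) << 20 < (int64_t{1} << 31),
              "8-bit operands shifted by 20 fit in a 32-bit accumulator");
```
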
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index 626c092..7b7b60a 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -88,7 +88,7 @@
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_ADD, Register_ADD(),
              /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 4);
   AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND(),
              /* min_version */ 1,
              /* max_version */ 3);
@@ -139,7 +139,7 @@
   AddBuiltin(BuiltinOperator_DIV, Register_DIV());
   AddBuiltin(BuiltinOperator_SUB, Register_SUB(),
              /* min_version */ 1,
-             /* max_version */ 3);
+             /* max_version */ 5);
   AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT(), /* min_version */ 1,
              /* max_version */ 3);
   AddBuiltin(BuiltinOperator_SPLIT_V, Register_SPLIT_V(),
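Widening `max_version` means an op resolver built from these registrations will now resolve the new versions. A minimal sketch of how that can be checked (the resolver API is the existing `MutableOpResolver::FindOp`):

```cpp
#include "tensorflow/lite/kernels/register.h"

bool NewAddVersionResolves() {
  tflite::ops::builtin::BuiltinOpResolver resolver;
  // FindOp() returns a registration only when the requested version falls
  // inside the [min_version, max_version] range registered above, so this
  // should now be non-null for ADD version 4 (and null for version 5).
  return resolver.FindOp(tflite::BuiltinOperator_ADD, /*version=*/4) != nullptr;
}
```
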
diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
index 5845815..2c126c6 100644
--- a/tensorflow/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -225,19 +225,26 @@
 
   // 8bit -> 8bit general quantized path, with general rescalings
   // as well as, 16bit -> 16bit with general rescalings
-  bool pot_scale_16bit = false;
+  bool pot_scale_16bit = true;
 
   bool input1_scale_is_pot = false;
   bool input2_scale_is_pot = false;
   bool output_scale_is_pot = false;
 
-  int input1_scale_log2_rounded;
-  int input2_scale_log2_rounded;
-  int output_scale_log2_rounded;
+  int input1_scale_log2_rounded{0};
+  int input2_scale_log2_rounded{0};
+  int output_scale_log2_rounded{0};
 
   if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
       output->type == kTfLiteInt16) {
-    // Check that param scale is POT
+    // In the case of 16-bit, there are two implementations:
+    // one where the scale parameter is a general number, and
+    // one where the scale parameter is a power of two (POT) and
+    // the zero_point is zero for the inputs and output.
+    pot_scale_16bit = (input1->params.zero_point == 0) &&
+                      (input2->params.zero_point == 0) &&
+                      (output->params.zero_point == 0);
+
     input1_scale_is_pot =
         CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
 
@@ -247,14 +254,14 @@
     output_scale_is_pot =
         CheckedLog2(output->params.scale, &output_scale_log2_rounded);
 
-    pot_scale_16bit = input1_scale_log2_rounded && input2_scale_log2_rounded &&
-                      output_scale_log2_rounded;
+    pot_scale_16bit &=
+        input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
   }
 
   data->pot_scale_16bit = pot_scale_16bit;
 
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      pot_scale_16bit) {
+      !pot_scale_16bit) {
     TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
                                                    output, params, data, -1));
   } else if (output->type == kTfLiteInt16) {
@@ -348,7 +355,7 @@
     } else {
       TF_LITE_SUB(reference_integer_ops, Add, int8_t);
     }
-  } else if (data->pot_scale_16bit) {
+  } else if (!data->pot_scale_16bit) {
     if (need_broadcast) {
       TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
     } else {
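Both kernels gate on `CheckedLog2()` from `kernels/internal/quantization_util.h`. For reference, a self-contained sketch that mirrors its behaviour (tolerance value assumed):

```cpp
#include <cmath>

// Sketch of the power-of-two test: returns true when `scale` is within a
// small tolerance of 2^n, writing the rounded exponent to `log2_result`.
bool CheckedLog2Sketch(float scale, int* log2_result) {
  const float x_log2 = std::log(scale) * (1.0f / std::log(2.0f));
  const float x_log2_rounded = std::round(x_log2);
  *log2_result = static_cast<int>(x_log2_rounded);
  return std::abs(x_log2 - x_log2_rounded) < 1e-3f;
}

// Example: the canonical symmetric 16-bit scale 1/32768 is POT (exponent -15)
// and stays on the old path; a calibrated scale like 0.003 is not POT and now
// takes the general-rescaling path.
```
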
diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc
index bbec4f9..7b7cd40 100644
--- a/tensorflow/lite/toco/tflite/op_version.cc
+++ b/tensorflow/lite/toco/tflite/op_version.cc
@@ -49,11 +49,16 @@
           {{OperatorType::kDepthwiseConv, 3}, "1.14.0"},
           {{OperatorType::kAdd, 1}, "1.5.0"},
           {{OperatorType::kAdd, 2}, "1.14.0"},
+          {{OperatorType::kAdd, 3}, "1.15.0"},
+          {{OperatorType::kAdd, 4}, kPendingReleaseOpVersion},
           {{OperatorType::kAddN, 1}, "1.14.0"},
           {{OperatorType::kSpaceToBatchND, 1}, "1.6.0"},
           {{OperatorType::kSpaceToBatchND, 2}, "1.14.0"},
           {{OperatorType::kSub, 1}, "1.6.0"},
           {{OperatorType::kSub, 2}, "1.14.0"},
+          {{OperatorType::kSub, 3}, "1.15.0"},
+          {{OperatorType::kSub, 4}, "1.15.0"},
+          {{OperatorType::kSub, 5}, kPendingReleaseOpVersion},
           {{OperatorType::kDiv, 1}, "1.6.0"},
           {{OperatorType::kBatchToSpaceND, 1}, "1.6.0"},
           {{OperatorType::kBatchToSpaceND, 2}, "1.14.0"},
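These table entries feed the converter's minimum-runtime computation; versions tagged `kPendingReleaseOpVersion` only resolve to a concrete release string once the next TF release is cut. A hedged usage sketch, assuming the existing toco API:

```cpp
#include <string>

#include "tensorflow/lite/toco/tflite/op_version.h"

// Sketch: for a converted `model` containing the new general-rescaling
// 16-bit ADD, the minimum runtime reported here should track the pending
// release until it gets a concrete version string.
std::string MinRuntime(const toco::Model& model) {
  return toco::tflite::GetMinimumRuntimeVersionForModel(model);
}
```
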
diff --git a/tensorflow/lite/tools/versioning/BUILD b/tensorflow/lite/tools/versioning/BUILD
index 1ba221d..23f3a45 100644
--- a/tensorflow/lite/tools/versioning/BUILD
+++ b/tensorflow/lite/tools/versioning/BUILD
@@ -22,6 +22,7 @@
         "//tensorflow/core:tflite_portable_logging",
         "//tensorflow/lite:minimal_logging",
         "//tensorflow/lite/kernels/internal:compatibility",
+        "//tensorflow/lite/kernels/internal:quantization_util",
         "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/schema:schema_fbs_with_mutable",
         "@com_google_absl//absl/memory",
diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc
index 8598c9c..a4288ed 100644
--- a/tensorflow/lite/tools/versioning/op_version.cc
+++ b/tensorflow/lite/tools/versioning/op_version.cc
@@ -24,6 +24,7 @@
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
 
 namespace tflite {
 namespace {
@@ -359,7 +360,29 @@
       }
       return 1;
 
+    case BuiltinOperator_ADD:
+      if (op_sig.input_types.at(0) == TensorType_INT16 &&
+          op_sig.output_types.at(0) == TensorType_INT16) {
+        if (op_sig.options.addsub.pot_scale_int16) {
+          return 3;
+        } else {
+          return 4;
+        }
+      }
+      if (op_sig.input_types.at(0) == TensorType_INT8) {
+        return 2;
+      }
+      return 1;
+
     case BuiltinOperator_SUB:
+      if (op_sig.input_types.at(0) == TensorType_INT16 &&
+          op_sig.output_types.at(0) == TensorType_INT16) {
+        if (op_sig.options.addsub.pot_scale_int16) {
+          return 4;
+        } else {
+          return 5;
+        }
+      }
       if (op_sig.options.broadcast.need_broadcast &&
           op_sig.options.broadcast.num_dims > 4) {
         return 3;
@@ -370,7 +393,6 @@
       return 1;
 
     case BuiltinOperator_AVERAGE_POOL_2D:
-    case BuiltinOperator_ADD:
     case BuiltinOperator_CONCATENATION:
     case BuiltinOperator_MAX_POOL_2D:
     case BuiltinOperator_PAD:
@@ -487,6 +509,53 @@
       }
     } break;
 
+    case BuiltinOperator_ADD:
+    case BuiltinOperator_SUB: {
+      op_sig.options.addsub.pot_scale_int16 = false;
+      const Tensor* input1_tensor =
+          subgraph->tensors()->Get(op->inputs()->Get(0));
+      const Tensor* input2_tensor =
+          subgraph->tensors()->Get(op->inputs()->Get(1));
+      const Tensor* output_tensor =
+          subgraph->tensors()->Get(op->outputs()->Get(0));
+      const QuantizationParameters* input1_quant =
+          input1_tensor->quantization();
+      const QuantizationParameters* input2_quant =
+          input2_tensor->quantization();
+      const QuantizationParameters* output_quant =
+          output_tensor->quantization();
+      if (input1_quant && input1_quant->scale() &&
+          input1_quant->scale()->Length() && input2_quant &&
+          input2_quant->scale() && input2_quant->scale()->Length() &&
+          output_quant && output_quant->scale() &&
+          output_quant->scale()->Length()) {
+        float input1_scale = input1_quant->scale()->Get(0);
+        float input2_scale = input2_quant->scale()->Get(0);
+        float output_scale = output_quant->scale()->Get(0);
+
+        int scale_log2_rounded = 0;
+        bool input1_scale_is_pot =
+            CheckedLog2(input1_scale, &scale_log2_rounded);
+
+        bool input2_scale_is_pot =
+            CheckedLog2(input2_scale, &scale_log2_rounded);
+
+        bool output_scale_is_pot =
+            CheckedLog2(output_scale, &scale_log2_rounded);
+
+        op_sig.options.addsub.pot_scale_int16 =
+            input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
+      }
+
+      if (op_code->builtin_code() == BuiltinOperator_SUB) {
+        op_sig.options.broadcast.need_broadcast =
+            !HaveSameShapes(subgraph, op, 0, 1);
+        op_sig.options.broadcast.num_dims =
+            std::max(GetNumDims(subgraph, op, 0), GetNumDims(subgraph, op, 1));
+      }
+
+    } break;
+
     case BuiltinOperator_LSTM: {
       auto lstm_option = op->builtin_options_as_LSTMOptions();
       if (lstm_option) {
@@ -512,7 +581,6 @@
       op_sig.options.space_batch.num_dims = GetNumDims(subgraph, op, 0);
     } break;
 
-    case BuiltinOperator_SUB:
     case BuiltinOperator_MAXIMUM:
     case BuiltinOperator_MINIMUM: {
       op_sig.options.broadcast.need_broadcast =
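Note the version mapping follows the toco table above: the pre-existing POT 16-bit path gets the lower new version (ADD 3, SUB 5 - 1), while the general-rescaling path introduced by this change gets the pending one (ADD 4, SUB 5). With the signature population in place, version inference can be exercised directly, in the style of the existing op_version_test.cc; a sketch:

```cpp
#include <vector>

#include "tensorflow/lite/tools/versioning/op_version.h"

// Sketch: a 16x16->16 ADD whose scales are not powers of two should report
// the new (pending) version 4, while a POT-scaled one reports 3.
int VersionForInt16Add(bool pot_scale) {
  tflite::OpSignature fake_op_sig = {
      .op = tflite::BuiltinOperator_ADD,
      .input_types = std::vector<tflite::TensorType>{
          tflite::TensorType_INT16, tflite::TensorType_INT16},
      .output_types =
          std::vector<tflite::TensorType>{tflite::TensorType_INT16},
  };
  fake_op_sig.options.addsub.pot_scale_int16 = pot_scale;
  return tflite::GetBuiltinOperatorVersion(fake_op_sig);  // 3 if POT, else 4
}
```
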
diff --git a/tensorflow/lite/tools/versioning/op_version.h b/tensorflow/lite/tools/versioning/op_version.h
index c1931bc..bec4b67 100644
--- a/tensorflow/lite/tools/versioning/op_version.h
+++ b/tensorflow/lite/tools/versioning/op_version.h
@@ -59,6 +59,9 @@
       int32_t num_dims;
       bool need_broadcast;
     } broadcast;
+    struct {
+      bool pot_scale_int16;
+    } addsub;
   } options;
 } OpSignature;