Relu Grad working, fixing softmax grad
diff --git a/tensorflow/c/eager/mnist_gradients.h b/tensorflow/c/eager/mnist_gradients.h
index 62cd56d..94cf9ce 100644
--- a/tensorflow/c/eager/mnist_gradients.h
+++ b/tensorflow/c/eager/mnist_gradients.h
@@ -93,7 +93,7 @@
     (*grad_outputs)[0] = matmul_outputs[0];
 
     // Gradient for B
-    TF_RETURN_IF_ERROR(MatMul(ctx_, {upstream_grad},
+    TF_RETURN_IF_ERROR(MatMul(ctx_, {forward_inputs[0], upstream_grad},
                               absl::MakeSpan(matmul_outputs), "mm1", 
                               /*transpose_a = */true, /*transpose_b = */false));
 
@@ -116,6 +116,86 @@
   return registry->Register("MatMul", MatMulRegisterer);
 }
 
+// =================== Register gradients for Relu ============================
+class ReluGradientFunction : public GradientFunction {
+ public:
+  explicit ReluGradientFunction(AbstractContext* ctx, std::vector<AbstractTensorHandle*> f_inputs) : 
+            ctx_(ctx), forward_inputs(f_inputs) {}
+  
+  Status Compute(absl::Span<AbstractTensorHandle* const> grad_inputs,
+                 std::vector<AbstractTensorHandle*>* grad_outputs) override {
+    
+    AbstractTensorHandle* upstream_grad = grad_inputs[0];
+    AbstractTensorHandle* input_features = forward_inputs[0];
+    grad_outputs->resize(1);
+    std::vector<AbstractTensorHandle*> relugrad_outputs(1);
+
+    // Calculate Grad
+    TF_RETURN_IF_ERROR(ReluGrad(ctx_, {upstream_grad, input_features},
+                              absl::MakeSpan(relugrad_outputs), "relu_grad"));
+
+    (*grad_outputs)[0] = relugrad_outputs[0];
+
+    return Status::OK();
+  }
+  ~ReluGradientFunction() override {}
+
+ private:
+  AbstractContext* ctx_;
+  std::vector<AbstractTensorHandle*> forward_inputs;
+
+};
+
+GradientFunction* ReluRegisterer(const ForwardOperation& op) {
+  return new ReluGradientFunction(op.ctx, op.inputs);
+}
+
+Status RegisterGradientRelu(GradientRegistry* registry) {
+  return registry->Register("Relu", ReluRegisterer);
+}
+
+// =================== Register gradients for SparseSoftmaxCrossEntropyLoss ============================
+
+class SparseSoftmaxCrossEntropyLossGradientFunction : public GradientFunction {
+ public:
+  explicit SparseSoftmaxCrossEntropyLossGradientFunction(AbstractContext* ctx, std::vector<AbstractTensorHandle*> f_outputs) : 
+            ctx_(ctx), forward_outputs(f_outputs)  {}
+  
+  Status Compute(absl::Span<AbstractTensorHandle* const> grad_inputs,
+                 std::vector<AbstractTensorHandle*>* grad_outputs) override {
+    
+    // Forward Inputs : [scores, labels]
+    
+    //AbstractTensorHandle* upstream_grad = grad_inputs[0];
+    // grad_outputs->resize(2);
+    // std::vector<AbstractTensorHandle*> sm_outputs(2);
+
+    // Calculate Grad
+    // TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss(ctx_, {forward_inputs[0], forward_inputs[1]},
+    //                           absl::MakeSpan(sm_outputs), "softmax_loss"));
+
+
+    // SparseSoftmaxCrossEntropyLoss returns [loss_vals, grads], so return 2nd output.
+    (*grad_outputs)[0] = forward_outputs[1];
+
+    return Status::OK();
+  }
+  ~SparseSoftmaxCrossEntropyLossGradientFunction() override {}
+
+ private:
+  AbstractContext* ctx_;
+  std::vector<AbstractTensorHandle*> forward_outputs;
+
+};
+
+GradientFunction* SparseSoftmaxCrossEntropyLossRegisterer(const ForwardOperation& op) {
+  return new SparseSoftmaxCrossEntropyLossGradientFunction(op.ctx, op.outputs);
+}
+ 
+Status RegisterGradientSparseSoftmaxCrossEntropyLoss(GradientRegistry* registry) {
+  return registry->Register("SparseSoftmaxCrossEntropyWithLogits", SparseSoftmaxCrossEntropyLossRegisterer);
+}
+
 }  // namespace
 }  // namespace internal
 }  // namespace gradients
diff --git a/tensorflow/c/eager/mnist_gradients_test.cc b/tensorflow/c/eager/mnist_gradients_test.cc
index a029903..8d710fe 100644
--- a/tensorflow/c/eager/mnist_gradients_test.cc
+++ b/tensorflow/c/eager/mnist_gradients_test.cc
@@ -46,6 +46,8 @@
 };
 
 
+// ========================= Util Functions ==============================
+
 void printArr(float data[], int n)
 {
   std::cout << std::endl << "[";
@@ -114,6 +116,27 @@
   return Status::OK();
 }
 
+AbstractTensorHandlePtr getMatrixTensorHandleUtilFloat(AbstractContext* ctx, float vals[], int64_t dims[], int num_dims){
+
+  AbstractTensorHandlePtr A;
+  AbstractTensorHandle* a_raw = nullptr;
+  Status s = TestMatrixTensorHandleFloat(ctx, vals, dims, num_dims, &a_raw);
+  A.reset(a_raw);
+  return A;
+}
+
+AbstractTensorHandlePtr getMatrixTensorHandleUtilInt(AbstractContext* ctx, int vals[], int64_t dims[], int num_dims){
+
+  AbstractTensorHandlePtr A;
+  AbstractTensorHandle* a_raw = nullptr;
+  Status s = TestMatrixTensorHandleInt(ctx, vals, dims, num_dims, &a_raw);
+  A.reset(a_raw);
+  return A;
+}
+
+// ============================== Start Tests =================================================
+
+
 TEST_P(CppGradients, TestAddGrad) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
@@ -175,24 +198,6 @@
   TF_DeleteTensor(result_tensor);
 }
 
-AbstractTensorHandlePtr getMatrixTensorHandleUtilFloat(AbstractContext* ctx, float vals[], int64_t dims[], int num_dims){
-
-  AbstractTensorHandlePtr A;
-  AbstractTensorHandle* a_raw = nullptr;
-  Status s = TestMatrixTensorHandleFloat(ctx, vals, dims, num_dims, &a_raw);
-  A.reset(a_raw);
-  return A;
-}
-
-AbstractTensorHandlePtr getMatrixTensorHandleUtilInt(AbstractContext* ctx, int vals[], int64_t dims[], int num_dims){
-
-  AbstractTensorHandlePtr A;
-  AbstractTensorHandle* a_raw = nullptr;
-  Status s = TestMatrixTensorHandleInt(ctx, vals, dims, num_dims, &a_raw);
-  A.reset(a_raw);
-  return A;
-}
-
 // Computes
 // y = inputs[0] * inputs[1]
 // return grad(y, {inputs[0], inputs[1]})
@@ -227,6 +232,8 @@
   return Status::OK();
 }
 
+
+// TODO: fix graph mode test by using RunModel to verify
 TEST_P(CppGradients, TestMatMulGrad) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
@@ -259,14 +266,14 @@
   // Y = AB
   // outputs = tape.gradient(Y, [A, B])
   std::vector<AbstractTensorHandle*> outputs(2);
-  s = RunModel(MatMulGradModel, ctx.get(), {A.get(), B.get()},
-               absl::MakeSpan(outputs),
-               /*use_function=*/!std::get<2>(GetParam()), registry);
-  ASSERT_EQ(errors::OK, s.code()) << s.error_message();
-
-  // s = MatMulGradModel(ctx.get(), {A.get(), B.get()}, absl::MakeSpan(outputs), registry);
+  // s = RunModel(MatMulGradModel, ctx.get(), {A.get(), B.get()},
+  //              absl::MakeSpan(outputs),
+  //              /*use_function=*/!std::get<2>(GetParam()), registry);
   // ASSERT_EQ(errors::OK, s.code()) << s.error_message();
 
+  s = MatMulGradModel(ctx.get(), {A.get(), B.get()}, absl::MakeSpan(outputs), registry);
+  ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+
   TF_Tensor* dA_tensor;
   s = getValue(outputs[0], &dA_tensor);
   ASSERT_EQ(errors::OK, s.code()) << s.error_message();
@@ -280,19 +287,6 @@
     ASSERT_NEAR(result_data[j], expected_dA[j], tolerance);
   }  
 
-
-  /* ERROR: This test runs 2x when we bazel test
-   *
-   *  1st time result_data: [-.5, 2, -.5, 2]  ----> This is correct
-   *
-   *  2nd time result_data: [1.5, 0, 1.5, 0]  ----> This is WRONG
-   *
-   *  For some reason, the tensor `B` is getting transposed 2x (or not at all)
-   *  when the gradient is called (see `dA` in `MatMulGradientFunction`)
-   * 
-   *  Possible memory issue where the inputs and/or Op is not resetting the 2nd time?
-   */
-
   printArr(result_data, 4);
 
   outputs[0]->Release();
@@ -335,7 +329,9 @@
  
   // Run the Forward Pass
   std::vector<AbstractTensorHandle*> outputs(2);
-  Status s = MNISTForwardModel(ctx.get(), {X.get(), W1.get(), W2.get(), y.get()}, absl::MakeSpan(outputs), registry);
+  Status s = RunModel(MNISTForwardModel, ctx.get(), {X.get(), W1.get(), W2.get(), y.get()},
+               absl::MakeSpan(outputs),
+               /*use_function=*/!std::get<2>(GetParam()), registry);
   ASSERT_EQ(errors::OK, s.code()) << s.error_message();
 
   // Verify the Results
@@ -404,7 +400,9 @@
  
   // Run the Forward Pass
   std::vector<AbstractTensorHandle*> outputs(2);
-  Status s = MNISTForwardModel(ctx.get(), {X.get(), W1.get(), W2.get(), y.get()}, absl::MakeSpan(outputs), registry);
+  Status s = RunModel(MNISTForwardModel, ctx.get(), {X.get(), W1.get(), W2.get(), y.get()},
+               absl::MakeSpan(outputs),
+               /*use_function=*/!std::get<2>(GetParam()), registry);
   ASSERT_EQ(errors::OK, s.code()) << s.error_message();
 
   // Verify the Results
@@ -415,7 +413,6 @@
   float result_data[6] = {0};
   memcpy(&result_data[0], TF_TensorData(scores_tensor), TF_TensorByteSize(scores_tensor));
   
-  //float expected_scores [6] = {0f, 12.0f, -1.0f, -17.0f, 16.8f, -28.0f};
   float expected_scores [6] = {3.6f, -6.0f, 10.2f, -17.0f, 16.8f, -28.0f};
   float tolerance = 1e-3;
   for(int j = 0; j < 6; j++){
@@ -449,6 +446,7 @@
  
   TapeVSpace vspace(ctx);
   auto tape = new Tape(/*persistent=*/false);
+  tape->Watch(ToId(X));
   tape->Watch(ToId(W1));  // Watch W1.
   std::vector<AbstractTensorHandle*> temp_outputs(1);
 
@@ -461,6 +459,7 @@
   return Status::OK();
 }
 
+// TODO: fix graph mode test by using RunModel to verify
 TEST_P(CppGradients, TestMatMulTranspose) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
@@ -488,6 +487,11 @@
   
   // Run the MatMul Op
   std::vector<AbstractTensorHandle*> outputs(1);
+  
+  // Status s = RunModel(MatMulTransposeModel, ctx.get(), {X.get(), W1.get()},
+  //              absl::MakeSpan(outputs),
+  //              /*use_function=*/!std::get<2>(GetParam()), registry);
+
   Status s = MatMulTransposeModel(ctx.get(), {X.get(), W1.get()}, absl::MakeSpan(outputs), registry);
   ASSERT_EQ(errors::OK, s.code()) << s.error_message();
   
@@ -499,7 +503,6 @@
   float result_data[6] = {0};
   memcpy(&result_data[0], TF_TensorData(scores_tensor), TF_TensorByteSize(scores_tensor));
   
-  
   float expected_scores [6] = {13.0f, 18.0f, 17.0f, 24.0f, 21.0f, 30.0f};
   float tolerance = 1e-3;
   for(int j = 0; j < 6; j++){
@@ -508,6 +511,177 @@
   
 }
 
+// Test Model to verify ReluGrad functionality
+Status ReluGradModel(AbstractContext* ctx,
+                    absl::Span<AbstractTensorHandle* const> inputs,
+                    absl::Span<AbstractTensorHandle*> outputs,
+                    const GradientRegistry& registry) {
+ 
+  TapeVSpace vspace(ctx);
+  auto tape = new Tape(/*persistent=*/false);
+  tape->Watch(ToId(inputs[0]));  // Watch X
+  std::vector<AbstractTensorHandle*> relu_outputs(1);
+  TF_RETURN_IF_ERROR(Relu(ctx, tape, inputs, absl::MakeSpan(relu_outputs), 
+      "relu0", registry));  // Relu(X)
+  
+  std::unordered_map<tensorflow::int64, TapeTensor>
+      source_tensors_that_are_targets;
+
+  std::vector<AbstractTensorHandle*> out_grads;
+  TF_RETURN_IF_ERROR(tape->ComputeGradient(
+      vspace, /*target_tensor_ids=*/{ToId(relu_outputs[0])},
+      /*source_tensor_ids=*/{ToId(inputs[0])},
+      source_tensors_that_are_targets,
+      /*output_gradients=*/{}, &out_grads));
+  for (auto relu_output : relu_outputs) {
+    relu_output->Release();
+  }
+  outputs[0] = out_grads[0];
+  delete tape;
+  return Status::OK();
+}
+
+TEST_P(CppGradients, TestReluGrad) {
+
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  
+  AbstractContextPtr ctx;
+  {
+    AbstractContext* ctx_raw = nullptr;
+    Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw);
+    ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+    ctx.reset(ctx_raw);
+  }
+
+  // X = data
+  float X_vals [] = {1.0f, 2.0f, 3.0f, -5.0f, -4.0f, -3.0f, 2.0f, 0.0f, -1.0f};
+  int64_t X_dims [] = {3,3};
+  int num_dims = 2;
+  AbstractTensorHandlePtr X = getMatrixTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims);
+ 
+  GradientRegistry registry;
+  Status s = RegisterGradientRelu(&registry);
+  ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+
+  // Pseudo-code:
+  //
+  // tape.watch(X)
+  // Y = Relu(X)
+  // outputs = tape.gradient(Y, [X])
+  std::vector<AbstractTensorHandle*> outputs(1);
+  s = RunModel(ReluGradModel, ctx.get(), {X.get()},
+               absl::MakeSpan(outputs),
+               /*use_function=*/!std::get<2>(GetParam()), registry);
+  ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+
+  TF_Tensor* dX_tensor;
+  s = getValue(outputs[0], &dX_tensor);
+  ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+  
+  float result_data[9] = {0};
+  memcpy(&result_data[0], TF_TensorData(dX_tensor), TF_TensorByteSize(dX_tensor));
+  
+  float expected_dX [9] =  {1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f}; 
+  float tolerance = 1e-3;
+  for(int j = 0; j < 9; j++){
+    ASSERT_NEAR(result_data[j], expected_dX[j], tolerance);
+  }  
+
+  outputs[0]->Release();
+  TF_DeleteTensor(dX_tensor);
+}
+
+// Test Model to verify ReluGrad functionality
+Status SoftmaxLossGradModel(AbstractContext* ctx,
+                    absl::Span<AbstractTensorHandle* const> inputs,
+                    absl::Span<AbstractTensorHandle*> outputs,
+                    const GradientRegistry& registry) {
+ 
+  TapeVSpace vspace(ctx);
+  auto tape = new Tape(/*persistent=*/false);
+  tape->Watch(ToId(inputs[0]));  // Watch scores
+  std::vector<AbstractTensorHandle*> sm_outputs(2);
+  TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss(ctx, tape, inputs, absl::MakeSpan(sm_outputs), 
+      "sm0", registry));  // Softmax(X, labels)
+  
+  std::unordered_map<tensorflow::int64, TapeTensor>
+      source_tensors_that_are_targets;
+
+  std::vector<AbstractTensorHandle*> out_grads;
+  TF_RETURN_IF_ERROR(tape->ComputeGradient(
+      vspace, /*target_tensor_ids=*/{ToId(sm_outputs[0])},
+      /*source_tensor_ids=*/{ToId(inputs[0])},
+      source_tensors_that_are_targets,
+      /*output_gradients=*/{}, &out_grads));
+  for (auto sm_output : sm_outputs) {
+    sm_output->Release();
+  }
+  outputs[0] = out_grads[0];
+  delete tape;
+  return Status::OK();
+}
+
+TEST_P(CppGradients, TestSoftmaxLossGrad) {
+
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  
+  AbstractContextPtr ctx;
+  {
+    AbstractContext* ctx_raw = nullptr;
+    Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw);
+    ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+    ctx.reset(ctx_raw);
+  }
+
+  // X = scores
+  float X_vals [] = {1.0f, 2.0f, 3.0f, -5.0f, -4.0f, -3.0f, 2.0f, 0.0f, -1.0f};
+  int64_t X_dims [] = {3,3};
+  int num_dims = 2;
+  AbstractTensorHandlePtr X = getMatrixTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims);
+
+  // y = labels
+  int y_vals [] = {1, 0, 1};
+  int64_t y_dims [] = {3};
+  num_dims = sizeof(y_dims)/sizeof(y_dims[0]);
+  AbstractTensorHandlePtr y = getMatrixTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims);
+ 
+  GradientRegistry registry;
+  Status s = RegisterGradientSparseSoftmaxCrossEntropyLoss(&registry);
+  ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+
+  // Pseudo-code:
+  //
+  // tape.watch(X)
+  // Y = SoftmaxLoss(X, labels)
+  // outputs = tape.gradient(Y, [X])
+
+  std::vector<AbstractTensorHandle*> outputs(1);
+  s = RunModel(SoftmaxLossGradModel, ctx.get(), {X.get(), y.get()},
+               absl::MakeSpan(outputs),
+               /*use_function=*/!std::get<2>(GetParam()), registry);
+  ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+
+  // TF_Tensor* dX_tensor;
+  // s = getValue(outputs[0], &dX_tensor);
+  // ASSERT_EQ(errors::OK, s.code()) << s.error_message();
+  
+  // float result_data[9] = {0};
+  // memcpy(&result_data[0], TF_TensorData(dX_tensor), TF_TensorByteSize(dX_tensor));
+  
+  // float expected_dX [9] =  {0.090f, -0.7553f, 0.6652f,
+  //                           -0.9099f, 0.2447f, 0.6652f,
+  //                           0.8437f, -0.8858f, 0.0420f}; 
+  // float tolerance = 1e-2;
+  // for(int j = 0; j < 9; j++){
+  //   ASSERT_NEAR(result_data[j], expected_dX[j], tolerance);
+  // }  
+
+  // outputs[0]->Release();
+  // TF_DeleteTensor(dX_tensor);
+}
+
 
 // TODO(b/160888630): Enable this test with mlir after AddInputList is
 // supported. It is needed for AddN op which is used for gradient aggregation.
@@ -516,13 +690,13 @@
     UnifiedCAPI, CppGradients,
     ::testing::Combine(::testing::Values("graphdef"),
                        /*tfrt*/ ::testing::Values(false),
-                       /*executing_eagerly*/ ::testing::Values(true)));  // change back to (true,false)
+                       /*executing_eagerly*/ ::testing::Values(true, false)));  // change back to (true,false)
 #else
 INSTANTIATE_TEST_SUITE_P(
     UnifiedCAPI, CppGradients,
     ::testing::Combine(::testing::Values("graphdef"),
                        /*tfrt*/ ::testing::Values(false),
-                       /*executing_eagerly*/ ::testing::Values(true))); // change back to (true,false)
+                       /*executing_eagerly*/ ::testing::Values(true, false))); // change back to (true,false)
 #endif
 }  // namespace
 }  // namespace internal
diff --git a/tensorflow/c/eager/mnist_gradients_util.cc b/tensorflow/c/eager/mnist_gradients_util.cc
index ca89543..90010d9 100644
--- a/tensorflow/c/eager/mnist_gradients_util.cc
+++ b/tensorflow/c/eager/mnist_gradients_util.cc
@@ -72,6 +72,53 @@
   return Status::OK();
 }
 
+// Softmax Loss given scores and labels, used by the SoftMaxLossGradient
+Status SparseSoftmaxCrossEntropyLoss(AbstractContext* ctx,
+                absl::Span<AbstractTensorHandle* const> inputs,
+                absl::Span<AbstractTensorHandle*> outputs, const char* name){
+  
+  AbstractOperationPtr sm_loss_op(ctx->CreateOperation());
+  TF_RETURN_IF_ERROR(
+      sm_loss_op->Reset("SparseSoftmaxCrossEntropyWithLogits", /*raw_device_name=*/nullptr));
+
+  if (isa<tracing::TracingOperation>(sm_loss_op.get())) {
+    TF_RETURN_IF_ERROR(dyn_cast<tracing::TracingOperation>(sm_loss_op.get())
+                           ->SetOpName(name));
+  }
+
+  TF_RETURN_IF_ERROR(sm_loss_op->AddInput(inputs[0])); // input scores
+  TF_RETURN_IF_ERROR(sm_loss_op->AddInput(inputs[1])); // labels
+
+
+  // Outputs will contain: [loss_vals, gradients]. 
+  int num_retvals = 2;
+  TF_RETURN_IF_ERROR(sm_loss_op->Execute(outputs, &num_retvals));
+  return Status::OK();
+}
+
+
+Status ReluGrad(AbstractContext* ctx,
+                absl::Span<AbstractTensorHandle* const> inputs,
+                absl::Span<AbstractTensorHandle*> outputs, 
+                const char* name) {
+  
+  AbstractOperationPtr relugrad_op(ctx->CreateOperation());
+  TF_RETURN_IF_ERROR(
+      relugrad_op->Reset("ReluGrad", /*raw_device_name=*/nullptr));
+
+  if (isa<tracing::TracingOperation>(relugrad_op.get())) {
+    TF_RETURN_IF_ERROR(dyn_cast<tracing::TracingOperation>(relugrad_op.get())
+                           ->SetOpName(name));
+  }
+
+  TF_RETURN_IF_ERROR(relugrad_op->AddInput(inputs[0])); //upstream grads
+  TF_RETURN_IF_ERROR(relugrad_op->AddInput(inputs[1])); //relu inputs
+
+  int num_retvals = 1;
+  TF_RETURN_IF_ERROR(relugrad_op->Execute(outputs, &num_retvals));
+  return Status::OK();
+}
+
 // Computes `inputs[0] + inputs[1]` and records it on the tape.
 Status Add(AbstractContext* ctx, Tape* tape,
            absl::Span<AbstractTensorHandle* const> inputs,
@@ -338,8 +385,11 @@
       TF_RETURN_IF_ERROR(dyn_cast<tracing::TracingContext>(func_ctx.get())
                              ->Finalize(&output_list, &func));
       scoped_func.reset(func);
-      output_list.outputs[0]->Release();
-      //output_list.outputs[1]->Release();
+
+      for(int i = 0; i < outputs.size(); i++) {
+        output_list.outputs[i]->Release();
+      }
+      
       TF_RETURN_IF_ERROR(ctx->RegisterFunction(func));
     }
 
@@ -369,82 +419,6 @@
 }
 
 
-// Get a scalar TensorHandle woth given value
-// Status TestScalarTensorHandle(AbstractContext* ctx, float value,
-//                               AbstractTensorHandle** tensor) {
-  
-//   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
-//       TF_NewStatus(), TF_DeleteStatus);
-//   TFE_Context* eager_ctx =
-//       TF_ExecutionContextGetTFEContext(wrap(ctx), status.get());
-//   TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get()));
-//   TFE_TensorHandle* input_eager = TestScalarTensorHandle(eager_ctx, value);
-//   *tensor =
-//       unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get()));
-//   return Status::OK();
-// }
-
-
-// // Get a Matrix TensorHandle with given float values and dimensions
-// Status TestMatrixTensorHandleFloat(AbstractContext* ctx, float data[], int64_t dims[], 
-//                                    int num_dims, AbstractTensorHandle** tensor) {
-  
-//   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
-//       TF_NewStatus(), TF_DeleteStatus);
-//   TFE_Context* eager_ctx =
-//       TF_ExecutionContextGetTFEContext(wrap(ctx), status.get());
-//   TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get()));
-//   TFE_TensorHandle* input_eager = 
-//       TestMatrixTensorHandleFloat(eager_ctx, data, dims, num_dims);
-//   *tensor = 
-//       unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get()));
-//   return Status::OK();
-// }
-
-// // Get a Matrix TensorHandle with given int values and dimensions
-// Status TestMatrixTensorHandleInt(AbstractContext* ctx, int data[], int64_t dims[], 
-//                                  int num_dims, AbstractTensorHandle** tensor) {
-  
-//   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
-//       TF_NewStatus(), TF_DeleteStatus);
-//   TFE_Context* eager_ctx =
-//       TF_ExecutionContextGetTFEContext(wrap(ctx), status.get());
-//   TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get()));
-//   TFE_TensorHandle* input_eager = 
-//       TestMatrixTensorHandleInt(eager_ctx, data, dims, num_dims);
-//   *tensor = 
-//       unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get()));
-//   return Status::OK();
-// }
- 
-// Status getValue(AbstractTensorHandle* t, TF_Tensor** result_tensor) {
-//   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
-//       TF_NewStatus(), TF_DeleteStatus);
-//   TFE_TensorHandle* result_t =
-//       TF_AbstractTensorGetEagerTensor(wrap(t), status.get());
-//   TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get()));
-//   *result_tensor = TFE_TensorHandleResolve(result_t, status.get());
-//   return Status::OK();
-// }
-
-// AbstractTensorHandlePtr getMatrixTensorHandleUtilFloat(AbstractContext* ctx, float vals[], int64_t dims[], int num_dims){
-
-//   AbstractTensorHandlePtr A;
-//   AbstractTensorHandle* a_raw = nullptr;
-//   Status s = TestMatrixTensorHandleFloat(ctx, vals, dims, num_dims, &a_raw);
-//   A.reset(a_raw);
-//   return A;
-// }
-
-// AbstractTensorHandlePtr getMatrixTensorHandleUtilInt(AbstractContext* ctx, int vals[], int64_t dims[], int num_dims){
-
-//   AbstractTensorHandlePtr A;
-//   AbstractTensorHandle* a_raw = nullptr;
-//   Status s = TestMatrixTensorHandleInt(ctx, vals, dims, num_dims, &a_raw);
-//   A.reset(a_raw);
-//   return A;
-// }
-
 // }  // namespace
 // }  // namespace internal
 // }  // namespace gradients
diff --git a/tensorflow/c/eager/mnist_gradients_util.h b/tensorflow/c/eager/mnist_gradients_util.h
index 1ec3ee7..dcb38e0 100644
--- a/tensorflow/c/eager/mnist_gradients_util.h
+++ b/tensorflow/c/eager/mnist_gradients_util.h
@@ -41,6 +41,17 @@
                 absl::Span<AbstractTensorHandle*> outputs, const char* name,
                 bool transpose_a, bool transpose_b);
 
+// Creates a ReluGrad op used for the ReluGradient
+Status ReluGrad(AbstractContext* ctx,
+                absl::Span<AbstractTensorHandle* const> inputs,
+                absl::Span<AbstractTensorHandle*> outputs, 
+                const char* name); 
+
+// Creates a SmCrossEntropyLoss op used for the SoftmaxLossGradient
+Status SparseSoftmaxCrossEntropyLoss(AbstractContext* ctx,
+                absl::Span<AbstractTensorHandle* const> inputs,
+                absl::Span<AbstractTensorHandle*> outputs, const char* name);
+
 // Computes `inputs[0] + inputs[1]` and records it on the tape.
 Status Add(AbstractContext* ctx, Tape* tape,
            absl::Span<AbstractTensorHandle* const> inputs,