Automated rollback of commit 6d00b470f51a62536b3b56c8facc80d871214df5

PiperOrigin-RevId: 295797482
Change-Id: I5218b6ee1d1e8437791520ff2eddd3bed208d199

commit: 0d2f3be5ebe4c762dddad2fe1bac1b4af538de2c [log] [tgz]
author: A. Unique TensorFlower <gardener@tensorflow.org> Tue Feb 18 12:36:06 2020 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> Tue Feb 18 13:05:15 2020 -0800
tree: bc3230eab329f80ed28568944b57690101200fe8
parent: be9eb5f03f36ec612fd5d0abb4c5a3a100b5e581 [diff]
diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD
index 64e8509..720e924 100644
--- a/tensorflow/python/keras/layers/preprocessing/BUILD
+++ b/tensorflow/python/keras/layers/preprocessing/BUILD

@@ -11,14 +11,6 @@
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "testdata",
-    srcs = [
-        "testdata/repeated_vocab.txt",
-        "testdata/vocab.txt",
-    ],
-)
-
 py_library(
     name = "preprocessing",
     srcs = [
@@ -284,7 +276,6 @@
     name = "index_lookup_test",
     size = "medium",
     srcs = ["index_lookup_test.py"],
-    data = [":testdata"],
     python_version = "PY3",
     deps = [
         ":index_lookup",
@@ -312,9 +303,10 @@
 )
 
 tf_py_test(
-    name = "normalization_test",
+    name = "preprocessing_normalization_test",
     size = "small",
     srcs = ["normalization_test.py"],
+    main = "normalization_test.py",
     python_version = "PY3",
     deps = [
         ":normalization",
@@ -325,9 +317,10 @@
 )
 
 tf_py_test(
-    name = "text_vectorization_test",
+    name = "preprocessing_text_vectorization_test",
     size = "medium",
     srcs = ["text_vectorization_test.py"],
+    main = "text_vectorization_test.py",
     python_version = "PY3",
     deps = [
         ":preprocessing_test_utils",

diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py
index e8c2c0a..7bd7f66 100644
--- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py
+++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py

@@ -32,7 +32,6 @@
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
 from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.platform import gfile
 from tensorflow.python.util import compat
 
 # The string tokens in the extracted vocabulary
@@ -67,13 +66,7 @@
       1. If this value is more than 1, OOV inputs are hashed to determine their
       OOV value; if this value is 0, passing an OOV input will result in a
       runtime error.
-    vocabulary: An optional list of vocabulary terms, or a path to a text file
-      containing a vocabulary to load into this layer. The file should contain
-      one token per line. In either case, the vocabulary must be unique; if
-      the list or file contains the same token multiple times, an error will
-      be thrown. Note that when passing a vocabulary - either as a list or as
-      a file - the vocabulary will not be present in the layer's config dict;
-      it will instead be a part of the layer's weights.
+    vocabulary: An optional list of vocabulary terms.
     reserve_zero: Whether to reserve the index 0, which indicates pad values in
       the Keras masking system. If True, the output of this layer will be in the
       range `[1...max_tokens+1)`; if False, the output will be in the range
@@ -171,38 +164,10 @@
     self._inverse_table = None
 
     if vocabulary is not None:
-      if isinstance(vocabulary, str):
-        vocabulary = self._get_vocabulary_from_file(vocabulary)
-
-      vocabulary_set = set(vocabulary)
-      if len(vocabulary) != len(vocabulary_set):
-        repeated_items = [
-            item for item, count in collections.Counter(vocabulary).items()
-            if count > 1
-        ]
-        raise ValueError("The passed vocabulary has at least one repeated "
-                         "term. Please uniquify your dataset before passing "
-                         "it to IndexLookup(). The repeated terms are %s" %
-                         repeated_items)
+      self._export_vocab = True
       self.set_vocabulary(vocabulary)
-
-  def _get_vocabulary_from_file(self, vocabulary_path):
-    vocab = []
-    with gfile.GFile(vocabulary_path, "r") as reader:
-      while True:
-        # Get the next line, and break if it is None.
-        text = reader.readline()
-        if not text:
-          break
-
-        # Convert the raw text into UTF8 and strip whitespace.
-        if isinstance(text, str):
-          token = text
-        elif isinstance(text, bytes):
-          token = text.decode("utf-8", "ignore")
-        token = token.strip()
-        vocab.append(token)
-    return vocab
+    else:
+      self._export_vocab = False
 
   def _get_table_data(self):
     keys, values = self._table.export()
@@ -291,10 +256,11 @@
     return [x for _, x in sorted(zip(values, keys))]
 
   def get_config(self):
+    vocabulary = self.get_vocabulary() if self._export_vocab else None
     config = {
         "max_tokens": self.max_tokens,
         "num_oov_tokens": self.num_oov_tokens,
-        "vocabulary": None,
+        "vocabulary": vocabulary,
         "reserve_zero": self.reserve_zero,
         "mask_zero": self.mask_zero,
     }

diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py
index 508706c..d0493ed 100644
--- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py
+++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py

@@ -37,7 +37,6 @@
 from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
 from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
 
 
@@ -356,13 +355,7 @@
     output_dataset = model.predict(input_array)
     self.assertAllEqual(expected_output, output_dataset)
 
-
-@keras_parameterized.run_all_keras_modes
-class IndexLookupVocabularyTest(keras_parameterized.TestCase,
-                                preprocessing_test_utils.PreprocessingLayerTest
-                               ):
-
-  def test_int_output_explicit_vocab(self):
+  def test_int_output_explicit_vocab_from_config(self):
     vocab_data = ["earth", "wind", "and", "fire"]
     input_array = np.array([["earth", "wind", "and", "fire"],
                             ["fire", "and", "earth", "michigan"]])
@@ -372,20 +365,10 @@
     layer = get_layer_class()(vocabulary=vocab_data)
     int_data = layer(input_data)
     model = keras.Model(inputs=input_data, outputs=int_data)
-    output_dataset = model.predict(input_array)
-    self.assertAllEqual(expected_output, output_dataset)
 
-  def test_int_output_explicit_vocab_from_file(self):
-    vocab_data = resource_loader.get_path_to_datafile("testdata/vocab.txt")
-    input_array = np.array([["earth", "wind", "and", "fire"],
-                            ["fire", "and", "earth", "michigan"]])
-    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
-    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
-    layer = get_layer_class()(vocabulary=vocab_data)
-    int_data = layer(input_data)
-    model = keras.Model(inputs=input_data, outputs=int_data)
-    output_dataset = model.predict(input_array)
+    with CustomObjectScope({"IndexLookup": get_layer_class()}):
+      new_model = keras.Model.from_config(model.get_config())
+    output_dataset = new_model.predict(input_array)
     self.assertAllEqual(expected_output, output_dataset)
 
   def test_vocab_appending(self):
@@ -403,17 +386,6 @@
     output_dataset = model.predict(input_array)
     self.assertAllClose(expected_output, output_dataset)
 
-  def test_non_unique_vocab_fails(self):
-    vocab_data = ["earth", "wind", "and", "fire", "fire"]
-    with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
-      _ = get_layer_class()(vocabulary=vocab_data)
-
-  def test_non_unique_vocab_from_file_fails(self):
-    vocab_data = resource_loader.get_path_to_datafile(
-        "testdata/repeated_vocab.txt")
-    with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
-      _ = get_layer_class()(vocabulary=vocab_data)
-
 
 @keras_parameterized.run_all_keras_modes
 class InverseLookupOutputTest(keras_parameterized.TestCase,

diff --git a/tensorflow/python/keras/layers/preprocessing/testdata/repeated_vocab.txt b/tensorflow/python/keras/layers/preprocessing/testdata/repeated_vocab.txt
deleted file mode 100644
index 6b3ae61..0000000
--- a/tensorflow/python/keras/layers/preprocessing/testdata/repeated_vocab.txt
+++ /dev/null

@@ -1,5 +0,0 @@
-earth
-wind
-and
-fire
-earth

diff --git a/tensorflow/python/keras/layers/preprocessing/testdata/vocab.txt b/tensorflow/python/keras/layers/preprocessing/testdata/vocab.txt
deleted file mode 100644
index dfe3147..0000000
--- a/tensorflow/python/keras/layers/preprocessing/testdata/vocab.txt
+++ /dev/null

@@ -1,4 +0,0 @@
-earth
-wind
-and
-fire

diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index d89e06a..7e3643f 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py

@@ -83,7 +83,6 @@
     "//tensorflow/core:lmdb_testdata",
     "//tensorflow/core/kernels/cloud:bigquery_reader_ops",
     "//tensorflow/python/debug:grpc_tensorflow_server.par",
-    "//tensorflow/python/keras/layers/preprocessing:testdata",
     "//tensorflow/python/feature_column:vocabulary_testdata",
     "//tensorflow/python:framework/test_file_system.so",
     "//tensorflow/python:util_nest_test_main_lib",
commit	0d2f3be5ebe4c762dddad2fe1bac1b4af538de2c	[log] [tgz]
author	A. Unique TensorFlower <gardener@tensorflow.org>	Tue Feb 18 12:36:06 2020 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	Tue Feb 18 13:05:15 2020 -0800
tree	bc3230eab329f80ed28568944b57690101200fe8
parent	be9eb5f03f36ec612fd5d0abb4c5a3a100b5e581 [diff]